I am currently working on a small scraping program that downloads files from the pages I scrape. The problem I am running into is that some pages have a data table, like this one - https://www.tyconsystems.com/rpms24-720-720 - while others do not, like this one - https://www.tyconsystems.com/tpdin-cable-232.
What is the correct way to pass the data along when no file is found on the page? Bonus question: is there any way to fix the issue of an item spanning multiple rows in the CSV file when the item's data is too long? For example, the item rpms24-720-720.
Below is the code I am using.
productInfo.py
import scrapy

from ..items import tyconItem


class ProductInfoSpider(scrapy.Spider):
    name = "productInfo"
    allowed_domains = ['tyconsystems.com']
    start_urls = [
        'https://www.tyconsystems.com/rpms24-720-720',
        'https://www.tyconsystems.com/tpdin-cable-232',
    ]

    def parse(self, response):
        for product in response.css('section#listing'):
            items = tyconItem()  # Unique item for each iteration
            name_dirty = product.css('div.product-id span#product_id::text').get()
            product_sku = name_dirty.strip()
            product_sub_title_dirty = product.css('div.product-details h1.page_headers::text').get()
            product_sub_title = product_sub_title_dirty.strip()
            # product_store_description = product.css('p.series-card__intro').get()
            if product.xpath('//p[contains(@class, "MsoNormal")]'):
                summary = product.css('div.item > div p.MsoNormal').getall()
            elif product.xpath('//div[contains(@class, "item")]/div'):
                summary = product.css('div.item > div').getall()
            else:
                summary = product.css('div.item').getall()
            category_list = product.xpath('//div[@class="container"]//ol//li//a/span//text()').getall()
            category = category_list[-2].strip()
            description = product.css('div.item > p.MsoNormal::text').getall()
            if product.css('div.extrafieldsBlock span.info a::attr(href)').get() == '':
                datasheet = 'no-file'
            else:
                datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
            file_urls = datasheet
            specification = product.css('div#tab-6 div.info > table').getall()
            price = product.css('span#price::text').get()
            products_zoom_image = name_dirty.strip() + '.jpg'
            main_image = product.css('div#addl-images a::attr(href)').getall()
            image_urls = [response.urljoin(i) for i in main_image]

            items['category'] = category,
            items['datasheet'] = datasheet,
            items['description'] = description,
            items['main_image'] = main_image,
            items['price'] = price,
            items['product_link'] = response.url,  # get the product link from response
            items['product_sku'] = product_sku,
            items['product_sub_title'] = product_sub_title,
            items['products_zoom_image'] = products_zoom_image
            items['specification'] = specification,
            items['summary'] = summary,
            items['file_urls'] = [file_urls]
            items["name"] = product_sku
            items["image_urls"] = image_urls
            yield items
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class tyconItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    category = scrapy.Field()
    datasheet = scrapy.Field()
    description = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()
    name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    main_image = scrapy.Field()
    price = scrapy.Field()
    product_link = scrapy.Field()
    product_sku = scrapy.Field()
    product_sub_title = scrapy.Field()
    products_zoom_image = scrapy.Field()
    specification = scrapy.Field()
    summary = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from io import BytesIO

from PIL import Image
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline


class tyconPipeline:
    def process_item(self, item, spider):
        return item


class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1]
        return file_name


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *args, item=None):
        filename = request.meta["filename"].strip()
        number = request.meta["file_num"]
        return filename + "_" + str(number) + ".jpg"

    def thumb_path(self, request, thumb_id, response=None, info=None):
        filename = request.meta["filename"]
        number = request.meta["file_num"]
        return f'thumbs/{thumb_id}/{filename}_{number}.jpg'

    def get_media_requests(self, item, info):
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

    def convert_image(self, image, size=None):
        if size is not None:
            # If size is not None then this is a thumbnail, so we
            # resize the image according to the parameter.
            image = image.resize(size, Image.ANTIALIAS)
        else:
            # Otherwise we hand the image back to the superclass
            # version of this method for it to process.
            return super().convert_image(image, size=size)
        buf = BytesIO()  # These next lines are from the Scrapy source code.
        image.save(buf, 'JPEG', quality=72)
        return image, buf
Scrapy error from the log:
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
    current.result = callback(  # type: ignore[misc]
  File "/usr/lib/python3/dist-packages/scrapy/utils/defer.py", line 162, in f
    return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/media.py", line 87, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in get_media_requests
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in <listcomp>
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 60, in __init__
    self._set_url(url)
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 98, in _set_url
    raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got NoneType
Thank you all!
1 Answer
There are two possible approaches:

1. Override get_media_requests

Override get_media_requests in your pipeline to check whether the URL exists before building a request, like so:
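A minimal sketch, assuming the item keeps its file URLs in the standard file_urls field (FilesPipeline exposes the field name as self.files_urls_field) and that entries without a usable URL should simply be skipped:

from itemadapter import ItemAdapter
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline


class DownfilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Build a request only for entries that are real, non-empty
        # strings; a missing datasheet (None) is skipped instead of
        # raising "TypeError: Request url must be str, got NoneType".
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        return [Request(u) for u in urls if isinstance(u, str) and u]

    def file_path(self, request, response=None, info=None):
        return request.url.split("/")[-1]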
2. Return a different item
You can return a different kind of item from the spider depending on whether there is anything to download. For convenience, I prefer to use plain dicts here, like so:
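A sketch of the relevant part of the parse callback, with most fields omitted for brevity (the selectors are the ones from the spider above):

    def parse(self, response):
        for product in response.css('section#listing'):
            datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()

            item = {
                'product_sku': (product.css('div.product-id span#product_id::text').get() or '').strip(),
                'product_link': response.url,
                # ... the rest of the fields from the original spider ...
            }

            # Attach file_urls only when a datasheet link actually
            # exists, so the FilesPipeline never sees a None URL.
            if datasheet:
                item['file_urls'] = [response.urljoin(datasheet)]

            yield item

Plain dicts are convenient here because scrapy.Item subclasses raise KeyError for any field that was not declared, whereas a dict can simply omit file_urls when there is no file.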
Hope that helps!