When Scrapy exports my scraped data to CSV, the data for all rows ends up in a single row, no matter how many rows there should be

hgb9j2n6 · posted 2022-12-13 · in: Other
import scrapy
from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        items = AmazondawinItem()
        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None

        imagelink = response.css('.s-image::attr(src)').extract()

        items['name'] = name
        items['old_price'] = old_price
        items['price'] = price
        items['review'] = review
        items['imagelink'] = imagelink
        # description =
        # ram =
        # brand =
        # cpu_model =
        yield items

When I have Scrapy export my scraped data to a CSV file (or any other format), all of the data ends up in a single row, no matter how many rows there should be. Say there should be 200 rows in one column; instead I get the data of all 200 rows crammed into one row.

jutyujz0 (answer 1):

This happens because you are yielding all of the results at once, as lists inside a single item, instead of yielding each item separately.
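To see why, here is a minimal stand-alone sketch (hypothetical values, plain stdlib csv rather than Scrapy's exporter) of what writing one list-valued item produces. Scrapy's CSV feed exporter behaves similarly, joining each list into a single cell, so the whole page collapses into one row:

import csv
import io

# One item whose fields are lists covering every result on the page,
# which is exactly what the parse() above yields.
item = {
    'name': ['Laptop A', 'Laptop B'],
    'price': ['499', '899'],
}

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=item.keys())
writer.writeheader()
writer.writerow(item)  # a single row for the whole page
print(buf.getvalue())
# name,price
# "['Laptop A', 'Laptop B']","['499', '899']"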

A not-so-great solution:

import scrapy
# from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        # items = AmazondawinItem()

        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None

        imagelink = response.css('.s-image::attr(src)').extract()

        items = dict()
        for (items['name'], items['old_price'], items['price'], items['review'], items['imagelink']) in zip(name, old_price, price, review, imagelink):
            yield items
        # description =
        # ram =
        # brand =
        # cpu_model =
        # yield items
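Two caveats about this zip version, beyond being clumsy: it yields the same dict object over and over, mutating it in place, and zip() stops at the shortest list, so on pages where some products lack an old price or review the rows get silently misaligned or dropped. Building a fresh dict per iteration at least fixes the first issue (still only a sketch of the same idea):

        # Inside parse(), replacing the loop above: build a new dict
        # per result instead of re-yielding one shared, mutated object.
        for name_, old_price_, price_, review_, imagelink_ in zip(
                name, old_price, price, review, imagelink):
            yield {
                'name': name_,
                'old_price': old_price_,
                'price': price_,
                'review': review_,
                'imagelink': imagelink_,
            }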

A better solution:

1. Remove the try/except: get() returns None when nothing matches, so the except branches never fire anyway (see the short snippet after this list), and it is better not to wrap selectors like that inside a spider.
2. Grab the fields row by row, iterating over each search result instead of extracting page-wide lists.
3. If you want, replace the dict with your item class; just make sure it is created inside the loop.
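On point 1, for example, get() returns the first match or None and never raises for a missing element, which is why the try/except in the original code never fires:

from scrapy import Selector

sel = Selector(text='<p>hi</p>')
print(sel.css('span::text').get())     # None -- no match, no exception
print(sel.css('span::text').getall())  # []   -- extract() is an alias of getall()
print(sel.css('p::text').get())        # 'hi'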

import scrapy
# from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        for row in response.css('div.s-result-list div.s-result-item.s-asin'):
            # items = AmazondawinItem()
            items = dict()
            items['name'] = row.css('.a-size-medium::text').get()
            items['old_price'] = row.css('.a-spacing-top-micro .a-text-price span::text').get()
            items['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()
            items['review'] = row.css('.s-link-style .s-underline-text::text').get()
            items['imagelink'] = row.css('.s-image::attr(src)').get()
            yield items
        # description =
        # ram =
        # brand =
        # cpu_model =
        # yield items
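To follow point 3 with the original item class instead of a plain dict, here is a sketch assuming AmazondawinItem (from the question's items.py) declares these same five fields:

# at the top of the spider file:
from ..items import AmazondawinItem

# inside AmazonspiderSpider:
    def parse(self, response):
        for row in response.css('div.s-result-list div.s-result-item.s-asin'):
            item = AmazondawinItem()  # create a fresh item for each result row
            item['name'] = row.css('.a-size-medium::text').get()
            item['old_price'] = row.css('.a-spacing-top-micro .a-text-price span::text').get()
            item['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()
            item['review'] = row.css('.s-link-style .s-underline-text::text').get()
            item['imagelink'] = row.css('.s-image::attr(src)').get()
            yield item

Either way, running scrapy crawl amazon -O laptops.csv (uppercase -O, available since Scrapy 2.1, overwrites the output file; lowercase -o appends) now writes one CSV row per search result instead of one row per page.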
