python 下一页的CSS选择器

jvlzgdj9  于 2023-05-21  发布在  Python
关注(0)|答案(1)|浏览(109)

我正在尝试使用ASIN号码下载亚马逊评论。但是,我只能下载第一页,而不是所有的页面。
“下一页”的CSS选择器不工作。谢谢你。
我在下面的代码中遇到了困难:

from urllib.parse import urljoin

import scrapy

class AmazonReviewsSpider(scrapy.Spider):
    """Crawl the Amazon product-review pages for a list of ASINs.

    Each review found on a page is yielded as a dict; the spider then follows
    the pagination "next" link until no further page is offered.  Items are
    exported to a CSV feed configured in ``custom_settings``.
    """

    name = "amazon_reviews"
    custom_settings = {
        'FEEDS': {'data/%(name)s_%(time)s.csv': {'format': 'csv'}},
    }

    def start_requests(self):
        """Yield the first review-page request for every ASIN in the list."""
        asin_list = ['B08GKK7NMH']
        for asin in asin_list:
            amazon_reviews_url = f'https://www.amazon.com/product-reviews/{asin}/'
            yield scrapy.Request(
                url=amazon_reviews_url,
                callback=self.parse_reviews,
                meta={'asin': asin, 'retry_count': 0},
            )

    def parse_reviews(self, response):
        """Schedule the next page (or a retry) and yield the page's reviews.

        Reads ``asin`` and ``retry_count`` from ``response.meta``.  Yields
        follow-up ``scrapy.Request`` objects and one dict per review with the
        keys ``asin``, ``text``, ``title``, ``location_and_date``,
        ``verified`` and ``rating``.
        """
        asin = response.meta['asin']
        retry_count = response.meta['retry_count']

        # ``::attr(href)`` must be the final pseudo-element of the selector.
        # The original trailing ``::after`` made the selector invalid, so the
        # next-page link was never extracted.
        next_page_relative_url = response.css(
            ".a-pagination .a-last>a::attr(href)"
        ).get()
        if next_page_relative_url is not None:
            next_page = urljoin('https://www.amazon.com/', next_page_relative_url)
            # A successful page resets the retry budget for the following page.
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_reviews,
                meta={'asin': asin, 'retry_count': 0},
            )

        ## Adding this retry_count here so we retry any amazon js rendered review pages
        # NOTE(review): a page that has reviews but no "next" link (e.g. the
        # last page) is also re-requested up to three times, which can emit
        # duplicate items -- confirm whether that is intended.
        elif retry_count < 3:
            yield scrapy.Request(
                url=response.url,
                callback=self.parse_reviews,
                dont_filter=True,  # same URL again: bypass the duplicate filter
                meta={'asin': asin, 'retry_count': retry_count + 1},
            )

        ## Parse Product Reviews
        review_elements = response.css("#cm_cr-review_list div.review")
        for review_element in review_elements:
            yield {
                "asin": asin,
                "text": "".join(
                    review_element.css("span[data-hook=review-body] ::text").getall()
                ).strip(),
                "title": review_element.css("*[data-hook=review-title]>span::text").get(),
                "location_and_date": review_element.css(
                    "span[data-hook=review-date] ::text"
                ).get(),
                "verified": bool(
                    review_element.css("span[data-hook=avp-badge] ::text").get()
                ),
                # re_first() returns None instead of raising IndexError when a
                # review carries no "<x.y> out of 5" rating text.
                "rating": review_element.css(
                    "[data-hook=review-star-rating] ::text"
                ).re_first(r"(\d+\.\d) out"),
            }
vddsk6oq

vddsk6oq1#

看起来你需要先选中“下一页”的 <a> 元素,再通过该元素的 .attrib 属性读取 href 的值。

# Select the "next page" anchor first, then read its href from .attrib.
next_page_anchors = response.css("li.a-last > a")
if not next_page_anchors:
    # An empty SelectorList means no next-page link was found; indexing it
    # with [0] would raise IndexError, so check for emptiness instead.
    # do your error handling
    next_page_relative_url = None
else:
    # .get() avoids a KeyError if the anchor has no href attribute.
    next_page_relative_url = next_page_anchors[0].attrib.get('href')

参考文献

相关问题