Python 爬虫 - 使用 BeautifulSoup 抓取亚马逊评论

mi7gmzs6  于 2023-03-04  发布在  Python
关注(0)|答案(3)|浏览(169)

我试着用Jupyter Notebook从亚马逊上爬取评论数据。
但是有来自服务器的响应503。
有人知道它出了什么问题吗?
这里是网址https://www.amazon.com/Apple-MWP22AM-A-AirPods-Pro/product-reviews/B07ZPC9QD4/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=
这是我的代码。

import re, requests, csv 
from bs4 import BeautifulSoup 
from time import sleep

def reviews_info(div):
    """Collect the text fields of a single review container.

    ``div`` is the parsed element wrapping one review.  The returned dict
    maps ``review_text``, ``review_author`` and ``review_stars`` to the text
    of their nodes, and ``review_date`` to the comma-separated pieces of the
    date line (every ``"on "`` removed, each piece stripped of whitespace).
    """
    def text_of(tag, css_class):
        # Text content of the first descendant matching tag + class string.
        return div.find(tag, css_class).get_text()

    body = text_of("div", "a-row a-spacing-small review-data")
    author = text_of("span", "a-profile-name")
    stars = text_of("span", "a-icon-alt")
    raw_date = text_of("span", "a-size-base a-color-secondary review-date")

    # Drop the "on " prefix, then split "Month day, year" on the comma.
    date_parts = []
    for piece in re.sub("on ", "", raw_date).split(","):
        date_parts.append(piece.strip())

    info = {}
    info["review_text"] = body
    info["review_author"] = author
    info["review_stars"] = stars
    info["review_date"] = date_parts
    return info
# Review-listing URL of the product; the page number is appended per request.
base_url = 'https://www.amazon.com/Apple-MWP22AM-A-AirPods-Pro/product-reviews/B07ZPC9QD4/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber='

NUM_PAGES = 8

# Accumulates one dict per review across all pages.
reviews = []

for page_num in range(1, NUM_PAGES + 1):
    print("souping page", page_num, ",", len(reviews), "data collected")

    # Fetch the page and parse whatever body the server returned.
    url = f"{base_url}{page_num}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Every matching review container on the page becomes one record.
    reviews.extend(
        reviews_info(div) for div in soup.find_all('div', 'a-section review')
    )

    # Pause between pages.
    sleep(30)

最后我试了

# Evaluate the bare request in the notebook to inspect the returned response object.
requests.get(url)

输出为

<Response [503]>

我也试过

# NOTE: `.text` is a str attribute, not a method, so calling it raises the
# "'str' object is not callable" TypeError; read it without parentheses.
requests.get(url).text()

输出为

TypeError: 'str' object is not callable

Amazon是否阻止了爬网?
我很感激你的回答!

8oomwypt

8oomwypt1#

当你使用 Python 的 requests 库抓取时,Amazon 会阻止你发往其服务器的请求。你可以尝试使用 Selenium 搭配 Chromium 浏览器来实现这个功能。这里是 Python 版本的 Selenium 文档:https://selenium-python.readthedocs.io/

b91juud3

b91juud32#

我试过webdriver。
这是我的代码。

from selenium import webdriver
import re
import requests 
import csv 
from bs4 import BeautifulSoup 
from time import sleep

# Accumulates one dict per review across all pages.
review_list = []
NUM_PAGE = 8

base_url = 'https://www.amazon.com/Apple-MWP22AM-A-AirPods-Pro/product-reviews/B07ZPC9QD4/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber='

# Path to the chromedriver executable; it does not change between pages.
chrome_driver = '/Users/chromedriver'

# One browser session serves every page instead of launching Chrome per page.
driver = webdriver.Chrome(chrome_driver)
try:
    for num_page in range(1, NUM_PAGE + 1):
        url = base_url + str(num_page)
        driver.get(url)

        # page_source is already a str, so no from_encoding hint is passed
        # (BeautifulSoup ignores it for str markup and emits a warning).
        source = BeautifulSoup(driver.page_source, 'lxml')

        # Query the containers once and reuse the result for count and loop.
        review_divs = source.find_all('div', 'a-section celwidget')
        print("souping page", num_page, ",", len(review_divs), "의 data를 수집")

        for review_div in review_divs:
            review_text = review_div.find("div", "a-row a-spacing-small review-data").get_text()
            review_author = review_div.find("span", "a-profile-name").get_text()
            review_stars = review_div.find("span", "a-icon-alt").get_text()
            # The date line is stored verbatim, without further splitting.
            on_review_date = review_div.find('span', 'a-size-base a-color-secondary review-date').get_text()

            review = {
                "review_text": review_text,
                "review_author": review_author,
                "review_stars": review_stars,
                "review_date": on_review_date,
            }

            review_list.append(review)

        # Pause between pages.
        sleep(10)
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # even when a page fails; close() only closes the current window.
    driver.quit()
0dxa2lsx

0dxa2lsx3#

一个比 selenium/webdriver 更快的解决方案是使用代理,不过成本更高。我使用 proxycrawl——除了是他们的客户之外,我与他们没有任何关系。我还推荐使用 Scrapy 这样的抓取框架:它可以通过在请求之间使用可变的时间间隔等功能来帮助避免被检测到。
proxycrawl 按成功的抓取次数收费,不成功的抓取不收费。这是我找到的最便宜的代理解决方案。
您可以这样使用它:

import scrapy  # scraping framework to parse data
from proxycrawl.proxycrawl_api import ProxyCrawlAPI
from datetime import datetime  # used to convert review date string into datetime object. Useful if you plan to insert into an SQL db.

# ProxyCrawl client whose buildURL() is used by the callbacks below to wrap
# target URLs. 'NON-JS TOKEN' looks like a placeholder for a real account
# token — TODO confirm and replace before running.
api = ProxyCrawlAPI({'token': 'NON-JS TOKEN'})
# Second client built with the 'JS TOKEN' value; it is not referenced by the
# visible callbacks.
apijava = ProxyCrawlAPI({'token': 'JS TOKEN'})

def start_requests(self):
    """Yield the request for the first review page of the product.

    The Amazon URL is wrapped with ``api.buildURL`` so the request goes
    through ProxyCrawl.  The product's ASIN is passed along in ``meta`` so
    the ``parse`` callback can read it back from ``response.meta``.
    """
    # Single source for the ASIN so the URL and the meta value cannot drift
    # apart (previously meta carried 'B07ZC90D4', which is not the ASIN in
    # the requested URL).
    asin = 'B07ZPC9QD4'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
    }
    # you don't need the product title in the url
    url = 'https://www.amazon.com/product-reviews/' + asin + '/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews'
    # build proxycrawl url
    pcurl = api.buildURL(url, {})
    yield scrapy.Request(pcurl, callback=self.parse, errback=self.errback_httpbin, headers=headers, meta={'asin': asin})

def parse(self, response):
    """Extract the reviews on one review-listing page and follow the next page.

    For every review node the title, rating, date and body are read with
    XPaths relative to that node.  The values are left in local variables
    for the caller-specific step marked below (e.g. a database insert).
    When the page holds at least 10 reviews and a "next" link exists, a
    follow-up request for that link is yielded with the same callbacks,
    headers and ASIN.

    Raises whatever ``datetime.strptime`` raises when a review's date text
    is missing or not in ``'%B %d, %Y'`` form.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
    }
    # Selector list of review nodes; iterating it replaces the string-built
    # '[index]' XPaths that were re-evaluated against the whole document.
    review_nodes = response.xpath('//*[@id="cm_cr-review_list"]/div[@data-hook="review"]')
    asin = response.meta['asin']
    asin_title = response.xpath('//*[@id="cm_cr-product_info"]/div/div[2]/div/div/div[2]/div[1]/h1/a/text()').get()

    for review in review_nodes:
        review_title = review.xpath('./div/div/div[2]/a[2]/span/text()').get()
        review_rating_string = review.xpath('./div/div/div[2]/a[1]/@title').get()
        review_date_string = review.xpath('./div/div/span[@data-hook="review-date"]/text()').get()
        review_body = review.xpath('./div/div/div[4]/span/span/text()').get()
        # The rating is the first space-separated token of the title attribute.
        review_rating = str(review_rating_string).split(' ', 1)[0]
        # Parse once: keep the datetime and derive the 'YYYY-MM-DD' string
        # from it (no 00:00:00 time part).
        date_of_cur_review = datetime.strptime(review_date_string, '%B %d, %Y')
        review_date = date_of_cur_review.date().isoformat()

        # DO SOMETHING HERE. INSERT INTO A DB?
        #####

    # go to next page if there is one; as before, this is only attempted
    # when the page reached 10 reviews.
    if len(review_nodes) >= 10:
        next_page = response.xpath('//*[@class="a-last"]/a/@href').get()
        if next_page is not None:
            # Pass the headers dict as-is; wrapping it in another
            # {'User-Agent': ...} dict made the header value a dict.
            yield response.follow(api.buildURL('https://www.amazon.com' + next_page, {}),
                                  callback=self.parse, errback=self.errback_httpbin, headers=headers,
                                  meta={'asin': asin})

相关问题