import scrapy


class AudibleSpider(scrapy.Spider):
    """Crawl the Audible search listing and emit one item per audiobook.

    Each yielded item is a dict with the keys ``title``, ``author``,
    ``length`` and ``User-Agent``. The spider follows the "next page" link of
    the pagination bar until no such link is found.

    Run with::

        scrapy crawl audible -o output.csv
    """

    name = 'audible'
    allowed_domains = ['www.audible.com']

    # First page of the listing.
    SEARCH_URL = 'https://www.audible.com/search/'

    # Browser-like User-Agent sent with every request. Defined once so the
    # first request and all pagination requests cannot drift apart.
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    )

    # One node per listed product.
    # NOTE(review): the previous selector required the class attribute to be
    # exactly "adbl-impression-container " (trailing space included) and the
    # <li> elements to be direct children of that <div>; either condition
    # failing yields zero items. This version tolerates extra classes and
    # nesting. The "productListItem" class name is an assumption about the
    # site's markup -- confirm it in `scrapy shell` before relying on it.
    PRODUCT_XPATH = (
        '//div[contains(@class, "adbl-impression-container")]'
        '//li[contains(@class, "productListItem")]'
    )

    def start_requests(self):
        """Yield the initial request for the first search-results page."""
        yield scrapy.Request(
            url=self.SEARCH_URL,
            callback=self.parse,
            headers={'User-Agent': self.USER_AGENT},
        )

    def parse(self, response):
        """Extract every product on the page, then follow the next page.

        Args:
            response: The Scrapy response for one search-results page.

        Yields:
            One dict per product, followed by a request for the next page
            when the pagination bar contains a "next" link.
        """
        products = response.xpath(self.PRODUCT_XPATH)
        if not products:
            # An empty export is otherwise silent; make the cause visible.
            self.logger.warning(
                "No products matched on %s (HTTP %s); the page layout may "
                "have changed or the request may have been blocked.",
                response.url,
                response.status,
            )

        # Header values are stored as bytes; decode so the exported field is
        # plain text regardless of the feed format.
        user_agent = response.request.headers.get('User-Agent')
        if isinstance(user_agent, bytes):
            user_agent = user_agent.decode('utf-8', errors='replace')

        for product in products:
            authors = product.xpath(
                './/li[contains(@class , "authorLabel")]/span/a/text()'
            ).getall()
            item = {
                'title': product.xpath(
                    './/h3[contains(@class , "bc-heading")]/a/text()'
                ).get(),
                # Comma-separated so multi-author books stay readable.
                'author': ', '.join(authors),
                'length': product.xpath(
                    './/li[contains(@class , "runtimeLabel")]/span/text()'
                ).get(),
                'User-Agent': user_agent,
            }
            self.logger.debug("Scraped item: %s", item)
            yield item

        # Link inside the "next page" button of the pagination bar, if any.
        pagination = response.xpath('//ul[contains(@class , "pagingElements")]')
        next_page_url = pagination.xpath(
            './/span[contains(@class , "nextButton")]/a/@href'
        ).get()
        if next_page_url:
            yield response.follow(
                url=next_page_url,
                callback=self.parse,
                headers={'User-Agent': self.USER_AGENT},
            )
scrapy crawl audible -o output.csv
- 当我运行这段 Scrapy 代码时,生成的 CSV 文件是空的。我还附上了源代码链接,请帮忙检查一下。*
这是我正在尝试抓取的网站。他们使用了反爬虫机制。希望能得到一个解决方案。这个项目仅用于教育目的。
https://www.audible.com/robots.txt
源代码链接`
1条答案
按热度按时间u0sqgete1#
这是我抓取 Audible 网站的尝试:
Note:
您可以删除打印语句;另外,最后20个产品会重复出现,因此您可以删除它们或修改代码。使用以下命令运行:
scrapy crawl audible -o output.csv
从output.csv中得到一点:一个产品的Html片段: