Scrapy问题 - 当我运行scrapy文件时，没有任何输出

omhiaaxx  于 2023-10-20  发布在  其他
关注(0)|答案(1)|浏览(124)

`

import scrapy

class AudibleSpider(scrapy.Spider):
    """Spider that walks Audible's search listing and yields one item per row.

    Every yielded item has the keys ``title``, ``author``, ``length`` and
    ``User-Agent`` (the header value the page was requested with).
    """

    name = 'audible'
    allowed_domains = ['www.audible.com']

    # Browser-like headers sent with every request. Kept in one place so the
    # initial request and the pagination requests cannot drift apart.
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/115.0.0.0 Safari/537.36'
        ),
    }

    def start_requests(self):
        """Yield the initial search-page request, handled by :meth:`parse`."""
        yield scrapy.Request(
            url='https://www.audible.com/search/',
            callback=self.parse,
            headers=self.request_headers,
        )

    @staticmethod
    def _request_user_agent(response):
        """Return the User-Agent of the request behind *response* as text.

        Scrapy's header container hands values back as bytes; decode them so
        the item holds plain text. Returns ``None`` when the header is absent.
        """
        user_agent = response.request.headers.get('User-Agent')
        if isinstance(user_agent, bytes):
            user_agent = user_agent.decode('utf-8', errors='replace')
        return user_agent

    def parse(self, response):
        """Yield one item per product row, then follow the next-page link.

        Args:
            response: The downloaded search-result page.

        Yields:
            Item dicts (``title``, ``author``, ``length``, ``User-Agent``)
            and, when a next-page link exists, a follow-up request that is
            parsed by this same method.
        """
        # Box that is expected to contain all product rows.
        # NOTE(review): "/li" only matches <li> elements that are DIRECT
        # children of the container div. If the rows are nested deeper this
        # selects nothing and the spider produces no items - verify against
        # the live markup.
        product_container = response.xpath('//div[@class="adbl-impression-container "]/li')
        if not product_container:
            # Make the "empty output file" case visible instead of silent.
            self.logger.warning(
                "No product rows matched on %s; check the container XPath.",
                response.url,
            )

        # Same for every row on the page, so resolve it once.
        user_agent = self._request_user_agent(response)

        for product in product_container:
            book_title = product.xpath('.//h3[contains(@class , "bc-heading")]/a/text()').get()
            book_author = product.xpath('.//li[contains(@class , "authorLabel")]/span/a/text()').getall()
            book_length = product.xpath('.//li[contains(@class , "runtimeLabel")]/span/text()').get()

            # Build the scraped fields once; they are both logged and yielded.
            fields = {
                'title': book_title,
                'author': ' '.join(book_author),
                'length': book_length,
            }
            self.logger.debug("Scraped item: %s", fields)
            yield {**fields, 'User-Agent': user_agent}

        # Pagination bar, then the link inside its "next" button.
        pagination = response.xpath('//ul[contains(@class , "pagingElements")]')
        next_page_url = pagination.xpath('.//span[contains(@class , "nextButton")]/a/@href').get()

        # Only follow when a link was actually found.
        if next_page_url:
            yield response.follow(
                url=next_page_url,
                callback=self.parse,
                headers=self.request_headers,
            )

scrapy crawl audible -o output.csv

  • 当我运行这段scrapy代码时，生成的csv文件是空的。我还附上了源代码链接，请帮忙检查一下。

这是我正在尝试抓取的网站，他们使用了反爬虫机制。希望能得到一个解决方案。这个项目仅用于学习目的。
https://www.audible.com/robots.txt
源代码链接

u0sqgete

u0sqgete1#

这是我抓取 Audible 网站的尝试：

import scrapy
from urllib.parse import urlparse, parse_qs

class AudibleSpider(scrapy.Spider):
    """Spider that walks Audible's search listing and yields book items.

    Every yielded item has the keys ``title``, ``author``, ``length`` and
    ``User-Agent`` (the header value the page was requested with).
    """

    name = 'audible'
    allowed_domains = ['www.audible.com']

    # Browser-like headers sent with every request. Kept in one place so the
    # initial request and the pagination requests cannot drift apart.
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/115.0.0.0 Safari/537.36'
        ),
    }

    # Only every Nth matched row is emitted. The row XPath matches several
    # rows per book (the original author observed duplicates and empty rows,
    # 7 matches per book in total), so the Nth match is taken as the book.
    # NOTE(review): this depends on the page layout staying the same;
    # de-duplicating on the title link would be more robust - confirm
    # against the live markup before changing.
    rows_per_product = 7

    def start_requests(self):
        """Yield the initial search-page request, handled by :meth:`parse`."""
        yield scrapy.Request(
            url='https://www.audible.com/search/',
            callback=self.parse,
            headers=self.request_headers,
        )

    @staticmethod
    def _page_number(url):
        """Return the ``page`` query parameter of *url*, or ``None``.

        ``None`` is returned both when *url* is empty/``None`` and when it
        carries no ``page`` parameter.
        """
        if not url:
            return None
        values = parse_qs(urlparse(url).query).get('page')
        return values[0] if values else None

    @staticmethod
    def _request_user_agent(response):
        """Return the User-Agent of the request behind *response* as text.

        Scrapy's header container hands values back as bytes; decode them so
        the item holds plain text. Returns ``None`` when the header is absent.
        """
        user_agent = response.request.headers.get('User-Agent')
        if isinstance(user_agent, bytes):
            user_agent = user_agent.decode('utf-8', errors='replace')
        return user_agent

    def parse(self, response):
        """Yield book items from one result page, then follow the next page.

        Args:
            response: The downloaded search-result page.

        Yields:
            Item dicts (``title``, ``author``, ``length``, ``User-Agent``)
            and, when the next-page link points at a different page number
            than the current URL, a follow-up request parsed by this method.
        """
        # Select by id so that only the one result column is searched.
        products_div = response.xpath('//div[@id="center-3"]')
        product_container = products_div.xpath('.//div[@class="bc-row-responsive"]')
        self.logger.debug("Number of product rows: %d", len(product_container))

        # Same for every row on the page, so resolve it once.
        user_agent = self._request_user_agent(response)

        for position, product in enumerate(product_container, start=1):
            # Skip everything except every Nth row (see rows_per_product);
            # nothing is extracted from rows that are never emitted.
            if position % self.rows_per_product:
                continue

            fields = {
                'title': product.xpath('.//h3[contains(@class, "bc-heading")]/a/text()').get(),
                'author': product.xpath('.//li[contains(@class, "authorLabel")]/span/a/text()').get(),
                'length': product.xpath('.//li[contains(@class, "runtimeLabel")]/span/text()').get(),
            }
            self.logger.debug("Scraped item: %s", fields)
            yield {**fields, 'User-Agent': user_agent}

        # Pagination bar, then the link inside its "next" button.
        pagination = response.xpath('//ul[contains(@class , "pagingElements")]')
        next_page_url = pagination.xpath('.//span[contains(@class , "nextButton")]/a/@href').get()

        if not next_page_url:
            self.logger.info("No more next pages.")
            return

        # Stop when the "next" link carries the same page number as the
        # current URL; following it would only fetch the same page again.
        if self._page_number(response.url) == self._page_number(next_page_url):
            self.logger.info("Next page URL is the same.")
            return

        next_page_url = response.urljoin(next_page_url)
        self.logger.debug("Current page is: %s", response.url)
        self.logger.debug("Next page is: %s", next_page_url)
        yield response.follow(
            url=next_page_url,
            callback=self.parse,
            headers=self.request_headers,
        )

注意：您可以删除代码中的打印语句；另外，最后20个产品会重复出现，因此您可以删除这些重复项或修改代码。
使用 scrapy crawl audible -o output.csv 运行后，output.csv 中的部分内容如下：

title,author,length,User-Agent
The Eye of the Bedlam Bride,Matt Dinniman,Length: 26 hrs and 46 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Things We Left Behind,Lucy Score,Length: 19 hrs and 14 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Tucker,Chadwick Moore,Length: 5 hrs and 47 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Dark Future,Glenn Beck,Length: 16 hrs and 5 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
The Way I Hate Him,Meghan Quinn,Length: 15 hrs and 4 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Jackson,Neven Iliev,Length: 18 hrs and 1 min,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
The Primal Hunter 6,Zogarth,Length: 20 hrs and 8 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Payback in Death,J. D. Robb,Length: 13 hrs and 54 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Iron Flame,Rebecca Yarros,Length: 14 hrs and 15 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
The 32 Principles,Rener Gracie,Length: 8 hrs and 18 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Mimic & Me,Cassius Lange,Length: 16 hrs and 57 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
...
On Writing,Stephen King,Length: 9 hrs and 7 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
Untamed,Glennon Doyle,Length: 8 hrs and 22 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
The Wisdom of the Bullfrog,Admiral William H. McRaven,Length: 3 hrs and 44 mins,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"

一个产品的Html片段:

<div class="bc-col-responsive bc-col-6">
    <div id="" class="bc-row-responsive" style="">
        <div class="bc-col-responsive bc-col-12">
            <span>
                <ul class="bc-list bc-list-nostyle">
                    <li class="bc-list-item">
                        <h3 class="bc-heading bc-color-link bc-pub-break-word bc-size-medium">
                            <a class="bc-link bc-color-link" tabindex="0"
                                href="/pd/The-Eye-of-the-Bedlam-Bride-Audiobook/B0CDXWSS5D?qid=1692559712&amp;sr=1-1&amp;ref=a_search_c3_lProduct_1_1&amp;pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&amp;pf_rd_r=GD7AAMFPDHG4AG0PKH3J&amp;pageLoadId=l7u7RxG4ZKAjU3nV&amp;creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c">The
                                Eye of the Bedlam Bride</a>
                        </h3>
                    </li>
                    <li class="bc-list-item subtitle">
                        <span class="bc-text bc-size-base bc-color-secondary">Dungeon Crawler Carl, Book
                            6</span>
                    </li>
                    <li class="bc-list-item authorLabel">
                        <span class="bc-text bc-size-small bc-color-secondary">
                            By:
                            <a class="bc-link bc-color-link" tabindex="0"
                                href="/author/Matt-Dinniman/B002BLP1QY?ref=a_search_c3_lAuthor_1_1_1&amp;pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&amp;pf_rd_r=GD7AAMFPDHG4AG0PKH3J&amp;pageLoadId=l7u7RxG4ZKAjU3nV&amp;creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c">Matt
                                Dinniman</a>
                        </span>
                    </li>
                    <li class="bc-list-item narratorLabel">
                        <span class="bc-text bc-size-small bc-color-secondary">
                            Narrated by:
                            <a class="bc-link bc-color-link" tabindex="0"
                                href="/search?searchNarrator=Jeff+Hays&amp;ref=a_search_c3_lNarrator_1_1_1&amp;pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&amp;pf_rd_r=GD7AAMFPDHG4AG0PKH3J&amp;pageLoadId=l7u7RxG4ZKAjU3nV&amp;creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c">Jeff
                                Hays</a>, <a class="bc-link bc-color-link" tabindex="0"
                                href="/search?searchNarrator=Patrick+Warburton&amp;ref=a_search_c3_lNarrator_1_1_2&amp;pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&amp;pf_rd_r=GD7AAMFPDHG4AG0PKH3J&amp;pageLoadId=l7u7RxG4ZKAjU3nV&amp;creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c">Patrick
                                Warburton</a>, <a class="bc-link bc-color-link" tabindex="0"
                                href="/search?searchNarrator=Travis+Baldree&amp;ref=a_search_c3_lNarrator_1_1_3&amp;pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&amp;pf_rd_r=GD7AAMFPDHG4AG0PKH3J&amp;pageLoadId=l7u7RxG4ZKAjU3nV&amp;creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c">Travis
                                Baldree</a>, <a class="bc-link bc-color-link" tabindex="0"
                                href="/search?searchNarrator=Annie+Ellicott&amp;ref=a_search_c3_lNarrator_1_1_4&amp;pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&amp;pf_rd_r=GD7AAMFPDHG4AG0PKH3J&amp;pageLoadId=l7u7RxG4ZKAjU3nV&amp;creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c">Annie
                                Ellicott</a>
                        </span>
                    </li>
                    <li class="bc-list-item seriesLabel">
                        <span class="bc-text bc-size-small bc-color-secondary">
                            Series:
                            <a class="bc-link bc-color-link" tabindex="0"
                                href="/series/Dungeon-Crawler-Carl-Audiobooks/B0937JMKYV?ref=a_search_c3_lSeries_1_1_1&amp;pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&amp;pf_rd_r=GD7AAMFPDHG4AG0PKH3J&amp;pageLoadId=l7u7RxG4ZKAjU3nV&amp;creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c">Dungeon
                                Crawler Carl</a>, Book 6
                        </span>
                    </li>
                    <li class="bc-list-item runtimeLabel">
                        <span class="bc-text bc-size-small bc-color-secondary">Length: 26 hrs and 46
                            mins</span>
                    </li>
                    <li class="bc-list-item releaseDateLabel">
                        <span class="bc-text bc-size-small bc-color-secondary">Release date:
                            09-01-23
                        </span>
                    </li>
                    <li class="bc-list-item languageLabel">
                        <span class="bc-text bc-size-small bc-color-secondary">Language:
                            English
                        </span>
                    </li>
                    <li class="bc-list-item ratingsLabel">
                        <span class="bc-text bc-size-small bc-color-secondary">Not rated yet</span>
                    </li>
                </ul>
            </span>
        </div>
    </div>
</div>

相关问题