Detecting infinite-scroll pages and crawling them with Scrapy

Asked by pod7payv on 2022-11-09

I am trying to crawl all the URLs of a website with Scrapy, but some pages on the site use infinite scroll, so the crawled data is incomplete. The code I am using is:

import re
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import url_query_cleaner

def process_links(links):
    for link in links:
        link.url = url_query_cleaner(link.url)
        yield link

class myCrawler(CrawlSpider):
    name = 'symphony'
    allowed_domains = ['theshell.org']
    start_urls = ['https://www.theshell.org/']
    base_url = 'https://www.theshell.org/'

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.middlewares.RotateUserAgentMiddleware': 400, },
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 1,
        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.pipelines.DuplicatesPipeline': 300},

        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}}
    }

    rules = (
        Rule(
            LinkExtractor(
                allow_domains='theshell.org',
                deny=[r'calendar'],
            ),
            process_links=process_links,
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        yield {
            'url': response.url,
            'html_data': response.text
        }

These pages use an infinite-scroll mechanism. How can I detect such pages and crawl them completely with Scrapy?

Answer from ffvjumwh:

Infinite scroll / "load more" is usually implemented as an AJAX request, so you can hit the underlying paginated URL directly (you can typically find it in the browser DevTools Network tab as an XHR request fired when you scroll). Here I use that paginated URL with Scrapy's default spider template instead of CrawlSpider.

import scrapy
from scrapy.crawler import CrawlerProcess

class myCrawler(scrapy.Spider):
    name = 'symphony'

    # class-level settings (defining this inside start_requests has no effect)
    custom_settings = {'DOWNLOAD_DELAY': 5}

    def start_requests(self):
        headers = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        }
        # the listing endpoint is paginated; pages 1-4 are requested here
        urls = [f'https://www.theshell.org/performances/list/?page={x}' for x in range(1, 5)]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                method="GET",
                headers=headers
            )    

    def parse(self, response):
        # follow each event link found on the listing page
        for link in response.xpath('//*[@class="content"]/h2/a/@href').getall():
            yield response.follow(link, callback=self.parse_item)

    def parse_item(self, response):
        yield {
            'title':response.xpath('//*[@class="info"]/h1/text()').get(),
            'url': response.url,     
        }

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(myCrawler)
    process.start()
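
If the number of pages is not known in advance, the hard-coded range(1, 5) can be replaced with an open-ended loop that keeps requesting the next page until a page comes back with no event links. The following is a minimal sketch along those lines; the spider name and the assumption that an empty listing page marks the end of the results are mine, not taken from the site.

import scrapy
from scrapy.crawler import CrawlerProcess

class myPaginatedCrawler(scrapy.Spider):
    # hypothetical variant of the spider above with open-ended pagination
    name = 'symphony_paginated'
    custom_settings = {'DOWNLOAD_DELAY': 5}

    def start_requests(self):
        # begin at page 1; parse() decides whether to request the next page
        yield scrapy.Request(
            url='https://www.theshell.org/performances/list/?page=1',
            callback=self.parse,
            cb_kwargs={'page': 1},
        )

    def parse(self, response, page):
        links = response.xpath('//*[@class="content"]/h2/a/@href').getall()
        for link in links:
            yield response.follow(link, callback=self.parse_item)
        # stop once a page yields no links (assumed to mean the listing is exhausted)
        if links:
            yield scrapy.Request(
                url=f'https://www.theshell.org/performances/list/?page={page + 1}',
                callback=self.parse,
                cb_kwargs={'page': page + 1},
            )

    def parse_item(self, response):
        yield {
            'title': response.xpath('//*[@class="info"]/h1/text()').get(),
            'url': response.url,
        }

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(myPaginatedCrawler)
    process.start()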
