Scrapy-playwright scraper does not return 'page' or 'playwright_page' in response.meta

Asked by eimct9ow on 2023-04-12

I'm stuck on the scraper portion of my project. I keep chasing down one error after another, and my latest approach at least doesn't crash and burn. However, for whatever reason, the response.meta I get back does not include the Playwright page.
Hardware/setup:

  • Intel-based MacBook Pro running Monterey v12.6.4
  • Python 3.11.2
  • pipenv environment
  • All packages updated to their latest stable versions

The functionality I'm after is fairly simple: scrape results from Google. However, I need to automate it, ideally with a headless browser, and to be able to pass in some user-defined parameters, including the URL and how many results to scrape before stopping.
Here is the main part of my scraper, namely the imports and the spider definition:

from scrapy.crawler import CrawlerProcess
import scrapy

class GoogleSpider(scrapy.Spider):
    name = 'google_spider'
    allowed_domains = ['www.google.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': False,
        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
        'MIDDLEWARES': {
            'scrapy_playwright.middleware.PlaywrightMiddleware': 800,
        },
    }

    def __init__(self, domain, stop, user_agent, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain = domain
        self.stop = int(stop)
        self.custom_settings['USER_AGENT'] = user_agent
        self.start_urls = [f'https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3A{self.domain}%2F%2A+after%3A2023-03-27']
        self.urls_collected = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super().from_crawler(crawler, *args, **kwargs)

    def start_requests(self):
        yield scrapy.Request(self.start_urls[0], meta={"playwright": True,
                                                       "playwright_include_page": True})

    async def parse(self, response):
        print(f"\n\nRESPONSE STATUS: {response.status}, RESPONSE URL: {response.url}\n\n")
        print(f"RESPONSE META KEYS: {response.meta.keys()}\n\n")
        page = response.meta['page']
        current_urls_length = 0

        while True:
            locator = page.locator('.yuRUbf>a')
            urls = await locator.evaluate_all('nodes => nodes.map(n => n.href)')
            
            new_urls = [url for url in urls if self.domain in url and url not in self.urls_collected]

            self.urls_collected.extend(new_urls)

            if len(self.urls_collected) >= self.stop:
                self.urls_collected = self.urls_collected[:self.stop]
                break

            if len(urls) > current_urls_length:
                current_urls_length = len(urls)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.waitForTimeout(1000)
            else:
                break

        self.logger.info(f'Collected {len(self.urls_collected)} URLs:')
        for url in self.urls_collected:
            self.logger.info(url)

And the latest runner script:

from scrapy.crawler import CrawlerProcess
from spiders.googlespider import GoogleSpider

def main(domain, stop, user_agent):
    process = CrawlerProcess()
    process.crawl(GoogleSpider, domain=domain, stop=stop, user_agent=user_agent)
    process.start()

if __name__ == '__main__':
    domain = 'jobs.lever.co'
    stop = 25
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    user_agent2 = "Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00"
    user_agent3 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"
    main(domain=domain, stop=stop, user_agent=user_agent3)

And the logs:

2023-04-07 09:01:17 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2023-04-07 09:01:17 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.4, cssselect 1.2.0, parsel 1.7.0, w3lib 2.1.1, Twisted 22.10.0, Python 3.11.2 (v3.11.2:878ead1ac1, Feb  7 2023, 10:02:41) [Clang 13.0.0 (clang-1300.0.29.30)], pyOpenSSL 23.1.1 (OpenSSL 3.1.0 14 Mar 2023), cryptography 40.0.1, Platform macOS-12.6.4-x86_64-i386-64bit
2023-04-07 09:01:17 [scrapy.crawler] INFO: Overridden settings:
{'CONCURRENT_REQUESTS': 1, 'COOKIES_ENABLED': False, 'DOWNLOAD_DELAY': 3}
2023-04-07 09:01:17 [py.warnings] WARNING: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/scrapy/utils/request.py:232: ScrapyDeprecationWarning: '2.6' is a deprecated value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting.

It is also the default value. In other words, it is normal to get this warning if you have not defined a value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting. This is so for backward compatibility reasons, but it will change in a future version of Scrapy.

See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-04-07 09:01:17 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-04-07 09:01:17 [scrapy.extensions.telnet] INFO: Telnet Password: f1350e3a3455ff22
2023-04-07 09:01:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2023-04-07 09:01:18 [scrapy.core.engine] INFO: Spider opened
2023-04-07 09:01:18 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2023-04-07 09:01:18 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024
2023-04-07 09:01:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27> (referer: None)

RESPONSE STATUS: 200, RESPONSE URL: https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27

RESPONSE META KEYS: dict_keys(['playwright', 'playwright_include_page', 'download_timeout', 'download_slot', 'download_latency'])

2023-04-07 09:01:18 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27> (referer: None)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/twisted/internet/defer.py", line 1697, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "/Users/reesh/Projects/qj/app/gs/gs/spiders/googlespider.py", line 37, in parse
    page = response.meta['page']
KeyError: 'page'
2023-04-07 09:01:19 [scrapy.core.engine] INFO: Closing spider (finished)
2023-04-07 09:01:19 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 507,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 17104,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 0.874591,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 4, 7, 16, 1, 19, 103146),
 'httpcompression/response_bytes': 53816,
 'httpcompression/response_count': 1,
 'log_count/DEBUG': 2,
 'log_count/ERROR': 1,
 'log_count/INFO': 10,
 'log_count/WARNING': 1,
 'memusage/max': 61571072,
 'memusage/startup': 61571072,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'spider_exceptions/KeyError': 1,
 'start_time': datetime.datetime(2023, 4, 7, 16, 1, 18, 228555)}
2023-04-07 09:01:19 [scrapy.core.engine] INFO: Spider closed (finished)

So response.meta has no 'playwright_page' or 'page' entry at all, and that's where my spider stops working. In fact, I'm not sure anything after that line even works.
To be honest, I'm not married to scrapy-playwright; it's just the first solution I found for handling Google's new infinite-scroll interface. I really don't mind going back to the drawing board and starting over, as long as my scraper works as intended.
Please weigh in; I'm open to any and all suggestions!

Answer from nfg76nw0:

What gets displayed in your regular browser is not always the same as what you receive when using a headless browser.
When in doubt, it is best to write the entire contents of the page to an html file and then inspect it with a code editor or a browser, so you can see exactly what page you actually received in the response object.
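
For example, here is a minimal sketch of that debugging step (the filename debug_page.html is arbitrary, and it assumes the request was made with "playwright_include_page": True so that the page object is present in meta):

async def parse(self, response):
    # The live Playwright page object that scrapy-playwright puts into meta
    page = response.meta["playwright_page"]
    # page.content() returns the HTML exactly as the headless browser rendered it
    html = await page.content()
    with open("debug_page.html", "w", encoding="utf-8") as f:
        f.write(html)
    # Close the page when done with it to avoid leaking browser contexts
    await page.close()
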
The first thing is that your custom settings need adjusting. Scrapy has no 'MIDDLEWARES' setting (only 'DOWNLOADER_MIDDLEWARES' and 'SPIDER_MIDDLEWARES'), so your middleware entry is silently ignored and every request goes through Scrapy's default downloader; that is exactly why the playwright keys never make it into response.meta.
When using scrapy-playwright, you need to install the http and https download handlers, like so:

custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': False,
        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
    }
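
One more setup note: Playwright's browser binaries must also be installed once up front (playwright install chromium), or the chromium browser type will fail to launch.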

You also need to change the key you look up in response.meta: it is 'playwright_page', not 'page'. That should fix the problem of not receiving the page.
Finally, if you follow my first piece of advice, you will discover that your html selectors probably don't exist in the actual page the headless browser receives. In my case there was no infinite scroll at all; instead, each page had a "Next" link at the bottom that needed to be clicked, and none of the class selectors matched what the regular browser shows.
The example below works for me. It may not work for you as-is, but with the process described above you should be able to get the results you want.
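
In your parse callback, that one-line change looks like:

page = response.meta["playwright_page"]  # instead of response.meta['page']
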

import re
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector

class GoogleSpider(scrapy.Spider):
    name = 'google_spider'
    allowed_domains = ['www.google.com']
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
    }

    def __init__(self, domain, stop, user_agent, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain = domain
        self.stop = int(stop)
        self.custom_settings['USER_AGENT'] = user_agent
        self.start_urls = [f'https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3A{self.domain}%2F%2A+after%3A2023-03-27']
        self.urls_collected = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super().from_crawler(crawler, *args, **kwargs)

    def start_requests(self):
        yield scrapy.Request(self.start_urls[0], meta={"playwright": True,
                                                       "playwright_include_page": True})

    async def get_page_info(self, page):
        # Scroll ten viewport-heights down the page to trigger lazy loading
        for i in range(10):
            val = page.viewport_size["height"]
            await page.mouse.wheel(0, val)
            await page.wait_for_timeout(1000)
        # Feed the rendered HTML into a Scrapy Selector and extract result links
        text = await page.content()
        selector = Selector(text=text)
        urls = []
        for row in selector.xpath("//div[contains(@class, 'kCrYT')]"):
            text = row.xpath(".//h3//text()").get()
            url = row.xpath(".//a/@href").get()
            if url:
                urls.append({text: url})
                print(urls)
        self.urls_collected += urls
        return urls
    
    
    async def parse(self, response):
        page = response.meta['playwright_page']
        urls = await self.get_page_info(page)
        found = True
        while found:
            try:
                # Click the "Next" link at the bottom of each results page
                element = page.get_by_text("Next")
                print(element, "parsing next page")
                await element.click()
                more_urls = await self.get_page_info(page)
                urls += more_urls
            except Exception:
                # No "Next" link left: we are on the last page of results
                found = False
        return urls
 

def main(domain, stop, user_agent):
    process = CrawlerProcess()
    process.crawl(GoogleSpider, domain=domain, stop=stop, user_agent=user_agent)
    process.start()

if __name__ == '__main__':
    domain = 'jobs.lever.co'
    stop = 25
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    user_agent2 = "Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00"
    user_agent3 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"
    main(domain=domain, stop=stop, user_agent=user_agent3)
