Scrapy / Python / Playwright: async does not process all scraped pages

yh2wf1be · posted 2022-11-09 in Python
Follow (0) | Answers (1) | Views (196)

I am scraping and parsing JavaScript pages with Playwright. There are about 100 URLs, but the process ends before it has handled all of them.
What could be the cause? The code itself currently runs without errors.
Is the `for` loop in the wrong place?
I would also appreciate it if you could tell me whether I am using async incorrectly.

**Updated to the current code.** It is run with the following command: `scrapy runspider kuti_info.py`

import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio

class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['xxxxxxx.jp']
    start_urls = ['https://xxxxxxx.jp/']

    def parse(self, response):
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        urls = response.xpath('//div[@class="salonName"]')
        for url in urls:

            yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    async def parse_shop(self, response):
        try:
            r = requests.get(response.url)
            soup = BeautifulSoup(r.text, 'html.parser')
            repo = soup.find('div', {'class': 'abbr uTxt'})
        except:
            pass

        urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
        for url in [urls]:
            newurls = response.urljoin(url)  # href="/therapistlist.php?id=!!!!"
            yield response.follow(url=newurls, callback=self.parse_therapist)

        # yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)
        try:
            yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            'report': repo.text
            }
        except:
            pass

    async def parse_therapist(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            idurls =  selector.xpath('//li[@therapist_id]/a/@href').get()
            # browser.close()
            yield response.follow(url=idurls, callback=self.parse_thera_page)

    async def parse_thera_page(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            print(response.url)
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))
    # try:
    #     r = requests.get(response.url)
    #     soup = BeautifulSoup(r.text, 'html.parser')
    #     repo = soup.find('div', {'class': 'txt'})
    # except:
    #     pass
            yield {
            'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
            # 'report': repo.text
            }
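
For what it's worth, `requests.get()` and `sync_playwright()` are blocking calls, and running them inside `async def` callbacks stalls Scrapy's event loop, which is one plausible reason the crawl ends before all URLs are handled. Below is a minimal sketch (an illustration, not the accepted fix) of a non-blocking `parse_therapist` using Playwright's async API; it assumes Playwright is installed and Scrapy's asyncio reactor is enabled, and the selectors are copied from the code above:

from playwright.async_api import async_playwright
from scrapy.selector import Selector

# settings.py must enable the asyncio reactor so callbacks can await:
# TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# inside KutiSpider:
async def parse_therapist(self, response):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(response.url)   # waits for the page load by default
        html = await page.content()     # HTML after JavaScript has run
        await browser.close()
    selector = Selector(text=html)
    idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
    if idurls:
        yield response.follow(url=idurls, callback=self.parse_thera_page)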

2exbekwf (answer 1)

I see `.get()` in a few places, so it only takes the first item from a list, i.e. it gets only the first therapist.
I found that `therapistlist.php?id=...` uses JavaScript to read all the data as JSON from `therapistlist.php?id=...&more` (the same URL ending in `&more`) and render the page with it. This way I can read the therapist list as JSON data without Playwright, so I get results much faster.
I get about 800 therapists in one minute.
If you write the data in CSV format, you may run into another problem: in CSV all items must have the same columns. If Scrapy sees a `therapist_name` column in `{'therapist_name': ...}` that is not present in the shop data, it skips it, and you may end up with a file that contains only shops and no therapists. I added a `therapist_name` field to the shop data, and now the CSV saves the therapists as well.

import scrapy
from time import sleep
from scrapy.selector import Selector

class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['men-esthe.jp']
    start_urls = ['https://men-esthe.jp/']

    def parse(self, response):
        print('[parse] url:', response.url)

        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        print('[parse] len(urls):', len(urls), type(urls))

        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        print('[parse_area] url:', response.url)

        urls = response.xpath('//div[@class="salonName"]')
        print('[parse_area] len(urls):', len(urls), type(urls))

        for url in urls:
            url = url.xpath('.//h3/a/@href').get()
            yield response.follow(url, callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    def parse_shop(self, response):
        print('[parse_shop] url:', response.url)

        urls = response.xpath('//div[@class="viewMore"]/a/@href')
        print('[parse_shop] len(urls):', len(urls), type(urls))

        for url in urls.getall():
            print('[parse_shop] url:', url)
            yield response.follow(url=url + '&more', callback=self.parse_therapist)

        yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            'report': response.css('div.abbr.uTxt::text').get(),
            'therapist_name': "",
        }

    def parse_therapist(self, response):
        print('[parse_therapist] url:', response.url)

        data = response.json()

        for item in data:
            url = '/therapist.php?id=' + item['id']
            yield response.follow(url=url, callback=self.parse_thera_page)

    def parse_thera_page(self, response):
        print('[parse_thera_page] url:', response.url)

        print('now:', response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

        yield {
            'shop_name': '',
            'shop_url': '',
            'area': '',
            'report-therapi-name': '',
            'report': '',
            'therapist_name': response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
        }
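
As a follow-up note: instead of padding every item with empty fields, the CSV columns can also be pinned with Scrapy's `FEED_EXPORT_FIELDS` setting; a minimal sketch (the setting is standard Scrapy, and the field list simply mirrors the items above):

class KutiSpider(scrapy.Spider):
    name = 'kuti'
    # Fix the CSV column set so that shop items and therapist items
    # with different keys still export into the same columns.
    custom_settings = {
        'FEED_EXPORT_FIELDS': [
            'shop_name', 'shop_url', 'area',
            'report-therapi-name', 'report', 'therapist_name',
        ],
    }

With this in place, `scrapy runspider kuti_info.py -o output.csv` writes both item types into one file with a stable header.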
