在Playwright中抓取和解析Javascript页面。大约有100个URL,但过程没有完成所有URL就结束了。
这可能是什么原因?目前代码工作正常。
for语法的位置是否错误?
如果你能告诉我我是否错误地使用了异步,我将不胜感激。
**更改为当前代码。**以下命令在Scrapy. scrapy runspider kuti_info.py中执行
import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio
class KutiSpider(scrapy.Spider):
name = 'kuti'
allowed_domains = ['xxxxxxx.jp']
start_urls = ['https://xxxxxxx.jp/']
def parse(self, response):
urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
yield response.follow(url=urls, callback=self.parse_area)
# urls = response.xpath('//ul[@class="areaList"]')
# for url in urls:
# yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)
def parse_area(self, response):
urls = response.xpath('//div[@class="salonName"]')
for url in urls:
yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)
# next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
# if next_page:
# yield response.follow(url=next_page, callback=self.parse_area)
async def parse_shop(self, response):
try:
r = requests.get(response.url)
soup = BeautifulSoup(r.text, 'html.parser')
repo = soup.find('div', {'class': 'abbr uTxt'})
except:
pass
urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
for url in [urls]:
newurls = response.urljoin(url) href="/therapistlist.php?id=!!!!"
yield response.follow(url=newurls, callback=self.parse_therapist)
# yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)
try:
yield {
'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
'shop_url': response.xpath('//dd/a/@href').get(),
'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
'report': repo.text
}
except:
pass
async def parse_therapist(self, response):
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto(response.url)
sleep(2)
html = page.content()
selector = Selector(text=html)
idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
# browser.close()
yield response.follow(url=idurls, callback=self.parse_thera_page)
async def parse_thera_page(self, response):
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
print(response.url)
page.goto(response.url)
sleep(2)
html = page.content()
selector = Selector(text=html)
print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))
# try:
# r = requests.get(response.url)
# soup = BeautifulSoup(r.text, 'html.parser')
# repo = soup.find('div', {'class': 'txt'})
# except:
# pass
yield {
'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
# 'report': repo.text
}
1条答案
按热度按时间2exbekwf1#
我在一些地方看到
.get()
,所以它只得到列表中的第一个项目--即,它得到列表中的第一个治疗师。我发现
therapistlist.php?id=...
使用JavaScript从therapistlist.php?id=...&more
(以&more
结尾)读取所有数据作为JSON,并渲染页面。这样我就可以读取治疗师列表作为JSON数据,而不需要Playwright
,所以我得到结果的速度要快得多。我在1分钟内就能得到800个治疗师。
如果你用CSV格式写数据,那么你可能会遇到另一个问题。在CSV格式中,所有项目必须有相同的列-如果
Scrapy
看到{'therapist_name': ...}
有therapist_name
列,而shop data
中没有,那么它会跳过它-你可能只得到没有治疗师的商店文件。我在商店数据中添加了therapist_name
字段,现在CSV也保存治疗师。