import requests
import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup
class TestSpider(scrapy.Spider):
    """Crawl the lawyer listing page and extract each profile's status.

    ``parse`` follows every profile link found on the start listing page;
    ``parse_book`` reads the "Status:" field from each profile page.
    """

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        # Throttle the crawl: one request at a time per domain, with a
        # one-second delay between consecutive requests.
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Schedule a request for every profile link on the listing page.

        Yields:
            Request: one request per link, handled by ``parse_book``.
        """
        profile_links = response.xpath("//td[@class='icon_link']//a//@href").getall()
        for link in profile_links:
            # Links may be relative; resolve them against the listing URL.
            yield Request(response.urljoin(link), callback=self.parse_book)

    def parse_book(self, response):
        """Yield the "Status:" value found on a single profile page.

        The lookup is made relative to each ``line_list_K`` block (note the
        leading ``.``): an XPath that starts with ``//`` is evaluated against
        the whole document, no matter which sub-selector it is called on.

        NOTE(review): assumes the value lives in the ``<div>`` that directly
        follows the "Status:" ``<span>`` as a sibling -- confirm against the
        page markup.

        Yields:
            dict: ``{'url': <page url>, 'status': <status text>}`` for every
            block that contains a non-empty status value.
        """
        for block in response.xpath("//div[@class='line_list_K']"):
            # normalize-space() collapses surrounding whitespace and returns
            # an empty string when the node is missing.
            status = block.xpath(
                "normalize-space("
                ".//span[contains(text(), 'Status:')]/following-sibling::div[1]"
                ")"
            ).get()
            if status:
                yield {'url': response.url, 'status': status}
我想从页面的 Status 和 Email 字段获取数据,但上面的代码没有返回任何结果。页面链接:https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965
2条答案
按热度按时间mnemlml81#
请尝试:
此xpath表达式将帮助您提取所有5行的所有数据,如
hpcdzsge2#
我将在不使用 Scrapy 的情况下展示一个例子,希望你能理解并将它应用到你的代码中。唯一的难点是 email 由元素属性中的两部分组成。
输出: