import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess
class TestSpider(scrapy.Spider):
    """Follow the detail links on the start page and print each status value.

    ``parse`` schedules one request per link found in the
    ``td.icon_link`` cells of the listing page; ``parse_book`` handles each
    detail page and prints the text shown next to its ``Status:`` label.
    """

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    # Per-spider overrides: a single concurrent request per domain, a
    # download delay of 1, and a desktop-browser User-Agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Yield a ``Request`` for every detail link on the listing page.

        Each href is resolved against the response URL and handed to
        ``parse_book``.
        """
        hrefs = response.xpath("//td[@class='icon_link']//a//@href").getall()
        for href in hrefs:
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Print the value displayed next to the ``Status:`` label.

        One line is printed for every ``<span>`` under ``line_list_K`` whose
        markup contains ``Status:``; ``None`` is printed when no non-blank
        text is found next to that label.
        """
        labels = response.xpath("//*[@class='line_list_K']//div//span")
        for label in labels:
            if 'Status:' not in label.get():
                continue
            # Query relative to the matched label. The previous absolute
            # "//div[i+1]//text()" selected an unrelated <div> elsewhere in
            # the document, so .get() returned empty text.
            # NOTE(review): assumes the value sits in the <div> that follows
            # the label <span> -- TODO confirm against the page markup.
            texts = label.xpath("following-sibling::div[1]//text()").getall()
            # Skip whitespace-only text nodes so indentation inside the
            # markup is not reported as the value.
            status = next((t.strip() for t in texts if t.strip()), None)
            print(status)
我想获取 status 的值,但得到的输出却是空的。这是页面链接:https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
2条答案
按热度按时间kmbjn2e31#
为什么不通过 text 更精确地选中目标元素,再从它的下一个同级节点中获取文本呢?示例:http://xpather.com/ZUWI58a4
要获取电子邮件,请执行以下操作:
wmtdaxz32#
您的 d2 XPath 没有指向正确的 div。下面这样写应该可行: