scrapy 给出Xpath的无效表达式错误

xxhby3vn  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(136)

运行时它会报“无效的 XPath 表达式”错误。我正在尝试从下面这样的详情页抓取电子邮件地址:https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    """Spider that walks a listing page and tries to scrape an email per detail page.

    NOTE: this is the version from the question. The XPath in ``parse_book``
    is malformed and is reported by the author to fail with an
    invalid-expression error.
    """

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    # Per-spider settings: one concurrent request per domain, a download
    # delay of 1, and a desktop Chrome user-agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }

    def parse(self, response):
        """Yield a request for every link found inside ``td.icon_link`` cells.

        Each extracted href is resolved against the response URL and scheduled
        with ``parse_book`` as its callback.
        """
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            # hrefs may be relative, so resolve them against the current page.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Attempt to build the email from the ``data-ea``/``data-eb`` attributes.

        The intent is to join the two attributes of the div following the
        'Email' span with an '@'.
        """

        # BUG (subject of the question): the expression has an opening "("
        # before concat that is never closed, and it uses concat(...) as a
        # step after "/" in a location path. The value is also passed on as
        # returned by response.xpath(), without .get()/.extract().
        data=response.xpath("//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb)")

        yield{
            'email':data

        }
vngu2lb8

vngu2lb81#

正如您所说,您的 XPath 是错误的:表达式中的括号不匹配,而且 concat() 不能作为路径中 “/” 之后的一步出现。应当把 concat() 放在表达式的最外层,并在它的每个参数里写出完整的路径:

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    """Spider that walks a listing page and yields one email item per detail page."""

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    # Per-spider settings: one concurrent request per domain, a download
    # delay of 1, and a desktop Chrome user-agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Schedule ``parse_book`` for every link inside ``td.icon_link`` cells."""
        for href in response.xpath("//td[@class='icon_link']//a//@href").extract():
            # hrefs may be relative, so resolve them against the current page.
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield ``{'email': ...}`` built from the ``data-ea``/``data-eb`` attributes.

        The XPath joins the two attributes of the div following the 'Email'
        span with an '@'. A result of exactly '@' (presumably: both attributes
        were missing) is replaced by a placeholder string.
        """
        email = response.xpath("concat(//span[text()[contains(.,'Email')]]/following-sibling::div/@data-ea, '@',//span[text()[contains(.,'Email')]]/following-sibling::div/@data-eb)").get()
        yield {
            'email': 'No Email Address' if email == '@' else email
        }

(顺便说一下,如果你愿意,也可以不用 concat:分别取出 data-ea 和 data-eb 两个属性的值,然后在 Python 中用 “@” 把它们拼接起来。)

相关问题