如何使用scrapy从<script nonce>这种类型的标签中抓取数据

amrnrhlw  于 2023-08-05  发布在  其他
关注(0)|答案(1)|浏览(80)

我想从网站“https://pfchangsmexico.com.mx/ubicaciones/”中抓取位于xpath=/html/body/script[1]的每个餐厅的经纬度值。下面是我使用scrapy编写的代码,请问如何用它抓取这些值?

import json
from scrapy import Selector
import scrapy
import re

class TrySpider(scrapy.Spider):
    """Collect restaurant names, addresses and coordinates from pfchangsmexico.com.mx.

    The spider starts at the site's index page, follows the first link found
    inside a ``div`` and hands that page to :meth:`parse_info`, which yields a
    single item holding every name, address and coordinate pair it can find.
    """

    name = "try"
    allowed_domains = ["pfchangsmexico.com.mx"]
    start_urls = ["https://pfchangsmexico.com.mx/index.html"]

    # Markup of the "home delivery" label that also matches the address XPath
    # and must not be reported as an address.
    _DELIVERY_LABEL = '<span style="font-family: Avenir;">Servicio a domicilio:</span>'

    def parse(self, response):
        """Follow the first ``div a`` link of the start page.

        Yields:
            A request for the linked page, handled by :meth:`parse_info`.
            Nothing is yielded when the page contains no such link.
        """
        location_page = response.css('div a::attr(href)').get()
        if not location_page:
            # Building a Request from a missing href would raise; report and stop.
            self.logger.warning("No location link found on %s", response.url)
            return
        # response.follow resolves relative hrefs against the current page,
        # whereas scrapy.Request rejects URLs without a scheme.
        yield response.follow(location_page, callback=self.parse_info)

    def parse_info(self, response):
        """Extract restaurant names, addresses and coordinates from a page.

        Yields:
            One dict with the keys ``name`` (list of restaurant names),
            ``address`` (list of tag-stripped address strings) and
            ``coordinates`` (list of ``{'latitude', 'longitude'}`` dicts whose
            values are the matched strings).
        """
        restaurant_names = []
        for location in response.css('div.wpb_wrapper p span::text').getall():
            match = re.search(r'(P\.F\. Chang\’s)(.*)', location)
            if match:
                # The two groups are adjacent, so the full match is their concatenation.
                restaurant_names.append(match.group(0))
        self.logger.debug("Found %d restaurant names: %s",
                          len(restaurant_names), restaurant_names)

        addresses = [
            re.sub(r'<.*?>', '', address.replace('</span>', ''))
            for address in response.xpath('//div/p/span[2]').getall()
            if address != self._DELIVERY_LABEL
        ]
        self.logger.debug("Found %d addresses: %s", len(addresses), addresses)

        # Prefer the text of the <script nonce> element; when the page does not
        # contain it, fall back to searching the whole document.
        script_content = response.xpath(
            '//html/body/script[1][contains(text(), "onEmbedLoad")]/text()').get()
        coordinates = self._extract_coordinates(script_content or response.text)

        if coordinates:
            self.logger.info("Found %d coordinate pairs", len(coordinates))
        else:
            self.logger.warning("Latitude and longitude not found.")

        yield {
            'name': restaurant_names,
            'address': addresses,
            'coordinates': coordinates,
        }

    @staticmethod
    def _extract_coordinates(text):
        """Return every latitude/longitude pair found in *text*.

        Values are paired by order of appearance.
        NOTE(review): this assumes each ``latitude`` entry has a matching
        ``longitude`` entry in the same order; confirm against the real payload.
        """
        # Accept an optional sign on both values so the patterns are not tied
        # to one hemisphere.
        latitudes = re.findall(r'latitude":(-?\d+\.\d+)', text)
        longitudes = re.findall(r'longitude":(-?\d+\.\d+)', text)
        return [
            {'latitude': latitude, 'longitude': longitude}
            for latitude, longitude in zip(latitudes, longitudes)
        ]
# /html/body/script[1]

字符串
这是我试过的代码,但用它无法正确抓取纬度和经度(该网站的其他数据可以正常抓取)。请提供能够正确抓取网站 https://pfchangsmexico.com.mx/ubicaciones/ 上每一个餐厅纬度和经度的代码。

zbsbpyhn

zbsbpyhn1#

页面中不存在与 r'latitude":(\d+\.\d+)' 匹配的文本。此外,xpath('//html/body/script[1][contains(text(),"onEmbedLoad")]/text()') 指向的内容位于iframe中,所以我怀疑你无法直接访问它。您可能必须使用scrapy-selenium,并调用 driver.switch_to.frame(frameElement)。
之后你可以这样做。

place_name = driver.find_element(By.CSS_SELECTOR, 'div.place-name').get_attribute('innerText')
# A Selenium locator must resolve to an element, so the XPath selects the
# <script> element itself (no trailing /text()) and its text is read through
# the innerText attribute.
script_content = driver.find_element(
    By.XPATH, '//html/body/script[1][contains(text(), "onEmbedLoad")]').get_attribute('innerText')

# match for 'P. F. Chang's Plaza Carso, C. Lago Zurich 245, Amp Granada, Miguel Hidalgo, 11520 Ciudad de México, CDMX",[19.441211,-99.204493],'
# The place name is escaped so characters such as '.' are matched literally,
# and the pattern is a raw f-string so the regex escapes reach the re module intact.
matches = re.findall(rf"{re.escape(place_name)}[^\[]*(\[[\d.,-]+\])", script_content)
# None signals that no coordinates followed the place name in the script text.
location = matches[0] if matches else None

print(location) # String [19.441211,-99.204493]

字符串

相关问题