Scraping an AJAX page with Scrapy

axr492tv · asked 2022-11-09

I don't know how to scrape AJAX pages. This site has no pagination; more results are loaded by clicking a load more button. This is the page link: https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false

import scrapy
from scrapy.http import Request
from scrapy_selenium import SeleniumRequest


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        # Render the gallery through Selenium so the AJAX-loaded cards are present
        yield SeleniumRequest(
            url="https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True,
        )

    def parse(self, response):
        # The card title's class attribute contains embedded newlines, so match a stable substring
        books = response.xpath("//h3[contains(@class, 'card-Title')]//a/@href").getall()

        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.css(".mr3-m::text").get()

        address = response.css(".showcase-address::text").get(default='').strip()

        website = response.xpath("//li[@class='dib  ml3  mr3']//a[starts-with(@href, 'http')]/@href").get(default='').strip()

        phone = response.xpath("//li[@class='dib  ml3  mr3']//span[contains(text(), 'Phone:')]/following-sibling::text()").get(default='')
        phone = phone.strip().replace("-", "")

        yield {
            'title': title,
            'address': address,
            'website': website,
            'phone': phone,
        }
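
Note that SeleniumRequest does nothing unless scrapy-selenium's downloader middleware is enabled. A minimal settings.py sketch, assuming Chrome with a chromedriver on the PATH (the driver name and path here are assumptions, adjust them to your setup):

# settings.py
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'                        # assumed browser
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')  # assumes chromedriver is on PATH
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}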
u0sqgete · answer 1

OK, try the following script: it iterates over the whole exhibitor list and grabs all the fields you want from it:

import scrapy
from scrapy.selector import Selector


class MapYourShowSpider(scrapy.Spider):
    name = "mapyourshow"

    # JSON endpoint that the "load more" button calls behind the scenes
    content_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
    inner_base = 'https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={}'

    headers = {
        'x-requested-with': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': '557',   # ask for every exhibitor in one response
        'start': '0',
    }

    def start_requests(self):
        yield scrapy.FormRequest(
            url=self.content_url,
            method='GET',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )

    def parse(self, response):
        for item in response.json()['DATA']['results']['exhibitor']['hit']:
            inner_link = self.inner_base.format(item['fields']['exhid_l'])
            yield scrapy.Request(
                url=inner_link,
                headers=self.headers,
                callback=self.parse_content,
            )

    def parse_content(self, response):
        # The detail endpoint also answers XHR requests with JSON; the page markup sits in DATA.BODYHTML
        elem = response.json()['DATA']['BODYHTML']
        sel = Selector(text=elem)
        title = sel.css("h2::text").get()
        address = ' '.join(' '.join(i.split()) for i in sel.css("p.showcase-address::text").getall())
        website = sel.css("a[title*='website']::text").get()
        phone = sel.xpath("normalize-space(//*[starts-with(@class,'showcase-web-phone')]/li[./*[.='Phone:']]/span/following::text())").get()
        yield {"title": title, "address": address, "website": website, "phone": phone}
liwlm1x9 · answer 2

I didn't use your code and did it my own way instead (since I'm not a big fan of Selenium), but I hope this helps:

import requests
import time
from bs4 import BeautifulSoup

headers = {
    'x-requested-with': 'XMLHttpRequest',  # makes the endpoint answer with JSON
}

params = {
    'action': 'search',
    'searchtype': 'exhibitorgallery',
    'searchsize': '200',  # don't increase this too much (bump the start parameter instead and send a new request after some delay)
    'start': '0',
}

response = requests.get('https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm', params=params, headers=headers)
data = response.json()

# Build the detail-page URL for every exhibitor id in the search result
all_sites = []
for exs in data["DATA"]["results"]["exhibitor"]["hit"]:
    exh_id = exs["fields"]["exhid_l"]
    site = f"https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={exh_id}"
    all_sites.append(site)

for site in all_sites:
    response = requests.get(site)
    soup = BeautifulSoup(response.text, "html.parser")

    info_box = soup.find("div", {"id": "showroomContentDiv"})
    title = info_box.find("section", {"id": "scroll-description"}).text.strip().split("\n")[0][6:]  # slice off the fixed label prefix
    address = " ".join(info_box.find("p", {"class": "showcase-address"}).text.strip().split())
    website = info_box.find("ul", {"class": "showcase-web-phone"}).find_all("li")[0].text.strip()
    phone = info_box.find("ul", {"class": "showcase-web-phone"}).find_all("li")[1].text[7:].strip()  # drop the leading "Phone: "

    print(title)
    print(address)
    print(website)
    print(phone)

    # delay so you don't create too much traffic
    time.sleep(1)
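
To walk the whole list the way the searchsize comment above suggests, you can step the start parameter in batches; a sketch reusing the params and headers from this script (the total of 557 exhibitors is taken from the other answer and is an assumption here):

total = 557                            # assumed total hit count; better to read it from the JSON if the endpoint reports it
page_size = int(params['searchsize'])  # 200 per batch

for start in range(0, total, page_size):
    params['start'] = str(start)
    batch = requests.get('https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm', params=params, headers=headers).json()
    # ...collect exhibitor ids from batch exactly as above...
    time.sleep(1)  # pause between batches to keep traffic low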
