Scrapy 只抓取了第 1 页,没有继续抓取后面的多页

ef1yzkbh  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(118)

我的爬虫只抓取了第 1 页的数据,不会翻到第二页。我尝试了几种不同的方法,但都没有解决这个问题,希望有人能提供解决方案。目标页面的 URL 是:https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx

import scrapy
from scrapy.http import Request
from selenium import webdriver

class TestSpider(scrapy.Spider):
    """Scrape lawyer detail pages linked from the ifep.ro lawyers panel.

    ``parse`` follows every link in the list group on the panel page;
    ``parse_book`` extracts the title and the first four description
    paragraphs from each detail page.
    """

    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def __init__(self, *args, **kwargs):
        # Forward Scrapy's spider arguments so crawler wiring keeps working.
        super().__init__(*args, **kwargs)
        # Raw string: in a plain string '\P' is an invalid escape sequence.
        self.driver = webdriver.Chrome(
            r'C:\Program Files (x86)\chromedriver.exe')

    def parse(self, response):
        """Follow each list-group link to its detail page."""
        links = response.xpath("//div[@class='list-group']//@href").extract()
        for href in links:
            url = response.urljoin(href)
            # Skip links that only point back to the site root.
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield title plus the first four description paragraphs."""
        # NOTE: the original re-loaded the panel page into the Selenium
        # driver here on every item; that did nothing for this response
        # and has been removed.
        def _text(xpath):
            # .get() returns None when the node is absent; guard strip()
            # so a missing paragraph does not raise AttributeError.
            value = response.xpath(xpath).get()
            return value.strip() if value is not None else None

        title = response.xpath(
            "//span[@id='HeadingContent_lblTitle']//text()").get()

        yield {
            "title1": title,
            "title2": _text("//div[@class='col-md-10']//p[1]//text()"),
            "title3": _text("//div[@class='col-md-10']//p[2]//text()"),
            "title4": _text("//div[@class='col-md-10']//p[3]//span//text()"),
            "title5": _text("//div[@class='col-md-10']//p[4]//text()"),
        }

    max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
    )

    max_page = int(max_page_el.text.split("din").pop().split(")")[0])

    # test with smaller number 
    for i in range(1, 4):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # scrap here
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, f"list-group"))
    )
    # just an example
    print(elements)
    driver.quit()

上面的代码仍然只输出了第 1 页的结果:

2admgd59

2admgd591#

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Setup your driver, open the page, etc.

# Pager label text ends like "... din N)"; N is the total page count.
max_page_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)

# You can use a regex here instead of the chained splits.
max_page = int(max_page_el.text.split("din").pop().split(")")[0])

# range() excludes its upper bound: use max_page + 1 so the last page is
# visited too — the original range(1, max_page) always skipped it.
for i in range(1, max_page + 1):
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
    ).click()
    # Scrape here.
    elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
    )
    # Just an example.
    print(elements)
driver.quit()

相关问题