我的爬虫只抓取了第1页的数据,没有翻到第二页。我尝试了不同的方法,但都没能解决这个问题。如果有任何解决方案,请提供给我。页面URL如下: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver
class TestSpider(scrapy.Spider):
    """Scrape the detail pages linked from the ifep.ro lawyers panel.

    ``parse`` follows every link found inside the ``list-group`` container of
    the downloaded start page, and ``parse_book`` extracts the heading plus
    the first four paragraphs of each detail page.  Only links present in the
    downloaded start page are followed; this class does no pagination.
    """

    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start a Chrome WebDriver session.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so spider arguments passed on the command line keep working.
        """
        super().__init__(*args, **kwargs)
        # Raw string: the Windows path contains backslashes that would
        # otherwise be read as (invalid) escape sequences.
        # NOTE(review): passing the driver path positionally is the Selenium 3
        # calling style -- confirm against the installed Selenium version.
        self.driver = webdriver.Chrome(
            r'C:\Program Files (x86)\chromedriver.exe')

    def closed(self, reason):
        """Quit the browser when the spider closes so Chrome is not leaked."""
        self.driver.quit()

    @staticmethod
    def _first_stripped(response, query):
        """Return the first match of *query* stripped, or None if no match."""
        text = response.xpath(query).get()
        return text.strip() if text is not None else None

    def parse(self, response):
        """Yield a request for every detail link on the listing page.

        Links that resolve to a bare ``.ro`` / ``.ro/`` URL are skipped.
        """
        links = response.xpath("//div[@class='list-group']//@href").extract()
        for link in links:
            url = response.urljoin(link)
            if url.endswith(('.ro', '.ro/')):
                continue
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Extract the title and four detail fields from one detail page.

        Yields a single dict with keys ``title1`` .. ``title5``.  A field
        whose XPath matches nothing is emitted as ``None`` instead of raising.
        """
        # NOTE(review): the page loaded in the browser here is never read by
        # this method; every field below comes from the Scrapy ``response``.
        # Kept to preserve existing behaviour -- confirm whether it is needed.
        self.driver.get(
            "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")
        # The title is intentionally left unstripped, as before.
        title = response.xpath(
            "//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = self._first_stripped(
            response, "//div[@class='col-md-10']//p[1]//text()")
        d2 = self._first_stripped(
            response, "//div[@class='col-md-10']//p[2]//text()")
        d3 = self._first_stripped(
            response, "//div[@class='col-md-10']//p[3]//span//text()")
        d4 = self._first_stripped(
            response, "//div[@class='col-md-10']//p[4]//text()")
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
# Walk the pager of the listing page with Selenium and collect the
# ``list-group`` elements of each page.
# NOTE(review): ``driver``, ``WebDriverWait``, ``EC`` and ``By`` are not
# defined or imported in this excerpt; presumably ``driver`` is a WebDriver
# that has already loaded the listing page -- confirm before running.
try:
    max_page_el = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
    )
    # The page count is the number between "din" and ")" in the pager label.
    max_page = int(max_page_el.text.split("din")[-1].split(")")[0])
    # test with smaller number, but never go past the last existing page
    last_page = min(max_page, 3)
    for i in range(1, last_page + 1):
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
        ).click()
        # scrap here
        elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "list-group"))
        )
        # just an example
        print(elements)
finally:
    # Always close the browser, even when one of the waits times out.
    driver.quit()
它只给我输出了第1页的数据:
1条答案
按热度按时间2admgd591#