import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request
class NepaliSpider(CrawlSpider):
name = "nepalitimes"
allowed_domains = ["nepalitimes.com"]
# Start URL for the spider
start_urls = ['https://www.nepalitimes.com/news']
custom_settings = {
'FEED_FORMAT': 'csv',
'FEED_URI': 'nepali_times.csv'
# Rule to follow links to individual article pages
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
# Handling the load button using Selenium --- En cours de pulvérisation <3
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response, **kwargs):
# Parse the articles from the initial page
for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
relative_url = result.xpath("@href").extract_first()
absolute_url = response.urljoin(relative_url)
yield scrapy.Request(url=absolute_url, callback=self.parse_item)
# Check if there is a "Load More" button
load_more_button = response.xpath(".//button[contains(@class, 'btn btn--load center') and contains(., 'load more')]")
if load_more_button:
print("Load more button detected")
tenant_code = "epz639"
routeId = 8
limit = 10
offset = 10
# Prepare the data payload for the POST request
data = {
"query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
"variables": {
"tenant_code": tenant_code,
"routeId": routeId,
"limit": limit,
"offset": offset
# Send a POST request to the endpoint using scrapy.FormRequest
yield scrapy.FormRequest(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
formdata={"query": json.dumps(data["query"]), "variables": json.dumps(data["variables"])},
headers={"Content-Type": "application/json"},
print("Post resquest sent")
def parse_ajax_response(self, response):
if 'data' in json_response and 'articles' in json_response['data']:
articles = json_response['data']['articles']
print("Articles :", articles)
for article in articles:
# Assuming there's an 'slug' field in the response representing the article slug
article_slug = article['slug']
article_url = f"https://www.nepalitimes.com/news/{article_slug}" # Adjust this based on the actual URL structure
yield scrapy.Request(url=article_url, callback=self.parse_item)
def parse_item(self, response):
# This function should extract the article information from the provided response
# and yield the scraped data as a dictionary
# Extract article information using XPath selectors
title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
category = response.xpath(".//a[contains(@class,'active')]/text()").get()
url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()
# Parse the HTML content
content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
cleaned_content = ' '.join(text_content)
yield {
'title': title,
'subtitle': subtitle,
'author': author,
'date': date,
'content': cleaned_content,
'category': category,
'URL': url
好吧,所以我尝试了@Leandro的建议,也就是说,使用chrom devtools而不是Selenium,但它似乎无法启动def parse_ AJAX 函数。但它仍然没有给出我想要的结果(只有9个项目被废弃)。我需要帮助
以下是我点击“加载按钮”时得到的内容:x1c 0d1x和
import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request
class NepaliSpider(CrawlSpider):
name = "nepalitimes"
allowed_domains = ["nepalitimes.com"]
# Start URL for the spider
start_urls = ['https://www.nepalitimes.com/news']
custom_settings = {
'FEED_FORMAT': 'csv',
'FEED_URI': 'nepali_times.csv'
# Rule to follow links to individual article pages
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
# Handling the load button using Selenium --- En cours de pulvérisation <3
def parse(self, response, **kwargs):
# Parse the articles from the initial page
for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
relative_url = result.xpath("@href").extract_first()
absolute_url = response.urljoin(relative_url)
yield scrapy.Request(url=absolute_url, callback=self.parse_item)
# Fetch additional articles using GraphQL API with different offset values
tenant_code = "epz639"
routeId = 8
limit = 10
offset = 10
while True:
data = {
"query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
"variables": {
"tenant_code": tenant_code,
"routeId": routeId,
"limit": limit,
"offset": offset
yield scrapy.Request(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
headers={'Content-Type': 'application/json'},
offset += limit
def parse_ajax_response(self, response):
json_response = json.loads(response.text)
if 'items' in json_response:
articles = json_response['data']['items']
print("DAta found", articles)
for article in articles:
article_id = article['id']
article_url = f"https://www.nepalitimes.com/news/{article_id}"
yield scrapy.Request(url=article_url, callback=self.parse_item)
def parse_item(self, response):
# This function should extract the article information from the provided response
# and yield the scraped data as a dictionary
# Extract article information using XPath selectors
title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
category = response.xpath(".//a[contains(@class,'active')]/text()").get()
url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()
# Parse the HTML content
content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
cleaned_content = ' '.join(text_content)
yield {
'title': title,
'subtitle': subtitle,
'author': author,
'date': date,
'content': cleaned_content,
'category': category,
'URL': url
它确实加载了其他页面(不仅仅关注新闻页面),而且似乎没有考虑到def parse_ AJAX _response()函数...此外,它试图抓取https://archive.nepalitimes.com/news结构,但我不希望脚本这样做。
参数)。这样,你可以摆脱 selenium ,使你的刮刀更轻。我希望这对你有帮助:)
对于您的情况,它使用GraphQL API来查询更多对象。这个请求可能看起来有点可怕,但它说明了应该从服务器返回哪些数据:
方法中向scraper添加一个yield Request(...)
的属性,该属性具有您可以在Chrome Dev Tools上看到的字符串,以及一个variables
yield Request(...)
方法。响应带有一个"totalCount": 1321