I am currently trying to scrape articles from the NepaliTimes website. The challenge I am facing is that the site uses a "load more" button that has to be clicked to load additional articles. My scraper successfully retrieves the initial page containing the first six articles, but it fails to click the "load more" button to load the rest, so I cannot get anything beyond those first six articles.
In addition, during the crawl it keeps fetching URLs, but instead of returning the desired content it returns an "oops" page, which suggests a problem with Selenium and the button-click logic.
I would really appreciate it if someone could explain how to deal with this!
import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request


class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }

    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    # Handling the load button using Selenium --- work in progress <3
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Check if there is a "Load More" button
        load_more_button = response.xpath(".//button[contains(@class, 'btn btn--load center') and contains(., 'load more')]")
        if load_more_button:
            print("Load more button detected")
            tenant_code = "epz639"
            routeId = 8
            limit = 10
            offset = 10

            # Prepare the data payload for the POST request
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }

            # Send a POST request to the endpoint using scrapy.FormRequest
            yield scrapy.FormRequest(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                     formdata={"query": json.dumps(data["query"]), "variables": json.dumps(data["variables"])},
                                     headers={"Content-Type": "application/json"},
                                     callback=self.parse_ajax_response)
            print("POST request sent")

    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'data' in json_response and 'articles' in json_response['data']:
            articles = json_response['data']['articles']
            print("Articles:", articles)
            for article in articles:
                # Assuming there's a 'slug' field in the response representing the article slug
                article_slug = article['slug']
                article_url = f"https://www.nepalitimes.com/news/{article_slug}"  # Adjust this based on the actual URL structure
                yield scrapy.Request(url=article_url, callback=self.parse_item)

    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }
OK, so I tried @Leandro's suggestion, i.e. using Chrome DevTools instead of Selenium, but the parse_ajax_response function never seems to be triggered, and it still does not give me the results I want (only 9 items are scraped). I need help.
Here is what I get when I click the "load more" button: (screenshots omitted)
Here is the edited code:
import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request


class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }

    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    # Handling the load button using Selenium --- work in progress <3
    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Fetch additional articles using the GraphQL API with different offset values
        tenant_code = "epz639"
        routeId = 8
        limit = 10
        offset = 10

        while True:
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }
            yield scrapy.Request(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                 method='POST',
                                 body=json.dumps(data),
                                 headers={'Content-Type': 'application/json'},
                                 callback=self.parse_ajax_response)
            offset += limit

    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'items' in json_response:
            articles = json_response['data']['items']
            print("Data found", articles)
            for article in articles:
                article_id = article['id']
                article_url = f"https://www.nepalitimes.com/news/{article_id}"
                yield scrapy.Request(url=article_url, callback=self.parse_item)

    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }
It does load other pages (it does not just follow the news pages), and it does not seem to take the parse_ajax_response() function into account at all... On top of that, it tries to crawl the https://archive.nepalitimes.com/news structure, which I do not want the script to do.
1 Answer
I think the best approach is to look at which request is being made when the "load more" button is clicked. This can be done, for example, with the Network tab in Chrome DevTools. You can then schedule that same request in Scrapy after the first page has loaded. It will probably return a JSON-like structure that you can handle in a separate method (see the callback parameter of the Request object). That way you can get rid of Selenium and make your scraper much lighter. I hope this helps you :)
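In Scrapy terms, that means yielding the captured request yourself and pointing it at a separate callback. A bare-bones sketch of the pattern (the endpoint URL is the one already in the question's code; the actual payload string would have to be copied from DevTools, and the spider/method names here are just placeholders):

import json
import scrapy


class LoadMoreDemoSpider(scrapy.Spider):
    # Hypothetical minimal spider: replays the XHR fired by the "load more" button.
    name = "loadmore_demo"
    start_urls = ["https://www.nepalitimes.com/news"]

    def parse(self, response):
        # ...extract the articles already present in the HTML here...
        # Then schedule the request seen in the Network tab; no Selenium involved.
        payload = {"query": "<query string copied from DevTools>", "variables": {}}
        yield scrapy.Request(
            "https://nepalitimes-hasura.superdesk.org/v1/graphql",
            method="POST",
            body=json.dumps(payload),
            headers={"Content-Type": "application/json"},
            callback=self.parse_load_more,  # the JSON response is handled in its own method
        )

    def parse_load_more(self, response):
        data = json.loads(response.text)
        self.logger.info("load-more response: %s", data)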
In your case, the site uses a GraphQL API to query for more articles. The request may look a bit scary, but it states exactly which data should be returned by the server: (screenshot of the request from Chrome DevTools omitted)
If you look at the Response tab, you will see that the response looks like this: (screenshot of the response omitted)
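Since the screenshots are not available here, this is roughly the shape of the two payloads. The request body is reconstructed from the query already used in the question's code; for the response, only "totalCount": 1321 is taken from the answer, and the nesting and remaining field names are assumptions:

# Shape of the POST body sent to https://nepalitimes-hasura.superdesk.org/v1/graphql
request_body = {
    "query": ("query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) "
              "{ articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, "
              "limit: $limit, offset: $offset) { id } }"),
    "variables": {"tenant_code": "epz639", "routeId": 8, "limit": 10, "offset": 10},
}

# Rough shape of the JSON coming back (field names other than "totalCount" are assumptions;
# the real query captured in DevTools will request more fields than just `id`).
response_body = {
    "data": {
        "articles": [
            {"id": "article-id-1"},
            {"id": "article-id-2"},
            # ...up to `limit` items...
        ],
        "totalCount": 1321,
    }
}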
So, you should add a yield Request(...) to your scraper in the scrape method, mimicking the request shown in the image. Your request will have a body with a property called query, containing the string you can see in Chrome DevTools, and a variables parameter, which is a JSON object bound to the query's parameters. (The exact string being sent can be checked in the Payload tab by clicking "view source".) Given the limit and offset parameters, you may have to do this (yield Request(...)) several times, depending on how many pages you want to scrape. You can also check what happens when you hit the last page. Tip: you can use a parse_first_load_more method for the first request. The response comes with a "totalCount": 1321, which you can use to compute how many requests you will have to make. The following requests can then use a different callback, or you can use the meta parameter of the request to indicate that it is not the first one. The final result would look something like this (note that this is just example code):
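The answer's original snippet is not reproduced on this page; the following is a rough reconstruction of what it describes, assuming the GraphQL response exposes the article list under data.articles with an id per article plus a totalCount field, and that articles are reachable at /news/<id> (those details, and the helper names graphql_request, parse_first_load_more, parse_more_articles and handle_articles, are assumptions for illustration):

import json
import scrapy

GRAPHQL_URL = "https://nepalitimes-hasura.superdesk.org/v1/graphql"

# Query string taken from the question; the real one captured in DevTools
# will most likely request more fields than just `id`.
QUERY = ("query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) "
         "{ articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, "
         "limit: $limit, offset: $offset) { id } }")


class NepaliSpider(scrapy.Spider):
    name = "nepalitimes"
    # Both hosts are listed so the off-site middleware does not drop the GraphQL requests.
    allowed_domains = ["nepalitimes.com", "nepalitimes-hasura.superdesk.org"]
    start_urls = ["https://www.nepalitimes.com/news"]

    def graphql_request(self, offset, callback):
        # Build one "load more" request for a given offset.
        variables = {"tenant_code": "epz639", "routeId": 8, "limit": 10, "offset": offset}
        return scrapy.Request(
            GRAPHQL_URL,
            method="POST",
            body=json.dumps({"query": QUERY, "variables": variables}),
            headers={"Content-Type": "application/json"},
            callback=callback,
        )

    def parse(self, response):
        # Articles already present in the initial HTML.
        for href in response.xpath(".//div[contains(@class,'main--left')]/a/@href").getall():
            yield response.follow(href, callback=self.parse_item)
        # First "load more" request: also used to learn how many articles exist in total.
        yield self.graphql_request(offset=10, callback=self.parse_first_load_more)

    def parse_first_load_more(self, response):
        data = json.loads(response.text)["data"]
        total = data.get("totalCount", 0)  # assumption: totalCount sits next to the article list
        yield from self.handle_articles(data)
        # Schedule the remaining pages, 10 articles at a time.
        for offset in range(20, total, 10):
            yield self.graphql_request(offset=offset, callback=self.parse_more_articles)

    def parse_more_articles(self, response):
        yield from self.handle_articles(json.loads(response.text)["data"])

    def handle_articles(self, data):
        for article in data.get("articles", []):
            # Assumption: articles can be reached by id; adjust to the site's real URL scheme.
            yield scrapy.Request(f"https://www.nepalitimes.com/news/{article['id']}",
                                 callback=self.parse_item)

    def parse_item(self, response):
        # Same extraction logic as in the question.
        ...

Instead of the two separate callbacks, you could also keep a single callback and pass something like meta={"first": False} on the follow-up requests, as the answer suggests.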
Hope this helps.