Python: web scraping a "Load More" button with Scrapy and Selenium

Asked by erhoui1w on 2023-08-02 · Python · 1 answer · 138 views

I am currently trying to scrape articles from the Nepali Times website. The challenge is that the site uses a "Load More" button, which I need to click to load additional articles. My crawl successfully retrieves the initial page containing the first six articles, but it fails to click the "Load More" button to load the rest, so I cannot get anything beyond those first six articles.
Also, during the crawl it keeps fetching URLs, but instead of the desired content it gets back an "oops" page, which suggests there is a problem with Selenium and the button-click handling.
If anyone could explain how I should approach this, I would really appreciate it!

import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request

class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }

    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

# Handling the load more button using Selenium --- work in progress <3
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Check if there is a "Load More" button
        load_more_button = response.xpath(".//button[contains(@class, 'btn btn--load center') and contains(., 'load more')]")
        if load_more_button:
            print("Load more button detected")
            tenant_code = "epz639"
            routeId = 8
            limit = 10
            offset = 10  

            # Prepare the data payload for the POST request
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }

            # Send a POST request to the endpoint using scrapy.FormRequest
            yield scrapy.FormRequest(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                     formdata={"query": json.dumps(data["query"]), "variables": json.dumps(data["variables"])},
                                     headers={"Content-Type": "application/json"},
                                     callback=self.parse_ajax_response)
            print("Post resquest sent")

    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'data' in json_response and 'articles' in json_response['data']:
            articles = json_response['data']['articles']
            print("Articles:", articles)
            for article in articles:
                # Assuming there's a 'slug' field in the response representing the article slug
                article_slug = article['slug']
                article_url = f"https://www.nepalitimes.com/news/{article_slug}"  # Adjust this based on the actual URL structure
                yield scrapy.Request(url=article_url, callback=self.parse_item)

    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }

OK, so I tried @Leandro's suggestion, i.e. using Chrome DevTools instead of Selenium, but it doesn't seem to trigger the parse_ajax_response function, and it still doesn't give me the results I want (only 9 items get scraped). I need help.
Here is what I get when I click the "Load More" button (screenshot of the network request):

Here is the edited code:

import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request

class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }

    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

# Handling the load more button using Selenium --- work in progress <3

    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Fetch additional articles using GraphQL API with different offset values
        tenant_code = "epz639"
        routeId = 8
        limit = 10
        offset = 10

        while True:
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }

            yield scrapy.Request(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                 method='POST',
                                 body=json.dumps(data),
                                 headers={'Content-Type': 'application/json'},
                                 callback=self.parse_ajax_response)

            offset += limit

    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'items' in json_response:
            articles = json_response['data']['items']
            print("DAta found", articles)
            for article in articles:
                article_id = article['id']
                article_url = f"https://www.nepalitimes.com/news/{article_id}"
                yield scrapy.Request(url=article_url, callback=self.parse_item)

    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }


It does load other pages (not just the news page), and it does not seem to take the parse_ajax_response() function into account at all... Also, it tries to crawl the https://archive.nepalitimes.com/news structure, which I do not want the script to do.

k0pti3hp1#

I think the best approach is to look at which request is being made when you click the "Load More" button. You can do this with the Network tab in Chrome DevTools, for example. Then you can dispatch that request from Scrapy after the first page has been loaded. The request will most likely return a JSON-like structure that you can handle in a separate method (see the callback parameter of the Request object).
That way you can get rid of Selenium and make your scraper much lighter. I hope this helps :)
In your case, the site uses a GraphQL API to query for more articles. The request may look a bit scary, but it spells out exactly which data should be returned from the server:

(screenshot of the GraphQL request payload)

If you look at the Response tab, you will see that the response looks like this:

(screenshot of the JSON response)
So, in your parse method you should add a yield Request(...) that mimics the request shown in the screenshot. The request body has a property called query, containing the string you can see in Chrome DevTools, and a variables property, which is a JSON object bound to the query's parameters. (You can see the exact string that is sent by opening the Payload tab and clicking "view source".)
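If you prefer, Scrapy's JsonRequest helper can build that request for you instead of a raw Request with json.dumps. Here is a minimal sketch; GRAPHQL_QUERY stands for the query string copied from the Payload tab, and parse_more is whatever callback handles the JSON:

from scrapy.http import JsonRequest

def parse(self, response):
    # ... yield the first-page articles here, as in your current parse() ...
    graphql_body = {
        "query": GRAPHQL_QUERY,  # query string copied from the Payload tab
        "variables": {"tenant_code": "epz639", "routeId": 8,
                      "limit": 10, "offset": 10},
    }
    # JsonRequest serializes the dict to a JSON body, sets the
    # Content-Type: application/json header and uses POST by default.
    yield JsonRequest(
        "https://nepalitimes-hasura.superdesk.org/v1/graphql",
        data=graphql_body,
        meta={"current_offset": 10},
        callback=self.parse_more,
    )
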
Given the limit and offset parameters, you will probably have to repeat this yield Request(...) several times, depending on how many pages you want to scrape. It is also worth checking what happens when you reach the last page.
Tip: you could use a dedicated method, say parse_first_load_more, as the callback of the first request. The response comes with a "totalCount": 1321, which you can use to compute how many requests you need to make. The following requests can then use a different callback, or you can use the meta parameter on the request to signal that it is not the first one.
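Roughly, that tip could look like this (just a sketch: parse_first_load_more is an illustrative name, and the field layout follows the response shown above):

def parse_first_load_more(self, response):
    data = json.loads(response.text)["data"]
    total = data["metadata"]["aggregate"]["totalCount"]  # e.g. 1321
    # Articles from this first "load more" batch (offset 10).
    for article in data["items"]:
        yield {
            "title": article["title"],
            "link": f"{article['swp_route']['staticprefix']}/{article['slug']}",
        }
    # Schedule all remaining pages up front; they can share a different callback.
    for offset in range(20, total, 10):
        graphql_req = {
            "query": GRAPHQL_QUERY,
            "variables": {"tenant_code": "epz639", "routeId": 8,
                          "limit": 10, "offset": offset},
        }
        yield scrapy.Request(
            "https://nepalitimes-hasura.superdesk.org/v1/graphql",
            method="POST",
            body=json.dumps(graphql_req),
            headers={"Content-Type": "application/json"},
            callback=self.parse_more,  # different callback for the follow-up pages
        )
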
The end result would look something like this (note that this is just sample code):

import json
import scrapy

GRAPHQL_QUERY = """
    query getArticles($tenant_code: String = \"\", $routeId: Int, $limit: Int = 10, $offset: Int = 0) {
        metadata: swp_article_aggregate(where: {tenant_code: {_eq: $tenant_code}, route_id: {_eq: $routeId}}) {
            aggregate {
                totalCount: count
            }
        }
        items: swp_article(limit: $limit, offset: $offset, order_by: {published_at: desc}, where: {tenant_code: {_eq: $tenant_code}
     ...
"""

class NepalTimesScraper(scrapy.Spider):
    name = "nepaltimes"
    start_urls = ["https://www.nepalitimes.com/news"]

    def parse(self, response):
        articles = response.xpath("//article[@class='list']/..")

        for article in articles:
            title = article.css("h3::text").get()
            link = article.attrib["href"]

            yield {"title": title, "link": link}

        # Now, load more
        graphql_req = {
            "query": GRAPHQL_QUERY,
            "variables": {
                "tenant_code": "epz639",
                "routeId": 8,
                "limit": 10,
                "offset": 10,
            },
        }

        yield scrapy.Request(
            "https://nepalitimes-hasura.superdesk.org/v1/graphql",
            method="POST",
            body=json.dumps(graphql_req),
            meta={"current_offset": 10},
            callback=self.parse_more,
        )

    def parse_more(self, response):
        json_response = json.loads(response.text)
        total_number_of_articles = json_response["data"]["metadata"]["aggregate"][
            "totalCount"
        ]
        current_offset = response.meta["current_offset"]

        for article in json_response["data"]["items"]:
            yield {
                "title": article["title"],
                "link": f"{article['swp_route']['staticprefix']}/{article['slug']}",
            }

        if current_offset + 10 < total_number_of_articles:
            current_offset = current_offset + 10

            graphql_req = {
                "query": GRAPHQL_QUERY,
                "variables": {
                    "tenant_code": "epz639",
                    "routeId": 8,
                    "limit": 10,
                    "offset": current_offset,
                },
            }
            yield scrapy.Request(
                "https://nepalitimes-hasura.superdesk.org/v1/graphql",
                method="POST",
                body=json.dumps(graphql_req),
                meta={"current_offset": current_offset},
                callback=self.parse_more,
            )

Hope this helps!
