Scrapy spider stops crawling after the first request

u59ebvdq asked on 2023-04-30

I have a problem with my code. After the first request is sent, the spider stops crawling and never visits the other pages. Without the callback method, I get the full list of pages I want.

from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class CrawlingSpider(CrawlSpider):
    name = "myfancycrawler"
    allowed_domains = ["domiporta.pl"]
    start_urls = ["https://www.domiporta.pl/"]

    rules = (
        Rule(LinkExtractor(allow=(r"mieszkanie/wynajme/*[a-z]*(\?PageNumber=[0-9]*)*$")), callback="parse_html"),
    )

    def parse_html(self, response):
        soup = BeautifulSoup(response.text, "html.parser")
        datas = soup.findAll("article")
        for data in datas:
            clean_data = self.clean_data(data)

            # print(data)
            # yield {
            #   "price": clean_data[0],
            #   "price_for_m": clean_data[3],
            #   "area": clean_data[1],
            #   "rooms_amount": clean_data[2],
            #   "title": clean_data[4],
            #   "offer": "for rent" if "wynajem" in clean_data[10] or "wynajem" in clean_data[4] else "for sale",
            #   "short_description": clean_data[10],
            # }

    def clean_data(self, data):
        return [el.strip().replace("\xa0", " ") for el in data.text.split("\n") if
                el.strip().replace("\xa0", " ") != "" and el not in (
                'WYRÓŻNIONE', 'OBEJRZANE', 'Więcej', 'Skontaktuj się')]

Any idea what might be going wrong?
Cheers!
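
(Editor's note: a quick way to check whether the allow pattern itself is the problem is to test the LinkExtractor interactively in scrapy shell. A minimal sketch reusing the question's pattern; extract_links is a standard LinkExtractor method:

# Run: scrapy shell https://www.domiporta.pl/
from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(allow=(r"mieszkanie/wynajme/*[a-z]*(\?PageNumber=[0-9]*)*$",))
for link in le.extract_links(response)[:10]:
    print(link.url)  # the URLs this rule would queue from the start page

If this prints the expected listing URLs, the pattern is fine and the issue lies in how the crawl proceeds from them.)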

beq87vna1#

For each link contained in the article elements you scrape, you need to dispatch a request back to the Scrapy engine.
I would also suggest dropping BeautifulSoup in favor of Scrapy's built-in selectors; they are much faster than bs4.
For example:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawlingSpider(CrawlSpider):
    name = "myfancycrawler"
    allowed_domains = ["domiporta.pl"]
    start_urls = ["https://www.domiporta.pl/"]

    rules = (
        Rule(LinkExtractor(allow=(r"mieszkanie/wynajme/*[a-z]*(\?PageNumber=[0-9]*)*$",)), callback="parse_html"),
    )

    def parse_html(self, response):
        fields = ["price", "price_for_m", "area", "rooms_amount", "title", "offer", "short_description"]
        for article in response.xpath("//article"):
            # Collect all non-empty text nodes, normalizing non-breaking spaces
            # and skipping the site's badge/button labels.
            data = [el.strip().replace("\xa0", " ")
                    for el in article.xpath(".//text()").getall()
                    if el.strip().replace("\xa0", " ")
                    and el not in ('WYRÓŻNIONE', 'OBEJRZANE', 'Więcej', 'Skontaktuj się')]
            if len(data) < 11:
                continue  # listing card without the expected fields
            offer = "for rent" if "wynajem" in data[10] or "wynajem" in data[4] else "for sale"
            # Note data[2] (rooms_amount) so every field in `fields` gets a value.
            yield dict(zip(fields, [data[0], data[3], data[1], data[2], data[4], offer, data[10]]))
            # Send each article's link back to the engine so the crawl continues.
            href = article.xpath(".//a/@href").get()
            if href:
                yield scrapy.Request(response.urljoin(href))
