我的代码有问题。发送第一个请求后,蜘蛛停止爬行,不访问其他网站。没有回调方法,我得到了所有我想要的网站列表。
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class CrawlingSpider(CrawlSpider):
    """Crawl domiporta.pl rental-listing pages and yield one item per <article>.

    Fixes versus the original:
    - ``follow=True`` on the Rule: when a ``callback`` is given, CrawlSpider
      defaults ``follow`` to False, so the crawl stopped after the first
      matched page — the reported bug.
    - The item ``yield`` was commented out, so the spider produced no items.
    """

    name = "myfancycrawler"
    allowed_domains = ["domiporta.pl"]
    start_urls = ["https://www.domiporta.pl/"]

    rules = (
        Rule(
            # Trailing comma makes `allow` a real one-element tuple
            # (the original bare parentheses passed a plain string).
            LinkExtractor(
                allow=(r"mieszkanie/wynajme/*[a-z]*(\?PageNumber=[0-9]*)*$",)
            ),
            callback="parse_html",
            follow=True,  # keep extracting links from callback pages
        ),
    )

    def parse_html(self, response):
        """Parse a listing page and yield a dict of fields per <article>.

        Fields are positional slices of the cleaned article text; articles
        with fewer fields (ads/teasers) are skipped instead of raising
        IndexError.
        """
        soup = BeautifulSoup(response.text, "html.parser")
        for article in soup.find_all("article"):
            clean_data = self.clean_data(article)
            # Index 10 is the highest field accessed below — skip short articles.
            if len(clean_data) < 11:
                continue
            yield {
                "price": clean_data[0],
                "price_for_m": clean_data[3],
                "area": clean_data[1],
                "rooms_amount": clean_data[2],
                "title": clean_data[4],
                # "wynajem" (Polish: rental) in the description or title
                # marks a rental offer; everything else is treated as a sale.
                "offer": (
                    "for rent"
                    if "wynajem" in clean_data[10] or "wynajem" in clean_data[4]
                    else "for sale"
                ),
                "short_description": clean_data[10],
            }

    def clean_data(self, data):
        """Split an article's text into stripped, non-empty field strings.

        Non-breaking spaces are normalised to plain spaces, and UI labels
        ('WYRÓŻNIONE', 'OBEJRZANE', 'Więcej', 'Skontaktuj się') are dropped.
        NOTE: the label check intentionally tests the raw (unstripped) line,
        matching the original behaviour.
        """
        return [el.strip().replace("\xa0", " ") for el in data.text.split("\n") if
                el.strip().replace("\xa0", " ") != "" and el not in (
                    'WYRÓŻNIONE', 'OBEJRZANE', 'Więcej', 'Skontaktuj się')]
你知道可能出了什么问题吗?
贪婪!
1条答案
您需要把文章元素中包含的每个链接封装成请求，并交还给 Scrapy 引擎去调度，这样爬虫才会继续访问后续页面。
另外建议不要使用 Beautiful Soup，而是使用 Scrapy 内置的选择器（selector）解析功能，它比 bs4 快得多。
例如: