Scrapy crawling behaves erratically

ddrv8njm · asked 2022-12-13

I am using Python 3.11 and Scrapy 2.7.1 on Windows 10. Starting from the Scrapy example that downloads files from nirsoft.net, I made some adjustments to crawl another website (https://www.midi-karaoke.info); please have a look.
I am not sure, but with my modified script I seem to get most of the HTML pages (more than 100,000) but none of the .mid files.
The site itself behaves strangely. It has a very flat layout with more than 100,000 numbered page names. If I browse down to a MIDI file link and try to download it, nothing happens. If I view the page source in the browser and click the .mid link there, I do get the file; the same works if, for a page that links to a .mid file, I change the extension from .html to .mid in the browser's address bar (https://www.midi-karaoke.info/21110cbd.html -> https://www.midi-karaoke.info/21110cbd.mid).
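
That .html-to-.mid rewrite can also be done inside a Scrapy callback. A minimal sketch, purely for illustration (the spider name and start URL here are just placeholders):

import scrapy

class MidUrlSketchSpider(scrapy.Spider):
    name = 'mid_url_sketch'
    start_urls = ['https://www.midi-karaoke.info/21110cbd.html']

    def parse(self, response):
        # derive the .mid download URL from the numbered .html page URL
        if response.url.endswith('.html'):
            mid_url = response.url[:-len('.html')] + '.mid'
            yield {'file_urls': [mid_url]}
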
On top of that, changes I make to my script sometimes work and sometimes do not work at all. On the next run, or the next day, they may no longer work in the very same script. Here is the script I use:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from webcrawler.items import WebcrawlerItem # import C:\..\scrapy\webcrawler\webcrawler\items.py

class WebcrawlSpider(CrawlSpider):
    name = 'webcrawl'
    allowed_domains = ['www.midi-karaoke.info']
    start_urls = ['https://www.midi-karaoke.info']

    # avoid redirects?
    custom_settings = {'REDIRECT_ENABLED': False}
    handle_httpstatus_list = [302, 301]

    rules = (
        Rule(LinkExtractor(allow=r'/'), callback='parse_item', follow=True),
        # the whole site lives under '/'
        Rule(LinkExtractor(allow=(r'/'), deny_extensions=[], restrict_xpaths=('//a[@href]')), callback="parse_items", follow= True),
        # extract 'href' links
        Rule(LinkExtractor(allow=(r'/'), restrict_xpaths=('//a[@class="MIDI"]',)), callback="parse_items", follow= True),
        # the href links we are interested in are inside class='MIDI'
    )

    def parse_item(self, response):
        file_url = response.css('.downloadline::attr(href)').getall()  # get all pages found
        file_url = response.urljoin(file_url)       
        file_extension = file_url.split('.')[-1]
        # filter links by file extension (optional)
        if file_extension not in ('mid' , 'html', 'zip'): 
            return
        #if '.ru.' in file_url or '.en.' in file_url:
         #   return
        item = WebcrawlerItem()
        item['file_urls'] = [file_url]
        item['original_file_name'] = file_url.split('/')[-1]
        yield item

This works sometimes and sometimes it does not. Please help.

# avoid redirects?
    custom_settings = {'REDIRECT_ENABLED': False}
    handle_httpstatus_list = [302, 301]
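
With REDIRECT_ENABLED set to False and 301/302 listed in handle_httpstatus_list, redirect responses are passed to the callback instead of being followed automatically. A minimal sketch of following such a redirect by hand, assuming the target is in the Location header (illustration only):

    def parse_item(self, response):
        # with redirects disabled, 301/302 responses arrive here directly;
        # the redirect target can be read from the Location header
        if response.status in (301, 302):
            location = response.headers.get(b'Location')
            if location:
                yield response.follow(location.decode(), callback=self.parse_item)
            return
        # ... normal handling of 200 responses continues here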

settings.py:

# Scrapy settings for webcrawler project

BOT_NAME = 'webcrawler'
SPIDER_MODULES = ['webcrawler.spiders']
NEWSPIDER_MODULE = 'webcrawler.spiders'
DUPEFILTER_DEBUG = False
REDIRECT_ENABLED = False 
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'webcrawler.pipelines.WebcrawlerPipeline': 1,
}
FILES_STORE = r"C:\Users\wiwa53\scrapy\webcrawler\downloads"
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

USER_AGENT = 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'

items.py:

# Define here the models for your scraped items
import scrapy

class WebcrawlerItem(scrapy.Item):
    file_urls = scrapy.Field()
    original_file_name = scrapy.Field()
    files = scrapy.Field()

pipelines.py:

# Define your item pipelines here
from scrapy.pipelines.files import FilesPipeline

class WebcrawlerPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):        
        file_name: str = request.url.split("/")[-1]
        #print(file_name)
        return file_name
Answer 1 (yqyhoc1h):

There are a few notable problems in your code. For example, you have multiple rules matching the same URLs, and your rules reference a callback method, parse_items, that does not exist. You also have not shown your WebCrawlerItem, so it is impossible to tell whether it has the right fields. I also do not see any attempt to extract the description information you said you need.
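
As an illustration of that point (a sketch, not part of the example below), the three overlapping rules could be collapsed into a single rule whose callback matches a method that actually exists:

    rules = (
        # one rule is enough: follow every internal link and send each
        # response to the existing callback parse_item
        Rule(
            LinkExtractor(allow=r'/', deny_extensions=[]),
            callback='parse_item',
            follow=True,
        ),
    )
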
Here is an example I put together. It crawls the first page of the site, which happens to be all of the 'A' artists, then parses the inner pages for the track information, and then downloads the files into their respective folders.
I used a plain scrapy.Spider and included the item class, a version of the FilesPipeline, and all the custom settings in the same script:

import scrapy
from scrapy.pipelines.files import FilesPipeline
from scrapy.crawler import CrawlerProcess
import os

class MyPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        return os.path.join(item['artist'], item['title'])

class WebcrawlerItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
    original_file_name = scrapy.Field()
    artist = scrapy.Field()
    title = scrapy.Field()

class WebcrawlSpider(scrapy.Spider):
    name = 'webcrawl'
    allowed_domains = ['www.midi-karaoke.info']
    start_urls = ['https://www.midi-karaoke.info']
    custom_settings = {'REDIRECT_ENABLED': False}
    handle_httpstatus_list = [302, 301]

    def parse(self, response, **kwargs):
        for link in response.xpath("//div[@class='folders_and_files']/a"):
            text = link.xpath('./text()').get()
            if link.xpath('./@class').get() == 'f':
                kw = {'title': text}
                callback = self.parse_item
            elif link.xpath('./text()').get() != '..':
                kw = {'artist': text}
                callback = self.parse
            else:
                continue
            url = response.urljoin(link.xpath('./@href').get())
            kw.update(kwargs)
            yield scrapy.Request(url, callback=callback, cb_kwargs=kw)

    def parse_item(self, response, artist="", title=""):
        midi = response.xpath('//table[@class="MIDI"]//table//a/@href').get()
        link = response.urljoin(midi)
        item = WebcrawlerItem()
        item["artist"] = artist
        item["title"] = title
        item['file_urls'] = [link]
        item['original_file_name'] = midi
        yield item

def main():
    process = CrawlerProcess(settings={
        "USER_AGENT": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        "ROBOTSTXT_OBEY": False,
        "ITEM_PIPELINES": {
            MyPipeline: 100,
        },
        "FILES_STORE": './folder',
        "FILES_URLS_FIELD": 'file_urls',
        "FILES_RESULT_FIELD": 'files',
    })
    process.crawl(WebcrawlSpider)
    process.start()

if __name__ == "__main__":
    main()
