How to run Scrapy in a while loop

ioekq8ef · asked on 2022-11-09 · 3 answers

I'm working on a project that uses multiple spiders to scrape different websites. I want the spiders to run again whenever the user answers "yes" when asked whether to continue.

from scrapy.crawler import CrawlerProcess

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

But I get an error saying the reactor is not restartable:

Traceback (most recent call last):
  File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
    process.start()
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
    ReactorBase.startRunning(cast(ReactorBase, self))
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable

So I guess using a while loop won't work. I don't even know where to start...

5cnsuln7 #1

Method 1:

Scrapy creates a Twisted reactor which can't be reused after it has stopped, but if you run the crawler in a separate process, each new process has to create its own new reactor.

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page_range):
   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

# --- main ---

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
   p.start()
   p.join()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

It will not work if you use threading instead of multiprocessing, because threads share the process's memory, so a new thread would use the same reactor as the previous thread.
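
To see why, here is a minimal pure-Twisted sketch (no Scrapy; threading and the run_reactor helper are only for illustration) that should hit the same error: the reactor is a process-wide singleton, so a second thread cannot run it again.

import threading
from twisted.internet import reactor

def run_reactor():
    # every thread in the process sees the same reactor object,
    # and a reactor may be started and stopped only once
    reactor.callLater(0, reactor.stop)
    reactor.run(installSignalHandlers=False)

t1 = threading.Thread(target=run_reactor)
t1.start()
t1.join()   # first run starts and stops cleanly

t2 = threading.Thread(target=run_reactor)
t2.start()
t2.join()   # the second reactor.run() raises ReactorNotRestartable inside this thread
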
Minimal working code (tested on Linux).

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args,**kwargs):
        '''generate start_urls list'''
        super().__init__(*args,**kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    #from scrapy.crawler import CrawlerProcess

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    running = True
    while running:

        p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()

        answer = input('Repeat [Y/n]? ').strip().lower()

        if answer == 'n':
            running = False

Method 2:

Found via Google: Restarting a Twisted Reactor.
It is an old post which uses del to remove the twisted reactor module from memory and then imports it again later.

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

   import sys
   del sys.modules['twisted.internet.reactor']
   from twisted.internet import reactor
   from twisted.internet import default
   default.install()

Minimal working code (tested on Linux)

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args,**kwargs):
        '''generate start_urls list'''
        super().__init__(*args,**kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    running = True
    while running:

        run_crawler(keyword, page)

        answer = input('Repeat [Y/n]? ').strip().lower()

        if answer == 'n':
            running = False

        import sys
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()

Method 3:

It seems you could use CrawlerRunner instead of CrawlerProcess, but I haven't tested it yet.
Based on the last example in the docs for Running multiple spiders in the same process, I created code which runs the while loop inside the reactor (so the reactor doesn't have to be stopped). It starts the first spider, then the second, then asks whether to continue, and then runs the first spider again followed by the second. It doesn't run the spiders at the same time, but maybe that could be changed somehow.

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args,**kwargs):
        '''generate start_urls list'''
        super().__init__(*args,**kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:

        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page)+1)
        yield runner.crawl(MySpider, keyword, int(page)+2)

        answer = input('Repeat [Y/n]? ').strip().lower()

        if answer == 'n':
            running = False
            reactor.stop()
            #return

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    configure_logging()        

    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })

    run_crawler()

    reactor.run()

EDIT:

The same code, but now all crawlers run at the same time:

@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:

        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)

        d = runner.join()
        yield d

        answer = input('Repeat [Y/n]? ').strip().lower()

        if answer == 'n':
            running = False
            reactor.stop()
            #return

yrefmtwq #2

You can remove the while loop and use callbacks instead.
EDIT: added an example:

def callback_f():
    # stuff #
    calling_f()

def calling_f():
    answer = input("Continue? (y/n)")
    if not answer == 'n':
        callback_f()

callback_f()
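
To make this work with Scrapy's reactor, the callback has to be chained onto the Deferred returned by a crawl rather than using plain recursion. A rough sketch of that idea, assuming the MySpider, keyword and page from answer #1 and a CrawlerRunner (runner.crawl() returns a Deferred, so the ask-and-repeat step can be attached as a callback):

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner({'USER_AGENT': 'Mozilla/5.0'})

def schedule_crawl(_=None):
    # runner.crawl() returns a Deferred that fires when the spider finishes
    d = runner.crawl(MySpider, keyword, page)  # MySpider, keyword, page as in answer #1
    d.addCallback(ask_to_continue)

def ask_to_continue(_):
    answer = input('Continue? (y/n) ').strip().lower()
    if answer == 'n':
        reactor.stop()
    else:
        schedule_crawl()  # chain another crawl instead of looping

schedule_crawl()
reactor.run()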

7ajki6be #3

import sys

from twisted.internet import reactor  # only this reactor import is supposed to be here; we delete the reactor module after each run
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)
d = runner.crawl('your spider class name')
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished

del sys.modules['twisted.internet.reactor']  # delete the reactor because we want to run this in a loop; it will be imported again at the top
from twisted.internet import default
default.install()
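
For completeness, a rough sketch of how that snippet might sit inside the loop the comments refer to, reusing the reactor-deletion trick from Method 2 in answer #1 ('your spider class name' is still a placeholder):

import sys

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
settings = get_project_settings()

running = True
while running:
    # importing the reactor inside the loop picks up the freshly installed one
    from twisted.internet import reactor

    runner = CrawlerRunner(settings)
    d = runner.crawl('your spider class name')  # placeholder spider name
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks until the crawl finishes

    if input('Repeat [Y/n]? ').strip().lower() == 'n':
        running = False
    else:
        # drop the stopped reactor so the next iteration installs a new one
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import default
        default.install()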
