scrapy: How do I run a web scraper using Google Cloud Functions?

1aaf6o9v · asked 2023-02-08 · 1 answer · 207 views

Thanks in advance for your help.
I am currently running a web scraper (this is my first time doing anything like this) that pulls addresses from a URL and then matches them against user input. This will feed into a chatbot, and I would like to know how I can get it running on Google Cloud Functions. What is the process for doing this, and is there a tutorial?
This is the code I have so far. There is also a small items file in the project.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import DataItem
from fuzzywuzzy import fuzz
from urllib.parse import urljoin
import scrapy

class AddressesSpider(scrapy.Spider):
    name = 'Addresses'
    allowed_domains = ['find-energy-certificate.service.gov.uk']
    postcode = "bh10+4ah"
    start_urls = ['https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode=' + postcode]
    
    
    ## def start_requests(self):
    ##     self.first = input("Please enter the address you would like to match: ")
    ##     yield scrapy.Request(url=self.start_urls[0], callback=self.parse)
    
    def parse(self, response):
        
        first = input("Please enter the address you would like to match: ")
        highest_ratios = []
        highest_item = None

        for row in response.xpath('//table[@class="govuk-table"]//tr'):
            address = row.xpath("normalize-space(.//a[@class='govuk-link']/text())").extract()[0].lower()
            address = address.rsplit(',', 2)[0]
            link = row.xpath('.//a[@class="govuk-link"]/@href').extract()
            details = row.xpath("normalize-space(.//td/following-sibling::td)").extract()

            ratio = fuzz.token_set_ratio(address, first)
            item = DataItem()
            item['link'] = link
            item['details'] = details
            item['address'] = address
            item['ratioresult'] = ratio
            # keep the three rows with the highest fuzzy-match scores
            if len(highest_ratios) < 3:
                highest_ratios.append(item)
            elif ratio > min(highest_ratios, key=lambda x: x['ratioresult'])['ratioresult']:
                highest_ratios.remove(min(highest_ratios, key=lambda x: x['ratioresult']))
                highest_ratios.append(item)

        highest_ratios_100 = [item for item in highest_ratios if item['ratioresult'] == 100]
        if highest_ratios_100:
            for item in highest_ratios_100:
                yield item
        else:
            yield max(highest_ratios, key=lambda x: x['ratioresult'])

        if len(highest_ratios_100) > 1:
            for i, item in enumerate(highest_ratios_100):
                print(f"{i+1}: {item['address']}")
            selected = int(input("Please select the correct address by entering the number corresponding to the address: ")) - 1
            selected_item = highest_ratios_100[selected]
        else:
            selected_item = highest_ratios_100[0] if highest_ratios_100 else max(highest_ratios, key=lambda x: x['ratioresult'])

        new_url = selected_item['link'][0]
        new_url = str(new_url)

        if new_url:
            base_url = 'https://find-energy-certificate.service.gov.uk'
            print(f'Base URL: {base_url}')
            print(f'New URL: {new_url}')
            new_url = urljoin(base_url, new_url)
            print(f'Combined URL: {new_url}')
            yield scrapy.Request(new_url, callback=self.parse_new_page)

    def parse_new_page(self, response):
        Postcode = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()])').extract()
        Town = response.xpath('normalize-space((//p[@class="epc-address govuk-body"]/text())[last()-1])').extract()
        First = response.xpath(".//p[@class='epc-address govuk-body']").extract()
        Type = response.xpath('normalize-space(//dd[1]/text())').extract_first()
        Walls = response.xpath("//th[contains(text(), 'Wall')]/following-sibling::td[1]/text()").extract()
        Roof = response.xpath("//th[contains(text(), 'Roof')]/following-sibling::td[1]/text()").extract()
        Heating = response.xpath("//th[text()='Main heating']/following-sibling::td[1]/text()").extract_first()
        CurrentScore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[1]/text[1]/text()').re_first("[0-9+]{1,2}")
        Maxscore = response.xpath('//body[1]/div[2]/main[1]/div[1]/div[3]/div[3]/svg[1]/svg[2]/text[1]/text()').re_first("[0-9+]{2}")
        Expiry = response.xpath('normalize-space(//b)').extract_first()
        FloorArea = response.xpath('//dt[contains(text(), "floor area")]/following-sibling::dd/text()').re_first("[0-9+]{2,3}")
        Steps = response.xpath("//h3[contains(text(),'Step')]/text()").extract()
    
        yield {
            'Postcode': Postcode,
            'Town': Town,
            'First': First,
            'Type': Type,
            'Walls': Walls,
            'Roof': Roof,
            'Heating': Heating,
            'CurrentScore': CurrentScore,
            'Maxscore': Maxscore,
            'Expiry': Expiry,
            'FloorArea': FloorArea,
            'Steps': Steps
        }

I have tried googling and looking around, but I cannot work out how to deploy this as a project that runs on Google Cloud Functions, or whether I can just copy the code into the console somewhere?

cygmwpex1#

You can try running your spider from a script, but a better solution is to wrap scrapy in its own child process.
For example:

from multiprocessing import Process, Queue
from ... import MySpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def my_cloud_function(event, context):
    def script(queue):
        try:
            settings = get_project_settings()

            settings.setdict({
                'LOG_LEVEL': 'ERROR',
                'LOG_ENABLED': True,
            })

            process = CrawlerProcess(settings)
            process.crawl(MySpider)
            process.start()     # blocks in this child process until the crawl finishes
            queue.put(None)     # signal success to the parent
        except Exception as e:
            queue.put(e)        # forward the exception to the parent

    queue = Queue()

    # wrap the spider in a child process
    main_process = Process(target=script, args=(queue,))
    main_process.start()    # start the process
    main_process.join()     # block until the spider finishes

    result = queue.get()    # check the process did not return an error
    if result is not None:
        raise result 

    return 'ok'
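
One caveat: the input() calls in your spider will not work on Cloud Functions, because there is no terminal attached to the process. A sketch of the usual workaround, assuming an HTTP-triggered function and a spider modified to read attributes instead of prompting (the field names postcode and address here are illustrative, not part of your existing code): CrawlerProcess.crawl() forwards keyword arguments to the spider constructor, and Scrapy's base Spider stores them as instance attributes.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from ... import AddressesSpider   # your spider, with the input() calls removed

def my_cloud_function(request):
    # HTTP-triggered Cloud Functions receive a Flask request object.
    data = request.get_json(silent=True) or {}
    postcode = data.get('postcode', 'bh10+4ah')   # illustrative request fields
    address = data.get('address', '')

    process = CrawlerProcess(get_project_settings())
    # Keyword arguments become attributes on the spider instance,
    # so parse() can read self.address instead of calling input().
    process.crawl(AddressesSpider, postcode=postcode, address=address)
    process.start()
    return 'ok'

In practice you would keep the child-process wrapper shown above around process.start(), since the Twisted reactor cannot be restarted within a single process if the function instance is reused.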

You can refer to this tutorial for more information.
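
As for deployment: copying just the spider into the console editor will not be enough, because the function needs the whole Scrapy project (settings, items, pipelines) and its dependencies alongside the entry point. A minimal layout, with assumed file and project names, might look like:

.
├── main.py            # defines my_cloud_function, the entry point
├── requirements.txt   # scrapy, fuzzywuzzy, ...
├── scrapy.cfg
└── yourproject/
    ├── items.py       # DataItem
    ├── settings.py
    └── spiders/
        └── addresses.py

deployed from that directory with the gcloud CLI, for example:

gcloud functions deploy scrape-addresses \
    --runtime python310 \
    --trigger-http \
    --entry-point my_cloud_function \
    --source .

The function name, runtime and trigger type here are placeholders; adjust them to match your setup.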
