Adding a proxy to Python Scrapy code

wgx48brx · posted 2023-03-12 in Python

I am trying to add a new feature to my Python scraper code: sending its requests through a proxy host and port.
I took a free proxy from this site and searched SO for answers. With the help of user @dskrypa I changed my code to pass meta={'proxy':'103.42.162.50:8080'}.
Now it raises an error that keeps repeating until I stop the run:

File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\downloader\handlers\http11.py", line 279, in _get_agent
    proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\downloader\webclient.py", line 39, in _parse
    return _parsed_url_args(parsed)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\downloader\webclient.py", line 20, in _parsed_url_args
    host = to_bytes(parsed.hostname, encoding="ascii")
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 108, in to_bytes
    raise TypeError('to_bytes must receive a str or bytes '
TypeError: to_bytes must receive a str or bytes object, got NoneType
2023-03-12 02:47:32 [scrapy.core.scraper] ERROR: Error downloading <GET https://dvlaregistrations.dvla.gov.uk/search/results.html?search=N11CKY&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto=>

Here is my code:

import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import scrapy_xlsx

itemList=[]
class plateScraper(scrapy.Spider):
    name = 'scrapePlate'
    allowed_domains = ['dvlaregistrations.dvla.gov.uk']
    FEED_EXPORTERS = {'xlsx': 'scrapy_xlsx.XlsxItemExporter'}
    custom_settings = {'FEED_EXPORTERS' :FEED_EXPORTERS,'FEED_FORMAT': 'xlsx','FEED_URI': 'output_r00.xlsx', 'LOG_LEVEL':'INFO','DOWNLOAD_DELAY': 0}
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 1
    }

    def start_requests(self):
        df=pd.read_excel('data.xlsx')
        columnA_values=df['PLATE']
        for row in columnA_values:
            global  plate_num_xlsx
            plate_num_xlsx=row
            base_url =f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num_xlsx}&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
            url=base_url
            yield scrapy.Request(url,callback=self.parse, cb_kwargs={'plate_num_xlsx': plate_num_xlsx},meta={'proxy':'103.42.162.50:8080'})

    def parse(self, response, plate_num_xlsx=None):
        plate = response.xpath('//div[@class="resultsstrip"]/a/text()').extract_first()
        price = response.xpath('//div[@class="resultsstrip"]/p/text()').extract_first()

        try:
            a = plate.replace(" ", "").strip()
            if plate_num_xlsx == plate.replace(" ", "").strip():
                item = {"plate": plate_num_xlsx, "price": price.strip()}
                itemList.append(item)
                print(item)
                yield item
            else:
                item = {"plate": plate_num_xlsx, "price": "-"}
                itemList.append(item)
                print(item)
                yield item
        except:
            item = {"plate": plate_num_xlsx, "price": "-"}
            itemList.append(item)
            print(item)
            yield item

process = CrawlerProcess()
process.crawl(plateScraper)
process.start()

import winsound
winsound.Beep(555,333)

Answer #1 (2nc8po8w):

You should include the protocol (scheme) in the proxy URL:

meta={"proxy": "http://103.42.162.50:8080"}

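Applied to start_requests in your spider, only the meta argument changes (a sketch based on your code; the free proxy from the question may of course have gone stale by now):

yield scrapy.Request(
    url,
    callback=self.parse,
    cb_kwargs={'plate_num_xlsx': plate_num_xlsx},
    meta={'proxy': 'http://103.42.162.50:8080'},  # scheme included
)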