// Choose the protocol number and institution code from the query string.
// A non-empty "ID" parameter wins (institution code 2); otherwise the
// "NumeroProtocoloEntrega" parameter is used (institution code 1).
var id = getParameterByName("ID");
var numeroProtocolo;
var codInstituicao;
if (id == null || !(id.length > 0)) {
    // No usable "ID": fall back to the delivery protocol number.
    numeroProtocolo = getParameterByName("NumeroProtocoloEntrega");
    codInstituicao = 1;
} else {
    numeroProtocolo = id;
    codInstituicao = 2;
}
我们可以用Python编写函数,然后我们只需要重新创建请求。
import scrapy
import re
import base64
import logging
import os
from urllib.parse import unquote
class ExampleSpider(scrapy.Spider):
    """Fetch the PDF shown by a ``frmExibirArquivoIPEExterno.aspx`` page.

    ``parse`` reads the protocol number from the page URL and the captcha
    related hidden fields from the page body, then POSTs to the page's
    ``ExibirPDF`` endpoint. ``download_pdf`` decodes the base64 payload found
    under the ``d`` key of the JSON answer and writes it to ``base_dir``.
    """

    name = "example_spider"
    start_urls = ['https://www.rad.cvm.gov.br/ENET/frmExibirArquivoIPEExterno.aspx?NumeroProtocoloEntrega=1106753']
    # Directory the downloaded PDFs are written to (created on demand).
    base_dir = './pdf_downloads'

    def parse(self, response):
        """Build and yield the ``ExibirPDF`` POST request for this page.

        Args:
            response: The document page response; its URL must carry either
                an ``ID`` or a ``NumeroProtocoloEntrega`` query parameter.

        Yields:
            A ``scrapy.Request`` whose callback is ``download_pdf``. Nothing
            is yielded when no protocol number can be found in the URL.
        """
        # A non-empty "ID" takes precedence and implies institution code 2;
        # otherwise "NumeroProtocoloEntrega" is used with institution code 1.
        id_ = self.get_parameter_by_name("ID", response.url)
        if id_:
            numero_protocolo = id_
            cod_instituicao = 2
        else:
            numero_protocolo = self.get_parameter_by_name("NumeroProtocoloEntrega", response.url)
            cod_instituicao = 1

        if not numero_protocolo:
            # Without this guard the literal string 'None' would be sent.
            self.logger.error("No protocol number found in URL %s", response.url)
            return

        token = response.xpath('//*[@id="hdnTokenB3"]/@value').get(default='')
        captcha_enabled = response.xpath('//*[@id="hdnHabilitaCaptcha"]/@value').get(default='') == 'S'
        versao_captcha = 'V3' if captcha_enabled and not token else ''

        # NOTE(review): the body is deliberately the single-quoted,
        # unquoted-key object literal (not strict JSON) - presumably this
        # mirrors what the page's own script sends; confirm before changing.
        payload = (
            f"{{ codigoInstituicao: '{cod_instituicao}', "
            f"numeroProtocolo: '{numero_protocolo}', "
            f"token: '{token}', versaoCaptcha: '{versao_captcha}'}}"
        )

        url = 'https://www.rad.cvm.gov.br/ENET/frmExibirArquivoIPEExterno.aspx/ExibirPDF'
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json; charset=utf-8",
            "DNT": "1",
            "Host": "www.rad.cvm.gov.br",
            "Origin": "https://www.rad.cvm.gov.br",
            "Pragma": "no-cache",
            # Use the page actually fetched, so the Referer is also correct
            # when the page was addressed with "ID=" instead of
            # "NumeroProtocoloEntrega=".
            "Referer": response.url,
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
            "X-Requested-With": "XMLHttpRequest",
        }
        yield scrapy.Request(
            url=url,
            headers=headers,
            body=payload,
            method='POST',
            callback=self.download_pdf,
            cb_kwargs={'protocol_num': numero_protocolo},
        )

    def download_pdf(self, response, protocol_num):
        """Decode the base64 PDF in the JSON response and save it to disk.

        Args:
            response: Response of the ``ExibirPDF`` request; expected to be a
                JSON object whose ``d`` key holds the base64-encoded file.
            protocol_num: Protocol number used to name the output file
                (``<protocol_num>.pdf`` inside ``base_dir``).
        """
        try:
            json_data = response.json()
        except ValueError:
            self.logger.error("Response for protocol %s is not valid JSON", protocol_num)
            return

        b64 = json_data.get('d') if isinstance(json_data, dict) else None
        if not b64:
            self.log("Couldn't download pdf", logging.ERROR)
            return

        try:
            pdf = base64.b64decode(b64)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError subclass; TypeError covers a
            # non-string "d" value.
            self.logger.error("Payload for protocol %s is not valid base64", protocol_num)
            return

        # The protocol number originates from a URL: keep only characters
        # that cannot form a path separator before using it as a file name.
        safe_name = re.sub(r'[^\w.-]', '_', str(protocol_num))
        filename = f'{safe_name}.pdf'
        # makedirs handles nested base_dir values and an already existing
        # directory without a separate (racy) isdir check.
        os.makedirs(self.base_dir, exist_ok=True)
        with open(os.path.join(self.base_dir, filename), 'wb') as f:
            f.write(pdf)
        self.log(f"Saved {filename} in {self.base_dir}")

    @staticmethod
    def get_parameter_by_name(name, url):
        """Return the value of query parameter ``name`` in ``url``.

        Args:
            name: Parameter name, matched literally.
            url: URL (or query string) to search.

        Returns:
            ``None`` when the parameter is absent, ``''`` when it is present
            without a value, otherwise the percent-decoded value.
        """
        # re.escape makes every regex metacharacter in the name literal,
        # not only the square brackets.
        match = re.search(r"[?&]" + re.escape(name) + r"(=([^&#]*)|&|#|$)", url)
        if not match:
            return None
        value = match[2]
        if not value:
            return ''
        return unquote(value)
1条答案
按热度按时间voj3qocg1#
如果您在浏览器中打开devtools并转到
network
选项卡并加载页面,您可以看到PDF文件是如何加载的。你可以看到它是用Base64编码的。如果我们在源代码中搜索它,我们可以找到JavaScript代码,让我们看看相关部分:
在函数
getParameterByName
中设置id。我们可以用Python编写函数,然后我们只需要重新创建请求。