This is my spider. It is supposed to assign a list fetched from a Google Sheet to the global variable `denied`. In the code this function is called only once, but according to the logs it executes as many times as the POST request to the endpoint (`send_to_endpoint()`) does. Where is the mistake?
```python
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
import json
from datetime import datetime
import logging
import requests
# from scrapy.utils.project import get_project_settings


class Code1Spider(scrapy.Spider):
    name = 'c_cointelegraph'
    allowed_domains = ['cointelegraph.com']
    start_urls = ['https://cointelegraph.com/press-releases/']
    id = int(str(datetime.now().timestamp()).split('.')[0])
    denied = []
    gs_id = ''
    endpoint_url = ''

    def parse(self, response):
        # Returns settings values as a dict
        settings = self.settings.copy_to_dict()
        self.gs_id = settings.get('GS_ID')
        self.endpoint_url = settings.get('ENDPOINT_URL')
        # Assigns the list of stop words from the Google Sheet to the global variable
        self.denied = self.load_gsheet()
        for i in response.xpath('//a[@class="post-card-inline__title-link"]/@href').getall():
            yield Request(response.urljoin(i), callback=self.parsed)

    def parsed(self, response):
        # Set deny_domains to the current domain so we get only external urls
        denied_domains = self.allowed_domains[0]
        links = LinkExtractor(deny_domains=denied_domains,
                              restrict_xpaths=('//article[@class="post__article"]'))
        links = links.extract_links(response)
        links = [i.url for i in links]
        # Checks the list of external links against the list of stop words
        links = [i for i in links if not any(b in i for b in self.denied)]
        company = response.xpath('//h2//text()').getall()
        if company:
            company = [i.split('About ')[-1].strip() for i in company if 'About ' in i.strip()]
        if company:
            company = company[0]
        else:
            company = ''
        d = {
            'heading': response.xpath('//h1[@class="post__title"]/text()').get().strip(),
            'url': response.url,
            'pubDate': self.get_pub_date(response.xpath('//script[contains(text(),"datePublished")]/text()').get()),
            'links': links,
            'company_name': company,
            'ScrapeID': self.id,
        }
        # Used for debugging, just to see the printed item
        yield d
        # Create a POST request to the endpoint
        req = self.send_to_endpoint(d)
        # Send the request to the endpoint
        yield req

    def get_pub_date(self, d):
        d = json.loads(d)
        pub_date = d['datePublished']
        return pub_date

    def load_gsheet(self):
        # Loads the list of stop words from a predefined Google Sheet
        gs_id = self.gs_id
        url = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(gs_id)
        r = requests.get(url)
        denied = r.text.splitlines()[1:]
        logging.info(denied)
        return denied

    def send_to_endpoint(self, d):
        url = self.endpoint_url
        r = scrapy.Request(url, method='POST',
                           body=json.dumps(d),
                           headers={'Content-Type': 'application/json'},
                           dont_filter=True)
        return r
```
Whenever I `yield req`, the `load_gsheet()` function runs and hits the Google Sheet. If I comment out `yield req`, `load_gsheet()` is called once, as it should be. Why is this happening? I have triple-checked the code line by line and added comments, and I have no idea what I am missing.
1 Answer
This happens because you are not assigning a callback to the request object constructed in the `send_to_endpoint()` method. The default callback is the `parse` method, so every request created in `send_to_endpoint` is automatically routed back to `parse`, which then calls `load_gsheet` once for every POST request. The solution is either to move the `load_gsheet` call out of the `parse` method, or to explicitly assign a callback other than `self.parse` to all of the POST requests.
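As a minimal sketch of the second option (the handler name `handle_endpoint_response` is an assumption for illustration, not part of the original code), the POST request gets its own callback so its response never reaches `parse`:

```python
    # Inside the Code1Spider class:
    def send_to_endpoint(self, d):
        url = self.endpoint_url
        return scrapy.Request(url, method='POST',
                              body=json.dumps(d),
                              headers={'Content-Type': 'application/json'},
                              dont_filter=True,
                              # Explicit callback: without it, Scrapy falls back
                              # to self.parse, which calls load_gsheet() again.
                              callback=self.handle_endpoint_response)

    def handle_endpoint_response(self, response):
        # Hypothetical handler: just record what the endpoint returned.
        self.logger.info('Endpoint responded with status %s', response.status)
```

And a sketch of the first option, assuming the same `GS_ID`/`ENDPOINT_URL` settings as in the question: do the one-time setup in `start_requests`, which Scrapy calls exactly once per crawl, so `parse` no longer touches the sheet at all:

```python
    # Inside the Code1Spider class:
    def start_requests(self):
        # Runs once, before any page is scheduled, so the Google Sheet
        # is fetched a single time no matter how many requests follow.
        settings = self.settings.copy_to_dict()
        self.gs_id = settings.get('GS_ID')
        self.endpoint_url = settings.get('ENDPOINT_URL')
        self.denied = self.load_gsheet()
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)
```

Either change stops the extra `load_gsheet()` calls; the explicit callback is the smaller diff, while moving the setup into `start_requests` also avoids re-reading the settings on every listing page.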