我正在学习 Scrapy,并尝试抓取魁北克省的一个房地产经纪网站。我使用他们的 API 来收集房源,并把每个房源的网址打印到屏幕上。但是我的最后一个函数 print_urls() 始终不会运行。我完全卡在这里了:我尝试过调试,发现程序直接跳过了整个函数体。
class CentrishomesSpider(scrapy.Spider):
    """Spider that queries the centris.ca property API and yields listing URLs.

    Request flow:

    1. ``start_requests`` POSTs the search query to ``/property/UpdateQuery``.
    2. ``get_inscriptions`` POSTs to ``/Property/GetInscriptions``.
    3. ``handle_inscriptions`` wraps the returned HTML fragment in a
       ``Selector`` and yields one item per listing via ``print_urls``.
    """

    name = 'centrisHomes'
    # allowed_domains = ['www.centris.ca']
    # start_urls = ['http://www.centris.ca/']

    # Headers sent with both JSON POST requests.
    _JSON_HEADERS = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    def start_requests(self):
        """Send the search query; the response is handled by ``get_inscriptions``."""
        # Query payload is elided in the original listing.
        query = {...
        }
        yield scrapy.Request(
            url='https://www.centris.ca/property/UpdateQuery',
            method='POST',
            body=json.dumps(query),
            headers=self._JSON_HEADERS,
            callback=self.get_inscriptions
        )

    ...

    def get_inscriptions(self, response):
        """Report whether the query update succeeded, then request the listings.

        The listings request is issued regardless of the reported outcome.
        """
        _, success = self.success(response)
        if success:
            print(Fore.GREEN + 'Query Updated' + Style.RESET_ALL)
        else:
            print(Fore.RED + 'Query Not Updated' + Style.RESET_ALL)

        yield scrapy.Request(
            url='https://www.centris.ca/Property/GetInscriptions',
            method='POST',
            body=json.dumps({"startPosition": 0}),
            headers=self._JSON_HEADERS,
            callback=self.handle_inscriptions
        )

    def handle_inscriptions(self, response):
        """Parse the listings response and yield one item per listing.

        On success the listing count is printed, the HTML fragment found at
        ``['d']['Result']['html']`` is stored on ``self.html`` and the items
        produced by ``print_urls`` are yielded. Nothing is yielded otherwise.
        """
        homes, success = self.success(response)
        if success:
            print(Fore.GREEN + 'Count ' + str(homes['d']['Result']['count']) + Style.RESET_ALL)
            self.html = Selector(text=homes['d']['Result']['html'])
            # print_urls is a generator function: merely calling it creates a
            # generator object and runs none of its body. Delegating with
            # ``yield from`` drives it and hands every item to Scrapy.
            yield from self.print_urls()

    ...

    def success(self, response):
        """Decode the JSON response body and report whether the call succeeded.

        Returns:
            A ``(payload, succeeded)`` tuple in every case, so callers can
            always unpack two values. ``payload`` is the decoded JSON document
            and ``succeeded`` is the truthiness of ``payload['d']['Succeeded']``.
        """
        # Parse as real JSON; rewriting ``true`` -> ``True`` for literal_eval
        # fails as soon as the body contains ``false``, ``null`` or a ``true``
        # that is not immediately followed by ``}``.
        payload = json.loads(response.body.decode('utf-8'))
        return payload, bool(payload['d']['Succeeded'])

    def print_urls(self):
        """Yield a ``{'home_url': ...}`` item for each listing in ``self.html``.

        This is a generator: it must be iterated (or delegated to with
        ``yield from``) for any of its body to run.
        """
        print('try')
        listings = self.html.xpath('//div[contains(@class, "property-thumbnail-item")]')
        for listing in listings:
            yield {
                'home_url': listing.xpath('.//a[@class="property-thumbnail-summary-link"]/@href').get()
            }

    ...
1条答案
按热度按时间xkftehaa1#
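我自己找到了问题所在:print_urls 的函数体里使用了 yield,因此它是一个生成器函数。直接调用 self.print_urls() 只会创建一个生成器对象,并不会执行函数体中的任何代码;必须对它进行迭代(例如在 handle_inscriptions 中写成 yield from self.print_urls())才会真正运行。感谢 @AbdealiJK,我是看了他的回答才发现这一点的。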
https://stackoverflow.com/a/34609397/19966841