我正在用 Python Scrapy 编写一个网页抓取器。当我发送 POST 请求时,得到的是 JSON 响应。我怎样才能在请求之后得到页面的 HTML?问题在于,当我在网站上选择一个类别时,它会发送一个 POST 请求而不重新加载页面,而我需要获取发送 POST 请求之后的数据。我的 Spider:
import urllib
import scrapy
from scrapy.http import Request
from scrapy.utils.response import open_in_browser
class NonprofitSpider(scrapy.Spider):
    """Spider that submits the guidestar.org organisation search form.

    The search page is fetched first (``start_urls``); ``parse`` then sends
    the form-encoded POST request to the ``SubmitSearch`` endpoint, and
    ``start`` receives that endpoint's response.
    """

    name = 'nonprofit'
    start_urls = ['https://www.guidestar.org/search']

    def parse(self, response):
        """Build the search POST request after the search page has loaded.

        Args:
            response: Response for the URL in ``start_urls``; it is not
                inspected, it only triggers the follow-up request.

        Returns:
            A ``Request`` that POSTs the url-encoded search form and routes
            the reply to ``self.start``.
        """
        # Import the submodule explicitly: a bare ``import urllib`` does not
        # guarantee that ``urllib.parse`` is loaded, so relying on it only
        # works when some other module happens to have imported it already.
        from urllib.parse import urlencode

        url = 'https://www.guidestar.org/search/SubmitSearch'
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        data = {
            "CurrentPage": "1",
            "SearchType": "org",
            "GroupExemption": "",
            "AffiliateOrgName": "",
            "RelatedOrgName": "",
            "RelatedOrgEin": "",
            "RelationType": "",
            "RelatedOrgs": "",
            "SelectedCityNav[]": "",
            "SelectedCountyNav[]": "",
            "Eins": "",
            "ul": "",
            "PCSSubjectCodes[]": "",
            "PeoplePCSSubjectCodes[]": "",
            "PCSPopulationCodes[]": "",
            "AutoSelectTaxonomyFacet": "",
            "AutoSelectTaxonomyText": "",
            "Keywords": "",
            "State": "Alaska",
            "City": "",
            "PeopleZip": "",
            # NOTE(review): urlencode() percent-encodes "+" as "%2B", so this
            # is sent as "Zip%2BOnly". If the value was copied from an already
            # encoded browser request (where "+" stands for a space), the
            # literal should probably be "Zip Only" -- confirm against the
            # captured request before changing it.
            "PeopleZipRadius": "Zip+Only",
            "PeopleCity": "",
            "PeopleRevenueRangeLow": "$0",
            "PeopleRevenueRangeHigh": "max",
            "PeopleAssetsRangeLow": "$0",
            "PeopleAssetsRangeHigh": "max",
            "Sort": "",
        }
        return Request(
            url=url,
            method='POST',
            headers=headers,
            body=urlencode(data),
            callback=self.start,
        )

    # NOTE(review): Scrapy 2.13+ defines ``Spider.start()`` as the spider's
    # own entry point; a callback with this name overrides it. Consider
    # renaming the callback (and the ``callback=`` reference above) when
    # running on a recent Scrapy release -- confirm the installed version.
    def start(self, response):
        """Print the raw body returned by the ``SubmitSearch`` endpoint.

        Args:
            response: Response to the POST request built in ``parse``.
        """
        # The endpoint answers with JSON rather than an HTML page.
        print(response.body)
1条答案
按热度按时间svujldwt1#
@Albert以下是一个工作解决方案的示例:
代码:
settings.py 文件:您必须按如下所示取消相应部分的注释并进行修改:
输出: