How to get an HTML response after sending a POST request in Scrapy

ut6juiuv · posted 2022-11-09 · in: Other
Follow (0) | Answers (1) | Views (156)

I'm writing a web scraper in Python with Scrapy, and when I send a POST request I get a JSON response. How can I get the page's HTML after the request? The problem is that when I select a category on the site, it sends a POST request without reloading the page, and I need the data after that POST request has been sent. My spider:

import urllib
import scrapy
from scrapy.http import Request
from scrapy.utils.response import open_in_browser

class NonprofitSpider(scrapy.Spider):
    name = 'nonprofit'
    start_urls = ['https://www.guidestar.org/search']

    def parse(self, response):

        url = 'https://www.guidestar.org/search/SubmitSearch'

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

        data = {
            "CurrentPage": "1",
            "SearchType": "org",
            "GroupExemption": "",
            "AffiliateOrgName": "",
            "RelatedOrgName": "",
            "RelatedOrgEin": "",
            "RelationType": "",
            "RelatedOrgs": "",
            "SelectedCityNav[]": "",
            "SelectedCountyNav[]": "",
            "Eins": "",
            "ul": "",
            "PCSSubjectCodes[]": "",
            "PeoplePCSSubjectCodes[]": "",
            "PCSPopulationCodes[]": "",
            "AutoSelectTaxonomyFacet": "",
            "AutoSelectTaxonomyText": "",
            "Keywords": "",
            "State": "Alaska",
            "City": "",
            "PeopleZip": "",
            "PeopleZipRadius": "Zip+Only",
            "PeopleCity": "",
            "PeopleRevenueRangeLow": "$0",
            "PeopleRevenueRangeHigh": "max",
            "PeopleAssetsRangeLow": "$0",
            "PeopleAssetsRangeHigh": "max",
            "Sort": ""
        }

        return Request(
            url=url,
            method='POST',
            headers=headers,
            body=urllib.parse.urlencode(data),
            callback=self.start
        )

    def start(self, response):
        print(response.body) # json, but I need to get html
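
One thing worth noting: since `SubmitSearch` answers with JSON rather than HTML, the JSON itself can be decoded in the callback instead of being treated as a page. A minimal sketch of such a callback, assuming (this is an assumption, not the confirmed payload shape) that the JSON embeds the result markup in a field that can be handed to a `Selector`:

from scrapy.selector import Selector  # add at the top of the spider file

    def start(self, response):
        # The endpoint answers with JSON, so decode it directly
        # (TextResponse.json() is available from Scrapy 2.2 on).
        data = response.json()

        # "Html" is a hypothetical key -- inspect the real payload in the
        # browser's network tab to see where the results actually live.
        fragment = data.get('Html', '')

        # Any HTML fragment carried inside the JSON can be parsed with a
        # Selector just like a normal page.
        sel = Selector(text=fragment)
        for ein in sel.xpath('//*[@class="mb-0"]/text()').getall():
            yield {'EIN': ein}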

svujldwt · 1#

@Albert Here is an example of a working solution:
Code:

import time

import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


class AlaskaSpider(scrapy.Spider):

    name = 'alaska'

    def start_requests(self):
        url = 'https://www.guidestar.org/search'
        yield SeleniumRequest(
            url=url,
            wait_time=6,
            callback=self.parse)

    def parse(self, response):
        # scrapy_selenium exposes the underlying webdriver on the response.
        driver = response.meta['driver']

        # Type the query into the search box and submit it.
        # (Selenium 4 syntax; on Selenium 3 this was
        # driver.find_element_by_xpath(...).)
        search_input = driver.find_element(
            By.XPATH, '//*[@class="form-control searchButton"]')
        search_input.send_keys('Alaska')
        search_input.send_keys(Keys.ENTER)

        # Crude wait for the AJAX results to render.
        time.sleep(8)
        driver.save_screenshot('search_result.png')

        # Hand the rendered DOM to Scrapy's own selectors.
        html = driver.page_source
        resp = Selector(text=html)

        # Each result card has an id of the form result-0, result-1, ...
        cards = resp.xpath('//*[starts-with(@id, "result-")]')
        for card in cards:
            yield {
                'EIN': card.xpath('.//*[@class="mb-0"]/text()').get()}

    # No explicit teardown is needed: scrapy_selenium's middleware
    # quits the driver when the spider closes.
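
Rather than the fixed `time.sleep(8)`, an explicit wait is usually more reliable. A sketch of the idea, assuming the first result card (`result-0`, the id the spider already targets) is a good signal that the AJAX results have rendered; add the imports at the top of the file and swap the sleep for the wait:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

        # Block for at most 20 seconds until the first result card exists,
        # instead of always sleeping for a fixed 8 seconds.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, 'result-0')))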

settings.py file: you have to change/uncomment the parts shown below:


from shutil import which  # needed for SELENIUM_DRIVER_EXECUTABLE_PATH below

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'scrapy_sr.middlewares.ScrapySrSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy_sr.middlewares.ScrapySrDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'scrapy_sr.pipelines.ScrapySrPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Middleware: enable scrapy_selenium's downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

# Selenium
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
# '--headless' if using chrome instead of firefox
SELENIUM_DRIVER_ARGUMENTS = ['--headless']
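
Note that this setup assumes the `scrapy-selenium` package is installed (`pip install scrapy-selenium`) and that a `chromedriver` binary matching the installed Chrome version is on the PATH; otherwise `which('chromedriver')` returns `None` and the middleware cannot start the browser.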

Output:

{'EIN': '51-0152394'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0155010'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '27-2390076'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0055697'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '83-4051246'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '31-1207314'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6009991'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6009764'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '20-2590220'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0073478'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6001032'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '23-7302803'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '51-0210787'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-6002348'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0155067'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0150993'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0043154'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '30-0854378'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '84-3893461'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '46-1837510'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-1039013'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '14-1958727'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '92-0098901'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '23-7394629'}
2021-11-04 18:33:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.guidestar.org/search>
{'EIN': '81-2318960'}
2021-11-04 18:33:11 [scrapy.core.engine] INFO: Closing spider (finished)
2021-11-04 18:33:11 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:59720/session/05a78d4be9af7205aac54abc0b91118b {}
2021-11-04 18:33:11 [urllib3.connectionpool] DEBUG: http://127.0.0.1:59720 "DELETE /session/05a78d4be9af7205aac54abc0b91118b HTTP/1.1" 200 14
2021-11-04 18:33:11 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2021-11-04 18:33:14 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/response_bytes': 203850,
 'downloader/response_count': 1,
 'downloader/response_status_count/200':
