This code works for me:
'''
import scrapy
import pandas as pd
from datetime import datetime
from scrapy.crawler import CrawlerProcess
from scrapy.http import TextResponse
from scrapy.utils.project import get_project_settings
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# Render the page with Selenium, then hand the rendered HTML to a Scrapy selector
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(r"""chromedriver.exe""", options=options)
wait = WebDriverWait(driver, 10)

url = r"""https://www.sports-reference.com/cbb/boxscores/2023-03-23-21-tennessee.html"""
driver.get(url)
html_source = driver.page_source
scrapy_response = TextResponse(url=driver.current_url, body=html_source, encoding='utf-8')
teams_lst_raw = scrapy_response.css('#four-factors a::attr(href)').getall()
print(teams_lst_raw)
'''
However, when I try the same selector in a Scrapy spider, it returns an empty teams_lst. Why does it work above but not here?
'''
class CBBSpider(scrapy.Spider):
    name = "cbb_spider"
    start_urls = ['https://www.sports-reference.com/cbb/boxscores/2023-03-23-21-tennessee.html']

    def parse(self, response):
        # Extract the HTML source
        html_source = response.body
        # Scrape data
        teams_lst_raw = response.css('#four-factors a::attr(href)').getall()
        teams_lst = [s[13:-14] for s in teams_lst_raw]
        print(teams_lst_raw)

process = CrawlerProcess(settings={'DOWNLOAD_TIMEOUT': 300,
                                   'AUTOTHROTTLE_ENABLED': True,
                                   'DOWNLOAD_DELAY': 5,
                                   'AUTOTHROTTLE_TARGET_CONCURRENCY': 18,
                                   'AUTOTHROTTLE_START_DELAY': 30})
process.crawl(CBBSpider)
process.start()
'''
1 Answer
Because the spider receives the response before any JavaScript has had a chance to manipulate the HTML, and the part of the page containing the element with id "four-factors" is commented out, the Scrapy selector never sees it when parsing. This can be fixed by using a regex to strip the comment markup from the response so that the hidden section becomes parseable.
For example:
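A minimal sketch of that approach, assuming the fix is simply to remove the <!-- and --> markers with a regex and re-run the selector on the uncommented HTML:
'''
import re
import scrapy
from scrapy.selector import Selector

class CBBSpider(scrapy.Spider):
    name = "cbb_spider"
    start_urls = ['https://www.sports-reference.com/cbb/boxscores/2023-03-23-21-tennessee.html']

    def parse(self, response):
        # The #four-factors block is wrapped in an HTML comment in the raw
        # response, so strip the comment markers before parsing.
        uncommented = re.sub(r'<!--|-->', '', response.text)
        selector = Selector(text=uncommented)
        teams_lst_raw = selector.css('#four-factors a::attr(href)').getall()
        print(teams_lst_raw)
'''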
Output: