Scrapy 没有返回任何结果,尽管我抓取同一域名下的其他部分时没有问题

xsuvu9jc  于 2022-11-09  发布在  其他
关注(0)|答案(1)|浏览(157)

我在lottery.net/powerball/numbers/#year上使用了基本相同的代码,没有任何问题。为什么这次它不起作用了?我已经修改了所有需要改动的信息,例如链接和XPath的差异。

import scrapy

class MegaMillionsDrawingsSpider(scrapy.Spider):
        name = 'mega_millions_drawings'
        allowed_domains = ['www.lottery.net']
        user_agent = # my user agent

def start_request(self):
    """Yield one scrapy.Request per yearly Mega Millions archive URL.

    Builds the URL list for the years 2022 down to 1996, then yields a
    request for each URL with ``self.parse`` as the callback and the
    ``User-Agent`` header taken from ``self.user_agent``.

    NOTE(review): Scrapy's documented hook is named ``start_requests``
    (plural); confirm whether the framework ever calls this function.
    NOTE(review): in this listing the function sits at module level rather
    than inside the spider class; confirm the intended indentation.
    """
    # Collect every yearly archive URL first, newest year first.
    start_urls = []
    for i in reversed(range(1996,2023)):
        current_url = 'http://www.lottery.net/mega-millions/numbers/'+ str(i)
        start_urls.append(current_url)

    # Issue one request per collected URL.
    for url in start_urls:
        yield scrapy.Request(    
            url=url, 
            callback=self.parse,
            headers={
                'User-Agent': self.user_agent
            }
        )

def parse(self, response):
    """Yield one dict per row of the drawings archive table.

    Each dict carries the drawing date, the five ``ball`` numbers selected
    by their position in the result list, and the ``mega-ball`` number. A
    value is ``None`` when its XPath matches nothing.
    """
    # Debugging aid: hands the response to the Scrapy shell before parsing.
    from scrapy.shell import inspect_response
    inspect_response(response, self)

    # The class value in the XPath includes a trailing space.
    for drawing in response.xpath("//table[@class='prizes archive ']/tbody/tr"):
        yield {
            # Second text node of the link inside the row's cell.
            'date': drawing.xpath(".//td/a/text()[2]").get(),
            #'url': response.urljoin(drawing.xpath(".//")).get(),
            'first': drawing.xpath(".//td/ul[@class='multi results mega-millions']/li[@class='ball'][position() = 1]/text()").get(),
            'second': drawing.xpath(".//td/ul[@class='multi results mega-millions']/li[@class='ball'][position() = 2]/text()").get(),
            'third': drawing.xpath(".//td/ul[@class='multi results mega-millions']/li[@class='ball'][position() = 3]/text()").get(),
            'fourth': drawing.xpath(".//td/ul[@class='multi results mega-millions']/li[@class='ball'][position() = 4]/text()").get(),
            'fifth': drawing.xpath(".//td/ul[@class='multi results mega-millions']/li[@class='ball'][position() = 5]/text()").get(),
            'mega-ball': drawing.xpath(".//td/ul[@class='multi results mega-millions']/li[@class='mega-ball']/text()").get()
        }
jrcvhitl

jrcvhitl1#

我能看到几个问题:您的一些XPath表达式有误,缩进严重错乱,而且您使用的是http而不是https。
按照下面示例中我对代码格式和方法所做的少量修改,即可修复这些问题。

import scrapy

class MegaMillionsDrawingsSpider(scrapy.Spider):
    """Spider that collects Mega Millions drawings from lottery.net.

    One archive page is requested per year; every table row on a page is
    turned into a dict with the drawing date, the five ball numbers and the
    mega ball.
    """

    name = 'mega-millions-drawings'
    allowed_domains = ['lottery.net']

    def start_requests(self):
        """Yield one request per yearly archive page, from 2022 down to 1996."""
        for num in reversed(range(1996, 2023)):  # only need one loop
            url = "https://www.lottery.net/mega-millions/numbers/" + str(num)
            yield scrapy.Request(url)

    def parse(self, response):
        """Yield one item dict per drawing row found in the archive table.

        Rows without a date text node are skipped. The ``mega-ball`` value is
        ``None`` when the row has no mega-ball element.
        """
        # The class value in the XPath includes a trailing space.
        for drawing in response.xpath("//table[@class='prizes archive ']/tbody/tr"):
            date = drawing.xpath(".//td/a/text()[2]").get()
            if date is None:
                # .get() returns None when nothing matches; calling .strip()
                # on it would raise AttributeError, so skip such rows.
                continue
            item = {"date": date.strip()}

            # Pair the ball texts with their ordinal keys; zip stops at the
            # shorter sequence, so rows with fewer balls get fewer keys.
            for number, text in zip(
                ["first", "second", "third", "fourth", "fifth"],
                drawing.xpath(".//td/ul/li[@class='ball']/text()").getall(),
            ):
                item[number] = text.strip()  # remove whitespace

            # Use .get() (a single string or None) rather than .getall()
            # (a list, which has no .strip()).
            mega_ball = drawing.xpath(".//td/ul/li[@class='mega-ball']/text()").get()
            item["mega-ball"] = mega_ball.strip() if mega_ball is not None else None
            yield item

相关问题