Scrapy解析器工作不正常

dfuffjeb 于 2022-11-09 发布在其他

关注(0)|答案(1)|浏览(111)

解析器遍历所有药店的名称，但不是药店之间所有药品的名称，有些药店被完全加载（超过1000行），有些只有前20行（这是一页上的行数）。
我认为'page medicines'，'page pharmacy'和重置计数器有问题。另外，解析器可能多次遍历某些页面（重复调试）
也许我需要改变代码的设计，把分页放在单独的函数中。
你能帮我纠正和重新设计我的代码吗？

import scrapy
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
from scrapy.shell import inspect_response
from ..items import AptekaItem
from urllib.request import Request
from urllib.parse import urljoin
from subprocess import call
from gc import callbacks
import sys

class AptSpider(scrapy.Spider):
    name = 'apt'
    allowed_domains = ['tabletka.by']
    #start_urls = ['https://tabletka.by/pharmacies?region=1006&page=1&sort=name&sorttype=asc']  # гродненская область
    start_urls = ['https://tabletka.by/pharmacies?region=38&page=1&sort=name&sorttype=asc']  # гродно
    page_pharmacies = 1 # номер страницы с аптеками
    page_medicines = 1 # номер страницы с лекарствами

    def parse(self, response):

        for row in (response.css("tbody tr")):
            # перебор по аптекам (на странице 20 штук)
            items = AptekaItem()
            name_of_pharmacy = row.css(".pharm-name .text-wrap a::text").get()
            location_of_pharmacy = row.css(".tooltip-info-header .text-wrap span::text").get()
            number_of_pharmacy = row.css(".phone.tooltip-info .tooltip-info-header .text-wrap a::text").get()

            items['name_of_pharmacy'] = name_of_pharmacy
            items['location_of_pharmacy'] = location_of_pharmacy
            items['number_of_pharmacy'] = number_of_pharmacy

            inner_link = urljoin('https://tabletka.by/', row.css(".pharm-name .text-wrap a::attr(href)").get())
            self.page_medicines = 1 # обнуление счетчика
            yield response.follow(inner_link, callback=self.parse_medicines, meta={'items' : items})

        # пагинация по аптекам
        while self.page_pharmacies < 10 and not response.css(".table-pagination.last-page"):
            #self.page_pharmacies += 1
            #yield response.follow('https://tabletka.by/pharmacies?region=1006&page=' + str(self.page_pharmacies) + '&sort=name&sorttype=asc', callback=self.parse)
            yield response.follow('https://tabletka.by/pharmacies?region=38&page=' + str(self.page_pharmacies) + '&sort=name&sorttype=asc', callback=self.parse)

    def parse_medicines(self, response):

        self.page_medicines = 1 # обнуление счетчика

        for low in response.css("tbody tr"):
            # перебор по названиям одной аптеки
            items = response.meta['items']
            name_of_medicine = low.css(".name.tooltip-info .tooltip-info-header a::text").get()

            # разделение на лекарства и остальное (разная структура так как у лекарств ссылка)
            a_or_t = low.css(".name.tooltip-info .capture::text").get().strip()
            if a_or_t:
                active_ingredient_or_type = a_or_t
            else:
                active_ingredient_or_type = low.css(".name.tooltip-info .capture a::text").get().strip()

            dosage_form = low.css(".form-title::text").get()
            prescribed = low.css(".form.tooltip-info .capture::text").get()
            name_of_manufacturer = low.css(".produce.tooltip-info .tooltip-info-header span a::text").get().strip()
            country_of_manufaturer = low.css(".produce.tooltip-info .capture::text").get().strip()
            price_of_medicine = low.css(".price-value::text").get().strip()

            items['name_of_medicine'] = name_of_medicine
            items['active_ingredient_or_type'] = active_ingredient_or_type
            items['dosage_form'] = dosage_form
            items['prescribed'] = prescribed
            items['name_of_manufacturer'] = name_of_manufacturer
            items['country_of_manufaturer'] = country_of_manufaturer
            items['price_of_medicine'] = price_of_medicine

            items['page'] = str(self.page_medicines)

            yield items

        # пагинация по лекарствам аптеки
        while self.page_medicines < 280 and not response.css(".table-pagination.last-page"):
            self.page_medicines += 1
            yield response.follow(response.urljoin('?page=') + str(self.page_medicines), callback = self.parse_medicines, meta=response.meta)

设置文件：

BOT_NAME = 'apteka'

SPIDER_MODULES = ['apteka.spiders']
NEWSPIDER_MODULE = 'apteka.spiders'

USER_AGENT = "it's need for my studying project"

ROBOTSTXT_OBEY = True

CONCURRENT_REQUESTS = 32

scrapy

来源：https://stackoverflow.com/questions/73908601/scrapy-parser-works-bad

1条答案

按热度按时间

8wtpewkr1#

好的，我用“解析”函数中的for循环解决了它，效果很好。也许我可以帮助别人

for i in range(1, 240):
                inner_link = urljoin('https://tabletka.by/', (row.css(".pharm-name .text-wrap a::attr(href)").get() + '?page=' + str(i)))
                yield response.follow(inner_link, callback=self.parse_medicines, meta={'items' : items})

这都是我的代码

import scrapy
from scrapy.utils.response import open_in_browser
from scrapy.shell import inspect_response
from ..items import AptekaItem
from urllib.request import Request
from urllib.parse import urljoin
from subprocess import call
from gc import callbacks

class AptSpider(scrapy.Spider):
    name = 'apt'
    allowed_domains = ['tabletka.by']
    #start_urls = ['https://tabletka.by/pharmacies?region=1006&page=1&sort=name&sorttype=asc']  # гродненская область
    start_urls = ['https://tabletka.by/pharmacies?region=38&page=1&sort=name&sorttype=asc']  # гродно
    page_pharmacies = 1 # номер страницы с аптеками
    page_medicines = 1 # номер страницы с лекарствами

    def __init__(self, page_pharmacies=1, page_medicines=1, *args,**kwargs):
        super(AptSpider, self).__init__(*args,**kwargs)
        self.page_pharmacies = page_pharmacies

    def parse(self, response):

        for row in (response.css("tbody tr")):
            # перебор по аптекам (на странице 20 штук)
            items = AptekaItem()
            name_of_pharmacy = row.css(".pharm-name .text-wrap a::text").get()
            location_of_pharmacy = row.css(".tooltip-info-header .text-wrap span::text").get()
            number_of_pharmacy = row.css(".phone.tooltip-info .tooltip-info-header .text-wrap a::text").get()

            items['name_of_pharmacy'] = name_of_pharmacy
            items['location_of_pharmacy'] = location_of_pharmacy
            items['number_of_pharmacy'] = number_of_pharmacy

            for i in range(1, 240):
                inner_link = urljoin('https://tabletka.by/', (row.css(".pharm-name .text-wrap a::attr(href)").get() + '?page=' + str(i)))
                yield response.follow(inner_link, callback=self.parse_medicines, meta={'items' : items})

        # пагинация по аптекам
        if self.page_pharmacies < 10:  #and not response.css(".table-pagination.last-page")
            self.page_pharmacies += 1
            #yield response.follow('https://tabletka.by/pharmacies?region=1006&page=' + str(self.page_pharmacies) + '&sort=name&sorttype=asc', callback=self.parse)
            yield response.follow('https://tabletka.by/pharmacies?region=38&page=' + str(self.page_pharmacies) + '&sort=name&sorttype=asc', callback=self.parse)

    def parse_medicines(self, response):

        for low in response.css("tbody tr"):
            # перебор по названиям одной аптеки
            items = response.meta['items']
            name_of_medicine = low.css(".name.tooltip-info .tooltip-info-header a::text").get()

            # разделение на лекарства и остальное (разная структура так как у лекарств ссылка)
            a_or_t = low.css(".name.tooltip-info .capture::text").get().strip()
            if a_or_t:
                active_ingredient_or_type = a_or_t
            else:
                active_ingredient_or_type = low.css(".name.tooltip-info .capture a::text").get().strip()

            dosage_form = low.css(".form-title::text").get()
            prescribed = low.css(".form.tooltip-info .capture::text").get()
            name_of_manufacturer = low.css(".produce.tooltip-info .tooltip-info-header span a::text").get().strip()
            country_of_manufaturer = low.css(".produce.tooltip-info .capture::text").get().strip()
            price_of_medicine = low.css(".price-value::text").get().strip()

            items['name_of_medicine'] = name_of_medicine
            items['active_ingredient_or_type'] = active_ingredient_or_type
            items['dosage_form'] = dosage_form
            items['prescribed'] = prescribed
            items['name_of_manufacturer'] = name_of_manufacturer
            items['country_of_manufaturer'] = country_of_manufaturer
            items['price_of_medicine'] = price_of_medicine

            items['page'] = str(self.page_medicines)

            yield items

赞(0）回复(0）举报 2022-11-09

我来回答

Scrapy解析器工作不正常

1条答案

相关问题

热门标签

最新问答