Scrapy: follow links and crawl them

fgw7neuy · posted 2022-11-09 · in: Other
Follow (0) | Answers (1) | Views (141)

I'm trying to get a spider to follow links with this code:

import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

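    # Render the JavaScript-driven glossary page with Selenium and queue one request per disease link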
    def start_requests(self):
        url = "https://www.mediktor.com/pt-br/glossario"
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(10)

        doencas = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        for doenca in doencas:
            url = doenca.get_attribute('href')
            yield scrapy.Request(url)
        driver.quit()

    def parse(self, response):
        urls = response.css(
            '.mdk-dictionary-list__glossary-item a::attr(href)')
        for url in urls:
            yield response.follow(url.get(), callback=self.parse_info)

    def parse_info(self, response):
        # Description paragraphs of the condition
        desc = response.css(
            'div.mdk-conclusion-detail__main-description p ::text').getall()
        yield {
            'desc': desc
        }
        # Name and related specialty of each condition block
        contents = response.css('div.page-glossary-detail__main-content')
        for content in contents:
            yield {
                'name': content.css(
                    'div.mdk-conclusion-detail__main-title ::text').get(default='').strip(),
                'espec': content.css(
                    'div.mdk-ui-list-item__text.mdc-list-item__text span::text').get(default='').strip()
            }

I'm able to get the links, but the part where it goes into each link and extracts the information I need isn't working, so a friend helped me come up with this code:

import requests

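# Extract the APP_API_AUTH_CODE string embedded in the site's vendor.js bundle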
def get_auth_code():
    url = "https://www.mediktor.com/vendor.js"
    response = requests.get(url)
    start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + len('APP_API_AUTH_CODE:"')
    end_index = response.text.index('"', start_index)
    return response.text[start_index:end_index]

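# Log in against the backoffice API to obtain a Bearer authToken and a deviceId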
def get_auth_token_and_device_id():
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()['authToken'], response.json()['deviceId']

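# Fetch the list of conclusionId values available for this device/session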
def get_conclusion_list(auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
    payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"" \
              ",\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"}"
    headers = {
        'accept': 'application/json, text/plain, */*',
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return [conclusionId['conclusionId'] for conclusionId in response.json()['conclusions']]

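# Fetch the detail JSON (description body included) for a single conclusionId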
def get_details(conclusionId, auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"," \
              "\"conclusionId\":\"" + conclusionId + "\"," \
              "\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
    headers = {
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.text

auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
    print(get_details(conclusion, auth_token, device_id))

It does fetch the JSON with the page items, but around iteration 230 of the loop it starts returning the following error and never gets past it:

{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
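That error suggests the authToken issued by the login call is only accepted for a limited time. A minimal sketch of one way around it, assuming the error body always looks like the JSON above and that calling get_auth_token_and_device_id() again simply issues a fresh token (get_details_with_retry is a name I made up, not part of the site's API):

import json
import time

def get_details_with_retry(conclusion_id, auth_token, device_id, max_retries=3):
    # Wrap get_details(): if the API answers with error code ME667
    # (expired token), log in again and retry with the fresh credentials.
    for _ in range(max_retries):
        raw = get_details(conclusion_id, auth_token, device_id)
        data = json.loads(raw)
        if data.get('error', {}).get('code') == 'ME667':
            auth_token, device_id = get_auth_token_and_device_id()
            time.sleep(1)
            continue
        return data, auth_token, device_id
    raise RuntimeError(f'still failing after {max_retries} retries: {raw}')

auth_token, device_id = get_auth_token_and_device_id()
details = []
for conclusion in get_conclusion_list(auth_token, device_id):
    detail, auth_token, device_id = get_details_with_retry(
        conclusion, auth_token, device_id)
    details.append(detail)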

What I want to do is write all of this to a file so I can check whether it picked up every item I need from the pages, and then keep a JSON containing only the information I need, instead of everything the site returns, which is what it's giving me now.
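Continuing the sketch above, dumping everything to a file first and then keeping only the wanted fields could look roughly like this; the keys pulled out of each detail response ('name' and 'description') are placeholders that have to be checked against the JSON the API actually returns:

import json

# Keep the raw responses first so nothing is lost.
with open('mediktor_raw.json', 'w', encoding='utf-8') as f:
    json.dump(details, f, ensure_ascii=False, indent=2)

# Then filter down to the fields of interest. 'name' and 'description' are
# placeholder keys: swap in the real field names from the conclusionDetail response.
filtered = [{'name': d.get('name'), 'desc': d.get('description')} for d in details]
with open('mediktor_filtrado.json', 'w', encoding='utf-8') as f:
    json.dump(filtered, f, ensure_ascii=False, indent=2)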

piok6c0g · Answer 1

After many sleepless nights I solved my problem. I'll leave it here in case it helps someone else.

import scrapy
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

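    # Render the glossary with Selenium, collect every disease link, then open each one and read its title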
    def parse(self, response):
        url = "https://www.mediktor.com/pt-br/glossario"
        option = Options()
        option.headless = True
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        time.sleep(10)

        el_links = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        urls = []
        nome_doenca = []

        for i in range(len(el_links)):
            urls.append(el_links[i].get_attribute('href'))

        for link in urls:
            driver.get(link)

            myElem = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH,
                                                "//div[@class='mdk-conclusion-detail__main-title']"
                                                )))
            nome_source = driver.find_element(By.XPATH,
                                              "//div[@class='mdk-conclusion-detail__main-title']"
                                              ).text

            nome_doenca.append(nome_source)

            driver.back()
        print(nome_doenca)
        driver.quit()

I just changed my code so that the extraction no longer relies on Scrapy and uses Selenium selectors instead.
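To also get this into a file, as asked for in the question, the list collected in parse() could simply be dumped at the end; a small sketch, assuming nome_doenca already holds everything needed (the description could be read the same way inside the per-link loop, reusing the mdk-conclusion-detail__main-description class from the question's spider, which I have not verified against the live page):

import json

# At the end of parse(), instead of just printing the list, write it out:
with open('doencas.json', 'w', encoding='utf-8') as f:
    json.dump({'doencas': nome_doenca}, f, ensure_ascii=False, indent=2)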
