import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import numpy as np
from time import sleep
from random import randint
# Filesystem location of the chromedriver binary handed to Selenium.
PATH = "/Users/miguelcorredor/Desktop/cnn/chromedriver"
# This assignment had been fused into the trailing comment of the PATH line,
# which left ``driver`` undefined for the code that calls ``driver.get``.
# NOTE(review): newer Selenium releases may expect a Service object instead of
# a positional driver path -- confirm against the installed version.
driver = webdriver.Chrome(PATH)
i = 0  # index into ``words`` (list number)
# Search terms, already '+'-joined so they can be appended to a ``?term=`` URL.
words = ["RNA+Delivery+vehicles", "animal+models+for+rare+genetic+diseases", "invitro+models+for+rare+genetic+diseases","genes","muscle"]
# Example search URL for one of the terms above.
link = 'https://pubmed.ncbi.nlm.nih.gov/?term=animal+models+for+rare+genetic+diseases'
# Page numbers 1 through 4 (np.arange excludes the stop value 5). This
# assignment had likewise been swallowed by the comment on the ``link`` line.
pages = np.arange(1, 5, 1)
class umpSpider(scrapy.Spider):
    """Scrapy spider that collects article summaries from PubMed search pages.

    Crawling begins at ``start_urls``. ``parse`` extracts the result records
    from each response and then schedules one search per entry of the
    module-level ``words`` list.

    The per-term loop used to sit in the class body, where it ran once at
    class-definition time, blocked on ``sleep`` and loaded pages through
    Selenium whose content never reached Scrapy. It now lives in ``parse``.
    """

    name = "pubmed"
    start_urls = ["https://pubmed.ncbi.nlm.nih.gov/?term=structural+Visualization+of+LNP"]
    # Throttle the crawl through Scrapy itself; this stands in for the blocking
    # ``sleep(randint(2, 10))`` that the class-body loop used to perform.
    custom_settings = {"DOWNLOAD_DELAY": 2}

    def parse(self, response):
        """Yield one record per search hit, then queue the per-term searches.

        Args:
            response: Scrapy response for a PubMed search-results page.

        Yields:
            A dict with the keys ``'tittle'``, ``'Author'`` and ``'year'`` for
            every ``div.docsum-content`` element (a value is ``None`` when its
            selector matches nothing), followed by one follow-up request per
            term in ``words``.
        """
        for product in response.css('div.docsum-content'):
            yield {
                'tittle': product.css('a.docsum-title::text').get(),
                'Author': product.css('span.docsum-authors.short-authors::text').get(),
                'year': product.css('span.docsum-journal-citation.short-journal-citation::text').get(),
            }

        # ``next_page`` was previously referenced without ever being assigned.
        # Build it from each search term instead. Every response re-yields the
        # same URLs; Scrapy's default duplicate filter drops repeats, so each
        # term is fetched only once.
        for term in words:
            next_page = "https://pubmed.ncbi.nlm.nih.gov/?term=" + str(term)
            yield response.follow(next_page, callback=self.parse)
1条答案
按热度按时间e4eetjau1#
我意识到我写的循环位置不对:它不应该放在类体里,而必须放到 `def parse` 方法中执行。