It looks like the ScraperAPIClient requires you to use its specific syntax, client.scrapyGet(url=...), for every request. But because you are using a CrawlSpider with a link extractor, Scrapy sends its follow-up requests out in the usual way, so those requests get blocked. Your best bet is to extract all of the links yourself and then filter the ones you want to follow. For example:

import scrapy
from scraper_api import ScraperAPIClient

client = ScraperAPIClient("67e5e7755771b9abf8062e595dd5cc2a")  # <- your api key
class Sip2Spider(scrapy.Spider):
    name = 'sip2'
    domain = 'https://www.homeadvisor.com'
    start_urls = [client.scrapyGet(url='https://www.homeadvisor.com/c.Additions-Remodeling.Atlanta.GA.-12001.html')]

    def parse(self, response):
        print(response)
        # prefix relative hrefs with the site domain; keep absolute urls as-is
        links = [self.domain + i if not i.startswith('https://') else i for i in response.xpath("//a/@href").getall()]
        yield {"links": list(set(links))}

This will yield:

The actual output is almost 400 links...

Then you can use some kind of filtering to decide which of those links you want to follow, and follow them using the same API SDK syntax. Applying a filtering step will also reduce the number of requests sent, which saves API calls, which saves you money. For example:

    def parse(self, response):
        print(response)
        links = [self.domain + i if not i.startswith('https://') else i for i in response.xpath("//a/@href").getall()]
        yield {"links": list(set(links))}
        # some filtering process
        for link in links:
            yield scrapy.Request(client.scrapyGet(url=link))
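
The snippet above leaves the filter as a placeholder comment. One concrete sketch, replacing the loop above (the '/rated' substring test is an assumption based on this site's company-profile URLs, the same pattern the CrawlSpider rule below matches):

        # hypothetical filter: follow only company-profile pages,
        # which on this site contain '/rated' in the URL
        for link in set(links):
            if '/rated' in link:
                yield scrapy.Request(client.scrapyGet(url=link))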

UPDATE: Try this...

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlencode

APIKEY = "67e5e7755771b9abf8062e595dd5cc2a"  # <- your api key
APIDOMAIN = "http://api.scraperapi.com/"
DOMAIN = 'https://www.homeadvisor.com/'

def get_scraperapi_url(url):
    # wrap a target url in a ScraperAPI proxy url
    payload = {'api_key': APIKEY, 'url': url}
    proxy_url = APIDOMAIN + '?' + urlencode(payload)
    return proxy_url
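
# For illustration (path and key shortened, my own example url), get_scraperapi_url turns
#   'https://www.homeadvisor.com/rated.SomeCompany.12345.html'
# into something like
#   'http://api.scraperapi.com/?api_key=...&url=https%3A%2F%2Fwww.homeadvisor.com%2Frated.SomeCompany.12345.html'
# so every request the spider sends actually goes through the proxy.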
def process_links(links):
    # rewrite each extracted link back to a clean site url,
    # then wrap it in the ScraperAPI proxy url
    for link in links:
        i = link.url.index('rated')
        link.url = DOMAIN + link.url[i:]
        link.url = get_scraperapi_url(link.url)
    return links
class Sip2Spider(CrawlSpider):
    name = 'sip2'
    domain = 'https://www.homeadvisor.com'
    start_urls = [get_scraperapi_url('https://www.homeadvisor.com/c.Additions-Remodeling.Atlanta.GA.-12001.html')]
    rules = [
        Rule(LinkExtractor(allow="/rated"), callback="parse_page", follow=True, process_links=process_links)
    ]

    def parse_page(self, response):
        company_name = response.xpath("//h1[contains(@class,'@w-full @text-3xl')]/text()").get()
        yield {
            "company_name": company_name
        }
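
One caveat in process_links: str.index raises ValueError if 'rated' is ever missing from a link. The allow="/rated" pattern should prevent that, but a defensive variant (just a sketch, not the only way to do it) could drop such links instead:

def process_links(links):
    kept = []
    for link in links:
        i = link.url.find('rated')  # find returns -1 instead of raising
        if i == -1:
            continue  # skip anything without 'rated' in the url
        link.url = get_scraperapi_url(DOMAIN + link.url[i:])
        kept.append(link)
    return kept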
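
Either version runs like any other Scrapy spider; the feed file name here is just an example (on recent Scrapy versions -O overwrites the output file, -o appends):

scrapy crawl sip2 -O links.json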