使用Scrapy抓取数据

cgfeq70w  于 2022-11-09  发布在  其他
关注(0)|答案(2)|浏览(204)
import requests
import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup

class TestSpider(scrapy.Spider):
    """Spider that walks the listing page in ``start_urls`` and prints a
    status lookup for every linked detail page."""

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    # One request at a time per domain, one second apart, with a browser-like UA.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Schedule a request for every link found inside ``td.icon_link``."""
        hrefs = response.xpath("//td[@class='icon_link']//a//@href").extract()
        yield from (
            Request(response.urljoin(href), callback=self.parse_book)
            for href in hrefs
        )

    def parse_book(self, response):
        """Print the result of the 'Status:' lookup once per ``line_list_K`` div."""
        rows = response.xpath("//div[@class='line_list_K']")
        for row in rows:
            # NOTE(review): the leading `//` makes this query start from the
            # document root instead of being relative to `row`, and `//div`
            # searches for a <div> *inside* the <span>. Presumably the value
            # <div> is a sibling of the span -- verify against the page markup.
            status_markup = row.xpath("//span[contains(text(), 'Status:')]//div").get()
            print(status_markup)

我想提取页面中 Status 和 Email 两个字段的数据,但上面的代码没有返回任何结果。页面链接:https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965

mnemlml8

mnemlml81#

请尝试:

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    """Spider that follows every detail link on the listing page in
    ``start_urls`` and yields the status, e-mail and URL of each detail page."""

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    # One request at a time per domain, one second apart, with a browser-like UA.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    def parse(self, response):
        """Yield one request per link found inside ``td.icon_link``.

        Each request is handled by :meth:`parse_book`.
        """
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            yield Request(response.urljoin(book), callback=self.parse_book)

    def parse_book(self, response):
        """Yield a dict with ``status``, ``email`` and ``url`` for a detail page.

        The e-mail address is stored on the page as two attributes
        (``data-ea`` and ``data-eb``) of the first ``address_e`` element and
        is reassembled here as ``<data-ea>@<data-eb>``.  When either part is
        absent, ``email`` is ``None`` and the item is still yielded, so the
        status is not lost (previously a bare ``except`` silently dropped the
        whole item).
        """
        first_address = '(//*[@class="address_e"])[1]'
        # .get() already returns None when nothing matches, so no extra guard
        # around the selector list is needed.
        local_part = response.xpath(first_address + '//@data-ea').get()
        domain = response.xpath(first_address + '//@data-eb').get()

        if local_part is not None and domain is not None:
            email = local_part + '@' + domain
        else:
            email = None

        yield {
            'status': response.xpath("//span[contains(text(), 'Status:')]/../div/text()").get(),
            'email': email,
            'url': response.url,
        }

if __name__ == "__main__":
    # CrawlerProcess takes optional settings, not a spider; the spider class
    # is handed to crawl(), and start() runs the crawl and blocks until done.
    process = CrawlerProcess()
    process.crawl(TestSpider)
    process.start()

下面的 XPath 表达式可以帮助您提取全部 5 行的数据,例如:

//span[contains(text(), 'Status:')]/../following-sibling::div[1]/div

//span[contains(text(), 'Status:')]/../following-sibling::div[1]/span
hpcdzsge

hpcdzsge2#

我将展示一个不使用 Scrapy 的例子,希望你能理解并把它应用到自己的代码中。唯一的难点是 email 被拆成两部分,分别保存在 data-ea 和 data-eb 两个属性中,需要用 @ 拼接起来。

import requests
from bs4 import BeautifulSoup
# Fetch one detail page and print its status and e-mail address.
url = "https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965"

# Without a timeout requests can wait forever on a stalled connection;
# raise_for_status() turns HTTP error responses into an exception instead of
# letting an error page be parsed as if it were the profile.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# The value for each label lives in the first <div> following the label <span>.
status = soup.find('span', string='Status:').find_next('div').get_text()

# The address is split across two attributes of the same <div>; look it up once.
email_div = soup.find('span', string='Email:').find_next('div')
data_ea = email_div.get('data-ea')
data_eb = email_div.get('data-eb')
email = f"{data_ea}@{data_eb}"

print(status, email)

输出:

Wykonujący zawód kancelaria@ablazewicz.pl

相关问题