scrapy Python网页抓取 - Web页面资源

k0pti3hp  于 2022-12-23  发布在  Python
关注(0)|答案(1)|浏览(105)

我试图抓取一个特定的网站,但数据是动态加载的。我发现数据存放在JSON文件中,但我无法获得网站上所有元素的列表,而我需要所有页面的数据。

  • 我怎样才能得到从数字开始的所有相似的json的列表?
  • 我怎么能用这样的逻辑通读所有的页面呢?

我不知道该用什么,我试过Scrapy,但等待页面加载太复杂了,我想知道beautifulsoup或其他是否有更快的响应。

编辑:添加Scrapy代码

  • 我用Scrapy写了这段代码,但是我不知道如何动态地从页面中获取所有的JSON
# https://www.fincaraiz.com.co/_next/data/build/proyecto-de-vivienda/altos-del-eden/el-eden/barranquilla/7109201.json?title=altos-del-eden&location1=el-eden&location2=barranquilla&code=7109201

import logging
import scrapy
from scrapy_playwright.page import PageMethod
import json

# scrapy crawl fincaraiz-home -O output-home.json
class PwspiderSpider(scrapy.Spider):
    """Scrape listing cards from a Playwright-rendered fincaraiz results page.

    A single request is issued for the Antioquia "venta" listing page. The
    page is rendered through scrapy-playwright, and ``parse`` yields one dict
    per child ``div`` of ``#listingContainer``.

    The request asks for the live Playwright page
    (``playwright_include_page=True``), so both ``parse`` and ``errback``
    close that page explicitly; otherwise every request would leave a
    browser page open.
    """

    name = "fincaraiz-home"
    base_url = "https://www.fincaraiz.com.co"
    # Prefix of the per-listing JSON documents (build_url + href + ".json").
    build_url = "https://www.fincaraiz.com.co/_next/data/build"

    def start_requests(self):
        """Yield the single Playwright-rendered request for the listing page."""
        yield scrapy.Request(
            "https://www.fincaraiz.com.co/finca-raiz/venta/antioquia",
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    # Wait until the pagination button labelled "1" exists.
                    # Alternative that was tried:
                    # 'div[id="listingContainer"]'.
                    PageMethod("wait_for_selector", 'button:has-text("1")')
                ],
            ),
            errback=self.errback,
        )

    async def parse(self, response):
        """Yield one dict of raw XPath extractions per listing card.

        Every value is the list returned by ``.extract()``, so a field that
        is absent from a card comes back as an empty list.
        """
        # The rendered HTML is already in ``response``; release the browser
        # page right away so it is not leaked.
        page = response.meta.get("playwright_page")
        if page is not None:
            await page.close()

        # NOTE: an earlier draft followed each card's href to
        # build_url + href + ".json" with a ``parse_json`` callback that
        # yielded ``json.loads(response.text)``; that path is disabled.
        for anuncio in response.xpath("//div[@id='listingContainer']/div"):
            yield {
                "link": anuncio.xpath("article/a/@href").extract(),
                "tipo_anuncio": anuncio.xpath("article/a/ul/li[1]/div/span/text()").extract(),
                "tipo_vendedor": anuncio.xpath("article/a/ul/li[2]/div/span/text()").extract(),
                "valor": anuncio.xpath("article/a/div/section/div[1]/span[1]/b/text()").extract(),
                "area": anuncio.xpath("article/a/div/section/div[2]/span[1]/text()").extract(),
                "habitaciones": anuncio.xpath("article/a/div/section/div[2]/span[3]/text()").extract(),
                "banos": anuncio.xpath("article/a/div/section/div[2]/span[5]/text()").extract(),
                "parqueadero": anuncio.xpath("article/a/div/section/div[2]/span[7]/text()").extract(),
                "ubicacion": anuncio.xpath("article/a/div/section/div[3]/div/span/text()").extract(),
                "imagen": anuncio.xpath("article/a/figure/img/@src").extract(),
                "tipo_inmueble": anuncio.xpath("article/a/div/footer/div/span/b/text()").extract(),
                # No text() step here: this extracts the serialized <div>
                # elements themselves.
                "inmobiliaria": anuncio.xpath("article/a/div/footer/div/div/div").extract(),
            }

    async def errback(self, failure):
        """Close the Playwright page (if one was attached) and log the failure."""
        # A failed request may still carry an open page in its meta.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
        logging.info(
            "Handling failure in errback, request=%r, exception=%r", failure.request, failure.value
        )
5w9g7ksd

5w9g7ksd1#

在这个网站上使用playwright不是正确的方法,你应该使用他们的公共搜索API
下面是一个示例,说明如何向API发出POST请求,并非常快速地获得json响应中的所有信息。

import scrapy
import json

# Request body for the fincaraiz listing-search API, as sent by the spider.
# "filter" selects what to search for; "fields" lists the attributes to
# return and carries the paging values ("limit" / "offset").
payload = {
    "filter": {
        "offer": {"slug": ["sell"]},
        "property_type": {"slug": ["apartment"]},
        "locations": {
            "cities": {"slug": ["colombia-atlántico-5700003-barranquilla"]},
        },
    },
    "fields": {
        "exclude": [],
        "facets": [],
        "include": [
            "area",
            "baths.id",
            "baths.name",
            "baths.slug",
            "client.client_type",
            "client.company_name",
            "client.first_name",
            "client.fr_client_id",
            "client.last_name",
            "client.logo.full_size",
            "garages.name",
            "is_new",
            "locations.cities.fr_place_id",
            "locations.cities.name",
            "locations.cities.slug",
            "locations.countries.fr_place_id",
            "locations.countries.name",
            "locations.countries.slug",
            "locations.groups.name",
            "locations.groups.slug",
            "locations.groups.subgroups.name",
            "locations.groups.subgroups.slug",
            "locations.neighbourhoods.fr_place_id",
            "locations.neighbourhoods.name",
            "locations.neighbourhoods.slug",
            "locations.states.fr_place_id",
            "locations.states.name",
            "locations.states.slug",
            "locations.location_point",
            "max_area",
            "max_price",
            "media.photos.list.image.full_size",
            "media.photos.list.is_main",
            "media.videos.list.is_main",
            "media.videos.list.video",
            "media.logo.full_size",
            "min_area",
            "min_price",
            "offer.name",
            "price",
            "products.configuration.tag_id",
            "products.configuration.tag_name",
            "products.label",
            "products.name",
            "products.slug",
            "property_id",
            "property_type.name",
            "fr_property_id",
            "fr_parent_property_id",
            "rooms.id",
            "rooms.name",
            "rooms.slug",
            "stratum.name",
            "title",
        ],
        "limit": 25,
        "offset": 0,
        "ordering": [],
        "platform": 40,
        "with_algorithm": True,
    },
}

class PwspiderSpider(scrapy.Spider):
    """Page through the fincaraiz listing-search API with plain POST requests.

    Each request sends the module-level ``payload`` as a JSON body with a
    different ``fields.offset``; ``parse`` yields every entry found under
    ``hits.hits`` in the JSON response.
    """

    name = "fincaraiz-home"
    # Number of result pages to request. Kept as a class attribute so it can
    # be overridden (for example in a subclass) without editing the loop.
    max_pages = 20

    def start_requests(self):
        """Yield one POST request per result page."""
        # Step the offset by the same page size the API is asked for, rather
        # than repeating the number in two places.
        page_size = payload["fields"]["limit"]
        # int() tolerates the value having been supplied as a string.
        for page_number in range(int(self.max_pages)):
            # Build a per-request copy instead of mutating the shared
            # module-level payload. "offset" already exists in the dict, so
            # the key order (and therefore the JSON text) is unchanged.
            fields = {**payload["fields"], "offset": page_number * page_size}
            body = json.dumps({**payload, "fields": fields})
            yield scrapy.Request(
                "https://api.fincaraiz.com.co/document/api/1.0/listing/search",
                method="POST",
                body=body,
                headers={"content-type": "application/json"},
            )

    def parse(self, response):
        """Yield ``{"item": hit}`` for every hit in the JSON response.

        Raises ``KeyError`` if the response has no ``hits.hits`` entry.
        """
        data = response.json()
        for item in data["hits"]["hits"]:
            yield {"item": item}

这段代码在大约3秒内请求20个页面,每个页面有25个结果;它生成的每个项目都包含了你试图用playwright提取的所有信息,看起来像这样。

2022-12-21 17:38:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.fincaraiz.com.co/document/api/1.0/listing/search>
{'item': {'_index': 'fr-site-listing', '_type': '_doc', '_id': 'ac5fe39b-fb51-4702-a248-c23cf358ea17', '_score': 49.7314, '_source': {'listing': {'area': '53.0', 'rooms': {'name': '3', 'id': 3, 'slug': 'ROOM_3'},
 'max_area': '0', 'is_new': True, 'media': {'logo': {'full_size': 'https://s3.amazonaws.com/imagenes.fincaraiz.com.co/FC_COL/2021/11/19/6857997/proyecto-nuevo-venta-atlantico-barranquilla-501212856_m.jpg'}, 'vide
os': [{'list': [{'is_main': False, 'video': 'https://www.youtube.com/embed/NZMDh5SQy4w'}]}], 'photos': [{'list': [{'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18
/3427045_252_14.jpg'}, 'is_main': True}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_192_15.jpg'}, 'is_main': False}, {'image': {'full_size': 'https:
//s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_710_19.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427
045_825_17.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_584_16.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3
.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_311_18.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_
825_21.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_512_20.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.ama
zonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_855_22.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_437_
12.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_268_13.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazona
ws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_232_11.jpg'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_516_9.jp
g'}, 'is_main': False}, {'image': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincaraiz.com.co/OVFR_COL/2022/1/18/3427045_319_10.jpg'}, 'is_main': False}]}]}, 'title': 'Barloa  Trivento', 'property_id':
'ac5fe39b-fb51-4702-a248-c23cf358ea17', 'stratum': {'name': 'Estrato 2'}, 'offer': [{'name': 'Venta'}], 'garages': {'name': 'Sin especificar'}, 'baths': {'name': '1', 'id': 1, 'slug': 'BATH_1'}, 'max_price': '0',
 'min_price': '0', 'fr_parent_property_id': 6857997, 'price': '127500000.0', 'min_area': '0', 'client': {'company_name': 'CONSTRUCTORA BOLIVAR', 'logo': {'full_size': 'https://s3.amazonaws.com/imagenesprof.fincar
aiz.com.co/OVFR_COL/2015/11/18/201511181089RHXMDSJYOFULAQGWMCRIXODTJZOFULBQHWN.jpg'}, 'last_name': '', 'client_type': 'BUILDER', 'first_name': '', 'fr_client_id': 28249}, 'property_type': [{'name': 'Apartamento'}
], 'locations': {'neighbourhoods': [{'fr_place_id': 0, 'name': 'Caribe Verde', 'slug': 'colombia-atlantico-barranquilla-0-caribe-verde'}, {'fr_place_id': 0, 'name': 'A.s.d.', 'slug': ['neighbourhood-colombia-08-0
01-000190']}], 'cities': [{'fr_place_id': 5700003, 'name': 'Barranquilla', 'slug': 'colombia-atlántico-5700003-barranquilla'}, {'fr_place_id': 5700003, 'name': 'Barranquilla', 'slug': ['city-colombia-08-001', 'co
lombia-atlántico-5700003-barranquilla']}], 'location_point': 'POINT (-74.8502426147461 10.95703411102295)', 'groups': [{'subgroups': {'name': 'Zona Sur Occidente', 'slug': 'colombia-atlantico-barranquilla-5700104
-zona-sur-occidente'}, 'name': 'Zonas', 'slug': 'zonas'}], 'countries': [{'fr_place_id': 1, 'name': 'Colombia', 'slug': 'colombia'}, {'fr_place_id': 1, 'name': 'Colombia', 'slug': ['country-48-colombia', 'colombi
a']}], 'states': [{'fr_place_id': 57, 'name': 'Atlántico', 'slug': 'colombia-atlántico'}, {'fr_place_id': 57, 'name': 'Atlantico', 'slug': ['state-colombia-08-atlantico', 'colombia-atlántico']}]}, 'fr_property_id
': 6858014}}}}

相关问题