pandas:如何用不一致的网页抓取数据创建 DataFrame?

wyyhbhjk  于 2023-11-15  发布在  其他
关注(0)|答案(1)|浏览(103)

我正在从已发布的房产列表中抓取信息。当尝试将这些数据导出到 CSV 文件时,我遇到了如下错误:

'All arrays must be of the same length.'

字符串
这是因为并非所有房产都包含我要提取的全部信息。
因此,各列的数据条数并不相同。我需要帮助来处理这种情况并完善代码,使缺少某些信息的房产在对应的列中填充为空值。您能帮帮我吗?

# Instalar las bibliotecas necesarias si no están instaladas
!pip install requests
!pip install beautifulsoup4
!pip install pandas

# Importar las bibliotecas
import requests
from bs4 import BeautifulSoup
import pandas as pd
from lxml import etree

# First page of the Mercado Libre search results that this notebook scrapes.
link = 'https://listado.mercadolibre.cl/inmuebles/departamentos/venta/propiedades-usadas/_NoIndex_True#applied_filter_id%3DOPERATION_SUBTYPE%26applied_filter_name%3DModalidad%26applied_filter_order%3D5%26applied_value_id%3D244562%26applied_value_name%3DPropiedades+usadas%26applied_value_order%3D1%26applied_value_results%3D68139%26is_custom%3Dfalse'

# Download the page; the bare expression only displays the HTTP status in a notebook.
r = requests.get(link)
r.status_code

# Parse the response twice: BeautifulSoup for class-based lookups,
# lxml for the XPath queries further down.
soup = BeautifulSoup(r.content, 'html.parser')
dom = etree.HTML(str(soup))

# Text of every listing title.
titulos = [tag.text for tag in soup.find_all('h2', attrs={"class": "ui-search-item__title"})]

# href of the first <a> inside every title container.
urls = [
    caja.find('a')['href']
    for caja in soup.find_all('div', attrs={"class": "ui-search-item__group__element ui-search-item__title-grid"})
]

# Currency symbols and price amounts, each collected page-wide.
moneda = [tag.text for tag in soup.find_all('span', attrs={"class": "andes-money-amount__currency-symbol"})]
precio = [tag.text for tag in soup.find_all('span', attrs={"class": "andes-money-amount__fraction"})]

# Attribute <li> items whose text mentions bedrooms, bathrooms or square metres.
# Each query is page-wide, so a listing without the attribute yields no entry.
dormitorio = [
    nodo.text
    for nodo in dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "dormitorios")]')
]
baño = [
    nodo.text
    for nodo in dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "baño")]')
]
m2 = [
    nodo.text
    for nodo in dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "m²")]')
]

# Location labels.
locacion = [tag.text for tag in soup.find_all('span', attrs={"class": "ui-search-item__location-label"})]

# Official-store blocks; the bare name displays the list in a notebook.
vendedor = [tag.text for tag in soup.find_all('div', attrs={"class": "ui-search-item__official-store-grid"})]
vendedor

# Disabled two-column export kept from an earlier experiment:
# df = pd.DataFrame({"titulos" :titulos, "urs" :urls})

# df.to_csv('Ejemplo')

# href of the "next" pagination link (raises IndexError when there is none).
siguiente = dom.xpath('//div[@class="ui-search-pagination"]/nav/ul/li[contains(@class,"--next")]/a')[0].get('href')

# Current page number, read from the first pagination link element.
ini = int(soup.find('span', attrs={"class": "andes-pagination__link"}).text)

# Page count: the second space-separated token of the page-count label.
can = int(soup.find('li', attrs={"class": "andes-pagination__page-count"}).text.split(" ")[1])

# Per-column accumulators, extended with the values found on each results page.
# NOTE: every column is gathered independently of the others, so a listing
# that lacks an attribute adds nothing to that column and the lists are not
# guaranteed to end up with the same length.
lista_titulos = []
lista_urls = []
lista_dormitorio = []
lista_baño = []
lista_m2 = []
lista_moneda = []
lista_precio = []
lista_vendedor = []
lista_locacion = []

# Start at the first results page and follow the "next" link until the current
# page number equals the page count, a request fails, or no "next" link exists.
siguiente = link
while True:
    r = requests.get(siguiente)
    if r.status_code != 200:
        # Stop paginating on the first non-OK response.
        break

    soup = BeautifulSoup(r.content, 'html.parser')
    # Build the lxml tree for THIS page before any XPath query runs.
    # Previously the tree was rebuilt only at the end of the iteration, so the
    # bedroom/bathroom/m² queries read the previous page's tree.
    dom = etree.HTML(str(soup))

    # titles
    titulos = soup.find_all('h2', attrs={"class":"ui-search-item__title"})
    titulos = [i.text for i in titulos]
    lista_titulos.extend(titulos)

    # urls
    urls = soup.find_all('div',attrs={"class":"ui-search-item__group__element ui-search-item__title-grid"})
    urls = [item.find('a')['href'] for item in urls]
    lista_urls.extend(urls)

    # currency
    moneda = soup.find_all('span', attrs={"class":"andes-money-amount__currency-symbol"})
    moneda = [i.text for i in moneda]
    lista_moneda.extend(moneda)

    # price
    precio = soup.find_all('span', attrs={"class":"andes-money-amount__fraction"})
    precio = [i.text for i in precio]
    lista_precio.extend(precio)

    # bedrooms
    dormitorio = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "dormitorios")]')
    dormitorio = [i.text for i in dormitorio]
    lista_dormitorio.extend(dormitorio)

    # bathrooms
    baño = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "baño")]')
    baño = [i.text for i in baño]
    lista_baño.extend(baño)

    # square metres
    m2 = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "m²")]')
    m2 = [i.text for i in m2]
    lista_m2.extend(m2)

    # seller (official store block)
    vendedor = soup.find_all('div',attrs={"class":"ui-search-item__official-store-grid"})
    vendedor = [i.text for i in vendedor]
    lista_vendedor.extend(vendedor)

    # location
    locacion = soup.find_all('span',attrs={"class":"ui-search-item__location-label"})
    locacion = [i.text for i in locacion]
    lista_locacion.extend(locacion)

    # current page number
    ini = soup.find('span',attrs={"class":"andes-pagination__link"}).text
    ini = int(ini)

    # page count: second space-separated token of the page-count label
    can = soup.find('li',attrs={"class":"andes-pagination__page-count"})
    can = int(can.text.split(" ")[1])

    print(ini,can)
    if ini==can:
        break

    # Follow the "next" link; stop cleanly instead of raising IndexError when
    # the page has no such link.
    enlaces_siguiente = dom.xpath('//div[@class="ui-search-pagination"]/nav/ul/li[contains(@class,"--next")]/a')
    if not enlaces_siguiente:
        break
    siguiente = enlaces_siguiente[0].get('href')

# Print how many values were collected for each of these columns, one per line.
for columna in (
    lista_titulos,
    lista_urls,
    lista_dormitorio,
    lista_baño,
    lista_m2,
    lista_locacion,
    lista_vendedor,
):
    print(len(columna))

# Assemble the table from the per-column lists and write it out as CSV.
# pandas requires all of these lists to have the same length.
columnas = {
    "titulos": lista_titulos,
    "url": lista_urls,
    "moneda": lista_moneda,
    "precio": lista_precio,
    "dormitorios": lista_dormitorio,
    "baños": lista_baño,
    "m2": lista_m2,
}
df = pd.DataFrame(columnas)
df.to_csv('Venta_Departamentos_Usados_MasRelevantes')

8zzbczxx

8zzbczxx1#

尽量避免使用多个相互独立的 list,而是使用更结构化的数据,例如 dict,或由 dict 组成的 list。使用 dict 的好处在于:某些键即使完全缺失也没有关系,pandas 会直接为它们填充 NaN 值:

data = []
# Build one record per result card so that every value in a row comes from the
# same listing. The loop header was missing, which left `e` undefined.
for e in soup.select('li.ui-search-layout__item'):
    data.append({
        'titulos': e.h2.text,
        'url':e.a.get('href'),
        # Each attribute is searched inside this card only; when the card has no
        # matching <li>, the value is None instead of being skipped.
        'banos':a.text if (a := e.select_one('li.ui-search-card-attributes__attribute:-soup-contains("baño")')) else None,
        'dormitorios':a.text if (a := e.select_one('li.ui-search-card-attributes__attribute:-soup-contains("dormitorio")')) else None,
        'm2':a.text if (a := e.select_one('li.ui-search-card-attributes__attribute:-soup-contains("m²")')) else None
    })

字符串
或者使用一个函数来获取像这样的属性:

def get_attributes(list_of_elements, list_of_attributes):
    """Map each attribute keyword to the text of an element that contains it.

    Every keyword in *list_of_attributes* is checked against the ``.text`` of
    every element in *list_of_elements*. A keyword that appears in no element
    is left out of the result; when several elements match, the last one wins.
    """
    found = {}
    for keyword in list_of_attributes:
        for element in list_of_elements:
            if keyword in element.text:
                found[keyword] = element.text
    return found

# One dict per result card; attribute keys that a card lacks are simply absent.
data = []

for item in soup.select('li.ui-search-layout__item'):
    # Title and link come from the card's first <h2> and first <a>.
    row = {'titulos': item.h2.text, 'url': item.a.get('href')}
    # Add whichever of the three attributes this card actually lists.
    attribute_items = item.select('li.ui-search-card-attributes__attribute')
    row.update(get_attributes(attribute_items, ['baño','m²','dormitorios']))
    data.append(row)

示例

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Search-results page to scrape.
link = 'https://listado.mercadolibre.cl/inmuebles/departamentos/venta/propiedades-usadas/_NoIndex_True#applied_filter_id%3DOPERATION_SUBTYPE%26applied_filter_name%3DModalidad%26applied_filter_order%3D5%26applied_value_id%3D244562%26applied_value_name%3DPropiedades+usadas%26applied_value_order%3D1%26applied_value_results%3D68139%26is_custom%3Dfalse'
# Download the page (the response status is not checked here).
r = requests.get(link)

# Parse the raw response body with the built-in HTML parser.
soup = BeautifulSoup(r.content, 'html.parser')

def get_attributes(list_of_elements, list_of_attributes):
    """Map each attribute keyword to the text of an element that contains it.

    Every keyword in *list_of_attributes* is checked against the ``.text`` of
    every element in *list_of_elements*. A keyword that appears in no element
    is left out of the result; when several elements match, the last one wins.
    """
    found = {}
    for keyword in list_of_attributes:
        for element in list_of_elements:
            if keyword in element.text:
                found[keyword] = element.text
    return found

# One dict per result card; attribute keys that a card lacks are simply absent.
data = []

for item in soup.select('li.ui-search-layout__item'):
    # Title and link come from the card's first <h2> and first <a>.
    row = {'titulos': item.h2.text, 'url': item.a.get('href')}
    # Add whichever of the three attributes this card actually lists.
    attribute_items = item.select('li.ui-search-card-attributes__attribute')
    row.update(get_attributes(attribute_items, ['baño','m²','dormitorios']))
    data.append(row)

# Build the frame from the list of dicts; as a bare expression it is only
# displayed when run in a notebook.
pd.DataFrame(data)

输出

| |titulos|url|baño|dormitorios|m²|
| --|--|--|--|--|--|
| 0 |Hermoso Departamento A 700 Metros Del Casino De Viña|https://departamento.mercadolibre.cl/MLC-1434260233-hermoso-departamento-a-700-metros-del-casino-de-vina-_JM#position=1&search_layout=grid&type=item&tracking_id=ab99a7d7-b89b-4135-8126-78b27b99dbf6|2 baños|2 dormitorios|63 m² útiles|
| ... |...|...|...|...|...|
| 47 |Remodelado Y Luminoso, Espectacular Vista|https://departamento.mercadolibre.cl/MLC-2070912132-remodelado-y-luminoso-espectacular-vista-_JM#position=48&search_layout=grid&type=item&tracking_id=ab99a7d7-b89b-4135-8126-78b27b99dbf6|2 baños|3 dormitorios|99 m² útiles|

相关问题