我正在从已发布的属性列表中抓取信息。当尝试将此数据导出到CSV文件时,我遇到一个错误,
'All arrays must be of the same length.'
字符串
这是因为并非所有属性都包含我提取的相同信息。
因此,列在所有行中的数据量并不相同。我需要帮助来处理这种情况并完成代码,以便将缺少某些信息的产品填充为空值。您可以帮助我吗?
# Instalar las bibliotecas necesarias si no están instaladas
!pip install requests
!pip install beautifulsoup4
!pip install pandas
# Importar las bibliotecas
import requests
from bs4 import BeautifulSoup
import pandas as pd
from lxml import etree
link = 'https://listado.mercadolibre.cl/inmuebles/departamentos/venta/propiedades-usadas/_NoIndex_True#applied_filter_id%3DOPERATION_SUBTYPE%26applied_filter_name%3DModalidad%26applied_filter_order%3D5%26applied_value_id%3D244562%26applied_value_name%3DPropiedades+usadas%26applied_value_order%3D1%26applied_value_results%3D68139%26is_custom%3Dfalse'
r = requests.get(link)
r.status_code
soup = BeautifulSoup(r.content, 'html.parser')
dom=etree.HTML(str(soup))
titulos = soup.find_all('h2', attrs={"class":"ui-search-item__title"})
titulos = [i.text for i in titulos]
urls = soup.find_all('div',attrs={"class":"ui-search-item__group__element ui-search-item__title-grid"})
urls = [item.find('a')['href'] for item in urls]
moneda = soup.find_all('span', attrs={"class":"andes-money-amount__currency-symbol"})
moneda = [i.text for i in moneda]
precio = soup.find_all('span', attrs={"class":"andes-money-amount__fraction"})
precio = [i.text for i in precio]
dormitorio = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "dormitorios")]')
dormitorio = [i.text for i in dormitorio]
baño = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "baño")]')
baño = [i.text for i in baño]
m2 = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "m²")]')
m2 = [i.text for i in m2]
locacion = soup.find_all('span',attrs={"class":"ui-search-item__location-label"})
locacion = [i.text for i in locacion]
vendedor = soup.find_all('div',attrs={"class":"ui-search-item__official-store-grid"})
vendedor = [i.text for i in vendedor]
vendedor
# df = pd.DataFrame({"titulos" :titulos, "urs" :urls})
# df.to_csv('Ejemplo')
siguiente = dom.xpath('//div[@class="ui-search-pagination"]/nav/ul/li[contains(@class,"--next")]/a')[0].get('href')
ini = soup.find('span',attrs={"class":"andes-pagination__link"}).text
ini = int(ini)
can = soup.find('li',attrs={"class":"andes-pagination__page-count"})
can = int(can.text.split(" ")[1])
lista_titulos = []
lista_urls = []
lista_dormitorio = []
lista_baño = []
lista_m2 = []
lista_moneda = []
lista_precio = []
lista_vendedor = []
lista_locacion = []
siguiente = link
while True:
r = requests.get(siguiente)
if r.status_code == 200:
soup = BeautifulSoup(r.content, 'html.parser')
#titulos
titulos = soup.find_all('h2', attrs={"class":"ui-search-item__title"})
titulos = [i.text for i in titulos]
lista_titulos.extend(titulos)
#urls
urls = soup.find_all('div',attrs={"class":"ui-search-item__group__element ui-search-item__title-grid"})
urls = [item.find('a')['href'] for item in urls]
lista_urls.extend(urls)
#moneda
moneda = soup.find_all('span', attrs={"class":"andes-money-amount__currency-symbol"})
moneda = [i.text for i in moneda]
lista_moneda.extend(moneda)
#precio
precio = soup.find_all('span', attrs={"class":"andes-money-amount__fraction"})
precio = [i.text for i in precio]
lista_precio.extend(precio)
#dormitorios
dormitorio = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "dormitorios")]')
dormitorio = [i.text for i in dormitorio]
lista_dormitorio.extend(dormitorio)
#baños
baño = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "baño")]')
baño = [i.text for i in baño]
lista_baño.extend(baño)
#m2
m2 = dom.xpath('//div[@class="ui-search-item__attributes-container-grid"]/ul/li[contains(normalize-space(.), "m²")]')
m2 = [i.text for i in m2]
lista_m2.extend(m2)
#vendedor
vendedor = soup.find_all('div',attrs={"class":"ui-search-item__official-store-grid"})
vendedor = [i.text for i in vendedor]
lista_vendedor.extend(vendedor)
#locacion
locacion = soup.find_all('span',attrs={"class":"ui-search-item__location-label"})
locacion = [i.text for i in locacion]
lista_locacion.extend(locacion)
#inicial
ini = soup.find('span',attrs={"class":"andes-pagination__link"}).text
ini = int(ini)
#cantidad
dom=etree.HTML(str(soup))
can = soup.find('li',attrs={"class":"andes-pagination__page-count"})
can = int(can.text.split(" ")[1])
else:
break
print(ini,can)
if ini==can:
break
siguiente = dom.xpath('//div[@class="ui-search-pagination"]/nav/ul/li[contains(@class,"--next")]/a')[0].get('href')
print(len(lista_titulos))
print(len(lista_urls))
print(len(lista_dormitorio))
print(len(lista_baño))
print(len(lista_m2))
print(len(lista_locacion))
print(len(lista_vendedor))
df = pd.DataFrame({"titulos" :lista_titulos, "url" :lista_urls, "moneda" :lista_moneda, "precio" :lista_precio, "dormitorios" :lista_dormitorio, "baños" :lista_baño, "m2" :lista_m2})
df.to_csv('Venta_Departamentos_Usados_MasRelevantes')
型
1条答案
按热度按时间8zzbczxx1#
尽量避免使用
lists
,而是使用更结构化的数据,如dict
或dicts
的list
-使用dict
的关键是键也可能完全丢失,然后pandas
简单地为它们设置nan
值:字符串
或者使用一个函数来获取像这样的属性:
型
示例
型
输出
| | URL|巴诺斯|dormitorios| M2| m2 |
| --|--|--|--|--|--|
| 0 |Hermoso Departamento A 700 Metros Del Casino De Viña|https://departamento.mercadolibre.cl/MLC-1434260233-hermoso-departamento-a-700-metros-del-casino-de-vina-_JM#position=1&search_layout=grid&type=item&tracking_id=ab99a7d7-b89b-4135-8126-78b27b99dbf6|2个婴儿|2 dormitorios| 63平方米套房|
| ......这是什么?||||||
| 47 |Remodelado Y Luminoso,Espectacular Vista|https://departamento.mercadolibre.cl/MLC-2070912132-remodelado-y-luminoso-espectacular-vista-_JM#position=48&search_layout=grid&type=item&tracking_id=ab99a7d7-b89b-4135-8126-78b27b99dbf6|2个婴儿|3间宿舍|99平方米套房|