我的任务遇到了一个问题:我要从网站 https://world.openfoodfacts.org/ 中提取食品数据。一切都很顺利,直到需要在页面之间循环的阶段——我想逐个访问每种食品的详情页。最初,我成功地从单个页面提取了所有食品数据并写入了CSV文件,但在尝试逐页提取数据时遇到了问题。这是我第一次使用Python和BeautifulSoup编程,希望你能给我一些改进下面代码的建议。谢谢!
from urllib.parse import urljoin

import openpyxl
import requests
from bs4 import BeautifulSoup
# excel = openpyxl.Workbook()
# sheet = excel.active
# sheet.title = "Open food facts"
# sheet.append(['Name', 'Barcode', 'Common Name', 'Quantity', 'Packaging', 'Categories', 'Labels, certifications, awards',
# 'Stores', 'Countries where sold', 'Nutri_Score_Description', 'NOVA_Description', 'ECO_Score_Description',
# 'ingredient', 'allergen', 'fat_in_quantity', 'saturated_fat_in_quantity', 'sugar_in_quantity', 'salt_in_quantity'])
baseurl = "https://world.openfoodfacts.org/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
productlinks = []
for x in range(2, 3):
r = requests.get(f'https://world.openfoodfacts.org/{x}')
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('li', class_='list_product_a')
for item in productlist:
for link in item.find_all('a', href=True):
productlinks.append(baseurl + link['href'])
food_data = []
for link in productlinks:
r = requests.get(link, headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
try:
item_name = soup.find('h2', class_='title-1').text.strip()
except:
item_name = "No names"
try:
common_name = soup.find('span', class_='field_value').text.strip()
except:
common_name = "No common names"
try:
barcode = soup.find('span', id='barcode').text.strip()
except:
barcode = "No barcodes"
try:
qty = soup.find('span', id='field_quantity_value').text.strip()
except:
qty = "No Quantitys"
try:
list_packaging = soup.find_all('span', id='field_packaging_value')
packaging_list = [package.text.strip() for package in list_packaging]
packaging = ', '.join(packaging_list) if packaging_list else 'No Packages'
except:
packaging = 'No Packages'
try:
list_categories = soup.find_all('span', id='field_categories_value')
category_list = [category.text.strip() for category in list_categories]
categories = ', '.join(category_list) if category_list else 'No Categories'
except:
categories = 'No Categories'
try:
list_labels = soup.find_all('span', id='field_labels_value')
label_list = [label.text.strip() for label in list_labels]
labels = ', '.join(label_list) if label_list else 'No Labels, certifications, awards'
except:
labels = 'No Labels, certifications, awards'
try:
list_stores = soup.find_all('span', id='field_stores_value')
store_list = [store.text.strip() for store in list_stores]
stores = ', '.join(store_list) if store_list else 'No Stores'
except:
stores = 'No Stores'
try:
list_countries = soup.find_all('span', id='field_countries_value')
country_list = [country.text.strip() for country in list_countries]
countries = ', '.join(country_list) if country_list else 'No Countries'
except:
countries = 'No Countries'
try:
a_element = soup.find('a', href="#panel_nutriscore_content")
Nutri_Score_Description = a_element.find('h4').text.strip()
except:
Nutri_Score_Description = "NO Nutri Score Description Available"
try:
a_element = soup.find('a', href="#panel_nova_content")
NOVA_Description = a_element.find('h4').text.strip()
except:
NOVA_Description = "NO NOVA Description Available"
try:
a_element = soup.find('a', href="#panel_ecoscore_content")
ECO_Score_Description = a_element.find('h4').text.strip()
except:
ECO_Score_Description = "NO ECO Score Description Available"
try:
ingredient = soup.find('div', id='panel_ingredients_content').find('div', class_='panel_text').get_text(strip=True)
except:
ingredient = 'No Ingredients'
try:
allergen = soup.find('div', id='panel_ingredients_content').find('strong', text='Allergens:').next_sibling.strip()
except:
allergen = 'No Allergen'
try:
a_element = soup.find('a', href="#panel_nutrient_level_fat_content")
fat_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
except:
fat_in_quantity = "No Fat In Quantitys"
try:
a_element = soup.find('a', href="#panel_nutrient_level_saturated-fat_content")
saturated_fat_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
except:
saturated_fat_in_quantity = "No Saturated Fat In Quantitys"
try:
a_element = soup.find('a', href="#panel_nutrient_level_sugars_content")
sugar_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
except:
sugar_in_quantity = "No Sugar In Quantitys"
try:
a_element = soup.find('a', href="#panel_nutrient_level_salt_content")
salt_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
except:
salt_in_quantity = "No Salt In Quantitys"
food = {
'Name': item_name,
'Barcode': barcode,
'Common Name': common_name,
'Quantity': qty,
'Packaging': packaging,
'Categories': categories,
'Labels, certifications, awards':labels,
'Stores': stores,
'Countries where sold': countries,
'Nutri_Score_Description': Nutri_Score_Description,
'NOVA_Description': NOVA_Description,
'ECO_Score_Description':ECO_Score_Description,
'ingredient': ingredient,
'allergen': allergen,
'fat_in_quantity': fat_in_quantity,
'saturated_fat_in_quantity': saturated_fat_in_quantity,
'sugar_in_quantity': sugar_in_quantity,
'salt_in_quantity':salt_in_quantity
}
print(item_name, barcode, common_name, qty, packaging, categories, labels,
stores, countries, Nutri_Score_Description, NOVA_Description, ECO_Score_Description,
ingredient, allergen, fat_in_quantity, saturated_fat_in_quantity, sugar_in_quantity, salt_in_quantity)
# sheet.append([item_name, barcode, common_name, qty, packaging, categories, labels,
# stores, countries, Nutri_Score_Description, NOVA_Description, ECO_Score_Description,
# ingredient, allergen, fat_in_quantity, saturated_fat_in_quantity, sugar_in_quantity, salt_in_quantity])
# excel.save('openfoodfact.xlsx')
以上就是我的完整代码。
起初,我怀疑问题出在 baseurl 上:因为 baseurl 和每个食品条目里的链接可能带有相同的前缀,所以我试着把它去掉,以为这样就能得到所有食品链接。请看下面的代码:
productlinks = []
for x in range(2, 3):
r = requests.get(f'https://world.openfoodfacts.org/{x}')
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('li', class_='list_product_a')
for item in productlist:
for link in item.find_all('a', href=True):
productlinks.append(baseurl + link['href'])
我把它改成了:
productlinks = []
for x in range(2, 3):
r = requests.get(f'https://world.openfoodfacts.org/{x}')
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('li', class_='list_product_a')
for item in productlist:
for link in item.find_all('a', href=True):
productlinks.append(link['href'])
但是VSCode终端中的打印结果仍然是空的。我对编程世界真的很陌生,请帮帮我。
1 条回答
稍微调试一下就会发现,你没有仔细检查返回的HTML,这很可能就是你什么都找不到的原因。
另外你还应该知道,`range(2, 3)` 只会产生数字 2,因为 range 的上界是不包含的(相当于半开区间 [2, 3))。
下面这段最小可用的代码能取回一些数据,也许你可以在它的基础上继续构建。