csv 如何在Python中使用BeautifulSoup从每个页面中提取数据?

2cmtqfgy  于 2023-07-31  发布在  Python
关注(0)|答案(1)|浏览(95)

我的任务有一个问题,那就是从网站https://world.openfoodfacts.org/中提取食物数据。一切都很顺利,直到页面之间循环的阶段。我想循环查看每种食物的数据。最初,我设法从一个页面中提取所有食物数据并将其打印到CSV文件中,但在尝试从每个页面中提取数据时遇到了问题。这是我第一次使用Python和BeautifulSoup编程。我希望你能给我一些建议来改进我下面的代码。- 谢谢-谢谢

import requests
import openpyxl`
your text`
from bs4 import BeautifulSoup

# excel = openpyxl.Workbook()
# sheet = excel.active
# sheet.title = "Open food facts"

# sheet.append(['Name', 'Barcode', 'Common Name', 'Quantity', 'Packaging', 'Categories', 'Labels, certifications, awards',
#               'Stores', 'Countries where sold', 'Nutri_Score_Description', 'NOVA_Description', 'ECO_Score_Description',
#               'ingredient', 'allergen', 'fat_in_quantity', 'saturated_fat_in_quantity', 'sugar_in_quantity', 'salt_in_quantity'])

baseurl = "https://world.openfoodfacts.org/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

productlinks = []

for x in range(2, 3):
    r = requests.get(f'https://world.openfoodfacts.org/{x}')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('li', class_='list_product_a')
    for item in productlist:
        for link in item.find_all('a', href=True):
            productlinks.append(baseurl + link['href'])

food_data = []
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')

    try:
        item_name = soup.find('h2', class_='title-1').text.strip()
    except:
        item_name = "No names"

    try:
        common_name = soup.find('span', class_='field_value').text.strip()
    except:
        common_name = "No common names"

    try:
        barcode = soup.find('span', id='barcode').text.strip()
    except:
        barcode = "No barcodes"

    try:
        qty = soup.find('span', id='field_quantity_value').text.strip()
    except:
        qty = "No Quantitys"

    try:
        list_packaging = soup.find_all('span', id='field_packaging_value')
        packaging_list = [package.text.strip() for package in list_packaging]
        packaging = ', '.join(packaging_list) if packaging_list else 'No Packages'
    except:
        packaging = 'No Packages'

    try:
        list_categories = soup.find_all('span', id='field_categories_value')
        category_list = [category.text.strip() for category in list_categories]
        categories = ', '.join(category_list) if category_list else 'No Categories'
    except:
        categories = 'No Categories'

    try:
        list_labels = soup.find_all('span', id='field_labels_value')
        label_list = [label.text.strip() for label in list_labels]
        labels = ', '.join(label_list) if label_list else 'No Labels, certifications, awards'
    except:
        labels = 'No Labels, certifications, awards'

    try:
        list_stores = soup.find_all('span', id='field_stores_value')
        store_list = [store.text.strip() for store in list_stores]
        stores = ', '.join(store_list) if store_list else 'No Stores'
    except:
        stores = 'No Stores'

    try:
        list_countries = soup.find_all('span', id='field_countries_value')
        country_list = [country.text.strip() for country in list_countries]
        countries = ', '.join(country_list) if country_list else 'No Countries'
    except:
        countries = 'No Countries'

    try:
        a_element = soup.find('a', href="#panel_nutriscore_content")
        Nutri_Score_Description = a_element.find('h4').text.strip()
    except:
        Nutri_Score_Description = "NO Nutri Score Description Available"

    try:
        a_element = soup.find('a', href="#panel_nova_content")
        NOVA_Description = a_element.find('h4').text.strip()
    except:
        NOVA_Description = "NO NOVA Description Available"

    try:
        a_element = soup.find('a', href="#panel_ecoscore_content")
        ECO_Score_Description = a_element.find('h4').text.strip()
    except:
        ECO_Score_Description = "NO ECO Score Description Available"

    try:
        ingredient = soup.find('div', id='panel_ingredients_content').find('div', class_='panel_text').get_text(strip=True)
    except:
        ingredient = 'No Ingredients'

    try:
        allergen = soup.find('div', id='panel_ingredients_content').find('strong', text='Allergens:').next_sibling.strip()
    except:
        allergen = 'No Allergen'

    try:
        a_element = soup.find('a', href="#panel_nutrient_level_fat_content")
        fat_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
    except:
        fat_in_quantity = "No Fat In Quantitys"

    try:
        a_element = soup.find('a', href="#panel_nutrient_level_saturated-fat_content")
        saturated_fat_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
    except:
        saturated_fat_in_quantity = "No Saturated Fat In Quantitys"

    try:
        a_element = soup.find('a', href="#panel_nutrient_level_sugars_content")
        sugar_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
    except:
        sugar_in_quantity = "No Sugar In Quantitys"

    try:
        a_element = soup.find('a', href="#panel_nutrient_level_salt_content")
        salt_in_quantity = a_element.find('h4', class_='evaluation__title').text.strip()
    except:
        salt_in_quantity = "No Salt In Quantitys"

    food = {
        'Name': item_name,
        'Barcode': barcode,
        'Common Name': common_name,
        'Quantity': qty,
        'Packaging': packaging,
        'Categories': categories,
        'Labels, certifications, awards':labels,
        'Stores': stores,
        'Countries where sold': countries,
        'Nutri_Score_Description': Nutri_Score_Description,
        'NOVA_Description': NOVA_Description,
        'ECO_Score_Description':ECO_Score_Description,
        'ingredient': ingredient,
        'allergen': allergen,
        'fat_in_quantity': fat_in_quantity,
        'saturated_fat_in_quantity': saturated_fat_in_quantity,
        'sugar_in_quantity': sugar_in_quantity,
        'salt_in_quantity':salt_in_quantity
    }

    print(item_name, barcode, common_name, qty, packaging, categories, labels,
              stores, countries, Nutri_Score_Description, NOVA_Description, ECO_Score_Description,
              ingredient, allergen, fat_in_quantity, saturated_fat_in_quantity, sugar_in_quantity, salt_in_quantity)

    # sheet.append([item_name, barcode, common_name, qty, packaging, categories, labels,
    #           stores, countries, Nutri_Score_Description, NOVA_Description, ECO_Score_Description,
    #           ingredient, allergen, fat_in_quantity, saturated_fat_in_quantity, sugar_in_quantity, salt_in_quantity])
# excel.save('openfoodfact.xlsx')

字符串
起初,我试图删除baseurl,因为我认为baseurl和每个食品项目中的链接具有相同的前缀。所以,当我删除它,我想我会得到所有的食物链接。请看下面的代码

productlinks = []

for x in range(2, 3):
    r = requests.get(f'https://world.openfoodfacts.org/{x}')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('li', class_='list_product_a')
    for item in productlist:
        for link in item.find_all('a', href=True):
            productlinks.append(baseurl + link['href'])


致:

productlinks = []

for x in range(2, 3):
    r = requests.get(f'https://world.openfoodfacts.org/{x}')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('li', class_='list_product_a')
    for item in productlist:
        for link in item.find_all('a', href=True):
            productlinks.append(link['href'])


但是VSCode终端中的打印结果仍然是空的。我对编程世界真的很陌生。请帮帮我

sbtkgmzw

sbtkgmzw1#

少量的调试会表明您没有仔细检查返回的HTML,这可能是您找不到任何内容的原因。
您还应该知道in range(2,3)只会执行nr。2,因为最后一个数字是独占的。读作[2..3[
下面的stump返回一些数据,也许你可以用它来构建。

import requests
from bs4 import BeautifulSoup

productlinks = []

for x in (2,3,4):
    print("============== nr ", x)
    r = requests.get(f'https://world.openfoodfacts.org/{x}')
    soup = BeautifulSoup(r.content, 'lxml')
    #
    # to see returned HTML uncomment this
    # print(soup)

    # In this case, the li has no class, the ul above does.
    productlist = soup.find_all('ul', class_='products')
    for item in productlist:
        for link in item.find_all('a', href=True):
            productlinks.append(link['href'])
    print(productlinks)
    print("")

    # reset list in this example, to show different output
    # you may wish to leave out
    productlinks=[]

字符串
结果:

============== nr  2
['/product/5411188110835/alpro', '/product/3124480191182/oasis', '/pr..

============== nr  3
['/product/3480341000636/carre-frais-elle-vire', '/product/80895237/n....

============== nr  4
['/product/8410376009392/biscuits-fibra-integral-sin-azucares-gullon'...

相关问题