python-3.x BeautifulSoup web scraping: UnboundLocalError: local variable 'soup' referenced before assignment

hgtggwj0 asked on 2023-05-19 in Python

I was trying web scraping with Beautiful Soup and requests by following a YouTube video. Everything went fine until I hit this error, even though the same code works for the tutor.

import requests
from bs4 import BeautifulSoup

def get_data(url):
    if not response.ok:  # 'response' is never assigned inside this function
        print('Server Responded: {}'.format(response.status_code))
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return(soup)  # UnboundLocalError: 'soup' is only assigned in the else branch

def get_detail_data(soup):
    try:
        title = soup.find('h1', id='itemTitle').text.strip()
    except:
        title = ''
        
    try:
        p = soup.find('span', id='prcIsum').text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    
    try:
        sold = soup.find('span', class_='vi-qtyS-hot-red').a.text.strip().split(' ')[0]
    except:
        sold = ''
    
    data = {
        'title' : title,
        'currency' : currency,
        'price' : price,
        'total units sold' : sold
    }

    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []

    
    urls = [item.get('href') for item in links]
    return urls

def main():
    url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'
    
    products = get_index_data(get_data(url))

    for link in products:
        data =  get_detail_data(get_data(link))

if __name__ == '__main__':
    main()

n53p2ov0 1#

In get_data you are missing the step that makes the actual request and stores the response. If response.ok is not True, you also need to assign soup = None. Finally, elsewhere in the code, you need to test whether soup is None before trying to call methods on it.
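For context, this is the general Python behavior that produces the error: a name assigned anywhere in a function body is treated as local to that function, so reading it on a path where the assignment never ran raises UnboundLocalError. A minimal reproduction (the function demo is purely illustrative):

def demo(flag):
    if flag:
        soup = 'assigned'  # 'soup' is only bound on this branch
    return soup            # read happens even when the branch above was skipped

demo(False)  # UnboundLocalError: local variable 'soup' referenced before assignment

With that fixed, the corrected script looks like this: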

import requests
from bs4 import BeautifulSoup

def get_data(url):
    
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})  # this line was missing
    
    if not response.ok:
        print('Server Responded: {}'.format(response.status_code))
        soup = None
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

def get_detail_data(soup):
    
    try:
        title = soup.find('h1', id='itemTitle').text.strip()
    except:
        title = ''
        
    try:
        p = soup.find('span', id='prcIsum').text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    
    try:
        sold = soup.find('span', class_='vi-qtyS-hot-red').a.text.strip().split(' ')[0]
    except:
        sold = ''
    
    data = {
        'title' : title,
        'currency' : currency,
        'price' : price,
        'total units sold' : sold
    }

    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []

    
    urls = [item.get('href') for item in links]
    return urls

def main():
    
    url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'
    soup = get_data(url)
    
    if soup is not None:
        
        products = get_index_data(soup)
        #print(products)

        for link in products:
            
            soup = get_data(link)
            
            if soup is not None:
                
                data = get_detail_data(soup)
                print(data)

if __name__ == '__main__':
    main()

vltsax25 2#

One of the reasons could be that the request is being blocked: the requests library identifies itself with the default user-agent python-requests, which websites can detect and block.
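As a quick check, you can print the headers that requests sends by default (requests.utils.default_headers() is part of the requests public API):

import requests

# The default user-agent announces the client as a script,
# e.g. 'python-requests/2.31.0', which anti-bot systems can key on.
print(requests.utils.default_headers()['User-Agent'])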
An extra step could be to rotate the user-agent, for example to switch between PC, mobile, and tablet devices, as well as between browsers such as Chrome, Firefox, Safari, and Edge; a sketch of this follows below.
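Here is a minimal sketch of such rotation; the pool of user-agent strings and the helper name get_with_random_ua are illustrative choices, not part of any library:

import random
import requests

# Illustrative pool; in practice keep a larger, regularly updated list
# covering desktop, mobile, and tablet builds of several browsers.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36",
]

def get_with_random_ua(url, **kwargs):
    # Pick a different browser identity for each request.
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, timeout=30, **kwargs)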
A while loop with non-token-based pagination can be used to collect data from every page of the site, regardless of the number of pages, as in the full example below.
You can check the code with pagination in an online IDE.

from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
}
   
params = {
    "_nkw": "mens shoes",    # search query 
    "_pgn": 1                # page number
}

data = []
limit = 5                 # page limit (if needed)

while True:
    page = requests.get("https://www.ebay.com/sch/i.html", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, "lxml")
   
    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text
        link = products.select_one(".s-item__link")["href"]
        try:
            total_sold = products.select_one(".s-item__quantitySold .BOLD").text
        except AttributeError:  # select_one returned None: no "sold" badge on this listing
            total_sold = None
        
        data.append({
          "title": title,
          "price": price,
          "link": link,
          "total_sold": total_sold
        })
    
    # exit on the specified page limit    
    if params['_pgn'] == limit:
        break

    # exit if there is no "next page" button on the page
    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))

Example output:

[
  {
    "title": "Puma Roma Basic + 36957140 Mens White Synthetic Lifestyle Sneakers Shoes",
    "price": "$38.99",
    "link": "https://www.ebay.com/itm/314208235054?hash=item4928452e2e:g:mbsAAOSw575jXGrE&amdata=enc%3AAQAIAAAAwAO9gR8l1ukqLQbl6q8FUyxsVU1Pi4R152gSnqa0l85z9miafGsriCNaYWXnxsZ53uOYze4K%2BjOy8YbcVkZFQfMYf3P%2F%2FfhUdLqwvBY9DM3xCOv%2FCtaIU0ZmkRV6cP4Qyt5svmXSctyo4wBZ0ZD9oSrs8cHYxwc5sXer1q4n40EDty7A%2FS%2FY3rnDe9Hsp0KQFcNI4wyRbVhDds9nFp8lYAM7uZJHE47cg43EwNmzagtl1m7kWWpqHMwaJptXQZQM7w%3D%3D%7Ctkp%3ABk9SR_a21vP8YQ",
    "total_sold": "127+ sold"
  },
  {
    "title": "ASICS Men's GT-2000 10   Running Shoes 1011B564",
    "price": "$75.16",
    "link": "https://www.ebay.com/itm/115765812196?hash=item1af42e1be4:g:tV4AAOSwEHNkNDuX&amdata=enc%3AAQAIAAAAwEYflTdBkix%2FAGgOONX%2BailUG5a6G5GA%2B2kae49rMTrE741MqY1i9%2F1EiP9INab3PMsCJURlLgcpr9xfDxmw4uiN%2BUJboXQtM3X3%2F7KU9D%2BmRwRtgFHz1Q5LKBBAX2bcyFpg1ULd0SQU9jDrN8KZGQVvrebKV21oYcFOiE3pAHnt98K7oeG%2FPVpKSuZwxOLp3TnpdEn%2B8YA5ONquJMrPSqBgVg8DYaWCunZ23iYMF8g%2Bfubu3zGGCz8akbOCfWextA%3D%3D%7Ctkp%3ABk9SR_a21vP8YQ",
    "total_sold": null
  },
  other results ...
]

You can also use SerpApi's Ebay Organic Results API. It is a paid API with a free plan that handles blocking and parsing on its backend.
Example code with pagination:

from serpapi import EbaySearch
import json

params = {
    "api_key": "...",                 # serpapi key, https://serpapi.com/manage-api-key   
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": "mens shoes",             # search query
    "_pgn": 1                         # page number
}

search = EbaySearch(params)           # where data extraction happens
limit = 5
page_num = 0
data = []

while True:
    results = search.get_dict()     # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break
    
    for organic_result in results.get("organic_results", []):
        title = organic_result.get("title")
        price = organic_result.get("price")
        link = organic_result.get("link")
        quantity_sold = organic_result.get("quantity_sold")

        data.append({
          "title" : title,
          "price" : price,
          "link" : link,
          "quantity_sold" : quantity_sold
        })
                    
    page_num += 1
    print(page_num)

    if params['_pgn'] == limit:
        break
    if "next" in results.get("pagination", {}):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))

Output:

[
  {
    "title": "Fashion Casual Men's Athletic Tennis Sneakers Outdoor Sports Running Shoes Gym",
    "price": {
      "raw": "$25.99",
      "extracted": 25.99
    },
    "link": "https://www.ebay.com/itm/393848203506?hash=item5bb32ea8f2:g:1W4AAOSwIUVhFNVq&amdata=enc%3AAQAIAAAA4E5DkZ7r%2BsJ163sukfwOV4mjqrJmXYlTK2Drcv85G3E9ZzR0NSuOhfDGUkPE2wKtjHjBDBCiSrihLlmo8JHidOWEJs73cUptRsgvrgDyrSTnP3c4tcLKgSPOE%2BY9brN8PGdIyP0Vnn%2BPTlvVDuiNENuk5v0qiIeZ7gYHwxHx944YiffpPmX4ZXW9dJ0KHosniVIj3gn0oVtNX%2FS4NTDC0TXuk8GzpZN1eCEOXuQKu3CLwIQhu4ngiMrbRF3nIca6YxE6UYAlC3fBigE8ncSMYSSmP8UB9QuemXO6vbZVxtH%2B%7Ctkp%3ABFBMyO-f9Pxh",
    "quantity_sold": "11+ sold"
  },
  other results ...
]

If you want to learn more about scraping websites, have a look at the 13 ways to scrape any public data from any website blog post.
