Python Web scraping BeautifulSoup

but5z9lq  于 2023-04-13  发布在  Python
关注(0)|答案(1)|浏览(146)

我的代码看起来像是陷入了无限循环,其实它并不是无限的,只是把同一个列表打印了50次甚至更多次。这段代码昨天还运行正常,列表只打印了一次,但现在同样的代码却突然开始反复打印。下面是代码:

import requests
from bs4 import BeautifulSoup


# Scrape the directory listing page, then visit every linked company page and
# collect [company_name, street, city, state, zip_code, phone] rows.
url = 'https://business.narimn.org/list/searchalpha/a'
response = requests.get(url, timeout=50)
soup = BeautifulSoup(response.content, 'html.parser')

cls = soup.find_all('div', class_='gz-list-card-wrapper')
print(len(cls))
modeling_company = []


def _span_text(parent, **criteria):
    """Return the text of the first matching <span> under *parent*, or ''.

    Both a missing parent and a missing span yield '' so that one absent
    field does not discard the whole company record.
    """
    span = parent.find('span', **criteria) if parent is not None else None
    return span.get_text() if span is not None else ''


for cl in cls:
    a_tag = cl.find('a')
    link = a_tag.get('href') if a_tag is not None else None
    if not link:
        # Card without a usable link: nothing to fetch.
        continue
    try:
        # Separate names keep the listing-page `response`/`soup` intact.
        detail_response = requests.get(link, timeout=50)
        detail_response.raise_for_status()
        detail = BeautifulSoup(detail_response.content, 'html.parser')
        company_name = detail.find('h1', class_='gz-pagetitle').get_text()
        address = detail.find('li', class_='list-group-item gz-card-address')
        street_ad = _span_text(address, class_='gz-street-address')
        city_ad = _span_text(address, class_='gz-address-city')
        state_ad = _span_text(address, itemprop='addressRegion')
        zip_code = _span_text(address, itemprop='postalCode')
        p_n = detail.find('li', class_='list-group-item gz-card-phone')
        phone = _span_text(p_n, itemprop='telephone')
        modeling_company.append(
            [company_name, street_ad, city_ad, state_ad, zip_code, phone]
        )
    except Exception as e:
        # Best-effort scrape: report the failing page and move on.
        print(f'Error occurred: {e}')

# Print once, after the loop. Printing inside the loop re-printed the whole
# (growing) list for every card, which looked like an endless loop.
print(modeling_company)

我尝试只使用一个for循环,并把结果追加到modeling_company变量中,但它仍然一直运行下去。

eyh26e7m

eyh26e7m1#

每当你有一个URL列表要处理时,实现多线程以获得最佳性能通常是一个好主意。例如:

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from time import perf_counter

# Listing page fetched by main(); the links found in it are handed to
# process_url.
url = 'https://business.narimn.org/list/searchalpha/a'
# Shared result list: process_url appends one
# [company_name, street_ad, city_ad, state_ad, zip_code, phone] row per page.
modeling_company = []

def process_url(url, timeout=50):
    """Scrape one company page and append its row to ``modeling_company``.

    The appended row is
    ``[company_name, street_ad, city_ad, state_ad, zip_code, phone]``.
    Address and phone fields that are absent from the page become ``''``.

    Args:
        url: Address of the company detail page.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block a worker thread forever.

    Any failure (network error, HTTP error status, missing page title) is
    printed and the page is skipped; nothing is raised to the caller.
    """

    def span_text(parent, **criteria):
        # '' when either the container or the span itself is missing.
        span = parent.find('span', **criteria) if parent is not None else None
        return span.get_text() if span is not None else ''

    try:
        # The request lives inside the try so connection errors and timeouts
        # are reported here instead of being lost inside the thread pool.
        with requests.get(url, timeout=timeout) as response:
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
        company_name = soup.find('h1', class_='gz-pagetitle').get_text()
        address = soup.find('li', class_='list-group-item gz-card-address')
        street_ad = span_text(address, class_='gz-street-address')
        city_ad = span_text(address, class_='gz-address-city')
        state_ad = span_text(address, itemprop='addressRegion')
        zip_code = span_text(address, itemprop='postalCode')
        p_n = soup.find('li', class_='list-group-item gz-card-phone')
        phone = span_text(p_n, itemprop='telephone')
        modeling_company.append(
            [company_name, street_ad, city_ad, state_ad, zip_code, phone]
        )
    except Exception as e:
        print(e)

def get_urls(soup):
    """Yield the link target of the first anchor in each listing card.

    Args:
        soup: Parsed listing page; must provide ``find_all``.

    Yields:
        The ``href`` value of the first ``<a>`` inside every
        ``div.gz-list-card-wrapper``. Cards with no anchor, or whose anchor
        has a missing or empty ``href``, are skipped.
    """
    for cl in soup.find_all('div', class_='gz-list-card-wrapper'):
        a_tag = cl.find('a')
        if not a_tag:
            continue
        # Explicit lookup instead of a blanket try/except around the yield.
        if (href := a_tag.get('href')):
            yield href

def main(timeout=50):
    """Fetch the listing page and scrape every linked page concurrently.

    Args:
        timeout: Seconds passed to ``requests.get`` for the listing request.

    Raises:
        requests.HTTPError: If the listing page returns an error status
            (via ``raise_for_status``).
    """
    # Parse first, so the listing response is closed before the pool starts.
    with requests.get(url, timeout=timeout) as response:
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_url, link) for link in get_urls(soup)]

    # The executor has shut down, so every future is finished. Report worker
    # exceptions, which would otherwise be dropped silently.
    for future in futures:
        if (error := future.exception()) is not None:
            print(error)

if __name__ == '__main__':
    # Time the whole scrape, then print the collected rows followed by the
    # elapsed wall-clock duration.
    _start = perf_counter()
    main()
    _end = perf_counter()
    print(modeling_company, f'Duration={_end-_start:.2f}s', sep='\n')

这在我的机器上运行不到3秒

相关问题