使用异步解析(Python)时缺少数据

nxagd54h  于 2023-02-28  发布在  Python
关注(0)|答案(1)|浏览(148)

我正在尝试为一家本地商店的网站编写一个异步解析器,但结果不稳定。它应该得到约 11k 个条目,但有时只得到一个随机的数量,而且没有抛出任何异常。
问题可能是什么?我如何捕获/记录它?

# Root of the shop site; passed to get_products and used as the prefix for
# the category hrefs scraped from the menu.
URL = 'https://shop.samberi.com'

# Headers sent with every request made through the session.
HEADERS = {
    'Accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/101.0.4951.54 Safari/537.36'
}

# Module-level accumulator: parse_page appends one [name, price] pair per
# product found.
all_products = []

async def get_products(url):
    """Fetch the page at *url*, collect category links and parse them all.

    Nothing is returned; parse_page appends the products it finds to the
    module-level all_products list.
    """
    async with aiohttp.ClientSession() as session:
        # NOTE(review): the response status is never checked, so an error
        # page would be parsed exactly like a normal one.
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), 'lxml')
        # Category URLs: every "parent" link of the side menu with the
        # SHOWALL_1=1 query appended (presumably disables pagination --
        # TODO confirm), plus three hard-coded categories.
        cats = [URL + cat.get('href') + '?SHOWALL_1=1'
                for cat in bs.find('ul', id='vertical-multilevel-menu')
                .find_all('a', class_='parent')] + [
            # Workaround: could not get these links automatically :(
            'https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1'
        ]
        # One shielded parse_page call per category; no explicit connection
        # limit is configured on the session.
        # NOTE(review): max_s is not defined anywhere in the visible
        # snippet -- confirm where it is supposed to come from.
        tasks = [asyncio.shield(parse_page(session, url, max_s)) for url in cats]

        await asyncio.gather(*tasks)

async def parse_page(session, cat_url, max_s):
    """Download one category page and append its products to all_products.

    max_s is accepted but not used inside this function.
    """
    async with session.get(url=cat_url, headers=HEADERS) as res:
        # NOTE(review): the status code is not checked; a response without
        # any product-item blocks adds nothing and raises nothing.
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, 'lxml')
        products_on_page = pagebs.find_all('div', class_='product-item')
        for product in products_on_page:
            name = product.find('div', class_='product-item-title').text.strip()
            # Price text with the rouble sign and surrounding whitespace
            # stripped from both ends.
            price = product.find('span', class_='product-item-price-current')\
                .text.strip().strip('₽').strip()
            all_products.append([name, price])
def main():
    """Run get_products(URL) to completion with asyncio.run."""
    # NOTE(review): WindowsSelectorEventLoopPolicy is documented as
    # Windows-only, so this line presumably fails on other platforms.
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(get_products(URL))
vawmfj5a

vawmfj5a1#

您可能因为同时发出了太多请求而被服务器拒绝。请尝试检查 HTTP 响应(例如状态码),以进一步排查问题。
如果这确实是问题所在,您可以使用TCPConnector来限制同时请求的数量。此程序使用的限制值为8,始终返回10886个产品:

import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json

# Root of the shop site; category hrefs scraped from the menu are appended
# to it.
URL = "https://shop.samberi.com"

# Headers sent with every session.get call.
HEADERS = {
    "Accept": "*/*",
}

async def get_products(url: str, *, limit: int = 8) -> list:
    """Collect the products of every catalogue category of the shop.

    Downloads the page at *url*, reads the category links from its side
    menu, then fetches and parses all category pages concurrently over a
    connection pool capped at *limit* connections.

    Args:
        url: Page whose ``vertical-multilevel-menu`` lists the categories.
        limit: Maximum number of simultaneous connections, both in total
            and per host. Kept low so the server is not flooded with
            requests.

    Returns:
        A flat list with the items returned by ``parse_page`` for every
        category, in category order.

    Raises:
        aiohttp.ClientResponseError: If the menu page answers with an HTTP
            status of 400 or higher. Exceptions raised by ``parse_page``
            propagate unchanged.
    """
    connector = aiohttp.TCPConnector(limit=limit, limit_per_host=limit)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url=url, headers=HEADERS) as res:
            # Fail loudly instead of trying to parse an error page.
            res.raise_for_status()
            menu_html = await res.text()

        bs = BeautifulSoup(menu_html, "lxml")
        menu = bs.find("ul", id="vertical-multilevel-menu")
        cats = [
            URL + cat.get("href") + "?SHOWALL_1=1"
            for cat in menu.find_all("a", class_="parent")
        ] + [
            # Workaround: these links could not be collected automatically.
            "https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1",
        ]

        # A distinct loop name keeps the ``url`` parameter readable.
        tasks = [parse_page(session, cat_url) for cat_url in cats]
        print(f"Fetching {len(tasks)} pages")

        # gather preserves the order of ``cats`` in its result list.
        results = await asyncio.gather(*tasks)

        return [product for products in results for product in products]

async def parse_page(session: aiohttp.ClientSession, cat_url: str) -> list:
    """Download one category page and extract its products.

    Args:
        session: Open client session used to issue the request.
        cat_url: URL of the category page to parse.

    Returns:
        A list of ``[name, price]`` pairs, one per ``product-item`` block on
        the page. The price text has the rouble sign and surrounding
        whitespace stripped.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            status of 400 or higher.
    """
    async with session.get(url=cat_url, headers=HEADERS) as res:
        # An error response (for example when the server rejects the
        # request) contains no product blocks; without this check it would
        # silently produce an empty list and the total would come up short.
        res.raise_for_status()
        res_text = await res.text()

    # The body is fully read, so parsing happens after the connection has
    # been handed back to the pool.
    pagebs = BeautifulSoup(res_text, "lxml")
    products_on_page = pagebs.find_all("div", class_="product-item")

    print(f"Fething {len(products_on_page)} products")
    return [
        [
            product.find("div", class_="product-item-title").text.strip(),
            product.find("span", class_="product-item-price-current")
            .text.strip()
            .strip("₽")
            .strip(),
        ]
        for product in products_on_page
    ]

async def main():
    """Scrape the shop, print the product count and save the result.

    The scraped list is stored in ``products.json`` under the
    ``"products"`` key.
    """
    scraped = await get_products(URL)
    print(len(scraped))

    with open("products.json", "w") as out_file:
        out_file.write(json.dumps({"products": scraped}))

# Script entry point: run the scraper only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())

相关问题