在python中通过undetected_chromedriver在web抓取中获取输出txt文件的错误

vxf3dgd4  于 2023-08-01  发布在  Go
关注(0)|答案(1)|浏览(143)
import re
import time
import random
import logging
from concurrent.futures import ThreadPoolExecutor
from undetected_chromedriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import threading

# Configure logging
logging.basicConfig(filename='scraping.log', level=logging.INFO)

# Enable optimizations for faster page load
options = ChromeOptions()
options.page_load_strategy = 'eager'
options.add_argument('--enable-network-prediction')
options.add_argument('--disable-gpu')
options.add_argument('--enable-features=VaapiVideoDecoder')
options.add_argument('--disable-software-rasterizer')
options.add_argument('--disk-cache-size=33554432')

# Disable images and unnecessary content using Chrome options
options.add_argument("--disable-popup-blocking")
options.add_argument("--blink-settings=imagesEnabled=false")
options.add_argument("--disable-notifications")
options.add_argument("--disable-geolocation")
options.add_argument("--disable-plugins")
options.add_argument("--disable-popup-blocking")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--disable-infobars")
options.add_argument("--disable-web-security")
options.add_argument("--disable-logging")
options.add_argument("--mute-audio")
options.add_argument("--disable-stylesheet")

# Install AdBlock extension from Chrome Web Store using the extension ID
extension_id = "gighmmpiobklfepjocnamgkkbiglidom"
options.add_encoded_extension(extension_id)

# Create a cache folder for fast loading
cache_folder = os.path.join(os.getcwd(), "cache")
os.makedirs(cache_folder, exist_ok=True)
options.add_argument(f"--disk-cache-dir={cache_folder}")

# Initialize the Chrome WebDriver with error handling
driver = None
try:
    driver = Chrome(options=options)
except WebDriverException as e:
    logging.error(f"Error initializing Chrome WebDriver: {str(e)}")
    raise

# Read URLs from name.txt
with open("name.txt", "r", encoding="utf-8") as file:
    urls = [line.strip() for line in file if line.strip()]

# Set to keep track of processed URLs
processed_urls = set()
lock = threading.Lock()

def scrape_url(url):
    with lock:
        if url in processed_urls:
            return False
        processed_urls.add(url)

    try:
        driver.execute_cdp_cmd("Target.createTarget", {"url": url})
        driver.switch_to.window(driver.window_handles[-1])

        # Wait for the page to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.post-title')))
        
        # Find manga name using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        manga_name_element = soup.select_one('div.post-title > h1')
        if manga_name_element:
            manga_name = manga_name_element.get_text(strip=True)

        # Find image link using BeautifulSoup
        image_link_element = soup.select_one('div.summary_image > a > img')
        if image_link_element:
            image_link = image_link_element['src']

        if manga_name and image_link:
            # Save the extracted information to a text file
            filename = f"{manga_name.replace(':', '_')}.txt"
            with open(filename, 'w', encoding='utf-8') as output_file:
                output_file.write(f"Web link: {driver.current_url}\n")
                output_file.write(f"Image link: {image_link}\n")
                output_file.write(f"Manga Name: {manga_name}\n")

        # Close the current tab
        driver.execute_script("window.close()")

        # Switch back to the original tab
        driver.switch_to.window(driver.window_handles[0])

        return True

    except (WebDriverException, NoSuchElementException) as e:
        logging.error(f"Error processing URL: {url}\nError: {str(e)}")
        return False

# Scrape URLs using multiple threads
with ThreadPoolExecutor() as executor:
    results = executor.map(scrape_url, urls)

# Close the driver
if driver:
    driver.quit()

字符串
错误代码:

WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
ERROR:root:Error processing URL: https://aquamanga.com/read/seoul-exorcism-department/
Error: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
    GetHandleVerifier [0x00E8A813+48355]
    (No symbol) [0x00E1C4B1]
    (No symbol) [0x00D25358]
    (No symbol) [0x00D0D293]
    (No symbol) [0x00D6E37B]
    (No symbol) [0x00D7C473]
    (No symbol) [0x00D6A536]
    (No symbol) [0x00D482DC]
    (No symbol) [0x00D493DD]
    GetHandleVerifier [0x010EAABD+2539405]
    GetHandleVerifier [0x0112A78F+2800735]
    GetHandleVerifier [0x0112456C+2775612]
    GetHandleVerifier [0x00F151E0+616112]
    (No symbol) [0x00E25F8C]
    (No symbol) [0x00E22328]
    (No symbol) [0x00E2240B]
    (No symbol) [0x00E14FF7]
    BaseThreadInitThunk [0x761A00C9+25]
    RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
    RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]

WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
ERROR:root:Error processing URL: https://aquamanga.com/read/i-level-up-by-absorbing-everything/
Error: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
    GetHandleVerifier [0x00E8A813+48355]
    (No symbol) [0x00E1C4B1]
    (No symbol) [0x00D25358]
    (No symbol) [0x00D0D293]
    (No symbol) [0x00D6E37B]
    (No symbol) [0x00D7C473]
    (No symbol) [0x00D6A536]
    (No symbol) [0x00D482DC]
    (No symbol) [0x00D493DD]
    GetHandleVerifier [0x010EAABD+2539405]
    GetHandleVerifier [0x0112A78F+2800735]
    GetHandleVerifier [0x0112456C+2775612]
    GetHandleVerifier [0x00F151E0+616112]
    (No symbol) [0x00E25F8C]
    (No symbol) [0x00E22328]
    (No symbol) [0x00E2240B]
    (No symbol) [0x00E14FF7]
    BaseThreadInitThunk [0x761A00C9+25]
    RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
    RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]

WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
ERROR:root:Error processing URL: https://aquamanga.com/read/overpowered-healer/
Error: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
    GetHandleVerifier [0x00E8A813+48355]
    (No symbol) [0x00E1C4B1]
    (No symbol) [0x00D25358]
    (No symbol) [0x00D0D293]
    (No symbol) [0x00D6E37B]
    (No symbol) [0x00D7C473]
    (No symbol) [0x00D6A536]
    (No symbol) [0x00D482DC]
    (No symbol) [0x00D493DD]
    GetHandleVerifier [0x010EAABD+2539405]
    GetHandleVerifier [0x0112A78F+2800735]
    GetHandleVerifier [0x0112456C+2775612]
    GetHandleVerifier [0x00F151E0+616112]
    (No symbol) [0x00E25F8C]
    (No symbol) [0x00E22328]
    (No symbol) [0x00E2240B]
    (No symbol) [0x00E14FF7]
    BaseThreadInitThunk [0x761A00C9+25]
    RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
    RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]

ERROR:root:Error processing URL: https://aquamanga.com/read/heir-of-mythical-heroes/
Error: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
    GetHandleVerifier [0x00E8A813+48355]
    (No symbol) [0x00E1C4B1]
    (No symbol) [0x00D25358]
    (No symbol) [0x00D0D293]
    (No symbol) [0x00D6E37B]
    (No symbol) [0x00D7C473]
    (No symbol) [0x00D6A536]
    (No symbol) [0x00D482DC]
    (No symbol) [0x00D493DD]
    GetHandleVerifier [0x010EAABD+2539405]
    GetHandleVerifier [0x0112A78F+2800735]
    GetHandleVerifier [0x0112456C+2775612]
    GetHandleVerifier [0x00F151E0+616112]
    (No symbol) [0x00E25F8C]
    (No symbol) [0x00E22328]
    (No symbol) [0x00E2240B]
    (No symbol) [0x00E14FF7]
    BaseThreadInitThunk [0x761A00C9+25]
    RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
    RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]

ERROR:root:Error processing URL: https://aquamanga.com/read/demonic-master-of-mount-kunlun/
Error: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
    GetHandleVerifier [0x00E8A813+48355]
    (No symbol) [0x00E1C4B1]
    (No symbol) [0x00D25358]
    (No symbol) [0x00D0D293]
    (No symbol) [0x00D6E37B]
    (No symbol) [0x00D7C473]
    (No symbol) [0x00D6A536]
    (No symbol) [0x00D482DC]
    (No symbol) [0x00D493DD]
    GetHandleVerifier [0x010EAABD+2539405]
    GetHandleVerifier [0x0112A78F+2800735]
    GetHandleVerifier [0x0112456C+2775612]
    GetHandleVerifier [0x00F151E0+616112]
    (No symbol) [0x00E25F8C]
    (No symbol) [0x00E22328]
    (No symbol) [0x00E2240B]
    (No symbol) [0x00E14FF7]
    BaseThreadInitThunk [0x761A00C9+25]
    RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
    RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]

ERROR:root:Error processing URL: https://aquamanga.com/read/apocalypse-hero/
Error: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
    GetHandleVerifier [0x00E8A813+48355]
    (No symbol) [0x00E1C4B1]
    (No symbol) [0x00D25358]
    (No symbol) [0x00D0D293]
    (No symbol) [0x00D6E37B]
    (No symbol) [0x00D7C473]
    (No symbol) [0x00D6A536]
    (No symbol) [0x00D482DC]
    (No symbol) [0x00D493DD]
    GetHandleVerifier [0x010EAABD+2539405]
    GetHandleVerifier [0x0112A78F+2800735]
    GetHandleVerifier [0x0112456C+2775612]
    GetHandleVerifier [0x00F151E0+616112]
    (No symbol) [0x00E25F8C]
    (No symbol) [0x00E22328]
    (No symbol) [0x00E2240B]
    (No symbol) [0x00E14FF7]
    BaseThreadInitThunk [0x761A00C9+25]
    RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
    RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]

WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1


此Python代码用于scrap https://aquamanga.com/
浏览器中完全加载的网页(从name.txt中提取的网页的url
name.txt每一行都有一个url)
错误:只有第一个url输出txt我得到了其余的url我没有得到
所有网址完全加载后,网页抓取开始
我想要:
1.定位误差
1.不等待所有url加载
1.只要所需的信息尽快获得(有cloudflare保护)
可选:
广告块扩展尝试安装在Chrome浏览器https://chrome.google.com/webstore/detail/adblock-%E2%80%94-best-ad-blocker/gighmmpiobklfepjocnamgkkbiglidom
通过联机和脱机将文件放在同一目录中但不成功
name.txt:

https://aquamanga.com/read/heir-of-mythical-heroes/
https://aquamanga.com/read/apocalypse-hero/
https://aquamanga.com/read/demonic-master-of-mount-kunlun/
https://aquamanga.com/read/overpowered-healer/
https://aquamanga.com/read/seoul-exorcism-department/
https://aquamanga.com/read/aura-recovery-i-get-a-skill-everyday/
https://aquamanga.com/read/i-level-up-by-absorbing-everything/

e7arh2l6

e7arh2l61#

你可以使用SeleniumBase的UC模式来使用未检测的chromedriver,它具有最佳设置和内置的广告拦截功能。我还做了其他优化来简化代码。
首先使用pip install seleniumbase,然后使用python运行以下脚本:

import time
from seleniumbase import DriverContext

with DriverContext(uc=True, ad_block_on=True) as driver:
    driver.get("https://aquamanga.com/")
    time.sleep(1)
    items = driver.find_elements("css selector", "h3 a")
    hrefs = []
    for item in items:
        href = item.get_attribute("href")
        if "aquamanga.com/read" in href:
            hrefs.append(href)
            print(item.get_attribute("href"))

    print()
    for href in hrefs:
        driver.get(href)
        time.sleep(1)
        img = driver.find_element("css selector", ".summary_image a img")
        image_url = img.get_attribute("href")
        name = img.get_attribute("alt")
        print(
            "Web link: %s\nImage link: %s\nManga Name: %s\n"
            % (href, image_url, name)
        )

字符串
它解决了你遇到的问题。不需要为每个URL打开单独的选项卡。
部分输出:(首先列出所有的URL。然后在转到每个URL之后,它打印出每个URL的URL、图像URL和名称。)

https://aquamanga.com/read/heavenly-grand-archive-s-young-master/
https://aquamanga.com/read/awakening-10000-years-into-the-future/
https://aquamanga.com/read/the-chest/
...

Web link: https://aquamanga.com/read/heavenly-grand-archive-s-young-master/
Image link: None
Manga Name: Heavenly_Grand_Archives_Young_Master

Web link: https://aquamanga.com/read/awakening-10000-years-into-the-future/
Image link: None
Manga Name: Awakening 10000 Years Into The Future

Web link: https://aquamanga.com/read/the-chest/
Image link: None
Manga Name: The_Chest
...


如果你想把数据输出到一个文件中,请添加你的文件I/O代码。

相关问题