import re
import time
import random
import logging
from concurrent.futures import ThreadPoolExecutor
from undetected_chromedriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import threading
# Configure logging
logging.basicConfig(filename='scraping.log', level=logging.INFO)
# Enable optimizations for faster page load
options = ChromeOptions()
options.page_load_strategy = 'eager'
options.add_argument('--enable-network-prediction')
options.add_argument('--disable-gpu')
options.add_argument('--enable-features=VaapiVideoDecoder')
options.add_argument('--disable-software-rasterizer')
options.add_argument('--disk-cache-size=33554432')
# Disable images and unnecessary content using Chrome options
options.add_argument("--disable-popup-blocking")
options.add_argument("--blink-settings=imagesEnabled=false")
options.add_argument("--disable-notifications")
options.add_argument("--disable-geolocation")
options.add_argument("--disable-plugins")
options.add_argument("--disable-popup-blocking")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--disable-infobars")
options.add_argument("--disable-web-security")
options.add_argument("--disable-logging")
options.add_argument("--mute-audio")
options.add_argument("--disable-stylesheet")
# Install AdBlock extension from Chrome Web Store using the extension ID
extension_id = "gighmmpiobklfepjocnamgkkbiglidom"
options.add_encoded_extension(extension_id)
# Create a cache folder for fast loading
cache_folder = os.path.join(os.getcwd(), "cache")
os.makedirs(cache_folder, exist_ok=True)
options.add_argument(f"--disk-cache-dir={cache_folder}")
# Initialize the Chrome WebDriver with error handling
driver = None
try:
driver = Chrome(options=options)
except WebDriverException as e:
logging.error(f"Error initializing Chrome WebDriver: {str(e)}")
raise
# Read URLs from name.txt
with open("name.txt", "r", encoding="utf-8") as file:
urls = [line.strip() for line in file if line.strip()]
# Set to keep track of processed URLs
processed_urls = set()
lock = threading.Lock()
def scrape_url(url):
with lock:
if url in processed_urls:
return False
processed_urls.add(url)
try:
driver.execute_cdp_cmd("Target.createTarget", {"url": url})
driver.switch_to.window(driver.window_handles[-1])
# Wait for the page to load
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.post-title')))
# Find manga name using BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
manga_name_element = soup.select_one('div.post-title > h1')
if manga_name_element:
manga_name = manga_name_element.get_text(strip=True)
# Find image link using BeautifulSoup
image_link_element = soup.select_one('div.summary_image > a > img')
if image_link_element:
image_link = image_link_element['src']
if manga_name and image_link:
# Save the extracted information to a text file
filename = f"{manga_name.replace(':', '_')}.txt"
with open(filename, 'w', encoding='utf-8') as output_file:
output_file.write(f"Web link: {driver.current_url}\n")
output_file.write(f"Image link: {image_link}\n")
output_file.write(f"Manga Name: {manga_name}\n")
# Close the current tab
driver.execute_script("window.close()")
# Switch back to the original tab
driver.switch_to.window(driver.window_handles[0])
return True
except (WebDriverException, NoSuchElementException) as e:
logging.error(f"Error processing URL: {url}\nError: {str(e)}")
return False
# Scrape URLs using multiple threads
with ThreadPoolExecutor() as executor:
results = executor.map(scrape_url, urls)
# Close the driver
if driver:
driver.quit()
字符串
错误代码:
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
ERROR:root:Error processing URL: https://aquamanga.com/read/seoul-exorcism-department/
Error: Message: no such window: target window already closed
from unknown error: web view not found
(Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
GetHandleVerifier [0x00E8A813+48355]
(No symbol) [0x00E1C4B1]
(No symbol) [0x00D25358]
(No symbol) [0x00D0D293]
(No symbol) [0x00D6E37B]
(No symbol) [0x00D7C473]
(No symbol) [0x00D6A536]
(No symbol) [0x00D482DC]
(No symbol) [0x00D493DD]
GetHandleVerifier [0x010EAABD+2539405]
GetHandleVerifier [0x0112A78F+2800735]
GetHandleVerifier [0x0112456C+2775612]
GetHandleVerifier [0x00F151E0+616112]
(No symbol) [0x00E25F8C]
(No symbol) [0x00E22328]
(No symbol) [0x00E2240B]
(No symbol) [0x00E14FF7]
BaseThreadInitThunk [0x761A00C9+25]
RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
ERROR:root:Error processing URL: https://aquamanga.com/read/i-level-up-by-absorbing-everything/
Error: Message: no such window: target window already closed
from unknown error: web view not found
(Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
GetHandleVerifier [0x00E8A813+48355]
(No symbol) [0x00E1C4B1]
(No symbol) [0x00D25358]
(No symbol) [0x00D0D293]
(No symbol) [0x00D6E37B]
(No symbol) [0x00D7C473]
(No symbol) [0x00D6A536]
(No symbol) [0x00D482DC]
(No symbol) [0x00D493DD]
GetHandleVerifier [0x010EAABD+2539405]
GetHandleVerifier [0x0112A78F+2800735]
GetHandleVerifier [0x0112456C+2775612]
GetHandleVerifier [0x00F151E0+616112]
(No symbol) [0x00E25F8C]
(No symbol) [0x00E22328]
(No symbol) [0x00E2240B]
(No symbol) [0x00E14FF7]
BaseThreadInitThunk [0x761A00C9+25]
RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
ERROR:root:Error processing URL: https://aquamanga.com/read/overpowered-healer/
Error: Message: no such window: target window already closed
from unknown error: web view not found
(Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
GetHandleVerifier [0x00E8A813+48355]
(No symbol) [0x00E1C4B1]
(No symbol) [0x00D25358]
(No symbol) [0x00D0D293]
(No symbol) [0x00D6E37B]
(No symbol) [0x00D7C473]
(No symbol) [0x00D6A536]
(No symbol) [0x00D482DC]
(No symbol) [0x00D493DD]
GetHandleVerifier [0x010EAABD+2539405]
GetHandleVerifier [0x0112A78F+2800735]
GetHandleVerifier [0x0112456C+2775612]
GetHandleVerifier [0x00F151E0+616112]
(No symbol) [0x00E25F8C]
(No symbol) [0x00E22328]
(No symbol) [0x00E2240B]
(No symbol) [0x00E14FF7]
BaseThreadInitThunk [0x761A00C9+25]
RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]
ERROR:root:Error processing URL: https://aquamanga.com/read/heir-of-mythical-heroes/
Error: Message: no such window: target window already closed
from unknown error: web view not found
(Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
GetHandleVerifier [0x00E8A813+48355]
(No symbol) [0x00E1C4B1]
(No symbol) [0x00D25358]
(No symbol) [0x00D0D293]
(No symbol) [0x00D6E37B]
(No symbol) [0x00D7C473]
(No symbol) [0x00D6A536]
(No symbol) [0x00D482DC]
(No symbol) [0x00D493DD]
GetHandleVerifier [0x010EAABD+2539405]
GetHandleVerifier [0x0112A78F+2800735]
GetHandleVerifier [0x0112456C+2775612]
GetHandleVerifier [0x00F151E0+616112]
(No symbol) [0x00E25F8C]
(No symbol) [0x00E22328]
(No symbol) [0x00E2240B]
(No symbol) [0x00E14FF7]
BaseThreadInitThunk [0x761A00C9+25]
RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]
ERROR:root:Error processing URL: https://aquamanga.com/read/demonic-master-of-mount-kunlun/
Error: Message: no such window: target window already closed
from unknown error: web view not found
(Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
GetHandleVerifier [0x00E8A813+48355]
(No symbol) [0x00E1C4B1]
(No symbol) [0x00D25358]
(No symbol) [0x00D0D293]
(No symbol) [0x00D6E37B]
(No symbol) [0x00D7C473]
(No symbol) [0x00D6A536]
(No symbol) [0x00D482DC]
(No symbol) [0x00D493DD]
GetHandleVerifier [0x010EAABD+2539405]
GetHandleVerifier [0x0112A78F+2800735]
GetHandleVerifier [0x0112456C+2775612]
GetHandleVerifier [0x00F151E0+616112]
(No symbol) [0x00E25F8C]
(No symbol) [0x00E22328]
(No symbol) [0x00E2240B]
(No symbol) [0x00E14FF7]
BaseThreadInitThunk [0x761A00C9+25]
RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]
ERROR:root:Error processing URL: https://aquamanga.com/read/apocalypse-hero/
Error: Message: no such window: target window already closed
from unknown error: web view not found
(Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
GetHandleVerifier [0x00E8A813+48355]
(No symbol) [0x00E1C4B1]
(No symbol) [0x00D25358]
(No symbol) [0x00D0D293]
(No symbol) [0x00D6E37B]
(No symbol) [0x00D7C473]
(No symbol) [0x00D6A536]
(No symbol) [0x00D482DC]
(No symbol) [0x00D493DD]
GetHandleVerifier [0x010EAABD+2539405]
GetHandleVerifier [0x0112A78F+2800735]
GetHandleVerifier [0x0112456C+2775612]
GetHandleVerifier [0x00F151E0+616112]
(No symbol) [0x00E25F8C]
(No symbol) [0x00E22328]
(No symbol) [0x00E2240B]
(No symbol) [0x00E14FF7]
BaseThreadInitThunk [0x761A00C9+25]
RtlGetAppContainerNamedObjectPath [0x777D7B4E+286]
RtlGetAppContainerNamedObjectPath [0x777D7B1E+238]
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: localhost. Connection pool size: 1
型
此Python代码用于scrap https://aquamanga.com/
浏览器中完全加载的网页(从name.txt中提取的网页的url
name.txt每一行都有一个url)
错误:只有第一个url输出txt我得到了其余的url我没有得到
所有网址完全加载后,网页抓取开始
我想要:
1.定位误差
1.不等待所有url加载
1.只要所需的信息尽快获得(有cloudflare保护)
可选:
广告块扩展尝试安装在Chrome浏览器https://chrome.google.com/webstore/detail/adblock-%E2%80%94-best-ad-blocker/gighmmpiobklfepjocnamgkkbiglidom
通过联机和脱机将文件放在同一目录中但不成功
name.txt:
https://aquamanga.com/read/heir-of-mythical-heroes/
https://aquamanga.com/read/apocalypse-hero/
https://aquamanga.com/read/demonic-master-of-mount-kunlun/
https://aquamanga.com/read/overpowered-healer/
https://aquamanga.com/read/seoul-exorcism-department/
https://aquamanga.com/read/aura-recovery-i-get-a-skill-everyday/
https://aquamanga.com/read/i-level-up-by-absorbing-everything/
型
1条答案
按热度按时间e7arh2l61#
你可以使用SeleniumBase的UC模式来使用未检测的chromedriver,它具有最佳设置和内置的广告拦截功能。我还做了其他优化来简化代码。
首先使用
pip install seleniumbase
,然后使用python
运行以下脚本:字符串
它解决了你遇到的问题。不需要为每个URL打开单独的选项卡。
部分输出:(首先列出所有的URL。然后在转到每个URL之后,它打印出每个URL的URL、图像URL和名称。)
型
如果你想把数据输出到一个文件中,请添加你的文件I/O代码。