我有一个Python脚本,运行在Ubuntu上的一个EC2实例(t3.medium)上。该脚本是一个网页爬虫,使用Selenium模拟页面点击等操作。下面是基本脚本:
from bs4 import BeautifulSoup
import requests
import string
import json
import geocoder
import mapbox
import selenium
from selenium import webdriver
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import datetime
from datetime import datetime as dt
import re
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
# --- Headless Chrome setup ---------------------------------------------
# Flags tuned for running Chrome inside a small VM/container:
#   --no-sandbox / --disable-dev-shm-usage are needed where /dev/shm is
#   small (Chrome otherwise crashes tabs when it fills up);
#   --headless because there is no display attached to the instance.
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--window-size=1420,1080')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--remote-debugging-port=9222")
# Hide the automation banner/switches that some sites use for bot detection.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.binary_location = '/usr/bin/google-chrome-stable'
chrome_driver_binary = "/usr/bin/chromedriver"
# `options=` replaces the deprecated `chrome_options=` keyword (removed in
# Selenium 4); behavior is otherwise identical.
driver = webdriver.Chrome(executable_path=chrome_driver_binary, options=chrome_options)
# Set base url (SAN FRANCISCO)
base_url = 'https://www.bandsintown.com/?place_id=ChIJIQBpAG2ahYAR_6128GcTUEo&page='#san francisco
# base_url = 'https://www.bandsintown.com/?place_id=ChIJOwg_06VPwokRYv534QaPC8g&page='
events = []
eventContainerBucket = []
# Cycle through the listing pages and collect the href of every event card.
# (Indentation reconstructed — the paste had lost it.)
for i in range(1, 2):
    driver.get(base_url + str(i))
    # `find_elements(By.CSS_SELECTOR, ...)` replaces the legacy
    # `find_elements_by_css_selector`, which was removed in Selenium 4;
    # `By` is already imported at the top of the file.
    event_list = driver.find_elements(
        By.CSS_SELECTOR,
        'div[class^=_3buUBPWBhUz9KBQqgXm-gf] a[class^=_3UX9sLQPbNUbfbaigy35li]')
    # extend() accepts the generator directly — the list() wrapper and the
    # unused `pageURL` local from the original were dropped.
    events.extend(event.get_attribute("href") for event in event_list)
print("total events: ", len(events))
# iterate through all events and open them.
# NOTE(review): indentation was lost in this paste — the lines after the
# `for` are presumably the loop body, and the final `if` body was elided by
# the author ("Do a bunch of searching..."); confirm against the real script.
item = {}
allEvents = []
for event in events:
# Navigating here is where the reported WebDriverException is raised
# ("session deleted because of page crash ... from tab crashed").
driver.get(event)
currentUrl = driver.current_url
try:
# Re-fetches the same URL with requests just to check the HTTP status —
# note this downloads each page a second time on top of the Selenium load.
currentRequest = requests.get(currentUrl)
except requests.exceptions.RequestException as e:
# Best-effort: silently skip events whose status check fails.
continue
if currentRequest.status_code == 200:
#Do a bunch of searching for text within divs etc.....
通常,脚本可能会遍历几个URL,然后失败,并显示以下消息:
File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
self.execute(Command.GET, {'url': url})
File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: session deleted because of page crash
from unknown error: cannot determine loading status
from tab crashed
(Session info: headless chrome=91.0.4472.114)
当我在本地运行脚本时(即不是在EC2上运行),脚本工作正常,因此我判断Selenium崩溃是因为资源(内存/磁盘)问题。为了证实这一理论,我运行了 df -h,输出如下:
Filesystem Size Used Avail Use% Mounted on
udev 1.9G 0 1.9G 0% /dev
tmpfs 388M 760K 387M 1% /run
/dev/nvme0n1p1 7.7G 7.6G 96M 99% /
tmpfs 1.9G 0 1.9G 0% /dev/shm
tmpfs 5.0M 0 5.0M 0% /run/lock
tmpfs 1.9G 0 1.9G 0% /sys/fs/cgroup
/dev/loop1 34M 34M 0 100% /snap/amazon-ssm-agent/3552
/dev/loop2 100M 100M 0 100% /snap/core/11187
/dev/loop3 56M 56M 0 100% /snap/core18/2066
/dev/loop0 100M 100M 0 100% /snap/core/11316
/dev/loop4 29M 29M 0 100% /snap/amazon-ssm-agent/2012
/dev/loop5 56M 56M 0 100% /snap/core18/2074
tmpfs 388M 0 388M 0% /run/user/1000
我的 /dev/nvme0n1p1 使用率已达99%,看起来这可能是导致Selenium崩溃的原因。那我该怎么办?我听说清理日志文件会有帮助——是否有我应该使用的特定命令?另外,我是否应该考虑增加交换空间(swap)?任何具体的排查建议都将不胜感激,提前谢谢。
暂无答案!
目前还没有任何答案,快来回答吧!