在Databricks中使用Selenium(chrome not reachable,无法连接到Chrome)

jqjz2hbq  于 2023-06-27  发布在  Go
关注(0)|答案(1)|浏览(115)

我无法在Databricks中运行Selenium。我已经按照其他几个帖子中给出的步骤操作过:
https://forums.databricks.com/questions/15480/how-to-add-webdriver-for-selenium-in-databricks.html
How to use Selenium in Databricks and accessing and moving downloaded files to mounted storage
cannot get selenium webdriver to work in azure databricks
我的代码目前看起来像这样:

%sh
# Register the Chromium builds PPA, refresh the package index, then install chromium-browser.
sudo add-apt-repository ppa:canonical-chromium-builds/stage
# `yes` is piped in so that any confirmation prompt is answered automatically.
/usr/bin/yes | sudo apt update
/usr/bin/yes | sudo apt install chromium-browser
import os 
from webdrivermanager import ChromeDriverManager
from selenium import webdriver

# Download chromedriver through webdrivermanager; element [0] of the result is
# used below as the driver executable path.
cdd = ChromeDriverManager().download_and_install()

# Command-line switches handed to the browser at launch.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--remote-debugging-port=9009')

# This is the call that raises the WebDriverException shown in the traceback below.
driver = webdriver.Chrome(executable_path=cdd[0], options=chrome_options)

# Test driver connection
driver.get("https://www.google.com")
driver.find_element_by_css_selector("img").get_attribute("alt")

例外情况:

---------------------------------------------------------------------------
WebDriverException                        Traceback (most recent call last)
<command-2232618947863762> in <module>
     12 chrome_driver = "/usr/bin/chromedriver"
     13 
---> 14 driver = webdriver.Chrome(executable_path=cdd[0], options=chrome_options)
     15 
     16 # Test driver connection

/databricks/python/lib/python3.8/site-packages/selenium/webdriver/chrome/webdriver.py in __init__(self, executable_path, port, options, service_args, desired_capabilities, service_log_path, chrome_options, keep_alive)
     74 
     75         try:
---> 76             RemoteWebDriver.__init__(
     77                 self,
     78                 command_executor=ChromeRemoteConnection(

/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py in __init__(self, command_executor, desired_capabilities, browser_profile, proxy, keep_alive, file_detector, options)
    155             warnings.warn("Please use FirefoxOptions to set browser profile",
    156                           DeprecationWarning, stacklevel=2)
--> 157         self.start_session(capabilities, browser_profile)
    158         self._switch_to = SwitchTo(self)
    159         self._mobile = Mobile(self)

/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py in start_session(self, capabilities, browser_profile)
    250         parameters = {"capabilities": w3c_caps,
    251                       "desiredCapabilities": capabilities}
--> 252         response = self.execute(Command.NEW_SESSION, parameters)
    253         if 'sessionId' not in response:
    254             response = response['value']

/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
    319         response = self.command_executor.execute(driver_command, params)
    320         if response:
--> 321             self.error_handler.check_response(response)
    322             response['value'] = self._unwrap_value(
    323                 response.get('value', None))

/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
    240                 alert_text = value['alert'].get('text')
    241             raise exception_class(message, screen, stacktrace, alert_text)
--> 242         raise exception_class(message, screen, stacktrace)
    243 
    244     def _value_or_default(self, obj, key, default):

WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
  (chrome not reachable)
  (The process started from chrome location /usr/bin/chromium-browser is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
lbsnaicq

lbsnaicq1#

对我有效的是下面这个方法,它来自你所链接的问题中kindofhungry提供的来源。
我无法在Databricks中成功使用Web驱动程序管理器。

%sh
# Install the Selenium Python package with pip.
pip install selenium
# imports needed for notebook
from datetime import datetime
import dateutil.relativedelta
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait

下载并解压缩Chrome驱动程序并确保其为最新版本

%sh
# Fetch the ChromeDriver build advertised by the LATEST_RELEASE endpoint and
# unpack it into /tmp/chromedriver/.
# NOTE(review): this endpoint reportedly stopped being updated after ChromeDriver 114;
# newer drivers are published through the "Chrome for Testing" endpoints. Confirm that
# the downloaded driver version matches the Chrome version installed later.
version=$(curl -sS https://chromedriver.storage.googleapis.com/LATEST_RELEASE)

# -O always rewrites the target file, so the timestamping flag -N (which has no
# effect in combination with -O) is dropped.
wget "https://chromedriver.storage.googleapis.com/${version}/chromedriver_linux64.zip" -O /tmp/chromedriver_linux64.zip

# -o overwrites a previously extracted driver, so the cell can be re-run
# without unzip stopping at an interactive "replace?" prompt.
unzip -o /tmp/chromedriver_linux64.zip -d /tmp/chromedriver/

我还需要先获取Ubuntu的软件包更新,请参阅这篇帖子

%sh
# Remove the cached apt package lists and archives, then rebuild the package index.
sudo rm -r /var/lib/apt/lists/* 
sudo apt clean && 
   sudo apt update --fix-missing -y

添加chrome和必要的软件包

%sh
# Add Google's signing key and apt repository, then install Chrome.
# sudo has to wrap the command that actually needs root: apt-key (not curl), and the
# file write (via tee), because a shell redirection is performed by the calling shell
# rather than by the command that sudo elevates.
curl -sS -o - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
# tee rewrites this dedicated list file, so re-running the cell does not append
# duplicate repository entries.
echo "deb https://dl.google.com/linux/chrome/deb/ stable main" | sudo tee /etc/apt/sources.list.d/google-chrome.list > /dev/null
sudo apt-get -y update
sudo apt-get -y install google-chrome-stable

设置浏览器

def init_chrome_browser(download_path, chrome_driver_path, url):
    """Launch headless Chrome, open *url*, and return the driver.

    Args:
        download_path: Directory set as Chrome's default download location.
        chrome_driver_path: Path of the chromedriver executable handed to
            ``Service``.
        url: Page requested before the browser is returned.

    Returns:
        The ``webdriver.Chrome`` instance after ``url`` has been requested.
    """

    def _log(message):
        # Progress lines are prefixed with the current timestamp.
        print(f"{datetime.now()}    {message}")

    chrome_prefs = {
        'download.default_directory': download_path,
        'profile.default_content_setting_values.automatic_downloads': 1,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "translate_whitelists": {"vi": "en"},
        "translate": {"enabled": "true"},
    }
    launch_switches = (
        '--no-sandbox',
        # Per the original author: will not work on Databricks without this,
        # because no browser window can be displayed there.
        '--headless',
        '--disable-dev-shm-usage',
        '--start-maximized',
        'window-size=2560,1440',
        '--ignore-certificate-errors',
        '--ignore-ssl-errors',
        '--lang=en',
    )

    chrome_opts = Options()
    chrome_opts.add_experimental_option('prefs', chrome_prefs)
    for switch in launch_switches:
        chrome_opts.add_argument(switch)
    chrome_opts.add_experimental_option('excludeSwitches', ['enable-logging'])

    _log("Launching Chrome...")
    driver_service = Service(chrome_driver_path)
    browser = webdriver.Chrome(service=driver_service, options=chrome_opts)
    _log("Chrome launched.")

    # NOTE: a fixed 15-second pause at this point reportedly let the translation
    # preferences take effect for some users, but not for the original author,
    # so no pause is performed.
    browser.get(url)
    _log("Browser ready to use.")
    return browser

现在可以进行测试了:根据需要更改url,也可以更改下载路径。我曾把下载路径指向数据湖(blob存储)中的一个文件夹,并通过点击下载链接来下载文件;只需在关闭浏览器之前等待几秒钟,以便留出下载时间。

# Start the browser: downloads go to /tmp/downloads, the driver binary is the one
# unpacked into /tmp/chromedriver/, and google.com is the first page requested.
driver = init_chrome_browser(
    download_path="/tmp/downloads",
    chrome_driver_path="/tmp/chromedriver/chromedriver",
    url= "https://www.google.com"
)

现在您应该可以运行代码了

# Selenium 4.3+ removed the find_element_by_* helpers; call find_element with a
# locator strategy instead ("css selector" is the value of By.CSS_SELECTOR).
driver.find_element("css selector", "img").get_attribute("alt")

相关问题