Source code for darc.selenium

# -*- coding: utf-8 -*-
# pylint: disable=ungrouped-imports
"""Selenium Wrapper
======================

The :mod:`darc.selenium` module wraps around the :mod:`selenium`
module, and provides a simple interface for the :mod:`darc`
project.

"""

import getpass
import os
import platform
import shutil
from typing import TYPE_CHECKING

import selenium.webdriver.chrome.options as selenium_options
import selenium.webdriver.chrome.webdriver as selenium_webdriver
import selenium.webdriver.common.desired_capabilities as selenium_desired_capabilities

from darc.const import DEBUG
from darc.error import UnsupportedLink, UnsupportedPlatform, UnsupportedProxy
from darc.proxy.i2p import I2P_PORT, I2P_SELENIUM_PROXY
from darc.proxy.tor import TOR_PORT, TOR_SELENIUM_PROXY

if TYPE_CHECKING:
    from typing import Dict

    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.webdriver import WebDriver

    import darc.link as darc_link  # Link

# Location of the Google Chrome binary. The ``CHROME_BINARY_LOCATION``
# environment variable takes precedence; otherwise a per-platform default
# is used (macOS and Linux only).
BINARY_LOCATION = os.getenv('CHROME_BINARY_LOCATION')
if BINARY_LOCATION is None:
    _platform_name = platform.system()

    if _platform_name == 'Linux':
        # ``shutil.which`` gives ``None`` when ``google-chrome`` is not on ``PATH``
        BINARY_LOCATION = shutil.which('google-chrome')
    elif _platform_name == 'Darwin':
        BINARY_LOCATION = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
    # on any other platform ``BINARY_LOCATION`` stays ``None``

    # keep the temporary helper out of the module namespace
    del _platform_name


def request_driver(link: 'darc_link.Link') -> 'WebDriver':
    """Build the selenium driver matching the proxy type of a link.

    Args:
        link: Link requesting for a
            :class:`~selenium.webdriver.chrome.webdriver.WebDriver`.

    Returns:
        selenium.webdriver.chrome.webdriver.WebDriver: The web driver
        object with corresponding proxy settings.

    Raises:
        UnsupportedLink: If the :data:`~darc.proxy.LINK_MAP` entry for the
            proxy type of ``link`` carries no driver factory (:data:`None`).

    See Also:
        * :data:`darc.proxy.LINK_MAP`

    """
    # imported at call time rather than at module level
    from darc.proxy import LINK_MAP  # pylint: disable=import-outside-toplevel

    # each entry is a pair; only its second item (the driver factory) is used here
    _unused, driver_factory = LINK_MAP[link.proxy]
    if driver_factory is not None:
        return driver_factory()
    raise UnsupportedLink(link.url)


def get_options(type: str = 'null') -> 'Options':  # pylint: disable=redefined-builtin
    """Generate options.

    Args:
        type: Proxy type for options.

    Returns:
        selenium.webdriver.chrome.options.Options: The options for the web driver
        :class:`~selenium.webdriver.chrome.webdriver.WebDriver`.

    Raises:
        UnsupportedPlatform: If the operation system is **NOT** macOS or Linux
            and :envvar:`CHROME_BINARY_LOCATION` is **NOT** set.
        UnsupportedProxy: If the proxy type is **NOT**
            ``null``, ``tor`` or ``i2p``.

    Important:
        The function raises :exc:`UnsupportedPlatform` in cases where
        :data:`~darc.selenium.BINARY_LOCATION` is :data:`None`.

        Please provide :envvar:`CHROME_BINARY_LOCATION` when running
        :mod:`darc` in ``loader`` mode on non *macOS* and/or *Linux* systems.

    Note:
        * ``--headless`` is added when :data:`~darc.const.DEBUG` is false,
          and always when running inside Docker on Linux
          (detected through ``/.dockerenv``).
        * ``--no-sandbox`` is added on Linux when the effective user ID is
          ``0`` (``root``).
        * ``--disable-dev-shm-usage`` is always added on Linux.

    See Also:
        * :data:`darc.proxy.tor.TOR_PORT`
        * :data:`darc.proxy.i2p.I2P_PORT`

    References:
        * `Google Chrome command line switches
          <https://peter.sh/experiments/chromium-command-line-switches/>`__
        * Disable sandbox (``--no-sandbox``) when running as ``root`` user

          - https://crbug.com/638180
          - https://stackoverflow.com/a/50642913/7218152

        * Disable usage of ``/dev/shm``

          - http://crbug.com/715363

        * `Using Socks proxy
          <https://www.chromium.org/developers/design-documents/network-stack/socks-proxy>`__

    """
    _system = platform.system()

    # initiate options
    options = selenium_options.Options()
    if BINARY_LOCATION is None:
        raise UnsupportedPlatform(f'unsupported system: {_system}')
    options.binary_location = BINARY_LOCATION

    on_linux = _system == 'Linux'
    # check if in Docker, where the headless option is forced
    in_docker = on_linux and os.path.isfile('/.dockerenv')

    # https://peter.sh/experiments/chromium-command-line-switches/
    # add the switch exactly once, whichever condition asks for it
    if not DEBUG or in_docker:
        options.add_argument('--headless')

    if on_linux:
        # c.f. https://crbug.com/638180; https://stackoverflow.com/a/50642913/7218152
        # Test the effective UID instead of ``getpass.getuser()``: the latter
        # trusts the LOGNAME/USER/... environment variables and fails when the
        # UID has no entry in the password database.
        if os.geteuid() == 0:
            options.add_argument('--no-sandbox')

        # c.f. http://crbug.com/715363
        options.add_argument('--disable-dev-shm-usage')

    if type != 'null':
        if type == 'tor':
            port = TOR_PORT
        elif type == 'i2p':
            port = I2P_PORT
        else:
            raise UnsupportedProxy(f'unsupported proxy: {type}')

        # c.f. https://www.chromium.org/developers/design-documents/network-stack/socks-proxy
        options.add_argument(f'--proxy-server=socks5://localhost:{port}')
        # No shell is involved when the argument reaches Chrome, so the value
        # must NOT be wrapped in literal quotes -- they would become part of
        # the rule text and prevent the ``MAP`` rule from being parsed.
        options.add_argument('--host-resolver-rules=MAP * ~NOTFOUND , EXCLUDE localhost')
    return options


def get_capabilities(type: str = 'null') -> 'Dict[str, str]':  # pylint: disable=redefined-builtin
    """Generate desired capabilities.

    Args:
        type: Proxy type for capabilities.

    Returns:
        The desired capabilities for the web driver
        :class:`~selenium.webdriver.chrome.webdriver.WebDriver`.

    Raises:
        UnsupportedProxy: If the proxy type is **NOT**
            ``null``, ``tor`` or ``i2p``.

    See Also:
        * :data:`darc.proxy.tor.TOR_SELENIUM_PROXY`
        * :data:`darc.proxy.i2p.I2P_SELENIUM_PROXY`

    """
    # reject unknown proxy types up front
    if type not in ('null', 'tor', 'i2p'):
        raise UnsupportedProxy(f'unsupported proxy: {type}')

    # work on a copy so the shared ``DesiredCapabilities.CHROME`` dict stays untouched
    chrome_defaults = selenium_desired_capabilities.DesiredCapabilities.CHROME
    capabilities = chrome_defaults.copy()

    # ``null`` needs no proxy information at all
    if type == 'tor':
        TOR_SELENIUM_PROXY.add_to_capabilities(capabilities)
    elif type == 'i2p':
        I2P_SELENIUM_PROXY.add_to_capabilities(capabilities)
    return capabilities


def i2p_driver() -> 'WebDriver':
    """I2P (``.i2p``) driver.

    Creates a Chrome web driver from the ``i2p`` flavoured options and
    capabilities.

    Returns:
        selenium.webdriver.chrome.webdriver.WebDriver: The web driver object
        with I2P proxy settings.

    See Also:
        * :func:`darc.selenium.get_options`
        * :func:`darc.selenium.get_capabilities`

    """
    # options are built before capabilities (keyword arguments evaluate left to right)
    return selenium_webdriver.WebDriver(
        options=get_options('i2p'),
        desired_capabilities=get_capabilities('i2p'),
    )


def tor_driver() -> 'WebDriver':
    """Tor (``.onion``) driver.

    Creates a Chrome web driver from the ``tor`` flavoured options and
    capabilities.

    Returns:
        selenium.webdriver.chrome.webdriver.WebDriver: The web driver object
        with Tor proxy settings.

    See Also:
        * :func:`darc.selenium.get_options`
        * :func:`darc.selenium.get_capabilities`

    """
    # options are built before capabilities (keyword arguments evaluate left to right)
    return selenium_webdriver.WebDriver(
        options=get_options('tor'),
        desired_capabilities=get_capabilities('tor'),
    )


def null_driver() -> 'WebDriver':
    """No proxy driver.

    Creates a Chrome web driver from the ``null`` flavoured options and
    capabilities, i.e. without any proxy configuration.

    Returns:
        selenium.webdriver.chrome.webdriver.WebDriver: The web driver object
        with no proxy settings.

    See Also:
        * :func:`darc.selenium.get_options`
        * :func:`darc.selenium.get_capabilities`

    """
    # options are built before capabilities (keyword arguments evaluate left to right)
    return selenium_webdriver.WebDriver(
        options=get_options('null'),
        desired_capabilities=get_capabilities('null'),
    )