# Source code for darc.sites

# -*- coding: utf-8 -*-
"""Sites Customisation
=========================

As websites may impose authentication requirements, etc., on
their content, the :mod:`darc.sites` module provides site
customisation hooks for both the :mod:`requests` and :mod:`selenium`
crawling processes.

"""

import collections

import importlib
import warnings

import darc.typing as typing
import darc.sites.default as default  # import as cache
from darc.error import SiteNotFoundWarning
from darc.link import Link

# Lookup table from case-folded host name (or one of the bracketed
# pseudo-hosts below) to its site customisation. A value is either a
# dotted module name or a module object; a host without an entry
# resolves to the ``default`` hooks module via the defaultdict factory.
SITEMAP = collections.defaultdict(
    lambda: default,
    [
        # misc/special links
        ('(data)', 'darc.sites.data'),
        ('(script)', 'darc.sites.script'),
        ('(bitcoin)', 'darc.sites.bitcoin'),
        ('(ed2k)', 'darc.sites.ed2k'),
        ('(magnet)', 'darc.sites.magnet'),
        ('(mail)', 'darc.sites.mail'),
        ('(tel)', 'darc.sites.tel'),
        ('(irc)', 'darc.sites.irc'),

        # ('www.sample.com', 'sample'),  # local customised module
    ],
)  # type: typing.DefaultDict[str, typing.Union[str, typing.ModuleType]]


def register(domain: str, module: typing.Union[str, typing.ModuleType]):
    """Add or replace the site customisation used for a domain.

    Args:
        domain: Domain name. It is case-folded before being used as the
            :data:`SITEMAP` key, so registration is case insensitive.
        module: Fully qualified module name, or a module object. A name
            is imported immediately; a module object is stored as is.

    Raises:
        ImportError: If ``module`` is a name that cannot be imported.

    """
    # Resolve the hooks first so that a failed import leaves SITEMAP untouched.
    hooks = importlib.import_module(module) if isinstance(module, str) else module
    SITEMAP[domain.casefold()] = hooks


def _get_module(link: Link) -> typing.ModuleType:
    """Load the sites customisation module for a link, if any.

    The link's host is case-folded and looked up in :data:`SITEMAP`.
    An entry that is still a module name is imported on first use and
    the imported module is written back to :data:`SITEMAP`, so later
    lookups for the same host skip the import. If the import fails, the
    default hooks, :mod:`~darc.sites.default`, are used (and cached)
    instead.

    Args:
        link: Link object to fetch sites customisation module.

    Returns:
        types.ModuleType: The sites customisation module.

    Warns:
        SiteNotFoundWarning: If the sites customisation failed to import.

    See Also:
        * :data:`darc.sites.SITEMAP`

    """
    domain = link.host.casefold()

    # SITEMAP is a defaultdict: a host without an entry yields (and stores)
    # the default hooks module here.
    module = SITEMAP[domain]
    if isinstance(module, str):
        try:
            module = importlib.import_module(module)
        except ImportError:
            # ``module`` is still the unresolved name at this point.
            warnings.warn(f'site customisation not found: {module}', SiteNotFoundWarning)
            module = default
        SITEMAP[domain] = module  # set for cache
    return module


def crawler_hook(link: Link, session: typing.Session) -> typing.Response:
    """Customisation as to :mod:`requests` sessions.

    Resolves the sites customisation module for ``link`` and delegates
    to that module's ``crawler`` function.

    Args:
        link: Link object to be crawled.
        session (requests.Session): Session object with proxy settings.

    Returns:
        requests.Response: The final response object with crawled data.

    See Also:
        * :data:`darc.sites.SITEMAP`
        * :func:`darc.sites._get_module`
        * :func:`darc.crawl.crawler`

    """
    module = _get_module(link)
    # The site module's hook takes the session first, then the link.
    return module.crawler(session, link)  # type: ignore


def loader_hook(link: Link, driver: typing.Driver) -> typing.Driver:
    """Customisation as to :mod:`selenium` drivers.

    Resolves the sites customisation module for ``link`` and delegates
    to that module's ``loader`` function.

    Args:
        link: Link object to be loaded.
        driver (selenium.webdriver.Chrome): Web driver object with proxy settings.

    Returns:
        selenium.webdriver.Chrome: The web driver object with loaded data.

    See Also:
        * :data:`darc.sites.SITEMAP`
        * :func:`darc.sites._get_module`
        * :func:`darc.crawl.loader`

    """
    module = _get_module(link)
    # The site module's hook takes the driver first, then the link.
    return module.loader(driver, link)  # type: ignore