Source code for darc.sites

# -*- coding: utf-8 -*-
"""Sites Customisation
=========================

As websites may impose authentication or other access requirements
on their content, the :mod:`darc.sites` module provides site
customisation hooks for both the :mod:`requests` and :mod:`selenium`
crawling processes.

"""

import collections

import importlib
import warnings

import darc.typing as typing
from darc.error import SiteNotFoundWarning
from darc.link import Link

# Registry mapping a (case-folded) hostname to the name of its sites
# customisation module. Being a ``defaultdict``, any hostname that has not
# been registered resolves to ``'default'``.
SITEMAP = collections.defaultdict(lambda: 'default')
SITEMAP.update({
    # pseudo-hosts used for misc/special (non-web) links
    '(data)': 'data',
    '(script)': 'script',
    '(bitcoin)': 'bitcoin',
    '(ed2k)': 'ed2k',
    '(magnet)': 'magnet',
    '(mail)': 'mail',
    '(tel)': 'tel',
    '(irc)': 'irc',

    # example of a real host entry:
    # 'www.sample.com': 'sample',  # darc.sites.sample
})


def register(domain: str, module: str):
    """Add (or replace) a site customisation entry in :data:`SITEMAP`.

    Args:
        domain: Domain name; it is case-folded before being stored, so
            registration is case insensitive.
        module: Fully qualified name of the module providing the
            customisation hooks for ``domain``.

    """
    key = domain.casefold()
    SITEMAP[key] = module


def _get_spec(link: Link) -> typing.ModuleType:
    """Load spec if any.

    The module name registered for the link's host is imported first as
    ``darc.sites.<name>`` and, should that fail, as the top-level module
    ``<name>``. If neither can be imported, it will fallback to the
    default hooks, :mod:`~darc.sites.default`.

    Looking up a host never modifies :data:`~darc.sites.SITEMAP`; hosts
    that are not registered simply resolve to the map's default value.

    Args:
        link: Link object to fetch sites customisation module.

    Returns:
        types.ModuleType: The sites customisation module.

    Warns:
        SiteNotFoundWarning: If the sites customisation failed to import.

    See Also:
        * :data:`darc.sites.SITEMAP`

    """
    host = link.host.casefold()

    # ``SITEMAP`` is a ``defaultdict``: indexing it with an unknown host would
    # insert that host into the map, so the registry would grow with every
    # unregistered site visited. ``get`` does not trigger the insertion.
    spec = SITEMAP.get(host)
    if spec is None:
        # same fallback value indexing would produce, without storing it
        spec = SITEMAP.default_factory()

    # try the bundled customisation first, then a user-supplied module
    for module_name in (f'darc.sites.{spec}', spec):
        try:
            return importlib.import_module(module_name)
        except ImportError:
            continue

    warnings.warn(f'site customisation not found: {spec}', SiteNotFoundWarning)
    return importlib.import_module('darc.sites.default')
def crawler_hook(link: Link, session: typing.Session) -> typing.Response:
    """Crawl a link through its site-specific :mod:`requests` hook.

    The sites customisation module for ``link`` is resolved through
    :func:`darc.sites._get_spec`, and its ``crawler`` function is then
    called with ``session`` and ``link``.

    Args:
        link: Link object to be crawled.
        session (requests.Session): Session object with proxy settings.

    Returns:
        requests.Response: The final response object with crawled data.

    See Also:
        * :data:`darc.sites.SITEMAP`
        * :func:`darc.sites._get_spec`
        * :func:`darc.crawl.crawler`

    """
    site_module = _get_spec(link)
    crawl = site_module.crawler
    return crawl(session, link)
def loader_hook(link: Link, driver: typing.Driver) -> typing.Driver:
    """Load a link through its site-specific :mod:`selenium` hook.

    The sites customisation module for ``link`` is resolved through
    :func:`darc.sites._get_spec`, and its ``loader`` function is then
    called with ``driver`` and ``link``.

    Args:
        link: Link object to be loaded.
        driver (selenium.webdriver.Chrome): Web driver object with proxy settings.

    Returns:
        selenium.webdriver.Chrome: The web driver object with loaded data.

    See Also:
        * :data:`darc.sites.SITEMAP`
        * :func:`darc.sites._get_spec`
        * :func:`darc.crawl.loader`

    """
    site_module = _get_spec(link)
    load = site_module.loader
    return load(driver, link)