# Source code for darc.sites

# -*- coding: utf-8 -*-
# pylint: disable=ungrouped-imports
"""Sites Customisation
=========================

As websites may impose authentication requirements, etc., on
their content, the :mod:`darc.sites` module provides site
customisation hooks for both the :mod:`requests` and :mod:`selenium`
crawling processes.

Important:
    To create a sites customisation, define your class by inheriting
    :class:`darc.sites.BaseSite` and register it to the :mod:`darc`
    module through :func:`darc.sites.register`.

"""

import collections
import warnings
from typing import TYPE_CHECKING, cast

from darc.error import SiteNotFoundWarning
from darc.sites._abc import BaseSite
from darc.sites.bitcoin import Bitcoin
from darc.sites.data import DataURI
from darc.sites.default import DefaultSite
from darc.sites.ed2k import ED2K
from darc.sites.ethereum import Ethereum
from darc.sites.irc import IRC
from darc.sites.magnet import Magnet
from darc.sites.mail import Email
from darc.sites.script import Script
from darc.sites.tel import Tel
from darc.sites.ws import WebSocket

if TYPE_CHECKING:
    from typing import DefaultDict, List, Type

    from requests import Response, Session
    from selenium.webdriver import Chrome as Driver

    import darc.link as darc_link  # Link
    from darc._compat import datetime


# Registry mapping a (case-folded) hostname to its sites customisation class.
# Being a ``defaultdict``, direct indexing with an unknown key yields
# ``DefaultSite`` (and stores it); ``.get()`` does not trigger that fallback.
SITEMAP = collections.defaultdict(
    lambda: DefaultSite,
    {
        # misc/special links -- pseudo-hostnames wrapped in parentheses
        '(data)': DataURI,
        '(script)': Script,
        '(bitcoin)': Bitcoin,
        '(ed2k)': ED2K,
        '(magnet)': Magnet,
        '(mail)': Email,
        '(tel)': Tel,
        '(irc)': IRC,
        '(ws)': WebSocket,
        '(ethereum)': Ethereum,
    },
)  # type: DefaultDict[str, Type[BaseSite]]


def register(site: 'Type[BaseSite]', *hostname: str) -> None:
    """Register new site map.

    Every hostname is case-folded before being stored as a key of
    :data:`darc.sites.SITEMAP`, so registration is case-insensitive.

    Args:
        site: Sites customisation class inherited from
            :class:`~darc.sites._abc.BaseSite`.
        *hostname (Tuple[str]): Optional list of hostnames the sites
            customisation should be registered with.
            By default, we use :attr:`site.hostname`.

    Note:
        If :attr:`site.hostname` is :data:`None`, it is set to the list of
        hostnames given here (an empty list when none are given). An
        already declared :attr:`site.hostname` is never overwritten.

    """
    if hostname:
        domains = list(hostname)
    else:
        # No explicit hostnames: fall back to the ones declared on the class,
        # as documented. ``site.hostname`` is assumed to be an iterable of
        # strings (or ``None`` when nothing is declared).
        domains = list(site.hostname or ())

    if site.hostname is None:
        # Store a real list so the attribute matches its ``List[str]`` type.
        site.hostname = domains

    for domain in domains:
        SITEMAP[domain.casefold()] = site
def _get_site(link: 'darc_link.Link') -> 'Type[BaseSite]':
    """Look up the sites customisation class for a link.

    The lookup key is the case-folded ``link.host``, or ``'<null>'`` when
    the link has no host. When no customisation is registered under that
    key, a :exc:`~darc.error.SiteNotFoundWarning` is emitted and
    :class:`~darc.sites.default.DefaultSite` is stored under the key, so the
    same host does not warn again.

    Args:
        link: Link object to fetch sites customisation class.

    Returns:
        The sites customisation class.

    See Also:
        * :data:`darc.sites.SITEMAP`

    """
    key = (link.host or '<null>').casefold()

    # ``.get()`` on purpose: indexing the defaultdict would silently insert
    # the default and skip the warning below.
    registered = SITEMAP.get(key)
    if registered is not None:
        return registered

    warnings.warn(f'sites customisation not found: {key}', SiteNotFoundWarning)
    SITEMAP[key] = DefaultSite  # set for cache
    return DefaultSite
def crawler_hook(timestamp: 'datetime', session: 'Session', link: 'darc_link.Link') -> 'Response':
    """Customisation as to :mod:`requests` sessions.

    Resolves the sites customisation class for ``link`` and delegates to
    its ``crawler`` hook with the arguments unchanged.

    Args:
        timestamp: Timestamp of the worker node reference.
        session (requests.Session): Session object with proxy settings.
        link: Link object to be crawled.

    Returns:
        requests.Response: The final response object with crawled data.

    See Also:
        * :data:`darc.sites.SITEMAP`
        * :func:`darc.sites._get_site`
        * :func:`darc.crawl.crawler`

    """
    return _get_site(link).crawler(timestamp, session, link)
def loader_hook(timestamp: 'datetime', driver: 'Driver', link: 'darc_link.Link') -> 'Driver':
    """Customisation as to :mod:`selenium` drivers.

    Resolves the sites customisation class for ``link`` and delegates to
    its ``loader`` hook with the arguments unchanged.

    Args:
        timestamp: Timestamp of the worker node reference.
        driver (selenium.webdriver.Chrome): Web driver object with proxy settings.
        link: Link object to be loaded.

    Returns:
        selenium.webdriver.Chrome: The web driver object with loaded data.

    See Also:
        * :data:`darc.sites.SITEMAP`
        * :func:`darc.sites._get_site`
        * :func:`darc.crawl.loader`

    """
    return _get_site(link).loader(timestamp, driver, link)