Source code for darc.sites
# -*- coding: utf-8 -*-
"""Sites Customisation
=========================
As websites may have authentication requirements, etc., over
its content, the :mod:`darc.sites` module provides site
customisation hooks to both |requests|_ and |selenium|_
crawling processes.
"""
import collections
import importlib
import warnings
import darc.typing as typing
from darc.error import SiteNotFoundWarning
from darc.link import Link
SITEMAP = collections.defaultdict(lambda: 'default', {
# 'www.sample.com': 'sample', # darc.sites.sample
})
[docs]def _get_spec(link: Link) -> typing.ModuleType:
"""Load spec if any.
If the sites customisation failed to import, it will
fallback to the default hooks, :mod:`~darc.sites.default`.
Args:
link: Link object to fetch sites customisation module.
Returns:
types.ModuleType: The sites customisation module.
Warns:
SiteNotFoundWarning: If the sites customisation failed to import.
See Also:
* :data:`darc.sites.SITEMAP`
"""
spec = SITEMAP[link.host.casefold()]
try:
return importlib.import_module(f'darc.sites.{spec}')
except ImportError:
warnings.warn(f'site customisation not found: {spec}', SiteNotFoundWarning)
return importlib.import_module(f'darc.sites.default')
[docs]def crawler_hook(link: Link, session: typing.Session) -> typing.Response:
"""Customisation as to |requests|_ sessions.
Args:
link: Link object to be crawled.
session (|Session|_): Session object with proxy settings.
Returns:
|Response|_: The final response object with crawled data.
See Also:
* :data:`darc.sites.SITE_MAP`
* :func:`darc.sites._get_spec`
* :func:`darc.crawl.crawler`
"""
spec = _get_spec(link)
return spec.crawler(session, link)
[docs]def loader_hook(link: Link, driver: typing.Driver) -> typing.Driver:
"""Customisation as to |selenium|_ drivers.
Args:
link: Link object to be loaded.
driver (|Chrome|_): Web driver object with proxy settings.
Returns:
|Chrome|_: The web driver object with loaded data.
See Also:
* :data:`darc.sites.SITE_MAP`
* :func:`darc.sites._get_spec`
* :func:`darc.crawl.loader`
"""
spec = _get_spec(link)
return spec.loader(driver, link)