Source code for darc.submit

# -*- coding: utf-8 -*-
# pylint: disable=ungrouped-imports
"""Data Submission
=====================

The :mod:`darc` project integrates the capability of submitting
fetched data and information to a web server, to support real-time
cross-analysis and status display.

There are three submission events:

1. New Host Submission -- :data:`~darc.submit.API_NEW_HOST`

   Submitted in :func:`~darc.crawl.crawler` function call, when the
   crawling URL is marked as a new host.

2. Requests Submission -- :data:`~darc.submit.API_REQUESTS`

   Submitted in :func:`~darc.crawl.crawler` function call, after the
   crawling process of the URL using :mod:`requests`.

3. Selenium Submission -- :data:`~darc.submit.API_SELENIUM`

   Submitted in :func:`~darc.crawl.loader` function call, after the
   loading process of the URL using :mod:`selenium`.

.. seealso::

   Please refer to :doc:`data schema </demo/schema>` for more
   information about the submission data.

"""

import base64
import contextlib
import glob
import json
import os
from datetime import date
from typing import TYPE_CHECKING, cast

import peewee
import requests

from darc._compat import datetime
from darc.const import PATH_DB
from darc.db import _db_operation
from darc.error import APIRequestFailed, DatabaseOperaionFailed
from darc.logging import DEBUG as LOG_DEBUG
from darc.logging import WARNING as LOG_WARNING
from darc.logging import logger
from darc.model import (HostnameModel, HostsModel, RequestsHistoryModel, RequestsModel, RobotsModel,
                        SeleniumModel, SitemapModel, URLModel, URLThroughModel)
from darc.model.utils import Proxy
from darc.requests import null_session

if TYPE_CHECKING:
    from typing import Any, Dict, List, Optional, Tuple

    from requests import Response, Session
    from typing_extensions import Literal

    import darc.link as darc_link  # Link
    from darc._typing import File

    Domain = Literal['new_host', 'requests', 'selenium']

# whether submitted data is also saved to the database; read from the
# ``SAVE_DB`` environment variable as an integer flag (default ``1``, enabled)
SAVE_DB = bool(int(os.getenv('SAVE_DB', '1')))

# number of retries for a failed API request, read from ``API_RETRY``
# (default ``3``); ``submit`` makes ``API_RETRY + 1`` attempts in total
API_RETRY = int(os.getenv('API_RETRY', '3'))

# directory under ``PATH_DB`` where ``save_submit`` stores submission data
# as JSON files; created at import time if it does not exist yet
PATH_API = os.path.join(PATH_DB, 'api')
os.makedirs(PATH_API, exist_ok=True)

# API URLs, one per submission event; each is ``None`` when its environment
# variable is unset, in which case the matching ``submit_*`` function saves
# the data locally through ``save_submit`` instead of posting it
API_NEW_HOST = os.getenv('API_NEW_HOST')
API_REQUESTS = os.getenv('API_REQUESTS')
API_SELENIUM = os.getenv('API_SELENIUM')
logger.debug('-*- SUBMIT API -*-\nNEW HOST: %s\nREQUESTS: %s\nSELENIUM: %s\n%s',
             API_NEW_HOST, API_REQUESTS, API_SELENIUM, logger.horizon)

# UNIX epoch; used by ``submit_requests`` as the initial ``since`` value of a
# newly created URL record
EPOCH = datetime(1970, 1, 1, 0, 0)  # 1970-01-01T00:00:00


def get_robots(link: 'darc_link.Link') -> 'Optional[File]':
    """Read ``robots.txt``.

    Args:
        link: Link object to read ``robots.txt``.

    Returns:
        * If ``robots.txt`` exists, return the data from ``robots.txt``.

          * ``path`` -- relative path from ``robots.txt`` to root of data storage
            :data:`~darc.const.PATH_DB`, ``<proxy>/<scheme>/<hostname>/robots.txt``
          * ``data`` -- *base64* encoded content of ``robots.txt``

        * If not, return :data:`None`.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.proxy.null.save_robots`

    """
    path = os.path.join(link.base, 'robots.txt')
    if not os.path.isfile(path):
        return None

    # read as bytes: the content is shipped base64 encoded, never decoded here
    with open(path, 'rb') as file:
        content = file.read()
    return {
        'path': os.path.relpath(path, PATH_DB),
        'data': base64.b64encode(content).decode(),
    }


def get_sitemaps(link: 'darc_link.Link') -> 'Optional[List[File]]':
    """Read sitemaps.

    Args:
        link: Link object to read sitemaps.

    Returns:
        * If sitemaps exist, return list of the data from sitemaps, ordered
          by file path.

          * ``path`` -- relative path from sitemap to root of data storage
            :data:`~darc.const.PATH_DB`, ``<proxy>/<scheme>/<hostname>/sitemap_<hash>.xml``
          * ``data`` -- *base64* encoded content of sitemap

        * If not, return :data:`None`.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.proxy.null.save_sitemap`

    """
    # glob returns matches in arbitrary order; sort for a reproducible payload
    path_list = sorted(glob.glob(os.path.join(link.base, 'sitemap_*.xml')))
    if not path_list:
        return None

    data_list = []  # type: List[File]
    for path in path_list:
        with open(path, 'rb') as file:
            content = file.read()
        data_list.append({
            'path': os.path.relpath(path, PATH_DB),
            'data': base64.b64encode(content).decode(),
        })
    return data_list


def get_hosts(link: 'darc_link.Link') -> 'Optional[File]':
    """Read ``hosts.txt``.

    Args:
        link: Link object to read ``hosts.txt``.

    Returns:
        * If the proxy type of ``link`` is ``i2p`` and ``hosts.txt`` exists,
          return the data from ``hosts.txt``.

          * ``path`` -- relative path from ``hosts.txt`` to root of data storage
            :data:`~darc.const.PATH_DB`, ``<proxy>/<scheme>/<hostname>/hosts.txt``
          * ``data`` -- *base64* encoded content of ``hosts.txt``

        * If not, return :data:`None`.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.proxy.i2p.save_hosts`

    """
    # ``hosts.txt`` is only looked up for I2P links
    if link.proxy != 'i2p':
        return None

    path = os.path.join(link.base, 'hosts.txt')
    if not os.path.isfile(path):
        return None

    with open(path, 'rb') as file:
        content = file.read()
    return {
        'path': os.path.relpath(path, PATH_DB),
        'data': base64.b64encode(content).decode(),
    }


def save_submit(domain: 'Domain', data: 'Dict[str, Any]') -> None:
    """Save failed submit data.

    The data is written as indented JSON to
    ``<PATH_API>/<today>/<base>/<domain>/<name>_<timestamp>.json``, where
    ``base`` and ``name`` come from ``data['[metadata]']`` and ``timestamp``
    from ``data['Timestamp']``.

    Args:
        domain (``'new_host'``, ``'requests'`` or ``'selenium'``): Domain of the submit data.
        data: Submit data; must contain the ``[metadata]`` and ``Timestamp`` keys.

    Raises:
        KeyError: If ``data`` lacks ``[metadata]`` (with ``name`` and
            ``base``) or ``Timestamp``.

    Notes:
        The saved files will be categorised by the actual runtime day
        for better maintenance.

    See Also:
        * :data:`darc.submit.PATH_API`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.submit_new_host`
        * :func:`darc.submit.submit_requests`
        * :func:`darc.submit.submit_selenium`

    """
    today = date.today().isoformat()

    metadata = data['[metadata]']
    name = metadata['name']
    ts = data['Timestamp']

    root = os.path.join(PATH_API, today, metadata['base'], domain)
    os.makedirs(root, exist_ok=True)

    # explicit encoding so the file does not depend on the locale default
    with open(os.path.join(root, f'{name}_{ts}.json'), 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2)


def submit(api: str, domain: 'Domain', data: 'Dict[str, Any]') -> None:
    """Submit data.

    The data is posted as JSON to ``api``, at most ``API_RETRY + 1`` times.
    The function returns as soon as one response is OK. A request exception
    is logged as a warning and the request retried; a non-OK response is
    retried without logging. If no attempt succeeds, the data is saved
    locally through :func:`~darc.submit.save_submit`.

    Args:
        api: API URL.
        domain (``'new_host'``, ``'requests'`` or ``'selenium'``): Domain of the submit data.
        data: Submit data.

    See Also:
        * :data:`darc.submit.API_RETRY`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.submit_new_host`
        * :func:`darc.submit.submit_requests`
        * :func:`darc.submit.submit_selenium`

    """
    with null_session() as session:
        # one initial attempt plus ``API_RETRY`` retries
        for _ in range(API_RETRY+1):
            try:
                response = session.post(api, json=data)
                if response.ok:
                    return
            except requests.RequestException:
                logger.pexc(LOG_WARNING, category=APIRequestFailed,
                            line=f'[{domain.upper()}] response = requests.post(api, json=data)')

    # every attempt failed -- keep the data on disk instead of dropping it
    save_submit(domain, data)


def submit_new_host(time: 'datetime', link: 'darc_link.Link', partial: bool = False, force: bool = False) -> None:
    """Submit new host.

    When a new host is discovered, the :mod:`darc` crawler will submit the
    host information. Such includes ``robots.txt`` (if exists) and
    ``sitemap.xml`` (if any).

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        partial: If the data is not complete, i.e. failed when fetching
            ``robots.txt``, ``hosts.txt`` and/or sitemaps.
        force: If the data is force re-fetched, i.e. cache expired when
            checking with :func:`darc.db.have_hostname`.

    If :data:`~darc.submit.SAVE_DB` is set, the host and its documents are
    also recorded in the database; a failure there is logged as a warning
    and does not prevent the submission.

    If :data:`~darc.submit.API_NEW_HOST` is :data:`None`, the data for submission
    will be saved directly through :func:`~darc.submit.save_submit`.

    The data submitted should have following format:

    .. code-block::

        {
            // partial flag - true / false
            "$PARTIAL$": ...,
            // force flag - true / false
            "$FORCE$": ...,
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // robots.txt from the host (if not exists, then ``null``)
            "Robots": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/robots.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // sitemaps from the host (if none, then ``null``)
            "Sitemaps": [
                {
                    // path of the file, relative path (to data root path ``PATH_DATA``) in container
                    //   - <proxy>/<scheme>/<host>/sitemap_<name>.xml
                    "path": ...,
                    // content of the file (**base64** encoded)
                    "data": ...,
                },
                ...
            ],
            // hosts.txt from the host (if proxy type is ``i2p``; if not exists, then ``null``)
            "Hosts": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/hosts.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_NEW_HOST`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_robots`
        * :func:`darc.submit.get_sitemaps`
        * :func:`darc.submit.get_hosts`

    """
    metadata = link.asdict()
    ts = time.isoformat()

    robots = get_robots(link)
    sitemaps = get_sitemaps(link)
    hosts = get_hosts(link)

    if SAVE_DB:
        # best effort: a database failure is logged and must not block submission
        try:
            model, _ = cast('Tuple[HostnameModel, bool]',
                            _db_operation(HostnameModel.get_or_create, hostname=link.host, defaults={
                                'proxy': Proxy[link.proxy.upper()],
                                'discovery': time,
                                'last_seen': time,
                            }))

            # documents are stored as text, so the base64 payloads are decoded back
            if robots is not None:
                _db_operation(RobotsModel.create,
                              host=model,
                              timestamp=time,
                              document=base64.b64decode(robots['data']).decode())

            if sitemaps is not None:
                for sitemap in sitemaps:
                    _db_operation(SitemapModel.create,
                                  host=model,
                                  timestamp=time,
                                  document=base64.b64decode(sitemap['data']).decode())

            if hosts is not None:
                _db_operation(HostsModel.create,
                              host=model,
                              timestamp=time,
                              document=base64.b64decode(hosts['data']).decode())
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='submit_new_host(...)')

    data = {
        '$PARTIAL$': partial,
        '$FORCE$': force,
        '[metadata]': metadata,
        'Timestamp': ts,
        # the schema documents this field as the original URL (the hostname
        # is already available as ``[metadata].host``)
        'URL': link.url,
        'Robots': robots,
        'Sitemaps': sitemaps,
        'Hosts': hosts,
    }
    logger.plog(LOG_DEBUG, '-*- NEW HOST DATA -*-', object=data)

    if API_NEW_HOST is None:
        save_submit('new_host', data)
        return

    # submit data
    submit(API_NEW_HOST, 'new_host', data)


def submit_requests(time: 'datetime', link: 'darc_link.Link',
                    response: 'Response', session: 'Session',
                    content: bytes, mime_type: str, html: bool = True) -> None:
    """Submit requests data.

    When crawling, we'll first fetch the URL using :mod:`requests`, to check
    its availability and to save its HTTP headers information. Such information
    will be submitted to the web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        response (requests.Response): Response object of submission.
        session (requests.Session): Session object of submission.
        content: Raw content from the response.
        mime_type: Content type.
        html: If current document is HTML or other files.

    If :data:`~darc.submit.SAVE_DB` is set, the host, URL, request and its
    redirection history are also recorded in the database; a failure there
    is logged as a warning and does not prevent the submission.

    If :data:`~darc.submit.API_REQUESTS` is :data:`None`, the data for submission
    will be saved directly through :func:`~darc.submit.save_submit`.

    The data submitted should have following format:

    .. code-block::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // request method
            "Method": "GET",
            // response status code
            "Status-Code": ...,
            // response reason
            "Reason": ...,
            // response cookies (if any), one record of attributes per cookie
            "Cookies": [
                ...
            ],
            // session cookies (if any), one record of attributes per cookie
            "Session": [
                ...
            ],
            // request headers (if any)
            "Request": {
                ...
            },
            // response headers (if any)
            "Response": {
                ...
            },
            // content type
            "Content-Type": ...,
            // requested file (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>_raw.html
                // or if the document is of generic content type, i.e. not HTML
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.dat
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // redirection history (if any)
            "History": [
                // same record data as the original response
                {"...": "..."}
            ]
        }

    See Also:
        * :data:`darc.submit.API_REQUESTS`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_raw`
        * :func:`darc.crawl.crawler`

    """
    if SAVE_DB:
        # best effort: a database failure is logged and must not block submission
        try:
            model, model_created = cast('Tuple[HostnameModel, bool]',
                                        _db_operation(HostnameModel.get_or_create, hostname=link.host, defaults={
                                            'proxy': Proxy[link.proxy.upper()],
                                            'discovery': time,
                                            'last_seen': time,
                                        }))
            if not model_created:
                model.last_seen = time
                _db_operation(model.save)

            # a new URL starts as "not alive since the epoch"; the check
            # below flips it right away when this response is OK
            url, url_created = cast('Tuple[URLModel, bool]',
                                    _db_operation(URLModel.get_or_create, hash=link.name, defaults={
                                        'url': link.url,
                                        'hostname': model,
                                        'proxy': Proxy[link.proxy.upper()],
                                        'discovery': time,
                                        'last_seen': time,
                                        'alive': False,
                                        'since': EPOCH,
                                    }))
            # ``since`` only moves when the alive state actually changes
            if not url.alive and response.ok:
                url.alive = True
                url.since = time
            elif url.alive and not response.ok:
                url.alive = False
                url.since = time
            if not url_created:
                url.last_seen = time
            _db_operation(url.save)

            if link.url_backref is not None:
                # an already recorded parent/child pair is not an error
                with contextlib.suppress(peewee.IntegrityError):
                    _db_operation(URLThroughModel.create,
                                  parent=_db_operation(URLModel.get_by_url, link.url_backref.url),
                                  child=url)

            request_model = cast('RequestsModel',
                                 _db_operation(RequestsModel.create,
                                               url=url,
                                               timestamp=time,
                                               method=response.request.method,
                                               document=content,
                                               mime_type=mime_type,
                                               is_html=html,
                                               status_code=response.status_code,
                                               reason=response.reason,
                                               cookies=response.cookies.get_dict(),
                                               # session cookies come from the session,
                                               # not from this single response
                                               session=session.cookies.get_dict(),
                                               request=dict(response.request.headers),
                                               response=dict(response.headers)))

            for index, history in enumerate(response.history):
                _db_operation(RequestsHistoryModel.create,
                              index=index,
                              model=request_model,
                              url=history.url,
                              timestamp=time,
                              method=history.request.method,
                              document=history.content,
                              status_code=history.status_code,
                              reason=history.reason,
                              cookies=history.cookies.get_dict(),
                              request=dict(history.request.headers),
                              response=dict(history.headers))
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='submit_requests(...)')

    metadata = link.asdict()
    ts = time.isoformat()

    if html:
        path = f'{link.base}/{link.name}_{ts}_raw.html'
    else:
        path = f'{link.base}/{link.name}_{ts}.dat'

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Method': response.request.method,
        'Status-Code': response.status_code,
        'Reason': response.reason,
        'Cookies': [vars(cookie) for cookie in response.cookies],
        'Session': [vars(cookie) for cookie in session.cookies],
        'Request': dict(response.request.headers),
        'Response': dict(response.headers),
        'Content-Type': mime_type,
        'Document': {
            'path': os.path.relpath(path, PATH_DB),
            'data': base64.b64encode(content).decode(),
        },
        'History': [{
            'URL': history.url,
            'Method': history.request.method,
            'Status-Code': history.status_code,
            'Reason': history.reason,
            'Cookies': history.cookies.get_dict(),
            'Request': dict(history.request.headers),
            'Response': dict(history.headers),
            'Document': base64.b64encode(history.content).decode(),
        } for history in response.history],
    }
    logger.plog(LOG_DEBUG, '-*- REQUESTS DATA -*-', object=data)

    if API_REQUESTS is None:
        save_submit('requests', data)
        return

    # submit data
    submit(API_REQUESTS, 'requests', data)


def submit_selenium(time: 'datetime', link: 'darc_link.Link',
                    html: str, screenshot: 'Optional[str]') -> None:
    """Submit selenium data.

    After crawling with :mod:`requests`, we'll then render the URL using
    :mod:`selenium` with Google Chrome and its web driver, to provide a fully
    rendered web page. Such information will be submitted to the web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        html: HTML source of the web page.
        screenshot: *base64* encoded screenshot.

    If :data:`~darc.submit.SAVE_DB` is set, the host, URL and rendered page
    are also recorded in the database; a failure there is logged as a
    warning and does not prevent the submission.

    If :data:`~darc.submit.API_SELENIUM` is :data:`None`, the data for submission
    will be saved directly through :func:`~darc.submit.save_submit`.

    Note:
        This information is optional, only provided if the content type from
        :mod:`requests` is HTML, status code not between ``400`` and ``600``, and
        HTML data not empty.

    The data submitted should have following format:

    .. code-block::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // rendered HTML document (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.html
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // web page screenshot (if not exists, then ``null``)
            "Screenshot": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.png
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_SELENIUM`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_html`
        * :func:`darc.submit.get_screenshot`
        * :func:`darc.crawl.loader`

    """
    if SAVE_DB:
        # best effort: a database failure is logged and must not block submission
        try:
            model, model_created = cast('Tuple[HostnameModel, bool]',
                                        _db_operation(HostnameModel.get_or_create, hostname=link.host, defaults={
                                            'proxy': Proxy[link.proxy.upper()],
                                            'discovery': time,
                                            'last_seen': time,
                                        }))
            if not model_created:
                model.last_seen = time
                _db_operation(model.save)

            # a URL first recorded here is created as alive
            url, url_created = cast('Tuple[URLModel, bool]',
                                    _db_operation(URLModel.get_or_create, hash=link.name, defaults={
                                        'url': link.url,
                                        'hostname': model,
                                        'proxy': Proxy[link.proxy.upper()],
                                        'discovery': time,
                                        'last_seen': time,
                                        'alive': True,
                                        'since': time,
                                    }))
            if not url.alive:
                url.alive = True
                url.since = time
            if not url_created:
                url.last_seen = time
            _db_operation(url.save)

            if link.url_backref is not None:
                # an already recorded parent/child pair is not an error
                with contextlib.suppress(peewee.IntegrityError):
                    _db_operation(URLThroughModel.create,
                                  parent=_db_operation(URLModel.get_by_url, link.url_backref.url),
                                  child=url)

            # the screenshot is stored as raw bytes, so decode the base64 text
            _db_operation(SeleniumModel.create,
                          url=url,
                          timestamp=time,
                          document=html,
                          screenshot=base64.b64decode(screenshot) if screenshot else None)
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='submit_selenium(...)')

    metadata = link.asdict()
    ts = time.isoformat()

    if screenshot is None:
        ss = None  # type: Optional[File]
    else:
        # the screenshot is already base64 encoded, pass it through as is
        ss = {
            'path': os.path.relpath(f'{link.base}/{link.name}_{ts}.png', PATH_DB),
            'data': screenshot,
        }

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Document': {
            'path': os.path.relpath(f'{link.base}/{link.name}_{ts}.html', PATH_DB),
            'data': base64.b64encode(html.encode()).decode(),
        },
        'Screenshot': ss,
    }
    logger.plog(LOG_DEBUG, '-*- SELENIUM DATA -*-', object=data)

    if API_SELENIUM is None:
        save_submit('selenium', data)
        return

    # submit data
    submit(API_SELENIUM, 'selenium', data)
Source code for darc.submit

darc

Navigation

Related Topics