Source code for darc.submit

# -*- coding: utf-8 -*-
# pylint: disable=ungrouped-imports
"""Data Submission
=====================

The :mod:`darc` project integrates the capability of submitting
fetched data and information to a web server, to support real-time
cross-analysis and status display.

There are three submission events:

1. New Host Submission -- :data:`~darc.submit.API_NEW_HOST`

   Submitted in :func:`~darc.crawl.crawler` function call, when the
   crawling URL is marked as a new host.

2. Requests Submission -- :data:`~darc.submit.API_REQUESTS`

   Submitted in :func:`~darc.crawl.crawler` function call, after the
   crawling process of the URL using :mod:`requests`.

3. Selenium Submission -- :data:`~darc.submit.API_SELENIUM`

   Submitted in :func:`~darc.crawl.loader` function call, after the
   loading process of the URL using :mod:`selenium`.

.. seealso::

   Please refer to :doc:`data schema </demo/schema>` for more
   information about the submission data.

"""

import base64
import contextlib
import glob
import json
import os
from datetime import date
from typing import TYPE_CHECKING, cast

import peewee
import requests

from darc._compat import datetime
from darc.const import PATH_DB
from darc.db import _db_operation
from darc.error import APIRequestFailed, DatabaseOperaionFailed
from darc.logging import DEBUG as LOG_DEBUG
from darc.logging import WARNING as LOG_WARNING
from darc.logging import logger
from darc.model import (HostnameModel, HostsModel, RequestsHistoryModel, RequestsModel, RobotsModel,
                        SeleniumModel, SitemapModel, URLModel, URLThroughModel)
from darc.model.utils import Proxy
from darc.requests import null_session

if TYPE_CHECKING:
    from typing import Any, Dict, List, Optional, Tuple

    from requests import Response, Session
    from typing_extensions import Literal

    import darc.link as darc_link  # Link
    from darc._typing import File

    Domain = Literal['new_host', 'requests', 'selenium']

# Save submitted data to database: integer flag read from the ``SAVE_DB``
# environment variable (defaults to ``1``, i.e. enabled). The ``submit_*``
# functions below only touch the database when this is true.
SAVE_DB = bool(int(os.getenv('SAVE_DB', '1')))

# Retry times: read from the ``API_RETRY`` environment variable (defaults
# to ``3``). ``submit`` performs ``API_RETRY + 1`` attempts in total.
API_RETRY = int(os.getenv('API_RETRY', '3'))

# API request storage: folder ``api`` under ``PATH_DB`` where ``save_submit``
# writes its JSON files. NB: the folder is created at import time.
PATH_API = os.path.join(PATH_DB, 'api')
os.makedirs(PATH_API, exist_ok=True)

# API URLs: each one is ``None`` when its environment variable is unset, in
# which case the matching ``submit_*`` function saves the data locally through
# ``save_submit`` instead of posting it.
API_NEW_HOST = os.getenv('API_NEW_HOST')
API_REQUESTS = os.getenv('API_REQUESTS')
API_SELENIUM = os.getenv('API_SELENIUM')
logger.debug('-*- SUBMIT API -*-\nNEW HOST: %s\nREQUESTS: %s\nSELENIUM: %s\n%s',
             API_NEW_HOST, API_REQUESTS, API_SELENIUM, logger.horizon)

# UNIX epoch: used by ``submit_requests`` as the initial ``since`` value of a
# newly created URL record.
EPOCH = datetime(1970, 1, 1, 0, 0)  # 1970-01-01T00:00:00


def get_robots(link: 'darc_link.Link') -> 'Optional[File]':
    """Load the saved ``robots.txt`` of a host.

    Args:
        link: Link object whose ``base`` folder is looked up for
            ``robots.txt``.

    Returns:
        * If ``robots.txt`` exists, a mapping with

          * ``path`` -- path of ``robots.txt`` relative to the root of data
            storage :data:`~darc.const.PATH_DB`,
            ``<proxy>/<scheme>/<hostname>/robots.txt``
          * ``data`` -- *base64* encoded content of ``robots.txt``

        * Otherwise, :data:`None`.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.proxy.null.save_robots`

    """
    robots_path = os.path.join(link.base, 'robots.txt')

    if os.path.isfile(robots_path):
        # read as bytes so that the payload is encoded exactly as stored
        with open(robots_path, 'rb') as stream:
            raw = stream.read()

        encoded = base64.b64encode(raw).decode()
        return {
            'path': os.path.relpath(robots_path, PATH_DB),
            'data': encoded,
        }

    return None
def get_sitemaps(link: 'darc_link.Link') -> 'Optional[List[File]]':
    """Load every saved sitemap of a host.

    Args:
        link: Link object whose ``base`` folder is searched for files
            matching ``sitemap_*.xml``.

    Returns:
        * If at least one sitemap exists, a list with one mapping per file:

          * ``path`` -- path of the sitemap relative to the root of data
            storage :data:`~darc.const.PATH_DB`,
            ``<proxy>/<scheme>/<hostname>/sitemap_<hash>.xml``
          * ``data`` -- *base64* encoded content of the sitemap

        * Otherwise, :data:`None`.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.proxy.null.save_sitemap`

    """
    pattern = os.path.join(link.base, 'sitemap_*.xml')
    matches = glob.glob(pattern)
    if len(matches) == 0:
        return None

    records = []  # type: List[File]
    for sitemap_path in matches:
        with open(sitemap_path, 'rb') as stream:
            encoded = base64.b64encode(stream.read()).decode()

        records.append({
            'path': os.path.relpath(sitemap_path, PATH_DB),
            'data': encoded,
        })
    return records
def get_hosts(link: 'darc_link.Link') -> 'Optional[File]':
    """Load the saved ``hosts.txt`` of an I2P host.

    Args:
        link: Link object; only links whose ``proxy`` is ``'i2p'`` are
            looked up, in their ``base`` folder.

    Returns:
        * If the proxy type is ``i2p`` and ``hosts.txt`` exists, a mapping
          with

          * ``path`` -- path of ``hosts.txt`` relative to the root of data
            storage :data:`~darc.const.PATH_DB`,
            ``<proxy>/<scheme>/<hostname>/hosts.txt``
          * ``data`` -- *base64* encoded content of ``hosts.txt``

        * Otherwise, :data:`None`.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.proxy.i2p.save_hosts`

    """
    if link.proxy == 'i2p':
        hosts_path = os.path.join(link.base, 'hosts.txt')

        if os.path.isfile(hosts_path):
            with open(hosts_path, 'rb') as stream:
                raw = stream.read()

            encoded = base64.b64encode(raw).decode()
            return {
                'path': os.path.relpath(hosts_path, PATH_DB),
                'data': encoded,
            }

    return None
def save_submit(domain: 'Domain', data: 'Dict[str, Any]') -> None:
    """Dump submit data to a local JSON file.

    The file is written to
    ``<PATH_API>/<today>/<metadata base>/<domain>/<name>_<timestamp>.json``,
    where ``name`` and ``base`` are taken from ``data['[metadata]']`` and
    ``timestamp`` from ``data['Timestamp']``.

    Args:
        domain (``'new_host'``, ``'requests'`` or ``'selenium'``): Domain of
            the submit data.
        data: Submit data.

    Notes:
        The saved files are categorised by the actual runtime day (not by
        the timestamp inside the data) for better maintenance.

    See Also:
        * :data:`darc.submit.PATH_API`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.submit_new_host`
        * :func:`darc.submit.submit_requests`
        * :func:`darc.submit.submit_selenium`

    """
    meta = data['[metadata]']
    filename = f"{meta['name']}_{data['Timestamp']}.json"

    # one folder per runtime day, then per host base folder and domain
    folder = os.path.join(PATH_API, date.today().isoformat(), meta['base'], domain)
    os.makedirs(folder, exist_ok=True)

    destination = os.path.join(folder, filename)
    with open(destination, 'w') as handle:
        json.dump(data, handle, indent=2)
def submit(api: str, domain: 'Domain', data: 'Dict[str, Any]') -> None:
    """Post submit data to the API, falling back to a local save.

    The data is posted as JSON up to ``API_RETRY + 1`` times. The function
    returns as soon as one response reports success; if no attempt succeeds,
    the data is handed over to :func:`~darc.submit.save_submit`.

    Args:
        api: API URL.
        domain (``'new_host'``, ``'requests'`` or ``'selenium'``): Domain of
            the submit data.
        data: Submit data.

    See Also:
        * :data:`darc.submit.API_RETRY`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.submit_new_host`
        * :func:`darc.submit.submit_requests`
        * :func:`darc.submit.submit_selenium`

    """
    with null_session() as session:
        attempts_left = API_RETRY + 1
        while attempts_left > 0:
            attempts_left -= 1
            try:
                if session.post(api, json=data).ok:
                    return
            except requests.RequestException:
                # request-level failures are only logged; the next attempt follows
                logger.pexc(LOG_WARNING, category=APIRequestFailed,
                            line=f'[{domain.upper()}] response = requests.post(api, json=data)')

        # every attempt failed or was rejected: keep the data on disk instead
        save_submit(domain, data)
def submit_new_host(time: 'datetime', link: 'darc_link.Link',
                    partial: bool = False, force: bool = False) -> None:
    """Submit new host.

    When a new host is discovered, the :mod:`darc` crawler will submit the
    host information. Such includes ``robots.txt`` (if exists) and
    ``sitemap.xml`` (if any).

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        partial: If the data is not complete, i.e. failed when fetching
            ``robots.txt``, ``hosts.txt`` and/or sitemaps.
        force: If the data is force re-fetched, i.e. cache expired when
            checking with :func:`darc.db.have_hostname`.

    If :data:`~darc.submit.API_NEW_HOST` is :data:`None`, the data for
    submission will be saved directly through
    :func:`~darc.submit.save_submit`.

    Notes:
        * When :data:`~darc.submit.SAVE_DB` is set, the host and its
          documents are also recorded in the database. This is best effort:
          any exception raised there is logged and the submission proceeds.
        * The ``"URL"`` field carries the original URL (``link.url``), as
          described in the format below and as done by
          :func:`~darc.submit.submit_requests` and
          :func:`~darc.submit.submit_selenium`; the hostname is available
          under ``"[metadata]"``.

    The data submitted should have following format:

    .. code-block::

        {
            // partial flag - true / false
            "$PARTIAL$": ...,
            // force flag - true / false
            "$FORCE$": ...,
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // robots.txt from the host (if not exists, then ``null``)
            "Robots": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/robots.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // sitemaps from the host (if none, then ``null``)
            "Sitemaps": [
                {
                    // path of the file, relative path (to data root path ``PATH_DATA``) in container
                    //   - <proxy>/<scheme>/<host>/sitemap_<name>.xml
                    "path": ...,
                    // content of the file (**base64** encoded)
                    "data": ...,
                },
                ...
            ],
            // hosts.txt from the host (if proxy type is ``i2p``; if not exists, then ``null``)
            "Hosts": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/hosts.txt
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_NEW_HOST`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_robots`
        * :func:`darc.submit.get_sitemaps`
        * :func:`darc.submit.get_hosts`

    """
    metadata = link.asdict()
    ts = time.isoformat()

    robots = get_robots(link)
    sitemaps = get_sitemaps(link)
    hosts = get_hosts(link)

    if SAVE_DB:
        try:
            # fetch the host record, creating it on first discovery
            model, _ = cast('Tuple[HostnameModel, bool]',
                            _db_operation(HostnameModel.get_or_create, hostname=link.host, defaults={
                                'proxy': Proxy[link.proxy.upper()],
                                'discovery': time,
                                'last_seen': time,
                            }))

            # the files were read as base64 payloads; store the decoded text
            if robots is not None:
                _db_operation(RobotsModel.create, host=model, timestamp=time,
                              document=base64.b64decode(robots['data']).decode())

            if sitemaps is not None:
                for sitemap in sitemaps:
                    _db_operation(SitemapModel.create, host=model, timestamp=time,
                                  document=base64.b64decode(sitemap['data']).decode())

            if hosts is not None:
                _db_operation(HostsModel.create, host=model, timestamp=time,
                              document=base64.b64decode(hosts['data']).decode())
        except Exception:
            # database persistence must never block the submission itself
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='submit_new_host(...)')

    data = {
        '$PARTIAL$': partial,
        '$FORCE$': force,
        '[metadata]': metadata,
        'Timestamp': ts,
        # original URL, consistent with the requests / selenium submissions
        'URL': link.url,
        'Robots': robots,
        'Sitemaps': sitemaps,
        'Hosts': hosts,
    }
    logger.plog(LOG_DEBUG, '-*- NEW HOST DATA -*-', object=data)

    if API_NEW_HOST is None:
        save_submit('new_host', data)
        return

    # submit data
    submit(API_NEW_HOST, 'new_host', data)
def submit_requests(time: 'datetime', link: 'darc_link.Link',
                    response: 'Response', session: 'Session',
                    content: bytes, mime_type: str, html: bool = True) -> None:
    """Submit requests data.

    When crawling, we'll first fetch the URL using :mod:`requests`, to check
    its availability and to save its HTTP headers information. Such
    information will be submitted to the web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        response (requests.Response): Response object of submission.
        session (requests.Session): Session object of submission.
        content: Raw content from the response.
        mime_type: Content type.
        html: If current document is HTML or other files.

    If :data:`~darc.submit.API_REQUESTS` is :data:`None`, the data for
    submission will be saved directly through
    :func:`~darc.submit.save_submit`.

    Notes:
        * When :data:`~darc.submit.SAVE_DB` is set, the host, URL, request
          and redirection history are also recorded in the database. This is
          best effort: any exception raised there is logged and the
          submission proceeds.
        * The database record stores the *session* cookies
          (``session.cookies``) in its ``session`` column and the *response*
          cookies (``response.cookies``) in its ``cookies`` column, matching
          the ``"Session"`` / ``"Cookies"`` fields of the payload.
        * NOTE(review): the payload emits ``"Cookies"`` and ``"Session"`` as
          lists of cookie attribute mappings (``vars(cookie)``), while history
          entries use a plain name/value mapping -- confirm against the
          schema below, which shows objects.

    The data submitted should have following format:

    .. code-block::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // request method
            "Method": "GET",
            // response status code
            "Status-Code": ...,
            // response reason
            "Reason": ...,
            // response cookies (if any)
            "Cookies": {
                ...
            },
            // session cookies (if any)
            "Session": {
                ...
            },
            // request headers (if any)
            "Request": {
                ...
            },
            // response headers (if any)
            "Response": {
                ...
            },
            // content type
            "Content-Type": ...,
            // requested file (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>_raw.html
                // or if the document is of generic content type, i.e. not HTML
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.dat
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // redirection history (if any)
            "History": [
                // same record data as the original response
                {"...": "..."}
            ]
        }

    See Also:
        * :data:`darc.submit.API_REQUESTS`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_raw`
        * :func:`darc.crawl.crawler`

    """
    if SAVE_DB:
        try:
            # host record: create on first sight, otherwise refresh ``last_seen``
            model, model_created = cast('Tuple[HostnameModel, bool]',
                                        _db_operation(HostnameModel.get_or_create, hostname=link.host, defaults={
                                            'proxy': Proxy[link.proxy.upper()],
                                            'discovery': time,
                                            'last_seen': time,
                                        }))
            if not model_created:
                model.last_seen = time
                _db_operation(model.save)

            # URL record: a new one starts as not alive since the UNIX epoch,
            # so that the first successful response flips it below
            url, url_created = cast('Tuple[URLModel, bool]',
                                    _db_operation(URLModel.get_or_create, hash=link.name, defaults={
                                        'url': link.url,
                                        'hostname': model,
                                        'proxy': Proxy[link.proxy.upper()],
                                        'discovery': time,
                                        'last_seen': time,
                                        'alive': False,
                                        'since': EPOCH,
                                    }))

            # only touch ``since`` when the alive state actually changes
            if not url.alive and response.ok:
                url.alive = True
                url.since = time
            elif url.alive and not response.ok:
                url.alive = False
                url.since = time

            if not url_created:
                url.last_seen = time
            _db_operation(url.save)

            if link.url_backref is not None:
                # an ``IntegrityError`` on this relation is deliberately ignored
                with contextlib.suppress(peewee.IntegrityError):
                    _db_operation(URLThroughModel.create,
                                  parent=_db_operation(URLModel.get_by_url, link.url_backref.url),
                                  child=url)

            request_record = cast('RequestsModel',
                                  _db_operation(RequestsModel.create,
                                                url=url,
                                                timestamp=time,
                                                method=response.request.method,
                                                document=content,
                                                mime_type=mime_type,
                                                is_html=html,
                                                status_code=response.status_code,
                                                reason=response.reason,
                                                cookies=response.cookies.get_dict(),
                                                # session cookies, not a copy of the response cookies
                                                session=session.cookies.get_dict(),
                                                request=dict(response.request.headers),
                                                response=dict(response.headers)))

            for index, history in enumerate(response.history):
                _db_operation(RequestsHistoryModel.create,
                              index=index,
                              model=request_record,
                              url=history.url,
                              timestamp=time,
                              method=history.request.method,
                              document=history.content,
                              status_code=history.status_code,
                              reason=history.reason,
                              cookies=history.cookies.get_dict(),
                              request=dict(history.request.headers),
                              response=dict(history.headers))
        except Exception:
            # database persistence must never block the submission itself
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='submit_requests(...)')

    metadata = link.asdict()
    ts = time.isoformat()

    if html:
        path = f'{link.base}/{link.name}_{ts}_raw.html'
    else:
        path = f'{link.base}/{link.name}_{ts}.dat'

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Method': response.request.method,
        'Status-Code': response.status_code,
        'Reason': response.reason,
        'Cookies': [vars(cookie) for cookie in response.cookies],
        'Session': [vars(cookie) for cookie in session.cookies],
        'Request': dict(response.request.headers),
        'Response': dict(response.headers),
        'Content-Type': mime_type,
        'Document': {
            'path': os.path.relpath(path, PATH_DB),
            'data': base64.b64encode(content).decode(),
        },
        'History': [{
            'URL': history.url,
            'Method': history.request.method,
            'Status-Code': history.status_code,
            'Reason': history.reason,
            'Cookies': history.cookies.get_dict(),
            'Request': dict(history.request.headers),
            'Response': dict(history.headers),
            'Document': base64.b64encode(history.content).decode(),
        } for history in response.history],
    }
    logger.plog(LOG_DEBUG, '-*- REQUESTS DATA -*-', object=data)

    if API_REQUESTS is None:
        save_submit('requests', data)
        return

    # submit data
    submit(API_REQUESTS, 'requests', data)
def submit_selenium(time: 'datetime', link: 'darc_link.Link',
                    html: str, screenshot: 'Optional[str]') -> None:
    """Submit selenium data.

    After crawling with :mod:`requests`, we'll then render the URL using
    :mod:`selenium` with Google Chrome and its web driver, to provide a fully
    rendered web page. Such information will be submitted to the web UI.

    Args:
        time (datetime.datetime): Timestamp of submission.
        link: Link object of submission.
        html: HTML source of the web page.
        screenshot: *base64* encoded screenshot.

    If :data:`~darc.submit.API_SELENIUM` is :data:`None`, the data for
    submission will be saved directly through
    :func:`~darc.submit.save_submit`.

    Note:
        This information is optional, only provided if the content type from
        :mod:`requests` is HTML, status code not between ``400`` and ``600``,
        and HTML data not empty.

    Note:
        When :data:`~darc.submit.SAVE_DB` is set, the host, URL and rendered
        page are also recorded in the database. This is best effort: any
        exception raised there is logged and the submission proceeds.

    The data submitted should have following format:

    .. code-block::

        {
            // metadata of URL
            "[metadata]": {
                // original URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "url": ...,
                // proxy type - null / tor / i2p / zeronet / freenet
                "proxy": ...,
                // hostname / netloc, c.f. ``urllib.parse.urlparse``
                "host": ...,
                // base folder, relative path (to data root path ``PATH_DATA``) in container - <proxy>/<scheme>/<host>
                "base": ...,
                // sha256 of URL as name for saved files (timestamp is in ISO format)
                //   JSON log as this one - <base>/<name>_<timestamp>.json
                //   HTML from requests - <base>/<name>_<timestamp>_raw.html
                //   HTML from selenium - <base>/<name>_<timestamp>.html
                //   generic data files - <base>/<name>_<timestamp>.dat
                "name": ...,
                // originate URL - <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
                "backref": ...
            },
            // requested timestamp in ISO format as in name of saved file
            "Timestamp": ...,
            // original URL
            "URL": ...,
            // rendered HTML document (if not exists, then ``null``)
            "Document": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.html
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            },
            // web page screenshot (if not exists, then ``null``)
            "Screenshot": {
                // path of the file, relative path (to data root path ``PATH_DATA``) in container
                //   - <proxy>/<scheme>/<host>/<name>_<timestamp>.png
                "path": ...,
                // content of the file (**base64** encoded)
                "data": ...,
            }
        }

    See Also:
        * :data:`darc.submit.API_SELENIUM`
        * :func:`darc.submit.submit`
        * :func:`darc.submit.save_submit`
        * :func:`darc.submit.get_html`
        * :func:`darc.submit.get_screenshot`
        * :func:`darc.crawl.loader`

    """
    if SAVE_DB:
        try:
            # host record: create on first sight, otherwise refresh ``last_seen``
            model, model_created = cast('Tuple[HostnameModel, bool]',
                                        _db_operation(HostnameModel.get_or_create, hostname=link.host, defaults={
                                            'proxy': Proxy[link.proxy.upper()],
                                            'discovery': time,
                                            'last_seen': time,
                                        }))
            if not model_created:
                model.last_seen = time
                _db_operation(model.save)

            # URL record: a new one is created as alive since ``time``
            url, url_created = cast('Tuple[URLModel, bool]',
                                    _db_operation(URLModel.get_or_create, hash=link.name, defaults={
                                        'url': link.url,
                                        'hostname': model,
                                        'proxy': Proxy[link.proxy.upper()],
                                        'discovery': time,
                                        'last_seen': time,
                                        'alive': True,
                                        'since': time,
                                    }))

            # an existing record that was marked as not alive is flipped to
            # alive, and ``since`` is reset to this submission
            if not url.alive:
                url.alive = True
                url.since = time
            if not url_created:
                url.last_seen = time
            _db_operation(url.save)

            if link.url_backref is not None:
                # an ``IntegrityError`` on this relation is deliberately ignored
                # (presumably raised when the relation already exists -- confirm)
                with contextlib.suppress(peewee.IntegrityError):
                    _db_operation(URLThroughModel.create,
                                  parent=_db_operation(URLModel.get_by_url, link.url_backref.url),
                                  child=url)

            # the screenshot arrives *base64* encoded and is stored decoded;
            # NB: a falsy value (``None`` or empty string) is stored as ``None``
            _db_operation(SeleniumModel.create,
                          url=url,
                          timestamp=time,
                          document=html,
                          screenshot=base64.b64decode(screenshot) if screenshot else None)
        except Exception:
            # database persistence must never block the submission itself
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='submit_selenium(...)')

    metadata = link.asdict()
    ts = time.isoformat()

    # NB: only ``None`` yields a ``null`` screenshot in the payload; the
    # screenshot string is passed through as is (already *base64* encoded)
    if screenshot is None:
        ss = None  # type: Optional[File]
    else:
        ss = {
            'path': os.path.relpath(f'{link.base}/{link.name}_{ts}.png', PATH_DB),
            'data': screenshot,
        }

    data = {
        '[metadata]': metadata,
        'Timestamp': ts,
        'URL': link.url,
        'Document': {
            'path': os.path.relpath(f'{link.base}/{link.name}_{ts}.html', PATH_DB),
            'data': base64.b64encode(html.encode()).decode(),
        },
        'Screenshot': ss,
    }
    logger.plog(LOG_DEBUG, '-*- SELENIUM DATA -*-', object=data)

    # without an API endpoint, keep the data on disk instead
    if API_SELENIUM is None:
        save_submit('selenium', data)
        return

    # submit data
    submit(API_SELENIUM, 'selenium', data)