# Source code for darc.db

# -*- coding: utf-8 -*-
"""Link Database
===================

The :mod:`darc` project utilises a file-system-based database
to provide inter-process communication.

.. note::

   In its first implementation, the :mod:`darc` project used
   |Queue|_ to support such communication. However, as observed
   at runtime, the |Queue| object is heavily affected by
   the lack of memory.

   .. |Queue| replace:: ``multiprocessing.Queue``
   .. _Queue: https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Queue

There will be two databases, both located at the root of the
data storage path :data:`~darc.const.PATH_DB`:

* the |requests|_ database -- ``queue_requests.txt``
* the |selenium|_ database -- ``queue_selenium.txt``

At runtime, after reading such a database, :mod:`darc`
will keep a backup of the database with a ``.tmp`` suffix
appended to its file name.

"""

import math
import multiprocessing
import os
import pprint
import random
import shutil
import threading

import stem.util.term

import darc.typing as typing
from darc._compat import nullcontext
from darc.const import CHECK, FLAG_MP, FLAG_TH, PATH_QR, PATH_QS, VERBOSE
from darc.error import render_error
from darc.link import quote, unquote
from darc.parse import _check, match_proxy
from darc.proxy.freenet import _FREENET_BS_FLAG, freenet_bootstrap, has_freenet
from darc.proxy.i2p import _I2P_BS_FLAG, has_i2p, i2p_bootstrap
from darc.proxy.tor import _TOR_BS_FLAG, has_tor, tor_bootstrap
from darc.proxy.zeronet import _ZERONET_BS_FLAG, has_zeronet, zeronet_bootstrap

# Upper bound on the number of links a single load call takes from a
# database file, read from the ``DARC_MAX_POOL`` environment variable
# (default ``1_000``). A finite setting is rounded down to an integer;
# a non-finite one (e.g. ``inf``) is kept as the float it parsed to,
# since ``math.floor`` cannot handle it.
MAX_POOL = float(os.getenv('DARC_MAX_POOL', '1_000'))
MAX_POOL = MAX_POOL if not math.isfinite(MAX_POOL) else math.floor(MAX_POOL)

# Locks guarding database file I/O: ``QR_LOCK`` is held by ``save_requests``
# while appending to the requests database, ``QS_LOCK`` by ``save_selenium``
# while appending to the selenium database.
QR_LOCK = multiprocessing.Lock()

# The selenium lock follows the concurrency flags: a multiprocessing lock
# when ``FLAG_MP`` is set, otherwise a threading lock when ``FLAG_TH`` is
# set, otherwise a no-op context manager.
QS_LOCK = (
    multiprocessing.Lock() if FLAG_MP
    else threading.Lock() if FLAG_TH
    else nullcontext()
)


def save_requests(entries: typing.Iterable[str], single: bool = False):
    """Append link(s) to the |requests|_ database.

    Each link is passed through :func:`~darc.link.quote` and written on
    its own line at the end of the database file, while holding
    ``QR_LOCK``.

    Args:
        entries: Links to be added to the |requests|_ database.
            Either an *iterable* of links, or one single link
            string (when ``single`` is ``True``).
        single: Whether ``entries`` is one single link string
            rather than an *iterable* of links.

    """
    # normalise both call styles into "something to iterate over"
    pending = (entries,) if single else entries

    with QR_LOCK, open(PATH_QR, 'a') as database:
        for item in pending:
            print(quote(item), file=database)


def save_selenium(entries: typing.Iterable[str], single: bool = False):
    """Append link(s) to the |selenium|_ database.

    Each link is passed through :func:`~darc.link.quote` and written on
    its own line at the end of the database file, while holding
    ``QS_LOCK``.

    Args:
        entries: Links to be added to the |selenium|_ database.
            Either an *iterable* of links, or one single link
            string (when ``single`` is ``True``).
        single: Whether ``entries`` is one single link string
            rather than an *iterable* of links.

    """
    # normalise both call styles into "something to iterate over"
    pending = (entries,) if single else entries

    with QS_LOCK, open(PATH_QS, 'a') as database:
        for item in pending:
            print(quote(item), file=database)


def load_requests(check: bool = CHECK) -> typing.List[str]:
    """Load links from the |requests|_ database.

    After loading, :mod:`darc` will back up the original database
    ``queue_requests.txt`` as ``queue_requests.txt.tmp`` and empty
    the loaded database. When the database holds more links than
    :data:`~darc.db.MAX_POOL`, the lines that were not consumed are
    written back as the new database, so that a later call can pick
    them up.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the |requests|_ database. The links
        are deduplicated and sorted before the optional check; an empty
        list is returned when the database file does not exist.

    Note:
        Lines starting with ``#`` will be considered as comments.
        Empty lines and comment lines will be ignored when loading.

        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    """
    link_pool = list()
    # NOTE(review): unlike ``save_requests`` this does not take ``QR_LOCK``;
    # confirm callers never load while other workers may still be saving.
    if os.path.isfile(PATH_QR):
        overflow = False
        link_list = list()
        with open(PATH_QR) as file:
            # blank lines are dropped here; every remaining line is already stripped
            lines = filter(None, map(str.strip, file))
            for line in lines:
                if line.startswith('#'):
                    continue
                link_list.append(unquote(line))

                if len(link_list) >= MAX_POOL:
                    overflow = True
                    break

            if overflow:
                # ``lines`` resumes right after the last consumed line; keep
                # the remainder (comments included) for the next round
                with open(f'{PATH_QR}.save', 'w') as temp_file:
                    for line in lines:
                        print(line, file=temp_file)

        # no shuffling: the pool is sorted right here, so any prior
        # reordering of ``link_list`` would have no effect on the result
        link_pool = sorted(set(link_list))
        if check:
            link_pool = _check(link_pool)

        # NOTE(review): the ``_*_BS_FLAG`` names were imported by value at
        # module import time; if the bootstrap functions rebind the flags in
        # their own modules, the copies tested here never change -- confirm.
        if not _TOR_BS_FLAG and has_tor(link_pool) and not match_proxy('tor'):
            tor_bootstrap()
        if not _I2P_BS_FLAG and has_i2p(link_pool) and not match_proxy('i2p'):
            i2p_bootstrap()
        if not _ZERONET_BS_FLAG and has_zeronet(link_pool) and not match_proxy('zeronet'):
            zeronet_bootstrap()
        if not _FREENET_BS_FLAG and has_freenet(link_pool) and not match_proxy('freenet'):
            freenet_bootstrap()

        # ``os.replace`` overwrites an existing target on every platform,
        # whereas ``os.rename`` fails on Windows once a previous round has
        # left a ``.tmp`` backup behind
        os.replace(PATH_QR, f'{PATH_QR}.tmp')
        if overflow:
            os.replace(f'{PATH_QR}.save', PATH_QR)

    if VERBOSE:
        print(stem.util.term.format('-*- [REQUESTS] LINK POOL -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(sorted(link_pool)),
                           stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return link_pool


def load_selenium(check: bool = CHECK) -> typing.List[str]:
    """Load links from the |selenium|_ database.

    After loading, :mod:`darc` will back up the original database
    ``queue_selenium.txt`` as ``queue_selenium.txt.tmp`` and empty
    the loaded database. When the database holds more links than
    :data:`~darc.db.MAX_POOL`, the lines that were not consumed are
    written back as the new database, so that a later call can pick
    them up.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the |selenium|_ database. The links
        are deduplicated and sorted before the optional check; an empty
        list is returned when the database file does not exist.

    Note:
        Lines starting with ``#`` will be considered as comments.
        Empty lines and comment lines will be ignored when loading.

        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    """
    link_pool = list()
    # NOTE(review): unlike ``save_selenium`` this does not take ``QS_LOCK``;
    # confirm callers never load while other workers may still be saving.
    if os.path.isfile(PATH_QS):
        overflow = False
        link_list = list()
        with open(PATH_QS) as file:
            # blank lines are dropped here; every remaining line is already stripped
            lines = filter(None, map(str.strip, file))
            for line in lines:
                if line.startswith('#'):
                    continue
                link_list.append(unquote(line))

                if len(link_list) >= MAX_POOL:
                    overflow = True
                    break

            if overflow:
                # ``lines`` resumes right after the last consumed line; keep
                # the remainder (comments included) for the next round
                with open(f'{PATH_QS}.save', 'w') as temp_file:
                    for line in lines:
                        print(line, file=temp_file)

        # no shuffling: the pool is sorted right here, so any prior
        # reordering of ``link_list`` would have no effect on the result
        link_pool = sorted(set(link_list))
        if check:
            link_pool = _check(link_pool)

        # ``os.replace`` overwrites an existing target on every platform,
        # whereas ``os.rename`` fails on Windows once a previous round has
        # left a ``.tmp`` backup behind
        os.replace(PATH_QS, f'{PATH_QS}.tmp')
        if overflow:
            os.replace(f'{PATH_QS}.save', PATH_QS)

    if VERBOSE:
        print(stem.util.term.format('-*- [SELENIUM] LINK POOL -*-',
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(render_error(pprint.pformat(sorted(link_pool)),
                           stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
        print(stem.util.term.format('-' * shutil.get_terminal_size().columns,
                                    stem.util.term.Color.MAGENTA))  # pylint: disable=no-member
    return link_pool