Source code for darc.save

# -*- coding: utf-8 -*-
"""Source Saving
===================

The :mod:`darc.save` module contains the core utilities
for managing fetched files and documents.

The data storage under the root path (:data:`~darc.const.PATH_DB`)
is typically as follows::

    data
    ├── _queue_requests.txt
    ├── _queue_requests.txt.tmp
    ├── _queue_selenium.txt
    ├── _queue_selenium.txt.tmp
    ├── api
    │   └── <proxy>
    │       └── <scheme>
    │           └── <hostname>
    │               ├── new_host
    │               │   └── <hash>_<timestamp>.json
    │               ├── requests
    │               │   └── <hash>_<timestamp>.json
    │               └── selenium
    │                   └── <hash>_<timestamp>.json
    ├── link.csv
    ├── misc
    │   ├── bitcoin.txt
    │   ├── data
    │   │   └── <hash>_<timestamp>.<ext>
    │   ├── ed2k.txt
    │   ├── invalid.txt
    │   ├── irc.txt
    │   ├── magnet.txt
    │   └── mail.txt
    └── <proxy>
        └── <scheme>
            └── <hostname>
                ├── <hash>_<timestamp>.dat
                ├── <hash>_<timestamp>.json
                ├── <hash>_<timestamp>_raw.html
                ├── <hash>_<timestamp>.html
                ├── <hash>_<timestamp>.png
                ├── robots.txt
                └── sitemap_<hash>.xml

"""

import dataclasses
import glob
import json
import multiprocessing
import os
import pathlib
import posixpath

import darc.typing as typing
from darc._compat import datetime
from darc.const import PATH_DB, PATH_LN, TIME_CACHE
from darc.link import Link, quote

# lock for file I/O, created once when this module is imported
# NOTE(review): no function visible in this part of the module acquires it --
# presumably it is used elsewhere; confirm before removing.
#_SAVE_LOCK = MANAGER.Lock()  # pylint: disable=no-member
_SAVE_LOCK = multiprocessing.Lock()


def has_folder(link: Link) -> typing.Optional[str]:
    """Check if the host folder of ``link`` already holds saved headers.

    Args:
        link: Link object whose host folder
            (:attr:`link.base <darc.link.Link.base>`) is inspected.

    Returns:
        * If at least one ``*.json`` file exists directly under
          :attr:`link.base <darc.link.Link.base>`, return
          :attr:`link.base <darc.link.Link.base>`.
        * If not, return ``None``.

    """
    # <proxy>/<scheme>/<host>/<hash>.json
    pattern = os.path.join(link.base, '*.json')
    return link.base if glob.glob(pattern) else None
def has_robots(link: Link) -> typing.Optional[str]:
    """Look up a previously saved ``robots.txt`` for the host of ``link``.

    Args:
        link: Link object whose host folder is searched for ``robots.txt``.

    Returns:
        * If ``robots.txt`` exists as a regular file, return its path, i.e.
          ``<root>/<proxy>/<scheme>/<hostname>/robots.txt``.
        * If not, return ``None``.

    """
    # <proxy>/<scheme>/<host>/robots.txt
    candidate = os.path.join(link.base, 'robots.txt')
    if os.path.isfile(candidate):
        return candidate
    return None
def has_sitemap(link: Link) -> typing.Optional[str]:
    """Look up a previously saved sitemap for ``link``.

    Args:
        link: Link object of the sitemap; its ``name`` is used as the
            ``<hash>`` part of the file name.

    Returns:
        * If the sitemap exists as a regular file, return its path, i.e.
          ``<root>/<proxy>/<scheme>/<hostname>/sitemap_<hash>.xml``.
        * If not, return ``None``.

    """
    # <proxy>/<scheme>/<host>/sitemap_<hash>.xml
    candidate = os.path.join(link.base, f'sitemap_{link.name}.xml')
    if os.path.isfile(candidate):
        return candidate
    return None
def has_raw(time: typing.Datetime, link: Link) -> typing.Optional[str]:  # pylint: disable=redefined-outer-name
    """Check if we need to re-crawl the link by |requests|_.

    Args:
        time (datetime): Reference timestamp; the age of each saved
            document is measured against it.
        link: Link object to check if we need to re-crawl the link by |requests|_.

    Returns:
        * If no need, return the path (always a :obj:`str`) to the document, i.e.
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>_raw.html``,
          or ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.dat``.
        * If needed, return ``None``.

    Notes:
        * Any saved ``.dat`` file is returned regardless of its age.
        * When :data:`~darc.const.TIME_CACHE` is ``None``, the newest
          saved raw HTML document is returned regardless of its age.

    See Also:
        * :data:`darc.const.TIME_CACHE`

    """
    path = os.path.join(link.base, link.name)

    # generic data files take precedence; ``glob`` gives no ordering
    # guarantee, so pick the newest one explicitly (ISO timestamps in the
    # file name sort chronologically)
    if data_list := glob.glob(f'{path}_*.dat'):
        return max(data_list)

    # newest first; plain strings are kept so that the returned value
    # matches the annotated ``str`` return type
    glob_list = sorted(glob.glob(f'{path}_*_raw.html'), reverse=True)
    if not glob_list:
        return None

    # no expiry configured: newest document is always good enough
    if TIME_CACHE is None:
        return glob_list[0]

    for item in glob_list:
        # stem is ``<hash>_<timestamp>_raw``
        item_date = pathlib.Path(item).stem.split('_')[1]
        date = datetime.fromisoformat(item_date)
        if time - date <= TIME_CACHE:
            return item
    return None
def has_html(time: typing.Datetime, link: Link) -> typing.Optional[str]:  # pylint: disable=redefined-outer-name
    """Check if we need to re-crawl the link by |selenium|_.

    Args:
        time (datetime): Reference timestamp; the age of each saved
            document is measured against it.
        link: Link object to check if we need to re-crawl the link by |selenium|_.

    Returns:
        * If no need, return the path (always a :obj:`str`) to the document, i.e.
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.html``.
        * If needed, return ``None``.

    Notes:
        When :data:`~darc.const.TIME_CACHE` is ``None``, the newest saved
        document is returned regardless of its age.

    See Also:
        * :data:`darc.const.TIME_CACHE`

    """
    path = os.path.join(link.base, link.name)

    # ``*_raw.html`` documents also match the pattern below, so they are
    # filtered out; newest first, kept as plain strings so that the
    # returned value matches the annotated ``str`` return type
    glob_list = sorted(
        (item for item in glob.glob(f'{path}_*.html')
         if not pathlib.Path(item).stem.endswith('_raw')),
        reverse=True,
    )
    if not glob_list:
        return None

    # no expiry configured: newest document is always good enough
    if TIME_CACHE is None:
        return glob_list[0]

    for item in glob_list:
        # stem is ``<hash>_<timestamp>``
        item_date = pathlib.Path(item).stem.split('_')[1]
        date = datetime.fromisoformat(item_date)
        if time - date <= TIME_CACHE:
            return item
    return None
def sanitise(link: Link, time: typing.Optional[typing.Datetime] = None,  # pylint: disable=redefined-outer-name
             raw: bool = False, data: bool = False,
             headers: bool = False, screenshot: bool = False) -> str:
    """Build the storage path for a document of ``link``.

    The host folder (:attr:`link.base <darc.link.Link.base>`) is created
    if it does not exist yet. When several flags are set, the first one in
    the order ``raw``, ``headers``, ``data``, ``screenshot`` wins.

    Args:
        link: Link object to sanitise the path
        time (datetime): Timestamp for the path; defaults to the current time.
        raw: If this is a raw HTML document from |requests|_.
        data: If this is a generic content type document.
        headers: If this is response headers from |requests|_.
        screenshot: If this is the screenshot from |selenium|_.

    Returns:
        * If ``raw`` is ``True``,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>_raw.html``.
        * If ``headers`` is ``True``,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.json``.
        * If ``data`` is ``True``,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.dat``.
        * If ``screenshot`` is ``True``,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.png``.
        * If none above,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.html``.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.crawl.loader`

    """
    os.makedirs(link.base, exist_ok=True)

    moment = datetime.now() if time is None else time
    stem = f'{os.path.join(link.base, link.name)}_{moment.isoformat()}'

    # (flag, suffix) pairs, listed in order of precedence
    suffix_table = (
        (raw, '_raw.html'),
        (headers, '.json'),
        (data, '.dat'),
        (screenshot, '.png'),
    )
    for enabled, suffix in suffix_table:
        if enabled:
            return f'{stem}{suffix}'
    return f'{stem}.html'
def save_robots(link: Link, text: str) -> str:
    """Write ``robots.txt`` of a host to disk.

    The file starts with a ``# <url>`` comment line naming the source URL,
    followed by ``text`` as is. The host folder is created when missing.

    Args:
        link: Link object of ``robots.txt``.
        text: Content of ``robots.txt``.

    Returns:
        Saved path to ``robots.txt``, i.e.
        ``<root>/<proxy>/<scheme>/<hostname>/robots.txt``.

    See Also:
        * :func:`darc.save.sanitise`

    """
    # <proxy>/<scheme>/<host>/robots.txt
    target = os.path.join(link.base, 'robots.txt')
    os.makedirs(os.path.dirname(target), exist_ok=True)

    with open(target, 'w') as stream:
        stream.write(f'# {link.url}\n')
        stream.write(text)
    return target
def save_sitemap(link: Link, text: str) -> str:
    """Write a sitemap to disk and record its link.

    The file starts with a ``<!-- <url> -->`` comment line naming the source
    URL, followed by ``text`` as is. The host folder is created when
    missing. Once the file is written, ``link`` is passed to ``save_link``.

    Args:
        link: Link object of sitemap.
        text: Content of sitemap.

    Returns:
        Saved path to sitemap, i.e.
        ``<root>/<proxy>/<scheme>/<hostname>/sitemap_<hash>.xml``.

    See Also:
        * :func:`darc.save.sanitise`

    """
    # <proxy>/<scheme>/<host>/sitemap_<hash>.xml
    target = os.path.join(link.base, f'sitemap_{link.name}.xml')
    os.makedirs(os.path.dirname(target), exist_ok=True)

    with open(target, 'w') as stream:
        stream.write(f'<!-- {link.url} -->\n')
        stream.write(text)

    save_link(link)
    return target
def save_headers(time: typing.Datetime, link: Link, response: typing.Response, session: typing.Session) -> str:  # pylint: disable=redefined-outer-name
    """Dump the HTTP exchange of ``response`` as a JSON document.

    Once the file is written, ``link`` is passed to ``save_link``.

    Args:
        time (datetime): Timestamp of response.
        link: Link object of response.
        response (|Response|_): Response object to be saved.
        session (|Session|_): Session object of response.

    Returns:
        Saved path to response headers, i.e.
        ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.json``.

    The JSON data saved is as following:

    .. code:: json

        {
            "[metadata]": {
                "url": "...",
                "proxy": "...",
                "host": "...",
                "base": "...",
                "name": "..."
            },
            "Timestamp": "...",
            "URL": "...",
            "Method": "GET",
            "Status-Code": "...",
            "Reason": "...",
            "Cookies": {
                "...": "..."
            },
            "Session": {
                "...": "..."
            },
            "Request": {
                "...": "..."
            },
            "Response": {
                "...": "..."
            },
            "History": [
                {"...": "..."}
            ]
        }

    See Also:
        * :func:`darc.save.sanitise`
        * :func:`darc.crawl.crawler`

    """
    # link fields, with ``base`` made relative to the storage root and the
    # ``url_parse`` field dropped
    link_info = dataclasses.asdict(link)
    link_info['base'] = os.path.relpath(link.base, PATH_DB)
    del link_info['url_parse']

    request = response.request
    record = {
        '[metadata]': link_info,
        'Timestamp': time.isoformat(),
        'URL': response.url,
        'Method': request.method,
        'Status-Code': response.status_code,
        'Reason': response.reason,
        'Cookies': response.cookies.get_dict(),
        'Session': session.cookies.get_dict(),
        'Request': dict(request.headers),
        'Response': dict(response.headers),
    }

    # one entry per response in ``response.history``, in the same order
    hops = []
    for hop in response.history:
        hops.append({
            'URL': hop.url,
            'Method': hop.request.method,
            'Status-Code': hop.status_code,
            'Reason': hop.reason,
            'Cookies': hop.cookies.get_dict(),
            'Request': dict(hop.request.headers),
            'Response': dict(hop.headers),
        })
    record['History'] = hops

    target = sanitise(link, time, headers=True)
    with open(target, 'w') as stream:
        json.dump(record, stream, indent=2)

    save_link(link)
    return target
def save_html(time: typing.Datetime, link: Link, html: typing.Union[str, bytes], raw: bool = False) -> str:  # pylint: disable=redefined-outer-name
    """Save response.

    The document is prefixed with a ``<!-- <url> -->`` comment line naming
    the original URL. Both :obj:`str` and :obj:`bytes` content are accepted
    for either value of ``raw``:

    * :obj:`bytes` content is always written verbatim in binary mode;
    * :obj:`str` content is written in text mode, except when ``raw`` is
      ``True``, in which case it is encoded as UTF-8 and written in
      binary mode.

    Args:
        time (datetime): Timestamp of HTML document.
        link: Link object of original URL.
        html: Content of HTML document.
        raw: If is fetched from |requests|_.

    Returns:
        Saved path to HTML document.

        * If ``raw`` is ``True``,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>_raw.html``.
        * If not,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.html``.

    See Also:
        * :func:`darc.save.sanitise`
        * :func:`darc.crawl.crawler`
        * :func:`darc.crawl.loader`

    """
    # comment line
    comment = f'<!-- {link.url} -->'
    path = sanitise(link, time, raw=raw)

    # raw documents are stored as bytes; a ``str`` payload is encoded the
    # same way as the comment line so that the write cannot fail
    if raw and isinstance(html, str):
        html = html.encode()

    if isinstance(html, bytes):
        # binary mode: bytes go to disk untouched, no decoding is guessed
        with open(path, 'wb') as file:
            file.write(comment.encode())
            file.write(os.linesep.encode())
            file.write(html)
    else:
        with open(path, 'w') as file:
            print(comment, file=file)
            file.write(html)
    return path
def save_file(time: typing.Datetime, link: Link, content: bytes) -> str:
    """Save file.

    The content is always written to the standard path first. The function
    then *tries* to create a symbolic link at the location mirroring the
    URL path, pointing (relatively) to the saved file. The symbolic link is
    skipped, and the saved path still returned, when:

    * the URL path has no file name (empty, or ending with ``/``, ``.``
      or ``..``);
    * the URL path would resolve outside the host folder
      (:attr:`link.base <darc.link.Link.base>`);
    * a component of the URL path already exists as a non-directory;
    * something other than a symbolic link already exists at the location.

    A symbolic link left by an earlier save of the same URL is replaced,
    so that it points to the file just saved.

    Args:
        time (datetime): Timestamp of generic file.
        link: Link object of original URL.
        content: Content of generic file.

    Returns:
        Saved path to generic content type file,
        ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.dat``.

    See Also:
        * :func:`darc.save.sanitise`
        * :func:`darc.crawl.crawler`

    """
    # real path
    dest = sanitise(link, time, data=True)
    with open(dest, 'wb') as file:
        file.write(content)

    # remove *all* leading slashes: with ``//etc/x`` a single strip leaves
    # an absolute path, and ``os.path.join`` would then discard ``link.base``
    temp_path = link.url_parse.path.lstrip('/')

    # <proxy>/<scheme>/<host>/"..."
    root, name = posixpath.split(temp_path)
    if name in ('', '.', '..'):
        # directory-like URL, there is no file name to link
        return dest

    # the URL comes from crawled content; never follow ``..`` segments
    # out of the host folder
    base = os.path.abspath(link.base)
    path = os.path.abspath(os.path.join(base, root))
    if os.path.commonpath([base, path]) != base:
        return dest

    try:
        os.makedirs(path, exist_ok=True)
    except (FileExistsError, NotADirectoryError):
        # a component of the URL path was previously saved as a file
        return dest

    src = os.path.relpath(dest, path)
    dst = os.path.join(path, name)
    if os.path.islink(dst):
        # left by an earlier save of the same URL; repoint to the new file
        os.remove(dst)
    elif os.path.lexists(dst):
        # a real file or folder lives here, leave it untouched
        return dest
    os.symlink(src, dst, target_is_directory=False)
    return dest