# Source code for darc.model.web.hostname

# -*- coding: utf-8 -*-
"""Hostname Records
----------------------

The :mod:`darc.model.web.hostname` module defines the data model
representing hostnames, specifically from ``new_host`` submission.

.. seealso::

   Please refer to :func:`darc.submit.submit_new_host` for more
   information.

"""

from typing import TYPE_CHECKING

from peewee import CharField, DateTimeField

from darc._compat import cached_property
from darc._typing import SPHINX_BUILD
from darc.model.abc import BaseModelWeb as BaseModel
from darc.model.utils import IntEnumField, Proxy

if TYPE_CHECKING:
    # Everything in this branch is only evaluated when ``TYPE_CHECKING`` is
    # true; the names are used solely inside string annotations below.
    from typing import Callable, List, TypeVar

    from darc._compat import datetime

    if SPHINX_BUILD:
        # Under a Sphinx build the sibling model modules are not imported;
        # ``TypeVar`` placeholders bound to their dotted paths stand in for
        # the real classes so that the annotations still resolve.
        HostsModel = TypeVar('HostsModel', bound='darc.model.web.hosts.HostsModel')  # type: ignore[name-defined,unreachable,misc] # pylint: disable=line-too-long
        RobotsModel = TypeVar('RobotsModel', bound='darc.model.web.robots.RobotsModel')  # type: ignore[name-defined,unreachable,misc] # pylint: disable=line-too-long
        SitemapModel = TypeVar('SitemapModel', bound='darc.model.web.sitemap.SitemapModel')  # type: ignore[name-defined,unreachable,misc] # pylint: disable=line-too-long
        URLModel = TypeVar('URLModel', bound='darc.model.web.url.URLModel')  # type: ignore[name-defined,unreachable,misc] # pylint: disable=line-too-long
    else:
        # Regular static analysis: import the real model classes.
        from darc.model.web.hosts import HostsModel  # pylint: disable=unused-import
        from darc.model.web.robots import RobotsModel  # pylint: disable=unused-import
        from darc.model.web.sitemap import SitemapModel  # pylint: disable=unused-import
        from darc.model.web.url import URLModel  # pylint: disable=unused-import

# Public API of this module: only the hostname data model is exported.
__all__ = ['HostnameModel']


class HostnameModel(BaseModel):
    """Data model for a hostname record.

    Important:
        The *alive* of a hostname is toggled if :func:`~darc.crawl.crawler`
        successfully requested a URL with such hostname.

    """

    #: ``hosts.txt`` for the hostname, back reference from
    #: :attr:`HostsModel.host <darc.model.web.hosts.HostsModel.host>`.
    hosts: 'List[HostsModel]'
    #: ``robots.txt`` for the hostname, back reference from
    #: :attr:`RobotsModel.host <darc.model.web.robots.RobotsModel.host>`.
    robots: 'List[RobotsModel]'
    #: ``sitemap.xml`` for the hostname, back reference from
    #: :attr:`SitemapModel.sitemaps <darc.model.web.sitemap.SitemapModel.sitemaps>`.
    sitemaps: 'List[SitemapModel]'
    #: URLs with the same hostname, back reference from
    #: :attr:`URLModel.hostname <darc.model.web.url.URLModel.hostname>`.
    urls: 'List[URLModel]'

    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`). The maximum length of
    #: the host name and of the fully qualified domain name (FQDN) is 63 bytes per
    #: label and 255 characters per FQDN.
    hostname: str = CharField(max_length=255, unique=True)  # a valid FQDN is at most 255 characters
    #: Proxy type (c.f. :attr:`link.proxy <darc.link.Link.proxy>`).
    proxy: Proxy = IntEnumField(choices=Proxy)

    #: Timestamp of first ``new_host`` submission.
    discovery: 'datetime' = DateTimeField()
    #: Timestamp of last related submission.
    last_seen: 'datetime' = DateTimeField()

    @cached_property
    def alive(self) -> bool:
        """If the hostname is still active.

        We consider the hostname as *inactive*, only if all
        subsidiary URLs are *inactive*.

        Returns:
            :data:`True` if at least one record in :attr:`urls` reports
            itself as alive, :data:`False` otherwise (including when the
            hostname has no URL records at all).

        """
        return any(url.alive for url in self.urls)

    @cached_property
    def since(self) -> 'datetime':
        """The hostname is active/inactive since such timestamp.

        We consider the timestamp to be the earliest timestamp of the
        related subsidiary *active/inactive* URLs, i.e. the smallest
        ``since`` among the URLs whose *alive* state matches the one of
        the hostname.

        Returns:
            The earliest matching ``since`` timestamp. If the hostname has
            no URL records, :attr:`discovery` is returned instead.

        """
        alive = self.alive

        # Collect the timestamps themselves rather than the URL records, so
        # that the result is a ``datetime`` as annotated. Taking the minimum
        # over an iterable (instead of unpacking it into positional
        # arguments) also works when exactly one URL matches.
        timestamps = [url.since for url in self.urls if bool(url.alive) == alive]

        # The list can only be empty when there are no URL records at all:
        # an alive hostname has at least one alive URL, and for an inactive
        # hostname every URL matches. Fall back to the discovery timestamp.
        if not timestamps:
            return self.discovery
        return min(timestamps)