# Source code for darc.model.web.url

# -*- coding: utf-8 -*-
"""URL Records
-----------------

The :mod:`darc.model.web.url` module defines the data model
representing URLs, specifically from ``requests`` and
``selenium`` submission.

.. seealso::

   Please refer to :func:`darc.submit.submit_requests` and
   :func:`darc.submit.submit_selenium` for more information.

"""

from typing import TYPE_CHECKING

from peewee import BooleanField, CharField, DateTimeField, ForeignKeyField, TextField

from darc._typing import SPHINX_BUILD
from darc.model.abc import BaseMetaWeb as BaseMeta
from darc.model.abc import BaseModelWeb as BaseModel
from darc.model.utils import IntEnumField, Proxy
from darc.model.web.hostname import HostnameModel

if TYPE_CHECKING:
    # Everything in this branch is evaluated only while type checking; the
    # names are used solely inside (string) annotations further down.
    from typing import List, TypeVar

    from darc._compat import datetime

    if not SPHINX_BUILD:
        # Import the real models referenced by the ``requests`` / ``selenium``
        # annotations on ``URLModel``.
        from darc.model.web.requests import RequestsModel  # pylint: disable=unused-import
        from darc.model.web.selenium import SeleniumModel  # pylint: disable=unused-import
    else:
        # ``SPHINX_BUILD`` is set: bind the same names to ``TypeVar``
        # placeholders bounded by the dotted model paths instead of importing
        # the model modules.
        # NOTE(review): presumably this sidesteps a circular import while the
        # documentation is built -- confirm.
        RequestsModel = TypeVar('RequestsModel', bound='darc.model.web.requests.RequestsModel')  # type: ignore[name-defined,unreachable,misc] # pylint: disable=line-too-long
        SeleniumModel = TypeVar('SeleniumModel', bound='darc.model.web.selenium.SeleniumModel')  # type: ignore[name-defined,unreachable,misc] # pylint: disable=line-too-long

__all__ = ['URLModel', 'URLThroughModel']


class URLModel(BaseModel):
    """Data model for a requested URL.

    Important:
        The *alive* flag of a URL is toggled if :func:`~darc.crawl.crawler`
        successfully requested such URL and the status code is
        :attr:`~flask.Response.ok`.

    """

    #: ``requests`` submission records, back reference from
    #: :attr:`RequestsModel.url <darc.model.web.requests.RequestsModel.url>`.
    requests: 'List[RequestsModel]'
    #: ``selenium`` submission records, back reference from
    #: :attr:`SeleniumModel.url <darc.model.web.selenium.SeleniumModel.url>`.
    selenium: 'List[SeleniumModel]'

    #: Original URL (c.f. :attr:`link.url <darc.link.Link.url>`).
    url: str = TextField()
    #: Sha256 hash value (c.f. :attr:`Link.name <darc.link.Link.name>`);
    #: declared unique.
    hash: str = CharField(max_length=256, unique=True)
    #: Hostname (c.f. :attr:`link.host <darc.link.Link.host>`); exposed on
    #: :class:`HostnameModel` through the ``urls`` back reference.
    hostname: 'HostnameModel' = ForeignKeyField(HostnameModel, backref='urls')
    #: Proxy type (c.f. :attr:`link.proxy <darc.link.Link.proxy>`).
    proxy: 'Proxy' = IntEnumField(choices=Proxy)

    #: Timestamp of first submission.
    discovery: 'datetime' = DateTimeField()
    #: Timestamp of last submission.
    last_seen: 'datetime' = DateTimeField()

    #: If the URL is still active.
    alive: bool = BooleanField()
    #: The URL has been active/inactive since this timestamp.
    since: 'datetime' = DateTimeField()

    @classmethod
    def get_by_url(cls, url: str) -> 'URLModel':
        """Select the record whose :attr:`url` equals the given URL.

        Args:
            url: URL to select.

        Returns:
            Selected URL model.

        Note:
            The lookup is delegated to ``cls.get``; with peewee's
            ``Model.get`` a missing row surfaces as ``DoesNotExist``.

        """
        matches_url = cls.url == url
        return cls.get(matches_url)

    # NOTE(review): ``URLThroughModel.parent`` declares ``backref='parents'``
    # on this model, i.e. the same attribute name as the property below --
    # confirm which of the two is actually exposed as ``URLModel.parents``.
    @property
    def parents(self) -> 'List[URLModel]':
        """Back reference to where the URL was identified.

        Returns:
            Select query over :class:`URLModel`, joined with
            :class:`URLThroughModel` on its ``parent`` key, limited to the
            rows whose ``child`` is this record and ordered by ``url``.

        """
        via_parent = URLModel.select().join(URLThroughModel, on=URLThroughModel.parent)
        found_in = via_parent.where(URLThroughModel.child == self)
        return found_in.order_by(URLModel.url)

    # NOTE(review): the name looks like a misspelling of ``children``; it is
    # kept unchanged because it is part of the public interface.
    @property
    def childrent(self) -> 'List[URLModel]':
        """Back reference to which URLs were identified from the URL.

        Returns:
            Select query over :class:`URLModel`, joined with
            :class:`URLThroughModel` on its ``child`` key, limited to the
            rows whose ``parent`` is this record and ordered by ``url``.

        """
        via_child = URLModel.select().join(URLThroughModel, on=URLThroughModel.child)
        extracted = via_child.where(URLThroughModel.parent == self)
        return extracted.order_by(URLModel.url)


class URLThroughModel(BaseModel):
    """Data model for the map of the URL extraction chain.

    Each row links one ``parent`` URL to one ``child`` URL; the
    ``(parent, child)`` pair is covered by a unique index.

    """

    # NOTE(review): the back references below are installed on ``URLModel``
    # under the names ``parents`` and ``children``. ``URLModel`` also defines a
    # ``parents`` property, and a back reference on the ``parent`` key selects
    # the rows in which that URL *is* the parent -- confirm the naming is
    # intended before relying on either accessor.

    #: URL on which the child URL was identified.
    parent: 'URLModel' = ForeignKeyField(URLModel, backref='parents')
    #: URL that was identified from the parent URL.
    child: 'URLModel' = ForeignKeyField(URLModel, backref='children')

    class Meta(BaseMeta):
        indexes = (
            # Unique multi-column index on the (parent, child) pair.
            (('parent', 'child'), True),
        )