Source code for darc.crawl

# -*- coding: utf-8 -*-
"""Web Crawlers
==================

The :mod:`darc.crawl` module provides two types of crawlers.

* :func:`~darc.crawl.crawler` -- crawler powered by :mod:`requests`
* :func:`~darc.crawl.loader` -- crawler powered by :mod:`selenium`

"""

import contextlib
import math
import os
import shutil
import sys
import traceback

import requests
import selenium.common.exceptions
import selenium.webdriver
import selenium.webdriver.common.proxy
import stem
import stem.control
import stem.process
import stem.util.term
import urllib3

import darc.typing as typing
from darc._compat import datetime
from darc.const import FORCE, SE_EMPTY
from darc.db import (drop_hostname, drop_requests, drop_selenium, have_hostname, save_requests,
                     save_selenium)
from darc.error import LinkNoReturn, render_error
from darc.link import Link
from darc.parse import (check_robots, extract_links, get_content_type, match_host, match_mime,
                        match_proxy)
from darc.proxy.i2p import fetch_hosts, read_hosts
from darc.proxy.null import fetch_sitemap, save_invalid
from darc.requests import request_session
from darc.save import save_headers
from darc.selenium import request_driver
from darc.sites import crawler_hook, loader_hook
from darc.submit import SAVE_DB, submit_new_host, submit_requests, submit_selenium
from darc.model import HostnameModel, URLModel


[docs]def crawler(link: Link):
    """Single :mod:`requests` crawler for a entry link.

    Args:
        link: URL to be crawled by :mod:`requests`.

    The function will first parse the URL using
    :func:`~darc.link.parse_link`, and check if need to crawl the
    URL (c.f. :data:`~darc.const.PROXY_WHITE_LIST`, :data:`~darc.const.PROXY_BLACK_LIST`,
    :data:`~darc.const.LINK_WHITE_LIST` and :data:`~darc.const.LINK_BLACK_LIST`);
    if true, then crawl the URL with :mod:`requests`.

    If the URL is from a brand new host, :mod:`darc` will first try
    to fetch and save ``robots.txt`` and sitemaps of the host
    (c.f. :func:`~darc.proxy.null.save_robots` and :func:`~darc.proxy.null.save_sitemap`),
    and extract then save the links from sitemaps (c.f. :func:`~darc.proxy.null.read_sitemap`)
    into link database for future crawling (c.f. :func:`~darc.db.save_requests`).

    .. note::

       A host is new if :func:`~darc.db.have_hostname` returns :data:`True`.

       If :func:`darc.proxy.null.fetch_sitemap` and/or :func:`darc.proxy.i2p.fetch_hosts`
       failed when fetching such documents, the host will be removed from the
       hostname database through :func:`~darc.db.drop_hostname`, and considered
       as new when next encounter.

    Also, if the submission API is provided, :func:`~darc.submit.submit_new_host`
    will be called and submit the documents just fetched.

    If ``robots.txt`` presented, and :data:`~darc.const.FORCE` is
    :data:`False`, :mod:`darc` will check if allowed to crawl the URL.

    .. note::

        The root path (e.g. ``/`` in https://www.example.com/) will always
        be crawled ignoring ``robots.txt``.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to crawl and get the final response object.
    :mod:`darc` will save the session cookies and header information,
    using :func:`~darc.save.save_headers`.

    .. note::

        If :exc:`requests.exceptions.InvalidSchema` is raised, the link
        will be saved by :func:`~darc.proxy.null.save_invalid`. Further
        processing is dropped, and the link will be removed from the
        :mod:`requests` database through :func:`~darc.db.drop_requests`.

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`requests` database through
        :func:`~darc.db.drop_requests`.

    If the content type of response document is not ignored (c.f.
    :data:`~darc.const.MIME_WHITE_LIST` and :data:`~darc.const.MIME_BLACK_LIST`),
    :func:`~darc.submit.submit_requests` will be called and submit the document
    just fetched.

    If the response document is HTML (``text/html`` and ``application/xhtml+xml``),
    :func:`~darc.parse.extract_links` will be called then to extract
    all possible links from the HTML document and save such links into
    the database (c.f. :func:`~darc.db.save_requests`).

    And if the response status code is between ``400`` and ``600``,
    the URL will be saved back to the link database
    (c.f. :func:`~darc.db.save_requests`). If **NOT**, the URL will
    be saved into :mod:`selenium` link database to proceed next steps
    (c.f. :func:`~darc.db.save_selenium`).

    """
    print(f'[REQUESTS] Requesting {link.url}')
    try:
        if match_proxy(link.proxy):
            print(render_error(f'[REQUESTS] Ignored proxy type from {link.url} ({link.proxy})',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        if match_host(link.host):
            print(render_error(f'[REQUESTS] Ignored hostname from {link.url} ({link.proxy})',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        # timestamp
        timestamp = datetime.now()

        # if it's a new host
        flag_have, force_fetch = have_hostname(link)
        if not flag_have or force_fetch:
            partial = False

            if link.proxy not in ('zeronet', 'freenet'):
                # fetch sitemap.xml
                try:
                    fetch_sitemap(link, force=force_fetch)
                except Exception:
                    error = f'[Error fetching sitemap of {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            if link.proxy == 'i2p':
                # fetch hosts.txt
                try:
                    fetch_hosts(link, force=force_fetch)
                except Exception:
                    error = f'[Error subscribing hosts from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            # submit data / drop hostname from db
            if partial:
                drop_hostname(link)
            submit_new_host(timestamp, link, partial=partial)

        if not FORCE and not check_robots(link):
            print(render_error(f'[REQUESTS] Robots disallowed link from {link.url}',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            return

        with request_session(link) as session:
            try:
                # requests session hook
                response = crawler_hook(link, session)
            except requests.exceptions.InvalidSchema as error:
                print(render_error(f'[REQUESTS] Failed on {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_invalid(link)
                drop_requests(link)
                return
            except requests.RequestException as error:
                print(render_error(f'[REQUESTS] Failed on {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return
            except LinkNoReturn:
                print(render_error(f'[REQUESTS] Removing from database: {link.url}',
                                   stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
                drop_requests(link)
                return

            # save headers
            save_headers(timestamp, link, response, session)

            # check content type
            ct_type = get_content_type(response)
            if ct_type not in ['text/html', 'application/xhtml+xml']:
                print(render_error(f'[REQUESTS] Generic content type from {link.url} ({ct_type})',
                                   stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member

                # probably hosts.txt
                if link.proxy == 'i2p' and ct_type in ['text/plain', 'text/text']:
                    text = response.text
                    save_requests(read_hosts(text))

                if match_mime(ct_type):
                    drop_requests(link)
                    return

                # submit data
                data = response.content
                submit_requests(timestamp, link, response, session, data, mime_type=ct_type, html=False)

                return

            html = response.content
            if not html:
                print(render_error(f'[REQUESTS] Empty response from {link.url}',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # submit data
            submit_requests(timestamp, link, response, session, html, mime_type=ct_type, html=True)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)

            if not response.ok:
                print(render_error(f'[REQUESTS] Failed on {link.url} [{response.status_code}]',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # add link to queue
            save_selenium(link, single=True, score=0, nx=True)
    except Exception:
        if SAVE_DB:
            with contextlib.suppress(Exception):
                host: typing.Optional[HostnameModel] = HostnameModel.get_or_none(HostnameModel.hostname == link.host)
                if host is not None:
                    host.alive = False
                    host.save()

            with contextlib.suppress(Exception):
                url: typing.Optional[URLModel] = URLModel.get_or_none(URLModel.hash == link.name)
                if url is not None:
                    url.alias = False
                    url.save()

        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # type: ignore # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # type: ignore # pylint: disable=no-member
        save_requests(link, single=True)
    print(f'[REQUESTS] Requested {link.url}')


[docs]def loader(link: Link):
    """Single :mod:`selenium` loader for a entry link.

    Args:
        Link: URL to be crawled by :mod:`selenium`.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and start loading the URL using :mod:`selenium` with Google Chrome.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to load and return the original
    :class:`selenium.webdriver.Chrome` object.

    .. note::

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`selenium` database through
        :func:`~darc.db.drop_selenium`.

    If successful, the rendered source HTML document will be saved, and a
    full-page screenshot will be taken and saved.

    .. note::

       When taking full-page screenshot, :func:`~darc.crawl.loader` will
       use :javascript:`document.body.scrollHeight` to get the total
       height of web page. If the page height is *less than* **1,000 pixels**,
       then :mod:`darc` will by default set the height as **1,000 pixels**.

       Later :mod:`darc` will tell :mod:`selenium` to resize the window (in
       *headless* mode) to **1,024 pixels** in width and **110%** of the
       page height in height, and take a *PNG* screenshot.

    If the submission API is provided, :func:`~darc.submit.submit_selenium`
    will be called and submit the document just loaded.

    Later, :func:`~darc.parse.extract_links` will be called then to
    extract all possible links from the HTML document and save such
    links into the :mod:`requests` database (c.f. :func:`~darc.db.save_requests`).

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    """
    print(f'[SELENIUM] Loading {link.url}')
    try:
        # timestamp
        timestamp = datetime.now()

        # retrieve source from Chrome
        with request_driver(link) as driver:
            try:
                # selenium driver hook
                driver = loader_hook(link, driver)
            except urllib3.exceptions.HTTPError as error:
                print(render_error(f'[SELENIUM] Fail to load {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except selenium.common.exceptions.WebDriverException as error:
                print(render_error(f'[SELENIUM] Fail to load {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except LinkNoReturn:
                print(render_error(f'[SELENIUM] Removing from database: {link.url}',
                                   stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
                drop_selenium(link)
                return

            # get HTML source
            html = driver.page_source

            if html == SE_EMPTY:
                print(render_error(f'[SELENIUM] Empty page from {link.url}',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return

            screenshot = None
            try:
                # get maximum height
                height = driver.execute_script('return document.body.scrollHeight')

                # resize window (with some magic numbers)
                if height < 1000:
                    height = 1000
                driver.set_window_size(1024, math.ceil(height * 1.1))

                # take a full page screenshot
                screenshot = driver.get_screenshot_as_base64()
            except Exception as error:
                print(render_error(f'[SELENIUM] Fail to save screenshot from {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member

            # submit data
            submit_selenium(timestamp, link, html, screenshot)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)
    except Exception:
        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # type: ignore # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # type: ignore # pylint: disable=no-member
        save_selenium(link, single=True)
    print(f'[SELENIUM] Loaded {link.url}')
Source code for darc.crawl

darc

Navigation

Related Topics