Source code for darc.crawl

# -*- coding: utf-8 -*-
"""Web Crawlers
==================

The :mod:`darc.crawl` module provides two types of crawlers.

* :func:`~darc.crawl.crawler` -- crawler powered by |requests|_
* :func:`~darc.crawl.loader` -- crawler powered by |selenium|_

"""

import contextlib
import math
import os
import shutil
import sys
import traceback

import magic
import requests
import selenium.common.exceptions
import selenium.webdriver
import selenium.webdriver.common.proxy
import stem
import stem.control
import stem.process
import stem.util.term
import urllib3

from darc._compat import datetime
from darc.const import FORCE, SAVE_REQUESTS, SAVE_SELENIUM, SE_EMPTY
from darc.db import save_requests, save_selenium
from darc.error import render_error
from darc.link import parse_link
from darc.parse import (check_robots, extract_links, get_content_type, match_host, match_mime,
                        match_proxy)
from darc.proxy.bitcoin import save_bitcoin
from darc.proxy.data import save_data
from darc.proxy.ed2k import save_ed2k
from darc.proxy.i2p import fetch_hosts, read_hosts
from darc.proxy.irc import save_irc
from darc.proxy.magnet import save_magnet
from darc.proxy.mail import save_mail
from darc.proxy.null import fetch_sitemap, save_invalid
from darc.requests import request_session
from darc.save import has_folder, has_html, has_raw, sanitise, save_file, save_headers, save_html
from darc.selenium import request_driver
from darc.sites import crawler_hook, loader_hook
from darc.submit import submit_new_host, submit_requests, submit_selenium


def crawler(url: str):
    """Single |requests|_ crawler for an entry link.

    Args:
        url: URL to be crawled by |requests|_.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and check whether the URL needs to be crawled (c.f.
    :data:`~darc.const.PROXY_WHITE_LIST`, :data:`~darc.const.PROXY_BLACK_LIST`,
    :data:`~darc.const.LINK_WHITE_LIST` and :data:`~darc.const.LINK_BLACK_LIST`);
    if so, it then crawls the URL with |requests|_.

    If the URL is from a brand new host, :mod:`darc` will first try to fetch and
    save ``robots.txt`` and the sitemaps of the host (c.f.
    :func:`~darc.save.save_robots` and :func:`~darc.save.save_sitemap`), then
    extract and save the links from the sitemaps (c.f.
    :func:`~darc.parse.read_sitemap`) into the link database for future crawling
    (c.f. :func:`~darc.db.save_requests`). Also, if the submission API is
    provided, :func:`~darc.submit.submit_new_host` will be called to submit the
    documents just fetched.

    .. seealso::

       * :func:`darc.proxy.null.fetch_sitemap`

    If ``robots.txt`` is present and :data:`~darc.const.FORCE` is ``False``,
    :mod:`darc` will check whether it is allowed to crawl the URL.

    .. note::

       The root path (e.g. ``/`` in https://www.example.com/) will always be
       crawled, ignoring ``robots.txt``.

    At this point, :mod:`darc` will call the customised hook function from
    :mod:`darc.sites` to crawl and get the final response object. :mod:`darc`
    will save the session cookies and header information using
    :func:`~darc.save.save_headers`.

    .. note::

       If :exc:`requests.exceptions.InvalidSchema` is raised, the link will be
       saved by :func:`~darc.proxy.null.save_invalid` and further processing is
       dropped.

    If the content type of the response document is not ignored (c.f.
    :data:`~darc.const.MIME_WHITE_LIST` and :data:`~darc.const.MIME_BLACK_LIST`),
    :mod:`darc` will save the document using :func:`~darc.save.save_html` or
    :func:`~darc.save.save_file` accordingly. And if the submission API is
    provided, :func:`~darc.submit.submit_requests` will be called to submit the
    document just fetched.

    If the response document is HTML (``text/html`` or ``application/xhtml+xml``),
    :func:`~darc.parse.extract_links` will then be called to extract all possible
    links from the HTML document and save them into the link database
    (c.f. :func:`~darc.db.save_requests`).

    If the response status code is between ``400`` and ``600``, the URL will be
    saved back to the link database (c.f. :func:`~darc.db.save_requests`);
    if **not**, the URL will be saved into the |selenium|_ link database to
    proceed to the next steps (c.f. :func:`~darc.db.save_selenium`).

    """
    try:
        link = parse_link(url)

        if match_proxy(link.proxy):
            print(render_error(f'[REQUESTS] Ignored proxy type from {link.url} ({link.proxy})',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            return

        # save bitcoin address
        if link.proxy == 'bitcoin':
            save_bitcoin(link)
            return

        # save ed2k link
        if link.proxy == 'ed2k':
            save_ed2k(link)
            return

        # save magnet link
        if link.proxy == 'magnet':
            save_magnet(link)
            return

        # save email address
        if link.proxy == 'mail':
            save_mail(link)
            return

        # save IRC address
        if link.proxy == 'irc':
            save_irc(link)
            return

        # save data URI
        if link.proxy == 'data':
            try:
                save_data(link)
            except ValueError as error:
                print(render_error(f'[REQUESTS] Failed to save data URI from {link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            return

        if match_host(link.host):
            print(render_error(f'[REQUESTS] Ignored hostname from {link.url} ({link.proxy})',
                               stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
            return

        # timestamp
        timestamp = datetime.now()

        path = has_raw(timestamp, link)
        if path is not None:

            if link.proxy not in ('zeronet', 'freenet'):
                # load sitemap.xml
                try:
                    fetch_sitemap(link)
                except Exception:
                    error = f'[Error loading sitemap of {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member

            # load hosts.txt
            if link.proxy == 'i2p':
                try:
                    fetch_hosts(link)
                except Exception:
                    error = f'[Error loading hosts from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member

            ext = os.path.splitext(path)[1]
            if ext == '.dat':
                print(stem.util.term.format(f'[REQUESTS] Cached generic file from {link.url}',
                                            stem.util.term.Color.YELLOW))  # pylint: disable=no-member

                # probably hosts.txt
                if link.proxy == 'i2p':
                    with contextlib.suppress(Exception):
                        ct_type = magic.detect_from_filename(path).mime_type
                        if ct_type in ['text/plain', 'text/text']:
                            with open(path) as hosts_file:
                                save_requests(read_hosts(hosts_file))
                return

            print(stem.util.term.format(f'[REQUESTS] Cached HTML document from {link.url}',
                                        stem.util.term.Color.YELLOW))  # pylint: disable=no-member
            with open(path, 'rb') as file:
                html = file.read()

            # add link to queue
            #[QUEUE_REQUESTS.put(href) for href in extract_links(link.url, html)]  # pylint: disable=expression-not-assigned
            save_requests(extract_links(link.url, html))

            #QUEUE_SELENIUM.put(link.url)
            save_selenium(link.url, single=True)

        else:

            # if it's a new host
            new_host = has_folder(link) is None

            print(f'[REQUESTS] Requesting {link.url}')

            if new_host:

                if link.proxy not in ('zeronet', 'freenet'):
                    # fetch sitemap.xml
                    try:
                        fetch_sitemap(link)
                    except Exception:
                        error = f'[Error fetching sitemap of {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member

                if link.proxy == 'i2p':
                    # fetch hosts.txt
                    try:
                        fetch_hosts(link)
                    except Exception:
                        error = f'[Error subscribing hosts from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member

                # submit data
                submit_new_host(timestamp, link)

            if not FORCE and not check_robots(link):
                print(render_error(f'[REQUESTS] Robots disallowed link from {link.url}',
                                   stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member
                return

            with request_session(link) as session:
                try:
                    # requests session hook
                    response = crawler_hook(link, session)
                except requests.exceptions.InvalidSchema as error:
                    print(render_error(f'[REQUESTS] Failed on {link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    save_invalid(link)
                    return
                except requests.RequestException as error:
                    print(render_error(f'[REQUESTS] Failed on {link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    #QUEUE_REQUESTS.put(link.url)
                    save_requests(link.url, single=True)
                    return

                # save headers
                save_headers(timestamp, link, response, session)

                # check content type
                ct_type = get_content_type(response)
                if ct_type not in ['text/html', 'application/xhtml+xml']:
                    print(render_error(f'[REQUESTS] Generic content type from {link.url} ({ct_type})',
                                       stem.util.term.Color.YELLOW), file=sys.stderr)  # pylint: disable=no-member

                    text = response.content
                    try:
                        path = save_file(timestamp, link, text)
                    except Exception as error:
                        print(render_error(f'[REQUESTS] Failed to save generic file from {link.url} <{error}>',
                                           stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                        return

                    # probably hosts.txt
                    if link.proxy == 'i2p' and ct_type in ['text/plain', 'text/text']:
                        with open(path) as hosts_file:
                            save_requests(read_hosts(hosts_file))

                    if match_mime(ct_type):
                        return

                    # submit data
                    submit_requests(timestamp, link, response, session)
                    return

                html = response.content
                if not html:
                    print(render_error(f'[REQUESTS] Empty response from {link.url}',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    #QUEUE_REQUESTS.put(link.url)
                    save_requests(link.url, single=True)
                    return

                # save HTML
                save_html(timestamp, link, html, raw=True)

                # submit data
                submit_requests(timestamp, link, response, session)

                # add link to queue
                #[QUEUE_REQUESTS.put(href) for href in extract_links(link.url, html)]  # pylint: disable=expression-not-assigned
                save_requests(extract_links(link.url, html))

                if not response.ok:
                    print(render_error(f'[REQUESTS] Failed on {link.url} [{response.status_code}]',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    #QUEUE_REQUESTS.put(link.url)
                    save_requests(link.url, single=True)
                    return

                # add link to queue
                #QUEUE_SELENIUM.put(link.url)
                save_selenium(link.url, single=True)

        if SAVE_REQUESTS:
            save_requests(link.url, single=True)

        print(f'[REQUESTS] Requested {link.url}')
    except Exception:
        error = f'[Error from {url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        #QUEUE_REQUESTS.put(url)
        save_requests(url, single=True)
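

# Illustration only -- not part of darc.crawl. The chain of ``if link.proxy == ...``
# branches at the top of crawler() routes non-HTTP schemes (bitcoin, ed2k, magnet,
# mail, irc, data) to their scheme-specific save_* helpers before any HTTP request
# is made. The sketch below expresses the same routing as a dispatch table; the
# names _PROXY_HANDLERS and _proxy_dispatch are hypothetical and exist here only
# for demonstration (error handling, e.g. the ValueError from save_data, is omitted).
_PROXY_HANDLERS = {
    'bitcoin': save_bitcoin,
    'ed2k': save_ed2k,
    'magnet': save_magnet,
    'mail': save_mail,
    'irc': save_irc,
    'data': save_data,
}


def _proxy_dispatch(link) -> bool:
    """Return ``True`` if ``link`` was consumed by a scheme-specific saver."""
    handler = _PROXY_HANDLERS.get(link.proxy)
    if handler is None:
        return False
    handler(link)
    return True

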
def loader(url: str):
    """Single |selenium|_ loader for an entry link.

    Args:
        url: URL to be loaded by |selenium|_.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and start loading the URL using |selenium|_ with Google Chrome.

    At this point, :mod:`darc` will call the customised hook function from
    :mod:`darc.sites` to load and return the original |Chrome|_ object.

    If successful, the rendered source HTML document will be saved using
    :func:`~darc.save.save_html`, and a full-page screenshot will be taken
    and saved.

    .. note::

       When taking the full-page screenshot, :func:`~darc.crawl.loader` will
       use :javascript:`document.body.scrollHeight` to get the total height
       of the web page. If the page height is *less than* **1,000 pixels**,
       :mod:`darc` will by default set the height to **1,000 pixels**.

       :mod:`darc` will then tell |selenium|_ to resize the window (in
       *headless* mode) to **1,024 pixels** in width and **110%** of the
       page height in height, and take a *PNG* screenshot.

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    If the submission API is provided, :func:`~darc.submit.submit_selenium`
    will be called to submit the document just loaded.

    Later, :func:`~darc.parse.extract_links` will be called to extract all
    possible links from the HTML document and save them into the |requests|_
    database (c.f. :func:`~darc.db.save_requests`).

    """
    try:
        link = parse_link(url)

        # timestamp
        timestamp = datetime.now()

        path = has_html(timestamp, link)
        if path is not None:

            print(stem.util.term.format(f'[SELENIUM] Cached {link.url}',
                                        stem.util.term.Color.YELLOW))  # pylint: disable=no-member
            with open(path, 'rb') as file:
                html = file.read()

            # add link to queue
            #[QUEUE_REQUESTS.put(href) for href in extract_links(link.url, html)]  # pylint: disable=expression-not-assigned
            save_requests(extract_links(link.url, html))

        else:

            print(f'[SELENIUM] Loading {link.url}')

            # retrieve source from Chrome
            with request_driver(link) as driver:
                try:
                    # selenium driver hook
                    driver = loader_hook(link, driver)
                except urllib3.exceptions.HTTPError as error:
                    print(render_error(f'[SELENIUM] Fail to load {link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    #QUEUE_SELENIUM.put(link.url)
                    save_selenium(link.url, single=True)
                    return
                except selenium.common.exceptions.WebDriverException as error:
                    print(render_error(f'[SELENIUM] Fail to load {link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    #QUEUE_SELENIUM.put(link.url)
                    save_selenium(link.url, single=True)
                    return

                # get HTML source
                html = driver.page_source

                if html == SE_EMPTY:
                    print(render_error(f'[SELENIUM] Empty page from {link.url}',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    #QUEUE_SELENIUM.put(link.url)
                    save_selenium(link.url, single=True)
                    return

                # save HTML
                save_html(timestamp, link, html)

                try:
                    # get maximum height
                    height = driver.execute_script('return document.body.scrollHeight')

                    # resize window (with some magic numbers)
                    if height < 1000:
                        height = 1000
                    driver.set_window_size(1024, math.ceil(height * 1.1))

                    # take a full page screenshot
                    path = sanitise(link, timestamp, screenshot=True)
                    driver.save_screenshot(path)
                except Exception as error:
                    print(render_error(f'[SELENIUM] Fail to save screenshot from {link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member

                # submit data
                submit_selenium(timestamp, link)

                # add link to queue
                #[QUEUE_REQUESTS.put(href) for href in extract_links(link.url, html)]  # pylint: disable=expression-not-assigned
                save_requests(extract_links(link.url, html))

        if SAVE_SELENIUM:
            save_selenium(link.url, single=True)

        print(f'[SELENIUM] Loaded {link.url}')
    except Exception:
        error = f'[Error from {url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        #QUEUE_SELENIUM.put(url)
        save_selenium(url, single=True)
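

# Usage sketch (illustration only -- not part of darc.crawl). Both entry points
# take a single URL string; within darc they are normally dispatched by the
# worker processes rather than called directly as below. The seed URL is purely
# illustrative.
if __name__ == '__main__':  # pragma: no cover
    seed = 'https://www.example.com/'
    crawler(seed)   # fetch and archive the page with requests, queue it for selenium
    loader(seed)    # render the page with selenium (headless Chrome), feed links back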