Source code for darc.proxy.null

# -*- coding: utf-8 -*-
"""No Proxy
===============

The :mod:`darc.proxy.null` module contains the auxiliary functions
around managing and processing normal websites with no proxy.

"""
import gzip
import multiprocessing
import os
import sys

import requests
import stem.util.term

from darc.const import PATH_MISC
from darc.db import save_requests
from darc.error import render_error
from darc.link import Link, parse_link
from darc.parse import get_content_type, get_sitemap, read_robots, read_sitemap, urljoin
from darc.requests import request_session
from darc.save import has_robots, has_sitemap, save_robots, save_sitemap

PATH = os.path.join(PATH_MISC, 'invalid.txt')
LOCK = multiprocessing.Lock()
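# ``PATH`` (above) is where links with unsupported schemes are recorded;
# ``LOCK`` guards the append so that concurrent crawler processes do not
# interleave lines in the file.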

def save_invalid(link: Link):
    """Save link with invalid scheme.

    The function will save the link with an invalid scheme to the file
    as defined in :data:`~darc.proxy.null.PATH`.

    Args:
        link: Link object representing the link with an invalid scheme.

    """
    with LOCK:
        with open(PATH, 'a') as file:
            print(link.url, file=file)
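
# A minimal usage sketch (the URL below is hypothetical): a link whose scheme
# darc cannot crawl, e.g. ``javascript:`` or ``data:``, can be recorded for
# later inspection:
#
#     save_invalid(parse_link('javascript:void(0)'))  # appends the URL to ``PATH``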

def fetch_sitemap(link: Link):
    """Fetch sitemap.

    The function will first fetch the ``robots.txt``, then
    fetch the sitemaps accordingly.

    Args:
        link: Link object to fetch for its sitemaps.

    See Also:
        * :func:`darc.parse.read_robots`
        * :func:`darc.parse.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    robots_path = has_robots(link)
    if robots_path is not None:
        print(stem.util.term.format(f'[ROBOTS] Cached {link.url}',
                                    stem.util.term.Color.YELLOW))  # pylint: disable=no-member
        with open(robots_path) as file:
            robots_text = file.read()
    else:
        robots_link = parse_link(urljoin(link.url, '/robots.txt'))
        print(f'[ROBOTS] Checking {robots_link.url}')

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException as error:
                print(render_error(f'[ROBOTS] Failed on {robots_link.url} <{error}>',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                print(render_error(f'[ROBOTS] Unresolved content type on {robots_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                robots_text = ''
            else:
                robots_text = response.text
                save_robots(robots_link, robots_text)
                print(f'[ROBOTS] Checked {robots_link.url}')
        else:
            print(render_error(f'[ROBOTS] Failed on {robots_link.url} [{response.status_code}]',
                               stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
            robots_text = ''

    sitemaps = read_robots(link, robots_text, host=link.host)
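
    # Step 2: walk every sitemap announced in ``robots.txt``.  The loop
    # extends ``sitemaps`` while iterating, so entries discovered through
    # ``get_sitemap`` (e.g. sitemap index files) are processed as well.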
    for sitemap_link in sitemaps:
        sitemap_path = has_sitemap(sitemap_link)
        if sitemap_path is not None:
            print(stem.util.term.format(f'[SITEMAP] Cached {sitemap_link.url}',
                                        stem.util.term.Color.YELLOW))  # pylint: disable=no-member
            with open(sitemap_path) as file:
                sitemap_text = file.read()
        else:
            print(f'[SITEMAP] Fetching {sitemap_link.url}')

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException as error:
                    print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} <{error}>',
                                       stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                    continue

            if not response.ok:
                print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} [{response.status_code}]',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                print(render_error(f'[SITEMAP] Unresolved content type on {sitemap_link.url} ({ct_type})',
                                   stem.util.term.Color.RED), file=sys.stderr)  # pylint: disable=no-member
                continue

            print(f'[SITEMAP] Fetched {sitemap_link.url}')

        # get more sitemaps
        sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
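
# A minimal usage sketch (the URL below is hypothetical): fetching the
# sitemaps of a site queues every URL they list through ``save_requests``:
#
#     fetch_sitemap(parse_link('https://example.com/'))  # robots.txt -> sitemaps -> requests queue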