add advanced fetch_response() and amend fetch_url() (#479)
* introduce advanced download function (scaffolding)

* add headers to class and simplify code

* add support for response headers

* refine class

* clean code
adbar authored Jan 19, 2024
1 parent 85cd3d8 commit eec05b2
Showing 3 changed files with 116 additions and 69 deletions.
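
The three diffs below add a small Response class and a keyword-only fetch_response() helper, and route the amended fetch_url() through it. A minimal usage sketch based on the signatures introduced in this commit; the URL is only illustrative and network access is assumed:

from trafilatura.downloads import fetch_response, fetch_url

resp = fetch_response(
    "https://example.org",  # placeholder URL
    decode=True,            # fill resp.html with the decoded document
    with_headers=True,      # fill resp.headers with the response headers
)
if resp is not None:
    print(resp.status)      # HTTP status code
    print(resp.url)         # effective URL after redirects
    print(resp.headers)     # only populated when with_headers=True
    print(resp.html)        # only populated when decode=True
    print(len(resp.data))   # raw body is always kept as bytes

# fetch_url() keeps its previous behaviour and returns a decoded string
# (or None) by default.
html = fetch_url("https://example.org")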
38 changes: 20 additions & 18 deletions tests/downloads_tests.py
@@ -19,7 +19,7 @@

import gzip
from time import sleep
from unittest.mock import Mock, patch
from unittest.mock import patch

from courlan import UrlStore

@@ -28,15 +28,15 @@
url_processing_pipeline)
from trafilatura.core import extract
import trafilatura.downloads
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT,
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT, Response,
_determine_headers, _handle_response,
_parse_config, _pycurl_is_live_page,
_send_pycurl_request, _send_request,
_send_pycurl_request, _send_urllib_request,
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.utils import decode_response, load_html
from trafilatura.utils import decode_file, decode_response, load_html

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

@@ -59,7 +59,7 @@ def _reset_downloads_global_objects():
def test_fetch():
'''Test URL fetching.'''
# logic: empty request?
assert _send_request('', True, DEFAULT_CONFIG) is None
assert _send_urllib_request('', True, False, DEFAULT_CONFIG) is None

# is_live general tests
assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
@@ -74,23 +74,26 @@ def test_fetch():
assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
# assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# assert _send_urllib_request('https://expired.badssl.com/', False, False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
assert _send_pycurl_request('https://expired.badssl.com/', False, False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
url = 'https://httpbun.com/status/200'
for no_ssl in (True, False):
response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
response = _send_urllib_request('https://httpbun.com/status/200', no_ssl, True, DEFAULT_CONFIG)
assert response.data == b''
assert response.headers["X-Powered-By"].startswith("httpbun")
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, True, DEFAULT_CONFIG)
assert response1.headers["x-powered-by"].startswith("httpbun")
assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
# response object
# too large response object
response = Mock()
response.url = 'https://httpbin.org/encoding/utf8'
response.status = 200
data = ""
status = 200
url = 'https://httpbin.org/encoding/utf8'
response = Response(data, status, url)
# too large
response.data = b'ABC'*10000000
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
@@ -115,7 +118,7 @@ def test_fetch():
assert res is None
# Also test max redir implementation on pycurl if available
if pycurl is not None:
assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None
assert _send_pycurl_request('http://httpbin.org/redirect/1', True, False, new_config) is None
_reset_downloads_global_objects() # reset global objects again to avoid affecting other tests

def test_config():
@@ -141,17 +144,16 @@ def test_config():
def test_decode():
'''Test how responses are being decoded.'''
# response type
mock = Mock()
mock.data = b' '
assert decode_response(mock) is not None
data = b" "
assert decode_file(data) is not None
# GZip
html_string = "<html><head/><body><div>ABC</div></body></html>"
gz_string = gzip.compress(html_string.encode("utf-8"))
assert decode_response(gz_string) == html_string
assert decode_response(gz_string) == html_string == decode_file(gz_string)
# Brotli
if brotli is not None:
brotli_string = brotli.compress(html_string.encode("utf-8"))
assert decode_response(brotli_string) == html_string
assert decode_file(brotli_string) == html_string


def test_queue():
132 changes: 87 additions & 45 deletions trafilatura/downloads.py
@@ -6,11 +6,14 @@

import logging
import random
from collections import namedtuple
import warnings

from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from time import sleep

import certifi
import urllib3

try:
import pycurl
@@ -24,7 +27,6 @@
except ImportError:
pycurl = None

import urllib3
from courlan import UrlStore
from courlan.network import redirection_test

@@ -35,12 +37,11 @@


from .settings import DEFAULT_CONFIG
from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
uniquify_list)
from .utils import (URL_BLACKLIST_REGEX, decode_file,
make_chunks, uniquify_list)


LOGGER = logging.getLogger(__name__)
PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50

@@ -50,10 +51,30 @@
RETRY_STRATEGY = None

DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
USER_AGENT = 'trafilatura/' + version("trafilatura") + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT

RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])

class Response:
"Store information gathered in a HTTP response object."
__slots__ = ["data", "headers", "html", "status", "url"]

def __init__(self, data, status, url):
self.data = data
self.headers = None
self.html = None
self.status = status
self.url = url

def store_headers(self, headerdict):
"Store response headers if required."
# control or normalization here?
self.headers = headerdict

def decode_data(self, decode):
"Decode the bytestring in data and store a string in html."
if decode and self.data:
self.html = decode_file(self.data)


# caching throws an error
@@ -83,7 +104,7 @@ def _determine_headers(config, headers=None):
return headers or DEFAULT_HEADERS


def _send_request(url, no_ssl, config):
def _send_urllib_request(url, no_ssl, with_headers, config):
"Internal function to robustly send a request (SSL or not) and return its result."
# customize headers
global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
@@ -115,59 +136,76 @@ def _send_request(url, no_ssl, config):
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
except urllib3.exceptions.SSLError:
LOGGER.warning('retrying after SSLError: %s', url)
return _send_request(url, True, config)
return _send_urllib_request(url, True, with_headers, config)
except Exception as err:
LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0]
else:
# necessary for standardization
return RawResponse(response.data, response.status, response.geturl())
resp = Response(response.data, response.status, response.geturl())
if with_headers:
resp.store_headers(response.headers)
return resp
# catchall
return None


def _handle_response(url, response, decode, config):
'Internal function to run safety checks on response result.'
lentest = len(response.html or response.data or "")
if response.status != 200:
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
elif response.data is None or len(response.data) < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
elif lentest < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
LOGGER.error('too small/incorrect for URL %s', url)
# raise error instead?
elif len(response.data) > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
LOGGER.error('too large: length %s for URL %s', len(response.data), url)
elif lentest > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
LOGGER.error('too large: length %s for URL %s', lentest, url)
# raise error instead?
else:
return decode_response(response.data) if decode is True else response
return response.html if decode else response
# catchall
return None


def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
"""Fetches page using urllib3 and decodes the response.
"""Fetches page using urllib3 or pycurl and decodes the response.
Args:
url: URL of the page to fetch.
decode: Decode response instead of returning urllib3 response object (boolean).
decode: Decode response instead of returning response object (boolean).
no_ssl: Don't try to establish a secure connection (to prevent SSLError).
config: Pass configuration values for output control.
Returns:
RawResponse object: data (headers + body), status (HTML code as string) and url
Response object: data (headers + body), status (HTML code as string) and url
or None in case the result is invalid or there was a problem with the network.
"""
LOGGER.debug('sending request: %s', url)
if pycurl is None:
response = _send_request(url, no_ssl, config)
else:
response = _send_pycurl_request(url, no_ssl, config)
if not decode:
warnings.warn(
"""Raw response objects will be deprecated for fetch_url,
use fetch_response instead.""",
PendingDeprecationWarning
)
response = fetch_response(url, decode=decode, no_ssl=no_ssl, config=config)
if response is not None and response != '':
return _handle_response(url, response, decode, config)
# return '' (useful do discard further processing?)
# return response
LOGGER.debug('request failed: %s', url)
return None


def fetch_response(url, *, decode=False, no_ssl=False, with_headers=False, config=DEFAULT_CONFIG):
"Fetches page using urllib3 or pycurl and returns a raw response object."
dl_function = _send_urllib_request if pycurl is None else _send_pycurl_request
LOGGER.debug('sending request: %s', url)
response = dl_function(url, no_ssl, with_headers, config) # Response
if not response: # None or ""
LOGGER.debug('request failed: %s', url)
return None
response.decode_data(decode)
return response


def _pycurl_is_live_page(url):
"Send a basic HTTP HEAD request with pycurl."
# Initialize pycurl object
@@ -254,12 +292,12 @@ def buffered_downloads(bufferlist, download_threads, decode=True):
yield future_to_url[future], future.result()


def _send_pycurl_request(url, no_ssl, config):
def _send_pycurl_request(url, no_ssl, with_headers, config):
'''Experimental function using libcurl and pycurl to speed up downloads'''
# https://github.com/pycurl/pycurl/blob/master/examples/retriever-multi.py

# init
# headerbytes = BytesIO()
headerbytes = BytesIO()
headers = _determine_headers(config)
headerlist = ['Accept-Encoding: gzip, deflate', 'Accept: */*']
for header, content in headers.items():
@@ -277,15 +315,18 @@ def _send_pycurl_request(url, no_ssl, config):
curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS'))
curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.MAXFILESIZE, config.getint('DEFAULT', 'MAX_FILE_SIZE'))
curl.setopt(pycurl.NOSIGNAL, 1)

if no_ssl is True:
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
else:
curl.setopt(pycurl.CAINFO, certifi.where())
curl.setopt(pycurl.MAXFILESIZE, config.getint('DEFAULT', 'MAX_FILE_SIZE'))
#curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)
#curl.setopt(pycurl.WRITEDATA, bufferbytes)

if with_headers:
curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)

# TCP_FASTOPEN
# curl.setopt(pycurl.FAILONERROR, 1)
# curl.setopt(pycurl.ACCEPT_ENCODING, '')
@@ -301,28 +342,29 @@ def _send_pycurl_request(url, no_ssl, config):
# additional error codes: 80, 90, 96, 98
if no_ssl is False and err.args[0] in (35, 54, 58, 59, 60, 64, 66, 77, 82, 83, 91):
LOGGER.debug('retrying after SSL error: %s %s', url, err)
return _send_pycurl_request(url, True, config)
return _send_pycurl_request(url, True, with_headers, config)
# traceback.print_exc(file=sys.stderr)
# sys.stderr.flush()
return None

# https://github.com/pycurl/pycurl/blob/master/examples/quickstart/response_headers.py
#respheaders = dict()
#for header_line in headerbytes.getvalue().decode('iso-8859-1').splitlines(): # re.split(r'\r?\n',
# # This will botch headers that are split on multiple lines...
# if ':' not in header_line:
# continue
# # Break the header line into header name and value.
# name, value = header_line.split(':', 1)
# # Now we can actually record the header name and value.
# respheaders[name.strip()] = value.strip() # name.strip().lower() ## TODO: check
# status
respcode = curl.getinfo(curl.RESPONSE_CODE)
# url
effective_url = curl.getinfo(curl.EFFECTIVE_URL)
# additional info
# ip_info = curl.getinfo(curl.PRIMARY_IP)

# tidy up
resp = Response(bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL))
curl.close()
return RawResponse(bufferbytes, respcode, effective_url)

if with_headers:
respheaders = {}
# https://github.com/pycurl/pycurl/blob/master/examples/quickstart/response_headers.py
for line in headerbytes.getvalue().decode("iso-8859-1", errors="replace").splitlines():
# re.split(r'\r?\n') ?
# This will botch headers that are split on multiple lines...
if ':' not in line:
continue
# Break the header line into header name and value.
name, value = line.split(':', 1)
# Now we can actually record the header name and value.
respheaders[name.strip()] = value.strip() # name.strip().lower() ?
resp.store_headers(respheaders)

return resp
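
For callers that used fetch_url(url, decode=False) to obtain a raw response, the amended function now emits a PendingDeprecationWarning and delegates to fetch_response(). A small sketch of the migration path, with a placeholder URL:

import warnings
from trafilatura.downloads import fetch_response, fetch_url

# Legacy call: still returns a Response object (or None after the size checks
# in _handle_response), but now triggers a PendingDeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    raw = fetch_url("https://example.org", decode=False)  # placeholder URL
    assert any(issubclass(w.category, PendingDeprecationWarning) for w in caught)

# Preferred call: ask for the Response object directly, optionally with headers.
resp = fetch_response("https://example.org", with_headers=True)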
15 changes: 9 additions & 6 deletions trafilatura/utils.py
@@ -9,6 +9,7 @@
# import csv
import logging
import re
import warnings

# if brotli is installed
try:
Expand Down Expand Up @@ -129,17 +130,19 @@ def detect_encoding(bytesobject):
return [g for g in guesses if g not in UNICODE_ALIASES]


def decode_response(response):
def decode_response(content):
"""Read the urllib3 object corresponding to the server response,
check if it could be GZip and eventually decompress it, then
try to guess its encoding and decode it to return a unicode string"""
# urllib3 response object / bytes switch
resp_content = response if isinstance(response, bytes) else response.data
return decode_file(resp_content)
warnings.warn(
"decode_response() will be deprecated, use decode_file() on the content.",
PendingDeprecationWarning
)
return decode_file(content)


def decode_file(filecontent):
"""Guess bytestring encoding and try to decode to Unicode string.
"""Check if the bytestring could be GZip and eventually decompress it,
guess bytestring encoding and try to decode to Unicode string.
Resort to destructive conversion otherwise."""
# init
if isinstance(filecontent, str):
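
On the utils.py side, decode_response() is kept only as a deprecated wrapper and decode_file() now handles compressed bytestrings directly. A short sketch mirroring the updated tests:

import gzip
from trafilatura.utils import decode_file

html_string = "<html><head/><body><div>ABC</div></body></html>"
gz_bytes = gzip.compress(html_string.encode("utf-8"))

# decode_file() detects the GZip payload, decompresses it and decodes it to str.
assert decode_file(gz_bytes) == html_string
# Plain bytestrings are decoded after encoding detection.
assert decode_file(b"<p>ABC</p>") == "<p>ABC</p>"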
