add advanced fetch_response() and amend fetch_url() (#479)
* introduce advanced download function (scaffolding)

* add headers to class and simplify code

* add support for response headers

* refine class

* clean code
adbar authored Jan 19, 2024
1 parent 85cd3d8 commit eec05b2
Showing 3 changed files with 116 additions and 69 deletions.
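
The three diffs below add a small Response class and a keyword-only fetch_response() helper, and route the amended fetch_url() through it. A minimal usage sketch based on the signatures introduced in this commit; the URL is only illustrative and network access is assumed:

from trafilatura.downloads import fetch_response, fetch_url

resp = fetch_response(
    "https://example.org",  # placeholder URL
    decode=True,            # fill resp.html with the decoded document
    with_headers=True,      # fill resp.headers with the response headers
)
if resp is not None:
    print(resp.status)      # HTTP status code
    print(resp.url)         # effective URL after redirects
    print(resp.headers)     # only populated when with_headers=True
    print(resp.html)        # only populated when decode=True
    print(len(resp.data))   # raw body is always kept as bytes

# fetch_url() keeps its previous behaviour and returns a decoded string
# (or None) by default.
html = fetch_url("https://example.org")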
38 changes: 20 additions & 18 deletions tests/downloads_tests.py
@@ -19,7 +19,7 @@

import gzip
from time import sleep
from unittest.mock import Mock, patch
from unittest.mock import patch

from courlan import UrlStore

@@ -28,15 +28,15 @@
url_processing_pipeline)
from trafilatura.core import extract
import trafilatura.downloads
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT,
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT, Response,
_determine_headers, _handle_response,
_parse_config, _pycurl_is_live_page,
_send_pycurl_request, _send_request,
_send_pycurl_request, _send_urllib_request,
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.utils import decode_response, load_html
from trafilatura.utils import decode_file, decode_response, load_html

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

@@ -59,7 +59,7 @@ def _reset_downloads_global_objects():
def test_fetch():
'''Test URL fetching.'''
# logic: empty request?
assert _send_request('', True, DEFAULT_CONFIG) is None
assert _send_urllib_request('', True, False, DEFAULT_CONFIG) is None

# is_live general tests
assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
@@ -74,23 +74,26 @@ def test_fetch():
assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
# assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# assert _send_urllib_request('https://expired.badssl.com/', False, False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
assert _send_pycurl_request('https://expired.badssl.com/', False, False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
url = 'https://httpbun.com/status/200'
for no_ssl in (True, False):
response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
response = _send_urllib_request('https://httpbun.com/status/200', no_ssl, True, DEFAULT_CONFIG)
assert response.data == b''
assert response.headers["X-Powered-By"].startswith("httpbun")
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, True, DEFAULT_CONFIG)
assert response1.headers["x-powered-by"].startswith("httpbun")
assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
# response object
# too large response object
response = Mock()
response.url = 'https://httpbin.org/encoding/utf8'
response.status = 200
data = ""
status = 200
url = 'https://httpbin.org/encoding/utf8'
response = Response(data, status, url)
# too large
response.data = b'ABC'*10000000
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
@@ -115,7 +118,7 @@ def test_fetch():
assert res is None
# Also test max redir implementation on pycurl if available
if pycurl is not None:
assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None
assert _send_pycurl_request('http://httpbin.org/redirect/1', True, False, new_config) is None
_reset_downloads_global_objects() # reset global objects again to avoid affecting other tests

def test_config():
@@ -141,17 +144,16 @@ def test_config():
def test_decode():
'''Test how responses are being decoded.'''
# response type
mock = Mock()
mock.data = b' '
assert decode_response(mock) is not None
data = b" "
assert decode_file(data) is not None
# GZip
html_string = "<html><head/><body><div>ABC</div></body></html>"
gz_string = gzip.compress(html_string.encode("utf-8"))
assert decode_response(gz_string) == html_string
assert decode_response(gz_string) == html_string == decode_file(gz_string)
# Brotli
if brotli is not None:
brotli_string = brotli.compress(html_string.encode("utf-8"))
assert decode_response(brotli_string) == html_string
assert decode_file(brotli_string) == html_string


def test_queue():
132 changes: 87 additions & 45 deletions trafilatura/downloads.py
@@ -6,11 +6,14 @@

import logging
import random
from collections import namedtuple
import warnings

from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from time import sleep

import certifi
import urllib3

try:
import pycurl
@@ -24,7 +27,6 @@
except ImportError:
pycurl = None

import urllib3
from courlan import UrlStore
from courlan.network import redirection_test

@@ -35,12 +37,11 @@


from .settings import DEFAULT_CONFIG
from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
uniquify_list)
from .utils import (URL_BLACKLIST_REGEX, decode_file,
make_chunks, uniquify_list)


LOGGER = logging.getLogger(__name__)
PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50

@@ -50,10 +51,30 @@
RETRY_STRATEGY = None

DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
USER_AGENT = 'trafilatura/' + version("trafilatura") + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT

RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])

class Response:
"Store information gathered in a HTTP response object."
__slots__ = ["data", "headers", "html", "status", "url"]

def __init__(self, data, status, url):
self.data = data
self.headers = None
self.html = None
self.status = status
self.url = url

def store_headers(self, headerdict):
"Store response headers if required."
# control or normalization here?
self.headers = headerdict

def decode_data(self, decode):
"Decode the bytestring in data and store a string in html."
if decode and self.data:
self.html = decode_file(self.data)


# caching throws an error
@@ -83,7 +104,7 @@ def _determine_headers(config, headers=None):
return headers or DEFAULT_HEADERS


def _send_request(url, no_ssl, config):
def _send_urllib_request(url, no_ssl, with_headers, config):
"Internal function to robustly send a request (SSL or not) and return its result."
# customize headers
global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
@@ -115,59 +136,76 @@ def _send_request(url, no_ssl, config):
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
except urllib3.exceptions.SSLError:
LOGGER.warning('retrying after SSLError: %s', url)
return _send_request(url, True, config)
return _send_urllib_request(url, True, with_headers, config)
except Exception as err:
LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0]
else:
# necessary for standardization
return RawResponse(response.data, response.status, response.geturl())
resp = Response(response.data, response.status, response.geturl())
if with_headers:
resp.store_headers(response.headers)
return resp
# catchall
return None


def _handle_response(url, response, decode, config):
'Internal function to run safety checks on response result.'
lentest = len(response.html or response.data or "")
if response.status != 200:
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
elif response.data is None or len(response.data) < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
elif lentest < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
LOGGER.error('too small/incorrect for URL %s', url)
# raise error instead?
elif len(response.data) > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
LOGGER.error('too large: length %s for URL %s', len(response.data), url)
elif lentest > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
LOGGER.error('too large: length %s for URL %s', lentest, url)
# raise error instead?
else:
return decode_response(response.data) if decode is True else response
return response.html if decode else response
# catchall
return None


def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
"""Fetches page using urllib3 and decodes the response.
"""Fetches page using urllib3 or pycurl and decodes the response.
Args:
url: URL of the page to fetch.
decode: Decode response instead of returning urllib3 response object (boolean).
decode: Decode response instead of returning response object (boolean).
no_ssl: Don't try to establish a secure connection (to prevent SSLError).
config: Pass configuration values for output control.
Returns:
RawResponse object: data (headers + body), status (HTML code as string) and url
Response object: data (headers + body), status (HTML code as string) and url
or None in case the result is invalid or there was a problem with the network.
"""
LOGGER.debug('sending request: %s', url)
if pycurl is None:
response = _send_request(url, no_ssl, config)
else:
response = _send_pycurl_request(url, no_ssl, config)
if not decode:
warnings.warn(
"""Raw response objects will be deprecated for fetch_url,
use fetch_response instead.""",
PendingDeprecationWarning
)
response = fetch_response(url, decode=decode, no_ssl=no_ssl, config=config)
if response is not None and response != '':
return _handle_response(url, response, decode, config)
# return '' (useful do discard further processing?)
# return response
LOGGER.debug('request failed: %s', url)
return None


def fetch_response(url, *, decode=False, no_ssl=False, with_headers=False, config=DEFAULT_CONFIG):
"Fetches page using urllib3 or pycurl and returns a raw response object."
dl_function = _send_urllib_request if pycurl is None else _send_pycurl_request
LOGGER.debug('sending request: %s', url)
response = dl_function(url, no_ssl, with_headers, config) # Response
if not response: # None or ""
LOGGER.debug('request failed: %s', url)
return None
response.decode_data(decode)
return response


def _pycurl_is_live_page(url):
"Send a basic HTTP HEAD request with pycurl."
# Initialize pycurl object
@@ -254,12 +292,12 @@ def buffered_downloads(bufferlist, download_threads, decode=True):
yield future_to_url[future], future.result()


def _send_pycurl_request(url, no_ssl, config):
def _send_pycurl_request(url, no_ssl, with_headers, config):
'''Experimental function using libcurl and pycurl to speed up downloads'''
# https://github.com/pycurl/pycurl/blob/master/examples/retriever-multi.py

# init
# headerbytes = BytesIO()
headerbytes = BytesIO()
headers = _determine_headers(config)
headerlist = ['Accept-Encoding: gzip, deflate', 'Accept: */*']
for header, content in headers.items():
@@ -277,15 +315,18 @@ def _send_pycurl_request(url, no_ssl, config):
curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS'))
curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.MAXFILESIZE, config.getint('DEFAULT', 'MAX_FILE_SIZE'))
curl.setopt(pycurl.NOSIGNAL, 1)

if no_ssl is True:
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
else:
curl.setopt(pycurl.CAINFO, certifi.where())
curl.setopt(pycurl.MAXFILESIZE, config.getint('DEFAULT', 'MAX_FILE_SIZE'))
#curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)
#curl.setopt(pycurl.WRITEDATA, bufferbytes)

if with_headers:
curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)

# TCP_FASTOPEN
# curl.setopt(pycurl.FAILONERROR, 1)
# curl.setopt(pycurl.ACCEPT_ENCODING, '')
@@ -301,28 +342,29 @@ def _send_pycurl_request(url, no_ssl, config):
# additional error codes: 80, 90, 96, 98
if no_ssl is False and err.args[0] in (35, 54, 58, 59, 60, 64, 66, 77, 82, 83, 91):
LOGGER.debug('retrying after SSL error: %s %s', url, err)
return _send_pycurl_request(url, True, config)
return _send_pycurl_request(url, True, with_headers, config)
# traceback.print_exc(file=sys.stderr)
# sys.stderr.flush()
return None

# https://github.com/pycurl/pycurl/blob/master/examples/quickstart/response_headers.py
#respheaders = dict()
#for header_line in headerbytes.getvalue().decode('iso-8859-1').splitlines(): # re.split(r'\r?\n',
# # This will botch headers that are split on multiple lines...
# if ':' not in header_line:
# continue
# # Break the header line into header name and value.
# name, value = header_line.split(':', 1)
# # Now we can actually record the header name and value.
# respheaders[name.strip()] = value.strip() # name.strip().lower() ## TODO: check
# status
respcode = curl.getinfo(curl.RESPONSE_CODE)
# url
effective_url = curl.getinfo(curl.EFFECTIVE_URL)
# additional info
# ip_info = curl.getinfo(curl.PRIMARY_IP)

# tidy up
resp = Response(bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL))
curl.close()
return RawResponse(bufferbytes, respcode, effective_url)

if with_headers:
respheaders = {}
# https://github.com/pycurl/pycurl/blob/master/examples/quickstart/response_headers.py
for line in headerbytes.getvalue().decode("iso-8859-1", errors="replace").splitlines():
# re.split(r'\r?\n') ?
# This will botch headers that are split on multiple lines...
if ':' not in line:
continue
# Break the header line into header name and value.
name, value = line.split(':', 1)
# Now we can actually record the header name and value.
respheaders[name.strip()] = value.strip() # name.strip().lower() ?
resp.store_headers(respheaders)

return resp
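
For callers that used fetch_url(url, decode=False) to obtain a raw response, the amended function now emits a PendingDeprecationWarning and delegates to fetch_response(). A small sketch of the migration path, with a placeholder URL:

import warnings
from trafilatura.downloads import fetch_response, fetch_url

# Legacy call: still returns a Response object (or None after the size checks
# in _handle_response), but now triggers a PendingDeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    raw = fetch_url("https://example.org", decode=False)  # placeholder URL
    assert any(issubclass(w.category, PendingDeprecationWarning) for w in caught)

# Preferred call: ask for the Response object directly, optionally with headers.
resp = fetch_response("https://example.org", with_headers=True)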
15 changes: 9 additions & 6 deletions trafilatura/utils.py
@@ -9,6 +9,7 @@
# import csv
import logging
import re
import warnings

# if brotli is installed
try:
Expand Down Expand Up @@ -129,17 +130,19 @@ def detect_encoding(bytesobject):
return [g for g in guesses if g not in UNICODE_ALIASES]


def decode_response(response):
def decode_response(content):
"""Read the urllib3 object corresponding to the server response,
check if it could be GZip and eventually decompress it, then
try to guess its encoding and decode it to return a unicode string"""
# urllib3 response object / bytes switch
resp_content = response if isinstance(response, bytes) else response.data
return decode_file(resp_content)
warnings.warn(
"decode_response() will be deprecated, use decode_file() on the content.",
PendingDeprecationWarning
)
return decode_file(content)


def decode_file(filecontent):
"""Guess bytestring encoding and try to decode to Unicode string.
"""Check if the bytestring could be GZip and eventually decompress it,
guess bytestring encoding and try to decode to Unicode string.
Resort to destructive conversion otherwise."""
# init
if isinstance(filecontent, str):
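
On the utils.py side, decode_response() is kept only as a deprecated wrapper and decode_file() now handles compressed bytestrings directly. A short sketch mirroring the updated tests:

import gzip
from trafilatura.utils import decode_file

html_string = "<html><head/><body><div>ABC</div></body></html>"
gz_bytes = gzip.compress(html_string.encode("utf-8"))

# decode_file() detects the GZip payload, decompresses it and decodes it to str.
assert decode_file(gz_bytes) == html_string
# Plain bytestrings are decoded after encoding detection.
assert decode_file(b"<p>ABC</p>") == "<p>ABC</p>"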
