refactoring: add type hints and review code (#723)
* refactoring: add type hints

* fix tests

* add further types

* add missing types

* fix type

* fix errors

* fix code

* fix certain types and tests

* polish
adbar authored Oct 22, 2024
1 parent 6dedfbf commit 2977af5
Showing 22 changed files with 708 additions and 397 deletions.
3 changes: 2 additions & 1 deletion tests/metadata_tests.py
@@ -271,7 +271,8 @@ def test_meta():
     assert metadata.title == 'Title'
 
     # catch errors
-    assert extract_metadata('') is None
+    metadata = extract_metadata('')
+    assert all(getattr(metadata, a) is None for a in metadata.__slots__)
     metadata = extract_metadata('<html><title></title></html>')
     assert metadata.sitename is None
     metadata = extract_metadata('<html><head><title>' + 'AAA'*10000 + '</title></head></html>')
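Note on the hunk above: after this commit, extract_metadata('') apparently returns an empty Document instead of None, so emptiness is checked attribute by attribute. A minimal sketch of the new calling pattern, assuming the package-level import used elsewhere in the test suite:

from trafilatura import extract_metadata

metadata = extract_metadata('')
# every slot of an empty Document should be unset
assert all(getattr(metadata, attr) is None for attr in metadata.__slots__)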
1 change: 1 addition & 0 deletions tests/sitemaps_tests.py
@@ -82,6 +82,7 @@ def test_extraction():
     #sitemap.handle_link(url) # (url, '0')
 
     # safety belts
+    assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', None) is False
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', b'\x1f\x8bABC') is False
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml', 'ABC') is False
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap.xml', '<!DOCTYPE html><html><body/></html>') is False
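The added safety belt suggests is_plausible_sitemap now also tolerates None as content, which fits the typing work in this commit. A usage sketch grounded directly in the assertions above:

from trafilatura import sitemaps

# content that never passed a download still yields a clean False
assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', None) is False
# plain text is rejected as before
assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml', 'ABC') is False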
5 changes: 5 additions & 0 deletions tests/unit_tests.py
@@ -741,6 +741,8 @@ def test_tei():
 
 def test_htmlprocessing():
     '''test html-related functions'''
+    assert xml.xmltotxt(None, include_formatting=False) == ""
+
     options = DEFAULT_OPTIONS
     options.tables = True
     assert trafilatura.htmlprocessing.tree_cleaning(etree.Element('html'), options) is not None
@@ -819,6 +821,7 @@ def test_extraction_options():
     assert extract(my_html, only_with_metadata=False, output_format='xml', config=ZERO_CONFIG) is not None
     assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
     assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
+    assert extract(my_html, target_language='de', no_fallback=True, config=ZERO_CONFIG) is None
     assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
     # assert extract(my_html) is None
 
@@ -1383,6 +1386,8 @@ def test_is_probably_readerable():
     """
     Test is_probably_readerable function.
     """
+    assert not is_probably_readerable("ABC")
+
     very_small_str = "hello there"
     small_str = "hello there " * 11
     large_str = "hello there " * 12
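Both additions harden entry points against degenerate input: a None tree now yields an empty string and a too-short document is declared unreadable up front. A sketch of what the new guards promise — the import paths here are assumptions based on how the test file refers to these modules:

from trafilatura import xml
from trafilatura.readability_lxml import is_probably_readerable  # path is an assumption

assert xml.xmltotxt(None, include_formatting=False) == ""  # None tree -> empty string
assert not is_probably_readerable("ABC")                   # too short to be readable content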
14 changes: 8 additions & 6 deletions trafilatura/baseline.py
@@ -8,13 +8,14 @@
 from typing import Any, Tuple
 
 from lxml.etree import _Element, Element, SubElement
+from lxml.html import HtmlElement
 
 from .settings import BASIC_CLEAN_XPATH
 from .utils import load_html, trim
 from .xml import delete_element
 
 
-def basic_cleaning(tree: _Element) -> _Element:
+def basic_cleaning(tree: HtmlElement) -> HtmlElement:
     "Remove a few section types from the document."
     for elem in BASIC_CLEAN_XPATH(tree):
         delete_element(elem)
@@ -62,7 +63,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     # scrape from article tag
     temp_text = ""
     for article_elem in tree.iterfind('.//article'):
-        text = trim(article_elem.text_content())
+        text = trim(article_elem.text_content()) or ""
         if len(text) > 100:
             SubElement(postbody, 'p').text = text
             temp_text += " " + text if temp_text else text
@@ -75,7 +76,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     temp_text = ""
     # postbody = Element('body')
     for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
-        entry = trim(element.text_content())
+        entry = trim(element.text_content()) or ""
         if entry not in results:
             SubElement(postbody, 'p').text = entry
             temp_text += " " + entry if temp_text else entry
@@ -88,10 +89,11 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     postbody = Element('body')
     body_elem = tree.find('.//body')
     if body_elem is not None:
-        elem = SubElement(postbody, 'p')
+        p_elem = SubElement(postbody, 'p')
         # todo: sanitize?
-        elem.text = '\n'.join([trim(e) for e in body_elem.itertext()])
-        return postbody, elem.text, len(elem.text)
+        text_elems = [trim(e) for e in body_elem.itertext()]
+        p_elem.text = '\n'.join([e for e in text_elems if e])
+        return postbody, p_elem.text, len(p_elem.text)
 
     # new fallback
     text = html2txt(tree, clean=False)
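The recurring trim(...) or "" pattern is the crux of this file's changes: if trim() is annotated as returning Optional[str], the or "" fallback narrows the value to a plain str before it reaches len() or string concatenation. A minimal sketch with a simplified stand-in for the real trim():

from typing import Optional

def trim(text: Optional[str]) -> Optional[str]:
    "Simplified stand-in for trafilatura.utils.trim (assumption)."
    return text.strip() if text else None

entry = trim("   ") or ""   # always a str, so len() and += are safe
assert len(entry) == 0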
5 changes: 3 additions & 2 deletions trafilatura/cli.py
@@ -16,10 +16,11 @@
     url_processing_pipeline, write_result)
 from .settings import PARALLEL_CORES, SUPPORTED_FMT_CLI
 
+
 # fix output encoding on some systems
-if sys.stdout.encoding != 'UTF-8':
+if sys.stdout.encoding != 'UTF-8' and hasattr(sys.stdout, 'reconfigure'):
     sys.stdout.reconfigure(encoding='utf-8')
-if sys.stderr.encoding != 'UTF-8':
+if sys.stderr.encoding != 'UTF-8' and hasattr(sys.stderr, 'reconfigure'):
     sys.stderr.reconfigure(encoding='utf-8')
 
 
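The hasattr guard matters because sys.stdout is not always a regular io.TextIOWrapper: when the stream is replaced, for instance by a test runner's capture object, the substitute may lack reconfigure(). A quick demonstration of the distinction:

import io
import sys

fake_stdout = io.StringIO()                 # e.g. a capture object swapped in by a test runner
print(hasattr(fake_stdout, 'reconfigure'))  # False: StringIO cannot change its encoding
print(hasattr(sys.stdout, 'reconfigure'))   # True for a standard TextIOWrapper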
6 changes: 3 additions & 3 deletions trafilatura/cli_utils.py
@@ -29,7 +29,7 @@
 from .baseline import html2txt
 from .core import extract
 from .deduplication import generate_bow_hash
-from .downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .downloads import Response, add_to_compressed_dict, buffered_downloads, load_download_buffer
 from .feeds import find_feed_urls
 from .meta import reset_caches
 from .settings import (
@@ -272,7 +272,7 @@ def download_queue_processing(
         bufferlist, args.parallel, options=options
     ):
         # handle result
-        if result:
+        if result and isinstance(result, str):
             options.url = url
             counter = process_result(result, args, counter, options)
         else:
@@ -380,7 +380,7 @@ def cli_crawler(
     for url, result in buffered_downloads(
         bufferlist, args.parallel, decode=False, options=options
     ):
-        if result is not None:
+        if result and isinstance(result, Response):
             spider.process_response(result, param_dict[get_base_url(url)])
             # early exit if maximum count is reached
             if any(c >= n for c in spider.URL_STORE.get_all_counts()):
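Both call sites consume buffered_downloads(), which appears to yield decoded str pages by default and Response objects when decode=False; the isinstance checks narrow that union so each consumer only handles the shape it expects. A sketch of the dispatch logic, with Response standing in for trafilatura.downloads.Response:

from typing import Optional, Union

class Response:
    "Stand-in for trafilatura.downloads.Response (assumption)."

def handle(result: Optional[Union[str, Response]]) -> str:
    if result and isinstance(result, str):
        return "process decoded page"       # download_queue_processing path
    if result and isinstance(result, Response):
        return "feed response to crawler"   # cli_crawler path
    return "log failure and skip"

assert handle("<html></html>") == "process decoded page"
assert handle(None) == "log failure and skip"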

