apply HTML fix and accept LXML v5+ (#485)

* apply HTML fix and accept LXML v5+
* format code
adbar · Jan 23, 2024 · 02c8342 · 02c8342
1 parent c703271
commit 02c8342
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 16 deletions.
diff --git a/setup.py b/setup.py
@@ -117,7 +117,7 @@ def get_long_description():
         "justext >= 3.0.0",
         # see tests on Github Actions
         "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
-        "lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'",
+        "lxml >= 4.9.4, < 6; platform_system != 'Darwin' or python_version > '3.8'",
         "urllib3 >= 1.26, < 2; python_version < '3.7'",
         "urllib3 >= 1.26, < 3; python_version >= '3.7'",
     ],

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -114,13 +114,27 @@ def test_trim():
 
 def test_input():
     '''test if loaded strings/trees are handled properly'''
-    assert utils.is_dubious_html('This is a string.') is True
-    htmlstring = "<!DOCTYPE html PUBLIC />\n<html/>"
+    assert utils.is_dubious_html("This is a string.") is True
+
+    htmlstring = "<!DOCTYPE html PUBLIC />\n<html></html>"
     beginning = htmlstring[:50].lower()
-    assert utils.strip_faulty_doctypes(htmlstring, beginning) == "\n<html/>"
+    assert utils.repair_faulty_html(htmlstring, beginning) == "\n<html></html>"
+
     htmlstring = "<html>\n</html>"
     beginning = htmlstring[:50].lower()
-    assert utils.strip_faulty_doctypes(htmlstring, beginning) == htmlstring
+    assert utils.repair_faulty_html(htmlstring, beginning) == htmlstring
+
+    htmlstring = "<html/>\n</html>"
+    beginning = htmlstring[:50].lower()
+    assert utils.repair_faulty_html(htmlstring, beginning) == "<html>\n</html>"
+
+    htmlstring = '<!DOCTYPE html>\n<html lang="en-US"/>\n<head/>\n<body/>\n</html>'
+    beginning = htmlstring[:50].lower()
+    assert (
+        utils.repair_faulty_html(htmlstring, beginning)
+        == '<!DOCTYPE html>\n<html lang="en-US">\n<head/>\n<body/>\n</html>'
+    )
+
     with pytest.raises(TypeError) as err:
         assert utils.load_html(123) is None
     assert 'incompatible' in str(err.value)

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -39,6 +39,7 @@
 UNICODE_ALIASES = {'utf-8', 'utf_8'}
 
 DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
+FAULTY_HTML = re.compile(r"(<html.*?)\s*/>", re.I)
 
 # note: htmldate could use HTML comments
 # huge_tree=True, remove_blank_text=True
@@ -168,22 +169,29 @@ def is_dubious_html(beginning: str) -> bool:
     return "html" not in beginning
 
 
-def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str:
-    "Repair faulty doctype strings to make then palatable for libxml2."
+def repair_faulty_html(htmlstring: str, beginning: str) -> str:
+    "Repair faulty HTML strings to make them palatable for libxml2."
     # libxml2/LXML issue: https://bugs.launchpad.net/lxml/+bug/1955915
     if "doctype" in beginning:
         firstline, _, rest = htmlstring.partition("\n")
-        return DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
+        htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
+    # other issue with malformed documents: check the first four lines at most
+    for i, line in enumerate(iter(htmlstring.splitlines())):
+        if "<html" in line and line.endswith("/>"):
+            htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1)
+            break
+        if i > 2:
+            break
     return htmlstring
 
 
 def fromstring_bytes(htmlobject):
     "Try to pass bytes to LXML parser."
     tree = None
     try:
-        tree = fromstring(htmlobject.encode('utf8', 'surrogatepass'), parser=HTML_PARSER)
+        tree = fromstring(htmlobject.encode("utf8", "surrogatepass"), parser=HTML_PARSER)
     except Exception as err:
-        LOGGER.error('lxml parser bytestring %s', err)
+        LOGGER.error("lxml parser bytestring %s", err)
     return tree
 
 
@@ -195,11 +203,11 @@ def load_html(htmlobject):
     if isinstance(htmlobject, HtmlElement):
         return htmlobject
     # use trafilatura or urllib3 responses directly
-    if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, 'data'):
+    if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, "data"):
         htmlobject = htmlobject.data
     # do not accept any other type after this point
     if not isinstance(htmlobject, (bytes, str)):
-        raise TypeError('incompatible input type', type(htmlobject))
+        raise TypeError("incompatible input type", type(htmlobject))
     # start processing
     tree = None
     # try to guess encoding and decode file: if None then keep original
@@ -208,7 +216,7 @@ def load_html(htmlobject):
     beginning = htmlobject[:50].lower()
     check_flag = is_dubious_html(beginning)
     # repair first
-    htmlobject = strip_faulty_doctypes(htmlobject, beginning)
+    htmlobject = repair_faulty_html(htmlobject, beginning)
     # first pass: use Unicode string
     fallback_parse = False
     try:
@@ -217,15 +225,17 @@ def load_html(htmlobject):
         # "Unicode strings with encoding declaration are not supported."
         tree = fromstring_bytes(htmlobject)
         fallback_parse = True
-    except Exception as err:
-        LOGGER.error('lxml parsing failed: %s', err)
+    except Exception as err:  # pragma: no cover
+        LOGGER.error("lxml parsing failed: %s", err)
     # second pass: try passing bytes to LXML
     if (tree is None or len(tree) < 1) and not fallback_parse:
         tree = fromstring_bytes(htmlobject)
     # rejection test: is it (well-formed) HTML at all?
     # log parsing errors
     if tree is not None and check_flag is True and len(tree) < 2:
-        LOGGER.error('parsed tree length: %s, wrong data type or not valid HTML', len(tree))
+        LOGGER.error(
+            "parsed tree length: %s, wrong data type or not valid HTML", len(tree)
+        )
         tree = None
     return tree