Skip to content

Commit

Permalink
apply HTML fix and accept LXML v5+ (#485)
Browse files Browse the repository at this point in the history
* apply HTML fix and accept LXML v5+

* format code
  • Loading branch information
adbar authored Jan 23, 2024
1 parent c703271 commit 02c8342
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 16 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def get_long_description():
"justext >= 3.0.0",
# see tests on Github Actions
"lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'",
"lxml >= 4.9.4, < 6; platform_system != 'Darwin' or python_version > '3.8'",
"urllib3 >= 1.26, < 2; python_version < '3.7'",
"urllib3 >= 1.26, < 3; python_version >= '3.7'",
],
Expand Down
22 changes: 18 additions & 4 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,27 @@ def test_trim():

def test_input():
'''test if loaded strings/trees are handled properly'''
assert utils.is_dubious_html('This is a string.') is True
htmlstring = "<!DOCTYPE html PUBLIC />\n<html/>"
assert utils.is_dubious_html("This is a string.") is True

htmlstring = "<!DOCTYPE html PUBLIC />\n<html></html>"
beginning = htmlstring[:50].lower()
assert utils.strip_faulty_doctypes(htmlstring, beginning) == "\n<html/>"
assert utils.repair_faulty_html(htmlstring, beginning) == "\n<html></html>"

htmlstring = "<html>\n</html>"
beginning = htmlstring[:50].lower()
assert utils.strip_faulty_doctypes(htmlstring, beginning) == htmlstring
assert utils.repair_faulty_html(htmlstring, beginning) == htmlstring

htmlstring = "<html/>\n</html>"
beginning = htmlstring[:50].lower()
assert utils.repair_faulty_html(htmlstring, beginning) == "<html>\n</html>"

htmlstring = '<!DOCTYPE html>\n<html lang="en-US"/>\n<head/>\n<body/>\n</html>'
beginning = htmlstring[:50].lower()
assert (
utils.repair_faulty_html(htmlstring, beginning)
== '<!DOCTYPE html>\n<html lang="en-US">\n<head/>\n<body/>\n</html>'
)

with pytest.raises(TypeError) as err:
assert utils.load_html(123) is None
assert 'incompatible' in str(err.value)
Expand Down
32 changes: 21 additions & 11 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
UNICODE_ALIASES = {'utf-8', 'utf_8'}

# Matches, at the very start of a string, a doctype declaration up to the
# first "/>" (optional spaces allowed after "<", after "!" and before ">"),
# e.g. '<!DOCTYPE html PUBLIC />'. Case-insensitive.
DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
# Matches an "<html ...>" tag written as self-closing ("<html .../>").
# Group 1 captures the tag without the optional whitespace and the "/>",
# so substituting r"\1>" turns it into a regular opening tag. Case-insensitive.
FAULTY_HTML = re.compile(r"(<html.*?)\s*/>", re.I)

# note: htmldate could use HTML comments
# huge_tree=True, remove_blank_text=True
Expand Down Expand Up @@ -168,22 +169,29 @@ def is_dubious_html(beginning: str) -> bool:
return "html" not in beginning


def repair_faulty_html(htmlstring: str, beginning: str) -> str:
    """Repair faulty HTML strings to make them palatable for libxml2.

    Two repairs are applied in order:

    1. If ``beginning`` contains "doctype", the first line of the document
       is stripped of a doctype declaration matched by ``DOCTYPE_TAG``.
    2. If one of the first four lines contains "<html" and ends with "/>",
       the first match of ``FAULTY_HTML`` in the document is rewritten as
       a regular opening tag.

    Args:
        htmlstring: The document as text.
        beginning: Lowercased start of the document, used for the cheap
            "doctype" membership test.

    Returns:
        The repaired string, or the input unchanged if nothing applied.
    """
    # libxml2/LXML issue: https://bugs.launchpad.net/lxml/+bug/1955915
    if "doctype" in beginning:
        firstline, _, rest = htmlstring.partition("\n")
        # the newline is always re-inserted, even if the input had none
        htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
    # other issue with malformed documents: a self-closing <html .../> tag;
    # only the first four lines are inspected
    # NOTE(review): this check is case-sensitive whereas FAULTY_HTML is
    # compiled with re.I -- confirm whether "<HTML/>" should be handled too.
    for line in htmlstring.splitlines()[:4]:
        if "<html" in line and line.endswith("/>"):
            htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1)
            break
    return htmlstring


def fromstring_bytes(htmlobject):
    """Try to pass bytes to LXML parser.

    The input string is encoded as UTF-8 with the "surrogatepass" error
    handler and handed to ``fromstring`` together with the module-level
    ``HTML_PARSER``.

    Returns:
        The parsed tree, or None if encoding or parsing raised; the error
        is logged rather than propagated.
    """
    try:
        return fromstring(
            htmlobject.encode("utf8", "surrogatepass"), parser=HTML_PARSER
        )
    except Exception as err:
        # deliberate best-effort: the caller treats None as "could not parse"
        LOGGER.error("lxml parser bytestring %s", err)
    return None


Expand All @@ -195,11 +203,11 @@ def load_html(htmlobject):
if isinstance(htmlobject, HtmlElement):
return htmlobject
# use trafilatura or urllib3 responses directly
if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, 'data'):
if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, "data"):
htmlobject = htmlobject.data
# do not accept any other type after this point
if not isinstance(htmlobject, (bytes, str)):
raise TypeError('incompatible input type', type(htmlobject))
raise TypeError("incompatible input type", type(htmlobject))
# start processing
tree = None
# try to guess encoding and decode file: if None then keep original
Expand All @@ -208,7 +216,7 @@ def load_html(htmlobject):
beginning = htmlobject[:50].lower()
check_flag = is_dubious_html(beginning)
# repair first
htmlobject = strip_faulty_doctypes(htmlobject, beginning)
htmlobject = repair_faulty_html(htmlobject, beginning)
# first pass: use Unicode string
fallback_parse = False
try:
Expand All @@ -217,15 +225,17 @@ def load_html(htmlobject):
# "Unicode strings with encoding declaration are not supported."
tree = fromstring_bytes(htmlobject)
fallback_parse = True
except Exception as err:
LOGGER.error('lxml parsing failed: %s', err)
except Exception as err: # pragma: no cover
LOGGER.error("lxml parsing failed: %s", err)
# second pass: try passing bytes to LXML
if (tree is None or len(tree) < 1) and not fallback_parse:
tree = fromstring_bytes(htmlobject)
# rejection test: is it (well-formed) HTML at all?
# log parsing errors
if tree is not None and check_flag is True and len(tree) < 2:
LOGGER.error('parsed tree length: %s, wrong data type or not valid HTML', len(tree))
LOGGER.error(
"parsed tree length: %s, wrong data type or not valid HTML", len(tree)
)
tree = None
return tree

Expand Down

0 comments on commit 02c8342

Please sign in to comment.