From 7bb84661cbc488586b7ec5d72a4c80d127de8a20 Mon Sep 17 00:00:00 2001
From: Roland Steinegger
Date: Sun, 12 Jan 2025 11:35:24 +0100
Subject: [PATCH] Avoids spaces after inline code

---
 trafilatura/xml.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index 141863a7..0b8a5db9 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -35,6 +35,7 @@ CONTROL_PARSER = XMLParser(remove_blank_text=True)
 NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
+SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref', 'code'}
 WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
 NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}