Add preserve_space option to keep original whitespace and indentation in code blocks

adbar · Dec 24, 2024 · eed8c0b
1 parent 91c567c
commit eed8c0b
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 4 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -442,6 +442,26 @@ def test_formatting():
     my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
     assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result
 
+    my_document = html.fromstring("""
+    <html><head><body><article>python code below:
+    ```python
+def test:
+    print('hello')
+    print('world')
+    ```
+    </article></body></html> 
+    """)
+    my_result = extract(my_document, output_format='markdown', include_formatting=True)
+    assert "python code below:\n```python\ndef test:\nprint('hello')\nprint('world')\n```" == my_result
+
+    my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True)
+    assert """python code below:
+    ```python
+def test:
+    print('hello')
+    print('world')
+    ```""" == my_result
+
 
 def test_extract_with_metadata():
     '''Test extract_with_metadata method'''

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -91,9 +91,10 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
             header += "---\n"
         else:
             header = ""
-        returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
+        returnstring = f"{header}{xmltotxt(document.body, options.formatting, options.preserve_space)}"
         if document.commentsbody is not None:
-            returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
+            returnstring = \
+                f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting, options.preserve_space)}".strip()
     # normalize Unicode format (defaults to NFC)
     return normalize_unicode(returnstring)
 
@@ -140,6 +141,7 @@ def bare_extraction(
     include_tables: bool = True,
     include_images: bool = False,
     include_formatting: bool = False,
+    preserve_space: bool = False,
     include_links: bool = False,
     deduplicate: bool = False,
     date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -171,6 +173,7 @@ def bare_extraction(
         include_images: Take images into account (experimental).
         include_formatting: Keep structural elements related to formatting
             (present in XML format, converted to markdown otherwise).
+        preserve_space: Preserve space when formatting text.
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
@@ -205,6 +208,7 @@ def bare_extraction(
             recall=favor_recall,
             comments=include_comments,
             formatting=include_formatting,
+            preserve_space=preserve_space,
             links=include_links,
             images=include_images,
             tables=include_tables,
@@ -361,6 +365,7 @@ def extract(
     include_tables: bool = True,
     include_images: bool = False,
     include_formatting: bool = False,
+    preserve_space: bool = False,
     include_links: bool = False,
     deduplicate: bool = False,
     date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -394,6 +399,7 @@ def extract(
         include_images: Take images into account (experimental).
         include_formatting: Keep structural elements related to formatting
             (only valuable if output_format is set to XML).
+        preserve_space: Preserve space when formatting text.
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
@@ -427,6 +433,7 @@ def extract(
         include_tables=include_tables,
         include_images=include_images,
         include_formatting=include_formatting,
+        preserve_space=preserve_space,
         include_links=include_links,
         deduplicate=deduplicate,
         date_extraction_params=date_extraction_params,
@@ -456,6 +463,7 @@ def extract_with_metadata(
     include_tables: bool = True,
     include_images: bool = False,
     include_formatting: bool = False,
+    preserve_space: bool = False,
     include_links: bool = False,
     deduplicate: bool = False,
     date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -487,6 +495,7 @@ def extract_with_metadata(
         include_images: Take images into account (experimental).
         include_formatting: Keep structural elements related to formatting
             (only valuable if output_format is set to XML).
+        preserve_space: Preserve space when formatting text.
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
@@ -515,6 +524,7 @@ def extract_with_metadata(
         include_tables=include_tables,
         include_images=include_images,
         include_formatting=include_formatting,
+        preserve_space=preserve_space,
         include_links=include_links,
         deduplicate=deduplicate,
         date_extraction_params=date_extraction_params,
@@ -564,6 +574,7 @@ def _internal_extraction(
         include_tables: bool = True,
         include_images: bool = False,
         include_formatting: bool = False,
+        preserve_space: bool = False,
         include_links: bool = False,
         deduplicate: bool = False,
         date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -590,6 +601,7 @@ def _internal_extraction(
             recall=favor_recall,
             comments=include_comments,
             formatting=include_formatting,
+            preserve_space=preserve_space,
             links=include_links,
             images=include_images,
             tables=include_tables,

diff --git a/trafilatura/settings.py b/trafilatura/settings.py
@@ -70,6 +70,7 @@ class Extractor:
         "focus",
         "comments",
         "formatting",
+        "preserve_space",
         "links",
         "images",
         "tables",
@@ -108,6 +109,7 @@ def __init__(
         recall: bool = False,
         comments: bool = True,
         formatting: bool = False,
+        preserve_space: bool = False,
         links: bool = False,
         images: bool = False,
         tables: bool = True,
@@ -131,6 +133,7 @@ def __init__(
         )
         self.comments: bool = comments
         self.formatting: bool = formatting or self.format == "markdown"
+        self.preserve_space: bool = preserve_space
         self.links: bool = links
         self.images: bool = images
         self.tables: bool = tables

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -361,7 +361,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
         returnlist.append(element.tail)
 
 
-def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
+def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool, preserve_space: bool = False) -> str:
     "Convert to plain text format and optionally preserve formatting as markdown."
     if xmloutput is None:
         return ""
@@ -370,7 +370,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
 
     process_element(xmloutput, returnlist, include_formatting)
 
-    return unescape(sanitize("".join(returnlist)) or "")
+    return unescape(sanitize("".join(returnlist), preserve_space) or "")
 
 
 def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str: