Skip to content

Commit

Permalink
add preserve_space option to keep original format in code blocks
Browse files: browse the repository at this point in the history
  • Loading branch information
CodyInnowhere authored and CodyInnowhere committed Dec 24, 2024
1 parent 91c567c commit eed8c0b
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 4 deletions.
20 changes: 20 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,26 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result

my_document = html.fromstring("""
<html><head><body><article>python code below:
```python
def test:
print('hello')
print('world')
```
</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert "python code below:\n```python\ndef test:\nprint('hello')\nprint('world')\n```" == my_result

my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True)
assert """python code below:
```python
def test:
print('hello')
print('world')
```""" == my_result


def test_extract_with_metadata():
'''Test extract_with_metadata method'''
Expand Down
16 changes: 14 additions & 2 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,10 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
header += "---\n"
else:
header = ""
returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
returnstring = f"{header}{xmltotxt(document.body, options.formatting, options.preserve_space)}"
if document.commentsbody is not None:
returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
returnstring = \
f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting, options.preserve_space)}".strip()
# normalize Unicode format (defaults to NFC)
return normalize_unicode(returnstring)

Expand Down Expand Up @@ -140,6 +141,7 @@ def bare_extraction(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -171,6 +173,7 @@ def bare_extraction(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(present in XML format, converted to markdown otherwise).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -205,6 +208,7 @@ def bare_extraction(
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
preserve_space=preserve_space,
links=include_links,
images=include_images,
tables=include_tables,
Expand Down Expand Up @@ -361,6 +365,7 @@ def extract(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -394,6 +399,7 @@ def extract(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(only valuable if output_format is set to XML).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -427,6 +433,7 @@ def extract(
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
preserve_space=preserve_space,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
Expand Down Expand Up @@ -456,6 +463,7 @@ def extract_with_metadata(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -487,6 +495,7 @@ def extract_with_metadata(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(only valuable if output_format is set to XML).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -515,6 +524,7 @@ def extract_with_metadata(
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
preserve_space=preserve_space,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
Expand Down Expand Up @@ -564,6 +574,7 @@ def _internal_extraction(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand All @@ -590,6 +601,7 @@ def _internal_extraction(
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
preserve_space=preserve_space,
links=include_links,
images=include_images,
tables=include_tables,
Expand Down
3 changes: 3 additions & 0 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class Extractor:
"focus",
"comments",
"formatting",
"preserve_space",
"links",
"images",
"tables",
Expand Down Expand Up @@ -108,6 +109,7 @@ def __init__(
recall: bool = False,
comments: bool = True,
formatting: bool = False,
preserve_space: bool = False,
links: bool = False,
images: bool = False,
tables: bool = True,
Expand All @@ -131,6 +133,7 @@ def __init__(
)
self.comments: bool = comments
self.formatting: bool = formatting or self.format == "markdown"
self.preserve_space: bool = preserve_space
self.links: bool = links
self.images: bool = images
self.tables: bool = tables
Expand Down
4 changes: 2 additions & 2 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(element.tail)


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool, preserve_space: bool = False) -> str:
"Convert to plain text format and optionally preserve formatting as markdown."
if xmloutput is None:
return ""
Expand All @@ -370,7 +370,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:

process_element(xmloutput, returnlist, include_formatting)

return unescape(sanitize("".join(returnlist)) or "")
return unescape(sanitize("".join(returnlist), preserve_space) or "")


def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str:
Expand Down

0 comments on commit eed8c0b

Please sign in to comment.