Skip to content

Commit

Permalink
simplify trim() function (#727)
Browse files Browse the repository at this point in the history
* simplify trim() function

* use walrus
  • Loading branch information
adbar authored Oct 23, 2024
1 parent 2977af5 commit 52d21a6
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 32 deletions.
12 changes: 6 additions & 6 deletions trafilatura/baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,16 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
for elem in tree.iterfind('.//script[@type="application/ld+json"]'):
if elem.text and 'articleBody' in elem.text:
try:
json_body = json.loads(elem.text).get("articleBody")
json_body = json.loads(elem.text).get("articleBody", "")
except Exception: # JSONDecodeError or 'list' object has no attribute 'get'
json_body = ""
if json_body:
if "<p>" in json_body:
parsed = load_html(json_body)
text = parsed.text_content() if parsed is not None else ""
text = trim(parsed.text_content()) if parsed is not None else ""
else:
text = json_body
SubElement(postbody, 'p').text = trim(text)
text = trim(json_body)
SubElement(postbody, 'p').text = text
temp_text += " " + text if temp_text else text
# return postbody, elem.text, len(elem.text)
if len(temp_text) > 100:
Expand All @@ -63,7 +63,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
# scrape from article tag
temp_text = ""
for article_elem in tree.iterfind('.//article'):
text = trim(article_elem.text_content()) or ""
text = trim(article_elem.text_content())
if len(text) > 100:
SubElement(postbody, 'p').text = text
temp_text += " " + text if temp_text else text
Expand All @@ -76,7 +76,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
temp_text = ""
# postbody = Element('body')
for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
entry = trim(element.text_content()) or ""
entry = trim(element.text_content())
if entry not in results:
SubElement(postbody, 'p').text = entry
temp_text += " " + entry if temp_text else entry
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def put_in_cache(teststring: str) -> None:

def duplicate_test(element: _Element, options: Any) -> bool:
"Check for duplicate text with LRU cache."
teststring = trim(" ".join(element.itertext())) or ""
teststring = trim(" ".join(element.itertext()))
# teststring = element.text
if len(teststring) > options.min_duplcheck_size:
# retrieve value from cache
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Eleme
# try with readability
temppost_algo = try_readability(backup_tree)
# unicode fix necessary on certain systems (#331)
algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8')) or ""
algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8'))
len_algo = len(algo_text)

# compare
Expand Down Expand Up @@ -155,7 +155,7 @@ def justext_rescue(tree: HtmlElement, options: Any) -> Tuple[_Element, str, int]
tree = basic_cleaning(tree)
# proceed
temppost_algo = try_justext(tree, options.url, options.lang)
temp_text = trim(' '.join(temppost_algo.itertext())) or ""
temp_text = trim(' '.join(temppost_algo.itertext()))
return temppost_algo, temp_text, len(temp_text)


Expand Down Expand Up @@ -185,5 +185,5 @@ def sanitize_tree(tree: HtmlElement, options: Any) -> Tuple[HtmlElement, str, in
]
strip_tags(cleaned_tree, *sanitization_list)
# 4. return
text = trim(' '.join(cleaned_tree.itertext())) or ""
text = trim(' '.join(cleaned_tree.itertext()))
return cleaned_tree, text, len(text)
14 changes: 7 additions & 7 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def link_density_test(
# shortcut
if len(links_xpath) == 1:
len_threshold = 10 if favor_precision else 100
link_text = trim(links_xpath[0].text_content()) or ""
link_text = trim(links_xpath[0].text_content())
if len(link_text) > len_threshold and len(link_text) > len(text) * 0.9:
return True, []
if element.tag == "p":
Expand Down Expand Up @@ -176,7 +176,7 @@ def link_density_test_tables(element: HtmlElement) -> bool:
if not links_xpath:
return False

elemlen = len(trim(element.text_content()) or "")
elemlen = len(trim(element.text_content()))
if elemlen < 200:
return False

Expand All @@ -201,7 +201,7 @@ def delete_by_link_density(
depth_threshold = 1 if favor_precision else 3

for elem in subtree.iter(tagname):
elemtext = trim(elem.text_content()) or ""
elemtext = trim(elem.text_content())
result, templist = link_density_test(elem, elemtext, favor_precision)
if result or (
backtracking
Expand Down Expand Up @@ -232,7 +232,7 @@ def handle_textnode(
# lb bypass
if not comments_fix and elem.tag == "lb":
if not preserve_spaces:
elem.tail = trim(elem.tail)
elem.tail = trim(elem.tail) or None
# if textfilter(elem) is True:
# return None
# duplicate_test(subelement)?
Expand All @@ -248,9 +248,9 @@ def handle_textnode(

# trim
if not preserve_spaces:
elem.text = trim(elem.text)
elem.text = trim(elem.text) or None
if elem.tail:
elem.tail = trim(elem.tail)
elem.tail = trim(elem.tail) or None

# filter content
# or not re.search(r'\w', element.text): # text_content()?
Expand All @@ -269,7 +269,7 @@ def process_node(elem: _Element, options: Extractor) -> Optional[_Element]:
return None

# trim
elem.text, elem.tail = trim(elem.text), trim(elem.tail)
elem.text, elem.tail = trim(elem.text) or None, trim(elem.tail) or None

# adapt content string
if elem.tag != "lb" and not elem.text and elem.tail:
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/json_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def normalize_json(string: str) -> str:
string = JSON_UNICODE_REPLACE.sub(lambda match: chr(int(match[1], 16)), string)
string = ''.join(c for c in string if ord(c) < 0xD800 or ord(c) > 0xDFFF)
string = unescape(string)
return trim(JSON_REMOVE_HTML.sub('', string)) or ""
return trim(JSON_REMOVE_HTML.sub('', string))


def normalize_authors(current_authors: Optional[str], author_string: str) -> Optional[str]:
Expand All @@ -240,13 +240,13 @@ def normalize_authors(current_authors: Optional[str], author_string: str) -> Opt
author_string = HTML_STRIP_TAGS.sub('', author_string)
# examine names
for author in AUTHOR_SPLIT.split(author_string):
author = trim(author) or ""
author = trim(author)
# remove emoji
author = AUTHOR_EMOJI_REMOVE.sub('', author)
# remove @username
author = AUTHOR_TWITTER.sub('', author)
# replace special characters with space
author = trim(AUTHOR_REPLACE_JOIN.sub(' ', author)) or ""
author = trim(AUTHOR_REPLACE_JOIN.sub(' ', author))
author = AUTHOR_REMOVE_NICKNAME.sub('', author)
# remove special characters
author = AUTHOR_REMOVE_SPECIAL.sub('', author)
Expand Down
6 changes: 2 additions & 4 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,10 +340,8 @@ def examine_title_element(
title_element = tree.find(".//head//title")
if title_element is not None:
title = trim(title_element.text_content())
if title:
match = HTMLTITLE_REGEX.match(title)
if match:
return title, match[1], match[2]
if match := HTMLTITLE_REGEX.match(title):
return title, match[1], match[2]
LOGGER.debug("no main title found")
return title, None, None

Expand Down
4 changes: 2 additions & 2 deletions trafilatura/readability_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _tostring(string: HtmlElement) -> str:

def text_length(elem: HtmlElement) -> int:
    """Return the length of the element's whitespace-normalized text content.

    The content is first passed through trim(), so runs of whitespace count
    as a single character and leading/trailing whitespace is ignored.
    """
    return len(trim(elem.text_content()))


class Candidate:
Expand Down Expand Up @@ -232,7 +232,7 @@ def score_paragraphs(self) -> Dict[HtmlElement, Candidate]:
continue
grand_parent_node = parent_node.getparent()

elem_text = trim(elem.text_content()) or ""
elem_text = trim(elem.text_content())
elem_text_len = len(elem_text)

# discard too short paragraphs
Expand Down
10 changes: 4 additions & 6 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,15 +337,13 @@ def sanitize_tree(tree: _Element) -> _Element:


@lru_cache(maxsize=1024)
def trim(string: str) -> str:
    """Remove unnecessary spaces within a text string.

    Collapses every run of whitespace (spaces, tabs, newlines) to a single
    space and drops leading/trailing whitespace. Non-string input (e.g.
    None) yields "" instead of raising, so callers can use the result
    directly without a fallback.
    """
    try:
        # str.split() with no argument already discards leading/trailing
        # whitespace, so a trailing .strip() would be redundant. This is
        # faster than the regex-based LINES_TRIMMING substitution.
        return " ".join(string.split())
    except (AttributeError, TypeError):
        # string was None or not a str-like object
        return ""


def is_image_file(imagesrc: Optional[str]) -> bool:
Expand Down

0 comments on commit 52d21a6

Please sign in to comment.