From 52d21a68cb2563c7ad123bb28ee8755d0d6157df Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Wed, 23 Oct 2024 13:26:23 +0200
Subject: [PATCH] simplify trim() function (#727)

* simplify trim() function

* use walrus
---
 trafilatura/baseline.py         | 12 ++++++------
 trafilatura/deduplication.py    |  2 +-
 trafilatura/external.py         |  6 +++---
 trafilatura/htmlprocessing.py   | 14 +++++++-------
 trafilatura/json_metadata.py    |  6 +++---
 trafilatura/metadata.py         |  6 ++----
 trafilatura/readability_lxml.py |  4 ++--
 trafilatura/utils.py            | 10 ++++------
 8 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/trafilatura/baseline.py b/trafilatura/baseline.py
index cb2b7235..c8811781 100644
--- a/trafilatura/baseline.py
+++ b/trafilatura/baseline.py
@@ -43,16 +43,16 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     for elem in tree.iterfind('.//script[@type="application/ld+json"]'):
         if elem.text and 'articleBody' in elem.text:
             try:
-                json_body = json.loads(elem.text).get("articleBody")
+                json_body = json.loads(elem.text).get("articleBody", "")
             except Exception:  # JSONDecodeError or 'list' object has no attribute 'get'
                 json_body = ""
             if json_body:
                 if "<p>" in json_body:
                     parsed = load_html(json_body)
-                    text = parsed.text_content() if parsed is not None else ""
+                    text = trim(parsed.text_content()) if parsed is not None else ""
                 else:
-                    text = json_body
-                SubElement(postbody, 'p').text = trim(text)
+                    text = trim(json_body)
+                SubElement(postbody, 'p').text = text
                 temp_text += " " + text if temp_text else text
                 # return postbody, elem.text, len(elem.text)
     if len(temp_text) > 100:
@@ -63,7 +63,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     # scrape from article tag
     temp_text = ""
     for article_elem in tree.iterfind('.//article'):
-        text = trim(article_elem.text_content()) or ""
+        text = trim(article_elem.text_content())
         if len(text) > 100:
             SubElement(postbody, 'p').text = text
             temp_text += " " + text if temp_text else text
@@ -76,7 +76,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     temp_text = ""
     # postbody = Element('body')
     for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
-        entry = trim(element.text_content()) or ""
+        entry = trim(element.text_content())
         if entry not in results:
             SubElement(postbody, 'p').text = entry
             temp_text += " " + entry if temp_text else entry
diff --git a/trafilatura/deduplication.py b/trafilatura/deduplication.py
index fda5188d..f73a8527 100644
--- a/trafilatura/deduplication.py
+++ b/trafilatura/deduplication.py
@@ -242,7 +242,7 @@ def put_in_cache(teststring: str) -> None:
 
 def duplicate_test(element: _Element, options: Any) -> bool:
     "Check for duplicate text with LRU cache."
-    teststring = trim(" ".join(element.itertext())) or ""
+    teststring = trim(" ".join(element.itertext()))
     # teststring = element.text
     if len(teststring) > options.min_duplcheck_size:
         # retrieve value from cache
diff --git a/trafilatura/external.py b/trafilatura/external.py
index 3c663461..42e4c1db 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -56,7 +56,7 @@ def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Eleme
     # try with readability
     temppost_algo = try_readability(backup_tree)
     # unicode fix necessary on certain systems (#331)
-    algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8')) or ""
+    algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8'))
     len_algo = len(algo_text)
 
     # compare
@@ -155,7 +155,7 @@ def justext_rescue(tree: HtmlElement, options: Any) -> Tuple[_Element, str, int]
     tree = basic_cleaning(tree)
     # proceed
     temppost_algo = try_justext(tree, options.url, options.lang)
-    temp_text = trim(' '.join(temppost_algo.itertext())) or ""
+    temp_text = trim(' '.join(temppost_algo.itertext()))
     return temppost_algo, temp_text, len(temp_text)
 
 
@@ -185,5 +185,5 @@ def sanitize_tree(tree: HtmlElement, options: Any) -> Tuple[HtmlElement, str, in
     ]
     strip_tags(cleaned_tree, *sanitization_list)
     # 4. return
-    text = trim(' '.join(cleaned_tree.itertext())) or ""
+    text = trim(' '.join(cleaned_tree.itertext()))
     return cleaned_tree, text, len(text)
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index b7d4d2e1..34b228c7 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -140,7 +140,7 @@ def link_density_test(
     # shortcut
     if len(links_xpath) == 1:
         len_threshold = 10 if favor_precision else 100
-        link_text = trim(links_xpath[0].text_content()) or ""
+        link_text = trim(links_xpath[0].text_content())
         if len(link_text) > len_threshold and len(link_text) > len(text) * 0.9:
             return True, []
     if element.tag == "p":
@@ -176,7 +176,7 @@ def link_density_test_tables(element: HtmlElement) -> bool:
     if not links_xpath:
         return False
 
-    elemlen = len(trim(element.text_content()) or "")
+    elemlen = len(trim(element.text_content()))
     if elemlen < 200:
         return False
 
@@ -201,7 +201,7 @@ def delete_by_link_density(
     depth_threshold = 1 if favor_precision else 3
 
     for elem in subtree.iter(tagname):
-        elemtext = trim(elem.text_content()) or ""
+        elemtext = trim(elem.text_content())
         result, templist = link_density_test(elem, elemtext, favor_precision)
         if result or (
             backtracking
@@ -232,7 +232,7 @@ def handle_textnode(
     # lb bypass
     if not comments_fix and elem.tag == "lb":
         if not preserve_spaces:
-            elem.tail = trim(elem.tail)
+            elem.tail = trim(elem.tail) or None
         # if textfilter(elem) is True:
         #     return None
         # duplicate_test(subelement)?
@@ -248,9 +248,9 @@ def handle_textnode(
 
     # trim
     if not preserve_spaces:
-        elem.text = trim(elem.text)
+        elem.text = trim(elem.text) or None
         if elem.tail:
-            elem.tail = trim(elem.tail)
+            elem.tail = trim(elem.tail) or None
 
     # filter content
     # or not re.search(r'\w', element.text):  # text_content()?
@@ -269,7 +269,7 @@ def process_node(elem: _Element, options: Extractor) -> Optional[_Element]:
         return None
 
     # trim
-    elem.text, elem.tail = trim(elem.text), trim(elem.tail)
+    elem.text, elem.tail = trim(elem.text) or None, trim(elem.tail) or None
 
     # adapt content string
     if elem.tag != "lb" and not elem.text and elem.tail:
diff --git a/trafilatura/json_metadata.py b/trafilatura/json_metadata.py
index adca561b..27c148ba 100644
--- a/trafilatura/json_metadata.py
+++ b/trafilatura/json_metadata.py
@@ -220,7 +220,7 @@ def normalize_json(string: str) -> str:
     string = JSON_UNICODE_REPLACE.sub(lambda match: chr(int(match[1], 16)), string)
     string = ''.join(c for c in string if ord(c) < 0xD800 or ord(c) > 0xDFFF)
     string = unescape(string)
-    return trim(JSON_REMOVE_HTML.sub('', string)) or ""
+    return trim(JSON_REMOVE_HTML.sub('', string))
 
 
 def normalize_authors(current_authors: Optional[str], author_string: str) -> Optional[str]:
@@ -240,13 +240,13 @@ def normalize_authors(current_authors: Optional[str], author_string: str) -> Opt
         author_string = HTML_STRIP_TAGS.sub('', author_string)
     # examine names
     for author in AUTHOR_SPLIT.split(author_string):
-        author = trim(author) or ""
+        author = trim(author)
         # remove emoji
         author = AUTHOR_EMOJI_REMOVE.sub('', author)
         # remove @username
         author = AUTHOR_TWITTER.sub('', author)
         # replace special characters with space
-        author = trim(AUTHOR_REPLACE_JOIN.sub(' ', author)) or ""
+        author = trim(AUTHOR_REPLACE_JOIN.sub(' ', author))
         author = AUTHOR_REMOVE_NICKNAME.sub('', author)
         # remove special characters
         author = AUTHOR_REMOVE_SPECIAL.sub('', author)
diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
index c501b017..04a6a637 100644
--- a/trafilatura/metadata.py
+++ b/trafilatura/metadata.py
@@ -340,10 +340,8 @@ def examine_title_element(
     title_element = tree.find(".//head//title")
     if title_element is not None:
         title = trim(title_element.text_content())
-        if title:
-            match = HTMLTITLE_REGEX.match(title)
-            if match:
-                return title, match[1], match[2]
+        if match := HTMLTITLE_REGEX.match(title):
+            return title, match[1], match[2]
     LOGGER.debug("no main title found")
     return title, None, None
 
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index fc563221..74ec86c7 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -86,7 +86,7 @@ def _tostring(string: HtmlElement) -> str:
 
 def text_length(elem: HtmlElement) -> int:
     "Return the length of the element with all its contents."
-    return len(trim(elem.text_content()) or "")
+    return len(trim(elem.text_content()))
 
 
 class Candidate:
@@ -232,7 +232,7 @@ def score_paragraphs(self) -> Dict[HtmlElement, Candidate]:
                 continue
             grand_parent_node = parent_node.getparent()
 
-            elem_text = trim(elem.text_content()) or ""
+            elem_text = trim(elem.text_content())
             elem_text_len = len(elem_text)
 
             # discard too short paragraphs
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index ec6c5cb3..c41189fd 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -337,15 +337,13 @@ def sanitize_tree(tree: _Element) -> _Element:
 
 
 @lru_cache(maxsize=1024)
-def trim(string: str) -> Optional[str]:
-    '''Remove unnecessary spaces within a text string'''
+def trim(string: str) -> str:
+    "Remove unnecessary spaces within a text string."
     try:
         # remove newlines that are not related to punctuation or markup + proper trimming
-        # return LINES_TRIMMING.sub(r' ', string).strip(' \t\n\r\v')
-        # faster:
-        return ' '.join(string.split()).strip()
+        return " ".join(string.split()).strip()
     except (AttributeError, TypeError):
-        return None
+        return ""
 

 def is_image_file(imagesrc: Optional[str]) -> bool:
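
Note (not part of the patch): the core of the change is that trim() now always returns a str, even for non-string input, so call sites can drop their "or \"\"" guards; "or None" is added only where lxml .text/.tail attributes should stay None rather than become empty strings. A minimal sketch of the new behaviour, reusing the function body from the patch; the example strings and assertions below are illustrative only, not taken from the repository:

    from functools import lru_cache


    @lru_cache(maxsize=1024)
    def trim(string: str) -> str:
        "Remove unnecessary spaces within a text string."
        try:
            # collapse runs of whitespace (including newlines) into single spaces
            return " ".join(string.split()).strip()
        except (AttributeError, TypeError):
            # non-string input such as None now yields "" instead of None
            return ""


    # Callers always get a string back, so len() and membership tests work directly:
    assert trim("  one\n two\tthree ") == "one two three"
    assert trim(None) == ""          # previously returned None
    assert len(trim(None)) == 0      # hence the removed `or ""` guards
    # Where lxml expects None rather than "" for element text, `or None` restores it:
    assert (trim("   ") or None) is None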