Skip to content

Commit

Permalink
simplify trim() function (#727)
Browse files Browse the repository at this point in the history
* simplify trim() function

* use walrus
  • Loading branch information
adbar authored Oct 23, 2024
1 parent 2977af5 commit 52d21a6
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 32 deletions.
12 changes: 6 additions & 6 deletions trafilatura/baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,16 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
for elem in tree.iterfind('.//script[@type="application/ld+json"]'):
if elem.text and 'articleBody' in elem.text:
try:
json_body = json.loads(elem.text).get("articleBody")
json_body = json.loads(elem.text).get("articleBody", "")
except Exception: # JSONDecodeError or 'list' object has no attribute 'get'
json_body = ""
if json_body:
if "<p>" in json_body:
parsed = load_html(json_body)
text = parsed.text_content() if parsed is not None else ""
text = trim(parsed.text_content()) if parsed is not None else ""
else:
text = json_body
SubElement(postbody, 'p').text = trim(text)
text = trim(json_body)
SubElement(postbody, 'p').text = text
temp_text += " " + text if temp_text else text
# return postbody, elem.text, len(elem.text)
if len(temp_text) > 100:
Expand All @@ -63,7 +63,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
# scrape from article tag
temp_text = ""
for article_elem in tree.iterfind('.//article'):
text = trim(article_elem.text_content()) or ""
text = trim(article_elem.text_content())
if len(text) > 100:
SubElement(postbody, 'p').text = text
temp_text += " " + text if temp_text else text
Expand All @@ -76,7 +76,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
temp_text = ""
# postbody = Element('body')
for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
entry = trim(element.text_content()) or ""
entry = trim(element.text_content())
if entry not in results:
SubElement(postbody, 'p').text = entry
temp_text += " " + entry if temp_text else entry
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def put_in_cache(teststring: str) -> None:

def duplicate_test(element: _Element, options: Any) -> bool:
"Check for duplicate text with LRU cache."
teststring = trim(" ".join(element.itertext())) or ""
teststring = trim(" ".join(element.itertext()))
# teststring = element.text
if len(teststring) > options.min_duplcheck_size:
# retrieve value from cache
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Eleme
# try with readability
temppost_algo = try_readability(backup_tree)
# unicode fix necessary on certain systems (#331)
algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8')) or ""
algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8'))
len_algo = len(algo_text)

# compare
Expand Down Expand Up @@ -155,7 +155,7 @@ def justext_rescue(tree: HtmlElement, options: Any) -> Tuple[_Element, str, int]
tree = basic_cleaning(tree)
# proceed
temppost_algo = try_justext(tree, options.url, options.lang)
temp_text = trim(' '.join(temppost_algo.itertext())) or ""
temp_text = trim(' '.join(temppost_algo.itertext()))
return temppost_algo, temp_text, len(temp_text)


Expand Down Expand Up @@ -185,5 +185,5 @@ def sanitize_tree(tree: HtmlElement, options: Any) -> Tuple[HtmlElement, str, in
]
strip_tags(cleaned_tree, *sanitization_list)
# 4. return
text = trim(' '.join(cleaned_tree.itertext())) or ""
text = trim(' '.join(cleaned_tree.itertext()))
return cleaned_tree, text, len(text)
14 changes: 7 additions & 7 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def link_density_test(
# shortcut
if len(links_xpath) == 1:
len_threshold = 10 if favor_precision else 100
link_text = trim(links_xpath[0].text_content()) or ""
link_text = trim(links_xpath[0].text_content())
if len(link_text) > len_threshold and len(link_text) > len(text) * 0.9:
return True, []
if element.tag == "p":
Expand Down Expand Up @@ -176,7 +176,7 @@ def link_density_test_tables(element: HtmlElement) -> bool:
if not links_xpath:
return False

elemlen = len(trim(element.text_content()) or "")
elemlen = len(trim(element.text_content()))
if elemlen < 200:
return False

Expand All @@ -201,7 +201,7 @@ def delete_by_link_density(
depth_threshold = 1 if favor_precision else 3

for elem in subtree.iter(tagname):
elemtext = trim(elem.text_content()) or ""
elemtext = trim(elem.text_content())
result, templist = link_density_test(elem, elemtext, favor_precision)
if result or (
backtracking
Expand Down Expand Up @@ -232,7 +232,7 @@ def handle_textnode(
# lb bypass
if not comments_fix and elem.tag == "lb":
if not preserve_spaces:
elem.tail = trim(elem.tail)
elem.tail = trim(elem.tail) or None
# if textfilter(elem) is True:
# return None
# duplicate_test(subelement)?
Expand All @@ -248,9 +248,9 @@ def handle_textnode(

# trim
if not preserve_spaces:
elem.text = trim(elem.text)
elem.text = trim(elem.text) or None
if elem.tail:
elem.tail = trim(elem.tail)
elem.tail = trim(elem.tail) or None

# filter content
# or not re.search(r'\w', element.text): # text_content()?
Expand All @@ -269,7 +269,7 @@ def process_node(elem: _Element, options: Extractor) -> Optional[_Element]:
return None

# trim
elem.text, elem.tail = trim(elem.text), trim(elem.tail)
elem.text, elem.tail = trim(elem.text) or None, trim(elem.tail) or None

# adapt content string
if elem.tag != "lb" and not elem.text and elem.tail:
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/json_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def normalize_json(string: str) -> str:
string = JSON_UNICODE_REPLACE.sub(lambda match: chr(int(match[1], 16)), string)
string = ''.join(c for c in string if ord(c) < 0xD800 or ord(c) > 0xDFFF)
string = unescape(string)
return trim(JSON_REMOVE_HTML.sub('', string)) or ""
return trim(JSON_REMOVE_HTML.sub('', string))


def normalize_authors(current_authors: Optional[str], author_string: str) -> Optional[str]:
Expand All @@ -240,13 +240,13 @@ def normalize_authors(current_authors: Optional[str], author_string: str) -> Opt
author_string = HTML_STRIP_TAGS.sub('', author_string)
# examine names
for author in AUTHOR_SPLIT.split(author_string):
author = trim(author) or ""
author = trim(author)
# remove emoji
author = AUTHOR_EMOJI_REMOVE.sub('', author)
# remove @username
author = AUTHOR_TWITTER.sub('', author)
# replace special characters with space
author = trim(AUTHOR_REPLACE_JOIN.sub(' ', author)) or ""
author = trim(AUTHOR_REPLACE_JOIN.sub(' ', author))
author = AUTHOR_REMOVE_NICKNAME.sub('', author)
# remove special characters
author = AUTHOR_REMOVE_SPECIAL.sub('', author)
Expand Down
6 changes: 2 additions & 4 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,10 +340,8 @@ def examine_title_element(
title_element = tree.find(".//head//title")
if title_element is not None:
title = trim(title_element.text_content())
if title:
match = HTMLTITLE_REGEX.match(title)
if match:
return title, match[1], match[2]
if match := HTMLTITLE_REGEX.match(title):
return title, match[1], match[2]
LOGGER.debug("no main title found")
return title, None, None

Expand Down
4 changes: 2 additions & 2 deletions trafilatura/readability_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _tostring(string: HtmlElement) -> str:

def text_length(elem: HtmlElement) -> int:
    """Return the length of the element's whitespace-normalized text content.

    The content is first passed through trim(), so runs of whitespace count
    as a single character and leading/trailing whitespace is ignored.
    """
    return len(trim(elem.text_content()))


class Candidate:
Expand Down Expand Up @@ -232,7 +232,7 @@ def score_paragraphs(self) -> Dict[HtmlElement, Candidate]:
continue
grand_parent_node = parent_node.getparent()

elem_text = trim(elem.text_content()) or ""
elem_text = trim(elem.text_content())
elem_text_len = len(elem_text)

# discard too short paragraphs
Expand Down
10 changes: 4 additions & 6 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,15 +337,13 @@ def sanitize_tree(tree: _Element) -> _Element:


@lru_cache(maxsize=1024)
def trim(string: str) -> str:
    """Remove unnecessary spaces within a text string.

    Collapses every run of whitespace (spaces, tabs, newlines) to a single
    space and drops leading/trailing whitespace. Non-string input (e.g.
    None) yields "" instead of raising, so callers can use the result
    directly without a fallback.
    """
    try:
        # str.split() with no argument already discards leading/trailing
        # whitespace, so a trailing .strip() would be redundant. This is
        # faster than the regex-based LINES_TRIMMING substitution.
        return " ".join(string.split())
    except (AttributeError, TypeError):
        # string was None or not a str-like object
        return ""


def is_image_file(imagesrc: Optional[str]) -> bool:
Expand Down

0 comments on commit 52d21a6

Please sign in to comment.