Skip to content

Commit

Permalink
more fixes
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Oct 23, 2024
1 parent ec7ecee commit 521a556
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 40 deletions.
55 changes: 27 additions & 28 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from copy import copy, deepcopy
from typing import Any, Dict, Optional, Set, Tuple, Union

from lxml.etree import _Element, XPath, strip_tags
from lxml.etree import _Element, Element, XPath, strip_tags
from lxml.html import HtmlElement

# own
Expand Down Expand Up @@ -198,32 +198,31 @@ def bare_extraction(
# PendingDeprecationWarning
# )

# load data
try:
# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
config=config,
output_format=output_format,
fast=no_fallback,
precision=favor_precision,
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
links=include_links,
images=include_images,
tables=include_tables,
dedup=deduplicate,
lang=target_language,
max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
author_blacklist=author_blacklist,
url_blacklist=url_blacklist,
date_params=date_extraction_params,
)
# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
config=config,
output_format=output_format,
fast=no_fallback,
precision=favor_precision,
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
links=include_links,
images=include_images,
tables=include_tables,
dedup=deduplicate,
lang=target_language,
max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
author_blacklist=author_blacklist,
url_blacklist=url_blacklist,
date_params=date_extraction_params,
)

try:
# load the HTML tree
tree = load_html(filecontent)
if tree is None:
Expand Down Expand Up @@ -282,7 +281,7 @@ def bare_extraction(
cleaned_tree, options
)
else:
commentsbody, temp_comments, len_comments = None, "", 0
commentsbody, temp_comments, len_comments = Element("body"), "", 0
if options.focus == "precision":
cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

Expand Down Expand Up @@ -450,7 +449,7 @@ def extract(
)

# post-processing
if not document:
if not document or not isinstance(document, Document):
return None

if options.format not in TXT_FORMATS:
Expand Down
5 changes: 3 additions & 2 deletions trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ def try_readability(htmlinput: HtmlElement) -> HtmlElement:
try:
doc = ReadabilityDocument(htmlinput, min_text_length=25, retry_length=250)
# force conversion to utf-8 (see #319)
return fromstring_bytes(doc.summary())
summary = fromstring_bytes(doc.summary())
return summary if summary is not None else HtmlElement()
except Exception as err:
LOGGER.warning('readability_lxml failed: %s', err)
return HtmlElement('div')
return HtmlElement()


def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Element, text: str, len_text: int, options: Any) -> Tuple[_Element, str, int]:
Expand Down
8 changes: 4 additions & 4 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,9 +334,9 @@ def extract_metainfo(

def examine_title_element(
tree: HtmlElement,
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
) -> Tuple[str, Optional[str], Optional[str]]:
"""Extract text segments out of main <title> element."""
title = None
title = ""
title_element = tree.find(".//head//title")
if title_element is not None:
title = trim(title_element.text_content())
Expand All @@ -355,8 +355,8 @@ def extract_title(tree: HtmlElement) -> Optional[str]:
if title:
return title
# extract using x-paths
title = extract_metainfo(tree, TITLE_XPATHS)
if title is not None:
title = extract_metainfo(tree, TITLE_XPATHS) or ""
if title:
return title
# extract using title tag
title, first, second = examine_title_element(tree)
Expand Down
5 changes: 2 additions & 3 deletions trafilatura/readability_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,8 @@ def summary(self) -> str:
LOGGER.debug(
"Ruthless and lenient parsing did not work. Returning raw html"
)
article = self.doc.find("body")
if article is None:
article = self.doc
body = self.doc.find("body")
article = body if body is not None else self.doc

cleaned_article = self.sanitize(article, candidates)
article_length = len(cleaned_article or "")
Expand Down
4 changes: 2 additions & 2 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def __init__(
tags: Optional[List[str]] = None,
fingerprint: Optional[str] = None,
idval: Optional[str] = None,
license: Optional[str] = None,
license_val: Optional[str] = None,
body: _Element = Element("body"),
comments: Optional[str] = None,
commentsbody: _Element = Element("body"),
Expand All @@ -268,7 +268,7 @@ def __init__(
self.tags: Optional[List[str]] = tags
self.fingerprint: Optional[str] = fingerprint
self.id: Optional[str] = idval
self.license: Optional[str] = license
self.license: Optional[str] = license_val
self.body: _Element = body
self.comments: Optional[str] = comments
self.commentsbody: _Element = commentsbody
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
# COMMENTS_BLACKLIST = ('( Abmelden / Ändern )') # Fill in your details below|Trage deine Daten unten|Kommentar verfassen|Bitte logge dich|Hinterlasse einen Kommentar| to %s| mit %s)


def handle_compressed_file(filecontent: bytes) -> Union[bytes, str]:
def handle_compressed_file(filecontent: bytes) -> bytes:
"""
Don't trust response headers and try to decompress a binary string
with a cascade of installed packages. Use magic numbers when available.
Expand Down

0 comments on commit 521a556

Please sign in to comment.