Skip to content

Commit

Permalink
more fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Oct 24, 2024
1 parent 30b1266 commit 6b440f6
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 9 deletions.
21 changes: 15 additions & 6 deletions trafilatura/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,23 @@ def _send_urllib_request(
return None


def _handle_response(
url: str, response: Response, decode: bool, options: Extractor
) -> Optional[Union[Response, str]]: # todo: only return str
"Internal function to run safety checks on response result."
def _is_suitable_response(url: str, response: Response, options: Extractor) -> bool:
"Check if the response conforms to formal criteria."
lentest = len(response.html or response.data or "")
if response.status != 200:
LOGGER.error("not a 200 response: %s for URL %s", response.status, url)
return False
# raise error instead?
elif is_acceptable_length(lentest, options):
if not is_acceptable_length(lentest, options):
return False
return True


def _handle_response(
url: str, response: Response, decode: bool, options: Extractor
) -> Optional[Union[Response, str]]: # todo: only return str
"Internal function to run safety checks on response result."
if _is_suitable_response(url, response, options):
return response.html if decode else response
# catchall
return None
Expand Down Expand Up @@ -244,7 +252,8 @@ def fetch_url(
if response and response.data:
if not options:
options = Extractor(config=config)
return _handle_response(url, response, True, options)
if _is_suitable_response(url, response, options):
return response.html
return None


Expand Down
2 changes: 1 addition & 1 deletion trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
CPU_COUNT = len(sched_getaffinity(0))
except ImportError:
from os import cpu_count
CPU_COUNT = cpu_count()
CPU_COUNT = cpu_count() or 1

from pathlib import Path

Expand Down
2 changes: 1 addition & 1 deletion trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def line_processing(line: str, preserve_space: bool = False, trailing_space: boo
if not preserve_space:
# remove newlines that are not related to punctuation or markup
# remove non-printable chars and normalize space characters (including Unicode spaces)
new_line = trim(LINES_TRIMMING.sub(r" ", new_line)) # type: ignore[assignment]
new_line = trim(LINES_TRIMMING.sub(r" ", new_line))
# prune empty lines
if all(map(str.isspace, new_line)):
new_line = None # type: ignore[assignment]
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def build_xml_output(docmeta: Document) -> _Element:
'''Build XML output tree based on extracted information'''
output = Element('doc')
add_xml_meta(output, docmeta)
docmeta.body.tag = 'main' # type: ignore[attr-defined]
docmeta.body.tag = 'main'

# clean XML tree
output.append(clean_attributes(docmeta.body))
Expand Down

0 comments on commit 6b440f6

Please sign in to comment.