CLI: add 126 exit code for high error ratio
adbar committed Nov 19, 2024
1 parent dafbe6d commit 031ed2a
Showing 3 changed files with 24 additions and 19 deletions.
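
In practice, the change gives callers of the trafilatura command-line tool three distinct outcomes: exit code 0 when processing succeeds, 1 when some URLs fail, and 126 when the error ratio exceeds 0.99, i.e. when nearly every URL fails. The following minimal sketch shows how a calling script might branch on these codes; it assumes the -i/--input-file option of the CLI and uses a placeholder file name.

import subprocess
import sys

# "urls.txt" is a placeholder: any file with one URL per line
proc = subprocess.run(["trafilatura", "-i", "urls.txt"], capture_output=True, text=True)

if proc.returncode == 0:
    print("all URLs processed")
elif proc.returncode == 126:
    # introduced by this commit: the error ratio was above 0.99
    sys.exit("download/extraction failed for almost every URL")
else:
    # exit code 1: the run finished, but some URLs could not be processed
    print("finished with errors", file=sys.stderr)
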
tests/cli_tests.py (2 changes: 1 addition & 1 deletion)

@@ -264,7 +264,7 @@ def test_download():
     args = cli.parse_args(testargs)
     with pytest.raises(SystemExit) as e:
         cli.process_args(args)
-    assert e.type == SystemExit and e.value.code == 1
+    assert e.type == SystemExit and e.value.code == 126


 # @patch('trafilatura.settings.MAX_FILES_PER_DIRECTORY', 1)
trafilatura/cli.py (19 changes: 7 additions & 12 deletions)

@@ -197,7 +197,7 @@ def main() -> None:

 def process_args(args: Any) -> None:
     """Perform the actual processing according to the arguments"""
-    error_caught = False
+    exit_code = 0

     if args.verbose == 1:
         logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
@@ -211,7 +211,7 @@ def process_args(args: Any) -> None:

     # fetch urls from a feed or a sitemap
     if args.explore or args.feed or args.sitemap:
-        cli_discovery(args)
+        exit_code = cli_discovery(args)

     # activate crawler/spider
     elif args.crawl:
@@ -225,24 +225,19 @@ def process_args(args: Any) -> None:
     elif args.input_dir:
         file_processing_pipeline(args)

-    # read url list from input file
-    elif args.input_file:
+    # read url list from input file or process input URL
+    elif args.input_file or args.URL:
         url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)
-
-    # process input URL
-    elif args.URL:
-        url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)  # process single url
+        exit_code = url_processing_pipeline(args, url_store)

     # read input on STDIN directly
     else:
         result = examine(sys.stdin.buffer.read(), args, url=args.URL)
         write_result(result, args)

     # change exit code if there are errors
-    if error_caught:
-        sys.exit(1)
+    if exit_code != 0:
+        sys.exit(exit_code)


 if __name__ == '__main__':
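
The tail of process_args() now calls sys.exit() only when the aggregated code is nonzero, so a clean run returns normally while 1 or 126 reaches the shell and, in tests, shows up as SystemExit.code. The standalone snippet below illustrates that contract; it is an illustrative sketch, not part of the repository's test suite.

import sys

import pytest

def finish(exit_code: int) -> None:
    # mirrors the end of process_args(): exit explicitly only on errors
    if exit_code != 0:
        sys.exit(exit_code)

def test_finish_propagates_code():
    with pytest.raises(SystemExit) as e:
        finish(126)
    assert e.type is SystemExit and e.value.code == 126
    # a clean run does not raise at all
    assert finish(0) is None
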
trafilatura/cli_utils.py (22 changes: 16 additions & 6 deletions)

@@ -288,7 +288,7 @@ def download_queue_processing(
     return errors, counter


-def cli_discovery(args: Any) -> None:
+def cli_discovery(args: Any) -> int:
     "Group CLI functions dedicated to URL discovery."
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
@@ -320,14 +320,16 @@ def cli_discovery(args: Any) -> None:
                reset_caches()

     # process the (rest of the) links found
-    error_caught = url_processing_pipeline(args, url_store)
+    exit_code = url_processing_pipeline(args, url_store)

     # activate site explorer
     if args.explore:
         # add to compressed dict and crawl the remaining websites
         control_dict = build_exploration_dict(url_store, input_urls, args)
         cli_crawler(args, url_store=control_dict, options=options)

+    return exit_code
+

 def build_exploration_dict(
     url_store: UrlStore, input_urls: List[str], args: Any
@@ -417,14 +419,22 @@ def probe_homepage(args: Any) -> None:
                print(url, flush=True)


-def url_processing_pipeline(args: Any, url_store: UrlStore) -> bool:
+def _define_exit_code(errors: List[str], total: int) -> int:
+    """Compute exit code based on the number of errors:
+    0 if there are no errors, 126 if there are too many, 1 otherwise."""
+    ratio = len(errors) / total if total > 0 else 0
+    return 126 if ratio > 0.99 else 1 if errors else 0
+
+
+def url_processing_pipeline(args: Any, url_store: UrlStore) -> int:
     "Aggregated functions to show a list and download and process an input list."
     if args.list:
         url_store.print_unvisited_urls()  # and not write_result()
         return False  # and not sys.exit(0)

     options = args_to_extractor(args)
-    counter = 0 if url_store.total_url_number() > MAX_FILES_PER_DIRECTORY else -1
+    url_count = url_store.total_url_number()
+    counter = 0 if url_count > MAX_FILES_PER_DIRECTORY else -1

     # download strategy
     errors, counter = download_queue_processing(url_store, args, counter, options)
@@ -443,9 +453,9 @@ def url_processing_pipeline(args: Any, url_store: UrlStore) -> int:
            len(errors),
         )
         # pass information along if URLs are missing
-        return bool(archived_errors)
+        return _define_exit_code(archived_errors, url_store.total_url_number())

-    return bool(errors)
+    return _define_exit_code(errors, url_count)


 def file_processing_pipeline(args: Any) -> None:
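
To make the 0.99 threshold concrete, here is a standalone check of the helper's behaviour; the function body is copied from the diff above rather than imported, so it runs regardless of the installed trafilatura version.

from typing import List

def define_exit_code(errors: List[str], total: int) -> int:
    # local copy of _define_exit_code() for illustration
    ratio = len(errors) / total if total > 0 else 0
    return 126 if ratio > 0.99 else 1 if errors else 0

assert define_exit_code([], 100) == 0                 # no errors: success
assert define_exit_code(["err"] * 40, 100) == 1       # some errors: generic failure
assert define_exit_code(["err"] * 99, 100) == 1       # 0.99 is not above the threshold
assert define_exit_code(["err"] * 100, 100) == 126    # everything failed: high-error code
assert define_exit_code([], 0) == 0                   # an empty run counts as clean
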
