review CLI processing
adbar committed Feb 8, 2024
1 parent 0ad353d commit a40ef04
Showing 3 changed files with 13 additions and 10 deletions.
12 changes: 7 additions & 5 deletions trafilatura/cli_utils.py
@@ -231,20 +231,22 @@ def download_queue_processing(url_store, args, counter, config):
 
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
-    func = find_feed_urls if args.feed else sitemap_search
-
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
 
     config = use_config(filename=args.config_file)
-    ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
-    # sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
+    func = partial(
+        find_feed_urls if args.feed else sitemap_search,
+        target_lang=args.target_language,
+        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+    )
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
+        futures = (executor.submit(func, url) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
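The change binds the configuration-derived keyword arguments once with functools.partial, so the executor call site only passes the varying URL. A minimal sketch of the pattern, with a toy discover() standing in for find_feed_urls/sitemap_search:

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from functools import partial

    # Toy stand-in: the real functions accept the same keyword parameters.
    def discover(url, target_lang=None, external=False, sleep_time=2):
        return [f"{url}/feed?lang={target_lang}"]

    # Bind the fixed keyword arguments once...
    func = partial(discover, target_lang="en", external=False, sleep_time=2)

    # ...so each submitted task only needs the varying argument.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = (executor.submit(func, url) for url in ["https://example.org"])
        for future in as_completed(futures):
            print(future.result())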
5 changes: 4 additions & 1 deletion trafilatura/feeds.py
@@ -10,6 +10,7 @@
 import re
 
 from itertools import islice
+from time import sleep
 from typing import List, Optional
 
 from courlan import (
@@ -216,7 +217,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:
 
 
 def find_feed_urls(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2,
 ) -> List[str]:
     """Try to find feed URLs.
 
@@ -227,6 +228,7 @@
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -259,6 +261,7 @@
     else:
         LOGGER.error("Could not download web page: %s", url)
         if url.strip("/") != baseurl:
+            sleep(sleep_time)
             return try_homepage(baseurl, target_lang)
     # try alternative: Google News
     if target_lang is not None:
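With the new sleep_time keyword, callers can tune the pause applied before the homepage fallback. A usage sketch based on the signature above (example.org stands in for a real site and will not return actual feeds):

    from trafilatura.feeds import find_feed_urls

    # sleep_time (default 2) controls the pause inserted before the
    # function retries discovery on the site's homepage.
    links = find_feed_urls("https://www.example.org/blog",
                           target_lang="en", sleep_time=1)
    print(len(links), "feed URLs found")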
6 changes: 2 additions & 4 deletions trafilatura/sitemaps.py
@@ -197,7 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
-        sleep_time: Wait between requests to the same website.
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -231,7 +231,7 @@ def sitemap_search(
     ]
 
     # iterate through nested sitemaps and results
-    while sitemap.sitemap_urls:
+    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
         sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
@@ -242,8 +242,6 @@ def sitemap_search(
 
         if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
             sleep(sleep_time)
-        else:
-            break
 
     if urlfilter:
         sitemap.urls = filter_urls(sitemap.urls, urlfilter)
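The loop rewrite folds the MAX_SITEMAPS_SEEN cap into the while condition, so the bottom-of-body else/break disappears and the cap is checked before the next sitemap is fetched. A minimal sketch of the refactoring with toy names (queue and seen are hypothetical stand-ins for the sitemap object's attributes):

    MAX_SITEMAPS_SEEN = 3
    queue = ["a.xml", "b.xml", "c.xml", "d.xml", "e.xml"]
    seen = set()

    # Before: while queue: ... if len(seen) < MAX_SITEMAPS_SEEN: sleep(...)
    #         else: break
    # After: the cap is part of the loop condition itself.
    while queue and len(seen) < MAX_SITEMAPS_SEEN:
        seen.add(queue.pop())

    print(sorted(seen))  # at most MAX_SITEMAPS_SEEN sitemaps processed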
