review CLI processing
adbar committed Feb 8, 2024
1 parent 0ad353d commit a40ef04
Showing 3 changed files with 13 additions and 10 deletions.
12 changes: 7 additions & 5 deletions trafilatura/cli_utils.py
@@ -231,20 +231,22 @@ def download_queue_processing(url_store, args, counter, config):
 
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
-    func = find_feed_urls if args.feed else sitemap_search
-
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
 
     config = use_config(filename=args.config_file)
-    ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
-    # sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
+    func = partial(
+        find_feed_urls if args.feed else sitemap_search,
+        target_lang=args.target_language,
+        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+    )
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
+        futures = (executor.submit(func, url) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
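The change binds the configuration-derived keyword arguments once with functools.partial, so the executor call site only passes the varying URL. A minimal sketch of the pattern, with a toy discover() standing in for find_feed_urls/sitemap_search:

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from functools import partial

    # Toy stand-in: the real functions accept the same keyword parameters.
    def discover(url, target_lang=None, external=False, sleep_time=2):
        return [f"{url}/feed?lang={target_lang}"]

    # Bind the fixed keyword arguments once...
    func = partial(discover, target_lang="en", external=False, sleep_time=2)

    # ...so each submitted task only needs the varying argument.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = (executor.submit(func, url) for url in ["https://example.org"])
        for future in as_completed(futures):
            print(future.result())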
5 changes: 4 additions & 1 deletion trafilatura/feeds.py
@@ -10,6 +10,7 @@
 import re
 
 from itertools import islice
+from time import sleep
 from typing import List, Optional
 
 from courlan import (
@@ -216,7 +217,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:
 
 
 def find_feed_urls(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2,
 ) -> List[str]:
     """Try to find feed URLs.
 
@@ -227,6 +228,7 @@
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -259,6 +261,7 @@
     else:
         LOGGER.error("Could not download web page: %s", url)
         if url.strip("/") != baseurl:
+            sleep(sleep_time)
             return try_homepage(baseurl, target_lang)
     # try alternative: Google News
     if target_lang is not None:
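With the new sleep_time keyword, callers can tune the pause applied before the homepage fallback. A usage sketch based on the signature above (example.org stands in for a real site and will not return actual feeds):

    from trafilatura.feeds import find_feed_urls

    # sleep_time (default 2) controls the pause inserted before the
    # function retries discovery on the site's homepage.
    links = find_feed_urls("https://www.example.org/blog",
                           target_lang="en", sleep_time=1)
    print(len(links), "feed URLs found")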
6 changes: 2 additions & 4 deletions trafilatura/sitemaps.py
@@ -197,7 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
-        sleep_time: Wait between requests to the same website.
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -231,7 +231,7 @@ def sitemap_search(
     ]
 
     # iterate through nested sitemaps and results
-    while sitemap.sitemap_urls:
+    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
         sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
@@ -242,8 +242,6 @@ def sitemap_search(
 
         if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
             sleep(sleep_time)
-        else:
-            break
 
     if urlfilter:
         sitemap.urls = filter_urls(sitemap.urls, urlfilter)
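The loop rewrite folds the MAX_SITEMAPS_SEEN cap into the while condition, so the bottom-of-body else/break disappears and the cap is checked before the next sitemap is fetched. A minimal sketch of the refactoring with toy names (queue and seen are hypothetical stand-ins for the sitemap object's attributes):

    MAX_SITEMAPS_SEEN = 3
    queue = ["a.xml", "b.xml", "c.xml", "d.xml", "e.xml"]
    seen = set()

    # Before: while queue: ... if len(seen) < MAX_SITEMAPS_SEEN: sleep(...)
    #         else: break
    # After: the cap is part of the loop condition itself.
    while queue and len(seen) < MAX_SITEMAPS_SEEN:
        seen.add(queue.pop())

    print(sorted(seen))  # at most MAX_SITEMAPS_SEEN sitemaps processed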
