Skip to content

Commit

Permalink
shorter code
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Feb 8, 2024
1 parent 23b7cdd commit 7f8cefc
Showing 1 changed file with 5 additions and 9 deletions.
14 changes: 5 additions & 9 deletions trafilatura/sitemaps.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -215,25 +215,21 @@ def sitemap_search(

sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)

- # try sitemaps in robots.txt file
+ # try sitemaps in robots.txt file, additional URLs just in case
if not sitemap.sitemap_urls:
- sitemap.sitemap_urls = find_robots_sitemaps(baseurl)
- # try additional URLs just in case
- if not sitemap.sitemap_urls:
- sitemap.sitemap_urls = ["".join([baseurl, "/", g]) for g in GUESSES]
+ sitemap.sitemap_urls = find_robots_sitemaps(baseurl) or [
+ f"{baseurl}/{g}" for g in GUESSES
+ ]

# iterate through nested sitemaps and results
- while sitemap.sitemap_urls:
+ while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
sitemap.current_url = sitemap.sitemap_urls.pop()
sitemap.fetch()
sitemap.process()
# sanity check: keep track of visited sitemaps and exclude them
sitemap.sitemap_urls = [
s for s in sitemap.sitemap_urls if s not in sitemap.seen
]
- # counter and safeguard
- if len(sitemap.seen) >= MAX_SITEMAPS_SEEN:
- break

LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
return sitemap.urls
Expand Down

0 comments on commit 7f8cefc

Please sign in to comment.