diff --git a/setup.py b/setup.py
index 7dbc52ad..db0fbdcd 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,7 @@ def get_long_description():
         "brotli",
         "cchardet >= 2.1.7; python_version < '3.11'",  # build issue
         "faust-cchardet >= 2.1.19; python_version >= '3.11'",
+        "feedparser >= 6.0.11",
         "htmldate[speed] >= 1.7.0",
         "py3langid >= 0.2.2",
         "pycurl >= 7.45.2",
diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py
index c63f1328..2df23ea6 100644
--- a/trafilatura/feeds.py
+++ b/trafilatura/feeds.py
@@ -13,6 +13,11 @@
 from time import sleep
 from typing import List, Optional
 
+try:
+    import feedparser
+except ImportError:
+    feedparser = None
+
 from courlan import (
     check_url,
     clean_url,
@@ -118,14 +123,9 @@ def handle_link_list(linklist: List[str], params: FeedParameters) -> List[str]:
     return output_links
 
 
-def extract_links(feed_string: str, params: FeedParameters) -> List[str]:
-    """Extract links from Atom and RSS feeds"""
+def crude_extraction(feed_string, params):
+    "Extract links based on regular expressions."
     feed_links = []
-    # check if it's a feed
-    if feed_string is None:
-        LOGGER.debug("Empty feed: %s", params.domain)
-        return feed_links
-
-    feed_string = feed_string.strip()
     # typical first and second lines absent
     if not FEED_OPENING.match(feed_string) and not (
@@ ... @@ def extract_links(feed_string: str, params: FeedParameters) -> List[str]:
                 )
             ]
         )
+    return feed_links
+
+
+def feedparser_extraction(feed_string):
+    "Parse and extract entry links using external feedparser package."
+    data = feedparser.parse(feed_string)
+    return [entry.link for entry in data.entries]
+
+
+def extract_links(feed_string: str, params: FeedParameters) -> List[str]:
+    """Extract links from Atom and RSS feeds"""
+    # check if it's a feed
+    if feed_string is None:
+        LOGGER.debug("Empty feed: %s", params.domain)
+        return []
+    feed_string = feed_string.strip()
+
+    feed_links = crude_extraction(feed_string, params)
+    if feedparser:
+        feed_links.extend(feedparser_extraction(feed_string))
     # refine
     output_links = handle_link_list(feed_links, params)
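
For context, the new `feedparser_extraction` helper relies on feedparser's top-level API: `feedparser.parse()` accepts a raw feed string and exposes the parsed items as `entries`, each of which carries a `link` attribute when the feed provides one. The snippet below is a minimal, self-contained sketch of that call pattern, using the same guarded import as the patch; the sample RSS string and the printed output are illustrative only and are not part of the change.

```python
# Minimal sketch of the optional feedparser path, assuming the same
# guarded import as the patch. The sample feed is made up for illustration.
try:
    import feedparser
except ImportError:
    feedparser = None

SAMPLE_FEED = """<?xml version="1.0"?>
<rss version="2.0">
  <channel>
    <title>Example</title>
    <item><title>First post</title><link>https://example.org/first</link></item>
    <item><title>Second post</title><link>https://example.org/second</link></item>
  </channel>
</rss>"""

if feedparser:
    data = feedparser.parse(SAMPLE_FEED)
    # Each parsed entry of a well-formed RSS/Atom feed exposes a `link` attribute.
    links = [entry.link for entry in data.entries]
    print(links)  # ['https://example.org/first', 'https://example.org/second']
else:
    print("feedparser not installed; only the regex-based extraction would run")
```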