Skip to content

Commit

Permalink
feeds: also use feedparser if available
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Feb 12, 2024
1 parent ca32cab commit 36758fa
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 7 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def get_long_description():
"brotli",
"cchardet >= 2.1.7; python_version < '3.11'", # build issue
"faust-cchardet >= 2.1.19; python_version >= '3.11'",
"feedparser >= 6.0.11",
"htmldate[speed] >= 1.7.0",
"py3langid >= 0.2.2",
"pycurl >= 7.45.2",
Expand Down
34 changes: 27 additions & 7 deletions trafilatura/feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
from time import sleep
from typing import List, Optional

try:
import feedparser
except ImportError:
feedparser = None

from courlan import (
check_url,
clean_url,
Expand Down Expand Up @@ -118,14 +123,9 @@ def handle_link_list(linklist: List[str], params: FeedParameters) -> List[str]:
return output_links


def extract_links(feed_string: str, params: FeedParameters) -> List[str]:
"""Extract links from Atom and RSS feeds"""
def crude_extraction(feed_string, params):
"Extract links based on regular expressions."
feed_links = []
# check if it's a feed
if feed_string is None:
LOGGER.debug("Empty feed: %s", params.domain)
return feed_links
feed_string = feed_string.strip()
# typical first and second lines absent
if not FEED_OPENING.match(feed_string) and not (
"<rss" in feed_string[:100] or "<feed" in feed_string[:100]
Expand Down Expand Up @@ -163,6 +163,26 @@ def extract_links(feed_string: str, params: FeedParameters) -> List[str]:
)
]
)
return feed_links


def feedparser_extraction(feed_string: str) -> List[str]:
    """Parse a feed and return its entry links using the feedparser package.

    Args:
        feed_string: Raw feed content handed to ``feedparser.parse``.

    Returns:
        The link of every parsed entry that has one, in document order.
        An empty list is returned when the optional ``feedparser``
        dependency could not be imported.
    """
    # The module-level import guard sets ``feedparser`` to None when the
    # package is missing; return nothing instead of failing on ``None.parse``.
    if feedparser is None:
        return []
    data = feedparser.parse(feed_string)
    # Entries are not guaranteed to carry a link; attribute access on a
    # missing key raises AttributeError, so look the key up defensively
    # and drop entries whose link is absent or empty.
    links = (entry.get("link") for entry in data.entries)
    return [link for link in links if link]


def extract_links(feed_string: str, params: FeedParameters) -> List[str]:
"""Extract links from Atom and RSS feeds"""
# check if it's a feed
if feed_string is None:
LOGGER.debug("Empty feed: %s", params.domain)
return []
feed_string = feed_string.strip()

feed_links = crude_extraction(feed_string, params)
if feedparser:
feed_links.extend(feedparser_extraction(feed_string))

# refine
output_links = handle_link_list(feed_links, params)
Expand Down

0 comments on commit 36758fa

Please sign in to comment.