diff --git a/tests/feeds_tests.py b/tests/feeds_tests.py
index 82883c2b..4d37e748 100644
--- a/tests/feeds_tests.py
+++ b/tests/feeds_tests.py
@@ -7,174 +7,276 @@
import sys
from unittest.mock import patch
-from trafilatura import cli, feeds
+from courlan import get_hostinfo
+from trafilatura.cli import main
+from trafilatura.feeds import (
+ FeedParameters,
+ determine_feed,
+ extract_links,
+ find_feed_urls,
+ handle_link_list,
+)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
TEST_DIR = os.path.abspath(os.path.dirname(__file__))
-RESOURCES_DIR = os.path.join(TEST_DIR, 'resources')
+RESOURCES_DIR = os.path.join(TEST_DIR, "resources")
XMLDECL = '\n'
def test_atom_extraction():
- '''Test link extraction from an Atom feed'''
- assert feeds.extract_links(None, 'example.org', 'https://example.org', '') == []
- assert len(feeds.extract_links('', 'example.org', 'https://example.org', '')) == 0
- filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
- with open(filepath) as f:
+ """Test link extraction from an Atom feed"""
+ params = FeedParameters("https://example.org", "example.org", "")
+ assert not extract_links(None, params)
+ assert len(extract_links("", params)) == 0
+
+ filepath = os.path.join(RESOURCES_DIR, "feed1.atom")
+ with open(filepath, "r", encoding="utf-8") as f:
teststring = f.read()
- assert len(feeds.extract_links(teststring, 'example.org', 'https://example.org', '')) > 0
+ assert len(extract_links(teststring, params)) > 0
+
+ params = FeedParameters("https://www.dwds.de", "dwds.de", "")
assert (
len(
- feeds.extract_links(
+ extract_links(
f'{XMLDECL}',
- 'dwds.de',
- 'https://www.dwds.de',
- '',
+ params,
)
)
== 0
)
+
+ params = FeedParameters("http://example.org", "example.org", "http://example.org")
assert (
len(
- feeds.extract_links(
+ extract_links(
f'{XMLDECL}',
- 'example.org',
- 'http://example.org/',
- 'http://example.org',
+ params,
)
)
== 0
)
+
+ params = FeedParameters("https://example.org", "example.org", "")
assert (
len(
- feeds.extract_links(
+ extract_links(
f'{XMLDECL}',
- 'example.org',
- 'https://example.org',
- '',
+ params,
)
)
== 0
)
- assert feeds.extract_links(
- f'{XMLDECL}',
- 'example.org',
- 'http://example.org/',
- 'http://example.org',
- ) == ['http://example.org/article1/'] # TODO: remove slash?
+
+ params = FeedParameters("http://example.org/", "example.org", "http://example.org")
+ assert extract_links(
+ f'{XMLDECL}', params
+ ) == [
+ "http://example.org/article1/"
+ ] # TODO: remove slash?
def test_rss_extraction():
- '''Test link extraction from a RSS feed'''
+ """Test link extraction from a RSS feed"""
+ params = FeedParameters("http://example.org/", "example.org", "")
assert (
len(
- feeds.extract_links(
- f'{XMLDECL}http://example.org/article1/',
- 'example.org',
- 'http://example.org/',
- '',
- )
+ extract_links(f"{XMLDECL}http://example.org/article1/", params)
)
== 1
)
# CDATA
- assert feeds.extract_links(
- f'{XMLDECL}',
- 'example.org',
- 'http://example.org/',
- '',
- ) == ['http://example.org/article1/'] # TODO: remove slash?
+ assert extract_links(
+ f"{XMLDECL}", params
+ ) == [
+ "http://example.org/article1/"
+ ] # TODO: remove slash?
+
# spaces
- assert len(feeds.extract_links(XMLDECL + '\r\n https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein ', 'ak-kurier.de', 'https://www.ak-kurier.de/', '')) == 1
+ params = FeedParameters("https://www.ak-kurier.de/", "ak-kurier.de", "")
assert (
len(
- feeds.extract_links(
- f'{XMLDECL}http://example.org/',
- 'example.org',
- 'http://example.org',
- 'http://example.org',
+ extract_links(
+ XMLDECL
+ + "\r\n https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein ",
+ params,
)
)
- == 0
+ == 1
)
+
+ params = FeedParameters("http://example.org", "example.org", "http://example.org")
+ assert len(extract_links(f"{XMLDECL}http://example.org/", params)) == 0
+
+ params = FeedParameters("http://example.org", "example.org", "")
+ assert len(extract_links(f"{XMLDECL}https://example.org", params)) == 0
+
+ params = FeedParameters("https://www.dwds.de", "dwds.de", "https://www.dwds.de")
+ assert extract_links(
+ f"{XMLDECL}/api/feed/themenglossar/Corona", params
+ ) == ["https://www.dwds.de/api/feed/themenglossar/Corona"]
+
+ params = FeedParameters("https://example.org", "example.org", "")
+ filepath = os.path.join(RESOURCES_DIR, "feed2.rss")
+ with open(filepath, "r", encoding="utf-8") as f:
+ teststring = f.read()
+ assert len(extract_links(teststring, params)) > 0
+
+
+def test_json_extraction():
+ """Test link extraction from a JSON feed"""
+ # find link
+ params = FeedParameters("https://www.jsonfeed.org", "jsonfeed.org", "")
assert (
len(
- feeds.extract_links(
- f'{XMLDECL}https://example.org',
- 'example.org',
- 'http://example.org/',
- '',
+ determine_feed(
+ '>