Skip to content

Commit

Permalink
htmldate and courlan: update setup and tests (#444)
Browse files (browse the repository at this point in the history)
* dependencies: update setup and tests

* use is_valid_url

* fix

* update setup and test

* lint setup
  • Loading branch information
adbar authored Nov 28, 2023
1 parent 71c1661 commit 6f66414
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 10 deletions.
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_long_description():
"brotli",
"cchardet >= 2.1.7; python_version < '3.11'", # build issue
"faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
-"htmldate[speed] >= 1.5.1",
+"htmldate[speed] >= 1.6.0",
"py3langid >= 0.2.2",
"pycurl >= 7.45.2",
],
Expand Down Expand Up @@ -111,8 +111,8 @@ def get_long_description():
"certifi",
"charset_normalizer >= 3.0.1; python_version < '3.7'",
"charset_normalizer >= 3.2.0; python_version >= '3.7'",
-"courlan >= 0.9.4",
-"htmldate >= 1.5.1",
+"courlan >= 0.9.5",
+"htmldate >= 1.6.0",
"justext >= 3.0.0",
"lxml >= 4.9.3 ; platform_system != 'Darwin'",
"lxml == 4.9.2 ; platform_system == 'Darwin'",
Expand Down
2 changes: 1 addition & 1 deletion tests/metadata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def test_dates():
metadata = extract_metadata(mystring, fastmode=False)
assert metadata.date == '2017-09-01'
metadata = extract_metadata(mystring, fastmode=True)
-assert metadata.date is None
+assert metadata.date == '2017-09-01'


def test_sitename():
Expand Down
3 changes: 2 additions & 1 deletion tests/realworld_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,8 +700,9 @@ def test_pages():
assert metadata.url == url

url = 'https://www.ndr.de/nachrichten/info/16-Coronavirus-Update-Wir-brauchen-Abkuerzungen-bei-der-Impfstoffzulassung,podcastcoronavirus140.html'
+corrected_url = 'https://www.ndr.de/nachrichten/info/16-Coronavirus-Update-Wir-brauchen-Abkuerzungen-bei-der-Impfstoffzulassung,podcastcoronavirus140.html'
metadata = extract_metadata(load_mock_page_meta(url), default_url=url)
-assert metadata.url == url
+assert metadata.url == corrected_url
assert 'Korinna Hennig' in metadata.author
assert 'Ältere Menschen' in str(metadata.tags)

Expand Down
4 changes: 2 additions & 2 deletions trafilatura/feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
filter_urls,
fix_relative_urls,
get_hostinfo,
-validate_url,
+is_valid_url,
)

from .downloads import fetch_url
Expand Down Expand Up @@ -186,7 +186,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:
for link in uniquify_list(feed_urls):
link = fix_relative_urls(params.base, link)
link = clean_url(link)
-if link is None or link == params.ref or validate_url(link)[0] is False:
+if link is None or link == params.ref or not is_valid_url(link):
continue
if BLACKLIST.search(link):
continue
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import re
from copy import deepcopy

-from courlan import extract_domain, get_base_url, normalize_url, validate_url
+from courlan import extract_domain, get_base_url, is_valid_url, normalize_url, validate_url
from htmldate import find_date
from lxml.html import tostring

Expand Down Expand Up @@ -173,7 +173,7 @@ def extract_opengraph(tree):
title = elem.get('content')
# orig URL
elif elem.get('property') == 'og:url':
-if validate_url(elem.get('content'))[0] is True:
+if is_valid_url(elem.get('content')):
url = elem.get('content')
# description
elif elem.get('property') == 'og:description':
Expand Down Expand Up @@ -250,7 +250,7 @@ def examine_meta(tree):
backup_sitename = content_attr
# url
elif name_attr == 'twitter:url':
-if url is None and validate_url(content_attr)[0] is True:
+if url is None and is_valid_url(content_attr):
url = content_attr
# keywords
elif name_attr in METANAME_TAG: # 'page-topic'
Expand Down

0 comments on commit 6f66414

Please sign in to comment.