Skip to content

Commit

Permalink
fix certain types and tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Oct 22, 2024
1 parent ebc0eba commit 2e72e08
Show file tree
Hide file tree
Showing 6 changed files with 7 additions and 7 deletions.
1 change: 1 addition & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,7 @@ def test_extraction_options():
assert extract(my_html, only_with_metadata=False, output_format='xml', config=ZERO_CONFIG) is not None
assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', no_fallback=True, config=ZERO_CONFIG) is None
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
# assert extract(my_html) is None

Expand Down
4 changes: 2 additions & 2 deletions trafilatura/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def create_pool(**args: Any) -> Any:
"Configure urllib3 download pool according to user-defined settings."
manager_class = SOCKSProxyManager if PROXY_URL else urllib3.PoolManager
manager_args = {"proxy_url": PROXY_URL} if PROXY_URL else {}
manager_args["num_pools"] = 50
return manager_class(**manager_args, **args)
manager_args["num_pools"] = 50 # type: ignore[assignment]
return manager_class(**manager_args, **args) # type: ignore[arg-type]


DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
Expand Down
4 changes: 1 addition & 3 deletions trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .readability_lxml import Document as ReadabilityDocument # fork
from .settings import JUSTEXT_LANGUAGES
from .utils import fromstring_bytes, trim
from .xml import TEI_VALID_TAGS, delete_element
from .xml import TEI_VALID_TAGS
from .xpaths import OVERALL_DISCARD_XPATH

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -163,8 +163,6 @@ def sanitize_tree(tree: HtmlElement, options: Any) -> Tuple[HtmlElement, str, in
'''Convert and sanitize the output from the generic algorithm (post-processing)'''
# 1. clean
cleaned_tree = tree_cleaning(tree, options)
for elem in tree.findall(SANITIZED_XPATH):
delete_element(elem, keep_tail=False)
if options.links is False:
strip_tags(cleaned_tree, 'a')
strip_tags(cleaned_tree, 'span')
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def find_links(feed_string: str, params: FeedParameters) -> List[str]:
# Atom
if "<link " in feed_string:
return [
LINK_HREF.search(link)[1]
LINK_HREF.search(link)[1] # type: ignore[index]
for link in (
m[0] for m in islice(LINK_ATTRS.finditer(feed_string), MAX_LINKS)
)
Expand Down
1 change: 1 addition & 0 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ def as_dict(self) -> Dict[str, Optional[str]]:
"math",
"menuitem",
"nav",
"noindex",
"noscript",
"optgroup",
"option",
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/sitemaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def handle_link(self, link: str) -> None:
return
# fix, check, clean and normalize
link = fix_relative_urls(self.base_url, link)
link = clean_url(link, self.target_lang)
link = clean_url(link, self.target_lang) or ""

if not link or not lang_filter(link, self.target_lang):
return
Expand Down

0 comments on commit 2e72e08

Please sign in to comment.