fix certain types and tests

adbar committed on Oct 22, 2024
commit 2e72e08 (1 parent: ebc0eba)
Show file tree

Hide file tree

Showing 6 changed files with 7 additions and 7 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -821,6 +821,7 @@ def test_extraction_options():
     assert extract(my_html, only_with_metadata=False, output_format='xml', config=ZERO_CONFIG) is not None
     assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
     assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
+    assert extract(my_html, target_language='de', no_fallback=True, config=ZERO_CONFIG) is None
     assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
     # assert extract(my_html) is None
 

diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
@@ -58,8 +58,8 @@ def create_pool(**args: Any) -> Any:
     "Configure urllib3 download pool according to user-defined settings."
     manager_class = SOCKSProxyManager if PROXY_URL else urllib3.PoolManager
     manager_args = {"proxy_url": PROXY_URL} if PROXY_URL else {}
-    manager_args["num_pools"] = 50
-    return manager_class(**manager_args, **args)
+    manager_args["num_pools"] = 50  # type: ignore[assignment]
+    return manager_class(**manager_args, **args)  # type: ignore[arg-type]
 
 
 DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)

diff --git a/trafilatura/external.py b/trafilatura/external.py
@@ -19,7 +19,7 @@
 from .readability_lxml import Document as ReadabilityDocument  # fork
 from .settings import JUSTEXT_LANGUAGES
 from .utils import fromstring_bytes, trim
-from .xml import TEI_VALID_TAGS, delete_element
+from .xml import TEI_VALID_TAGS
 from .xpaths import OVERALL_DISCARD_XPATH
 
 LOGGER = logging.getLogger(__name__)
@@ -163,8 +163,6 @@ def sanitize_tree(tree: HtmlElement, options: Any) -> Tuple[HtmlElement, str, in
     '''Convert and sanitize the output from the generic algorithm (post-processing)'''
     # 1. clean
     cleaned_tree = tree_cleaning(tree, options)
-    for elem in tree.findall(SANITIZED_XPATH):
-        delete_element(elem, keep_tail=False)
     if options.links is False:
         strip_tags(cleaned_tree, 'a')
     strip_tags(cleaned_tree, 'span')

diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py
@@ -145,7 +145,7 @@ def find_links(feed_string: str, params: FeedParameters) -> List[str]:
     # Atom
     if "<link " in feed_string:
         return [
-            LINK_HREF.search(link)[1]
+            LINK_HREF.search(link)[1]  # type: ignore[index]
             for link in (
                 m[0] for m in islice(LINK_ATTRS.finditer(feed_string), MAX_LINKS)
             )

diff --git a/trafilatura/settings.py b/trafilatura/settings.py
@@ -353,6 +353,7 @@ def as_dict(self) -> Dict[str, Optional[str]]:
     "math",
     "menuitem",
     "nav",
+    "noindex",
     "noscript",
     "optgroup",
     "option",

diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
@@ -92,7 +92,7 @@ def handle_link(self, link: str) -> None:
             return
         # fix, check, clean and normalize
         link = fix_relative_urls(self.base_url, link)
-        link = clean_url(link, self.target_lang)
+        link = clean_url(link, self.target_lang) or ""
 
         if not link or not lang_filter(link, self.target_lang):
             return