refactoring: add type hints and review code (#723)
* refactoring: add type hints

* fix tests

* add further types

* add missing types

* fix type

* fix errors

* fix code

* fix certain types and tests

* polish
adbar authored Oct 22, 2024
1 parent 6dedfbf commit 2977af5
Showing 22 changed files with 708 additions and 397 deletions.
3 changes: 2 additions & 1 deletion tests/metadata_tests.py
@@ -271,7 +271,8 @@ def test_meta():
     assert metadata.title == 'Title'
 
     # catch errors
-    assert extract_metadata('') is None
+    metadata = extract_metadata('')
+    assert all(getattr(metadata, a) is None for a in metadata.__slots__)
     metadata = extract_metadata('<html><title></title></html>')
     assert metadata.sitename is None
     metadata = extract_metadata('<html><head><title>' + 'AAA'*10000 + '</title></head></html>')
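Note on the hunk above: after this commit, extract_metadata('') apparently returns an empty Document instead of None, so emptiness is checked attribute by attribute. A minimal sketch of the new calling pattern, assuming the package-level import used elsewhere in the test suite:

from trafilatura import extract_metadata

metadata = extract_metadata('')
# every slot of an empty Document should be unset
assert all(getattr(metadata, attr) is None for attr in metadata.__slots__)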
1 change: 1 addition & 0 deletions tests/sitemaps_tests.py
@@ -82,6 +82,7 @@ def test_extraction():
     #sitemap.handle_link(url) # (url, '0')
 
     # safety belts
+    assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', None) is False
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', b'\x1f\x8bABC') is False
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml', 'ABC') is False
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap.xml', '<!DOCTYPE html><html><body/></html>') is False
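The added safety belt suggests is_plausible_sitemap now also tolerates None as content, which fits the typing work in this commit. A usage sketch grounded directly in the assertions above:

from trafilatura import sitemaps

# content that never passed a download still yields a clean False
assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', None) is False
# plain text is rejected as before
assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml', 'ABC') is False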
5 changes: 5 additions & 0 deletions tests/unit_tests.py
@@ -741,6 +741,8 @@ def test_tei():
 
 def test_htmlprocessing():
     '''test html-related functions'''
+    assert xml.xmltotxt(None, include_formatting=False) == ""
+
     options = DEFAULT_OPTIONS
     options.tables = True
     assert trafilatura.htmlprocessing.tree_cleaning(etree.Element('html'), options) is not None
@@ -819,6 +821,7 @@ def test_extraction_options():
     assert extract(my_html, only_with_metadata=False, output_format='xml', config=ZERO_CONFIG) is not None
     assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
     assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
+    assert extract(my_html, target_language='de', no_fallback=True, config=ZERO_CONFIG) is None
     assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
     # assert extract(my_html) is None
 
@@ -1383,6 +1386,8 @@ def test_is_probably_readerable():
     """
     Test is_probably_readerable function.
     """
+    assert not is_probably_readerable("ABC")
+
     very_small_str = "hello there"
     small_str = "hello there " * 11
     large_str = "hello there " * 12
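Both additions harden entry points against degenerate input: a None tree now yields an empty string and a too-short document is declared unreadable up front. A sketch of what the new guards promise — the import paths here are assumptions based on how the test file refers to these modules:

from trafilatura import xml
from trafilatura.readability_lxml import is_probably_readerable  # path is an assumption

assert xml.xmltotxt(None, include_formatting=False) == ""  # None tree -> empty string
assert not is_probably_readerable("ABC")                   # too short to be readable content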
14 changes: 8 additions & 6 deletions trafilatura/baseline.py
@@ -8,13 +8,14 @@
 from typing import Any, Tuple
 
 from lxml.etree import _Element, Element, SubElement
+from lxml.html import HtmlElement
 
 from .settings import BASIC_CLEAN_XPATH
 from .utils import load_html, trim
 from .xml import delete_element
 
 
-def basic_cleaning(tree: _Element) -> _Element:
+def basic_cleaning(tree: HtmlElement) -> HtmlElement:
     "Remove a few section types from the document."
     for elem in BASIC_CLEAN_XPATH(tree):
         delete_element(elem)
@@ -62,7 +63,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     # scrape from article tag
     temp_text = ""
     for article_elem in tree.iterfind('.//article'):
-        text = trim(article_elem.text_content())
+        text = trim(article_elem.text_content()) or ""
         if len(text) > 100:
             SubElement(postbody, 'p').text = text
             temp_text += " " + text if temp_text else text
@@ -75,7 +76,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     temp_text = ""
     # postbody = Element('body')
     for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
-        entry = trim(element.text_content())
+        entry = trim(element.text_content()) or ""
         if entry not in results:
             SubElement(postbody, 'p').text = entry
             temp_text += " " + entry if temp_text else entry
@@ -88,10 +89,11 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
     postbody = Element('body')
     body_elem = tree.find('.//body')
     if body_elem is not None:
-        elem = SubElement(postbody, 'p')
+        p_elem = SubElement(postbody, 'p')
         # todo: sanitize?
-        elem.text = '\n'.join([trim(e) for e in body_elem.itertext()])
-        return postbody, elem.text, len(elem.text)
+        text_elems = [trim(e) for e in body_elem.itertext()]
+        p_elem.text = '\n'.join([e for e in text_elems if e])
+        return postbody, p_elem.text, len(p_elem.text)
 
     # new fallback
     text = html2txt(tree, clean=False)
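The recurring trim(...) or "" pattern is the crux of this file's changes: if trim() is annotated as returning Optional[str], the or "" fallback narrows the value to a plain str before it reaches len() or string concatenation. A minimal sketch with a simplified stand-in for the real trim():

from typing import Optional

def trim(text: Optional[str]) -> Optional[str]:
    "Simplified stand-in for trafilatura.utils.trim (assumption)."
    return text.strip() if text else None

entry = trim("   ") or ""   # always a str, so len() and += are safe
assert len(entry) == 0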
5 changes: 3 additions & 2 deletions trafilatura/cli.py
@@ -16,10 +16,11 @@
     url_processing_pipeline, write_result)
 from .settings import PARALLEL_CORES, SUPPORTED_FMT_CLI
 
+
 # fix output encoding on some systems
-if sys.stdout.encoding != 'UTF-8':
+if sys.stdout.encoding != 'UTF-8' and hasattr(sys.stdout, 'reconfigure'):
     sys.stdout.reconfigure(encoding='utf-8')
-if sys.stderr.encoding != 'UTF-8':
+if sys.stderr.encoding != 'UTF-8' and hasattr(sys.stderr, 'reconfigure'):
     sys.stderr.reconfigure(encoding='utf-8')
 
 
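The hasattr guard matters because sys.stdout is not always a regular io.TextIOWrapper: when the stream is replaced, for instance by a test runner's capture object, the substitute may lack reconfigure(). A quick demonstration of the distinction:

import io
import sys

fake_stdout = io.StringIO()                 # e.g. a capture object swapped in by a test runner
print(hasattr(fake_stdout, 'reconfigure'))  # False: StringIO cannot change its encoding
print(hasattr(sys.stdout, 'reconfigure'))   # True for a standard TextIOWrapper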
6 changes: 3 additions & 3 deletions trafilatura/cli_utils.py
@@ -29,7 +29,7 @@
 from .baseline import html2txt
 from .core import extract
 from .deduplication import generate_bow_hash
-from .downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .downloads import Response, add_to_compressed_dict, buffered_downloads, load_download_buffer
 from .feeds import find_feed_urls
 from .meta import reset_caches
 from .settings import (
@@ -272,7 +272,7 @@ def download_queue_processing(
         bufferlist, args.parallel, options=options
     ):
         # handle result
-        if result:
+        if result and isinstance(result, str):
             options.url = url
             counter = process_result(result, args, counter, options)
         else:
@@ -380,7 +380,7 @@ def cli_crawler(
     for url, result in buffered_downloads(
         bufferlist, args.parallel, decode=False, options=options
     ):
-        if result is not None:
+        if result and isinstance(result, Response):
             spider.process_response(result, param_dict[get_base_url(url)])
             # early exit if maximum count is reached
             if any(c >= n for c in spider.URL_STORE.get_all_counts()):
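Both call sites consume buffered_downloads(), which appears to yield decoded str pages by default and Response objects when decode=False; the isinstance checks narrow that union so each consumer only handles the shape it expects. A sketch of the dispatch logic, with Response standing in for trafilatura.downloads.Response:

from typing import Optional, Union

class Response:
    "Stand-in for trafilatura.downloads.Response (assumption)."

def handle(result: Optional[Union[str, Response]]) -> str:
    if result and isinstance(result, str):
        return "process decoded page"       # download_queue_processing path
    if result and isinstance(result, Response):
        return "feed response to crawler"   # cli_crawler path
    return "log failure and skip"

assert handle("<html></html>") == "process decoded page"
assert handle(None) == "log failure and skip"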

