diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 9990a107..7e9beae7 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -483,6 +483,7 @@ def test_images(): assert is_image_file('test.txt') is False assert is_image_file('test.jpg'*2000) is False # length threshold # tag with attributes + assert handle_image(None) is None assert handle_image(html.fromstring('')) is not None assert handle_image(html.fromstring('text')) is not None assert handle_image(html.fromstring('')) is None @@ -494,6 +495,12 @@ def test_images(): assert '![Example image](test.jpg)' in extract(teststring, include_images=True, fast=True) assert '' in extract(teststring, include_images=True, fast=True, output_format='xml', config=ZERO_CONFIG) assert extract('
text
', include_images=True, fast=True) == '![a title text](test.jpg)' + assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' + assert extract('

text

', include_images=True, fast=True) == '' + assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' + assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' + + assert handle_image(html.fromstring('text')) is None # CNN example mydoc = html.fromstring('Harry and Meghan last March, in their final royal engagement.') diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index af855ee2..d78734fe 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -20,7 +20,7 @@ MANUALLY_CLEANED, MANUALLY_STRIPPED, ) -from .utils import textfilter, trim +from .utils import textfilter, trim, is_image_element from .xml import META_ATTRIBUTES, delete_element @@ -226,6 +226,8 @@ def handle_textnode( preserve_spaces: bool = False, ) -> Optional[_Element]: "Convert, format, and probe potential text elements." + if elem.tag == "graphic" and is_image_element(elem): + return elem if elem.tag == "done" or (len(elem) == 0 and not elem.text and not elem.tail): return None diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py index eb50338e..2a950bec 100644 --- a/trafilatura/main_extractor.py +++ b/trafilatura/main_extractor.py @@ -331,6 +331,11 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr # else: # newsub.tail = processed_child.text newsub.text, newsub.tail = processed_child.text, processed_child.tail + + if processed_child.tag == 'graphic': + image_elem = handle_image(processed_child) + if image_elem is not None: + newsub = image_elem processed_element.append(newsub) child.tag = "done" # finish @@ -437,8 +442,11 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac return None -def handle_image(element: _Element) -> Optional[_Element]: +def handle_image(element: Optional[_Element]) -> Optional[_Element]: "Process image elements and their relevant attributes." + if element is None: + return None + processed_element = Element(element.tag) for attr in ("data-src", "src"): diff --git a/trafilatura/utils.py b/trafilatura/utils.py index 7db53889..8cb09793 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -346,6 +346,20 @@ def trim(string: str) -> str: return "" +def is_image_element(element: _Element) -> bool: + '''Check if an element is a valid img element''' + for attr in ("data-src", "src"): + src = element.get(attr, "") + if is_image_file(src): + return True + else: + # take the first corresponding attribute + for attr, value in element.attrib.items(): + if attr.startswith("data-src") and is_image_file(value): + return True + return False + + def is_image_file(imagesrc: Optional[str]) -> bool: '''Check if the observed string corresponds to a valid image extension. Use a length threshold and apply a regex on the content.'''