diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 9990a107..7e9beae7 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -483,6 +483,7 @@ def test_images():
assert is_image_file('test.txt') is False
assert is_image_file('test.jpg'*2000) is False # length threshold
# tag with attributes
+ assert handle_image(None) is None
assert handle_image(html.fromstring('')) is not None
assert handle_image(html.fromstring('')) is not None
assert handle_image(html.fromstring('')) is None
@@ -494,6 +495,12 @@ def test_images():
assert '![Example image](test.jpg)' in extract(teststring, include_images=True, fast=True)
assert '
' in extract(teststring, include_images=True, fast=True, output_format='xml', config=ZERO_CONFIG)
assert extract('
', include_images=True, fast=True) == '![a title text](test.jpg)'
+ assert extract('', include_images=True, fast=True) == '![a title text](test.jpg)'
+ assert extract('', include_images=True, fast=True) == ''
+ assert extract('', include_images=True, fast=True) == '![a title text](test.jpg)'
+ assert extract('', include_images=True, fast=True) == '![a title text](test.jpg)'
+
+ assert handle_image(html.fromstring('')) is None
# CNN example
mydoc = html.fromstring('')
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index af855ee2..d78734fe 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -20,7 +20,7 @@
MANUALLY_CLEANED,
MANUALLY_STRIPPED,
)
-from .utils import textfilter, trim
+from .utils import textfilter, trim, is_image_element
from .xml import META_ATTRIBUTES, delete_element
@@ -226,6 +226,8 @@ def handle_textnode(
preserve_spaces: bool = False,
) -> Optional[_Element]:
"Convert, format, and probe potential text elements."
+ if elem.tag == "graphic" and is_image_element(elem):
+ return elem
if elem.tag == "done" or (len(elem) == 0 and not elem.text and not elem.tail):
return None
diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
index eb50338e..2a950bec 100644
--- a/trafilatura/main_extractor.py
+++ b/trafilatura/main_extractor.py
@@ -331,6 +331,11 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr
# else:
# newsub.tail = processed_child.text
newsub.text, newsub.tail = processed_child.text, processed_child.tail
+
+ if processed_child.tag == 'graphic':
+ image_elem = handle_image(processed_child)
+ if image_elem is not None:
+ newsub = image_elem
processed_element.append(newsub)
child.tag = "done"
# finish
@@ -437,8 +442,11 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
return None
-def handle_image(element: _Element) -> Optional[_Element]:
+def handle_image(element: Optional[_Element]) -> Optional[_Element]:
"Process image elements and their relevant attributes."
+ if element is None:
+ return None
+
processed_element = Element(element.tag)
for attr in ("data-src", "src"):
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 7db53889..8cb09793 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -346,6 +346,20 @@ def trim(string: str) -> str:
return ""
+def is_image_element(element: _Element) -> bool:
+    '''Check if an element is a valid img element.
+
+    Return True when the "data-src" or "src" attribute, or failing that any
+    attribute whose name starts with "data-src", holds a value accepted by
+    is_image_file(); return False otherwise.'''
+    # preferred attributes first; a missing attribute is probed as ""
+    if any(is_image_file(element.get(name, "")) for name in ("data-src", "src")):
+        return True
+    # fall back to any data-src* variant carried by the element
+    return any(
+        name.startswith("data-src") and is_image_file(value)
+        for name, value in element.attrib.items()
+    )
+
+
def is_image_file(imagesrc: Optional[str]) -> bool:
'''Check if the observed string corresponds to a valid image extension.
Use a length threshold and apply a regex on the content.'''