extraction: refine img src url and fix table extraction bugs (#762)
Co-authored-by: CodyInnowhere <[email protected]>
unsleepy22 and CodyInnowhere authored Dec 6, 2024
1 parent 76200b7 commit 7067937
Showing 3 changed files with 71 additions and 17 deletions.
39 changes: 39 additions & 0 deletions tests/unit_tests.py
@@ -499,6 +499,13 @@ def test_images():
assert extract('<html><body><article><p><img other="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == ''
assert extract('<html><body><article><div><p><img data-src="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><div><p><img data-src-small="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><div><p><img src="https://a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](https://a.b/test.jpg)'

url = 'http://a.b/c/d.html'
assert extract('<html><body><article><div><p><img src="//a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="/a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="./a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/c/a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="../a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)'

assert handle_image(html.fromstring('<img src="data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" alt="text"></img>')) is None
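The relative-src assertions above follow plain urljoin resolution against the page URL; as a standalone sanity check (not part of the test suite), the expected values are:

from urllib.parse import urljoin
base = 'http://a.b/c/d.html'
for src in ('//a.b/test.jpg', '/a.b/test.jpg', './a.b/test.jpg', '../a.b/test.jpg'):
    print(urljoin(base, src))
# -> http://a.b/test.jpg
# -> http://a.b/a.b/test.jpg
# -> http://a.b/c/a.b/test.jpg
# -> http://a.b/a.b/test.jpg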

@@ -1187,6 +1194,38 @@ def test_table_processing():
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == ""

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
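The expected strings above pad short rows out to the table's maximum column count; a rough illustration of that layout (a simplified sketch, not trafilatura's rendering code):

def render_row(cells, max_cols):
    # pad the row with empty cells so every row lines up with max_cols
    padded = cells + [''] * (max_cols - len(cells))
    return '|' + '|'.join(f' {c} ' if c else ' ' for c in padded) + '|'

print(render_row(['a', 'b', 'c'], 3))  # | a | b | c |
print(render_row(['a', 'b c'], 3))     # | a | b c | |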


def test_list_processing():
options = DEFAULT_OPTIONS
32 changes: 22 additions & 10 deletions trafilatura/main_extractor.py
@@ -8,6 +8,7 @@

from copy import deepcopy
from typing import Any, Optional, Tuple, Set, Union
from urllib.parse import urljoin

from lxml.etree import _Element, Element, SubElement, strip_elements, strip_tags, tostring
from lxml.html import HtmlElement
@@ -333,7 +334,7 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr
newsub.text, newsub.tail = processed_child.text, processed_child.tail

if processed_child.tag == 'graphic':
image_elem = handle_image(processed_child)
image_elem = handle_image(processed_child, options)
if image_elem is not None:
newsub = image_elem
processed_element.append(newsub)
@@ -367,10 +368,16 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
# strip these structural elements
strip_tags(table_elem, "thead", "tbody", "tfoot")

# calculate maximum number of columns per row, includin colspan
# calculate maximum number of columns per row, including colspan
max_cols = 0
diff_colspans = set()
for tr in table_elem.iter('tr'):
max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))
total_colspans = 0
for td in tr.iter(TABLE_ELEMS):
colspan = int(td.get("colspan", 1))
diff_colspans.add(colspan)
total_colspans += colspan
max_cols = max(max_cols, total_colspans)

# explore sub-elements
seen_header_row = False
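On the colspan fixture from the tests above, the new per-row accounting works out as follows (standalone sketch of the arithmetic only):

rows = [[1, 1, 1],  # <td>a</td><td>b</td><td>c</td>
        [1, 2]]     # <td>a</td><td colspan="2">...</td>
max_cols = max(sum(row) for row in rows)                 # 3
diff_colspans = {span for row in rows for span in row}   # {1, 2}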
@@ -431,8 +438,9 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
# cleanup
subelement.tag = "done"

# clean up row attributes
newrow.attrib.pop("span", None)
# clean up row attributes only when all cells in table share the same colspan
if len(diff_colspans) == 1:
newrow.attrib.pop("span", None)

# end of processing
if len(newrow) > 0:
@@ -442,7 +450,7 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
return None


def handle_image(element: Optional[_Element]) -> Optional[_Element]:
def handle_image(element: Optional[_Element], options: Optional[Extractor] = None) -> Optional[_Element]:
"Process image elements and their relevant attributes."
if element is None:
return None
@@ -472,9 +480,13 @@ def handle_image(element: Optional[_Element]) -> Optional[_Element]:
return None

# post-processing: URLs
src_attr = processed_element.get("src", "")
if not src_attr.startswith("http"):
processed_element.set("src", re.sub(r"^//", "http://", src_attr))
link = processed_element.get("src", "")
if not link.startswith("http"):
if options is not None and options.url is not None:
link = urljoin(options.url, link)
else:
link = re.sub(r"^//", "http://", link)
processed_element.set("src", link)

return processed_element
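End-to-end, relative image links now resolve against the url= argument when one is given; a minimal usage sketch (hypothetical document and URL, output shown by analogy with the tests above):

import trafilatura

doc = '<html><body><article><div><p><img src="../pic.jpg" alt="text" title="a title"/></p></div></article></body></html>'
# with url= the relative src is joined against the page URL;
# without it, only protocol-relative '//' links get an 'http://' prefix
print(trafilatura.extract(doc, url='http://example.com/a/b.html', include_images=True, fast=True))
# expected, mirroring the tests: ![a title text](http://example.com/pic.jpg)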

@@ -502,7 +514,7 @@ def handle_textelem(element: _Element, potential_tags: Set[str], options: Extrac
elif element.tag == 'table' and 'table' in potential_tags:
new_element = handle_table(element, potential_tags, options)
elif element.tag == 'graphic' and 'graphic' in potential_tags:
new_element = handle_image(element)
new_element = handle_image(element, options)
else:
# other elements (div, ??, ??)
new_element = handle_other_elements(element, potential_tags, options)
17 changes: 10 additions & 7 deletions trafilatura/xml.py
@@ -285,12 +285,15 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# cells
if element.tag == "cell" and elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif element.tag == 'cell' and elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
if element.tag == "cell":
elem_text = elem_text.strip()

if elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
@@ -348,7 +351,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
if element.tail:
returnlist.append(element.tail)
returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail)
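Stripping the tail only for cells keeps whitespace left behind by nested paragraphs from leaking newlines into the single-line row; roughly (standalone illustration, not the library's code):

tail = '\n        '  # whitespace tail left after a nested <p> inside a <cell>
parts = ['| a ', '| b c ', tail, '|']
print(repr(''.join(parts)))                                          # '| a | b c \n        |'
print(repr(''.join(p.strip() if p is tail else p for p in parts)))   # '| a | b c |'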


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
