extraction: refine img src url and fix table extraction bugs (#762)
Co-authored-by: CodyInnowhere <[email protected]>
unsleepy22 and CodyInnowhere authored Dec 6, 2024
1 parent 76200b7 commit 7067937
Showing 3 changed files with 71 additions and 17 deletions.
39 changes: 39 additions & 0 deletions tests/unit_tests.py
@@ -499,6 +499,13 @@ def test_images():
assert extract('<html><body><article><p><img other="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == ''
assert extract('<html><body><article><div><p><img data-src="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><div><p><img data-src-small="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><div><p><img src="https://a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](https://a.b/test.jpg)'

url = 'http://a.b/c/d.html'
assert extract('<html><body><article><div><p><img src="//a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="/a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="./a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/c/a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="../a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)'

assert handle_image(html.fromstring('<img src="data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" alt="text"></img>')) is None
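The relative-src assertions above follow plain urljoin resolution against the page URL; as a standalone sanity check (not part of the test suite), the expected values are:

from urllib.parse import urljoin
base = 'http://a.b/c/d.html'
for src in ('//a.b/test.jpg', '/a.b/test.jpg', './a.b/test.jpg', '../a.b/test.jpg'):
    print(urljoin(base, src))
# -> http://a.b/test.jpg
# -> http://a.b/a.b/test.jpg
# -> http://a.b/c/a.b/test.jpg
# -> http://a.b/a.b/test.jpg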

@@ -1187,6 +1194,38 @@ def test_table_processing():
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == ""

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
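The expected strings above pad short rows out to the table's maximum column count; a rough illustration of that layout (a simplified sketch, not trafilatura's rendering code):

def render_row(cells, max_cols):
    # pad the row with empty cells so every row lines up with max_cols
    padded = cells + [''] * (max_cols - len(cells))
    return '|' + '|'.join(f' {c} ' if c else ' ' for c in padded) + '|'

print(render_row(['a', 'b', 'c'], 3))  # | a | b | c |
print(render_row(['a', 'b c'], 3))     # | a | b c | |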


def test_list_processing():
options = DEFAULT_OPTIONS
32 changes: 22 additions & 10 deletions trafilatura/main_extractor.py
@@ -8,6 +8,7 @@

from copy import deepcopy
from typing import Any, Optional, Tuple, Set, Union
from urllib.parse import urljoin

from lxml.etree import _Element, Element, SubElement, strip_elements, strip_tags, tostring
from lxml.html import HtmlElement
@@ -333,7 +334,7 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr
newsub.text, newsub.tail = processed_child.text, processed_child.tail

if processed_child.tag == 'graphic':
image_elem = handle_image(processed_child)
image_elem = handle_image(processed_child, options)
if image_elem is not None:
newsub = image_elem
processed_element.append(newsub)
@@ -367,10 +368,16 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
# strip these structural elements
strip_tags(table_elem, "thead", "tbody", "tfoot")

# calculate maximum number of columns per row, includin colspan
# calculate maximum number of columns per row, including colspan
max_cols = 0
diff_colspans = set()
for tr in table_elem.iter('tr'):
max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))
total_colspans = 0
for td in tr.iter(TABLE_ELEMS):
colspan = int(td.get("colspan", 1))
diff_colspans.add(colspan)
total_colspans += colspan
max_cols = max(max_cols, total_colspans)

# explore sub-elements
seen_header_row = False
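On the colspan fixture from the tests above, the new per-row accounting works out as follows (standalone sketch of the arithmetic only):

rows = [[1, 1, 1],  # <td>a</td><td>b</td><td>c</td>
        [1, 2]]     # <td>a</td><td colspan="2">...</td>
max_cols = max(sum(row) for row in rows)                 # 3
diff_colspans = {span for row in rows for span in row}   # {1, 2}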
@@ -431,8 +438,9 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
# cleanup
subelement.tag = "done"

# clean up row attributes
newrow.attrib.pop("span", None)
# clean up row attributes only when all cells in table share the same colspan
if len(diff_colspans) == 1:
newrow.attrib.pop("span", None)

# end of processing
if len(newrow) > 0:
@@ -442,7 +450,7 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
return None


def handle_image(element: Optional[_Element]) -> Optional[_Element]:
def handle_image(element: Optional[_Element], options: Optional[Extractor] = None) -> Optional[_Element]:
"Process image elements and their relevant attributes."
if element is None:
return None
@@ -472,9 +480,13 @@ def handle_image(element: Optional[_Element]) -> Optional[_Element]:
return None

# post-processing: URLs
src_attr = processed_element.get("src", "")
if not src_attr.startswith("http"):
processed_element.set("src", re.sub(r"^//", "http://", src_attr))
link = processed_element.get("src", "")
if not link.startswith("http"):
if options is not None and options.url is not None:
link = urljoin(options.url, link)
else:
link = re.sub(r"^//", "http://", link)
processed_element.set("src", link)

return processed_element
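End-to-end, relative image links now resolve against the url= argument when one is given; a minimal usage sketch (hypothetical document and URL, output shown by analogy with the tests above):

import trafilatura

doc = '<html><body><article><div><p><img src="../pic.jpg" alt="text" title="a title"/></p></div></article></body></html>'
# with url= the relative src is joined against the page URL;
# without it, only protocol-relative '//' links get an 'http://' prefix
print(trafilatura.extract(doc, url='http://example.com/a/b.html', include_images=True, fast=True))
# expected, mirroring the tests: ![a title text](http://example.com/pic.jpg)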

@@ -502,7 +514,7 @@ def handle_textelem(element: _Element, potential_tags: Set[str], options: Extrac
elif element.tag == 'table' and 'table' in potential_tags:
new_element = handle_table(element, potential_tags, options)
elif element.tag == 'graphic' and 'graphic' in potential_tags:
new_element = handle_image(element)
new_element = handle_image(element, options)
else:
# other elements (div, ??, ??)
new_element = handle_other_elements(element, potential_tags, options)
17 changes: 10 additions & 7 deletions trafilatura/xml.py
@@ -285,12 +285,15 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# cells
if element.tag == "cell" and elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif element.tag == 'cell' and elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
if element.tag == "cell":
elem_text = elem_text.strip()

if elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
@@ -348,7 +351,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
if element.tail:
returnlist.append(element.tail)
returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail)
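Stripping the tail only for cells keeps whitespace left behind by nested paragraphs from leaking newlines into the single-line row; roughly (standalone illustration, not the library's code):

tail = '\n        '  # whitespace tail left after a nested <p> inside a <cell>
parts = ['| a ', '| b c ', tail, '|']
print(repr(''.join(parts)))                                          # '| a | b c \n        |'
print(repr(''.join(p.strip() if p is tail else p for p in parts)))   # '| a | b c |'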


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
