diff --git a/tests/unit_tests.py b/tests/unit_tests.py index eb90ae35..c0b39a85 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -171,6 +171,8 @@ def test_exotic_tags(xmloutput=False): element.append(etree.Element('lb')) converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG) assert etree.tostring(converted) == b'

1st part. 2nd part.

' + # naked div with only lb-elems + assert '1.\n2.\n3.' in extract('
1.
2.
3.
', no_fallback=True, config=ZERO_CONFIG) # malformed lists (common error) result = etree.tostring(handle_lists(etree.fromstring('Description of the list:List item 1List item 2List item 3'), False, ZERO_CONFIG)) assert result.count(b'List item') == 3 diff --git a/trafilatura/core.py b/trafilatura/core.py index 44d878f6..d937e497 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -537,10 +537,14 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab if 'span' not in potential_tags: strip_tags(subtree, 'span') LOGGER.debug(sorted(potential_tags)) - ##strip_tags(subtree, 'lb') # BoingBoing-Bug + # proper extraction + subelems = subtree.xpath('.//*') + # e.g. only lb-elems in a div + if set(e.tag for e in subelems) == {'lb'}: + subelems = [subtree] # extract content # list(filter(None.__ne__, processed_elems)) ? result_body.extend(e for e in - [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')] + [handle_textelem(e, potential_tags, deduplicate, config) for e in subelems] if e is not None) # remove trailing titles while len(result_body) > 0 and (result_body[-1].tag in NOT_AT_THE_END):