diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index eb90ae35..c0b39a85 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -171,6 +171,8 @@ def test_exotic_tags(xmloutput=False):
element.append(etree.Element('lb'))
converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
assert etree.tostring(converted) == b'
1st part. 2nd part.
'
+ # naked div containing only text separated by <lb> line breaks
+ assert '1.\n2.\n3.' in extract('1.
2.
3.
', no_fallback=True, config=ZERO_CONFIG)
# malformed lists (common error)
result = etree.tostring(handle_lists(etree.fromstring('Description of the list:- List item 1
- List item 2
- List item 3
'), False, ZERO_CONFIG))
assert result.count(b'List item') == 3
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 44d878f6..d937e497 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -537,10 +537,14 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
if 'span' not in potential_tags:
strip_tags(subtree, 'span')
LOGGER.debug(sorted(potential_tags))
- ##strip_tags(subtree, 'lb') # BoingBoing-Bug
+ # collect all descendant elements of the subtree as extraction candidates
+ subelems = subtree.xpath('.//*')
+ # if the descendants are exclusively lb elements (e.g. a bare div with line breaks), process the subtree root itself
+ if set(e.tag for e in subelems) == {'lb'}:
+ subelems = [subtree]
# extract content # list(filter(None.__ne__, processed_elems)) ?
result_body.extend(e for e in
- [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
+ [handle_textelem(e, potential_tags, deduplicate, config) for e in subelems]
if e is not None)
# remove trailing titles
while len(result_body) > 0 and (result_body[-1].tag in NOT_AT_THE_END):