Skip to content

Commit

Permalink
logging: better debug messages in main_extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Oct 7, 2024
1 parent 99d3573 commit d15870a
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
link_density_test_tables, process_node,
prune_unwanted_nodes)
from .settings import TAG_CATALOG
from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test
from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
from .xml import delete_element
from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
Expand All @@ -33,6 +33,12 @@
NOT_AT_THE_END = {'head', 'ref'}


def _log_event(msg, tag, text):
    """Format extraction event for debugging purposes.

    Emits a single DEBUG record of the form ``"<msg>: <tag> <text>"`` on the
    module logger. The text is passed through ``trim`` first and is shown as
    the literal ``None`` when the trimmed result is empty.

    Args:
        msg: Short description of the event.
        tag: Tag of the element the event refers to.
        text: Text to display; may be ``None``, ``str`` or ``bytes``.
    """
    # Skip the trimming and formatting work unless DEBUG output is enabled.
    if not LOGGER.isEnabledFor(logging.DEBUG):
        return
    # Serialized markup can arrive as bytes (NOTE(review): presumably from the
    # tostring() call sites -- confirm); decode it so it is trimmed and
    # displayed as text instead of being handled as a bytes object.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8", errors="replace")
    LOGGER.debug(f"{msg}: {tag} {trim(text) or 'None'}")

Check warning on line 39 in trafilatura/main_extractor.py

View check run for this annotation

Codecov / codecov/patch

trafilatura/main_extractor.py#L39

Added line #L39 was not covered by tests


def handle_titles(element, options):
'''Process head elements (titles)'''
if len(element) == 0:
Expand Down Expand Up @@ -241,7 +247,7 @@ def handle_other_elements(element, potential_tags, options):
# delete unwanted
if element.tag not in potential_tags:
if element.tag != "done":
LOGGER.debug("discarding element: %s %s", element.tag, element.text)
_log_event("discarding element", element.tag, element.text)
return None

if element.tag == "div":
Expand Down Expand Up @@ -282,8 +288,7 @@ def handle_paragraphs(element, potential_tags, options):
if processed_child is not None:
# todo: needing attention!
if processed_child.tag == "p":
LOGGER.debug("extra p within p: %s %s %s", processed_child.tag, processed_child.text,
processed_child.tail)
_log_event("extra p", "p", processed_child.text or "")
if processed_element.text:
processed_element.text += " " + processed_child.text
else:
Expand Down Expand Up @@ -335,7 +340,7 @@ def handle_paragraphs(element, potential_tags, options):
return processed_element
if processed_element.text:
return processed_element
LOGGER.debug("discarding p-child: %s", tostring(processed_element))
_log_event("discarding element:", "p", tostring(processed_element))
return None


Expand Down

0 comments on commit d15870a

Please sign in to comment.