diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
index d77ee2fb..5712b485 100644
--- a/trafilatura/main_extractor.py
+++ b/trafilatura/main_extractor.py
@@ -15,7 +15,7 @@
                              link_density_test_tables, process_node,
                              prune_unwanted_nodes)
 from .settings import TAG_CATALOG
-from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test
+from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
 from .xml import delete_element
 from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
                      DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
@@ -33,6 +33,18 @@
 NOT_AT_THE_END = {'head', 'ref'}
 
 
+def _log_event(msg, tag, text):
+    """Log a debug message of the form "<msg>: <tag> <text>".
+
+    `text` may be None or bytes (e.g. serialized markup); it is decoded if
+    needed, passed through `trim`, and shown as 'None' when nothing is left.
+    """
+    if LOGGER.isEnabledFor(logging.DEBUG):
+        if isinstance(text, bytes):
+            text = text.decode("utf-8", errors="replace")
+        LOGGER.debug("%s: %s %s", msg, tag, trim(text or "") or "None")
+
+
 def handle_titles(element, options):
     '''Process head elements (titles)'''
     if len(element) == 0:
@@ -241,7 +253,7 @@ def handle_other_elements(element, potential_tags, options):
     # delete unwanted
     if element.tag not in potential_tags:
         if element.tag != "done":
-            LOGGER.debug("discarding element: %s %s", element.tag, element.text)
+            _log_event("discarding element", element.tag, element.text)
         return None
 
     if element.tag == "div":
@@ -282,8 +294,7 @@ def handle_paragraphs(element, potential_tags, options):
         if processed_child is not None:
             # todo: needing attention!
             if processed_child.tag == "p":
-                LOGGER.debug("extra p within p: %s %s %s", processed_child.tag, processed_child.text,
-                             processed_child.tail)
+                _log_event("extra p", "p", processed_child.text or "")
                 if processed_element.text:
                     processed_element.text += " " + processed_child.text
                 else:
@@ -335,7 +346,7 @@ def handle_paragraphs(element, potential_tags, options):
         return processed_element
     if processed_element.text:
         return processed_element
-    LOGGER.debug("discarding p-child: %s", tostring(processed_element))
+    _log_event("discarding element", "p", tostring(processed_element))
     return None
 
 