From 658ee6e89ed3194b6b4f1496182f1c97301795bd Mon Sep 17 00:00:00 2001 From: Elwin Date: Thu, 17 Mar 2022 10:44:37 +0000 Subject: [PATCH] Improvements for Chinese web pages (#186) Co-authored-by: Adrien Barbaresi --- .gitignore | 3 ++ setup.py | 11 +++--- trafilatura/core.py | 75 +++++++++++++++++++++++---------------- trafilatura/filters.py | 6 +++- trafilatura/metaxpaths.py | 2 +- trafilatura/xpaths.py | 19 ++++++---- 6 files changed, 71 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 52e05234..abb51da2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,10 +8,12 @@ dist/ build/ *.egg-info/ +.idea/ # tests .cache/ .eggs/ +.pytest_cache/ .tox/ .coverage @@ -30,3 +32,4 @@ Pipfile* # older stuff old/ + diff --git a/setup.py b/setup.py index 7cde2823..e99ad1aa 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,6 @@ from setuptools import setup - def get_version(package): "Return package version as listed in `__version__` in `init.py`" # version = Path(package, '__init__.py').read_text() # Python >= 3.5 @@ -21,8 +20,8 @@ def get_long_description(): "Return the README" with open('README.rst', 'r', encoding='utf-8') as filehandle: long_description = filehandle.read() - #long_description += "\n\n" - #with open("CHANGELOG.md", encoding="utf8") as f: + # long_description += "\n\n" + # with open("CHANGELOG.md", encoding="utf8") as f: # long_description += f.read() return long_description @@ -31,7 +30,7 @@ def get_long_description(): extras = { 'all': [ 'cchardet >= 2.1.7', - 'htmldate[speed] >= 1.1.1', + 'htmldate[speed] >= 1.2.0', 'py3langid >= 0.2.0', 'pycurl >= 7.44.1', 'urllib3[brotli]', @@ -95,13 +94,13 @@ def get_long_description(): 'certifi', 'charset_normalizer >= 2.0.12', 'courlan >= 0.6.0', - 'htmldate >= 1.1.1', + 'htmldate >= 1.2.0', 'justext >= 3.0.0', 'lxml >= 4.6.4', 'urllib3 >= 1.26, < 2', ], extras_require=extras, - entry_points = { + entry_points={ 'console_scripts': [ 'trafilatura=trafilatura.cli:main', 'trafilatura_gui=trafilatura.gui:main', diff --git a/trafilatura/core.py b/trafilatura/core.py index 171334c6..034d2e66 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -20,7 +20,7 @@ # own from .external import justext_rescue, sanitize_tree, SANITIZED_XPATH, try_readability from .filters import (check_html_lang, content_fingerprint, duplicate_test, - language_filter, text_chars_test) + language_filter, text_chars_test) from .htmlprocessing import (convert_tags, handle_textnode, link_density_test, link_density_test_tables, process_node, prune_unwanted_nodes, tree_cleaning) @@ -33,7 +33,6 @@ ADDITIONAL_DISCARD_XPATH, PRECISION_DISCARD_XPATH, DISCARD_IMAGE_ELEMENTS, REMOVE_COMMENTS_XPATH) - LOGGER = logging.getLogger(__name__) FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'td'} @@ -143,7 +142,8 @@ def handle_lists(element, dedupbool, config): if processed_subchild is not None: newchildelem.append(processed_subchild) else: - processed_subchild = handle_textnode(subelem, comments_fix=False, deduplicate=dedupbool, config=config) + processed_subchild = handle_textnode(subelem, comments_fix=False, deduplicate=dedupbool, + config=config) # add child element to processed_element if processed_subchild is not None: subchildelem = SubElement(newchildelem, processed_subchild.tag) @@ -165,7 +165,7 @@ def handle_quotes(element, dedupbool, config): '''Process quotes elements''' processed_element = Element(element.tag) for child in element.iter('*'): - processed_child = process_node(child, dedupbool, config) # handle_textnode(child, 
comments_fix=True) + processed_child = process_node(child, dedupbool, config) # handle_textnode(child, comments_fix=True) if processed_child is not None: newsub = SubElement(processed_element, child.tag) newsub.text, newsub.tail = processed_child.text, processed_child.tail @@ -218,11 +218,13 @@ def handle_paragraphs(element, potential_tags, dedupbool, config): continue # spacing = child.tag in SPACING_PROTECTED # todo: outputformat.startswith('xml')? # todo: act on spacing here? - processed_child = handle_textnode(child, comments_fix=False, deduplicate=dedupbool, preserve_spaces=True, config=config) + processed_child = handle_textnode(child, comments_fix=False, deduplicate=dedupbool, preserve_spaces=True, + config=config) if processed_child is not None: # todo: needing attention! if processed_child.tag == 'p': - LOGGER.debug('extra p within p: %s %s %s', processed_child.tag, processed_child.text, processed_child.tail) + LOGGER.debug('extra p within p: %s %s %s', processed_child.tag, processed_child.text, + processed_child.tail) if processed_element.text: processed_element.text += ' ' + processed_child.text else: @@ -268,8 +270,8 @@ def handle_paragraphs(element, potential_tags, dedupbool, config): if len(processed_element) > 0: # clean trailing lb-elements if ( - processed_element[-1].tag == 'lb' - and processed_element[-1].tail is None + processed_element[-1].tag == 'lb' + and processed_element[-1].tail is None ): processed_element[-1].getparent().remove(processed_element[-1]) return processed_element @@ -279,7 +281,6 @@ def handle_paragraphs(element, potential_tags, dedupbool, config): return None - def define_cell_type(element): '''Determine cell element type and mint new element''' # define tag @@ -319,7 +320,8 @@ def handle_table(table_elem, potential_tags, dedupbool, config): if child.tag in TABLE_ELEMS: # subcell_elem = define_cell_type(subelement) child.tag = 'cell' - processed_subchild = handle_textnode(child, preserve_spaces=True, comments_fix=True, deduplicate=dedupbool, config=config) + processed_subchild = handle_textnode(child, preserve_spaces=True, comments_fix=True, + deduplicate=dedupbool, config=config) # todo: lists in table cells else: # subcell_elem = Element(child.tag) @@ -373,7 +375,8 @@ def handle_image(element): return processed_element -def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=False, potential_tags=TAG_CATALOG, deduplicate=True, config=None): +def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=False, potential_tags=TAG_CATALOG, + deduplicate=True, config=None): '''Look for all previously unconsidered wild elements, including outside of the determined frame and throughout the document to recover potentially missing text parts''' LOGGER.debug('Recovering wild text elements') @@ -397,10 +400,10 @@ def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=Fal else: strip_tags(search_tree, 'span') result_body.extend(e for e in - [handle_textelem( - element, potential_tags, deduplicate, config) - for element in search_tree.iter(search_list)] - if e is not None) + [handle_textelem( + element, potential_tags, deduplicate, config) + for element in search_tree.iter(search_list)] + if e is not None) return result_body @@ -423,7 +426,7 @@ def handle_textelem(element, potential_tags, dedupbool, config): new_element = Element('p') new_element.text = element.tail elif element.tag in FORMATTING: - new_element = handle_formatting(element, dedupbool, config) # process_node(element, dedupbool, config) + 
new_element = handle_formatting(element, dedupbool, config) # process_node(element, dedupbool, config) elif element.tag == 'table' and 'table' in potential_tags: new_element = handle_table(element, potential_tags, dedupbool, config) elif element.tag == 'graphic' and 'graphic' in potential_tags: @@ -461,7 +464,8 @@ def delete_by_link_density(subtree, tagname, backtracking=False): return subtree -def extract_content(tree, favor_precision=False, favor_recall=False, include_tables=False, include_images=False, include_links=False, deduplicate=False, config=None): +def extract_content(tree, favor_precision=False, favor_recall=False, include_tables=False, include_images=False, + include_links=False, deduplicate=False, config=None): '''Find the main content of a page using a set of XPath expressions, then extract relevant elements, strip them of unwanted subparts and convert them''' @@ -495,7 +499,7 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab subtree = delete_by_link_density(subtree, 'div', backtracking=True) subtree = delete_by_link_density(subtree, 'list', backtracking=False) subtree = delete_by_link_density(subtree, 'p', backtracking=False) - #subtree = delete_by_link_density(subtree, 'head', backtracking=False) + # subtree = delete_by_link_density(subtree, 'head', backtracking=False) # also filter fw/head, table and quote elements? if favor_precision is True: subtree = delete_by_link_density(subtree, 'head', backtracking=False) @@ -526,8 +530,8 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab ##strip_tags(subtree, 'lb') # BoingBoing-Bug # extract content # list(filter(None.__ne__, processed_elems)) ? result_body.extend(e for e in - [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')] - if e is not None) + [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')] + if e is not None) # remove trailing titles while len(result_body) > 0 and (result_body[-1].tag in NOT_AT_THE_END): result_body[-1].getparent().remove(result_body[-1]) @@ -539,7 +543,9 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab # try parsing wild
<p>
elements if nothing found or text too short # todo: test precision and recall settings here if len(result_body) == 0 or len(temp_text) < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'): - result_body = recover_wild_text(backup_tree, result_body, favor_precision=favor_precision, favor_recall=favor_recall, potential_tags=potential_tags, deduplicate=deduplicate, config=config) + result_body = recover_wild_text(backup_tree, result_body, favor_precision=favor_precision, + favor_recall=favor_recall, potential_tags=potential_tags, + deduplicate=deduplicate, config=config) temp_text = trim(' '.join(result_body.itertext())) # filter output strip_elements(result_body, 'done') @@ -583,7 +589,8 @@ def extract_comments(tree, dedupbool, config): # processed_elem = process_comments_node(elem, potential_tags) # if processed_elem is not None: # comments_body.append(processed_elem) - processed_elems = (process_comments_node(elem, potential_tags, dedupbool, config) for elem in subtree.xpath('.//*')) + processed_elems = (process_comments_node(elem, potential_tags, dedupbool, config) for elem in + subtree.xpath('.//*')) comments_body.extend(elem for elem in processed_elems if elem is not None) # control if len(comments_body) > 0: # if it has children @@ -596,11 +603,12 @@ def extract_comments(tree, dedupbool, config): return comments_body, temp_comments, len(temp_comments), tree -def compare_extraction(tree, backup_tree, url, body, text, len_text, target_language, favor_precision, favor_recall, include_formatting, include_links, include_images, include_tables, config): +def compare_extraction(tree, backup_tree, url, body, text, len_text, target_language, favor_precision, favor_recall, + include_formatting, include_links, include_images, include_tables, config): '''Decide whether to choose own or external extraction based on a series of heuristics''' # bypass for recall - if favor_recall is True and len_text > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')*10: + if favor_recall is True and len_text > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 10: return body, text, len_text algo_flag, jt_result = False, False # prior cleaning @@ -625,7 +633,8 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, target_lang else: if not body.xpath('//p//text()') and len_algo > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 2: algo_flag = True - elif len(body.xpath('//table')) > len(body.xpath('//p')) and len_algo > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 2: + elif len(body.xpath('//table')) > len(body.xpath('//p')) and len_algo > config.getint('DEFAULT', + 'MIN_EXTRACTED_SIZE') * 2: algo_flag = True else: LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, url) @@ -822,7 +831,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, if only_with_metadata is True and any( x is None for x in [document.date, document.title, document.url] - ): + ): LOGGER.error('no metadata for URL %s', url) raise ValueError else: @@ -841,18 +850,23 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # comments first, then remove if include_comments is True: - commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, deduplicate, config) + commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, deduplicate, + config) else: commentsbody, temp_comments, len_comments = None, '', 0 if favor_precision is True: cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH) # extract content - postbody, 
temp_text, len_text = extract_content(cleaned_tree, favor_precision, favor_recall, include_tables, include_images, include_links, deduplicate, config) + postbody, temp_text, len_text = extract_content(cleaned_tree, favor_precision, favor_recall, include_tables, + include_images, include_links, deduplicate, config) # compare if necessary if no_fallback is False: - postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, url, postbody, temp_text, len_text, target_language, favor_precision, favor_recall, include_formatting, include_links, include_images, include_tables, config) + postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, url, postbody, + temp_text, len_text, target_language, favor_precision, + favor_recall, include_formatting, include_links, + include_images, include_tables, config) # add baseline as additional fallback # rescue: try to use original/dirty tree # and favor_precision is False=? if len_text < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'): @@ -872,7 +886,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # size checks if len_comments < config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE'): LOGGER.info('not enough comments %s', url) - if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT', 'MIN_OUTPUT_COMM_SIZE'): + if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT', + 'MIN_OUTPUT_COMM_SIZE'): LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments) raise ValueError diff --git a/trafilatura/filters.py b/trafilatura/filters.py index 2809859e..75a4b4c4 100644 --- a/trafilatura/filters.py +++ b/trafilatura/filters.py @@ -27,7 +27,11 @@ RE_HTML_LANG = re.compile(r'([a-z]{2})', re.I) -RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|Linkedin|Mail|PDF|Pinterest|Pocket|Print|Reddit|Twitter|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$', flags=re.IGNORECASE) +# Mostly filters for social media +RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|' + 'Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|' + 'WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$', + flags=re.IGNORECASE) # COMMENTS_BLACKLIST = ('( Abmelden / Ă„ndern )') # Fill in your details below|Trage deine Daten unten|Kommentar verfassen|Bitte logge dich|Hinterlasse einen Kommentar| to %s| mit %s) diff --git a/trafilatura/metaxpaths.py b/trafilatura/metaxpaths.py index f263a854..bede202b 100644 --- a/trafilatura/metaxpaths.py +++ b/trafilatura/metaxpaths.py @@ -10,7 +10,7 @@ author_xpaths = [ '//*[(self::a or self::address or self::link or self::p or self::span or self::strong)][@rel="author" or @id="author" or @class="author" or @itemprop="author name" or rel="me"]|//author', # specific '//*[(self::a or self::div or self::span or self::p or self::strong)][contains(@class, "author-name") or contains(@class, "AuthorName") or contains(@class, "authorName") or contains(@class, "author name")]', # almost specific - '//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline"]', # almost generic + '//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline" or contains(@id, "zuozhe") or 
contains(@class, "zuozhe") or contains(@id, "bianji") or contains(@class, "bianji") or contains(@id, "xiaobian") or contains(@class, "xiaobian")]', # almost generic '//*[(self::a or self::div or self::span or self::p)][contains(@class, "authors") or contains(@class, "byline") or contains(@class, "ByLine") or contains(@class, "submitted-by") or contains(@class, "posted-by")]', # generic '//*[contains(@class, "author") or contains(@class, "Author") or contains(@id, "Author") or contains(@class, "screenname") or contains(@data-component, "Byline") or contains(@itemprop, "author") or contains(@class, "writer") or contains(@class, "byline")]', # any element '//*[(self::a or self::span)][@class="username" or @class="BBL"]', # not common diff --git a/trafilatura/xpaths.py b/trafilatura/xpaths.py index ed0e847c..be0d8b13 100644 --- a/trafilatura/xpaths.py +++ b/trafilatura/xpaths.py @@ -101,9 +101,8 @@ OVERALL_DISCARD_XPATH = [ '''.//*[contains(@id, "footer") or contains(@class, "footer") or contains(@id, "bottom") or contains(@class, "bottom")]''', + # related posts, sharing jp-post-flair jp-relatedposts, news outlets + navigation - #'''.//*[ - # self::link '''.//*[(self::div or self::item or self::list or self::p or self::section or self::span)][ contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or @@ -122,10 +121,9 @@ contains(@class, "sidebar") or contains(@id, "banner") or contains(@class, "banner") or contains(@class, "meta") or contains(@id, "menu") or contains(@class, "menu") or - starts-with(@id, "nav") or starts-with(@class, "nav") or - contains(@id, "navigation") or contains(translate(@class, "N","n"), "navigation") - or contains(@role, "navigation") or contains(@class, "navbar") - or contains(@class, "navbox") or starts-with(@class, "post-nav") + contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav") + or starts-with(@class, "nav") or contains(translate(@class, "N","n"), "navigation") or + contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav") or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or contains(@id, "author") or contains(@class, "author") or @@ -146,10 +144,14 @@ or contains(@class, " ad ") or contains(@class, "next-post") or contains(@class, "message-container") or contains(@id, "message_container") + or contains(@class, "yin") or contains(@class, "zlylin") or + contains(@class, "xg1") or contains(@id, "bmdh") or @data-lp-replacement-content]''', + # comment debris '''.//*[@class="comments-title" or contains(@class, "comments-title") or contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or contains(@class, "-reply-") or contains(@class, "message") or contains(@id, "akismet") or contains(@class, "akismet")]''', + # hidden '''.//*[starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden") or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint") or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true" or contains(@class, "notloaded")]''', @@ -158,7 +160,9 @@ # contains(@id, "header") or contains(@class, "header") or # class contains "cats" (categories, also tags?) 
# or contains(@class, "hidden ") or contains(@class, "-hide") -# or contains(@class, "paywall") +# or contains(@class, "paywall") +# contains(@class, "content-info") or contains(@class, "content-title") +# contains(translate(@class, "N", "n"), "nav") or # the following conditions focus on extraction precision @@ -197,3 +201,4 @@ or contains(@class, "signin") or contains(@id, "akismet") or contains(@class, "akismet") or contains(@style, "display:none")]''', ] +
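Illustration (appended after the patch for clarity, not part of it): a minimal sketch of the extended RE_FILTER from trafilatura/filters.py above, showing which share-button captions the newly added QQ, WeChat and WeiBo alternatives catch. The sample strings are hypothetical test inputs; normal sentences are left untouched because the pattern is anchored at the end of the line.

import re

# same pattern as in the patched trafilatura/filters.py
RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|'
                       'Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|'
                       'WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$',
                       flags=re.IGNORECASE)

for line in ('WeChat', 'WeiBo', '- QQ', 'Share on Twitter', 'Main article text continues here.'):
    # isolated share captions match, normal sentences do not
    print(repr(line), bool(RE_FILTER.match(line)))

The new author_xpaths entry works along the same lines for metadata: it additionally accepts class/id values transliterated from Chinese, i.e. "zuozhe" (author), "bianji" (editor) and "xiaobian" (staff editor), which commonly mark bylines on Chinese news sites.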