From 473656e72e0d440efde556057bdc192bdb228dcf Mon Sep 17 00:00:00 2001 From: Alessio Tognan Date: Mon, 4 Mar 2024 10:54:44 +0100 Subject: [PATCH] feat: use richtext features for string extraction --- wagtail_localize/strings.py | 123 +++++++++++++++++++++++++++++++----- 1 file changed, 108 insertions(+), 15 deletions(-) diff --git a/wagtail_localize/strings.py b/wagtail_localize/strings.py index 7071cb9b4..ea9464593 100644 --- a/wagtail_localize/strings.py +++ b/wagtail_localize/strings.py @@ -3,10 +3,88 @@ from bs4 import BeautifulSoup, NavigableString, Tag from django.utils.html import escape from django.utils.translation import gettext as _ +from wagtail.rich_text import features as feature_registry # List of tags that are allowed in segments -INLINE_TAGS = ["a", "abbr", "acronym", "b", "code", "em", "i", "strong", "br"] +INLINE_TAGS = None + + +def set_inline_tags(): + global INLINE_TAGS + inline_tags = ["a", "abbr", "acronym", "b", "code", "em", "i", "strong", "br"] + + if not feature_registry.has_scanned_for_features: + feature_registry._scan_for_features() + + for editor in feature_registry.plugins_by_editor.keys(): + # This method only supports draftail + if editor != "draftail": + continue + + for feature in feature_registry.plugins_by_editor[editor].keys(): + plugin = feature_registry.get_editor_plugin(editor, feature) + if plugin and plugin.option_name == "inlineStyles": + db_converter = feature_registry.get_converter_rule( + "contentstate", feature + ) + name = plugin.data["type"] + style = dict_to_css_selector( + db_converter["to_database_format"]["style_map"][name] + ) + if style not in inline_tags: + inline_tags.append(style) + + INLINE_TAGS = inline_tags + + +def dict_to_css_selector(d): + """ + Converts a dictionary to a CSS selector string. 
+ """ + + if not isinstance(d, dict): + return d + + if "element" not in d: + raise ValueError("Element key is required") + + selector = d["element"] + if "props" in d: + tag_id = d["props"].pop("id", None) + if tag_id: + selector += f"#{tag_id}" + klass = d["props"].pop("class", None) + if klass: + selector += f".{klass}" + for key, value in d["props"].items(): + selector += f"[{key}='{value}']" + + return selector + + +def bs4_to_css_selector(tag): + """ + Converts a BeautifulSoup tag to a CSS selector string. + """ + + if not isinstance(tag, Tag): + return None + + selector = tag.name + attrs = tag.attrs.copy() + + if "id" in attrs: + selector += f"#{attrs['id']}" + del attrs["id"] + if "class" in attrs: + selector += f".{'.'.join(attrs['class'])}" + del attrs["class"] + + for key, value in attrs.items(): + selector += f"[{key}='{value}']" + + return selector def lstrip_keep(text): @@ -39,26 +117,33 @@ def validate_element(element): if isinstance(element, NavigableString): return + if INLINE_TAGS is None: + set_inline_tags() + # Validate tag and attributes if isinstance(element, Tag) and element.name != "[document]": # Block tags are not allowed in strings - if element.name not in INLINE_TAGS: + if not ( + element.name in INLINE_TAGS or bs4_to_css_selector(element) in INLINE_TAGS + ): raise ValueError( _( "<{}> tag is not allowed. 
Strings can only contain standard HTML inline tags (such as <a>, <b>)" - ).format(element.name) + ).format(bs4_to_css_selector(element)) ) + # This check is not necessary because we are allowing attributes now + # Elements can't have attributes, except for <a> tags - keys = set(element.attrs.keys()) - if element.name == "a" and "id" in keys: - keys.remove("id") - if keys: - raise ValueError( - _( - "Strings cannot have any HTML tags with attributes (except for 'id' in <a> tags)" - ) - ) + # keys = set(element.attrs.keys()) + # if element.name == "a" and "id" in keys: + # keys.remove("id") + # if keys: + # raise ValueError( + # _( + # "Strings cannot have any HTML tags with attributes (except for 'id' in <a> tags)" + # ) + # ) # Traverse children for child_element in element.children: @@ -129,7 +214,7 @@ def walk(soup): else: # Extract HTML attributes replacing them with an ID - if element.attrs: + if element.name == "a" and element.attrs: counter[element.name] += 1 element_id = element.name + str(counter[element.name]) attrs[element_id] = element.attrs @@ -300,6 +385,9 @@ def extract_strings(html): if html is None: html = "" + if INLINE_TAGS is None: + set_inline_tags() + soup = BeautifulSoup(html, "html.parser") def wrap(elements): @@ -321,7 +409,10 @@ def wrap(elements): len(elements) == 1 and not isinstance(elements[0], NavigableString) and elements[0].name != "a" # keep href translatable - and elements[0].name in INLINE_TAGS + and ( + elements[0].name in INLINE_TAGS + or bs4_to_css_selector(elements[0]) in INLINE_TAGS + ) ): wrap(elements[0].children) return @@ -423,7 +514,9 @@ def walk(element): buffer = [] has_wrap = True - if element.name not in INLINE_TAGS: + if not ( + element.name in INLINE_TAGS or bs4_to_css_selector(element) in INLINE_TAGS + ): if buffer: wrap(buffer) has_wrap = True