Extract links to segment and restore them while translating

wagtail · Feb 19, 2021 · 5432ad3 · 5432ad3
1 parent 110473b
commit 5432ad3
Show file tree

Hide file tree

Showing 7 changed files with 104 additions and 26 deletions.
diff --git a/wagtail_localize/segments/extract.py b/wagtail_localize/segments/extract.py
@@ -18,6 +18,13 @@
 from ..strings import extract_strings
 
 
+def quote_path_component(text):
+    """
+    Wraps a path component in single quotes, backslash-escaping any backslashes and single quotes in it.
+    """
+    return "'" + text.replace("\\", "\\\\") .replace("'", "\\'") + "'"
+
+
 class StreamFieldSegmentExtractor:
     def __init__(self, field, include_overridables=False):
         self.field = field
@@ -48,11 +55,17 @@ def handle_block(self, block_type, block_value):
             return [StringSegmentValue("", block_value)]
 
         elif isinstance(block_type, blocks.RichTextBlock):
-            template, strings = extract_strings(block_value.source)
-
-            return [TemplateSegmentValue("", "html", template, len(strings))] + [
-                StringSegmentValue("", string, attrs=attrs) for string, attrs in strings
+            template, strings, hrefs = extract_strings(block_value.source)
+            ret = [
+                TemplateSegmentValue("", "html", template, len(strings))
+            ] + [
+                StringSegmentValue("", string, attrs=attrs)
+                for string, attrs in strings
+            ] + [
+                OverridableSegmentValue(quote_path_component(href), href)
+                for href in hrefs
             ]
+            return ret
 
         elif isinstance(block_type, blocks.ChooserBlock):
             return self.handle_related_object_block(block_value)
@@ -139,10 +152,16 @@ def extract_segments(instance):
 
         elif isinstance(field, RichTextField):
             if is_translatable:
-                template, strings = extract_strings(field.value_from_object(instance))
-
-                field_segments = [TemplateSegmentValue("", "html", template, len(strings))] + [
-                    StringSegmentValue("", string, attrs=attrs) for string, attrs in strings
+                template, strings, hrefs = extract_strings(field.value_from_object(instance))
+
+                field_segments = [
+                    TemplateSegmentValue("", "html", template, len(strings))
+                ] + [
+                    StringSegmentValue("", string, attrs=attrs)
+                    for string, attrs in strings
+                ] + [
+                    OverridableSegmentValue(quote_path_component(href), href)
+                    for href in hrefs
                 ]
 
                 segments.extend(segment.wrap(field.name) for segment in field_segments)

diff --git a/wagtail_localize/segments/ingest.py b/wagtail_localize/segments/ingest.py
@@ -9,17 +9,45 @@
 
 from wagtail_localize.strings import restore_strings
 
-from .types import OverridableSegmentValue
+from .types import OverridableSegmentValue, StringSegmentValue
+
+
+def unquote_path_component(text):
+    """
+    Strips the surrounding single quotes from a quoted path component and unescapes escaped quotes and backslashes; raises ValueError if either quote is missing.
+    """
+    if text[0] != "'" or text[-1] != "'":  # NOTE(review): an empty string raises IndexError at text[0], not ValueError
+        raise ValueError("value must be a quoted string")
+
+    return text[1:-1].replace("\\'", "'").replace("\\\\", "\\")
 
 
 def organise_template_segments(segments):
     # The first segment is always the template, followed by the texts in order of their position
+
     segments.sort(key=lambda segment: segment.order)
     template = segments[0]
+    xrefs = {
+        unquote_path_component(segment.path): segment.data
+        for segment in segments
+        if isinstance(segment, OverridableSegmentValue) and segment.data
+    }
+
+    def translate_href(attrs):
+        """Update href in segments with their translated values."""
+        if attrs:
+            for key, val in attrs.items():
+                if val and "href" in val and val["href"] in xrefs:
+                    val["href"] = xrefs[val["href"]]
+        return attrs
+
     return (
         template.format,
         template.template,
-        [(segment.string, segment.attrs) for segment in segments[1:]],
+        [
+            (segment.string, translate_href(segment.attrs))
+            for segment in segments[1:] if isinstance(segment, StringSegmentValue)
+        ],
     )
 
 

diff --git a/wagtail_localize/segments/tests/test_segment_extraction.py b/wagtail_localize/segments/tests/test_segment_extraction.py
@@ -51,6 +51,10 @@ def make_test_page(**kwargs):
             "a1": {"href": "http://example.com"}
         }
     ),
+    OverridableSegmentValue(
+        "'http://example.com'",
+        "http://example.com"
+    )
 ]
 
 

diff --git a/wagtail_localize/segments/tests/test_segment_ingestion.py b/wagtail_localize/segments/tests/test_segment_ingestion.py
@@ -48,9 +48,14 @@ def make_test_page(**kwargs):
         },
         order=12,
     ),
+    OverridableSegmentValue(
+        "'http://example.com'",
+        "http://example.fr",
+        order=13,
+    )
 ]
 
-RICH_TEXT_TEST_OUTPUT = '<h1>Ceci est une rubrique</h1><p>Ceci est un paragraphe. &lt;foo&gt; <b>Texte en gras</b></p><ul><li><a href="http://example.com">Ceci est un lien</a></li></ul>'
+RICH_TEXT_TEST_OUTPUT = '<h1>Ceci est une rubrique</h1><p>Ceci est un paragraphe. &lt;foo&gt; <b>Texte en gras</b></p><ul><li><a href="http://example.fr">Ceci est un lien</a></li></ul>'
 
 
 class TestSegmentIngestion(TestCase):

diff --git a/wagtail_localize/strings.py b/wagtail_localize/strings.py
@@ -244,6 +244,7 @@ def wrap(elements):
         if (
             len(elements) == 1
             and not isinstance(elements[0], NavigableString)
+            and elements[0].name != 'a'  # keep href translatable
             and elements[0].name in INLINE_TAGS
         ):
             wrap(elements[0].children)
@@ -358,7 +359,9 @@ def walk(element):
     walk(soup)
 
     # Now extract strings from the <text> tags
+    hrefs = set()
     strings = []
+    position = 0
     for element in soup.descendants:
         if element.name == "text":
             text = element.attrs.pop("value")
@@ -369,21 +372,31 @@ def walk(element):
             text, prefix = lstrip_keep(text)
             text, suffix = rstrip_keep(text)
 
-            element.attrs["position"] = len(strings)
-            strings.append(StringValue.from_source_html(text))
+            element.attrs["position"] = position
+            position += 1
+            string_val, attrs = StringValue.from_source_html(text)
+            strings.append((string_val, attrs))
+            # Links should be translated
+            if attrs:
+                for key, val in attrs.items():
+                    if 'href' in val:
+                        hrefs.add(val['href'])
 
             if prefix:
                 element.insert_before(prefix)
 
             if suffix:
                 element.insert_after(suffix)
 
-    return str(soup), strings
+        elif element.name == "a":
+            if element.attrs and 'href' in element.attrs and element.attrs['href'] not in hrefs:
+                hrefs.add(element.attrs['href'])
+
+    return str(soup), strings, sorted(hrefs)
 
 
 def restore_strings(template, strings):
     soup = BeautifulSoup(template, "html.parser")
-
     for text_element in soup.findAll("text"):
         string, attrs = strings[int(text_element.get("position"))]
         text_element.replaceWith(string.render_soup(attrs))

diff --git a/wagtail_localize/tests/test_edit_translation.py b/wagtail_localize/tests/test_edit_translation.py
@@ -185,19 +185,27 @@ def test_edit_page_translation(self):
         self.assertEqual(
             [(segment['contentPath'], segment['value']) for segment in props['segments'] if segment['type'] == 'synchronised_value'],
             [
+                ("test_richtextfield.'http://example.com'", 'http://example.com'),
                 ('test_synchronized_emailfield', '[email protected]'),
             ]
         )
 
         # Test locations
         self.assertEqual(props['segments'][0]['location'], {'tab': 'content', 'field': 'Char field', 'blockId': None, 'fieldHelpText': '', 'order': 1, 'subField': None, 'widget': None})
         self.assertEqual(props['segments'][7]['location'], {'tab': 'content', 'field': 'Test richtextfield', 'blockId': None, 'fieldHelpText': '', 'order': 6, 'subField': None, 'widget': None})
-        self.assertEqual(props['segments'][9]['location'], {'tab': 'content', 'field': 'Text block', 'blockId': str(STREAM_TEXT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': None, 'widget': None})
-        self.assertEqual(props['segments'][10]['location'], {'tab': 'content', 'field': 'Test structblock', 'blockId': str(STREAM_STRUCT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': 'Field a', 'widget': None})
+        self.assertEqual(props['segments'][10]['location'], {'tab': 'content', 'field': 'Text block', 'blockId': str(STREAM_TEXT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': None, 'widget': None})
+        self.assertEqual(props['segments'][11]['location'], {'tab': 'content', 'field': 'Test structblock', 'blockId': str(STREAM_STRUCT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': 'Field a', 'widget': None})
         # TODO: Example that uses fieldHelpText
 
+        # Check synchronised value
+        synchronised_value_segment = props['segments'][9]
+        self.assertEqual(synchronised_value_segment['type'], 'synchronised_value')
+        self.assertEqual(synchronised_value_segment['contentPath'], "test_richtextfield.'http://example.com'")
+        self.assertEqual(synchronised_value_segment['location'], {'blockId': None, 'field': 'Test richtextfield', 'fieldHelpText': '', 'order': 6, 'subField': None, 'tab': 'content', 'widget': {'type': 'text'}})
+        self.assertEqual(synchronised_value_segment['value'], 'http://example.com')
+
         # Check related object
-        related_object_segment = props['segments'][12]
+        related_object_segment = props['segments'][13]
         self.assertEqual(related_object_segment['type'], 'related_object')
         self.assertEqual(related_object_segment['contentPath'], 'test_snippet')
         self.assertEqual(related_object_segment['location'], {'tab': 'content', 'field': 'Test snippet', 'blockId': None, 'fieldHelpText': '', 'order': 8, 'subField': None, 'widget': None})
@@ -305,7 +313,7 @@ def test_manually_translated_related_object(self):
         props = json.loads(response.context['props'])
 
         # Check related object
-        related_object_segment = props['segments'][12]
+        related_object_segment = props['segments'][13]
         self.assertEqual(related_object_segment['type'], 'related_object')
         self.assertEqual(related_object_segment['contentPath'], 'test_snippet')
         self.assertEqual(related_object_segment['location'], {'tab': 'content', 'field': 'Test snippet', 'blockId': None, 'fieldHelpText': '', 'order': 8, 'subField': None, 'widget': None})
@@ -352,6 +360,7 @@ def test_override_types(self):
         self.assertEqual(
             [(segment['contentPath'], segment['location']['widget'], segment['value']) for segment in props['segments'] if segment['type'] == 'synchronised_value'],
             [
+                ("test_richtextfield.'http://example.com'", {'type': 'text'}, 'http://example.com'),
                 (f'test_streamfield.{url_block_id}', {'type': 'text'}, "https://wagtail.io/"),
                 (f'test_streamfield.{page_block_id}', {'type': 'page_chooser', 'allowed_page_types': ['wagtailcore.page']}, self.page.id),
                 (f'test_streamfield.{image_block_id}', {'type': 'image_chooser'}, self.page.test_synchronized_image.id),

diff --git a/wagtail_localize/tests/test_strings.py b/wagtail_localize/tests/test_strings.py
@@ -157,7 +157,7 @@ def test_br_tags_converted_to_newlines(self):
 
 class TextExtractStrings(TestCase):
     def test_extract_strings(self):
-        template, strings = extract_strings(
+        template, strings, hrefs = extract_strings(
             """
             <p><b>Bread</b>\xa0is a\xa0<a href="https://en.wikipedia.org/wiki/Staple_food">staple food</a>\xa0prepared from a\xa0<a href="https://en.wikipedia.org/wiki/Dough">dough</a>\xa0of\xa0<a href="https://en.wikipedia.org/wiki/Flour">flour</a>\xa0and\xa0<a href="https://en.wikipedia.org/wiki/Water">water</a>, usually by\xa0<a href="https://en.wikipedia.org/wiki/Baking">baking</a>. Throughout recorded history it has been popular around the world and is one of the oldest artificial foods, having been of importance since the dawn of\xa0<a href="https://en.wikipedia.org/wiki/Agriculture#History">agriculture</a>.</p>
             <p>Proportions of types of flour and other ingredients vary widely, as do modes of preparation. As a result, types, shapes, sizes, and textures of breads differ around the world. Bread may be\xa0<a href="https://en.wikipedia.org/wiki/Leaven">leavened</a>\xa0by processes such as reliance on naturally occurring\xa0<a href="https://en.wikipedia.org/wiki/Sourdough">sourdough</a>\xa0microbes, chemicals, industrially produced yeast, or high-pressure aeration. Some bread is cooked before it can leaven, including for traditional or religious reasons. Non-cereal ingredients such as fruits, nuts and fats may be included. Commercial bread commonly contains additives to improve flavor, texture, color, shelf life, and ease of manufacturing.</p>
@@ -181,7 +181,7 @@ def test_extract_strings(self):
         )
 
     def test_extract_strings_2(self):
-        template, strings = extract_strings(
+        template, strings, hrefs = extract_strings(
             """
             <h1>Foo bar baz</h1>
             <p>This is a paragraph. <b>This is some bold <i>and now italic</i></b> text</p>
@@ -222,7 +222,7 @@ def test_extract_strings_2(self):
     def test_block_tag_in_inline_tag(self):
         # If an inline tag contains a block tag. The inline tag must be in the template.
         # Testing for issue https://github.com/mozilla/donate-wagtail/issues/586
-        template, strings = extract_strings("<p><i>Foo <p>Bar</p></i></p>")
+        template, strings, hrefs = extract_strings("<p><i>Foo <p>Bar</p></i></p>")
 
         self.assertHTMLEqual(
             template,
@@ -235,7 +235,7 @@ def test_block_tag_in_inline_tag(self):
         ])
 
     def test_br_tag_is_treated_as_inline_tag(self):
-        template, strings = extract_strings(
+        template, strings, hrefs = extract_strings(
             "<p><b>Foo <i>Bar<br/>Baz</i></b></p>"
         )
 
@@ -246,21 +246,21 @@ def test_br_tag_is_treated_as_inline_tag(self):
         ])
 
     def test_br_tag_is_removed_when_it_appears_at_beginning_of_segment(self):
-        template, strings = extract_strings("<p><i><br/>Foo</i></p>")
+        template, strings, hrefs = extract_strings("<p><i><br/>Foo</i></p>")
 
         self.assertHTMLEqual(template, '<p><i><br/><text position="0"></text></i></p>')
 
         self.assertEqual(strings, [StringValue.from_source_html("Foo")])
 
     def test_br_tag_is_removed_when_it_appears_at_end_of_segment(self):
-        template, strings = extract_strings("<p><i>Foo</i><br/></p>")
+        template, strings, hrefs = extract_strings("<p><i>Foo</i><br/></p>")
 
         self.assertHTMLEqual(template, '<p><i><text position="0"></text></i><br/></p>')
 
         self.assertEqual(strings, [StringValue.from_source_html("Foo")])
 
     def test_empty_inline_tag(self):
-        template, strings = extract_strings("<p><i></i>Foo</p>")
+        template, strings, hrefs = extract_strings("<p><i></i>Foo</p>")
 
         self.assertHTMLEqual(template, '<p><i></i><text position="0"></text></p>')