Skip to content

Commit

Permalink
Extract links to segment and restore them while translating
Browse files Browse the repository at this point in the history
  • Loading branch information
mardiros authored and kaedroho committed Feb 19, 2021
1 parent 110473b commit 5432ad3
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 26 deletions.
35 changes: 27 additions & 8 deletions wagtail_localize/segments/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
from ..strings import extract_strings


def quote_path_component(text):
    """
    Wrap a path component in single quotes, escaping special characters.

    Backslashes are doubled and single quotes are prefixed with a backslash
    before the surrounding quotes are added.
    """
    # Backslashes are escaped first so that the backslashes introduced when
    # escaping quotes are not doubled as well
    escaped = text.replace("\\", "\\\\")
    escaped = escaped.replace("'", "\\'")
    return "'" + escaped + "'"


class StreamFieldSegmentExtractor:
def __init__(self, field, include_overridables=False):
self.field = field
Expand Down Expand Up @@ -48,11 +55,17 @@ def handle_block(self, block_type, block_value):
return [StringSegmentValue("", block_value)]

elif isinstance(block_type, blocks.RichTextBlock):
template, strings = extract_strings(block_value.source)

return [TemplateSegmentValue("", "html", template, len(strings))] + [
StringSegmentValue("", string, attrs=attrs) for string, attrs in strings
template, strings, hrefs = extract_strings(block_value.source)
ret = [
TemplateSegmentValue("", "html", template, len(strings))
] + [
StringSegmentValue("", string, attrs=attrs)
for string, attrs in strings
] + [
OverridableSegmentValue(quote_path_component(href), href)
for href in hrefs
]
return ret

elif isinstance(block_type, blocks.ChooserBlock):
return self.handle_related_object_block(block_value)
Expand Down Expand Up @@ -139,10 +152,16 @@ def extract_segments(instance):

elif isinstance(field, RichTextField):
if is_translatable:
template, strings = extract_strings(field.value_from_object(instance))

field_segments = [TemplateSegmentValue("", "html", template, len(strings))] + [
StringSegmentValue("", string, attrs=attrs) for string, attrs in strings
template, strings, hrefs = extract_strings(field.value_from_object(instance))

field_segments = [
TemplateSegmentValue("", "html", template, len(strings))
] + [
StringSegmentValue("", string, attrs=attrs)
for string, attrs in strings
] + [
OverridableSegmentValue(quote_path_component(href), href)
for href in hrefs
]

segments.extend(segment.wrap(field.name) for segment in field_segments)
Expand Down
32 changes: 30 additions & 2 deletions wagtail_localize/segments/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,45 @@

from wagtail_localize.strings import restore_strings

from .types import OverridableSegmentValue
from .types import OverridableSegmentValue, StringSegmentValue


def unquote_path_component(text):
    """
    Removes quotes around a quoted path component, and unescapes any special characters.

    Raises ValueError if ``text`` does not start and end with a single quote.
    The opening and closing quote must be distinct characters, so the empty
    string and a lone ``'`` are rejected as well.
    """
    # Checking the length first keeps "" from raising IndexError and stops a
    # single "'" from being counted as both the opening and the closing quote
    if len(text) < 2 or not (text.startswith("'") and text.endswith("'")):
        raise ValueError("value must be a quoted string")

    # Escaped quotes are restored first, then doubled backslashes are collapsed
    return text[1:-1].replace("\\'", "'").replace("\\\\", "\\")


def organise_template_segments(segments):
    """
    Split the segments of one template into the parts needed to rebuild it.

    ``segments`` is sorted in place by ``order``; the first segment after
    sorting is used as the template. Overridable segments with a truthy
    ``data`` provide replacement link targets, keyed by their unquoted path.

    Returns a ``(format, template, strings)`` tuple, where ``strings`` is a
    list of ``(string, attrs)`` pairs built from the string segments that
    follow the template. Any ``href`` in ``attrs`` that has a replacement is
    updated in place on the segment's own ``attrs``.
    """
    # The first segment is always the template, followed by the texts in order of their position
    segments.sort(key=lambda segment: segment.order)
    template = segments[0]

    # Original href -> overridden href; overridable segments without data are skipped
    xrefs = {
        unquote_path_component(segment.path): segment.data
        for segment in segments
        if isinstance(segment, OverridableSegmentValue) and segment.data
    }

    def translate_href(attrs):
        """Update href in segments with their translated values."""
        if attrs:
            for val in attrs.values():
                if val and "href" in val and val["href"] in xrefs:
                    val["href"] = xrefs[val["href"]]
        return attrs

    return (
        template.format,
        template.template,
        # Only string segments carry text; overridable segments were consumed
        # into xrefs above and have no string to restore
        [
            (segment.string, translate_href(segment.attrs))
            for segment in segments[1:]
            if isinstance(segment, StringSegmentValue)
        ],
    )


Expand Down
4 changes: 4 additions & 0 deletions wagtail_localize/segments/tests/test_segment_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ def make_test_page(**kwargs):
"a1": {"href": "http://example.com"}
}
),
OverridableSegmentValue(
"'http://example.com'",
"http://example.com"
)
]


Expand Down
7 changes: 6 additions & 1 deletion wagtail_localize/segments/tests/test_segment_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,14 @@ def make_test_page(**kwargs):
},
order=12,
),
OverridableSegmentValue(
"'http://example.com'",
"http://example.fr",
order=13,
)
]

RICH_TEXT_TEST_OUTPUT = '<h1>Ceci est une rubrique</h1><p>Ceci est un paragraphe. &lt;foo&gt; <b>Texte en gras</b></p><ul><li><a href="http://example.com">Ceci est un lien</a></li></ul>'
RICH_TEXT_TEST_OUTPUT = '<h1>Ceci est une rubrique</h1><p>Ceci est un paragraphe. &lt;foo&gt; <b>Texte en gras</b></p><ul><li><a href="http://example.fr">Ceci est un lien</a></li></ul>'


class TestSegmentIngestion(TestCase):
Expand Down
21 changes: 17 additions & 4 deletions wagtail_localize/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ def wrap(elements):
if (
len(elements) == 1
and not isinstance(elements[0], NavigableString)
and elements[0].name != 'a' # keep href translatable
and elements[0].name in INLINE_TAGS
):
wrap(elements[0].children)
Expand Down Expand Up @@ -358,7 +359,9 @@ def walk(element):
walk(soup)

# Now extract strings from the <text> tags
hrefs = set()
strings = []
position = 0
for element in soup.descendants:
if element.name == "text":
text = element.attrs.pop("value")
Expand All @@ -369,21 +372,31 @@ def walk(element):
text, prefix = lstrip_keep(text)
text, suffix = rstrip_keep(text)

element.attrs["position"] = len(strings)
strings.append(StringValue.from_source_html(text))
element.attrs["position"] = position
position += 1
string_val, attrs = StringValue.from_source_html(text)
strings.append((string_val, attrs))
# Links should be translated
if attrs:
for key, val in attrs.items():
if 'href' in val:
hrefs.add(val['href'])

if prefix:
element.insert_before(prefix)

if suffix:
element.insert_after(suffix)

return str(soup), strings
elif element.name == "a":
if element.attrs and 'href' in element.attrs and element.attrs['href'] not in hrefs:
hrefs.add(element.attrs['href'])

return str(soup), strings, sorted(hrefs)


def restore_strings(template, strings):
soup = BeautifulSoup(template, "html.parser")

for text_element in soup.findAll("text"):
string, attrs = strings[int(text_element.get("position"))]
text_element.replaceWith(string.render_soup(attrs))
Expand Down
17 changes: 13 additions & 4 deletions wagtail_localize/tests/test_edit_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,19 +185,27 @@ def test_edit_page_translation(self):
self.assertEqual(
[(segment['contentPath'], segment['value']) for segment in props['segments'] if segment['type'] == 'synchronised_value'],
[
("test_richtextfield.'http://example.com'", 'http://example.com'),
('test_synchronized_emailfield', '[email protected]'),
]
)

# Test locations
self.assertEqual(props['segments'][0]['location'], {'tab': 'content', 'field': 'Char field', 'blockId': None, 'fieldHelpText': '', 'order': 1, 'subField': None, 'widget': None})
self.assertEqual(props['segments'][7]['location'], {'tab': 'content', 'field': 'Test richtextfield', 'blockId': None, 'fieldHelpText': '', 'order': 6, 'subField': None, 'widget': None})
self.assertEqual(props['segments'][9]['location'], {'tab': 'content', 'field': 'Text block', 'blockId': str(STREAM_TEXT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': None, 'widget': None})
self.assertEqual(props['segments'][10]['location'], {'tab': 'content', 'field': 'Test structblock', 'blockId': str(STREAM_STRUCT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': 'Field a', 'widget': None})
self.assertEqual(props['segments'][10]['location'], {'tab': 'content', 'field': 'Text block', 'blockId': str(STREAM_TEXT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': None, 'widget': None})
self.assertEqual(props['segments'][11]['location'], {'tab': 'content', 'field': 'Test structblock', 'blockId': str(STREAM_STRUCT_BLOCK_ID), 'fieldHelpText': '', 'order': 7, 'subField': 'Field a', 'widget': None})
# TODO: Example that uses fieldHelpText

# Check synchronised value
synchronised_value_segment = props['segments'][9]
self.assertEqual(synchronised_value_segment['type'], 'synchronised_value')
self.assertEqual(synchronised_value_segment['contentPath'], "test_richtextfield.'http://example.com'")
self.assertEqual(synchronised_value_segment['location'], {'blockId': None, 'field': 'Test richtextfield', 'fieldHelpText': '', 'order': 6, 'subField': None, 'tab': 'content', 'widget': {'type': 'text'}})
self.assertEqual(synchronised_value_segment['value'], 'http://example.com')

# Check related object
related_object_segment = props['segments'][12]
related_object_segment = props['segments'][13]
self.assertEqual(related_object_segment['type'], 'related_object')
self.assertEqual(related_object_segment['contentPath'], 'test_snippet')
self.assertEqual(related_object_segment['location'], {'tab': 'content', 'field': 'Test snippet', 'blockId': None, 'fieldHelpText': '', 'order': 8, 'subField': None, 'widget': None})
Expand Down Expand Up @@ -305,7 +313,7 @@ def test_manually_translated_related_object(self):
props = json.loads(response.context['props'])

# Check related object
related_object_segment = props['segments'][12]
related_object_segment = props['segments'][13]
self.assertEqual(related_object_segment['type'], 'related_object')
self.assertEqual(related_object_segment['contentPath'], 'test_snippet')
self.assertEqual(related_object_segment['location'], {'tab': 'content', 'field': 'Test snippet', 'blockId': None, 'fieldHelpText': '', 'order': 8, 'subField': None, 'widget': None})
Expand Down Expand Up @@ -352,6 +360,7 @@ def test_override_types(self):
self.assertEqual(
[(segment['contentPath'], segment['location']['widget'], segment['value']) for segment in props['segments'] if segment['type'] == 'synchronised_value'],
[
("test_richtextfield.'http://example.com'", {'type': 'text'}, 'http://example.com'),
(f'test_streamfield.{url_block_id}', {'type': 'text'}, "https://wagtail.io/"),
(f'test_streamfield.{page_block_id}', {'type': 'page_chooser', 'allowed_page_types': ['wagtailcore.page']}, self.page.id),
(f'test_streamfield.{image_block_id}', {'type': 'image_chooser'}, self.page.test_synchronized_image.id),
Expand Down
14 changes: 7 additions & 7 deletions wagtail_localize/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def test_br_tags_converted_to_newlines(self):

class TextExtractStrings(TestCase):
def test_extract_strings(self):
template, strings = extract_strings(
template, strings, hrefs = extract_strings(
"""
<p><b>Bread</b>\xa0is a\xa0<a href="https://en.wikipedia.org/wiki/Staple_food">staple food</a>\xa0prepared from a\xa0<a href="https://en.wikipedia.org/wiki/Dough">dough</a>\xa0of\xa0<a href="https://en.wikipedia.org/wiki/Flour">flour</a>\xa0and\xa0<a href="https://en.wikipedia.org/wiki/Water">water</a>, usually by\xa0<a href="https://en.wikipedia.org/wiki/Baking">baking</a>. Throughout recorded history it has been popular around the world and is one of the oldest artificial foods, having been of importance since the dawn of\xa0<a href="https://en.wikipedia.org/wiki/Agriculture#History">agriculture</a>.</p>
<p>Proportions of types of flour and other ingredients vary widely, as do modes of preparation. As a result, types, shapes, sizes, and textures of breads differ around the world. Bread may be\xa0<a href="https://en.wikipedia.org/wiki/Leaven">leavened</a>\xa0by processes such as reliance on naturally occurring\xa0<a href="https://en.wikipedia.org/wiki/Sourdough">sourdough</a>\xa0microbes, chemicals, industrially produced yeast, or high-pressure aeration. Some bread is cooked before it can leaven, including for traditional or religious reasons. Non-cereal ingredients such as fruits, nuts and fats may be included. Commercial bread commonly contains additives to improve flavor, texture, color, shelf life, and ease of manufacturing.</p>
Expand All @@ -181,7 +181,7 @@ def test_extract_strings(self):
)

def test_extract_strings_2(self):
template, strings = extract_strings(
template, strings, hrefs = extract_strings(
"""
<h1>Foo bar baz</h1>
<p>This is a paragraph. <b>This is some bold <i>and now italic</i></b> text</p>
Expand Down Expand Up @@ -222,7 +222,7 @@ def test_extract_strings_2(self):
def test_block_tag_in_inline_tag(self):
# If an inline tag contains a block tag. The inline tag must be in the template.
# Testing for issue https://github.com/mozilla/donate-wagtail/issues/586
template, strings = extract_strings("<p><i>Foo <p>Bar</p></i></p>")
template, strings, hrefs = extract_strings("<p><i>Foo <p>Bar</p></i></p>")

self.assertHTMLEqual(
template,
Expand All @@ -235,7 +235,7 @@ def test_block_tag_in_inline_tag(self):
])

def test_br_tag_is_treated_as_inline_tag(self):
template, strings = extract_strings(
template, strings, hrefs = extract_strings(
"<p><b>Foo <i>Bar<br/>Baz</i></b></p>"
)

Expand All @@ -246,21 +246,21 @@ def test_br_tag_is_treated_as_inline_tag(self):
])

def test_br_tag_is_removed_when_it_appears_at_beginning_of_segment(self):
template, strings = extract_strings("<p><i><br/>Foo</i></p>")
template, strings, hrefs = extract_strings("<p><i><br/>Foo</i></p>")

self.assertHTMLEqual(template, '<p><i><br/><text position="0"></text></i></p>')

self.assertEqual(strings, [StringValue.from_source_html("Foo")])

def test_br_tag_is_removed_when_it_appears_at_end_of_segment(self):
template, strings = extract_strings("<p><i>Foo</i><br/></p>")
template, strings, hrefs = extract_strings("<p><i>Foo</i><br/></p>")

self.assertHTMLEqual(template, '<p><i><text position="0"></text></i><br/></p>')

self.assertEqual(strings, [StringValue.from_source_html("Foo")])

def test_empty_inline_tag(self):
template, strings = extract_strings("<p><i></i>Foo</p>")
template, strings, hrefs = extract_strings("<p><i></i>Foo</p>")

self.assertHTMLEqual(template, '<p><i></i><text position="0"></text></p>')

Expand Down

0 comments on commit 5432ad3

Please sign in to comment.