From ca631bdcca63f76323e6452bf672e72343ca238b Mon Sep 17 00:00:00 2001 From: Rangi42 Date: Tue, 28 Jan 2025 00:23:36 -0500 Subject: [PATCH] Fix `CHARLEN` and `CHARSUB` on invalid UTF-8 --- src/asm/charmap.cpp | 13 ++++--------- test/asm/invalid-utf-8-strings.asm | 10 +++++++--- test/asm/invalid-utf-8-strings.err | 28 +++++++++++++++++++++++++++- test/asm/invalid-utf-8-strings.out | 3 ++- 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/src/asm/charmap.cpp b/src/asm/charmap.cpp index b7eadbca7b..89dab4da04 100644 --- a/src/asm/charmap.cpp +++ b/src/asm/charmap.cpp @@ -226,25 +226,20 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output matchLen = value.size(); } else if (inputIdx < input.length()) { // No match found, but there is some input left size_t codepointLen = 0; - // This will write the codepoint's value to `output`, little-endian for (uint32_t state = 0, codepoint = 0;;) { if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) { - codepointLen = 0; + error("Input string is not valid UTF-8\n"); + codepointLen = 1; break; } - - if (output) { - output->push_back(input[inputIdx + codepointLen]); - } codepointLen++; - if (state == 0) { break; } } - if (codepointLen == 0) { - error("Input string is not valid UTF-8\n"); + if (output) { + output->insert(output->end(), &input[inputIdx], &input[inputIdx + codepointLen]); } // Warn if this character is not mapped but any others are diff --git a/test/asm/invalid-utf-8-strings.asm b/test/asm/invalid-utf-8-strings.asm index f105e6e350..c4ff6e205c 100644 --- a/test/asm/invalid-utf-8-strings.asm +++ b/test/asm/invalid-utf-8-strings.asm @@ -29,9 +29,13 @@ println "\"{mid2}{mid1}\"" ; 4: invalid byte 0x81 ; 5: invalid byte 0xFF ; 6: U+0020 space -; 7: U+0042 B -REDEF invalid EQUS "A B" +; 7: U+6F22 kanji (0xE6 0xBC 0xA2) +REDEF invalid EQUS "A 漢" DEF n = strlen("{invalid}") DEF r = charlen("{invalid}") -println "\"{#s:invalid}\": {d:n} != {d:r}" +println "\"{#s:invalid}\": 
{d:n} == {d:r}" + +REDEF mid1 EQUS CHARSUB("{invalid}", 4) +REDEF mid2 EQUS CHARSUB("{invalid}", 7) +println "\"{mid2}{mid1}\"" diff --git a/test/asm/invalid-utf-8-strings.err b/test/asm/invalid-utf-8-strings.err index dfcb1a29c0..529c7a9eed 100644 --- a/test/asm/invalid-utf-8-strings.err +++ b/test/asm/invalid-utf-8-strings.err @@ -50,4 +50,30 @@ error: invalid-utf-8-strings.asm(35): STRLEN: Invalid UTF-8 byte 0xFF error: invalid-utf-8-strings.asm(36): Input string is not valid UTF-8 -error: Assembly aborted (26 errors)! +error: invalid-utf-8-strings.asm(36): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(36): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: Assembly aborted (39 errors)! diff --git a/test/asm/invalid-utf-8-strings.out b/test/asm/invalid-utf-8-strings.out index e24c370dda..2d9147a9a2 100644 --- a/test/asm/invalid-utf-8-strings.out +++ b/test/asm/invalid-utf-8-strings.out @@ -1,3 +1,4 @@ "aäb漢,ab!" == "aäb漢,ab!" (12) "b,a" -"A B": 7 != 2 +"A 漢": 7 == 7 +"漢"