Skip to content

Commit

Permalink
Fix CHARLEN and CHARSUB on invalid UTF-8 (gbdev#1630)
Browse files Browse the repository at this point in the history
  • Loading branch information
Rangi42 authored Jan 28, 2025
1 parent d54619a commit 44caffe
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 16 deletions.
16 changes: 7 additions & 9 deletions src/asm/charmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,24 +227,22 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
} else if (inputIdx < input.length()) { // No match found, but there is some input left
size_t codepointLen = 0;
// This will write the codepoint's value to `output`, little-endian
for (uint32_t state = 0, codepoint = 0;;) {
for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) {
if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) {
codepointLen = 0;
error("Input string is not valid UTF-8\n");
codepointLen = 1;
break;
}

if (output) {
output->push_back(input[inputIdx + codepointLen]);
}
codepointLen++;

if (state == 0) {
break;
}
}

if (codepointLen == 0) {
error("Input string is not valid UTF-8\n");
if (output) {
output->insert(
output->end(), input.data() + inputIdx, input.data() + inputIdx + codepointLen
);
}

// Warn if this character is not mapped but any others are
Expand Down
14 changes: 9 additions & 5 deletions test/asm/invalid-utf-8-strings.asm
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,13 @@ println "\"{mid2}{mid1}\""
; 4: invalid byte 0x81
; 5: invalid byte 0xFF
; 6: U+0020 space
; 7: U+0042 B
REDEF invalid EQUS "A þÿ B"
; 7: U+6F22 kanji (0xE6 0xBC 0xA2)
REDEF invalid EQUS "A þÿ 漢"

DEF n = strlen("{invalid}")
DEF r = charlen("{invalid}")
println "\"{#s:invalid}\": {d:n} != {d:r}"
DEF n = STRLEN("{invalid}")
DEF r = CHARLEN("{invalid}")
println "\"{#s:invalid}\": {d:n} == {d:r}"

REDEF mid1 EQUS CHARSUB("{invalid}", 4)
REDEF mid2 EQUS CHARSUB("{invalid}", 7)
println "\"{mid2}{mid1}\""
28 changes: 27 additions & 1 deletion test/asm/invalid-utf-8-strings.err
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,30 @@ error: invalid-utf-8-strings.asm(35):
STRLEN: Invalid UTF-8 byte 0xFF
error: invalid-utf-8-strings.asm(36):
Input string is not valid UTF-8
error: Assembly aborted (26 errors)!
error: invalid-utf-8-strings.asm(36):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(36):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: Assembly aborted (39 errors)!
3 changes: 2 additions & 1 deletion test/asm/invalid-utf-8-strings.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"aäb漢,a��b��!" == "aäb漢,a��b��!" (12)
"b,a"
"A ��� B": 7 != 2
"A ��� 漢": 7 == 7
"漢�"

0 comments on commit 44caffe

Please sign in to comment.