Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compiler segfault in InternPool.indexToKey during sema likely due to invalid index #22824

Open
Jarred-Sumner opened this issue Feb 9, 2025 · 4 comments
Labels
bug Observed behavior contradicts documented or intended behavior

Comments

@Jarred-Sumner
Copy link
Contributor

Zig Version

0.14.0-dev.2987+183bb8b08

Steps to Reproduce and Observed Behavior

I'm seeing the compiler crash during semantic analysis when building Bun, most likely when the code contains a compile error or two.

thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0xfffffffffffffffb)
* frame #0: 0x00000001057b217c zig`InternPool.indexToKey + 104
->  ldur   w9, [x11, #-0x4]    // x11 = 0xffffffffffffffff

It's probably an invalid index being passed.

pub fn indexToKey(ip: *const InternPool, index: Index) Key {
    assert(index != .none);
    const unwrapped_index = index.unwrap(ip);
    const item = unwrapped_index.getItem(ip);  // <-- Crash likely around here
    const data = item.data;
This was the diff in Bun that led to the crash:
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index c9dd2d56a..91295d955 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -797,29 +797,34 @@ fn NewLexer_(
             }
         }
 
-        inline fn nextCodepointSlice(it: *LexerType) []const u8 {
-            if (it.current >= it.source.contents.len) {
+        fn nextCodepointSlice(it: *const LexerType) []const u8 {
+            const contents = it.source.contents;
+            const current = it.current;
+            if (current >= contents.len) {
                 return "";
             }
-            const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(it.source.contents.ptr[it.current]);
-            return if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else "";
+            const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(contents[current]);
+            return if (!(cp_len + current > contents.len)) contents[current .. cp_len + current] else "";
         }
 
-        inline fn nextCodepoint(it: *LexerType) CodePoint {
-            if (it.current >= it.source.contents.len) {
-                it.end = it.source.contents.len;
+        fn nextCodepoint(it: *LexerType) CodePoint {
+            const contents = it.source.contents;
+            const current = it.current;
+
+            if (current >= contents.len) {
+                it.end = contents.len;
                 return -1;
             }
-            const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(it.source.contents.ptr[it.current]);
-            const slice = if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else "";
+            const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(contents[current]);
+            const slice = if (!(cp_len + current > contents.len)) contents[current .. cp_len + current] else "";
 
             const code_point = switch (slice.len) {
                 0 => -1,
                 1 => @as(CodePoint, slice[0]),
-                else => strings.decodeWTF8RuneTMultibyte(slice.ptr[0..4], @as(u3, @intCast(slice.len)), CodePoint, strings.unicode_replacement),
+                else => strings.decodeWTF8RuneTMultibyte(slice.ptr[0..4], slice.len, CodePoint, strings.unicode_replacement),
             };
 
-            it.end = it.current;
+            it.end = current;
 
             it.current += if (code_point != strings.unicode_replacement)
                 cp_len
diff --git a/src/js_printer.zig b/src/js_printer.zig
index a9fcaa864..fdee9258a 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -180,7 +180,7 @@ pub fn estimateLengthForUTF8(input: []const u8, comptime ascii_only: bool, compt
                 4 => remaining[0..4].*,
                 else => unreachable,
             },
-            char_len,
+            @intCast(char_len),
             i32,
             0,
         );
@@ -211,12 +211,12 @@ pub fn writePreQuotedString(text_in: []const u8, comptime Writer: type, writer:
     var i: usize = 0;
     const n: usize = text.len;
     while (i < n) {
-        const width = switch (comptime encoding) {
+        const width: usize = switch (comptime encoding) {
             .latin1, .ascii => 1,
             .utf8 => strings.wtf8ByteSequenceLengthWithInvalid(text[i]),
             .utf16 => 1,
         };
-        const clamped_width = @min(@as(usize, width), n -| i);
+        const clamped_width = @min(width, n - i);
         const c = switch (encoding) {
             .utf8 => strings.decodeWTF8RuneT(
                 &switch (clamped_width) {
diff --git a/src/sourcemap/sourcemap.zig b/src/sourcemap/sourcemap.zig
index bca26ab6a..9ef9a687d 100644
--- a/src/sourcemap/sourcemap.zig
+++ b/src/sourcemap/sourcemap.zig
@@ -1397,9 +1397,8 @@ pub const LineOffsetTable = struct {
 
         var remaining = contents;
         while (remaining.len > 0) {
-            const len_ = strings.wtf8ByteSequenceLengthWithInvalid(remaining[0]);
-            const c = strings.decodeWTF8RuneT(remaining.ptr[0..4], len_, i32, 0);
-            const cp_len = @as(usize, len_);
+            const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(remaining[0]);
+            const c = strings.decodeWTF8RuneT(remaining.ptr[0..4], cp_len, i32, 0);
 
             if (column == 0) {
                 line_byte_offset = @as(
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 365d1acda..76b939ff2 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -3796,7 +3796,7 @@ pub fn encodeUTF8Comptime(comptime cp: u32) []const u8 {
 
 // This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using
 // WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info.
-pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3 {
+pub fn encodeWTF8Rune(noalias p: *[4]u8, r: i32) u3_fast {
     return @call(
         .always_inline,
         encodeWTF8RuneT,
@@ -3808,7 +3808,7 @@ pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3 {
     );
 }
 
-pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 {
+pub fn encodeWTF8RuneT(noalias p: *[4]u8, comptime R: type, r: R) u3_fast {
     switch (r) {
         0...0x7F => {
             p[0] = @as(u8, @intCast(r));
@@ -3864,33 +3864,36 @@ pub fn wtf8Sequence(code_point: u32) [4]u8 {
     };
 }
 
-pub inline fn wtf8ByteSequenceLength(first_byte: u8) u3 {
+const u3_fast = usize;
+
+pub inline fn wtf8ByteSequenceLength(first_byte: u8) u3_fast {
     return switch (first_byte) {
         0 => 0,
         1...0x80 - 1 => 1,
         else => if ((first_byte & 0xE0) == 0xC0)
-            @as(u3, 2)
+            2
         else if ((first_byte & 0xF0) == 0xE0)
-            @as(u3, 3)
+            3
         else if ((first_byte & 0xF8) == 0xF0)
-            @as(u3, 4)
+            4
         else
-            @as(u3, 1),
+            1,
     };
 }
 
 /// 0 == invalid
-pub inline fn wtf8ByteSequenceLengthWithInvalid(first_byte: u8) u3 {
+/// We use a usize so it fits in a register
+pub inline fn wtf8ByteSequenceLengthWithInvalid(first_byte: u8) u3_fast {
     return switch (first_byte) {
         0...0x80 - 1 => 1,
         else => if ((first_byte & 0xE0) == 0xC0)
-            @as(u3, 2)
+            2
         else if ((first_byte & 0xF0) == 0xE0)
-            @as(u3, 3)
+            3
         else if ((first_byte & 0xF8) == 0xF0)
-            @as(u3, 4)
+            4
         else
-            @as(u3, 1),
+            1,
     };
 }
 
@@ -3899,8 +3902,9 @@ pub inline fn wtf8ByteSequenceLengthWithInvalid(first_byte: u8) u3 {
 /// This is a clone of esbuild's decodeWTF8Rune
 /// which was a clone of golang's "utf8.DecodeRune" that was modified to decode using WTF-8 instead.
 /// Asserts a multi-byte codepoint
-pub inline fn decodeWTF8RuneTMultibyte(p: *const [4]u8, len: u3, comptime T: type, comptime zero: T) T {
+pub inline fn decodeWTF8RuneTMultibyte(noalias p: *const [4]u8, len: u3_fast, comptime T: type, comptime zero: T) T {
     if (comptime Environment.allow_assert) assert(len > 1);
+    bun.debugAssert(len < std.math.maxInt(u4));
 
     const s1 = p[1];
     if ((s1 & 0xC0) != 0x80) return zero;
@@ -4902,7 +4906,8 @@ pub fn @"nextUTF16NonASCIIOr$`\\"(
 /// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint.
 /// - Invalid codepoints are replaced with `zero` parameter
 /// - Null bytes return 0
-pub fn decodeWTF8RuneT(p: *const [4]u8, len: u3, comptime T: type, comptime zero: T) T {
+pub fn decodeWTF8RuneT(p: *const [4]u8, len: u3_fast, comptime T: type, comptime zero: T) T {
+    bun.debugAssert(len < std.math.maxInt(u3));
     if (len == 0) return zero;
     if (len == 1) return p[0];
 
diff --git a/src/toml/toml_lexer.zig b/src/toml/toml_lexer.zig
index 6467efd7c..cee345445 100644
--- a/src/toml/toml_lexer.zig
+++ b/src/toml/toml_lexer.zig
@@ -155,19 +155,23 @@ pub const Lexer = struct {
         return it.source.contents[original_i..end_ix];
     }
 
-    inline fn nextCodepointSlice(it: *Lexer) []const u8 {
-        const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(it.source.contents.ptr[it.current]);
-        return if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else "";
+    inline fn nextCodepointSlice(it: *const Lexer) []const u8 {
+        const contents = it.source.contents;
+        const current = it.current;
+        const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(contents[current]);
+        return if (!(cp_len + current > contents.len)) contents[current .. cp_len + current] else "";
     }
 
     inline fn nextCodepoint(it: *Lexer) CodePoint {
-        const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(it.source.contents.ptr[it.current]);
-        const slice = if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else "";
+        const contents = it.source.contents;
+        const current = it.current;
+        const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(contents[current]);
+        const slice = if (!(cp_len + current > contents.len)) contents[current .. cp_len + current] else "";
 
         const code_point = switch (slice.len) {
             0 => -1,
             1 => @as(CodePoint, slice[0]),
-            else => strings.decodeWTF8RuneTMultibyte(slice.ptr[0..4], @as(u3, @intCast(slice.len)), CodePoint, strings.unicode_replacement),
+            else => strings.decodeWTF8RuneTMultibyte(slice.ptr[0..4], slice.len, CodePoint, strings.unicode_replacement),
         };
 
         it.end = it.current;

Expected Behavior

Error(s) showing where my code is wrong

@Jarred-Sumner Jarred-Sumner added the bug Observed behavior contradicts documented or intended behavior label Feb 9, 2025
@mlugg
Copy link
Member

mlugg commented Feb 9, 2025

Incomplete reproduction; I don't know what your diff is against, nor what command to run to reproduce the crash.

Please either say what to apply the diff against, or ideally just provide a branch which reproduces the bug; and say what command to run to trigger the crash.

@Jarred-Sumner
Copy link
Contributor Author

Jarred-Sumner commented Feb 9, 2025

Sorry - here is a commit you can check out: 524f1849038b25b79cee14495f5de3f044998070

To repro:

First run:

# Necessary to run the codegen step before zig build runs
bun run build

Once it starts compiling a bunch of C++, you can press CTRL + C to stop it.

Then run

zig build check

And that should fail pretty quickly with a message like this:

obj
└─ zig build-obj bun-debug Debug aarch64-macos.13.0-none failure
error: the following command terminated unexpectedly:
/Users/jarred/Code/bun/vendor/zig/zig build-obj -freference-trace=24 -fllvm -fno-lld -fno-strip -fno-omit-frame-pointer -fPIC -ODebug -target aarch64-macos.13.0-none -mcpu apple_m1 --dep async_io --dep zlib-internal --dep async --dep ZigGeneratedClasses --dep ResolvedSourceTag --dep ErrorCode --dep build_options --dep translated-c-headers -Mroot=/Users/jarred/Code/bun/root.zig -Masync_io=/Users/jarred/Code/bun/src/io/io_darwin.zig -Mzlib-internal=/Users/jarred/Code/bun/src/deps/zlib.posix.zig -Masync=/Users/jarred/Code/bun/src/async/posix_event_loop.zig -MZigGeneratedClasses=/Users/jarred/Code/bun/build/debug/codegen/ZigGeneratedClasses.zig -MResolvedSourceTag=/Users/jarred/Code/bun/build/debug/codegen/ResolvedSourceTag.zig -MErrorCode=/Users/jarred/Code/bun/build/debug/codegen/ErrorCode.zig -Mbuild_options=/Users/jarred/Code/bun/build/debug/cache/zig/local/c/71043ec0281e8832a9875d3532e32536/options.zig -ODebug -target aarch64-macos.13.0-none -mcpu apple_m1 -Mtranslated-c-headers=/Users/jarred/Code/bun/build/debug/cache/zig/local/o/21185a03a8290413fcee447f7caa5e1f/c-headers-for-zig.zig -lc++ -lc --cache-dir /Users/jarred/Code/bun/build/debug/cache/zig/local --global-cache-dir /Users/jarred/Code/bun/build/debug/cache/zig/global --name bun-debug -fno-compiler-rt --zig-lib-dir /Users/jarred/Code/bun/vendor/zig/lib/ --listen=- 
Build Summary: 2/5 steps succeeded; 1 failed
obj transitive failure
├─ zig build-obj bun-debug Debug aarch64-macos.13.0-none failure
└─ install generated to bun-zig.o transitive failure
   └─ zig build-obj bun-debug Debug aarch64-macos.13.0-none (+2 more reused dependencies)
error: the following build command failed with exit code 1:
/Users/jarred/Code/bun/build/debug/cache/zig/local/o/164273aabf6bf1b3f1bc532a6ee1b6f6/build /Users/jarred/Code/bun/vendor/zig/zig /Users/jarred/Code/bun/vendor/zig/lib /Users/jarred/Code/bun /Users/jarred/Code/bun/build/debug/cache/zig/local /Users/jarred/Code/bun/build/debug/cache/zig/global --seed 0xb72d8224 -Z5e6e00b40476db49 obj --prefix /Users/jarred/Code/bun/build/debug -Dobj_format=obj -Dtarget=aarch64-macos-none -Doptimize=Debug -Dcpu=apple_m1 -Denable_logs=true -Dversion=1.2.3 -Dreported_nodejs_version=22.6.0 -Dcanary=1 -Dcodegen_path=/Users/jarred/Code/bun/build/debug/codegen -Dcodegen_embed=false --prominent-compile-errors -Dsha=2644bad5d472e4306232bc7cafa61d93b978e90a
FAILED: bun-zig.o /Users/jarred/Code/bun/build/debug/bun-zig.o 
cd /Users/jarred/Code/bun && /Users/jarred/Code/bun/vendor/zig/zig build obj --cache-dir /Users/jarred/Code/bun/build/debug/cache/zig/local --global-cache-dir /Users/jarred/Code/bun/build/debug/cache/zig/global --zig-lib-dir /Users/jarred/Code/bun/vendor/zig/lib --prefix /Users/jarred/Code/bun/build/debug -Dobj_format=obj -Dtarget=aarch64-macos-none -Doptimize=Debug -Dcpu=apple_m1 -Denable_logs=true -Dversion=1.2.3 -Dreported_nodejs_version=22.6.0 -Dcanary=1 -Dcodegen_path=/Users/jarred/Code/bun/build/debug/codegen -Dcodegen_embed=false --prominent-compile-errors -Dsha=2644bad5d472e4306232bc7cafa61d93b978e90a
ninja: build stopped: subcommand failed.
  cmake took 0.34 seconds

My hunch is that it has something to do with non-power-of-2-sized integers and packed structs.

At one point the following diff caused it to stop crashing briefly, but I didn't commit the code at the time, so I can't recover everything else that had changed:

- const cp_len = wtf8ByteSequenceLength(it.bytes[pos]);
+ const cp_len: u3 = @intCast(wtf8ByteSequenceLength(it.bytes[pos]));

@Jarred-Sumner
Copy link
Contributor Author

Jarred-Sumner commented Feb 9, 2025

Another reproducing commit: https://github.com/oven-sh/bun/blob/0c75516f20c370a12fd187a5a4258fc9af6f6feb/src/bundler/bundle_v2.zig#L1786 doesn't reproduce it

It seems to happen for almost any compiler error on macOS arm64.

Linux x64 correctly shows the error:

info: zig compiler v0.14.0-dev.2987+183bb8b08
check
└─ zig build-obj bun-debug Debug native-native-gnu.2.27 1 errors
src/bundler/bundle_v2.zig:1786:31: error: struct 'src.jsc.API' has no member named 'Server'
            page: *bun.JSC.API.Server.PageBundleRoute,
                   ~~~~~~~~~~~^~~~~~~
src/jsc.zig:30:17: note: struct declared here
pub const API = struct {
                ^~~~~~
referenced by:

@Jarred-Sumner
Copy link
Contributor Author

Okay I have a better reproduction.

oven-sh/bun@6630f3b

It stops crashing when the two-line diff in oven-sh/bun@6630f3b is commented out.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Observed behavior contradicts documented or intended behavior
Projects
None yet
Development

No branches or pull requests

2 participants