From e9612b4f90429f2c5bcb8b3d252d57bf36043fe2 Mon Sep 17 00:00:00 2001 From: ksss Date: Mon, 25 May 2026 13:59:01 +0900 Subject: [PATCH] Fix lexer infinite loop / abort on invalid UTF-8 byte When the active encoding's `char_width` returned 0 for a byte, `rbs_next_char` left `byte_len = 0`. The lexer then either looped forever (when the byte was inside a comment) or tripped `RBS_ASSERT(current_character_bytes > 0, ...)` in `rbs_skip` at top level. Treat such a byte as a 1-byte garbage character so the lexer always advances at least one byte. The invalid byte then surfaces as a regular parsing error through the existing error path. Minimal reproducer that used to hang the host process indefinitely with the GVL held: RBS::Parser._parse_signature( RBS::Buffer.new(content: "# \xC2".force_encoding("UTF-8"), name: "x.rbs"), 0, 3 ) Found by fuzzing the parser entry points with random byte mutations of the existing seed RBS files. Co-Authored-By: Claude Opus 4.7 (1M context) --- Gemfile | 1 + Gemfile.lock | 1 + src/lexstate.c | 6 +++++- test/rbs/parser_test.rb | 19 +++++++++++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index cf2982acd..d35624b74 100644 --- a/Gemfile +++ b/Gemfile @@ -48,6 +48,7 @@ gem "net-smtp" gem 'csv' gem 'ostruct' gem 'pstore' +gem "timeout" group :minitest do gem "minitest" diff --git a/Gemfile.lock b/Gemfile.lock index ada025774..0a1f7d0a4 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,6 +231,7 @@ DEPENDENCIES steep! tempfile test-unit + timeout BUNDLED WITH 4.0.1 diff --git a/src/lexstate.c b/src/lexstate.c index d65671e5f..f3d35b435 100644 --- a/src/lexstate.c +++ b/src/lexstate.c @@ -134,7 +134,11 @@ bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len *byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start)); - if (*byte_len == 1) { + if (*byte_len == 0) { + // Avoid infinite loop on invalid bytes. 
+ *byte_len = 1; + *codepoint = (unsigned int) (unsigned char) *start; + } else if (*byte_len == 1) { *codepoint = (unsigned int) *start; } else { *codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode diff --git a/test/rbs/parser_test.rb b/test/rbs/parser_test.rb index ea160cc9d..fa5b26e46 100644 --- a/test/rbs/parser_test.rb +++ b/test/rbs/parser_test.rb @@ -1,4 +1,5 @@ require "test_helper" +require "timeout" class RBS::ParserTest < Test::Unit::TestCase def buffer(source) @@ -1028,4 +1029,22 @@ class Foo[T < Integer] < Bar # Comment assert_equal [:tTRIVIA, "\n", 56...57], tokens.shift.then { |t| [t[0], t[1].source, t[1].range] } assert_equal [:pEOF, '', 57...57], tokens.shift.then { |t| [t[0], t[1].source, t[1].range] } end + + def test_invalid_utf8_byte_in_comment_does_not_hang + # Regression: invalid UTF-8 byte in a comment used to loop forever in the lexer. NOTE(review): that loop ran inside the C extension with the GVL held, so Timeout (which interrupts from another Ruby thread) probably cannot fire on a regression; the run would hang rather than fail -- confirm, or run the parse in a subprocess. + source = "# \xC2".dup.force_encoding(Encoding::UTF_8) + Timeout.timeout(5) do + RBS::Parser._parse_signature(buffer(source), 0, source.bytesize) + end + end + + def test_invalid_utf8_byte_at_top_level_raises + # Regression: invalid UTF-8 byte at top level used to trip RBS_ASSERT in the C extension. + source = "\xFF".dup.force_encoding(Encoding::UTF_8) + Timeout.timeout(5) do + assert_raises(RBS::ParsingError) do + RBS::Parser._parse_signature(buffer(source), 0, source.bytesize) + end + end + end end