From e9612b4f90429f2c5bcb8b3d252d57bf36043fe2 Mon Sep 17 00:00:00 2001
From: ksss <co000ri@gmail.com>
Date: Mon, 25 May 2026 13:59:01 +0900
Subject: [PATCH] Fix lexer infinite loop / abort on invalid UTF-8 byte

When the active encoding's `char_width` returned 0 for a byte,
`rbs_next_char` left `byte_len = 0`. The lexer then either looped
forever (when the byte was inside a comment) or tripped
`RBS_ASSERT(current_character_bytes > 0, ...)` in `rbs_skip` at top
level.

Treat such a byte as a 1-byte garbage character so the lexer always
advances at least one byte. The invalid byte then surfaces as a
regular parsing error through the existing error path.

Minimal reproducer that used to hang the host process indefinitely
with the GVL held:

  RBS::Parser._parse_signature(
    RBS::Buffer.new(content: "# \xC2".force_encoding("UTF-8"),
                    name: "x.rbs"),
    0, 3
  )

Found by fuzzing the parser entry points with random byte mutations
of the existing seed RBS files.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Gemfile                 |  1 +
 Gemfile.lock            |  1 +
 src/lexstate.c          |  6 +++++-
 test/rbs/parser_test.rb | 19 +++++++++++++++++++
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/Gemfile b/Gemfile
index cf2982acd..d35624b74 100644
--- a/Gemfile
+++ b/Gemfile
@@ -48,6 +48,7 @@ gem "net-smtp"
 gem 'csv'
 gem 'ostruct'
 gem 'pstore'
+gem "timeout"
 
 group :minitest do
   gem "minitest"
diff --git a/Gemfile.lock b/Gemfile.lock
index ada025774..0a1f7d0a4 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -231,6 +231,7 @@ DEPENDENCIES
   steep!
   tempfile
   test-unit
+  timeout
 
 BUNDLED WITH
   4.0.1
diff --git a/src/lexstate.c b/src/lexstate.c
index d65671e5f..f3d35b435 100644
--- a/src/lexstate.c
+++ b/src/lexstate.c
@@ -134,7 +134,11 @@ bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len
 
     *byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
 
-    if (*byte_len == 1) {
+    if (*byte_len == 0) {
+        // Avoid infinite loop on invalid bytes.
+        *byte_len = 1;
+        *codepoint = (unsigned int) (unsigned char) *start;
+    } else if (*byte_len == 1) {
         *codepoint = (unsigned int) *start;
     } else {
         *codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode
diff --git a/test/rbs/parser_test.rb b/test/rbs/parser_test.rb
index ea160cc9d..fa5b26e46 100644
--- a/test/rbs/parser_test.rb
+++ b/test/rbs/parser_test.rb
@@ -1,4 +1,5 @@
 require "test_helper"
+require "timeout"
 
 class RBS::ParserTest < Test::Unit::TestCase
   def buffer(source)
@@ -1028,4 +1029,22 @@ class Foo[T < Integer] < Bar # Comment
     assert_equal [:tTRIVIA, "\n", 56...57], tokens.shift.then { |t| [t[0], t[1].source, t[1].range] }
     assert_equal [:pEOF, '', 57...57], tokens.shift.then { |t| [t[0], t[1].source, t[1].range] }
   end
+
+  def test_invalid_utf8_byte_in_comment_does_not_hang
+    # Regression: a lone lead byte ("\xC2") in a comment used to spin the lexer forever; must now finish without raising.
+    source = "# \xC2".dup.force_encoding(Encoding::UTF_8)
+    Timeout.timeout(5) do
+      RBS::Parser._parse_signature(buffer(source), 0, source.bytesize)
+    end
+  end
+
+  def test_invalid_utf8_byte_at_top_level_raises
+    # Regression: an invalid byte ("\xFF") at top level used to trip RBS_ASSERT in C; must now raise RBS::ParsingError.
+    source = "\xFF".dup.force_encoding(Encoding::UTF_8)
+    Timeout.timeout(5) do
+      assert_raises(RBS::ParsingError) do
+        RBS::Parser._parse_signature(buffer(source), 0, source.bytesize)
+      end
+    end
+  end
 end