From 299377994e09d8a81d1870b01e50a13b3d137651 Mon Sep 17 00:00:00 2001 From: Soutaro Matsumoto Date: Tue, 3 Mar 2026 10:59:31 +0900 Subject: [PATCH 1/2] Specify the input range by byte offsets instead of character offsets --- include/rbs/lexer.h | 4 ++-- lib/rbs/parser_aux.rb | 11 ++++++----- src/lexstate.c | 2 +- src/parser.c | 4 +++- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/rbs/lexer.h b/include/rbs/lexer.h index 6fd527959..4f182ba1e 100644 --- a/include/rbs/lexer.h +++ b/include/rbs/lexer.h @@ -136,8 +136,8 @@ typedef struct { * */ typedef struct { rbs_string_t string; - int start_pos; /* The character position that defines the start of the input */ - int end_pos; /* The character position that defines the end of the input */ + int start_pos; /* The byte position that defines the start of the input */ + int end_pos; /* The byte position that defines the end of the input */ rbs_position_t current; /* The current position: just before the current_character */ rbs_position_t start; /* The start position of the current token */ diff --git a/lib/rbs/parser_aux.rb b/lib/rbs/parser_aux.rb index 1607eb922..fa7d83f39 100644 --- a/lib/rbs/parser_aux.rb +++ b/lib/rbs/parser_aux.rb @@ -7,12 +7,12 @@ module RBS class Parser def self.parse_type(source, range: 0..., variables: [], require_eof: false, void_allowed: true, self_allowed: true, classish_allowed: true) buf = buffer(source) - _parse_type(buf, range.begin || 0, range.end || buf.last_position, variables, require_eof, void_allowed, self_allowed, classish_allowed) + _parse_type(buf, range.begin || 0, range.end || buf.content.bytesize, variables, require_eof, void_allowed, self_allowed, classish_allowed) end def self.parse_method_type(source, range: 0..., variables: [], require_eof: false) buf = buffer(source) - _parse_method_type(buf, range.begin || 0, range.end || buf.last_position, variables, require_eof) + _parse_method_type(buf, range.begin || 0, range.end || buf.content.bytesize, 
variables, require_eof) end def self.parse_signature(source) @@ -25,7 +25,8 @@ def self.parse_signature(source) else 0 end - dirs, decls = _parse_signature(buf, start_pos, buf.last_position) + content = buf.content + dirs, decls = _parse_signature(buf, start_pos, content.bytesize) if resolved dirs = dirs.dup if dirs.frozen? @@ -37,7 +38,7 @@ def self.parse_signature(source) def self.parse_type_params(source, module_type_params: true) buf = buffer(source) - _parse_type_params(buf, 0, buf.last_position, module_type_params) + _parse_type_params(buf, 0, buf.content.bytesize, module_type_params) end def self.magic_comment(buf) @@ -66,7 +67,7 @@ def self.magic_comment(buf) def self.lex(source) buf = buffer(source) - list = _lex(buf, buf.last_position) + list = _lex(buf, buf.content.bytesize) value = list.map do |type, location| Token.new(type: type, location: location) end diff --git a/src/lexstate.c b/src/lexstate.c index b6f370daa..a4372bde4 100644 --- a/src/lexstate.c +++ b/src/lexstate.c @@ -118,7 +118,7 @@ unsigned int rbs_peek(rbs_lexer_t *lexer) { } bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) { - if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) { + if (RBS_UNLIKELY(lexer->current.byte_pos == lexer->end_pos)) { return false; } diff --git a/src/parser.c b/src/parser.c index ffa32dec3..2a83dc9c2 100644 --- a/src/parser.c +++ b/src/parser.c @@ -3475,7 +3475,9 @@ rbs_lexer_t *rbs_lexer_new(rbs_allocator_t *allocator, rbs_string_t string, cons } if (start_pos > 0) { - rbs_skipn(lexer, start_pos); + while (lexer->current.byte_pos < start_pos) { + rbs_skip(lexer); + } } lexer->start = lexer->current; From d36a956a023c5b21400ef20763d830d22773ecd1 Mon Sep 17 00:00:00 2001 From: Soutaro Matsumoto Date: Tue, 3 Mar 2026 11:25:33 +0900 Subject: [PATCH 2/2] Add `byte_range:` parsing API --- lib/rbs/parser_aux.rb | 20 ++++++++++++---- sig/parser.rbs | 28 ++++++++++++---------- test/rbs/parser_test.rb | 2 +- 
test/rbs/type_parsing_test.rb | 44 +++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 17 deletions(-) diff --git a/lib/rbs/parser_aux.rb b/lib/rbs/parser_aux.rb index fa7d83f39..974c54e20 100644 --- a/lib/rbs/parser_aux.rb +++ b/lib/rbs/parser_aux.rb @@ -5,14 +5,16 @@ module RBS class Parser - def self.parse_type(source, range: 0..., variables: [], require_eof: false, void_allowed: true, self_allowed: true, classish_allowed: true) + def self.parse_type(source, range: nil, byte_range: 0..., variables: [], require_eof: false, void_allowed: true, self_allowed: true, classish_allowed: true) buf = buffer(source) - _parse_type(buf, range.begin || 0, range.end || buf.content.bytesize, variables, require_eof, void_allowed, self_allowed, classish_allowed) + byte_range = byte_range(range, buf.content) if range + _parse_type(buf, byte_range.begin || 0, byte_range.end || buf.content.bytesize, variables, require_eof, void_allowed, self_allowed, classish_allowed) end - def self.parse_method_type(source, range: 0..., variables: [], require_eof: false) + def self.parse_method_type(source, range: nil, byte_range: 0..., variables: [], require_eof: false) buf = buffer(source) - _parse_method_type(buf, range.begin || 0, range.end || buf.content.bytesize, variables, require_eof) + byte_range = byte_range(range, buf.content) if range + _parse_method_type(buf, byte_range.begin || 0, byte_range.end || buf.content.bytesize, variables, require_eof) end def self.parse_signature(source) @@ -126,5 +128,15 @@ def self.parse_inline_trailing_annotation(source, range, variables: []) buf = buffer(source) _parse_inline_trailing_annotation(buf, range.begin || 0, range.end || buf.last_position, variables) end + + def self.byte_range(char_range, content) + start_offset = char_range.begin + end_offset = char_range.end + + start_prefix = content[0, start_offset] or raise if start_offset + end_prefix = content[0, end_offset] or raise if end_offset + + 
start_prefix&.bytesize...end_prefix&.bytesize + end end end diff --git a/sig/parser.rbs b/sig/parser.rbs index 028dc8df1..66e404bb8 100644 --- a/sig/parser.rbs +++ b/sig/parser.rbs @@ -20,13 +20,13 @@ module RBS # Parse a method type and return it # - # When `range` keyword is specified, it starts parsing from the `begin` to the `end` of the range. + # When `byte_range` keyword is specified, it starts parsing from the `begin` to the `end` of the range. # # ```ruby - # RBS::Parser.parse_method_type("() -> void") # => `() -> void` - # RBS::Parser.parse_method_type("() -> void", range: 0...) # => `() -> void` - # RBS::Parser.parse_method_type("() -> void () -> String", range: 11...) # => `() -> String` - # RBS::Parser.parse_method_type("() -> void () -> String", range: 23...) # => nil + # RBS::Parser.parse_method_type("() -> void") # => `() -> void` + # RBS::Parser.parse_method_type("() -> void", byte_range: 0...) # => `() -> void` + # RBS::Parser.parse_method_type("() -> void () -> String", byte_range: 11...) # => `() -> String` + # RBS::Parser.parse_method_type("() -> void () -> String", byte_range: 23...) # => nil # ``` # # When `require_eof` is `true`, an error is raised if more tokens are left in the input. @@ -39,17 +39,18 @@ module RBS # RBS::Parser.parse_method_type("", require_eof: true) # => nil # ``` # - def self.parse_method_type: (Buffer | String, ?range: Range[Integer?], ?variables: Array[Symbol], ?require_eof: bool) -> MethodType? + def self.parse_method_type: (Buffer | String, ?byte_range: Range[Integer?], ?variables: Array[Symbol], ?require_eof: bool) -> MethodType? + | %a{deprecated: Use `byte_range:` keyword instead of `range:`} (Buffer | String, range: Range[Integer?], ?variables: Array[Symbol], ?require_eof: bool) -> MethodType? # Parse a type and return it # - # When `range` keyword is specified, it starts parsing from the `begin` to the `end` of the range. 
+ # When `byte_range` keyword is specified, it starts parsing from the `begin` to the `end` of the range. # # ```ruby - # RBS::Parser.parse_type("String") # => `String` - # RBS::Parser.parse_type("String", range: 0...) # => `String` - # RBS::Parser.parse_type("String Integer", pos: 7...) # => `Integer` - # RBS::Parser.parse_type("String Integer", pos: 14...) # => nil + # RBS::Parser.parse_type("String") # => `String` + # RBS::Parser.parse_type("String", byte_range: 0...) # => `String` + # RBS::Parser.parse_type("String Integer", byte_range: 7...) # => `Integer` + # RBS::Parser.parse_type("String Integer", byte_range: 14...) # => nil # ``` # # When `require_eof` is `true`, an error is raised if more tokens are left in the input. @@ -76,7 +77,8 @@ module RBS # RBS::Parser.parse_type("self", self_allowed: false) # => Raises an syntax error # ``` # - def self.parse_type: (Buffer | String, ?range: Range[Integer?], ?variables: Array[Symbol], ?require_eof: bool, ?void_allowed: bool, ?self_allowed: bool, ?classish_allowed: bool) -> Types::t? + def self.parse_type: (Buffer | String, ?byte_range: Range[Integer?], ?variables: Array[Symbol], ?require_eof: bool, ?void_allowed: bool, ?self_allowed: bool, ?classish_allowed: bool) -> Types::t? + | %a{deprecated: Use `byte_range:` keyword instead of `range:`} (Buffer | String, range: Range[Integer?], ?variables: Array[Symbol], ?require_eof: bool, ?void_allowed: bool, ?self_allowed: bool, ?classish_allowed: bool) -> Types::t? # Parse whole RBS file and return an array of declarations # @@ -130,6 +132,8 @@ module RBS def self.buffer: (String | Buffer source) -> Buffer + def self.byte_range: (Range[Integer?] char_range, String content) -> Range[Integer?] + def self._parse_type: (Buffer, Integer start_pos, Integer end_pos, Array[Symbol] variables, bool require_eof, bool void_allowed, bool self_allowed, bool classish_allowed) -> Types::t? 
 def self._parse_method_type: (Buffer, Integer start_pos, Integer end_pos, Array[Symbol] variables, bool require_eof) -> MethodType? diff --git a/test/rbs/parser_test.rb b/test/rbs/parser_test.rb index 7c8c57783..ea160cc9d 100644 --- a/test/rbs/parser_test.rb +++ b/test/rbs/parser_test.rb @@ -812,7 +812,7 @@ def test_buffer_location def test_negative_range assert_raises ArgumentError do - RBS::Parser.parse_type("a", range: -2...-1) + RBS::Parser.parse_type("a", byte_range: -2...-1) end end diff --git a/test/rbs/type_parsing_test.rb b/test/rbs/type_parsing_test.rb index 0a9214052..c140442f8 100644 --- a/test/rbs/type_parsing_test.rb +++ b/test/rbs/type_parsing_test.rb @@ -980,4 +980,48 @@ def test_parse__string_unicode_escape__non_unicode assert_equal "[\\u30eb]", type.literal end end + + def test_parse__byte_range + input = '["🐕", "🐈"]' + + Parser.parse_type(input).yield_self do |type| + assert_instance_of Types::Tuple, type + end + + Parser.parse_type(input, byte_range: '["🐕", '.bytesize...).yield_self do |type| + assert_instance_of Types::Literal, type + assert_equal "🐈", type.literal + end + + Parser.parse_type(input, byte_range: '["🐕", '.bytesize...'["🐕", "🐈"'.bytesize, require_eof: true).yield_self do |type| + assert_instance_of Types::Literal, type + assert_equal "🐈", type.literal + end + + Parser.parse_type(input, byte_range: '["🐕", '.bytesize..'["🐕", "🐈"'.bytesize, require_eof: true).yield_self do |type| + assert_instance_of Types::Literal, type + assert_equal "🐈", type.literal + end + end + + def test_parse__range_works + input = '["🐕", "🐈"]' + + Parser.parse_type(input, range: 6...9, require_eof: true).yield_self do |type| + assert_instance_of Types::Literal, type + assert_equal "🐈", type.literal + end + end + + def test_parse__byte_range_incorrect + # We want better error-handling ergonomics, but currently this simply raises a syntax error. + + input = '"🐕🐈"' + + exn = assert_raises RBS::ParsingError do + Parser.parse_type(input, byte_range: 2...) 
+ end + + assert_equal "a.rbs:1:2...1:3: Syntax error: unexpected token for simple type, token=`🐈` (ErrorToken)", exn.message + end end