From 76c6270a996c663beab51d8837b35ad66a42ea7f Mon Sep 17 00:00:00 2001
From: Claude
Date: Sun, 28 Dec 2025 16:00:10 +0000
Subject: [PATCH] Handle unicode whitespace characters in lexer

Add isClickHouseWhitespace() to recognize characters that ClickHouse
treats as whitespace but Go's unicode.IsSpace() does not: U+FEFF BOM,
U+180E Mongolian Vowel Separator, U+200B Zero Width Space, U+200C Zero
Width Non-Joiner, U+200D Zero Width Joiner, and U+2060 Word Joiner.

This fixes the 01280_unicode_whitespaces_lexer test case stmt3.
---
 lexer/lexer.go    | 25 +++++++++++++++++--
 .../metadata.json |  6 +----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/lexer/lexer.go b/lexer/lexer.go
index 8fc82c1b3b..92e032b6a0 100644
--- a/lexer/lexer.go
+++ b/lexer/lexer.go
@@ -73,12 +73,33 @@ func (l *Lexer) peekChar() rune {
 }
 
 func (l *Lexer) skipWhitespace() {
-	// Skip whitespace and BOM (byte order mark U+FEFF)
-	for unicode.IsSpace(l.ch) || l.ch == '\uFEFF' {
+	// Skip whitespace, BOM, and other Unicode characters that ClickHouse treats as whitespace.
+	// See: https://github.com/ClickHouse/ClickHouse/blob/master/src/Parsers/Lexer.cpp
+	for unicode.IsSpace(l.ch) || isClickHouseWhitespace(l.ch) {
 		l.readChar()
 	}
 }
 
+// isClickHouseWhitespace returns true for characters ClickHouse treats as whitespace
+// but Go's unicode.IsSpace does not recognize.
+func isClickHouseWhitespace(ch rune) bool {
+	switch ch {
+	case '\uFEFF': // BOM (Byte Order Mark)
+		return true
+	case '\u180E': // MONGOLIAN VOWEL SEPARATOR
+		return true
+	case '\u200B': // ZERO WIDTH SPACE
+		return true
+	case '\u200C': // ZERO WIDTH NON-JOINER
+		return true
+	case '\u200D': // ZERO WIDTH JOINER
+		return true
+	case '\u2060': // WORD JOINER
+		return true
+	}
+	return false
+}
+
 // NextToken returns the next token from the input.
 func (l *Lexer) NextToken() Item {
 	l.skipWhitespace()
diff --git a/parser/testdata/01280_unicode_whitespaces_lexer/metadata.json b/parser/testdata/01280_unicode_whitespaces_lexer/metadata.json
index 1295a45747..0967ef424b 100644
--- a/parser/testdata/01280_unicode_whitespaces_lexer/metadata.json
+++ b/parser/testdata/01280_unicode_whitespaces_lexer/metadata.json
@@ -1,5 +1 @@
-{
-  "explain_todo": {
-    "stmt3": true
-  }
-}
+{}