Skip to content

Commit 3adedd2

Browse files
authored
Handle unicode whitespace characters in lexer (#68)
1 parent d875f47 commit 3adedd2

File tree

2 files changed

+24
-7
lines changed

2 files changed

+24
-7
lines changed

lexer/lexer.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,33 @@ func (l *Lexer) peekChar() rune {
7373
}
7474

7575
func (l *Lexer) skipWhitespace() {
76-
// Skip whitespace and BOM (byte order mark U+FEFF)
77-
for unicode.IsSpace(l.ch) || l.ch == '\uFEFF' {
76+
// Skip whitespace, BOM, and other Unicode characters that ClickHouse treats as whitespace.
77+
// See: https://github.com/ClickHouse/ClickHouse/blob/master/src/Parsers/Lexer.cpp
78+
for unicode.IsSpace(l.ch) || isClickHouseWhitespace(l.ch) {
7879
l.readChar()
7980
}
8081
}
8182

83+
// isClickHouseWhitespace reports whether ch is one of the characters that ClickHouse
84+
// treats as whitespace but that Go's unicode.IsSpace does not report as a space.
85+
func isClickHouseWhitespace(ch rune) bool {
86+
switch ch {
87+
case '\uFEFF': // ZERO WIDTH NO-BREAK SPACE, used as the byte order mark (BOM)
88+
return true
89+
case '\u180E': // MONGOLIAN VOWEL SEPARATOR
90+
return true
91+
case '\u200B': // ZERO WIDTH SPACE
92+
return true
93+
case '\u200C': // ZERO WIDTH NON-JOINER
94+
return true
95+
case '\u200D': // ZERO WIDTH JOINER
96+
return true
97+
case '\u2060': // WORD JOINER
98+
return true
99+
}
100+
return false
101+
}
102+
82103
// NextToken returns the next token from the input.
83104
func (l *Lexer) NextToken() Item {
84105
l.skipWhitespace()
Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1 @@
1-
{
2-
"explain_todo": {
3-
"stmt3": true
4-
}
5-
}
1+
{}

0 commit comments

Comments
 (0)