diff --git a/redisvl/utils/token_escaper.py b/redisvl/utils/token_escaper.py index 04e04cd2..effa1c96 100644 --- a/redisvl/utils/token_escaper.py +++ b/redisvl/utils/token_escaper.py @@ -9,10 +9,10 @@ class TokenEscaper: """ # Characters that RediSearch requires us to escape during queries. - # Source: https://redis.io/docs/stack/search/reference/escaping/#the-rules-of-text-field-tokenization - DEFAULT_ESCAPED_CHARS = r"[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ]" + # Source: https://redis.io/docs/latest/develop/ai/search-and-query/advanced-concepts/escaping/#tokenization-rules-for-text-fields + DEFAULT_ESCAPED_CHARS = r"[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ \?]" - # Same as above but excludes * to allow wildcard patterns + # Same as above but excludes * and ? to allow wildcard patterns ESCAPED_CHARS_NO_WILDCARD = r"[,.<>{}\[\]\\\"\':;!@#$%^&()\-+=~\/ ]" def __init__(self, escape_chars_re: Optional[Pattern] = None): diff --git a/tests/unit/test_token_escaper.py b/tests/unit/test_token_escaper.py index 0adb2d11..c1d6fd89 100644 --- a/tests/unit/test_token_escaper.py +++ b/tests/unit/test_token_escaper.py @@ -19,8 +19,8 @@ def escaper(): ), ( r"& symbols, like * and ?", - r"\&\ symbols\,\ like\ \*\ and\ ?", - ), # TODO: question marks are not caught? + r"\&\ symbols\,\ like\ \*\ and\ \?", + ), # underscores are ignored (r"-dashes_and_underscores-", r"\-dashes_and_underscores\-"), ], @@ -57,7 +57,7 @@ def test_escape_text_chars(escaper, test_input, expected): ("(parentheses)", r"\(parentheses\)"), ("[brackets]", r"\[brackets\]"), ("{braces}", r"\{braces\}"), - # ("question?mark", r"question\?mark"), #TODO - question marks are not caught? + ("question?mark", r"question\?mark"), # Unicode characters in tags ("你好", r"你好"), # Assuming non-Latin characters don't need escaping ("emoji:😊", r"emoji\:😊"), @@ -81,6 +81,7 @@ def test_escape_text_chars(escaper, test_input, expected): "parentheses", "brackets", "braces", + "question", "non-latin", "emoji", ],