diff --git a/be/src/core/data_type_serde/complex_type_deserialize_util.h b/be/src/core/data_type_serde/complex_type_deserialize_util.h index 20f49636c769b1..ff76d64dcd6542 100644 --- a/be/src/core/data_type_serde/complex_type_deserialize_util.h +++ b/be/src/core/data_type_serde/complex_type_deserialize_util.h @@ -42,7 +42,9 @@ struct ComplexTypeDeserializeUtil { std::vector elements; for (int pos = 0; pos < str.size; ++pos) { char c = str.data[pos]; - if (c == '"' || c == '\'') { + if (c == '\\' && pos + 1 < static_cast(str.size)) { + ++pos; // skip escaped character + } else if (c == '"' || c == '\'') { if (!has_quote) { quote_char = c; has_quote = !has_quote; diff --git a/be/test/core/data_type_serde/data_type_serde_map_test.cpp b/be/test/core/data_type_serde/data_type_serde_map_test.cpp index 33d72e3b7e823f..cfe6dee93e0e80 100644 --- a/be/test/core/data_type_serde/data_type_serde_map_test.cpp +++ b/be/test/core/data_type_serde/data_type_serde_map_test.cpp @@ -41,6 +41,7 @@ #include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_string.h" #include "core/data_type/define_primitive_type.h" +#include "core/data_type_serde/complex_type_deserialize_util.h" #include "core/field.h" #include "core/types.h" #include "storage/olap_common.h" @@ -178,4 +179,60 @@ TEST_F(DataTypeMapSerDeTest, ArrowMemNotAligned) { EXPECT_TRUE(st.ok()); } +// Stream Load JSON stores Map as String via to_json_string, then converts back +// via from_string → split_by_delimiter. The splitter must handle '\' escapes +// so that '\"' inside a value doesn't flip quote state and expose inner ':'/','. +TEST_F(DataTypeMapSerDeTest, SplitByDelimiterHandlesBackslashEscape) { + DataTypeSerDe::FormatOptions opts; + opts.map_key_delim = ':'; + opts.collection_delim = ','; + + auto make_map_type = []() { + auto str = std::make_shared(std::make_shared()); + return std::make_shared(str, str); + }; + + // split_by_delimiter: '\"' must not toggle quote state + // Input (after stripping outer {}): "k":"[{\"a\":\"b\\nc:" + // Expected: 2 elements — key "k" and value "[{\"a\":\"b\\nc:" + { + std::string inner = "\"k\":\"[{\\\"a\\\":\\\"b\\\\nc:\""; + StringRef str(inner.data(), inner.size()); + auto result = ComplexTypeDeserializeUtil::split_by_delimiter( + str, [&](char c) { return c == opts.map_key_delim || c == opts.collection_delim; }); + EXPECT_EQ(result.size(), 2u); + } + + // from_string: value ending with ':' (map_key_delim) must not cause split error + // Simulates to_json_string output: {"k":"[{\"a\":\"b\\nc:"} + { + auto map_type = make_map_type(); + auto col = map_type->create_column(); + std::string map_str = "{\"k\":\"[{\\\"a\\\":\\\"b\\\\nc:\"}"; + StringRef ref(map_str.data(), map_str.size()); + EXPECT_TRUE(map_type->get_serde()->from_string(ref, *col, opts).ok()); + EXPECT_EQ(col->size(), 1u); + } + + // from_string: value ending with ',' (collection_delim) — same class of bug + { + auto map_type = make_map_type(); + auto col = map_type->create_column(); + std::string map_str = "{\"k\":\"[{\\\"a\\\":\\\"b\\\\nc,\"}"; + StringRef ref(map_str.data(), map_str.size()); + EXPECT_TRUE(map_type->get_serde()->from_string(ref, *col, opts).ok()); + EXPECT_EQ(col->size(), 1u); + } + + // Control: value ending with ')' (not a delimiter) — always worked + { + auto map_type = make_map_type(); + auto col = map_type->create_column(); + std::string map_str = "{\"k\":\"[{\\\"a\\\":\\\"b\\\\nc)\"}"; + StringRef ref(map_str.data(), map_str.size()); + EXPECT_TRUE(map_type->get_serde()->from_string(ref, *col, opts).ok()); + EXPECT_EQ(col->size(), 1u); + } +} + } // namespace doris diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.csv b/regression-test/data/jsonb_p0/test_jsonb_cast.csv index d4d64bebe190aa..3efda7706dc838 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_cast.csv +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.csv @@ -1,4 +1,4 @@ 1 \N 2 ['{\'x\':\'{"y":1}\', \'t\':\'{"y":2}\'}', '{"x":1}'] -3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar'] +3 ['foo\'bar', 'foo"bar', 'foo\'bar', 'foo\'\'bar'] 4 ['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e'] diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out b/regression-test/data/jsonb_p0/test_jsonb_cast.out index bb47b7249a523c..0046ea38e0aea8 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_cast.out +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out @@ -2,13 +2,13 @@ -- !select_1 -- 1 \N 2 ["{'x':'{"y":1}', 't':'{"y":2}'}", "{"x":1}"] -3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +3 ["foo'bar", "foo"bar", "foo'bar", "foo''bar"] 4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] -- !select_2 -- 1 \N 2 ["{'x':'{"y":1}', 't':'{"y":2}'}", "{"x":1}"] -3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +3 ["foo'bar", "foo"bar", "foo'bar", "foo''bar"] 4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] 27 ["{"k1":"v1", "k2":200}"] 28 ["{"a.b.c":{"k1.a1":"v31", "k2":300},"a":"niu"}"] @@ -18,7 +18,7 @@ -- !select_json -- 1 \N 2 ["{'x':'{\\"y\\":1}', 't':'{\\"y\\":2}'}","{\\"x\\":1}"] -3 ["foo'bar', 'foo\\"bar', 'foo\\\\'bar', 'foo''bar"] +3 ["foo'bar","foo\\"bar","foo'bar","foo''bar"] 4 ["/some/cool/url","/some/cool/url","a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] 27 ["{\\"k1\\":\\"v1\\", \\"k2\\":200}"] 28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\":300},\\"a\\":\\"niu\\"}"] diff --git a/regression-test/data/jsonb_p0/test_jsonb_unescaped.csv b/regression-test/data/jsonb_p0/test_jsonb_unescaped.csv index e4f859e7511b1b..37c07297cbf445 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_unescaped.csv +++ b/regression-test/data/jsonb_p0/test_jsonb_unescaped.csv @@ -1,5 +1,5 @@ 1 \N 2 ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}'] -3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar'] +3 ['foo\'bar', 'foo"bar', 'foo\'bar', 'foo\'\'bar'] 4 ['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e'] 5 ["\"双引号\"", "反斜\\线"] \ No newline at end of file diff --git a/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out b/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out index 99fb23ef9eed17..f7df0f30c14603 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out +++ b/regression-test/data/jsonb_p0/test_jsonb_with_unescaped_string.out @@ -2,14 +2,14 @@ -- !select_csv -- 1 \N 2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "{"x" : 1}"] -3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +3 ["foo'bar", "foo"bar", "foo'bar", "foo''bar"] 4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] 5 [""双引号"", "反斜\\线"] -- !select_json -- 1 \N 2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "'{"x" : 1}'"] -3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"] +3 ["foo'bar", "foo"bar", "foo\\'bar", "foo''bar"] 4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"] 5 [""双引号"", "反斜\\线"]