From 1cda26346e84c0c45785d73c07f4f916727a6121 Mon Sep 17 00:00:00 2001 From: Joachim Rosskopf Date: Thu, 25 Jun 2026 14:50:16 +0200 Subject: [PATCH 1/3] fix: Serialize LIST/STRUCT columns per-row in REST JSON (#89) - convertVectorListToJson read the entire list child vector for every row instead of the row's own slice, so multi-row LIST(VARCHAR) columns reported the concatenation of all rows' lists. - convertVectorStructToJson always read child element 0, so structs nested in a list (LIST(STRUCT)) repeated the first element N times. - Use the per-row duckdb_list_entry offset/length for lists and pass row_idx through to struct children; honor row validity (NULL lists and structs now serialize as null). - Add regression tests covering single/multi-row LIST(STRUCT), LIST(VARCHAR), multi-row STRUCT, and NULL list entries. Closes #89 --- src/query_executor.cpp | 29 +++++++- test/cpp/query_executor_test.cpp | 120 +++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 3 deletions(-) diff --git a/src/query_executor.cpp b/src/query_executor.cpp index 9bf4074..f92c0aa 100644 --- a/src/query_executor.cpp +++ b/src/query_executor.cpp @@ -422,18 +422,38 @@ crow::json::wvalue QueryResult::convertVectorEnumToJson(const duckdb_vector &vec } crow::json::wvalue QueryResult::convertVectorListToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // A LIST column stores every row's elements back-to-back in a single child + // vector. The per-row slice is described by the list_entry at `row_idx` + // (offset + length); emitting the whole child vector instead would repeat + // every other row's elements (issue #89). + auto entries = static_cast(duckdb_vector_get_data(vector)); + auto entry = entries[row_idx]; auto child_vector = duckdb_list_vector_get_child(vector); auto child_size = duckdb_list_vector_get_size(vector); std::vector result; - for (idx_t i = 0; i < child_size; i++) { - result.push_back(convertVectorEntryToJson(child_vector, i)); + for (idx_t i = 0; i < entry.length; i++) { + idx_t child_idx = entry.offset + i; + if (child_idx >= child_size) { + break; + } + result.push_back(convertVectorEntryToJson(child_vector, child_idx)); } return crow::json::wvalue(result); } crow::json::wvalue QueryResult::convertVectorStructToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + auto struct_type = duckdb_vector_get_column_type(vector); auto child_size = duckdb_struct_type_child_count(struct_type); @@ -443,8 +463,11 @@ crow::json::wvalue QueryResult::convertVectorStructToJson(const duckdb_vector &v DuckDBString child_name_wrapper(duckdb_struct_type_child_name(struct_type, i)); auto str_name = child_name_wrapper.to_string(); + // Each struct field is a parallel child vector; read this row's entry + // (`row_idx`), not element 0, so multi-row results and structs nested + // inside a list resolve the correct element (issue #89). auto child_vector = duckdb_struct_vector_get_child(vector, i); - result[str_name] = convertVectorEntryToJson(child_vector, 0); + result[str_name] = convertVectorEntryToJson(child_vector, row_idx); } duckdb_destroy_logical_type(&struct_type); diff --git a/test/cpp/query_executor_test.cpp b/test/cpp/query_executor_test.cpp index 8c78636..feec2e9 100644 --- a/test/cpp/query_executor_test.cpp +++ b/test/cpp/query_executor_test.cpp @@ -228,6 +228,126 @@ TEST_CASE("QueryExecutor chunk experiment", "[query_executor]") { duckdb_close(&database); } +TEST_CASE("QueryExecutor LIST/STRUCT per-row serialization", "[query_executor][list]") { + // Regression for issue #89: native LIST(STRUCT) / LIST(VARCHAR) columns + // must serialize each row's own slice of the list child vector, not the + // whole child vector, and structs nested inside a list must read their + // own element rather than always element[0]. + duckdb_database database; + REQUIRE(duckdb_open(NULL, &database) == DuckDBSuccess); + + SECTION("single-row LIST(STRUCT) emits distinct elements") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT [ + {'stage': 'prepare', 'n_dropped': 11945}, + {'stage': 'features', 'n_dropped': 57203}, + {'stage': 'postprocess', 'n_dropped': 238505} + ] AS by_stage + )SQL"); + + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 1); + REQUIRE(doc[0]["by_stage"].size() == 3); + REQUIRE(doc[0]["by_stage"][0]["stage"].s() == "prepare"); + REQUIRE(doc[0]["by_stage"][0]["n_dropped"].i() == 11945); + REQUIRE(doc[0]["by_stage"][1]["stage"].s() == "features"); + REQUIRE(doc[0]["by_stage"][1]["n_dropped"].i() == 57203); + REQUIRE(doc[0]["by_stage"][2]["stage"].s() == "postprocess"); + REQUIRE(doc[0]["by_stage"][2]["n_dropped"].i() == 238505); + } + + SECTION("multi-row LIST(VARCHAR) keeps each row's own list") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT * FROM (VALUES + (1, ['a1', 'a2', 'a3']), + (2, ['b1', 'b2', 'b3']), + (3, ['c1', 'c2', 'c3']), + (4, ['d1', 'd2', 'd3']) + ) AS t(id, flags) + ORDER BY id + )SQL"); + + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 4); + + REQUIRE(doc[0]["flags"].size() == 3); + REQUIRE(doc[0]["flags"][0].s() == "a1"); + REQUIRE(doc[0]["flags"][2].s() == "a3"); + + REQUIRE(doc[1]["flags"].size() == 3); + REQUIRE(doc[1]["flags"][0].s() == "b1"); + REQUIRE(doc[1]["flags"][2].s() == "b3"); + + REQUIRE(doc[3]["flags"].size() == 3); + REQUIRE(doc[3]["flags"][0].s() == "d1"); + REQUIRE(doc[3]["flags"][2].s() == "d3"); + } + + SECTION("multi-row LIST(STRUCT) keeps each row's own elements") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT * FROM (VALUES + (1, [{'k': 'a', 'v': 1}, {'k': 'b', 'v': 2}]), + (2, [{'k': 'c', 'v': 3}, {'k': 'd', 'v': 4}]) + ) AS t(id, items) + ORDER BY id + )SQL"); + + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 2); + + REQUIRE(doc[0]["items"].size() == 2); + REQUIRE(doc[0]["items"][0]["k"].s() == "a"); + REQUIRE(doc[0]["items"][0]["v"].i() == 1); + REQUIRE(doc[0]["items"][1]["k"].s() == "b"); + REQUIRE(doc[0]["items"][1]["v"].i() == 2); + + REQUIRE(doc[1]["items"].size() == 2); + REQUIRE(doc[1]["items"][0]["k"].s() == "c"); + REQUIRE(doc[1]["items"][0]["v"].i() == 3); + REQUIRE(doc[1]["items"][1]["k"].s() == "d"); + REQUIRE(doc[1]["items"][1]["v"].i() == 4); + } + + SECTION("multi-row STRUCT column reads its own row") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT * FROM (VALUES + (1, {'name': 'alice', 'age': 30}), + (2, {'name': 'bob', 'age': 40}) + ) AS t(id, person) + ORDER BY id + )SQL"); + + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 2); + REQUIRE(doc[0]["person"]["name"].s() == "alice"); + REQUIRE(doc[0]["person"]["age"].i() == 30); + REQUIRE(doc[1]["person"]["name"].s() == "bob"); + REQUIRE(doc[1]["person"]["age"].i() == 40); + } + + SECTION("NULL list entry stays null") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT * FROM (VALUES + (1, ['x', 'y']), + (2, CAST(NULL AS VARCHAR[])) + ) AS t(id, flags) + ORDER BY id + )SQL"); + + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 2); + REQUIRE(doc[0]["flags"].size() == 2); + REQUIRE(doc[1]["flags"].t() == crow::json::type::Null); + } + + duckdb_close(&database); +} + TEST_CASE("QueryExecutor::executeWithBindings - prepared path", "[query_executor][prepared]") { duckdb_database database; REQUIRE(duckdb_open(NULL, &database) == DuckDBSuccess); From f6218d8d2a3a7e314d5f791e8cfbd7aff2d97027 Mon Sep 17 00:00:00 2001 From: Joachim Rosskopf Date: Thu, 25 Jun 2026 14:55:07 +0200 Subject: [PATCH 2/3] fix: Serialize fixed-size ARRAY columns per-row (#89) ARRAY columns were routed through the LIST serializer, which after the list fix reads per-row duckdb_list_entry offset/length metadata. ARRAY vectors have no such metadata (each row is a constant array_size run in the child vector), so this read garbage and risked UB. - Add convertVectorArrayToJson using duckdb_array_type_array_size and duckdb_array_vector_get_child, indexing child at row_idx*array_size+i. - Route DUCKDB_TYPE_ARRAY to the new serializer; honor row validity. - Add a multi-row fixed-size ARRAY regression test. Found in codex review of the LIST/STRUCT fix. --- src/include/query_executor.hpp | 1 + src/query_executor.cpp | 25 ++++++++++++++++++++++++- test/cpp/query_executor_test.cpp | 23 +++++++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/include/query_executor.hpp b/src/include/query_executor.hpp index bf69039..a7f4c98 100644 --- a/src/include/query_executor.hpp +++ b/src/include/query_executor.hpp @@ -56,6 +56,7 @@ class QueryResult { static crow::json::wvalue convertVectorIntervalToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorEnumToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorListToJson(const duckdb_vector &vector, const idx_t row_idx); + static crow::json::wvalue convertVectorArrayToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorStructToJson(const duckdb_vector &vector, const idx_t row_idx); template diff --git a/src/query_executor.cpp b/src/query_executor.cpp index f92c0aa..165e615 100644 --- a/src/query_executor.cpp +++ b/src/query_executor.cpp @@ -288,7 +288,7 @@ crow::json::wvalue QueryResult::convertVectorEntryToJson(const duckdb_vector &ve case DUCKDB_TYPE_MAP: return convertVectorStructToJson(vector, row_idx); // Treat as struct for JSON case DUCKDB_TYPE_ARRAY: - return convertVectorListToJson(vector, row_idx); // Treat as list for JSON + return convertVectorArrayToJson(vector, row_idx); case DUCKDB_TYPE_UNION: return convertVectorStructToJson(vector, row_idx); // Treat as struct for JSON default: @@ -448,6 +448,29 @@ crow::json::wvalue QueryResult::convertVectorListToJson(const duckdb_vector &vec return crow::json::wvalue(result); } +crow::json::wvalue QueryResult::convertVectorArrayToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // A fixed-size ARRAY has no per-row list_entry: every row occupies a + // constant `array_size` run in the child vector, so this row's slice + // starts at row_idx * array_size (unlike LIST, which carries offsets). + auto array_type = duckdb_vector_get_column_type(vector); + auto array_size = duckdb_array_type_array_size(array_type); + duckdb_destroy_logical_type(&array_type); + + auto child_vector = duckdb_array_vector_get_child(vector); + + std::vector result; + for (idx_t i = 0; i < array_size; i++) { + result.push_back(convertVectorEntryToJson(child_vector, row_idx * array_size + i)); + } + + return crow::json::wvalue(result); +} + crow::json::wvalue QueryResult::convertVectorStructToJson(const duckdb_vector &vector, const idx_t row_idx) { auto validity = duckdb_vector_get_validity(vector); if (!duckdb_validity_row_is_valid(validity, row_idx)) { diff --git a/test/cpp/query_executor_test.cpp b/test/cpp/query_executor_test.cpp index feec2e9..7820d6d 100644 --- a/test/cpp/query_executor_test.cpp +++ b/test/cpp/query_executor_test.cpp @@ -329,6 +329,29 @@ TEST_CASE("QueryExecutor LIST/STRUCT per-row serialization", "[query_executor][l REQUIRE(doc[1]["person"]["age"].i() == 40); } + SECTION("multi-row fixed-size ARRAY keeps each row's own elements") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT * FROM (VALUES + (1, [10, 11, 12]::INTEGER[3]), + (2, [20, 21, 22]::INTEGER[3]), + (3, [30, 31, 32]::INTEGER[3]) + ) AS t(id, coords) + ORDER BY id + )SQL"); + + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 3); + + REQUIRE(doc[0]["coords"].size() == 3); + REQUIRE(doc[0]["coords"][0].i() == 10); + REQUIRE(doc[0]["coords"][2].i() == 12); + + REQUIRE(doc[2]["coords"].size() == 3); + REQUIRE(doc[2]["coords"][0].i() == 30); + REQUIRE(doc[2]["coords"][2].i() == 32); + } + SECTION("NULL list entry stays null") { QueryExecutor executor(database); executor.execute(R"SQL( From bfb1bca347422ada7853a4518ec7d97de4c159d9 Mon Sep 17 00:00:00 2001 From: Joachim Rosskopf Date: Thu, 25 Jun 2026 15:08:28 +0200 Subject: [PATCH 3/3] fix: Serialize UNION columns as active member only (#89) UNION columns were routed through the generic struct serializer, which emitted the tag plus every candidate member for each row, exposing inactive members instead of the one selected by the row's tag. - Add convertVectorUnionToJson: read the tag (struct child 0, uint8), resolve the active member (struct child tag+1) and emit it as {member_name: value}, matching DuckDB's own to_json output. - Route DUCKDB_TYPE_UNION to the new serializer; honor row validity and fail safe on out-of-range tags. - Add a multi-row UNION regression test. Found in codex review of the LIST/STRUCT fix. --- src/include/query_executor.hpp | 1 + src/query_executor.cpp | 38 +++++++++++++++++++++++++++++++- test/cpp/query_executor_test.cpp | 27 +++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/include/query_executor.hpp b/src/include/query_executor.hpp index a7f4c98..8990d4c 100644 --- a/src/include/query_executor.hpp +++ b/src/include/query_executor.hpp @@ -58,6 +58,7 @@ class QueryResult { static crow::json::wvalue convertVectorListToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorArrayToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorStructToJson(const duckdb_vector &vector, const idx_t row_idx); + static crow::json::wvalue convertVectorUnionToJson(const duckdb_vector &vector, const idx_t row_idx); template static crow::json::wvalue convertVectorEntryToJson(const duckdb_vector &vector, const idx_t row_idx) { diff --git a/src/query_executor.cpp b/src/query_executor.cpp index 165e615..bb714be 100644 --- a/src/query_executor.cpp +++ b/src/query_executor.cpp @@ -290,7 +290,7 @@ crow::json::wvalue QueryResult::convertVectorEntryToJson(const duckdb_vector &ve case DUCKDB_TYPE_ARRAY: return convertVectorArrayToJson(vector, row_idx); case DUCKDB_TYPE_UNION: - return convertVectorStructToJson(vector, row_idx); // Treat as struct for JSON + return convertVectorUnionToJson(vector, row_idx); default: CROW_LOG_WARNING << "Unknown type: " << type_id; return crow::json::wvalue(nullptr); @@ -497,4 +497,40 @@ crow::json::wvalue QueryResult::convertVectorStructToJson(const duckdb_vector &v return result; } +crow::json::wvalue QueryResult::convertVectorUnionToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // A UNION is physically a STRUCT: child 0 is the tag (uint8_t), children + // 1..n are the candidate members. Only the member selected by the row's + // tag is valid, so emit just that member as {name: value} (matching + // DuckDB's own to_json) rather than every member, which the generic + // struct path did, exposing inactive members. + auto union_type = duckdb_vector_get_column_type(vector); + auto member_count = duckdb_union_type_member_count(union_type); + + auto tag_vector = duckdb_struct_vector_get_child(vector, 0); + auto tags = static_cast(duckdb_vector_get_data(tag_vector)); + idx_t member_idx = static_cast(tags[row_idx]); + if (member_idx >= member_count) { + // Out-of-range tag should not happen for well-formed unions; fail + // safe rather than reading past the struct children. + duckdb_destroy_logical_type(&union_type); + return crow::json::wvalue(nullptr); + } + + DuckDBString member_name_wrapper(duckdb_union_type_member_name(union_type, member_idx)); + auto member_name = member_name_wrapper.to_string(); + duckdb_destroy_logical_type(&union_type); + + // Member i lives at struct child index i + 1 (the tag occupies slot 0). + auto member_vector = duckdb_struct_vector_get_child(vector, member_idx + 1); + + crow::json::wvalue result; + result[member_name] = convertVectorEntryToJson(member_vector, row_idx); + return result; +} + } // namespace flapi \ No newline at end of file diff --git a/test/cpp/query_executor_test.cpp b/test/cpp/query_executor_test.cpp index 7820d6d..602b329 100644 --- a/test/cpp/query_executor_test.cpp +++ b/test/cpp/query_executor_test.cpp @@ -352,6 +352,33 @@ TEST_CASE("QueryExecutor LIST/STRUCT per-row serialization", "[query_executor][l REQUIRE(doc[2]["coords"][2].i() == 32); } + SECTION("multi-row UNION emits only the active member per row") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT id, u FROM ( + SELECT 1 AS id, union_value(num := 42)::UNION(num INTEGER, str VARCHAR) AS u + UNION ALL + SELECT 2, union_value(str := 'hi')::UNION(num INTEGER, str VARCHAR) + UNION ALL + SELECT 3, union_value(num := 7)::UNION(num INTEGER, str VARCHAR) + ) ORDER BY id + )SQL"); + + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 3); + + // Each row exposes only its active member ({name: value}), matching + // DuckDB's to_json — not every union member. + REQUIRE(doc[0]["u"]["num"].i() == 42); + REQUIRE_FALSE(doc[0]["u"].has("str")); + + REQUIRE(doc[1]["u"]["str"].s() == "hi"); + REQUIRE_FALSE(doc[1]["u"].has("num")); + + REQUIRE(doc[2]["u"]["num"].i() == 7); + REQUIRE_FALSE(doc[2]["u"].has("str")); + } + SECTION("NULL list entry stays null") { QueryExecutor executor(database); executor.execute(R"SQL(