From b283dcc78e9f122ccadff4113a6eddf726575d12 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 May 2026 11:19:50 -0700 Subject: [PATCH 1/4] Inject parquet settings in testSearchCommandWithSpecialIndexName The test creates `logs-2021.01.11` and `logs-7.10.0-2021.01.11` directly via raw PUTs to exercise the parser's handling of dot/version-prefixed index names. The raw path bypasses `TestUtils.createIndexByRestClient`, so the analytics-engine `tests.analytics.parquet_indices=true` toggle never injects the composite/parquet settings. On the analytics-engine route this surfaces as `Table 'logs-2021.01.11' not found` (the catalog-builder only exposes indices that match an analytics-engine- recognized data format), failing the test. Routing the creates through `TestUtils.createIndexByRestClient` makes both code paths honor the toggle: legacy / Calcite continues to create plain Lucene-backed indices, analytics-engine gets parquet-backed composite indices. The index names are still tested verbatim. Surfaced by `CalciteSearchCommandIT` on the analytics-engine route. Signed-off-by: Kai Huang --- .../java/org/opensearch/sql/ppl/SearchCommandIT.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java index b8e5b2a5722..a13dbb3c625 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java @@ -54,10 +54,15 @@ public void testSearchCommandWithoutSearchKeyword() throws IOException { @Test public void testSearchCommandWithSpecialIndexName() throws IOException { - executeRequest(new Request("PUT", "/logs-2021.01.11")); + // Route through TestUtils so the analytics-engine parquet-indices toggle + // (tests.analytics.parquet_indices=true) applies the composite/parquet + // settings; without them the analytics-engine planner can't find a backend + // for the scan and the query 500s. + org.opensearch.sql.legacy.TestUtils.createIndexByRestClient(client(), "logs-2021.01.11", null); verifyDataRows(executeQuery("search source=logs-2021.01.11")); - executeRequest(new Request("PUT", "/logs-7.10.0-2021.01.11")); + org.opensearch.sql.legacy.TestUtils.createIndexByRestClient( + client(), "logs-7.10.0-2021.01.11", null); verifyDataRows(executeQuery("search source=logs-7.10.0-2021.01.11")); } From ee5798d8bbe6a99524c24de9f755fc967359afe7 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 May 2026 14:14:07 -0700 Subject: [PATCH 2/4] Pass minimal `{"properties":{}}` mapping in testSearchCommandWithSpecialIndexName MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `OpenSearchSchemaBuilder.buildSchema` skips any index whose mapping has no `properties` block — the analytics catalog then doesn't surface the index, and the PPL parser fails with `Table 'logs-2021.01.11' not found` before reaching the row type. Routing the special-index create through `TestUtils.createIndexByRestClient` with `null` mapping produces an empty `{}` body, so the index is created with no mapping at all and the catalog still skips it. Pass a minimal `{"mappings":{"properties":{}}}` body. OpenSearch strips the empty `properties` to `{}` server-side but the schema builder sees a non-null mapping and registers the (zero-column) table. The query then parses cleanly and the test's `verifyDataRows()` (no expected rows) succeeds against the empty index. Signed-off-by: Kai Huang --- .../org/opensearch/sql/ppl/SearchCommandIT.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java index a13dbb3c625..791d7ca7db7 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/SearchCommandIT.java @@ -54,15 +54,18 @@ public void testSearchCommandWithoutSearchKeyword() throws IOException { @Test public void testSearchCommandWithSpecialIndexName() throws IOException { - // Route through TestUtils so the analytics-engine parquet-indices toggle - // (tests.analytics.parquet_indices=true) applies the composite/parquet - // settings; without them the analytics-engine planner can't find a backend - // for the scan and the query 500s. - org.opensearch.sql.legacy.TestUtils.createIndexByRestClient(client(), "logs-2021.01.11", null); + // A minimal `{"properties":{}}` mapping is required so the analytics-engine + // catalog (OpenSearchSchemaBuilder.buildSchema) actually surfaces the index — + // mapping-less indices are skipped and the parser then fails with + // "Table 'logs-2021.01.11' not found". Routing through TestUtils also + // honors the tests.analytics.parquet_indices toggle. + String minimalMapping = "{\"mappings\":{\"properties\":{}}}"; + org.opensearch.sql.legacy.TestUtils.createIndexByRestClient( + client(), "logs-2021.01.11", minimalMapping); verifyDataRows(executeQuery("search source=logs-2021.01.11")); org.opensearch.sql.legacy.TestUtils.createIndexByRestClient( - client(), "logs-7.10.0-2021.01.11", null); + client(), "logs-7.10.0-2021.01.11", minimalMapping); verifyDataRows(executeQuery("search source=logs-7.10.0-2021.01.11")); } From 4070c28ebfafc4308ab4b501a7fe46dbe7ce4d54 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 May 2026 14:48:22 -0700 Subject: [PATCH 3/4] Preserve datetime schema-column types in AnalyticsExecutionEngine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `DatetimeOutputCastRule` (api/spec/datetime) wraps every datetime-typed root column in `CAST(... AS VARCHAR)` so the analytics-engine backend can emit PPL's documented space-separator format via `to_char` (`DatetimeOutputCastRewriter` on the OpenSearch side). The downside is that `plan.getRowType()` then advertises those columns as VARCHAR — the response schema reports `"string"` for what callers expect as `"timestamp"`. The legacy / Calcite-reference path doesn't have this divergence: the wire value is the formatted string while the schema column type stays as the original datetime type. Walk the outermost `Project` (descending through a single `Sort`, the `LogicalSystemLimit` system query-size wrapper) and, for any slot of shape `CAST( AS VARCHAR)`, swap in the inner `RelDataType` for the schema column. Values still come back as the strings DataFusion emitted — only the schema column type is restored. Falls back to the slot's reported type for non-cast slots, non-datetime sources, or when the root isn't a Project (raw scan / aggregate fragment). Surfaced by `verifySchema(... schema("@timestamp", "timestamp") ...)` assertions in `CalciteSearchCommandIT` time-modifier tests; this commit removes the schema-type divergence for those queries (the remaining data-row mismatches are downstream Lucene-secondary issues filed separately). Signed-off-by: Kai Huang --- .../analytics/AnalyticsExecutionEngine.java | 81 +++++++++++++++++-- 1 file changed, 76 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/executor/analytics/AnalyticsExecutionEngine.java b/core/src/main/java/org/opensearch/sql/executor/analytics/AnalyticsExecutionEngine.java index ddfe5fd3556..19e2cda324a 100644 --- a/core/src/main/java/org/opensearch/sql/executor/analytics/AnalyticsExecutionEngine.java +++ b/core/src/main/java/org/opensearch/sql/executor/analytics/AnalyticsExecutionEngine.java @@ -6,13 +6,20 @@ package org.opensearch.sql.executor.analytics; import java.util.ArrayList; +import java.util.EnumSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.core.Sort; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.type.SqlTypeName; import org.opensearch.analytics.exec.QueryPlanExecutor; import org.opensearch.core.action.ActionListener; import org.opensearch.sql.ast.statement.ExplainMode; @@ -87,7 +94,7 @@ public void onResponse(Iterable rows) { try { List fields = plan.getRowType().getFieldList(); List results = convertRows(rows, fields); - Schema schema = buildSchema(fields); + Schema schema = buildSchema(fields, recoverOriginalDatetimeTypes(plan)); execMetric.set(System.nanoTime() - execStart); listener.onResponse(new QueryResponse(schema, results, Cursor.None)); } catch (Exception e) { @@ -132,11 +139,11 @@ private List convertRows(Iterable rows, List fields) { + private Schema buildSchema(List fields, List reportedTypes) { List columns = new ArrayList<>(); - for (RelDataTypeField field : fields) { - ExprType exprType = convertType(field.getType()); - columns.add(new Schema.Column(field.getName(), null, exprType)); + for (int i = 0; i < fields.size(); i++) { + ExprType exprType = convertType(reportedTypes.get(i)); + columns.add(new Schema.Column(fields.get(i).getName(), null, exprType)); } return new Schema(columns); } @@ -148,4 +155,68 @@ private ExprType convertType(RelDataType type) { return org.opensearch.sql.data.type.ExprCoreType.UNKNOWN; } } + + /** + * Datetime SqlTypeNames produced by the SQL-plugin engine pipeline. Used by {@link + * #recoverOriginalDatetimeTypes} to detect {@code DatetimeOutputCastRule}'s {@code + * CAST( AS VARCHAR)} output slots. + */ + private static final EnumSet DATETIME_TYPES = + EnumSet.of( + SqlTypeName.TIMESTAMP, + SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE, + SqlTypeName.DATE, + SqlTypeName.TIME); + + /** + * Recovers the pre-cast datetime types for the schema. {@code DatetimeOutputCastRule} (api + * post-analysis pass) wraps every datetime-typed root column in {@code CAST(... AS VARCHAR)} so + * the analytics-engine backend can emit PPL's documented space-separator format via {@code + * to_char} ({@code DatetimeOutputCastRewriter} on the OpenSearch sandbox side). The downside is + * that {@code plan.getRowType()} then advertises those columns as VARCHAR — the response schema + * would report {@code "string"} for what tests (and callers) still expect as {@code "timestamp"}. + * The legacy / Calcite-reference path doesn't have this divergence: the wire value is a formatted + * string while the schema column type is the original datetime type. + * + *

This walk inspects the OUTERMOST {@link Project} (descending through a single {@link Sort}, + * the {@code LogicalSystemLimit} wrapper) and, for any slot of the exact shape {@code + * CAST( AS VARCHAR)}, swaps in the inner {@link RelDataType} for the schema. Values + * still come back as the strings DataFusion emitted; only the schema column type is restored. + * + *

Falls back to the slot's reported type for non-cast slots, non-datetime sources, or when the + * root isn't a Project (raw scan / aggregate fragment). + */ + private static List recoverOriginalDatetimeTypes(RelNode plan) { + List fields = plan.getRowType().getFieldList(); + List reportedTypes = new ArrayList<>(fields.size()); + for (RelDataTypeField field : fields) { + reportedTypes.add(field.getType()); + } + RelNode root = plan; + if (root instanceof Sort sort && sort.getInput() instanceof Project) { + root = sort.getInput(); + } + if (!(root instanceof Project project)) { + return reportedTypes; + } + List slots = project.getProjects(); + int n = Math.min(slots.size(), reportedTypes.size()); + for (int i = 0; i < n; i++) { + RexNode slot = slots.get(i); + if (!(slot instanceof RexCall call)) { + continue; + } + if (call.getKind() != SqlKind.CAST) { + continue; + } + if (call.getType().getSqlTypeName() != SqlTypeName.VARCHAR) { + continue; + } + RexNode source = call.getOperands().get(0); + if (DATETIME_TYPES.contains(source.getType().getSqlTypeName())) { + reportedTypes.set(i, source.getType()); + } + } + return reportedTypes; + } } From cdac62ede405470dc3cf22b6e8bf04d8b68f6125 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 18 May 2026 15:09:15 -0700 Subject: [PATCH 4/4] Convert byte[] cells to base64 ExprStringValue for IP / BINARY fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `AnalyticsExecutionEngine.convertRows` reaches `ExprValueUtils.fromObjectValue` for every cell in every row. OpenSearch `ip` and `binary` fields are stored as raw bytes, so once the analytics-engine reader (paired with OpenSearch PR #21681 commit 4) materializes a parquet column, each cell arrives as a Java `byte[]`. The fall-through catch-all previously threw: ExpressionEvaluationException: unsupported object class [B at ExprValueUtils.fromObjectValue(ExprValueUtils.java:168) at AnalyticsExecutionEngine.convertRows(AnalyticsExecutionEngine.java:128) Every PPL query that even projects an ip column fails on the analytics route with this error, regardless of operator or predicate. `OpenSearchSchemaBuilder.mapFieldType` collapses both `ip` and `binary` into Calcite `VARBINARY`, so the cell-level type system can't distinguish the two. An earlier draft of this commit gated IP rendering on byte length (4 or 16 bytes → `ExprIpValue` via `InetAddress.getByAddress`), but that misinterprets any 4-byte ASCII payload (e.g. `"INFO"`, `"WARN"`, `"POST"`) as a public IPv4 address — the classic "Norway problem" applied to bytes. With no schema context here we can't safely guess, so default to a base64-encoded `ExprStringValue`. Typed call sites that DO know the column is `ip` should route through `ipValue(String)` explicitly. Once `OpenSearchSchemaBuilder` splits `ip` and `binary` into separate UDTs (TODO already filed in that file), this branch can render IP cells in dotted-decimal form again without the length-guessing. Signed-off-by: Kai Huang --- .../opensearch/sql/data/model/ExprValueUtils.java | 10 ++++++++++ .../sql/data/model/ExprValueUtilsTest.java | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/core/src/main/java/org/opensearch/sql/data/model/ExprValueUtils.java b/core/src/main/java/org/opensearch/sql/data/model/ExprValueUtils.java index 02b2e2bcb1d..f1b48214b17 100644 --- a/core/src/main/java/org/opensearch/sql/data/model/ExprValueUtils.java +++ b/core/src/main/java/org/opensearch/sql/data/model/ExprValueUtils.java @@ -164,6 +164,16 @@ public static ExprValue fromObjectValue(Object o) { return timestampValue(((LocalDateTime) o).toInstant(ZoneOffset.UTC)); } else if (o instanceof TemporalAmount) { return intervalValue((TemporalAmount) o); + } else if (o instanceof byte[] bytes) { + // No type context here: OpenSearch `ip` and `binary` fields both collapse to + // VARBINARY in `OpenSearchSchemaBuilder`, so a 4- or 16-byte payload could be + // a genuine address or arbitrary binary that happens to be that length (4-byte + // ASCII strings like "INFO" or "WARN" are indistinguishable from a public IPv4 + // address byte-for-byte). Default to a safe, unambiguous encoding. Typed call + // sites should route IP cells through {@link #ipValue(String)} explicitly. + // TODO: split `ip` and `binary` into separate UDTs upstream so this branch can + // render IP cells in dotted form again. See `OpenSearchSchemaBuilder.mapFieldType`. + return stringValue(java.util.Base64.getEncoder().encodeToString(bytes)); } else { throw new ExpressionEvaluationException("unsupported object " + o.getClass()); } diff --git a/core/src/test/java/org/opensearch/sql/data/model/ExprValueUtilsTest.java b/core/src/test/java/org/opensearch/sql/data/model/ExprValueUtilsTest.java index 888042ad789..4cb7b48f24c 100644 --- a/core/src/test/java/org/opensearch/sql/data/model/ExprValueUtilsTest.java +++ b/core/src/test/java/org/opensearch/sql/data/model/ExprValueUtilsTest.java @@ -244,6 +244,18 @@ public void constructDateAndTimeValue() { ExprValueUtils.fromObjectValue("2012-07-07 01:01:01", TIMESTAMP)); } + @Test + public void fromObjectValue_byte_array_returns_base64_string() { + // Without type context the factory can't distinguish `ip` from `binary` + // (both collapse to VARBINARY in OpenSearchSchemaBuilder), so every byte[] + // is base64-encoded. Typed routing happens at the call site. + assertEquals( + new ExprStringValue("AQIDBA=="), ExprValueUtils.fromObjectValue(new byte[] {1, 2, 3, 4})); + assertEquals( + new ExprStringValue("AQIDBAU="), + ExprValueUtils.fromObjectValue(new byte[] {1, 2, 3, 4, 5})); + } + @Test public void fromObjectValue_double_infinity_returns_null() { assertTrue(ExprValueUtils.fromObjectValue(Double.POSITIVE_INFINITY).isNull());