diff --git a/docs/docs/development/type-generation.md b/docs/docs/development/type-generation.md index c715791a..48d239ff 100644 --- a/docs/docs/development/type-generation.md +++ b/docs/docs/development/type-generation.md @@ -94,6 +94,17 @@ The type generator: 4. Generates TypeScript interfaces for query parameters and results 5. Creates a `QueryRegistry` type for type-safe query execution +### Parameters during `DESCRIBE QUERY` + +Type generation describes each query without binding real parameters, so it +substitutes a placeholder default for every `:param` (e.g. `''` for a string). +That breaks queries whose shape depends on a value — most notably dynamic table +names via `IDENTIFIER(:catalog || '.schema.table')`. Annotate such parameters +with a sample value (`-- @param catalog STRING = main`) so the describe call can +resolve a real table. The sample value is used only at type-generation time; the +runtime query still binds the actual parameter. See +[SQL parameters → Sample values](../plugins/analytics.md#sample-values-for-type-generation). + ## Using generated types Once types are generated, your IDE will provide autocomplete and type checking: diff --git a/docs/docs/plugins/analytics.md b/docs/docs/plugins/analytics.md index 3198d434..1ed10210 100644 --- a/docs/docs/plugins/analytics.md +++ b/docs/docs/plugins/analytics.md @@ -61,6 +61,34 @@ at the call site. - `FLOAT`, `DOUBLE` — bind via `sql.float()` / `sql.double()` - `NUMERIC`, `DECIMAL` — bind via `sql.numeric()` (pass strings for precision) +### Sample values for type generation + +Some queries only have a valid shape once a parameter has a concrete value — most +commonly a dynamic table name built with `IDENTIFIER()`. During type generation +AppKit runs `DESCRIBE QUERY` with placeholder defaults, so an unresolved parameter +collapses to an empty string and produces invalid SQL +(`IDENTIFIER('' || '.schema.table')` → `PARSE_SYNTAX_ERROR`). + +Append `= value` to a `-- @param` annotation to give type generation a sample +value. It is used **only** while describing the query; at runtime the real +parameter is still bound, so the query stays portable across environments: + +```sql +-- @param target_catalog STRING = main +SELECT * +FROM IDENTIFIER(:target_catalog || '.sales.nation') +``` + +Type generation describes `main.sales.nation` to infer the result columns, while +the deployed app binds whatever catalog the caller passes. String, `DATE`, and +`TIMESTAMP` values are quoted automatically (`= main` → `'main'`), and an +already-quoted literal is kept as-is (`= '2024-01-01'`). Numeric, `BOOLEAN`, and +`BINARY` values are validated against a strict literal shape (`= 100`, `= true`, +`= X'00'`); a value that doesn't match — anything that could otherwise inject SQL +into the describe statement — is ignored and the parameter falls back to its +type-based placeholder, so a sample value can never break out of the +`DESCRIBE QUERY`. + ## Server-injected parameters `:workspaceId` is **injected by the server** and **must not** be annotated: diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index ce0cdd0b..2573155f 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -393,6 +393,159 @@ export function defaultForType(sqlType: string | undefined): string { } } +/** + * True when `raw` is already a single, well-formed SQL single-quoted string + * literal — i.e. it opens and closes with `'` and every interior quote is part + * of an escaped `''` pair. `'2024-01-01'` and `'O''Brien'` qualify; + * `'a' OR 1=1 OR 'b'` does not (it has lone interior quotes), so it is treated + * as raw content and re-escaped rather than trusted. + */ +function isWellFormedStringLiteral(raw: string): boolean { + if (raw.length < 2 || !raw.startsWith("'") || !raw.endsWith("'")) { + return false; + } + const inner = raw.slice(1, -1); + return !inner.replace(/''/g, "").includes("'"); +} + +/** + * Format a user-supplied sample value as a SQL literal for substitution into + * the build-time DESCRIBE statement. Returns `null` when the value isn't valid + * for its type, so the caller falls back to the safe type-based placeholder + * instead of substituting attacker-controllable text. + * + * The value comes from a `.sql` file that may be shared via a template or + * dependency, so it must not be able to inject SQL into `DESCRIBE QUERY`: + * - string-like types are always emitted as one well-formed, fully-escaped + * single-quoted literal (a pre-quoted literal is kept as-is, anything else is + * quoted with `'` doubled), so the value can never break out of the string; + * - numeric / boolean / binary values must match a strict literal shape and are + * rejected (`null`) otherwise, rather than being passed through verbatim. + */ +function formatSampleValue( + sqlType: string | undefined, + raw: string, +): string | null { + switch (sqlType?.toUpperCase()) { + case "STRING": + case "DATE": + case "TIMESTAMP": + case "TIMESTAMP_NTZ": + return isWellFormedStringLiteral(raw) + ? raw + : `'${raw.replace(/'/g, "''")}'`; + case "NUMERIC": + case "DECIMAL": + case "BIGINT": + case "TINYINT": + case "SMALLINT": + case "INT": + case "FLOAT": + case "DOUBLE": + return /^[+-]?\d+(\.\d+)?$/.test(raw) ? raw : null; + case "BOOLEAN": + return /^(?:true|false)$/i.test(raw) ? raw.toLowerCase() : null; + case "BINARY": + return /^X'[0-9a-fA-F]*'$/i.test(raw) ? raw : null; + default: + return null; + } +} + +/** + * Parse optional describe-time sample values from `@param` annotations, e.g. + * `-- @param target_catalog STRING = main`. The value is substituted into the + * SQL **only during DESCRIBE QUERY** so type generation can resolve queries + * whose shape depends on a parameter value — most notably dynamic table names + * via `IDENTIFIER(:target_catalog || '.schema.table')`, where the empty-string + * default would otherwise produce malformed SQL. Runtime binding is unaffected: + * the analytics plugin still binds the real parameter at execution time, so the + * query stays portable across environments. + * + * Returns a map of parameter name to the formatted SQL literal to substitute. + */ +export function extractParameterDefaults(sql: string): Record { + const defaults: Record = {}; + // Mirrors extractParameterTypes' type alternation, then requires `= ` + // through end-of-line. Lines without a value are left to extractParameterTypes. + const regex = + /--\s*@param\s+(\w+)\s+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)\s*=\s*(.+?)\s*$/gim; + for (const match of sql.matchAll(regex)) { + const [, paramName, paramType, rawValue] = match; + const formatted = formatSampleValue(paramType, rawValue); + // A value that fails type validation is dropped, not substituted: the param + // then falls back to the safe type-based placeholder during DESCRIBE. + if (formatted !== null) { + defaults[paramName] = formatted; + } + } + return defaults; +} + +/** + * Replace `:param` placeholders with describe-time literals so `DESCRIBE QUERY` + * can run without bound parameters. Resolution order per parameter: + * 1. An explicit `-- @param name TYPE = value` sample value (wins), which lets + * dynamic table names via `IDENTIFIER(...)` resolve to a real table. + * 2. Otherwise a placeholder default derived from the annotated/inferred type. + * Placeholders inside string literals or comments are left untouched. + */ +export function substituteParametersForDescribe(sql: string): string { + const protectedRanges = getProtectedRanges(sql); + const annotatedTypes = extractParameterTypes(sql); + const inferredTypes = inferParameterTypes(sql, protectedRanges); + const parameterTypes = { ...inferredTypes, ...annotatedTypes }; + const parameterDefaults = extractParameterDefaults(sql); + return sql.replace( + /(? { + if (isInsideProtectedRange(offset, protectedRanges)) { + return original; + } + const sampleValue = parameterDefaults[paramName]; + if (sampleValue !== undefined) { + return sampleValue; + } + return defaultForType(parameterTypes[paramName]); + }, + ); +} + +/** + * Append a remediation hint when a DESCRIBE failure looks like a dynamic + * identifier that couldn't be resolved: the query calls `IDENTIFIER(...)` and + * has at least one parameter without a describe-time sample value. These fail + * because typegen substitutes a placeholder default (e.g. `''`) that yields a + * malformed or non-existent table name. Steering the user to the `= value` + * annotation turns the fatal error into a one-line fix. + */ +function withIdentifierHint( + error: { code?: string; message: string }, + sql: string, +): { code?: string; message: string } { + if (!/\bIDENTIFIER\s*\(/i.test(sql)) { + return error; + } + const protectedRanges = getProtectedRanges(sql); + const params = extractParameters(sql, protectedRanges); + const defaults = extractParameterDefaults(sql); + const unresolved = params.filter( + (p) => !SERVER_INJECTED_PARAMS.includes(p) && defaults[p] === undefined, + ); + if (unresolved.length === 0) { + return error; + } + const example = unresolved[0]; + return { + ...error, + message: `${error.message}\n Hint: this query uses IDENTIFIER() with parameter(s) ${unresolved + .map((p) => `:${p}`) + .join( + ", ", + )}. Give type generation a sample value so it can resolve the table, e.g. \`-- @param ${example} STRING = my_catalog\`. The runtime query still binds the real parameter.`, + }; +} + /** * Infer parameter types from positional context in SQL. * V1 only infers NUMERIC from patterns like LIMIT, OFFSET, TOP, @@ -512,20 +665,17 @@ export async function generateQueriesFromDescribe( const annotatedTypes = extractParameterTypes(sql); const inferredTypes = inferParameterTypes(sql, protectedRanges); const parameterTypes = { ...inferredTypes, ...annotatedTypes }; - const sqlWithDefaults = sql.replace( - /(? { - if (isInsideProtectedRange(offset, protectedRanges)) { - return original; - } - return defaultForType(parameterTypes[paramName]); - }, - ); + // Explicit describe-time sample values (`-- @param name TYPE = value`) + // take precedence over the type-based default so queries with dynamic + // table names (IDENTIFIER) can resolve a real table during DESCRIBE. + const parameterDefaults = extractParameterDefaults(sql); + const sqlWithDefaults = substituteParametersForDescribe(sql); // Warn about unresolved parameters const allParams = extractParameters(sql, protectedRanges); for (const param of allParams) { if (SERVER_INJECTED_PARAMS.includes(param)) continue; + if (parameterDefaults[param]) continue; if (parameterTypes[param]) continue; logger.warn( '%s: parameter ":%s" has no type annotation or inference. Add %s to the query file.', @@ -711,7 +861,7 @@ export async function generateQueriesFromDescribe( status: "syntax", index, schema: { name: queryName, type }, - error: parseError(sqlError), + error: withIdentifierHint(parseError(sqlError), sql), }; } diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts index b149d5bb..3858ebf4 100644 --- a/packages/appkit/src/type-generator/tests/query-registry.test.ts +++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts @@ -2,12 +2,14 @@ import { describe, expect, test } from "vitest"; import { convertToQueryType, defaultForType, + extractParameterDefaults, extractParameters, extractParameterTypes, getProtectedRanges, inferParameterTypes, normalizeTypeName, SERVER_INJECTED_PARAMS, + substituteParametersForDescribe, } from "../query-registry"; import type { DatabricksStatementExecutionResponse } from "../types"; @@ -590,3 +592,152 @@ describe("substitution skips protected ranges", () => { expect(result).toContain("id = ''"); }); }); + +describe("extractParameterDefaults", () => { + test("returns empty object when no sample values are present", () => { + const sql = + "-- @param startDate DATE\nSELECT * FROM t WHERE d = :startDate"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("quotes string-like sample values", () => { + const sql = [ + "-- @param target_catalog STRING = main", + "-- @param since DATE = 2024-01-01", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + target_catalog: "'main'", + since: "'2024-01-01'", + }); + }); + + test("passes numeric and boolean sample values through verbatim", () => { + const sql = [ + "-- @param limit INT = 100", + "-- @param ratio DOUBLE = 0.5", + "-- @param active BOOLEAN = true", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + limit: "100", + ratio: "0.5", + active: "true", + }); + }); + + test("leaves already-quoted values untouched and escapes embedded quotes", () => { + const sql = [ + "-- @param a STRING = 'pre-quoted'", + "-- @param b STRING = O'Brien", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + a: "'pre-quoted'", + b: "'O''Brien'", + }); + }); + + test("ignores @param lines without a value", () => { + const sql = "-- @param onlyType STRING\nSELECT 1"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("rejects numeric sample values that aren't plain numbers (no injection)", () => { + const sql = [ + "-- @param n INT = 0) UNION SELECT secret FROM creds --", + "-- @param r DOUBLE = 1; DROP TABLE x", + "SELECT 1", + ].join("\n"); + // Both fail strict numeric validation and are dropped (not substituted), + // so they fall back to the safe type default during DESCRIBE. + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("rejects non-boolean BOOLEAN sample values", () => { + const sql = "-- @param flag BOOLEAN = 1 OR 1=1\nSELECT 1"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("accepts well-formed BINARY and rejects malformed BINARY", () => { + expect( + extractParameterDefaults("-- @param b BINARY = X'00'\nSELECT 1"), + ).toEqual({ b: "X'00'" }); + expect( + extractParameterDefaults("-- @param b BINARY = X'00' OR 1=1\nSELECT 1"), + ).toEqual({}); + }); + + test("neutralizes a string value that isn't a well-formed literal by escaping it", () => { + const sql = ["-- @param x STRING = 'a' OR 1=1 OR 'b'", "SELECT 1"].join( + "\n", + ); + // The lone interior quotes mean this isn't a single well-formed literal, so + // it's treated as raw content and fully escaped — one inert string literal, + // no SQL break-out. + expect(extractParameterDefaults(sql)).toEqual({ + x: "'''a'' OR 1=1 OR ''b'''", + }); + }); +}); + +describe("substituteParametersForDescribe (IDENTIFIER support, #383)", () => { + test("empty-string default produces malformed SQL for IDENTIFIER params", () => { + // Reproduces the bug: with no sample value, an untyped IDENTIFIER param + // collapses to '' and yields IDENTIFIER('' || '.schema.table'), a leading-dot + // identifier that Databricks rejects with PARSE_SYNTAX_ERROR. + const sql = "SELECT * FROM IDENTIFIER(:target_catalog || '.sales.nation')"; + expect(substituteParametersForDescribe(sql)).toBe( + "SELECT * FROM IDENTIFIER('' || '.sales.nation')", + ); + }); + + test("sample value resolves IDENTIFIER to a real, describable table", () => { + const sql = [ + "-- @param target_catalog STRING = main", + "SELECT * FROM IDENTIFIER(:target_catalog || '.sales.nation')", + ].join("\n"); + const out = substituteParametersForDescribe(sql); + expect(out).toContain("IDENTIFIER('main' || '.sales.nation')"); + expect(out).not.toContain(":target_catalog"); + }); + + test("sample value wins over the type-based default", () => { + const sql = [ + "-- @param status STRING = active", + "SELECT * FROM t WHERE status = :status", + ].join("\n"); + expect(substituteParametersForDescribe(sql)).toContain("status = 'active'"); + }); + + test("falls back to the type default when no sample value is given", () => { + const sql = ["-- @param limit INT", "SELECT * FROM t LIMIT :limit"].join( + "\n", + ); + expect(substituteParametersForDescribe(sql)).toBe( + "-- @param limit INT\nSELECT * FROM t LIMIT 0", + ); + }); + + test("does not substitute placeholders inside string literals", () => { + const sql = "SELECT ':target_catalog' AS lit, :target_catalog AS p"; + const sql2 = ["-- @param target_catalog STRING = main", sql].join("\n"); + const out = substituteParametersForDescribe(sql2); + expect(out).toContain("':target_catalog' AS lit"); + expect(out).toContain("'main' AS p"); + }); + + test("rejects an injection attempt in a numeric param, falling back to the placeholder", () => { + const sql = [ + "-- @param n INT = 1); DROP TABLE x --", + "SELECT * FROM t LIMIT :n", + ].join("\n"); + const out = substituteParametersForDescribe(sql); + // The malicious value is dropped; :n falls back to the INT default 0, so the + // payload never reaches the executable LIMIT clause. (It survives only in the + // @param comment line, which is inert.) + expect(out).toContain("SELECT * FROM t LIMIT 0"); + expect(out).not.toContain(":n"); + expect(out).not.toContain("LIMIT 1)"); + }); +});