From 156e0e2ccddf343c42200a875bac477b893803fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 6 Apr 2026 15:34:26 +0000 Subject: [PATCH 1/2] fix: avoid views when querying BigLake tables from SQL cells Workaround for internal bug 493608478. --- packages/bigframes/bigframes/core/pyformat.py | 26 ++++++++- .../session/_io/bigquery/__init__.py | 10 +++- .../tests/unit/core/test_pyformat.py | 57 +++++++++++++++++++ 3 files changed, 88 insertions(+), 5 deletions(-) diff --git a/packages/bigframes/bigframes/core/pyformat.py b/packages/bigframes/bigframes/core/pyformat.py index ed8db2242ff2..024c351adc11 100644 --- a/packages/bigframes/bigframes/core/pyformat.py +++ b/packages/bigframes/bigframes/core/pyformat.py @@ -21,7 +21,7 @@ import string import typing -from typing import Any, Optional, Union +from typing import Any, Optional, Tuple, Union import google.cloud.bigquery import pandas @@ -39,7 +39,11 @@ def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: - return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" + # BiglakeIcebergTable IDs have 4 parts. BigFrames packs catalog.namespace + # into the dataset_id. + dataset_parts = table.dataset_id.split(".") + dataset_sql = ".".join(f"`{part}`" for part in dataset_parts) + return f"`{table.project}`.{dataset_sql}.`{table.table_id}`" def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str: @@ -102,6 +106,24 @@ def _field_to_template_value( return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name) if isinstance(value, bigframes.dataframe.DataFrame): + import bigframes.core.bq_data as bq_data + import bigframes.core.nodes as nodes + + # TODO(b/493608478): Remove this workaround for BigLake/Iceberg tables, + # which cannot currently be used in views, once a fix rolls out. + def is_biglake( + node: nodes.BigFrameNode, child_results: Tuple[bool, ...] + ) -> bool: + if isinstance(node, nodes.ReadTableNode): + return isinstance(node.source.table, bq_data.BiglakeIcebergTable) + return any(child_results) + + contains_biglake = value._block.expr.node.reduce_up(is_biglake) + + if contains_biglake: + sql_query, _, _ = value._to_sql_query(include_index=False) + return f"({sql_query})" + return _table_to_sql(value._to_placeholder_table(dry_run=dry_run)) if isinstance(value, str): diff --git a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py index c1e24e7e97d5..88c70b6a186b 100644 --- a/packages/bigframes/bigframes/session/_io/bigquery/__init__.py +++ b/packages/bigframes/bigframes/session/_io/bigquery/__init__.py @@ -515,9 +515,13 @@ def to_query( time_travel_timestamp: Optional[datetime.datetime] = None, ) -> str: """Compile query_or_table with conditions(filters, wildcards) to query.""" - sub_query = ( - f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`" - ) + if is_query(query_or_table): + sub_query = f"({query_or_table})" + else: + # Table ID can have 1, 2, 3, or 4 parts. Quoting all parts to be safe. + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers + parts = query_or_table.split(".") + sub_query = ".".join(f"`{part}`" for part in parts) # TODO(b/338111344): Generate an index based on DefaultIndexKind if we # don't have index columns specified. diff --git a/packages/bigframes/tests/unit/core/test_pyformat.py b/packages/bigframes/tests/unit/core/test_pyformat.py index db7cedba8f18..be7f52f4d5d4 100644 --- a/packages/bigframes/tests/unit/core/test_pyformat.py +++ b/packages/bigframes/tests/unit/core/test_pyformat.py @@ -500,6 +500,15 @@ def test_pyformat_with_query_string_replaces_variables(session): ), "SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`", ), + ( + google.cloud.bigquery.TableReference( + google.cloud.bigquery.DatasetReference( + "my-project", "my-catalog.my-namespace" + ), + "my-table", + ), + "SELECT * FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`", + ), ), ) def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session): @@ -511,3 +520,51 @@ def test_pyformat_with_table_replaces_variables(table, expected_sql, session=ses sql = "SELECT * FROM {table}" got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) assert got_sql == expected_sql + + +def test_pyformat_with_bigframes_dataframe_biglake_table(session): + # Create a real BigFrames DataFrame that points to a BigLake table. + import bigframes.core.array_value as array_value + import bigframes.core.blocks as blocks + import bigframes.core.bq_data as bq_data + import bigframes.dataframe + + # Define the BigLake table + project_id = "my-project" + catalog_id = "my-catalog" + namespace_id = "my-namespace" + table_id = "my-table" + schema = (google.cloud.bigquery.SchemaField("col", "INTEGER"),) + + biglake_table = bq_data.BiglakeIcebergTable( + project_id=project_id, + catalog_id=catalog_id, + namespace_id=namespace_id, + table_id=table_id, + physical_schema=schema, + cluster_cols=(), + metadata=bq_data.TableMetadata( + location=bq_data.BigQueryRegion("us-central1"), + type="TABLE", + ), + ) + + # ArrayValue.from_table is what read_gbq uses. + av = array_value.ArrayValue.from_table(biglake_table, session) + block = blocks.Block(av, index_columns=[], column_labels=["col"]) + df = bigframes.dataframe.DataFrame(block) + + pyformat_args = {"df": df} + sql = "SELECT * FROM {df}" + + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) + + # For BigLake, we now expect a SUBQUERY, not a view reference. + # The subquery should have correctly quoted 4-part ID. + assert "SELECT" in got_sql + assert project_id in got_sql + assert catalog_id in got_sql + assert namespace_id in got_sql + assert table_id in got_sql + assert got_sql.startswith("SELECT * FROM (SELECT") + assert got_sql.endswith(")") From 80ba5501dd73329a75eeafc85efd2769da1142d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 6 Apr 2026 11:50:47 -0500 Subject: [PATCH 2/2] Update packages/bigframes/bigframes/core/pyformat.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- packages/bigframes/bigframes/core/pyformat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/bigframes/bigframes/core/pyformat.py b/packages/bigframes/bigframes/core/pyformat.py index 024c351adc11..8f3c94054094 100644 --- a/packages/bigframes/bigframes/core/pyformat.py +++ b/packages/bigframes/bigframes/core/pyformat.py @@ -121,7 +121,7 @@ def is_biglake( contains_biglake = value._block.expr.node.reduce_up(is_biglake) if contains_biglake: - sql_query, _, _ = value._to_sql_query(include_index=False) + sql_query, _, _ = value._to_sql_query(include_index=True) return f"({sql_query})" return _table_to_sql(value._to_placeholder_table(dry_run=dry_run))