From 46c9d4827c41f6afddc4bdbe84a58045dc4096f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 31 Mar 2026 19:17:04 +0000 Subject: [PATCH 1/2] fix: avoid views when querying BigLake tables from SQL cells --- .../compile/ibis_compiler/ibis_compiler.py | 21 ++++---- bigframes/core/pyformat.py | 26 ++++++++- bigframes/session/_io/bigquery/__init__.py | 10 ++-- tests/unit/core/test_pyformat.py | 54 +++++++++++++++++++ 4 files changed, 94 insertions(+), 17 deletions(-) diff --git a/bigframes/core/compile/ibis_compiler/ibis_compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py index 3802a57e02d..b1659cc19f7 100644 --- a/bigframes/core/compile/ibis_compiler/ibis_compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -221,18 +221,15 @@ def _table_to_ibis( physical_schema = ibis_bigquery.BigQuerySchema.to_ibis( list(source.table.physical_schema) ) - if source.at_time is not None or source.sql_predicate is not None: - import bigframes.session._io.bigquery - - sql = bigframes.session._io.bigquery.to_query( - full_table_name, - columns=scan_cols, - sql_predicate=source.sql_predicate, - time_travel_timestamp=source.at_time, - ) - return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) - else: - return ibis_api.table(physical_schema, full_table_name).select(scan_cols) + import bigframes.session._io.bigquery + + sql = bigframes.session._io.bigquery.to_query( + full_table_name, + columns=scan_cols, + sql_predicate=source.sql_predicate, + time_travel_timestamp=source.at_time, + ) + return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) @_compile_node.register diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 1dbb74fbb72..6cd57dd452e 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -21,7 +21,7 @@ import string import typing -from typing import Any, Optional, Union +from typing import Any, Optional, Tuple, Union import google.cloud.bigquery import pandas @@ -39,7 +39,11 @@ def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: - return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" + # BiglakeIcebergTable IDs have 4 parts. BigFrames packs catalog.namespace + # into the dataset_id. + dataset_parts = table.dataset_id.split(".") + dataset_sql = ".".join(f"`{part}`" for part in dataset_parts) + return f"`{table.project}`.{dataset_sql}.`{table.table_id}`" def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str: @@ -102,6 +106,24 @@ def _field_to_template_value( return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name) if isinstance(value, bigframes.dataframe.DataFrame): + import bigframes.core.bq_data as bq_data + import bigframes.core.nodes as nodes + + # TODO(b/493608478): Remove this workaround for BigLake/Iceberg tables, + # which cannot currently be used in views, once a fix rolls out. + def is_biglake( + node: nodes.BigFrameNode, child_results: Tuple[bool, ...] + ) -> bool: + if isinstance(node, nodes.ReadTableNode): + return isinstance(node.source.table, bq_data.BiglakeIcebergTable) + return any(child_results) + + contains_biglake = value._block.expr.node.reduce_up(is_biglake) + + if contains_biglake: + sql_query, _, _ = value._to_sql_query(include_index=False) + return f"({sql_query})" + return _table_to_sql(value._to_placeholder_table(dry_run=dry_run)) if isinstance(value, str): diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 61b22d03115..09cce2b0c47 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -519,9 +519,13 @@ def to_query( time_travel_timestamp: Optional[datetime.datetime] = None, ) -> str: """Compile query_or_table with conditions(filters, wildcards) to query.""" - sub_query = ( - f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`" - ) + if is_query(query_or_table): + sub_query = f"({query_or_table})" + else: + # Table ID can have 1, 2, 3, or 4 parts. Quoting all parts to be safe. + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers + parts = query_or_table.split(".") + sub_query = ".".join(f"`{part}`" for part in parts) # TODO(b/338111344): Generate an index based on DefaultIndexKind if we # don't have index columns specified. diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index db7cedba8f1..ac65e5c16d6 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -500,6 +500,15 @@ def test_pyformat_with_query_string_replaces_variables(session): ), "SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`", ), + ( + google.cloud.bigquery.TableReference( + google.cloud.bigquery.DatasetReference( + "my-project", "my-catalog.my-namespace" + ), + "my-table", + ), + "SELECT * FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`", + ), ), ) def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session): @@ -511,3 +520,48 @@ def test_pyformat_with_table_replaces_variables(table, expected_sql, session=ses sql = "SELECT * FROM {table}" got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) assert got_sql == expected_sql + + +def test_pyformat_with_bigframes_dataframe_biglake_table(session): + # Create a real BigFrames DataFrame that points to a BigLake table. + import bigframes.core.array_value as array_value + import bigframes.core.blocks as blocks + import bigframes.core.bq_data as bq_data + import bigframes.dataframe + + # Define the BigLake table + project_id = "my-project" + catalog_id = "my-catalog" + namespace_id = "my-namespace" + table_id = "my-table" + schema = (google.cloud.bigquery.SchemaField("col", "INTEGER"),) + + biglake_table = bq_data.BiglakeIcebergTable( + project_id=project_id, + catalog_id=catalog_id, + namespace_id=namespace_id, + table_id=table_id, + physical_schema=schema, + cluster_cols=(), + metadata=bq_data.TableMetadata( + location=bq_data.BigQueryRegion("us-central1"), + type="TABLE", + ), + ) + + # ArrayValue.from_table is what read_gbq uses. + av = array_value.ArrayValue.from_table(biglake_table, session) + block = blocks.Block(av, index_columns=[], column_labels=["col"]) + df = bigframes.dataframe.DataFrame(block) + + pyformat_args = {"df": df} + sql = "SELECT * FROM {df}" + + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) + + # For BigLake, we now expect a SUBQUERY, not a view reference. + # The subquery should have correctly quoted 4-part ID. + assert "SELECT" in got_sql + assert "FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`" in got_sql + assert got_sql.startswith("SELECT * FROM (SELECT") + assert got_sql.endswith(")") From 693315d46545e21f4e1978653333ed186079901b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 31 Mar 2026 19:58:12 +0000 Subject: [PATCH 2/2] revert: ibis change --- .../compile/ibis_compiler/ibis_compiler.py | 21 +++++++++++-------- tests/unit/core/test_pyformat.py | 5 ++++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/bigframes/core/compile/ibis_compiler/ibis_compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py index b1659cc19f7..3802a57e02d 100644 --- a/bigframes/core/compile/ibis_compiler/ibis_compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -221,15 +221,18 @@ def _table_to_ibis( physical_schema = ibis_bigquery.BigQuerySchema.to_ibis( list(source.table.physical_schema) ) - import bigframes.session._io.bigquery - - sql = bigframes.session._io.bigquery.to_query( - full_table_name, - columns=scan_cols, - sql_predicate=source.sql_predicate, - time_travel_timestamp=source.at_time, - ) - return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) + if source.at_time is not None or source.sql_predicate is not None: + import bigframes.session._io.bigquery + + sql = bigframes.session._io.bigquery.to_query( + full_table_name, + columns=scan_cols, + sql_predicate=source.sql_predicate, + time_travel_timestamp=source.at_time, + ) + return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) + else: + return ibis_api.table(physical_schema, full_table_name).select(scan_cols) @_compile_node.register diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index ac65e5c16d6..be7f52f4d5d 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -562,6 +562,9 @@ def test_pyformat_with_bigframes_dataframe_biglake_table(session): # For BigLake, we now expect a SUBQUERY, not a view reference. # The subquery should have correctly quoted 4-part ID. assert "SELECT" in got_sql - assert "FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`" in got_sql + assert project_id in got_sql + assert catalog_id in got_sql + assert namespace_id in got_sql + assert table_id in got_sql assert got_sql.startswith("SELECT * FROM (SELECT") assert got_sql.endswith(")")