Skip to content

Commit d250e14

Browse files
committed
feat: Add ai_generate_bool to the bigframes.bigquery package
1 parent 8804ada commit d250e14

File tree

10 files changed

+388
-0
lines changed

10 files changed

+388
-0
lines changed

bigframes/bigquery/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import sys
2020

21+
from bigframes.bigquery._operations.ai import ai_generate_bool
2122
from bigframes.bigquery._operations.approx_agg import approx_top_count
2223
from bigframes.bigquery._operations.array import (
2324
array_agg,
@@ -57,6 +58,8 @@
5758
from bigframes.core import log_adapter
5859

5960
_functions = [
61+
# ai ops
62+
ai_generate_bool,
6063
# approximate aggregate ops
6164
approx_top_count,
6265
# array ops
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import functools
18+
import json
19+
from typing import Any, List, Literal, Mapping, Sequence, Tuple
20+
21+
from bigframes import clients, dtypes, series
22+
from bigframes.operations import ai_ops
23+
24+
25+
def ai_generate_bool(
    prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series],
    *,
    connection_id: str | None = None,
    endpoint: str | None = None,
    request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified",
    model_params: Mapping[Any, Any] | None = None,
) -> series.Series:
    """Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None
        >>> df = bpd.DataFrame({
        ...     "col_1": ["apple", "bear", "pear"],
        ...     "col_2": ["fruit", "animal", "animal"]
        ... })
        >>> bbq.ai_generate_bool((df["col_1"], " is a ", df["col_2"]))
        0    {'result': True, 'full_response': '{"candidate...
        1    {'result': True, 'full_response': '{"candidate...
        2    {'result': False, 'full_response': '{"candidat...
        dtype: struct<result: bool, full_response: string, status: string>[pyarrow]

        >>> bbq.ai_generate_bool((df["col_1"], " is a ", df["col_2"])).struct.field("result")
        0     True
        1     True
        2    False
        Name: result, dtype: boolean

        >>> model_params = {
        ...     "generation_config": {
        ...         "thinking_config": {
        ...             "thinking_budget": 0
        ...         }
        ...     }
        ... }
        >>> bbq.ai_generate_bool(
        ...     (df["col_1"], " is a ", df["col_2"]),
        ...     endpoint="gemini-2.5-pro",
        ...     model_params=model_params,
        ... ).struct.field("result")
        0     True
        1     True
        2    False
        Name: result, dtype: boolean

    Args:
        prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series]):
            A mixture of Series and string literals that specifies the prompt to send to the model.
        connection_id (str, optional):
            Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`.
            If not provided, the connection from the current session will be used.
        endpoint (str, optional):
            Specifies the Vertex AI endpoint to use for the model. You can specify any generally available
            or preview Gemini model. If you specify the model name, BigQuery ML automatically identifies and
            uses the full endpoint of the model. If you don't specify an ENDPOINT value, BigQuery ML selects
            a recent stable version of Gemini to use.
        request_type (Literal["dedicated", "shared", "unspecified"]):
            Specifies the type of inference request to send to the Gemini model. The request type determines what
            quota the request uses.

            * "dedicated": function only uses Provisioned Throughput quota. The AI.GENERATE function returns the error Provisioned throughput is not purchased or is not active if Provisioned Throughput quota isn't available.
            * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota.
            * "unspecified":

              * If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota.
              * If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first. If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota.
        model_params (Mapping[Any, Any]):
            Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format.

    Returns:
        bigframes.series.Series: A new struct Series with the result data. The struct contains these fields:

            * "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI.
            * "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model. The generated text is in the text element.
            * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful.
    """

    # Literal typing is not enforced at runtime, so validate eagerly with a
    # clear error instead of failing later in SQL generation.
    if request_type not in ("dedicated", "shared", "unspecified"):
        raise ValueError(f"Unsupported request type: {request_type}")

    prompt_context, series_list = _separate_context_and_series(prompt)

    # At least one Series is required: it anchors the session/connection and
    # drives the row-wise application of the operator.
    if not series_list:
        raise ValueError("Please provide at least one Series in the prompt")

    operator = ai_ops.AIGenerateBool(
        prompt_context=tuple(prompt_context),
        connection_id=_resolve_connection_id(series_list[0], connection_id),
        endpoint=endpoint,
        request_type=request_type,
        # The op stores model params as a JSON string so the dataclass stays
        # hashable; empty/None params are normalized to None.
        model_params=json.dumps(model_params) if model_params else None,
    )

    return series_list[0]._apply_nary_op(operator, series_list[1:])
119+
120+
121+
@functools.singledispatch
122+
def _separate_context_and_series(
123+
prompt: Any,
124+
) -> Tuple[List[str | None], List[series.Series]]:
125+
"""
126+
Returns the two values. The first value is the prompt with all series replaced by None. The second value is all the series
127+
in the prompt. The original item order is kept.
128+
For example:
129+
Input: ("str1", series1, "str2", "str3", series2)
130+
Output: ["str1", None, "str2", "str3", None], [series1, series2]
131+
"""
132+
raise ValueError(f"Unsupported prompt type: {type(prompt)}")
133+
134+
135+
@_separate_context_and_series.register
def _(
    prompt: series.Series,
) -> Tuple[List[str | None], List[series.Series]]:
    # A bare Series carries no literal context: one placeholder, one Series.
    # Object-ref (blob) series are swapped for their read URLs so the model
    # receives multimodal input it can fetch.
    if prompt.dtype == dtypes.OBJ_REF_DTYPE:
        prompt = prompt.blob.read_url()
    return [None], [prompt]
143+
144+
145+
@_separate_context_and_series.register(list)
@_separate_context_and_series.register(tuple)
def _(
    prompt: Sequence[str | series.Series],
) -> Tuple[List[str | None], List[series.Series]]:
    # Walk the prompt once, routing literals to the context list and Series
    # to the series list, with None marking each Series' position.
    prompt_context: List[str | None] = []
    series_list: List[series.Series] = []

    for item in prompt:
        if isinstance(item, str):
            prompt_context.append(item)
            continue

        if not isinstance(item, series.Series):
            raise ValueError(f"Unsupported type in prompt: {type(item)}")

        prompt_context.append(None)
        if item.dtype == dtypes.OBJ_REF_DTYPE:
            # Blob (object-ref) columns become read URLs for multimodal input.
            item = item.blob.read_url()
        series_list.append(item)

    return prompt_context, series_list
170+
171+
172+
def _resolve_connection_id(s: series.Series, connection_id: str | None):
    """Return the canonical BQ connection id for this Series' session.

    Falls back to the session-level default connection when no explicit
    connection_id is supplied.
    """
    session = s._session
    return clients.get_canonical_bq_connection_id(
        connection_id or session._bq_connection,
        session._project,
        session._location,
    )

bigframes/core/compile/ibis_compiler/scalar_op_registry.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717
import functools
1818
import typing
1919

20+
from bigframes_vendored import ibis
2021
import bigframes_vendored.ibis.expr.api as ibis_api
2122
import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
23+
import bigframes_vendored.ibis.expr.operations.ai_ops as ai_ops
2224
import bigframes_vendored.ibis.expr.operations.generic as ibis_generic
2325
import bigframes_vendored.ibis.expr.operations.udf as ibis_udf
2426
import bigframes_vendored.ibis.expr.types as ibis_types
@@ -1963,6 +1965,32 @@ def struct_op_impl(
19631965
return ibis_types.struct(data)
19641966

19651967

1968+
@scalar_op_compiler.register_nary_op(ops.AIGenerateBool, pass_op=True)
def ai_generate_bool(
    *values: ibis_types.Value, op: ops.AIGenerateBool
) -> ibis_dtypes.StructValue:
    """Lower the AIGenerateBool scalar op to the vendored ibis AI node.

    op.prompt_context holds the literal prompt parts in order; each None
    entry is a placeholder that consumes the next compiled column value.
    """
    remaining_values = iter(values)
    prompt = {
        f"_field_{idx + 1}": (next(remaining_values) if part is None else part)
        for idx, part in enumerate(op.prompt_context)
    }

    return ai_ops.AIGenerateBool(
        ibis.struct(prompt),
        op.connection_id,
        op.endpoint,
        op.request_type.upper(),
        op.model_params,
    ).to_expr()
1992+
1993+
19661994
@scalar_op_compiler.register_nary_op(ops.RowKey, pass_op=True)
19671995
def rowkey_op_impl(*values: ibis_types.Value, op: ops.RowKey) -> ibis_types.Value:
19681996
return bigframes.core.compile.default_ordering.gen_row_key(values)

bigframes/operations/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from __future__ import annotations
1616

17+
from bigframes.operations.ai_ops import AIGenerateBool
1718
from bigframes.operations.array_ops import (
1819
ArrayIndexOp,
1920
ArrayReduceOp,
@@ -408,6 +409,8 @@
408409
"geo_x_op",
409410
"geo_y_op",
410411
"GeoStDistanceOp",
412+
# AI ops
413+
"AIGenerateBool",
411414
# Numpy ops mapping
412415
"NUMPY_TO_BINOP",
413416
"NUMPY_TO_OP",

bigframes/operations/ai_ops.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import dataclasses
18+
from typing import ClassVar, Literal, Tuple
19+
20+
import pandas as pd
21+
import pyarrow as pa
22+
23+
from bigframes import dtypes
24+
from bigframes.operations import base_ops
25+
26+
27+
@dataclasses.dataclass(frozen=True)
class AIGenerateBool(base_ops.NaryOp):
    """Scalar n-ary op representing the BigQuery AI.GENERATE_BOOL function."""

    name: ClassVar[str] = "ai_generate_bool"

    # Literal prompt parts in order; None entries are the placeholders for
    # column references, filled in at compile time.
    # NOTE: the annotation is variadic (`, ...`) — `Tuple[str | None]` would
    # type the field as a 1-tuple, but any number of parts is allowed.
    prompt_context: Tuple[str | None, ...]
    connection_id: str
    endpoint: str | None
    request_type: Literal["dedicated", "shared", "unspecified"]
    # JSON-serialized generateContent model parameters, or None when unset.
    model_params: str | None

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        # The SQL function always yields
        # STRUCT<result BOOL, full_response STRING, status STRING>,
        # independent of the input column types.
        return pd.ArrowDtype(
            pa.struct(
                (
                    pa.field("result", pa.bool_()),
                    pa.field("full_response", pa.string()),
                    pa.field("status", pa.string()),
                )
            )
        )
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pandas.testing
17+
18+
import bigframes.bigquery as bbq
19+
20+
21+
def test_ai_generate_bool_multi_model(session):
    """AI.GENERATE_BOOL accepts blob (image) columns as multimodal prompt input."""
    df = session.from_glob_path(
        "gs://bigframes-dev-testing/a_multimodel/images/*", name="image"
    )

    prompt = (df["image"], " contains an animal")
    result = bbq.ai_generate_bool(prompt).struct.field("result")

    expected = pd.Series([True, True, False, False, False], name="result")
    pandas.testing.assert_series_equal(
        result.to_pandas(),
        expected,
        check_dtype=False,
        check_index=False,
    )
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pandas.testing
17+
18+
import bigframes.bigquery as bbq
19+
import bigframes.pandas as bpd
20+
21+
22+
def test_ai_generate_bool(session):
    """End-to-end AI.GENERATE_BOOL with an explicit endpoint and model params."""
    subjects = bpd.Series(["apple", "bear"], session=session)
    categories = bpd.Series(["fruit", "tree"], session=session)
    model_params = {"generation_config": {"thinking_config": {"thinking_budget": 0}}}

    result = bbq.ai_generate_bool(
        (subjects, " is a ", categories),
        endpoint="gemini-2.5-flash",
        model_params=model_params,
    ).struct.field("result")

    expected = pd.Series([True, False], name="result")
    pandas.testing.assert_series_equal(
        result.to_pandas(),
        expected,
        check_dtype=False,
        check_index=False,
    )

0 commit comments

Comments
 (0)