@@ -1026,7 +1026,7 @@ def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp):

@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op)
def timedelta_floor_op_impl(x: ibis_types.NumericValue):
    return x.floor()
    return ibis_api.case().when(x > 0, x.floor()).else_(x.ceil()).end()


@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True)
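
For context, a minimal sketch of the new rounding behavior, using plain Python floats as a stand-in for the Ibis expression (the helper name is hypothetical): floor() alone rounds toward negative infinity, so negative timedeltas gained an extra unit, while the case expression rounds toward zero.

import math

def timedelta_floor(micros: float) -> int:
    # Hypothetical stand-in for the compiled expression: floor positive
    # values, ceil negative ones, i.e. round toward zero.
    return math.floor(micros) if micros > 0 else math.ceil(micros)

assert math.floor(-2.5) == -3        # old behavior: toward negative infinity
assert timedelta_floor(-2.5) == -2   # new behavior: toward zero
assert timedelta_floor(2.5) == 2     # positive values are unchanged
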
8 changes: 4 additions & 4 deletions bigframes/core/rewrite/timedeltas.py
@@ -206,12 +206,12 @@ def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr:


def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr:
    result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right)

    if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype):
        return _TypedExpr.create_op_expr(ops.timedelta_floor_op, result)
        return _TypedExpr.create_op_expr(
            ops.timedelta_floor_op, _TypedExpr.create_op_expr(ops.div_op, left, right)
        )

    return result
    return _TypedExpr.create_op_expr(ops.floordiv_op, left, right)


def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr):
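
The rewrite now lowers timedelta // numeric as true division followed by the timedelta floor, rather than floordiv followed by a then-redundant floor. A sketch with integer microseconds, assuming the toward-zero semantics of the new timedelta_floor_op, shows where the two shapes diverge for negative values:

import math

def timedelta_floor(x: float) -> int:
    # Same toward-zero rounding as the new timedelta_floor_op (assumption).
    return math.floor(x) if x > 0 else math.ceil(x)

micros, divisor = -5_000_000, 3  # -5 seconds divided by 3

old = micros // divisor                  # -1666667: Python // rounds toward -inf
new = timedelta_floor(micros / divisor)  # -1666666: rounds toward zero
assert old != new
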
8 changes: 4 additions & 4 deletions bigframes/ml/metrics/_metrics.py
@@ -214,7 +214,7 @@ def confusion_matrix(
        y_true = row["y_true"]
        y_pred = row["y_pred"]
        count = row["dummy"]
        confusion_matrix[y_pred][y_true] = count
        confusion_matrix.at[y_true, y_pred] = count

    return confusion_matrix

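Both the old chained indexing and the new .at write target the same cell (confusion_matrix[y_pred] selects the column, then [y_true] the row), but chained assignment goes through an intermediate object and is unreliable under pandas copy-on-write. A small sketch with hypothetical labels:

import pandas as pd

labels = [0, 1, 2]
cm = pd.DataFrame(0, index=labels, columns=labels)  # rows: y_true, cols: y_pred

# cm[1][2] = 5 would assign through an intermediate Series and may warn or
# silently write to a copy under copy-on-write; .at addresses the cell directly.
cm.at[2, 1] = 5
assert cm.loc[2, 1] == 5
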
@@ -251,7 +251,7 @@ def recall_score(
        / is_accurate.groupby(y_true_series).count()
    ).to_pandas()

    recall_score = pd.Series(0, index=index)
    recall_score = pd.Series(0.0, index=index)
    for i in recall_score.index:
        recall_score.loc[i] = recall.loc[i]

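Seeding the accumulator with the integer 0 makes pandas infer an int64 series, and the later float assignments then force an upcast that recent pandas versions warn about (and pandas 3.0 rejects for in-place setting). Starting from 0.0 keeps the scores float64 throughout; the same fix is applied to the precision_score and f1_score initializers below.

import pandas as pd

scores = pd.Series(0.0, index=["a", "b"])  # float64 from the start
scores.loc["a"] = 0.75                     # no dtype upcast, no warning
assert scores.dtype == "float64"
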
@@ -321,7 +321,7 @@ def _precision_score_per_label(y_true: bpd.Series, y_pred: bpd.Series) -> pd.Ser
        is_accurate.groupby(y_pred).sum() / is_accurate.groupby(y_pred).count()
    ).to_pandas()

    precision_score = pd.Series(0, index=index)
    precision_score = pd.Series(0.0, index=index)
    for i in precision.index:
        precision_score.loc[i] = precision.loc[i]

@@ -366,7 +366,7 @@ def f1_score(
    recall = recall_score(y_true_series, y_pred_series, average=None)
    precision = precision_score(y_true_series, y_pred_series, average=None)

    f1_score = pd.Series(0, index=recall.index)
    f1_score = pd.Series(0.0, index=recall.index)
    for index in recall.index:
        if precision[index] + recall[index] != 0:
            f1_score[index] = (
7 changes: 7 additions & 0 deletions bigframes/testing/__init__.py
@@ -17,3 +17,10 @@
These modules are provided for testing the BigQuery DataFrames package. The
interface is not considered stable.
"""
from bigframes.testing.utils import (
    assert_frame_equal,
    assert_index_equal,
    assert_series_equal,
)

__all__ = ["assert_frame_equal", "assert_series_equal", "assert_index_equal"]
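
With these re-exports, test code can import the assertion helpers from the package root rather than reaching into the (explicitly unstable) utils module:

from bigframes.testing import assert_frame_equal, assert_series_equal
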
41 changes: 35 additions & 6 deletions bigframes/testing/utils.py
@@ -15,7 +15,7 @@
import base64
import decimal
import re
from typing import Iterable, Optional, Sequence, Set, Union
from typing import Iterable, Optional, Sequence, Set, TypeVar, Union

import geopandas as gpd # type: ignore
import google.api_core.operation
@@ -29,7 +29,6 @@

from bigframes import operations as ops
from bigframes.core import expression as ex
import bigframes.dtypes
import bigframes.functions._utils as bff_utils
import bigframes.pandas as bpd

@@ -69,6 +68,8 @@
"content",
]

SeriesOrIndexT = TypeVar("SeriesOrIndexT", pd.Series, pd.Index)


def pandas_major_version() -> int:
    match = re.search(r"^v?(\d+)", pd.__version__.strip())
@@ -90,19 +91,31 @@ def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwar


def _normalize_all_nulls(col: pd.Series) -> pd.Series:
    if col.dtype in (bigframes.dtypes.FLOAT_DTYPE, bigframes.dtypes.INT_DTYPE):
        col = col.astype("float64")
    if pd_types.is_object_dtype(col):
        col = col.fillna(float("nan"))
    # This probably over-normalizes; make it more conservative later.
    if col.hasnans and (pd_types.is_float_dtype(col.dtype)):
        col = col.astype("float64").astype("Float64")
    return col


def _normalize_index_nulls(idx: pd.Index) -> pd.Index:
    if isinstance(idx, pd.MultiIndex):
        new_levels = [
            _normalize_index_nulls(idx.get_level_values(i)) for i in range(idx.nlevels)
        ]
        return pd.MultiIndex.from_arrays(new_levels, names=idx.names)
    if idx.hasnans:
        if pd_types.is_float_dtype(idx.dtype):
            idx = idx.astype("float64").astype("Float64")
    return idx


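Round-tripping a NaN-bearing float column or index through float64 into the nullable Float64 dtype maps both NaN and pd.NA to pd.NA, so the two null markers no longer cause spurious mismatches. A minimal sketch of the effect:

import pandas as pd

col = pd.Series([1.5, float("nan")])                  # NumPy float64 with NaN
normalized = col.astype("float64").astype("Float64")  # nullable, nulls become pd.NA

assert normalized.isna().tolist() == [False, True]
assert normalized.dtype == "Float64"
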
def assert_frame_equal(
    left: pd.DataFrame,
    right: pd.DataFrame,
    *,
    ignore_order: bool = False,
    nulls_are_nan: bool = True,
    downcast_object: bool = True,
    **kwargs,
):
    if ignore_order:
@@ -118,9 +131,17 @@ def assert_frame_equal(
        left = left.sort_index()
        right = right.sort_index()

    # pandas sometimes produces object-dtype columns, but nan/None/NA
    # inconsistency makes comparing them futile, so convert to typed columns.
    if downcast_object:
        left = left.apply(lambda x: x.infer_objects())
        right = right.apply(lambda x: x.infer_objects())

    if nulls_are_nan:
        left = left.apply(_normalize_all_nulls)
        right = right.apply(_normalize_all_nulls)
        left.index = _normalize_index_nulls(left.index)
        right.index = _normalize_index_nulls(right.index)

    pd.testing.assert_frame_equal(left, right, **kwargs)

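A sketch of the wrapper in use, under the assumption that ignore_order only has to reconcile row order via sort_index:

import pandas as pd

from bigframes.testing import assert_frame_equal

left = pd.DataFrame({"a": [4, 3]}, index=[1, 0])
right = pd.DataFrame({"a": [3, 4]}, index=[0, 1])

# Same rows in a different order: plain pd.testing.assert_frame_equal would
# raise, but ignore_order sorts both frames by index before comparing.
assert_frame_equal(left, right, ignore_order=True)
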
@@ -153,10 +174,18 @@ def assert_series_equal(
    if nulls_are_nan:
        left = _normalize_all_nulls(left)
        right = _normalize_all_nulls(right)
        left.index = _normalize_index_nulls(left.index)
        right.index = _normalize_index_nulls(right.index)
        left.name = pd.NA if pd.isna(left.name) else left.name  # type: ignore
        right.name = pd.NA if pd.isna(right.name) else right.name  # type: ignore

    pd.testing.assert_series_equal(left, right, **kwargs)


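The name normalization exists because a missing series name can surface as either None or pd.NA depending on where the series came from; coercing both to pd.NA keeps the strict name check from failing on two equally missing names. A hypothetical illustration:

import pandas as pd

from bigframes.testing import assert_series_equal

left = pd.Series([1], name=None)
right = pd.Series([1], name=pd.NA)

# Both names are "missing", but None is not pd.NA; after the coercion both
# sides carry pd.NA and the name comparison passes.
assert_series_equal(left, right)
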
def assert_index_equal(left, right, **kwargs):
    pd.testing.assert_index_equal(left, right, **kwargs)


def _standardize_index(idx):
    return pd.Index(list(idx), name=idx.name)

88 changes: 12 additions & 76 deletions noxfile.py
@@ -20,7 +20,6 @@
import multiprocessing
import os
import pathlib
import re
import shutil
import time
from typing import Dict, List
@@ -588,99 +587,36 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=(
    constraints_path = str(
        CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt"
    )

    # Ignore officially released versions of certain packages specified in
    # testing/constraints-*.txt and install more recent, pre-release versions
    # directly.
    already_installed = set()
    session.install(
        *set(UNIT_TEST_STANDARD_DEPENDENCIES + SYSTEM_TEST_STANDARD_DEPENDENCIES),
        "-c",
        constraints_path,
        "-e",
        ".",
    )

    # PyArrow prerelease packages are published to an alternative PyPI host.
    # https://arrow.apache.org/docs/python/install.html#installing-nightly-packages
    session.install(
        "--no-deps",
        "--upgrade",
        "--extra-index-url",
        "https://pypi.fury.io/arrow-nightlies/",
        "--prefer-binary",
        "--pre",
        "--upgrade",
        "pyarrow",
    )
    already_installed.add("pyarrow")

    session.install(
        "--prefer-binary",
        "--pre",
        "--upgrade",
        # We exclude each version individually so that we can continue to test
        # some prerelease packages. See:
        # https://github.com/googleapis/python-bigquery-dataframes/pull/268#discussion_r1423205172
        # "pandas!=2.1.4, !=2.2.0rc0, !=2.2.0, !=2.2.1",
        "pandas",
    )
    already_installed.add("pandas")

    # Try to avoid a cap on our SQLGlot so that bigframes
    # can be integrated with SQLMesh. See:
    # https://github.com/googleapis/python-bigquery-dataframes/issues/942
    # If SQLGlot introduces something that breaks us, let's file an issue
    # upstream and/or make sure we fix bigframes to work with it.
    session.install(
        "--upgrade",
        "git+https://github.com/tobymao/sqlglot.git#egg=sqlglot",
    )
    already_installed.add("sqlglot")

    # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178
    session.install("--no-deps", "db-dtypes")
    already_installed.add("db-dtypes")

    # Ensure we catch breaking changes in the client libraries early.
    session.install(
        "--upgrade",
        # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178
        "db-dtypes",
        # Ensure we catch breaking changes in the client libraries early.
        "git+https://github.com/googleapis/python-bigquery.git#egg=google-cloud-bigquery",
    )
    already_installed.add("google-cloud-bigquery")
    session.install(
        "--upgrade",
        "-e",
        "git+https://github.com/googleapis/google-cloud-python.git#egg=google-cloud-bigquery-storage&subdirectory=packages/google-cloud-bigquery-storage",
    )
    already_installed.add("google-cloud-bigquery-storage")
    session.install(
        "--upgrade",
        "git+https://github.com/googleapis/python-bigquery-pandas.git#egg=pandas-gbq",
    )
    already_installed.add("pandas-gbq")

    session.install(
        *set(UNIT_TEST_STANDARD_DEPENDENCIES + SYSTEM_TEST_STANDARD_DEPENDENCIES),
        "-c",
        constraints_path,
    )

    # Because we test minimum dependency versions on the minimum Python
    # version, the first version we test with in the unit tests sessions has a
    # constraints file containing all dependencies and extras.
    with open(
        CURRENT_DIRECTORY / "testing" / f"constraints-{DEFAULT_PYTHON_VERSION}.txt",
        encoding="utf-8",
    ) as constraints_file:
        constraints_text = constraints_file.read()

    # Ignore leading whitespace and comment lines.
    deps = [
        match.group(1)
        for match in re.finditer(
            r"^\s*(\S+)(?===\S+)", constraints_text, flags=re.MULTILINE
        )
        if match.group(1) not in already_installed
    ]

    print(already_installed)

    # We use --no-deps to ensure that pre-release versions aren't overwritten
    # by the version ranges in setup.py.
    session.install(*deps)
    session.install("--no-deps", "-e", ".")

    # Print out prerelease package versions.
    session.run("python", "-m", "pip", "freeze")
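
The net effect is a simpler flow: one constraints-pinned editable install up front, then targeted prerelease upgrades layered on top, instead of tracking already_installed and re-deriving the remaining pins from the constraints file. Schematically (argument lists abbreviated, not the literal final code):

session.install(*deps, "-c", constraints_path, "-e", ".")  # pinned baseline
session.install("--pre", "--upgrade", "pyarrow")           # then overlay nightlies
session.install("--upgrade", "git+https://github.com/tobymao/sqlglot.git#egg=sqlglot")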