diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 72c278d3e74b..385a2924d1da 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -213,7 +213,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: CTimePoint TimePoint_from_ns(int64_t val) CResult[c_string] TzinfoToString(PyObject* pytzinfo) - CResult[PyObject*] StringToTzinfo(c_string) + CResult[PyObject*] StringToTzinfo(c_string, c_bool) cdef extern from "arrow/python/numpy_init.h" namespace "arrow::py": diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index dfca59cbf5f9..d27a95b9f9d8 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -787,7 +787,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= def make_datetimetz(unit, tz): if _pandas_api.is_v1(): unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns] - tz = pa.lib.string_to_tzinfo(tz) + tz = pa.lib.string_to_tzinfo(tz, prefer_zoneinfo=_pandas_api.is_ge_v3()) return _pandas_api.datetimetz_type(unit, tz=tz) @@ -1183,7 +1183,8 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # ARROW-13756: if index is timezone aware DataTimeIndex elif pandas_dtype == "datetimetz": tz = pa.lib.string_to_tzinfo( - column_indexes[0]['metadata']['timezone']) + column_indexes[0]['metadata']['timezone'], + prefer_zoneinfo=_pandas_api.is_ge_v3()) level = pd.to_datetime(level, utc=True).tz_convert(tz) if _pandas_api.is_ge_v3(): # with pandas 3+, to_datetime returns a unit depending on the string @@ -1289,7 +1290,7 @@ def make_tz_aware(series, tz): """ Make a datetime64 Series timezone-aware for the given tz """ - tz = pa.lib.string_to_tzinfo(tz) + tz = pa.lib.string_to_tzinfo(tz, prefer_zoneinfo=_pandas_api.is_ge_v3()) series = (series.dt.tz_localize('utc') .dt.tz_convert(tz)) return series diff --git 
a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index a6377b2bb707..636787f916d5 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -822,7 +822,14 @@ cdef class TimestampScalar(Scalar): return None if not dtype.timezone().empty(): - tzinfo = string_to_tzinfo(frombytes(dtype.timezone())) + prefer_zoneinfo = True + # only when this method would return a pandas.Timestamp, prefer + # zoneinfo depending on the pandas version + if _pandas_api.have_pandas and dtype.unit() == TimeUnit_NANO: + prefer_zoneinfo = _pandas_api.is_ge_v3() + tzinfo = string_to_tzinfo( + frombytes(dtype.timezone()), prefer_zoneinfo=prefer_zoneinfo + ) else: tzinfo = None diff --git a/python/pyarrow/src/arrow/python/datetime.cc b/python/pyarrow/src/arrow/python/datetime.cc index 1c4e66064d1d..6a835c2d37ce 100644 --- a/python/pyarrow/src/arrow/python/datetime.cc +++ b/python/pyarrow/src/arrow/python/datetime.cc @@ -368,13 +368,14 @@ Result PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) { // Converted from python. See https://github.com/apache/arrow/pull/7604 // for details. 
-Result StringToTzinfo(const std::string& tz) { +Result StringToTzinfo(const std::string& tz, bool prefer_zoneinfo) { std::string_view sign_str, hour_str, minute_str; OwnedRef pytz; OwnedRef zoneinfo; OwnedRef datetime; - if (internal::ImportModule("pytz", &pytz).ok()) { + // Legacy behavior: prefer pytz objects when available + if (!prefer_zoneinfo && internal::ImportModule("pytz", &pytz).ok()) { if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { int sign = -1; if (sign_str == "+") { @@ -406,7 +407,7 @@ Result StringToTzinfo(const std::string& tz) { return tzinfo; } - // catch fixed offset if pytz is not present + // Handle fixed offsets with datetime.timezone if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { RETURN_NOT_OK(internal::ImportModule("datetime", &datetime)); int sign = -1; @@ -447,7 +448,7 @@ Result StringToTzinfo(const std::string& tz) { return tzinfo; } - // fallback on zoneinfo if tz is string and pytz is not present + // Use zoneinfo for named timezones when available if (internal::ImportModule("zoneinfo", &zoneinfo).ok()) { OwnedRef class_zoneinfo; RETURN_NOT_OK( @@ -456,12 +457,25 @@ Result StringToTzinfo(const std::string& tz) { PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); auto tzinfo = PyObject_CallFunctionObjArgs(class_zoneinfo.obj(), py_tz_string.obj(), NULL); + if (tzinfo != nullptr) { + return tzinfo; + } + + // Keep backwards compatibility for named timezones only available in pytz + PyErr_Clear(); + } + + if (internal::ImportModule("pytz", &pytz).ok()) { + OwnedRef timezone; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); + OwnedRef py_tz_string( + PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); + auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); RETURN_IF_PYERROR(); return tzinfo; } - return Status::Invalid( - "Pytz package or Python>=3.8 for zoneinfo module must be installed."); + return 
Status::Invalid("The zoneinfo module or pytz package must be installed."); } Result TzinfoToString(PyObject* tzinfo) { diff --git a/python/pyarrow/src/arrow/python/datetime.h b/python/pyarrow/src/arrow/python/datetime.h index 9b21eeb43421..84f46fe2d19c 100644 --- a/python/pyarrow/src/arrow/python/datetime.h +++ b/python/pyarrow/src/arrow/python/datetime.h @@ -188,7 +188,7 @@ Result PyDateTime_utcoffset_s(PyObject* pydatetime); /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 /// GIL must be held when calling this method. ARROW_PYTHON_EXPORT -Result StringToTzinfo(const std::string& tz); +Result StringToTzinfo(const std::string& tz, bool prefer_zoneinfo = true); /// \brief Convert a time zone object to a string representation. /// diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 0339975f4571..07f214fd9b8e 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -21,6 +21,7 @@ import multiprocessing as mp import sys import warnings +import zoneinfo from collections import OrderedDict from datetime import date, datetime, time, timedelta, timezone @@ -1168,10 +1169,23 @@ def test_python_datetime(self): def test_python_datetime_with_pytz_tzinfo(self): pytz = pytest.importorskip("pytz") - for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]: - values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)] + timezones_pytz = [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)] + timezones_zoneinfo = [ + zoneinfo.ZoneInfo('UTC'), + zoneinfo.ZoneInfo('US/Eastern'), + timezone(timedelta(minutes=1)) + ] + + for tz, tz_zoneinfo in zip(timezones_pytz, timezones_zoneinfo): + values = [tz.localize(datetime(2018, 1, 1, 12, 23, 45))] df = pd.DataFrame({'datetime': values}) - _check_pandas_roundtrip(df) + if Version(pd.__version__) >= Version("3.0.0"): + df_expected = pd.DataFrame( + {'datetime': [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_zoneinfo)]} + ) 
+ else: + df_expected = None + _check_pandas_roundtrip(df, expected=df_expected) @h.given(st.none() | past.timezones) @h.settings(deadline=None) @@ -1183,7 +1197,6 @@ def test_python_datetime_with_pytz_timezone(self, tz): _check_pandas_roundtrip(df, check_dtype=False) def test_python_datetime_with_timezone_tzinfo(self): - pytz = pytest.importorskip("pytz") from datetime import timezone values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=timezone.utc)] @@ -1191,14 +1204,19 @@ def test_python_datetime_with_timezone_tzinfo(self): df = pd.DataFrame({'datetime': values}, index=values) _check_pandas_roundtrip(df, preserve_index=True) - # datetime.timezone is going to be pytz.FixedOffset hours = 1 tz_timezone = timezone(timedelta(hours=hours)) - tz_pytz = pytz.FixedOffset(hours * 60) values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)] - values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)] df = pd.DataFrame({'datetime': values}, index=values) - df_exp = pd.DataFrame({'datetime': values_exp}, index=values_exp) + if Version(pd.__version__) < Version("3.0.0"): + # datetime.timezone is going to be pytz.FixedOffset + pytz = pytest.importorskip("pytz") + tz_pytz = pytz.FixedOffset(hours * 60) + values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)] + df_exp = pd.DataFrame({'datetime': values_exp}, index=values_exp) + else: + df_exp = None + _check_pandas_roundtrip(df, expected=df_exp, preserve_index=True) def test_python_datetime_subclass(self): diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 539f01724542..4a33d79223d0 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -20,6 +20,7 @@ from functools import partial import datetime import sys +import zoneinfo import pytest import hypothesis as h @@ -491,35 +492,40 @@ def utcoffset(self, dt): def test_string_to_tzinfo(): string = ['UTC', 'Europe/Paris', '+03:00', '+01:30', '-02:00'] - try: - import pytz - expected = 
[pytz.utc, pytz.timezone('Europe/Paris'), - pytz.FixedOffset(180), pytz.FixedOffset(90), - pytz.FixedOffset(-120)] - result = [pa.lib.string_to_tzinfo(i) for i in string] - assert result == expected - - except ImportError: - try: - import zoneinfo - expected = [zoneinfo.ZoneInfo(key='UTC'), - zoneinfo.ZoneInfo(key='Europe/Paris'), - datetime.timezone(datetime.timedelta(hours=3)), - datetime.timezone( - datetime.timedelta(hours=1, minutes=30)), - datetime.timezone(-datetime.timedelta(hours=2))] - result = [pa.lib.string_to_tzinfo(i) for i in string] - assert result == expected - - except ImportError: - pytest.skip('requires pytz or zoneinfo to be installed') - - -def test_timezone_string_roundtrip_pytz(): + result = [pa.lib.string_to_tzinfo(i) for i in string] + expected = [ + zoneinfo.ZoneInfo('UTC'), + zoneinfo.ZoneInfo('Europe/Paris'), + datetime.timezone(datetime.timedelta(hours=3)), + datetime.timezone(datetime.timedelta(hours=1, minutes=30)), + datetime.timezone(-datetime.timedelta(hours=2)), + ] + assert result == expected + + +def test_string_to_tzinfo_prefer_zoneinfo_false(): pytz = pytest.importorskip("pytz") + result = pa.lib.string_to_tzinfo("Europe/Brussels", prefer_zoneinfo=False) + assert result == pytz.timezone("Europe/Brussels") + result = pa.lib.string_to_tzinfo("+01:30", prefer_zoneinfo=False) + assert result == pytz.FixedOffset(90) + + +@pytest.mark.skipif( + sys.platform == 'darwin', reason="macOS supports those lower-case names" +) +def test_string_to_tzinfo_pytz_fallback(): + pytz = pytest.importorskip("pytz") + result = pa.lib.string_to_tzinfo("europe/brussels") + expected = pytz.timezone("Europe/Brussels") + assert result == expected + - tz = [pytz.FixedOffset(90), pytz.FixedOffset(-90), - pytz.utc, pytz.timezone('America/New_York')] +def test_timezone_string_roundtrip(): + tz = [datetime.timezone(datetime.timedelta(hours=1, minutes=30)), + datetime.timezone(datetime.timedelta(hours=-1, minutes=-30)), + zoneinfo.ZoneInfo('UTC'), + 
zoneinfo.ZoneInfo('America/New_York')] name = ['+01:30', '-01:30', 'UTC', 'America/New_York'] assert [pa.lib.tzinfo_to_string(i) for i in tz] == name diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e9eef8965153..ec1a5a2ba9a3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4166,7 +4166,7 @@ def tzinfo_to_string(tz): return frombytes(GetResultValue(TzinfoToString(tz))) -def string_to_tzinfo(name): +def string_to_tzinfo(name, *, prefer_zoneinfo=True): """ Convert a time zone name into a time zone object. @@ -4177,15 +4177,21 @@ def string_to_tzinfo(name): Parameters ---------- - name: str + name: str Time zone name. + prefer_zoneinfo : bool, default True + If True, resolve named timezones using ``zoneinfo`` first and only + fall back to ``pytz`` when needed. If False, prefer ``pytz`` when it + is available. Returns ------- tz : datetime.tzinfo Time zone object """ - cdef PyObject* tz = GetResultValue(StringToTzinfo(name.encode('utf-8'))) + cdef PyObject* tz = GetResultValue( + StringToTzinfo(name.encode('utf-8'), prefer_zoneinfo) + ) return PyObject_to_object(tz)