From ea82e63a33e628e3e9631ea1bbbd3d1bbacb69c5 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sat, 23 May 2026 18:40:41 +0800 Subject: [PATCH 1/2] refactor: move temporal utilities out of transform util Also add internal math helpers for floor division and checked multiplication. Keep the Human* formatting helpers in TransformUtil to stay consistent with the Java TransformUtil implementation. --- src/iceberg/expression/json_serde.cc | 13 +- src/iceberg/expression/literal.cc | 13 +- src/iceberg/test/CMakeLists.txt | 2 + ...e_metrics_evaluator_with_transform_test.cc | 6 +- src/iceberg/test/literal_test.cc | 10 +- src/iceberg/test/math_util_internal_test.cc | 56 ++++ src/iceberg/test/meson.build | 2 + src/iceberg/test/temporal_test_helper.h | 7 +- src/iceberg/test/temporal_util_test.cc | 230 ++++++++++++++ src/iceberg/test/transform_util_test.cc | 275 ----------------- src/iceberg/transform.cc | 1 + src/iceberg/util/math_util_internal.h | 47 +++ src/iceberg/util/temporal_util.cc | 252 ++++++++++++++-- src/iceberg/util/temporal_util.h | 90 ++++++ src/iceberg/util/transform_util.cc | 282 ++---------------- src/iceberg/util/transform_util.h | 72 +---- 16 files changed, 708 insertions(+), 650 deletions(-) create mode 100644 src/iceberg/test/math_util_internal_test.cc create mode 100644 src/iceberg/test/temporal_util_test.cc create mode 100644 src/iceberg/util/math_util_internal.h diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc index 065f41cf2..df8aba88f 100644 --- a/src/iceberg/expression/json_serde.cc +++ b/src/iceberg/expression/json_serde.cc @@ -33,6 +33,7 @@ #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" +#include "iceberg/util/temporal_util.h" #include "iceberg/util/transform_util.h" namespace iceberg { @@ -363,7 +364,7 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { return JsonParseError("Cannot parse {} as a date value", 
SafeDumpJson(json)); } ICEBERG_ASSIGN_OR_RAISE(auto days, - TransformUtil::ParseDay(json.get())); + TemporalUtils::ParseDay(json.get())); return Literal::Date(days); } @@ -372,7 +373,7 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { return JsonParseError("Cannot parse {} as a time value", SafeDumpJson(json)); } ICEBERG_ASSIGN_OR_RAISE(auto micros, - TransformUtil::ParseTime(json.get())); + TemporalUtils::ParseTime(json.get())); return Literal::Time(micros); } @@ -381,7 +382,7 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { return JsonParseError("Cannot parse {} as a timestamp value", SafeDumpJson(json)); } ICEBERG_ASSIGN_OR_RAISE(auto micros, - TransformUtil::ParseTimestamp(json.get())); + TemporalUtils::ParseTimestamp(json.get())); return Literal::Timestamp(micros); } @@ -391,7 +392,7 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { SafeDumpJson(json)); } ICEBERG_ASSIGN_OR_RAISE( - auto micros, TransformUtil::ParseTimestampWithZone(json.get())); + auto micros, TemporalUtils::ParseTimestampWithZone(json.get())); return Literal::TimestampTz(micros); } @@ -401,7 +402,7 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { SafeDumpJson(json)); } ICEBERG_ASSIGN_OR_RAISE(auto nanos, - TransformUtil::ParseTimestampNs(json.get())); + TemporalUtils::ParseTimestampNs(json.get())); return Literal::TimestampNs(nanos); } @@ -411,7 +412,7 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { SafeDumpJson(json)); } ICEBERG_ASSIGN_OR_RAISE( - auto nanos, TransformUtil::ParseTimestampNsWithZone(json.get())); + auto nanos, TemporalUtils::ParseTimestampNsWithZone(json.get())); return Literal::TimestampTzNs(nanos); } diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index e14d3def2..6a85f65a2 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -32,7 +32,6 @@ #include "iceberg/util/macros.h" #include 
"iceberg/util/string_util.h" #include "iceberg/util/temporal_util.h" -#include "iceberg/util/transform_util.h" namespace iceberg { @@ -203,29 +202,29 @@ Result LiteralCaster::CastFromString( return Literal::UUID(uuid); } case TypeId::kDate: { - ICEBERG_ASSIGN_OR_RAISE(auto days, TransformUtil::ParseDay(str_val)); + ICEBERG_ASSIGN_OR_RAISE(auto days, TemporalUtils::ParseDay(str_val)); return Literal::Date(days); } case TypeId::kTime: { - ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTime(str_val)); + ICEBERG_ASSIGN_OR_RAISE(auto micros, TemporalUtils::ParseTime(str_val)); return Literal::Time(micros); } case TypeId::kTimestamp: { - ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTimestamp(str_val)); + ICEBERG_ASSIGN_OR_RAISE(auto micros, TemporalUtils::ParseTimestamp(str_val)); return Literal::Timestamp(micros); } case TypeId::kTimestampTz: { ICEBERG_ASSIGN_OR_RAISE(auto micros, - TransformUtil::ParseTimestampWithZone(str_val)); + TemporalUtils::ParseTimestampWithZone(str_val)); return Literal::TimestampTz(micros); } case TypeId::kTimestampNs: { - ICEBERG_ASSIGN_OR_RAISE(auto nanos, TransformUtil::ParseTimestampNs(str_val)); + ICEBERG_ASSIGN_OR_RAISE(auto nanos, TemporalUtils::ParseTimestampNs(str_val)); return Literal::TimestampNs(nanos); } case TypeId::kTimestampTzNs: { ICEBERG_ASSIGN_OR_RAISE(auto nanos, - TransformUtil::ParseTimestampNsWithZone(str_val)); + TemporalUtils::ParseTimestampNsWithZone(str_val)); return Literal::TimestampTzNs(nanos); } case TypeId::kBinary: { diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index c632403c3..7b5462673 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -128,11 +128,13 @@ add_iceberg_test(util_test formatter_test.cc lazy_test.cc location_util_test.cc + math_util_internal_test.cc roaring_position_bitmap_test.cc position_delete_index_test.cc retry_util_test.cc string_util_test.cc struct_like_set_test.cc + temporal_util_test.cc 
transform_util_test.cc truncate_util_test.cc url_encoder_test.cc diff --git a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc index 935f3c3ab..4502cda72 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc @@ -30,6 +30,7 @@ #include "iceberg/schema.h" #include "iceberg/test/matchers.h" #include "iceberg/type.h" +#include "iceberg/util/temporal_util.h" namespace iceberg { @@ -38,9 +39,8 @@ constexpr bool kRowsMightMatch = true; constexpr bool kRowCannotMatch = false; constexpr int64_t kIntMinValue = 30; constexpr int64_t kIntMaxValue = 79; -constexpr int64_t kMicrosPerDay = 86'400'000'000LL; -constexpr int64_t kTsMinValue = 30 * kMicrosPerDay; -constexpr int64_t kTsMaxValue = 79 * kMicrosPerDay; +constexpr int64_t kTsMinValue = 30 * internal::kMicrosPerDay; +constexpr int64_t kTsMaxValue = 79 * internal::kMicrosPerDay; std::shared_ptr> ToBoundTransform( const std::shared_ptr& transform) { diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc index 86b892377..5724c8b55 100644 --- a/src/iceberg/test/literal_test.cc +++ b/src/iceberg/test/literal_test.cc @@ -29,6 +29,7 @@ #include "iceberg/test/matchers.h" #include "iceberg/test/temporal_test_helper.h" #include "iceberg/type.h" +#include "iceberg/util/temporal_util.h" namespace iceberg { @@ -676,10 +677,11 @@ INSTANTIATE_TEST_SUITE_P( .small_literal = Literal::Date(100), .large_literal = Literal::Date(200), .equal_literal = Literal::Date(100)}, - ComparisonLiteralTestParam{.test_name = "Time", - .small_literal = Literal::Time(43200000000LL), - .large_literal = Literal::Time(86400000000LL), - .equal_literal = Literal::Time(43200000000LL)}, + ComparisonLiteralTestParam{ + .test_name = "Time", + .small_literal = Literal::Time(internal::kMicrosPerDay / 2), + .large_literal = 
Literal::Time(internal::kMicrosPerDay), + .equal_literal = Literal::Time(internal::kMicrosPerDay / 2)}, ComparisonLiteralTestParam{.test_name = "Timestamp", .small_literal = Literal::Timestamp(1000000LL), .large_literal = Literal::Timestamp(2000000LL), diff --git a/src/iceberg/test/math_util_internal_test.cc b/src/iceberg/test/math_util_internal_test.cc new file mode 100644 index 000000000..682c786e2 --- /dev/null +++ b/src/iceberg/test/math_util_internal_test.cc @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/math_util_internal.h" + +#include + +#include + +#include "iceberg/test/matchers.h" + +namespace iceberg::internal { + +TEST(MathUtilInternalTest, FloorDiv) { + EXPECT_EQ(0, FloorDiv(0, 1000)); + EXPECT_EQ(1, FloorDiv(1001, 1000)); + EXPECT_EQ(-1, FloorDiv(-1, 1000)); + EXPECT_EQ(-2, FloorDiv(-1001, 1000)); + EXPECT_EQ(1, FloorDiv(-1001, -1000)); + EXPECT_EQ(-2, FloorDiv(1001, -1000)); +} + +TEST(MathUtilInternalTest, MultiplyExact) { + ICEBERG_UNWRAP_OR_FAIL(auto positive, MultiplyExact(1000, 1000)); + EXPECT_EQ(1000000, positive); + + ICEBERG_UNWRAP_OR_FAIL(auto negative, MultiplyExact(-1000, 1000)); + EXPECT_EQ(-1000000, negative); + + ICEBERG_UNWRAP_OR_FAIL(auto min_value, + MultiplyExact(std::numeric_limits::min(), 1)); + EXPECT_EQ(std::numeric_limits::min(), min_value); + + EXPECT_THAT(MultiplyExact(std::numeric_limits::max(), 2), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(MultiplyExact(std::numeric_limits::min(), -1), + IsError(ErrorKind::kInvalidArgument)); +} + +} // namespace iceberg::internal diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index 1acb46e9b..6928ab820 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -92,11 +92,13 @@ iceberg_tests = { 'formatter_test.cc', 'lazy_test.cc', 'location_util_test.cc', + 'math_util_internal_test.cc', 'position_delete_index_test.cc', 'retry_util_test.cc', 'roaring_position_bitmap_test.cc', 'string_util_test.cc', 'struct_like_set_test.cc', + 'temporal_util_test.cc', 'transform_util_test.cc', 'truncate_util_test.cc', 'url_encoder_test.cc', diff --git a/src/iceberg/test/temporal_test_helper.h b/src/iceberg/test/temporal_test_helper.h index 0f2904891..c4ba3a152 100644 --- a/src/iceberg/test/temporal_test_helper.h +++ b/src/iceberg/test/temporal_test_helper.h @@ -22,6 +22,8 @@ #include #include +#include "iceberg/util/temporal_util.h" + namespace iceberg { using namespace std::chrono; // NOLINT @@ -64,13 +66,12 @@ struct 
TimestampNanosParts { }; class TemporalTestHelper { - static constexpr auto kEpochDays = sys_days(year{1970} / January / 1); - public: /// \brief Construct a Calendar date without timezone or time static int32_t CreateDate(const DateParts& parts) { return static_cast( - (sys_days(year{parts.year} / month{parts.month} / day{parts.day}) - kEpochDays) + (sys_days(year{parts.year} / month{parts.month} / day{parts.day}) - + internal::kEpochDays) .count()); } diff --git a/src/iceberg/test/temporal_util_test.cc b/src/iceberg/test/temporal_util_test.cc new file mode 100644 index 000000000..0d4426b0b --- /dev/null +++ b/src/iceberg/test/temporal_util_test.cc @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/temporal_util.h" + +#include +#include + +#include + +#include "iceberg/test/matchers.h" + +namespace iceberg { + +TEST(TemporalUtilTest, ParseTimestampNs) { + ICEBERG_UNWRAP_OR_FAIL( + auto nanos, TemporalUtils::ParseTimestampNs("2026-01-01T00:00:01.000001001")); + EXPECT_EQ(nanos, 1767225601000001001L); + + ICEBERG_UNWRAP_OR_FAIL(auto pre_epoch_nanos, TemporalUtils::ParseTimestampNs( + "1969-12-31T23:59:59.123456789")); + EXPECT_EQ(pre_epoch_nanos, -876543211); +} + +TEST(TemporalUtilTest, ParseTimestampNsChecksInt64Bounds) { + ICEBERG_UNWRAP_OR_FAIL( + auto max_nanos, TemporalUtils::ParseTimestampNs("2262-04-11T23:47:16.854775807")); + EXPECT_EQ(max_nanos, std::numeric_limits::max()); + + ICEBERG_UNWRAP_OR_FAIL( + auto min_nanos, TemporalUtils::ParseTimestampNs("1677-09-21T00:12:43.145224192")); + EXPECT_EQ(min_nanos, std::numeric_limits::min()); + + EXPECT_THAT(TemporalUtils::ParseTimestampNs("2262-04-11T23:47:16.854775808"), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(TemporalUtils::ParseTimestampNs("1677-09-21T00:12:43.145224191"), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST(TemporalUtilTest, ParseTimestampNsRejectsMoreThanNineFractionalDigits) { + EXPECT_THAT(TemporalUtils::ParseTimestampNs("2026-01-01T00:00:01.0000010011"), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST(TemporalUtilTest, ParseTimestampNsWithZone) { + ICEBERG_UNWRAP_OR_FAIL(auto nanos, TemporalUtils::ParseTimestampNsWithZone( + "2026-01-01T00:00:01.000001001+00:00")); + EXPECT_EQ(nanos, 1767225601000001001L); +} + +TEST(TemporalUtilTest, ParseTimestampNsWithZoneChecksInt64BoundsAfterOffset) { + ICEBERG_UNWRAP_OR_FAIL(auto max_nanos, TemporalUtils::ParseTimestampNsWithZone( + "2262-04-12T00:47:16.854775807+01:00")); + EXPECT_EQ(max_nanos, std::numeric_limits::max()); + + ICEBERG_UNWRAP_OR_FAIL(auto min_nanos, TemporalUtils::ParseTimestampNsWithZone( + "1677-09-20T23:12:43.145224192-01:00")); + EXPECT_EQ(min_nanos, 
std::numeric_limits::min()); + + EXPECT_THAT( + TemporalUtils::ParseTimestampNsWithZone("2262-04-11T23:47:16.854775807-00:01"), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT( + TemporalUtils::ParseTimestampNsWithZone("1677-09-21T00:12:43.145224192+00:01"), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST(TemporalUtilTest, ParseTimestampNsWithZoneRejectsOffsetPastPlusMinus1800) { + EXPECT_THAT( + TemporalUtils::ParseTimestampNsWithZone("2026-01-01T00:00:01.000001001+18:01"), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT( + TemporalUtils::ParseTimestampNsWithZone("2026-01-01T00:00:01.000001001-18:30"), + IsError(ErrorKind::kInvalidArgument)); +} + +struct ParseParam { + std::string name; + std::string str; + int64_t value; + enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind; +}; + +class TemporalParseTest : public ::testing::TestWithParam {}; + +TEST_P(TemporalParseTest, ParsesCorrectly) { + const auto& param = GetParam(); + switch (param.kind) { + case ParseParam::kDay: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TemporalUtils::ParseDay(param.str)); + EXPECT_EQ(parsed, static_cast(param.value)); + break; + } + case ParseParam::kTime: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TemporalUtils::ParseTime(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseParam::kTimestamp: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TemporalUtils::ParseTimestamp(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseParam::kTimestampTz: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, + TemporalUtils::ParseTimestampWithZone(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + } +} + +struct ParseTimeErrorParam { + std::string name; + std::string str; +}; + +class ParseTimeErrorTest : public ::testing::TestWithParam {}; + +TEST_P(ParseTimeErrorTest, ReturnsError) { + EXPECT_THAT(TemporalUtils::ParseTime(GetParam().str), + IsError(ErrorKind::kInvalidArgument)); +} + +INSTANTIATE_TEST_SUITE_P( + TemporalUtilTest, TemporalParseTest, + 
::testing::Values( + ParseParam{"DayEpoch", "1970-01-01", 0, ParseParam::kDay}, + ParseParam{"DayNext", "1970-01-02", 1, ParseParam::kDay}, + ParseParam{"DayBeforeEpoch", "1969-12-31", -1, ParseParam::kDay}, + ParseParam{"DayYear999", "0999-12-31", -354286, ParseParam::kDay}, + ParseParam{"DayNonLeap", "1971-01-01", 365, ParseParam::kDay}, + ParseParam{"DayY2K", "2000-01-01", 10957, ParseParam::kDay}, + ParseParam{"Day2026", "2026-01-01", 20454, ParseParam::kDay}, + ParseParam{"TimeMidnight", "00:00", 0, ParseParam::kTime}, + ParseParam{"TimeOneSec", "00:00:01", 1000000, ParseParam::kTime}, + ParseParam{"TimeMillis", "00:00:01.500", 1500000, ParseParam::kTime}, + ParseParam{"TimeOneMillis", "00:00:01.001", 1001000, ParseParam::kTime}, + ParseParam{"TimeMicros", "00:00:01.000001", 1000001, ParseParam::kTime}, + ParseParam{"TimeHourMinSec", "01:02:03", 3723000000, ParseParam::kTime}, + ParseParam{"TimeEndOfDay", "23:59:59", 86399000000, ParseParam::kTime}, + ParseParam{"TimestampEpoch", "1970-01-01T00:00:00", 0, ParseParam::kTimestamp}, + ParseParam{"TimestampOneSec", "1970-01-01T00:00:01", 1000000, + ParseParam::kTimestamp}, + ParseParam{"TimestampMillis", "2026-01-01T00:00:01.500", 1767225601500000L, + ParseParam::kTimestamp}, + ParseParam{"TimestampOneMillis", "2026-01-01T00:00:01.001", 1767225601001000L, + ParseParam::kTimestamp}, + ParseParam{"TimestampMicros", "2026-01-01T00:00:01.000001", 1767225601000001L, + ParseParam::kTimestamp}, + ParseParam{"TimestampTzEpoch", "1970-01-01T00:00:00+00:00", 0, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzOneSec", "1970-01-01T00:00:01+00:00", 1000000, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzMillis", "2026-01-01T00:00:01.500+00:00", + 1767225601500000L, ParseParam::kTimestampTz}, + ParseParam{"TimestampTzOneMillis", "2026-01-01T00:00:01.001+00:00", + 1767225601001000L, ParseParam::kTimestampTz}, + ParseParam{"TimestampTzMicros", "2026-01-01T00:00:01.000001+00:00", + 1767225601000001L, 
ParseParam::kTimestampTz}, + ParseParam{"TimestampTzSuffixZ_Epoch", "1970-01-01T00:00:00Z", 0, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzSuffixZ_Millis", "2026-01-01T00:00:01.500Z", + 1767225601500000L, ParseParam::kTimestampTz}, + ParseParam{"TimestampTzNegZero_Epoch", "1970-01-01T00:00:00-00:00", 0, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzNegZero_Millis", "2026-01-01T00:00:01.500-00:00", + 1767225601500000L, ParseParam::kTimestampTz}, + ParseParam{"TimeTruncatesNanos", "00:00:01.123456789", 1123456, + ParseParam::kTime}, + ParseParam{"1Digit", "00:00:01.5", 1500000, ParseParam::kTime}, + ParseParam{"2Digits", "00:00:01.50", 1500000, ParseParam::kTime}, + ParseParam{"2DigitsNonZero", "00:00:01.12", 1120000, ParseParam::kTime}, + ParseParam{"4Digits", "00:00:01.0001", 1000100, ParseParam::kTime}, + ParseParam{"TimestampNoSec_Zero", "1970-01-01T00:00", 0, ParseParam::kTimestamp}, + ParseParam{"TimestampNoSec_OneMin", "1970-01-01T00:01", 60000000, + ParseParam::kTimestamp}, + ParseParam{"TimestampTzNoSec_Offset", "1970-01-01T00:00+00:00", 0, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzNoSec_OneMin", "1970-01-01T00:01+00:00", 60000000, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzNoSec_Z", "1970-01-01T00:00Z", 0, + ParseParam::kTimestampTz}, + ParseParam{"ExtendedYearPlusEpoch", "+1970-01-01", 0, ParseParam::kDay}, + ParseParam{"ExtendedYearPlus2026", "+2026-01-01", 20454, ParseParam::kDay}, + ParseParam{"ExtendedYearMinus2026", "-2026-01-01", -1459509, ParseParam::kDay}, + ParseParam{"TimestampTzPositiveOffset", "1970-01-01T05:00:00+05:00", 0, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzNegativeOffset", "1970-01-01T00:00:00-05:00", 18000000000, + ParseParam::kTimestampTz}, + ParseParam{"TimestampTzOffsetWithMillis", "2026-01-01T05:30:01.500+05:30", + 1767225601500000L, ParseParam::kTimestampTz}, + ParseParam{"TimestampTzNegOffsetToEpoch", "1969-12-31T19:00:00-05:00", 0, + ParseParam::kTimestampTz}, + 
ParseParam{"TimestampTzNoSecWithOffset", "1970-01-01T05:30+05:30", 0, + ParseParam::kTimestampTz}), + [](const ::testing::TestParamInfo& info) { return info.param.name; }); + +INSTANTIATE_TEST_SUITE_P( + TemporalUtilTest, ParseTimeErrorTest, + ::testing::Values(ParseTimeErrorParam{"EmptyString", ""}, + ParseTimeErrorParam{"TooShort1Char", "1"}, + ParseTimeErrorParam{"TooShort2Chars", "12"}, + ParseTimeErrorParam{"TooShort4Chars", "12:3"}, + ParseTimeErrorParam{"MissingColon", "1200:00"}, + ParseTimeErrorParam{"OutofRangeHours", "24:00:00"}, + ParseTimeErrorParam{"OutofRangeMinutes", "12:60:00"}, + ParseTimeErrorParam{"OutofRangeSeconds", "12:30:61"}, + ParseTimeErrorParam{"SpaceInsteadOfColon", "12 30"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +} // namespace iceberg diff --git a/src/iceberg/test/transform_util_test.cc b/src/iceberg/test/transform_util_test.cc index 5b64bb33c..f5a22233a 100644 --- a/src/iceberg/test/transform_util_test.cc +++ b/src/iceberg/test/transform_util_test.cc @@ -19,12 +19,8 @@ #include "iceberg/util/transform_util.h" -#include - #include -#include "iceberg/test/matchers.h" - namespace iceberg { TEST(TransformUtilTest, HumanYear) { @@ -157,69 +153,6 @@ TEST(TransformUtilTest, HumanTimestampNsWithZone) { TransformUtil::HumanTimestampNsWithZone(-876543211)); } -TEST(TransformUtilTest, ParseTimestampNs) { - ICEBERG_UNWRAP_OR_FAIL( - auto nanos, TransformUtil::ParseTimestampNs("2026-01-01T00:00:01.000001001")); - EXPECT_EQ(nanos, 1767225601000001001L); - ICEBERG_UNWRAP_OR_FAIL(auto pre_epoch_nanos, TransformUtil::ParseTimestampNs( - "1969-12-31T23:59:59.123456789")); - EXPECT_EQ(pre_epoch_nanos, -876543211); - EXPECT_EQ(TransformUtil::HumanTimestampNs(pre_epoch_nanos), - "1969-12-31T23:59:59.123456789"); -} - -TEST(TransformUtilTest, ParseTimestampNsChecksInt64Bounds) { - ICEBERG_UNWRAP_OR_FAIL( - auto max_nanos, TransformUtil::ParseTimestampNs("2262-04-11T23:47:16.854775807")); - EXPECT_EQ(max_nanos, 
std::numeric_limits::max()); - - ICEBERG_UNWRAP_OR_FAIL( - auto min_nanos, TransformUtil::ParseTimestampNs("1677-09-21T00:12:43.145224192")); - EXPECT_EQ(min_nanos, std::numeric_limits::min()); - - EXPECT_THAT(TransformUtil::ParseTimestampNs("2262-04-11T23:47:16.854775808"), - IsError(ErrorKind::kInvalidArgument)); - EXPECT_THAT(TransformUtil::ParseTimestampNs("1677-09-21T00:12:43.145224191"), - IsError(ErrorKind::kInvalidArgument)); -} - -TEST(TransformUtilTest, ParseTimestampNsRejectsMoreThanNineFractionalDigits) { - EXPECT_THAT(TransformUtil::ParseTimestampNs("2026-01-01T00:00:01.0000010011"), - IsError(ErrorKind::kInvalidArgument)); -} - -TEST(TransformUtilTest, ParseTimestampNsWithZone) { - ICEBERG_UNWRAP_OR_FAIL(auto nanos, TransformUtil::ParseTimestampNsWithZone( - "2026-01-01T00:00:01.000001001+00:00")); - EXPECT_EQ(nanos, 1767225601000001001L); -} - -TEST(TransformUtilTest, ParseTimestampNsWithZoneChecksInt64BoundsAfterOffset) { - ICEBERG_UNWRAP_OR_FAIL(auto max_nanos, TransformUtil::ParseTimestampNsWithZone( - "2262-04-12T00:47:16.854775807+01:00")); - EXPECT_EQ(max_nanos, std::numeric_limits::max()); - - ICEBERG_UNWRAP_OR_FAIL(auto min_nanos, TransformUtil::ParseTimestampNsWithZone( - "1677-09-20T23:12:43.145224192-01:00")); - EXPECT_EQ(min_nanos, std::numeric_limits::min()); - - EXPECT_THAT( - TransformUtil::ParseTimestampNsWithZone("2262-04-11T23:47:16.854775807-00:01"), - IsError(ErrorKind::kInvalidArgument)); - EXPECT_THAT( - TransformUtil::ParseTimestampNsWithZone("1677-09-21T00:12:43.145224192+00:01"), - IsError(ErrorKind::kInvalidArgument)); -} - -TEST(TransformUtilTest, ParseTimestampNsWithZoneRejectsOffsetPastPlusMinus1800) { - EXPECT_THAT( - TransformUtil::ParseTimestampNsWithZone("2026-01-01T00:00:01.000001001+18:01"), - IsError(ErrorKind::kInvalidArgument)); - EXPECT_THAT( - TransformUtil::ParseTimestampNsWithZone("2026-01-01T00:00:01.000001001-18:30"), - IsError(ErrorKind::kInvalidArgument)); -} - TEST(TransformUtilTest, Base64Encode) { // 
Empty string EXPECT_EQ("", TransformUtil::Base64Encode("")); @@ -245,212 +178,4 @@ TEST(TransformUtilTest, Base64Encode) { EXPECT_EQ("AA==", TransformUtil::Base64Encode({"\x00", 1})); } -struct ParseRoundTripParam { - std::string name; - std::string str; - int64_t value; - enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind; -}; - -class ParseRoundTripTest : public ::testing::TestWithParam {}; - -TEST_P(ParseRoundTripTest, RoundTrip) { - const auto& param = GetParam(); - switch (param.kind) { - case ParseRoundTripParam::kDay: { - EXPECT_EQ(TransformUtil::HumanDay(static_cast(param.value)), param.str); - ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseDay(param.str)); - EXPECT_EQ(parsed, static_cast(param.value)); - break; - } - case ParseRoundTripParam::kTime: { - EXPECT_EQ(TransformUtil::HumanTime(param.value), param.str); - ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTime(param.str)); - EXPECT_EQ(parsed, param.value); - break; - } - case ParseRoundTripParam::kTimestamp: { - EXPECT_EQ(TransformUtil::HumanTimestamp(param.value), param.str); - ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTimestamp(param.str)); - EXPECT_EQ(parsed, param.value); - break; - } - case ParseRoundTripParam::kTimestampTz: { - EXPECT_EQ(TransformUtil::HumanTimestampWithZone(param.value), param.str); - ICEBERG_UNWRAP_OR_FAIL(auto parsed, - TransformUtil::ParseTimestampWithZone(param.str)); - EXPECT_EQ(parsed, param.value); - break; - } - } -} - -struct ParseOnlyParam { - std::string name; - std::string str; - int64_t value; - enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind; -}; - -class ParseOnlyTest : public ::testing::TestWithParam {}; - -TEST_P(ParseOnlyTest, ParsesCorrectly) { - const auto& param = GetParam(); - switch (param.kind) { - case ParseOnlyParam::kDay: { - ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseDay(param.str)); - EXPECT_EQ(parsed, static_cast(param.value)); - break; - } - case ParseOnlyParam::kTime: { - 
ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTime(param.str)); - EXPECT_EQ(parsed, param.value); - break; - } - case ParseOnlyParam::kTimestamp: { - ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTimestamp(param.str)); - EXPECT_EQ(parsed, param.value); - break; - } - case ParseOnlyParam::kTimestampTz: { - ICEBERG_UNWRAP_OR_FAIL(auto parsed, - TransformUtil::ParseTimestampWithZone(param.str)); - EXPECT_EQ(parsed, param.value); - break; - } - } -} - -struct ParseTimeErrorParam { - std::string name; - std::string str; -}; - -class ParseTimeErrorTest : public ::testing::TestWithParam {}; - -TEST_P(ParseTimeErrorTest, ReturnsError) { - EXPECT_THAT(TransformUtil::ParseTime(GetParam().str), - IsError(ErrorKind::kInvalidArgument)); -} - -INSTANTIATE_TEST_SUITE_P( - TransformUtilTest, ParseRoundTripTest, - ::testing::Values( - // Day round-trips - ParseRoundTripParam{"DayEpoch", "1970-01-01", 0, ParseRoundTripParam::kDay}, - ParseRoundTripParam{"DayNext", "1970-01-02", 1, ParseRoundTripParam::kDay}, - ParseRoundTripParam{"DayBeforeEpoch", "1969-12-31", -1, - ParseRoundTripParam::kDay}, - ParseRoundTripParam{"DayYear999", "0999-12-31", -354286, - ParseRoundTripParam::kDay}, - ParseRoundTripParam{"DayNonLeap", "1971-01-01", 365, ParseRoundTripParam::kDay}, - ParseRoundTripParam{"DayY2K", "2000-01-01", 10957, ParseRoundTripParam::kDay}, - ParseRoundTripParam{"Day2026", "2026-01-01", 20454, ParseRoundTripParam::kDay}, - // Time round-trips - ParseRoundTripParam{"TimeMidnight", "00:00", 0, ParseRoundTripParam::kTime}, - ParseRoundTripParam{"TimeOneSec", "00:00:01", 1000000, - ParseRoundTripParam::kTime}, - ParseRoundTripParam{"TimeMillis", "00:00:01.500", 1500000, - ParseRoundTripParam::kTime}, - ParseRoundTripParam{"TimeOneMillis", "00:00:01.001", 1001000, - ParseRoundTripParam::kTime}, - ParseRoundTripParam{"TimeMicros", "00:00:01.000001", 1000001, - ParseRoundTripParam::kTime}, - ParseRoundTripParam{"TimeHourMinSec", "01:02:03", 3723000000, - 
ParseRoundTripParam::kTime}, - ParseRoundTripParam{"TimeEndOfDay", "23:59:59", 86399000000, - ParseRoundTripParam::kTime}, - // Timestamp round-trips - ParseRoundTripParam{"TimestampEpoch", "1970-01-01T00:00:00", 0, - ParseRoundTripParam::kTimestamp}, - ParseRoundTripParam{"TimestampOneSec", "1970-01-01T00:00:01", 1000000, - ParseRoundTripParam::kTimestamp}, - ParseRoundTripParam{"TimestampMillis", "2026-01-01T00:00:01.500", - 1767225601500000L, ParseRoundTripParam::kTimestamp}, - ParseRoundTripParam{"TimestampOneMillis", "2026-01-01T00:00:01.001", - 1767225601001000L, ParseRoundTripParam::kTimestamp}, - ParseRoundTripParam{"TimestampMicros", "2026-01-01T00:00:01.000001", - 1767225601000001L, ParseRoundTripParam::kTimestamp}, - // TimestampTz round-trips - ParseRoundTripParam{"TimestampTzEpoch", "1970-01-01T00:00:00+00:00", 0, - ParseRoundTripParam::kTimestampTz}, - ParseRoundTripParam{"TimestampTzOneSec", "1970-01-01T00:00:01+00:00", 1000000, - ParseRoundTripParam::kTimestampTz}, - ParseRoundTripParam{"TimestampTzMillis", "2026-01-01T00:00:01.500+00:00", - 1767225601500000L, ParseRoundTripParam::kTimestampTz}, - ParseRoundTripParam{"TimestampTzOneMillis", "2026-01-01T00:00:01.001+00:00", - 1767225601001000L, ParseRoundTripParam::kTimestampTz}, - ParseRoundTripParam{"TimestampTzMicros", "2026-01-01T00:00:01.000001+00:00", - 1767225601000001L, ParseRoundTripParam::kTimestampTz}), - [](const ::testing::TestParamInfo& info) { - return info.param.name; - }); - -INSTANTIATE_TEST_SUITE_P( - TransformUtilTest, ParseOnlyTest, - ::testing::Values( - // TimestampTz with "Z" suffix - ParseOnlyParam{"TimestampTzSuffixZ_Epoch", "1970-01-01T00:00:00Z", 0, - ParseOnlyParam::kTimestampTz}, - ParseOnlyParam{"TimestampTzSuffixZ_Millis", "2026-01-01T00:00:01.500Z", - 1767225601500000L, ParseOnlyParam::kTimestampTz}, - // TimestampTz with "-00:00" suffix - ParseOnlyParam{"TimestampTzNegZero_Epoch", "1970-01-01T00:00:00-00:00", 0, - ParseOnlyParam::kTimestampTz}, - 
ParseOnlyParam{"TimestampTzNegZero_Millis", "2026-01-01T00:00:01.500-00:00", - 1767225601500000L, ParseOnlyParam::kTimestampTz}, - // Fractional micros truncates nanos - ParseOnlyParam{"TimeTruncatesNanos", "00:00:01.123456789", 1123456, - ParseOnlyParam::kTime}, - // Fractional seconds (trimmed trailing zeros) - ParseOnlyParam{"1Digit", "00:00:01.5", 1500000, ParseOnlyParam::kTime}, - ParseOnlyParam{"2Digits", "00:00:01.50", 1500000, ParseOnlyParam::kTime}, - ParseOnlyParam{"2DigitsNonZero", "00:00:01.12", 1120000, ParseOnlyParam::kTime}, - ParseOnlyParam{"4Digits", "00:00:01.0001", 1000100, ParseOnlyParam::kTime}, - // Timestamp without seconds - ParseOnlyParam{"TimestampNoSec_Zero", "1970-01-01T00:00", 0, - ParseOnlyParam::kTimestamp}, - ParseOnlyParam{"TimestampNoSec_OneMin", "1970-01-01T00:01", 60000000, - ParseOnlyParam::kTimestamp}, - // TimestampTz without seconds - ParseOnlyParam{"TimestampTzNoSec_Offset", "1970-01-01T00:00+00:00", 0, - ParseOnlyParam::kTimestampTz}, - ParseOnlyParam{"TimestampTzNoSec_OneMin", "1970-01-01T00:01+00:00", 60000000, - ParseOnlyParam::kTimestampTz}, - ParseOnlyParam{"TimestampTzNoSec_Z", "1970-01-01T00:00Z", 0, - ParseOnlyParam::kTimestampTz}, - // Extended year with '+' prefix - ParseOnlyParam{"ExtendedYearPlusEpoch", "+1970-01-01", 0, ParseOnlyParam::kDay}, - ParseOnlyParam{"ExtendedYearPlus2026", "+2026-01-01", 20454, - ParseOnlyParam::kDay}, - ParseOnlyParam{"ExtendedYearMinus2026", "-2026-01-01", -1459509, - ParseOnlyParam::kDay}, - // Non-UTC timezone offsets - ParseOnlyParam{"TimestampTzPositiveOffset", "1970-01-01T05:00:00+05:00", 0, - ParseOnlyParam::kTimestampTz}, - ParseOnlyParam{"TimestampTzNegativeOffset", "1970-01-01T00:00:00-05:00", - 18000000000, ParseOnlyParam::kTimestampTz}, - ParseOnlyParam{"TimestampTzOffsetWithMillis", "2026-01-01T05:30:01.500+05:30", - 1767225601500000L, ParseOnlyParam::kTimestampTz}, - ParseOnlyParam{"TimestampTzNegOffsetToEpoch", "1969-12-31T19:00:00-05:00", 0, - 
ParseOnlyParam::kTimestampTz}, - ParseOnlyParam{"TimestampTzNoSecWithOffset", "1970-01-01T05:30+05:30", 0, - ParseOnlyParam::kTimestampTz}), - [](const ::testing::TestParamInfo& info) { return info.param.name; }); - -INSTANTIATE_TEST_SUITE_P( - TransformUtilTest, ParseTimeErrorTest, - ::testing::Values(ParseTimeErrorParam{"EmptyString", ""}, - ParseTimeErrorParam{"TooShort1Char", "1"}, - ParseTimeErrorParam{"TooShort2Chars", "12"}, - ParseTimeErrorParam{"TooShort4Chars", "12:3"}, - ParseTimeErrorParam{"MissingColon", "1200:00"}, - ParseTimeErrorParam{"OutofRangeHours", "24:00:00"}, - ParseTimeErrorParam{"OutofRangeMinutes", "12:60:00"}, - ParseTimeErrorParam{"OutofRangeSeconds", "12:30:61"}, - ParseTimeErrorParam{"SpaceInsteadOfColon", "12 30"}), - [](const ::testing::TestParamInfo& info) { - return info.param.name; - }); - } // namespace iceberg diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index 8a7d4b3e1..c019c7ead 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -32,6 +32,7 @@ #include "iceberg/util/macros.h" #include "iceberg/util/projection_util_internal.h" #include "iceberg/util/string_util.h" +#include "iceberg/util/temporal_util.h" #include "iceberg/util/transform_util.h" namespace iceberg { diff --git a/src/iceberg/util/math_util_internal.h b/src/iceberg/util/math_util_internal.h new file mode 100644 index 000000000..62d82a281 --- /dev/null +++ b/src/iceberg/util/math_util_internal.h @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/result.h" +#include "iceberg/util/int128.h" + +namespace iceberg::internal { + +inline constexpr int64_t FloorDiv(int64_t dividend, int64_t divisor) { + const auto quotient = dividend / divisor; + if ((dividend ^ divisor) < 0 && quotient * divisor != dividend) { + return quotient - 1; + } + return quotient; +} + +inline Result MultiplyExact(int64_t lhs, int64_t rhs) { + const auto result = static_cast(lhs) * static_cast(rhs); + if (result > std::numeric_limits::max() || + result < std::numeric_limits::min()) [[unlikely]] { + return InvalidArgument("Long overflow when multiplying {} by {}", lhs, rhs); + } + return static_cast(result); +} + +} // namespace iceberg::internal diff --git a/src/iceberg/util/temporal_util.cc b/src/iceberg/util/temporal_util.cc index b91fcec77..caaeb9156 100644 --- a/src/iceberg/util/temporal_util.cc +++ b/src/iceberg/util/temporal_util.cc @@ -26,6 +26,9 @@ #include "iceberg/expression/literal.h" #include "iceberg/util/int128.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/math_util_internal.h" +#include "iceberg/util/string_util.h" namespace iceberg { @@ -33,30 +36,136 @@ namespace { using namespace std::chrono; // NOLINT -constexpr int64_t kNanosPerMicro = 1000; +/// Parse a timezone offset of the form "+HH:mm" or "-HH:mm" and return the +/// offset in microseconds (positive for east of UTC, negative for west). 
+Result ParseTimezoneOffset(std::string_view offset) { + if (offset.size() != 6 || (offset[0] != '+' && offset[0] != '-') || offset[3] != ':') { + return InvalidArgument("Invalid timezone offset: '{}'", offset); + } + bool negative = offset[0] == '-'; + ICEBERG_ASSIGN_OR_RAISE(auto hours, + StringUtils::ParseNumber(offset.substr(1, 2))); + ICEBERG_ASSIGN_OR_RAISE(auto minutes, + StringUtils::ParseNumber(offset.substr(4, 2))); + if (hours > 18 || minutes > 59) [[unlikely]] { + return InvalidArgument("Invalid timezone offset: '{}'", offset); + } + + if (hours == 18 && minutes != 0) [[unlikely]] { + return InvalidArgument("Timezone offset '{}' not in range [-18:00, +18:00]", offset); + } -constexpr auto kEpochYmd = year{1970} / January / 1; -constexpr auto kEpochDays = sys_days(kEpochYmd); + auto micros = hours * internal::kSecondsPerHour * internal::kMicrosPerSecond + + minutes * internal::kSecondsPerMinute * internal::kMicrosPerSecond; + return negative ? -micros : micros; +} + +Result> ParseTimestampWithZoneSuffix( + std::string_view str) { + if (str.empty()) [[unlikely]] { + return InvalidArgument("Invalid timestamptz string: '{}'", str); + } -inline constexpr int64_t FloorDiv(int64_t dividend, int64_t divisor) { - const auto quotient = dividend / divisor; - if ((dividend ^ divisor) < 0 && quotient * divisor != dividend) { - return quotient - 1; + int64_t offset_micros = 0; + std::string_view timestamp_part; + + if (str.back() == 'Z') { + timestamp_part = str.substr(0, str.size() - 1); + } else if (str.size() >= 6 && + (str[str.size() - 6] == '+' || str[str.size() - 6] == '-')) { + ICEBERG_ASSIGN_OR_RAISE(offset_micros, + ParseTimezoneOffset(str.substr(str.size() - 6))); + timestamp_part = str.substr(0, str.size() - 6); + } else { + return InvalidArgument("Invalid timestamptz string (missing timezone suffix): '{}'", + str); } - return quotient; + + return std::make_pair(timestamp_part, offset_micros); } -Result MultiplyExact(int64_t lhs, int64_t rhs) { - const 
auto result = static_cast(lhs) * static_cast(rhs); - if (result > std::numeric_limits::max() || - result < std::numeric_limits::min()) [[unlikely]] { - return InvalidArgument("Long overflow when multiplying {} by {}", lhs, rhs); +Result TimestampFromDayTime(int32_t days, int64_t time_units, + int64_t units_per_day, int64_t offset_micros, + int64_t units_per_micro) { + const auto offset_units = + static_cast(offset_micros) * static_cast(units_per_micro); + const auto timestamp = + static_cast(days) * static_cast(units_per_day) + + static_cast(time_units) - offset_units; + + if (timestamp > std::numeric_limits::max() || + timestamp < std::numeric_limits::min()) [[unlikely]] { + return InvalidArgument("Timestamp value is out of int64 range"); } - return static_cast(result); + + return static_cast(timestamp); +} + +/// Parse fractional seconds (after '.') and return micros. +/// Digits beyond 6 are truncated. +Result ParseFractionalMicros(std::string_view frac) { + if (frac.empty() || frac.size() > 9) [[unlikely]] { + return InvalidArgument("Invalid fractional seconds: '{}'", frac); + } + if (frac.size() > 6) frac = frac.substr(0, 6); + ICEBERG_ASSIGN_OR_RAISE(auto val, StringUtils::ParseNumber(frac)); + for (size_t i = frac.size(); i < 6; ++i) { + val *= 10; + } + return static_cast(val); +} + +/// Parse fractional seconds (after '.') and return nanos. 
+Result ParseFractionalNanos(std::string_view frac) { + if (frac.empty() || frac.size() > 9) [[unlikely]] { + return InvalidArgument("Invalid fractional seconds: '{}'", frac); + } + ICEBERG_ASSIGN_OR_RAISE(auto val, StringUtils::ParseNumber(frac)); + for (size_t i = frac.size(); i < 9; ++i) { + val *= 10; + } + return static_cast(val); +} + +template +Result ParseTimeWithFraction(std::string_view str, int64_t units_per_second, + TimeScaleParser&& parse_fraction) { + if (str.size() < 5 || str[2] != ':') [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + + ICEBERG_ASSIGN_OR_RAISE(auto hours, + StringUtils::ParseNumber(str.substr(0, 2))); + ICEBERG_ASSIGN_OR_RAISE(auto minutes, + StringUtils::ParseNumber(str.substr(3, 2))); + int64_t seconds = 0; + + int64_t frac_units = 0; + if (str.size() > 5) { + if (str[5] != ':' || str.size() < 8) [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + ICEBERG_ASSIGN_OR_RAISE(seconds, StringUtils::ParseNumber(str.substr(6, 2))); + if (str.size() > 8) { + if (str[8] != '.') [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + ICEBERG_ASSIGN_OR_RAISE(frac_units, parse_fraction(str.substr(9))); + } + } + + if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59 || seconds < 0 || + seconds > 59) [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + + return hours * internal::kSecondsPerHour * units_per_second + + minutes * internal::kSecondsPerMinute * units_per_second + + seconds * units_per_second + frac_units; } inline constexpr year_month_day DateToYmd(int32_t days_since_epoch) { - return {kEpochDays + days{days_since_epoch}}; + return {internal::kEpochDays + days{days_since_epoch}}; } inline constexpr year_month_day TimestampToYmd(int64_t micros_since_epoch) { @@ -86,7 +195,7 @@ inline constexpr int32_t TimestampNsToDuration(int64_t nanos_since_epoch) { } inline constexpr int32_t MonthsSinceEpoch(const year_month_day& 
ymd) { - auto delta = ymd.year() - kEpochYmd.year(); + auto delta = ymd.year() - internal::kEpochYmd.year(); // Calculate the month as months from 1970-01 // Note: January is month 1, so we subtract 1 to get zero-based month count. return static_cast(delta.count() * 12 + static_cast(ymd.month()) - @@ -102,21 +211,21 @@ template <> Result ExtractYearImpl(const Literal& literal) { auto value = std::get(literal.value()); auto ymd = DateToYmd(value); - return Literal::Int((ymd.year() - kEpochYmd.year()).count()); + return Literal::Int((ymd.year() - internal::kEpochYmd.year()).count()); } template <> Result ExtractYearImpl(const Literal& literal) { auto value = std::get(literal.value()); auto ymd = TimestampToYmd(value); - return Literal::Int((ymd.year() - kEpochYmd.year()).count()); + return Literal::Int((ymd.year() - internal::kEpochYmd.year()).count()); } template <> Result ExtractYearImpl(const Literal& literal) { auto value = std::get(literal.value()); auto ymd = TimestampNsToYmd(value); - return Literal::Int((ymd.year() - kEpochYmd.year()).count()); + return Literal::Int((ymd.year() - internal::kEpochYmd.year()).count()); } template <> @@ -227,11 +336,112 @@ Result ExtractHourImpl(const Literal& literal) } // namespace int64_t TemporalUtils::NanosToMicros(int64_t nanos) { - return FloorDiv(nanos, kNanosPerMicro); + return internal::FloorDiv(nanos, internal::kNanosPerMicro); } Result TemporalUtils::MicrosToNanos(int64_t micros) { - return MultiplyExact(micros, kNanosPerMicro); + return internal::MultiplyExact(micros, internal::kNanosPerMicro); +} + +Result TemporalUtils::ParseDay(std::string_view str) { + auto dash1 = str.find('-', (!str.empty() && (str[0] == '-' || str[0] == '+')) ? 
1 : 0); + auto dash2 = str.find('-', dash1 + 1); + if (str.size() < 10 || dash1 == std::string_view::npos || + dash2 == std::string_view::npos) [[unlikely]] { + return InvalidArgument("Invalid date string: '{}'", str); + } + auto year_str = str.substr(0, dash1); + if (!year_str.empty() && year_str[0] == '+') { + year_str = year_str.substr(1); + } + ICEBERG_ASSIGN_OR_RAISE(auto year_value, StringUtils::ParseNumber(year_str)); + ICEBERG_ASSIGN_OR_RAISE(auto month_value, StringUtils::ParseNumber(str.substr( + dash1 + 1, dash2 - dash1 - 1))); + ICEBERG_ASSIGN_OR_RAISE(auto day_value, + StringUtils::ParseNumber(str.substr(dash2 + 1))); + + auto ymd = std::chrono::year{year_value} / + std::chrono::month{static_cast(month_value)} / + std::chrono::day{static_cast(day_value)}; + if (!ymd.ok()) [[unlikely]] { + return InvalidArgument("Invalid date: '{}'", str); + } + + auto days_since_epoch = std::chrono::sys_days{ymd} - internal::kEpochDays; + return static_cast(days_since_epoch.count()); +} + +Result TemporalUtils::ParseTime(std::string_view str) { + return ParseTimeWithFraction(str, internal::kMicrosPerSecond, ParseFractionalMicros); +} + +Result TemporalUtils::ParseTimeNs(std::string_view str) { + return ParseTimeWithFraction(str, internal::kNanosPerSecond, ParseFractionalNanos); +} + +Result TemporalUtils::ParseTimestamp(std::string_view str) { + auto t_pos = str.find('T'); + if (t_pos == std::string_view::npos) [[unlikely]] { + return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", str); + } + + ICEBERG_ASSIGN_OR_RAISE(auto days_since_epoch, ParseDay(str.substr(0, t_pos))); + ICEBERG_ASSIGN_OR_RAISE(auto time_micros, ParseTime(str.substr(t_pos + 1))); + + return TimestampFromDayTime(days_since_epoch, time_micros, internal::kMicrosPerDay, + /*offset_micros=*/0, /*units_per_micro=*/1); +} + +Result TemporalUtils::ParseTimestampNs(std::string_view str) { + auto t_pos = str.find('T'); + if (t_pos == std::string_view::npos) [[unlikely]] { + return 
InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", str); + } + + ICEBERG_ASSIGN_OR_RAISE(auto days_since_epoch, ParseDay(str.substr(0, t_pos))); + ICEBERG_ASSIGN_OR_RAISE(auto time_nanos, ParseTimeNs(str.substr(t_pos + 1))); + + return TimestampFromDayTime(days_since_epoch, time_nanos, internal::kNanosPerDay, + /*offset_micros=*/0, + /*units_per_micro=*/internal::kNanosPerMicro); +} + +Result TemporalUtils::ParseTimestampWithZone(std::string_view str) { + ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str)); + const auto [timestamp_part, offset_micros] = timestamp_with_offset; + + auto t_pos = timestamp_part.find('T'); + if (t_pos == std::string_view::npos) [[unlikely]] { + return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", + timestamp_part); + } + + ICEBERG_ASSIGN_OR_RAISE(auto days_since_epoch, + ParseDay(timestamp_part.substr(0, t_pos))); + ICEBERG_ASSIGN_OR_RAISE(auto time_micros, ParseTime(timestamp_part.substr(t_pos + 1))); + + return TimestampFromDayTime(days_since_epoch, time_micros, internal::kMicrosPerDay, + offset_micros, + /*units_per_micro=*/1); +} + +Result TemporalUtils::ParseTimestampNsWithZone(std::string_view str) { + ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str)); + const auto [timestamp_part, offset_micros] = timestamp_with_offset; + + auto t_pos = timestamp_part.find('T'); + if (t_pos == std::string_view::npos) [[unlikely]] { + return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", + timestamp_part); + } + + ICEBERG_ASSIGN_OR_RAISE(auto days_since_epoch, + ParseDay(timestamp_part.substr(0, t_pos))); + ICEBERG_ASSIGN_OR_RAISE(auto time_nanos, ParseTimeNs(timestamp_part.substr(t_pos + 1))); + + return TimestampFromDayTime(days_since_epoch, time_nanos, internal::kNanosPerDay, + offset_micros, + /*units_per_micro=*/internal::kNanosPerMicro); } #define DISPATCH_EXTRACT_YEAR(type_id) \ diff --git 
a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h index 414e4fd20..2121f565d 100644 --- a/src/iceberg/util/temporal_util.h +++ b/src/iceberg/util/temporal_util.h @@ -19,12 +19,34 @@ #pragma once +#include #include +#include #include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" +namespace iceberg::internal { + +inline constexpr int64_t kNanosPerMicro = 1000; +inline constexpr int64_t kMicrosPerMilli = 1000; +inline constexpr int64_t kMicrosPerSecond = 1000 * kMicrosPerMilli; +inline constexpr int64_t kSecondsPerMinute = 60; +inline constexpr int64_t kMinutesPerHour = 60; +inline constexpr int64_t kHoursPerDay = 24; +inline constexpr int64_t kSecondsPerHour = kMinutesPerHour * kSecondsPerMinute; +inline constexpr int64_t kSecondsPerDay = kHoursPerDay * kSecondsPerHour; +inline constexpr int64_t kMicrosPerDay = kSecondsPerDay * kMicrosPerSecond; +inline constexpr int64_t kNanosPerMilli = kMicrosPerMilli * kNanosPerMicro; +inline constexpr int64_t kNanosPerSecond = kMicrosPerSecond * kNanosPerMicro; +inline constexpr int64_t kNanosPerDay = kMicrosPerDay * kNanosPerMicro; + +inline constexpr auto kEpochYmd = std::chrono::year{1970} / std::chrono::January / 1; +inline constexpr auto kEpochDays = std::chrono::sys_days{kEpochYmd}; + +} // namespace iceberg::internal + namespace iceberg { class ICEBERG_EXPORT TemporalUtils { @@ -35,6 +57,74 @@ class ICEBERG_EXPORT TemporalUtils { /// \brief Convert microseconds since epoch to nanoseconds, failing on overflow. static Result MicrosToNanos(int64_t micros); + /// \brief Parses a date string in "[+-]yyyy-MM-dd" format into days since epoch. + /// + /// Supports an optional '+' or '-' prefix for extended years beyond 9999. + /// + /// \param str The date string to parse. + /// \return The number of days since 1970-01-01, or an error. + static Result ParseDay(std::string_view str); + + /// \brief Parses a time string into microseconds from midnight. 
+ /// + /// Accepts ISO-8601 local time formats: "HH:mm", "HH:mm:ss", or + /// "HH:mm:ss.f" where the fractional part can be 1-9 digits. + /// Digits beyond 6 (microsecond precision) are truncated. + /// + /// \param str The time string to parse. + /// \return The number of microseconds from midnight, or an error. + static Result ParseTime(std::string_view str); + + /// \brief Parses a time string into nanoseconds from midnight. + /// + /// Accepts ISO-8601 local time formats: "HH:mm", "HH:mm:ss", or + /// "HH:mm:ss.f" where the fractional part can be 1-9 digits. + /// All supplied digits are kept; more than 9 digits is an error. + /// + /// \param str The time string to parse. + /// \return The number of nanoseconds from midnight, or an error. + static Result ParseTimeNs(std::string_view str); + + /// \brief Parses a timestamp string into microseconds since epoch. + /// + /// Accepts ISO-8601 local date-time formats: "yyyy-MM-ddTHH:mm", + /// "yyyy-MM-ddTHH:mm:ss", or "yyyy-MM-ddTHH:mm:ss.f" where the + /// fractional part can be 1-9 digits (truncated to microseconds). + /// + /// \param str The timestamp string to parse. + /// \return The number of microseconds since epoch, or an error. + static Result ParseTimestamp(std::string_view str); + + /// \brief Parses a timestamp string into nanoseconds since epoch. + /// + /// Accepts ISO-8601 local date-time formats: "yyyy-MM-ddTHH:mm", + /// "yyyy-MM-ddTHH:mm:ss", or "yyyy-MM-ddTHH:mm:ss.f" where the + /// fractional part can be 1-9 digits. + /// + /// \param str The timestamp string to parse. + /// \return The number of nanoseconds since epoch, or an error. + static Result ParseTimestampNs(std::string_view str); + + /// \brief Parses a timestamp-with-zone string into microseconds since epoch (UTC). + /// + /// Accepts the same formats as ParseTimestamp, with a timezone suffix: + /// "Z", "+HH:mm", or "-HH:mm". Non-UTC offsets are converted to UTC. + /// The seconds and fractional parts are optional (e.g.
"yyyy-MM-ddTHH:mm+00:00"). + /// + /// \param str The timestamp string to parse. + /// \return The number of microseconds since epoch (UTC), or an error. + static Result ParseTimestampWithZone(std::string_view str); + + /// \brief Parses a timestamp-with-zone string into nanoseconds since epoch (UTC). + /// + /// Accepts the same formats as ParseTimestampNs, with a timezone suffix: + /// "Z", "+HH:mm", or "-HH:mm". Non-UTC offsets are converted to UTC. + /// The seconds and fractional parts are optional (e.g. "yyyy-MM-ddTHH:mm+00:00"). + /// + /// \param str The timestamp string to parse. + /// \return The number of nanoseconds since epoch (UTC), or an error. + static Result ParseTimestampNsWithZone(std::string_view str); + /// \brief Extract a date or timestamp year, as years from 1970 static Result ExtractYear(const Literal& literal); diff --git a/src/iceberg/util/transform_util.cc b/src/iceberg/util/transform_util.cc index fc1b104e5..d12449382 100644 --- a/src/iceberg/util/transform_util.cc +++ b/src/iceberg/util/transform_util.cc @@ -21,166 +21,24 @@ #include #include -#include +#include -#include "iceberg/util/int128.h" -#include "iceberg/util/macros.h" -#include "iceberg/util/string_util.h" +#include "iceberg/util/temporal_util.h" namespace iceberg { -namespace { -constexpr auto kEpochDate = std::chrono::year{1970} / std::chrono::January / 1; -constexpr int64_t kMicrosPerMillis = 1'000; -constexpr int64_t kMicrosPerSecond = 1'000'000; -constexpr int64_t kMicrosPerDay = 86'400'000'000LL; -constexpr int64_t kNanosPerMillis = 1'000'000; -constexpr int64_t kNanosPerSecond = 1'000'000'000; -constexpr int64_t kNanosPerDay = 86'400'000'000'000LL; - -/// Parse a timezone offset of the form "+HH:mm" or "-HH:mm" and return the -/// offset in microseconds (positive for east of UTC, negative for west). 
-Result ParseTimezoneOffset(std::string_view offset) { - if (offset.size() != 6 || (offset[0] != '+' && offset[0] != '-') || offset[3] != ':') { - return InvalidArgument("Invalid timezone offset: '{}'", offset); - } - bool negative = offset[0] == '-'; - ICEBERG_ASSIGN_OR_RAISE(auto hours, - StringUtils::ParseNumber(offset.substr(1, 2))); - ICEBERG_ASSIGN_OR_RAISE(auto minutes, - StringUtils::ParseNumber(offset.substr(4, 2))); - if (hours > 18 || minutes > 59) [[unlikely]] { - return InvalidArgument("Invalid timezone offset: '{}'", offset); - } - - if (hours == 18 && minutes != 0) [[unlikely]] { - return InvalidArgument("Timezone offset '{}' not in range [-18:00, +18:00]", offset); - } - - auto micros = hours * 3'600 * kMicrosPerSecond + minutes * 60 * kMicrosPerSecond; - return negative ? -micros : micros; -} - -Result> ParseTimestampWithZoneSuffix( - std::string_view str) { - if (str.empty()) [[unlikely]] { - return InvalidArgument("Invalid timestamptz string: '{}'", str); - } - - int64_t offset_micros = 0; - std::string_view timestamp_part; - - if (str.back() == 'Z') { - timestamp_part = str.substr(0, str.size() - 1); - } else if (str.size() >= 6 && - (str[str.size() - 6] == '+' || str[str.size() - 6] == '-')) { - // Parse "+HH:mm" or "-HH:mm" offset suffix - ICEBERG_ASSIGN_OR_RAISE(offset_micros, - ParseTimezoneOffset(str.substr(str.size() - 6))); - timestamp_part = str.substr(0, str.size() - 6); - } else { - return InvalidArgument("Invalid timestamptz string (missing timezone suffix): '{}'", - str); - } - - return std::make_pair(timestamp_part, offset_micros); -} - -Result TimestampFromDayTime(int32_t days, int64_t time_units, - int64_t units_per_day, int64_t offset_micros, - int64_t units_per_micro) { - const auto offset_units = - static_cast(offset_micros) * static_cast(units_per_micro); - const auto timestamp = - static_cast(days) * static_cast(units_per_day) + - static_cast(time_units) - offset_units; - - if (timestamp > std::numeric_limits::max() || - 
timestamp < std::numeric_limits::min()) [[unlikely]] { - return InvalidArgument("Timestamp value is out of int64 range"); - } - - return static_cast(timestamp); -} - -/// Parse fractional seconds (after '.') and return micros. -/// Digits beyond 6 are truncated (nanosecond precision). -Result ParseFractionalMicros(std::string_view frac) { - if (frac.empty() || frac.size() > 9) [[unlikely]] { - return InvalidArgument("Invalid fractional seconds: '{}'", frac); - } - // Truncate to microsecond precision (6 digits), matching Java ISO_LOCAL_TIME behavior - if (frac.size() > 6) frac = frac.substr(0, 6); - ICEBERG_ASSIGN_OR_RAISE(auto val, StringUtils::ParseNumber(frac)); - // Right-pad to 6 digits: "500" -> 500000, "001" -> 1000, "000001" -> 1000 - for (size_t i = frac.size(); i < 6; ++i) { - val *= 10; - } - return static_cast(val); -} - -/// Parse fractional seconds (after '.') and return nanos. -Result ParseFractionalNanos(std::string_view frac) { - if (frac.empty() || frac.size() > 9) [[unlikely]] { - return InvalidArgument("Invalid fractional seconds: '{}'", frac); - } - ICEBERG_ASSIGN_OR_RAISE(auto val, StringUtils::ParseNumber(frac)); - // Right-pad to 9 digits: "500" -> 500000000, "001" -> 1000000, "000001" -> 1000 - for (size_t i = frac.size(); i < 9; ++i) { - val *= 10; - } - return static_cast(val); -} - -template -Result ParseTimeWithFraction(std::string_view str, int64_t units_per_second, - TimeScaleParser&& parse_fraction) { - if (str.size() < 5 || str[2] != ':') [[unlikely]] { - return InvalidArgument("Invalid time string: '{}'", str); - } - - ICEBERG_ASSIGN_OR_RAISE(auto hours, - StringUtils::ParseNumber(str.substr(0, 2))); - ICEBERG_ASSIGN_OR_RAISE(auto minutes, - StringUtils::ParseNumber(str.substr(3, 2))); - int64_t seconds = 0; - - int64_t frac_units = 0; - if (str.size() > 5) { - if (str[5] != ':' || str.size() < 8) [[unlikely]] { - return InvalidArgument("Invalid time string: '{}'", str); - } - ICEBERG_ASSIGN_OR_RAISE(seconds, 
StringUtils::ParseNumber(str.substr(6, 2))); - if (str.size() > 8) { - if (str[8] != '.') [[unlikely]] { - return InvalidArgument("Invalid time string: '{}'", str); - } - ICEBERG_ASSIGN_OR_RAISE(frac_units, parse_fraction(str.substr(9))); - } - } - - if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59 || seconds < 0 || - seconds > 59) [[unlikely]] { - return InvalidArgument("Invalid time string: '{}'", str); - } - - return hours * 3'600 * units_per_second + minutes * 60 * units_per_second + - seconds * units_per_second + frac_units; -} -} // namespace - std::string TransformUtil::HumanYear(int32_t year_ordinal) { - auto y = kEpochDate + std::chrono::years{year_ordinal}; + auto y = internal::kEpochYmd + std::chrono::years{year_ordinal}; return std::format("{:%Y}", y); } std::string TransformUtil::HumanMonth(int32_t month_ordinal) { - auto ym = kEpochDate + std::chrono::months(month_ordinal); + auto ym = internal::kEpochYmd + std::chrono::months(month_ordinal); return std::format("{:%Y-%m}", ym); } std::string TransformUtil::HumanDay(int32_t day_ordinal) { - auto ymd = std::chrono::sys_days{kEpochDate} + std::chrono::days{day_ordinal}; + auto ymd = internal::kEpochDays + std::chrono::days{day_ordinal}; return std::format("{:%F}", ymd); } @@ -192,14 +50,14 @@ std::string TransformUtil::HumanHour(int32_t hour_ordinal) { std::string TransformUtil::HumanTime(int64_t micros_from_midnight) { std::chrono::hh_mm_ss hms{ - std::chrono::seconds{micros_from_midnight / kMicrosPerSecond}}; - auto micros = micros_from_midnight % kMicrosPerSecond; + std::chrono::seconds{micros_from_midnight / internal::kMicrosPerSecond}}; + auto micros = micros_from_midnight % internal::kMicrosPerSecond; if (micros == 0 && hms.seconds().count() == 0) { return std::format("{:%R}", hms); } else if (micros == 0) { return std::format("{:%T}", hms); - } else if (micros % kMicrosPerMillis == 0) { - return std::format("{:%T}.{:03d}", hms, micros / kMicrosPerMillis); + } else if (micros % 
internal::kMicrosPerMilli == 0) { + return std::format("{:%T}.{:03d}", hms, micros / internal::kMicrosPerMilli); } else { return std::format("{:%T}.{:06d}", hms, micros); } @@ -216,8 +74,8 @@ std::string TransformUtil::HumanTimestamp(int64_t timestamp_micros) { .count(); if (micros == 0) { return std::format("{:%FT%T}", tp); - } else if (micros % kMicrosPerMillis == 0) { - return std::format("{:%FT%T}.{:03d}", tp, micros / kMicrosPerMillis); + } else if (micros % internal::kMicrosPerMilli == 0) { + return std::format("{:%FT%T}.{:03d}", tp, micros / internal::kMicrosPerMilli); } else { return std::format("{:%FT%T}.{:06d}", tp, micros); } @@ -234,10 +92,10 @@ std::string TransformUtil::HumanTimestampNs(int64_t timestamp_nanos) { .count(); if (nanos == 0) { return std::format("{:%FT%T}", tp); - } else if (nanos % kNanosPerMillis == 0) { - return std::format("{:%FT%T}.{:03d}", tp, nanos / kNanosPerMillis); - } else if (nanos % kMicrosPerMillis == 0) { - return std::format("{:%FT%T}.{:06d}", tp, nanos / kMicrosPerMillis); + } else if (nanos % internal::kNanosPerMilli == 0) { + return std::format("{:%FT%T}.{:03d}", tp, nanos / internal::kNanosPerMilli); + } else if (nanos % internal::kNanosPerMicro == 0) { + return std::format("{:%FT%T}.{:06d}", tp, nanos / internal::kNanosPerMicro); } else { return std::format("{:%FT%T}.{:09d}", tp, nanos); } @@ -254,8 +112,8 @@ std::string TransformUtil::HumanTimestampWithZone(int64_t timestamp_micros) { .count(); if (micros == 0) { return std::format("{:%FT%T}+00:00", tp); - } else if (micros % kMicrosPerMillis == 0) { - return std::format("{:%FT%T}.{:03d}+00:00", tp, micros / kMicrosPerMillis); + } else if (micros % internal::kMicrosPerMilli == 0) { + return std::format("{:%FT%T}.{:03d}+00:00", tp, micros / internal::kMicrosPerMilli); } else { return std::format("{:%FT%T}.{:06d}+00:00", tp, micros); } @@ -272,113 +130,15 @@ std::string TransformUtil::HumanTimestampNsWithZone(int64_t timestamp_nanos) { .count(); if (nanos == 0) { 
return std::format("{:%FT%T}+00:00", tp); - } else if (nanos % kNanosPerMillis == 0) { - return std::format("{:%FT%T}.{:03d}+00:00", tp, nanos / kNanosPerMillis); - } else if (nanos % kMicrosPerMillis == 0) { - return std::format("{:%FT%T}.{:06d}+00:00", tp, nanos / kMicrosPerMillis); + } else if (nanos % internal::kNanosPerMilli == 0) { + return std::format("{:%FT%T}.{:03d}+00:00", tp, nanos / internal::kNanosPerMilli); + } else if (nanos % internal::kNanosPerMicro == 0) { + return std::format("{:%FT%T}.{:06d}+00:00", tp, nanos / internal::kNanosPerMicro); } else { return std::format("{:%FT%T}.{:09d}+00:00", tp, nanos); } } -Result TransformUtil::ParseDay(std::string_view str) { - // Expected format: "[+-]yyyy-MM-dd" - // Parse year, month, day manually, skipping leading '+' or '-' to find first date dash - auto dash1 = str.find('-', (!str.empty() && (str[0] == '-' || str[0] == '+')) ? 1 : 0); - auto dash2 = str.find('-', dash1 + 1); - if (str.size() < 10 || dash1 == std::string_view::npos || - dash2 == std::string_view::npos) [[unlikely]] { - return InvalidArgument("Invalid date string: '{}'", str); - } - auto year_str = str.substr(0, dash1); - // std::from_chars does not accept '+' prefix, strip it for positive extended years - if (!year_str.empty() && year_str[0] == '+') { - year_str = year_str.substr(1); - } - ICEBERG_ASSIGN_OR_RAISE(auto year, StringUtils::ParseNumber(year_str)); - ICEBERG_ASSIGN_OR_RAISE(auto month, StringUtils::ParseNumber( - str.substr(dash1 + 1, dash2 - dash1 - 1))); - ICEBERG_ASSIGN_OR_RAISE(auto day, - StringUtils::ParseNumber(str.substr(dash2 + 1))); - - auto ymd = std::chrono::year{year} / std::chrono::month{static_cast(month)} / - std::chrono::day{static_cast(day)}; - if (!ymd.ok()) [[unlikely]] { - return InvalidArgument("Invalid date: '{}'", str); - } - - auto days = std::chrono::sys_days{ymd} - std::chrono::sys_days{kEpochDate}; - return static_cast(days.count()); -} - -Result TransformUtil::ParseTime(std::string_view str) { - 
return ParseTimeWithFraction(str, kMicrosPerSecond, ParseFractionalMicros); -} - -Result TransformUtil::ParseTimeNs(std::string_view str) { - return ParseTimeWithFraction(str, kNanosPerSecond, ParseFractionalNanos); -} - -Result TransformUtil::ParseTimestamp(std::string_view str) { - auto t_pos = str.find('T'); - if (t_pos == std::string_view::npos) [[unlikely]] { - return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", str); - } - - ICEBERG_ASSIGN_OR_RAISE(auto days, ParseDay(str.substr(0, t_pos))); - ICEBERG_ASSIGN_OR_RAISE(auto time_micros, ParseTime(str.substr(t_pos + 1))); - - return TimestampFromDayTime(days, time_micros, kMicrosPerDay, /*offset_micros=*/0, - /*units_per_micro=*/1); -} - -Result TransformUtil::ParseTimestampNs(std::string_view str) { - auto t_pos = str.find('T'); - if (t_pos == std::string_view::npos) [[unlikely]] { - return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", str); - } - - ICEBERG_ASSIGN_OR_RAISE(auto days, ParseDay(str.substr(0, t_pos))); - ICEBERG_ASSIGN_OR_RAISE(auto time_nanos, ParseTimeNs(str.substr(t_pos + 1))); - - return TimestampFromDayTime(days, time_nanos, kNanosPerDay, /*offset_micros=*/0, - /*units_per_micro=*/1'000); -} - -Result TransformUtil::ParseTimestampWithZone(std::string_view str) { - ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str)); - const auto [timestamp_part, offset_micros] = timestamp_with_offset; - - auto t_pos = timestamp_part.find('T'); - if (t_pos == std::string_view::npos) [[unlikely]] { - return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", - timestamp_part); - } - - ICEBERG_ASSIGN_OR_RAISE(auto days, ParseDay(timestamp_part.substr(0, t_pos))); - ICEBERG_ASSIGN_OR_RAISE(auto time_micros, ParseTime(timestamp_part.substr(t_pos + 1))); - - return TimestampFromDayTime(days, time_micros, kMicrosPerDay, offset_micros, - /*units_per_micro=*/1); -} - -Result TransformUtil::ParseTimestampNsWithZone(std::string_view str) 
{ - ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str)); - const auto [timestamp_part, offset_micros] = timestamp_with_offset; - - auto t_pos = timestamp_part.find('T'); - if (t_pos == std::string_view::npos) [[unlikely]] { - return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", - timestamp_part); - } - - ICEBERG_ASSIGN_OR_RAISE(auto days, ParseDay(timestamp_part.substr(0, t_pos))); - ICEBERG_ASSIGN_OR_RAISE(auto time_nanos, ParseTimeNs(timestamp_part.substr(t_pos + 1))); - - return TimestampFromDayTime(days, time_nanos, kNanosPerDay, offset_micros, - /*units_per_micro=*/1'000); -} - std::string TransformUtil::Base64Encode(std::string_view str_to_encode) { static constexpr std::string_view kBase64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; diff --git a/src/iceberg/util/transform_util.h b/src/iceberg/util/transform_util.h index b9c692098..2fbcb3e55 100644 --- a/src/iceberg/util/transform_util.h +++ b/src/iceberg/util/transform_util.h @@ -19,11 +19,11 @@ #pragma once +#include #include +#include #include "iceberg/iceberg_export.h" -#include "iceberg/result.h" -#include "iceberg/type_fwd.h" namespace iceberg { @@ -127,74 +127,6 @@ class ICEBERG_EXPORT TransformUtil { /// \return a string representation of this timestamp. static std::string HumanTimestampNsWithZone(int64_t timestamp_nanos); - /// \brief Parses a date string in "[+-]yyyy-MM-dd" format into days since epoch. - /// - /// Supports an optional '+' or '-' prefix for extended years beyond 9999. - /// - /// \param str The date string to parse. - /// \return The number of days since 1970-01-01, or an error. - static Result ParseDay(std::string_view str); - - /// \brief Parses a time string into microseconds from midnight. - /// - /// Accepts ISO-8601 local time formats: "HH:mm", "HH:mm:ss", or - /// "HH:mm:ss.f" where the fractional part can be 1-9 digits. - /// Digits beyond 6 (microsecond precision) are truncated. 
- /// - /// \param str The time string to parse. - /// \return The number of microseconds from midnight, or an error. - static Result ParseTime(std::string_view str); - - /// \brief Parses a time string into nanoseconds from midnight. - /// - /// Accepts ISO-8601 local time formats: "HH:mm", "HH:mm:ss", or - /// "HH:mm:ss.f" where the fractional part can be 1-9 digits. - /// Digits beyond 9 (nanosecond precision) are truncated. - /// - /// \param str The time string to parse. - /// \return The number of nanoseconds from midnight, or an error. - static Result ParseTimeNs(std::string_view str); - - /// \brief Parses a timestamp string into microseconds since epoch. - /// - /// Accepts ISO-8601 local date-time formats: "yyyy-MM-ddTHH:mm", - /// "yyyy-MM-ddTHH:mm:ss", or "yyyy-MM-ddTHH:mm:ss.f" where the - /// fractional part can be 1-9 digits (truncated to microseconds). - /// - /// \param str The timestamp string to parse. - /// \return The number of microseconds since epoch, or an error. - static Result ParseTimestamp(std::string_view str); - - /// \brief Parses a timestamp string into nanoseconds since epoch. - /// - /// Accepts ISO-8601 local date-time formats: "yyyy-MM-ddTHH:mm", - /// "yyyy-MM-ddTHH:mm:ss", or "yyyy-MM-ddTHH:mm:ss.f" where the - /// fractional part can be 1-9 digits. - /// - /// \param str The timestamp string to parse. - /// \return The number of nanoseconds since epoch, or an error. - static Result ParseTimestampNs(std::string_view str); - - /// \brief Parses a timestamp-with-zone string into microseconds since epoch (UTC). - /// - /// Accepts the same formats as ParseTimestamp, with a timezone suffix: - /// "Z", "+HH:mm", or "-HH:mm". Non-UTC offsets are converted to UTC. - /// The seconds and fractional parts are optional (e.g. "yyyy-MM-ddTHH:mm+00:00"). - /// - /// \param str The timestamp string to parse. - /// \return The number of microseconds since epoch (UTC), or an error. 
- static Result ParseTimestampWithZone(std::string_view str); - - /// \brief Parses a timestamp-with-zone string into nanoseconds since epoch (UTC). - /// - /// Accepts the same formats as ParseTimestampNs, with a timezone suffix: - /// "Z", "+HH:mm", or "-HH:mm". Non-UTC offsets are converted to UTC. - /// The seconds and fractional parts are optional (e.g. "yyyy-MM-ddTHH:mm+00:00"). - /// - /// \param str The timestamp string to parse. - /// \return The number of nanoseconds since epoch (UTC), or an error. - static Result ParseTimestampNsWithZone(std::string_view str); - /// \brief Base64 encode a string static std::string Base64Encode(std::string_view str_to_encode); }; From 2005956285f05664cf982f574bd2120d344e060c Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 25 May 2026 15:37:48 +0800 Subject: [PATCH 2/2] resolve review comment --- src/iceberg/test/math_util_internal_test.cc | 4 ++-- src/iceberg/util/math_util_internal.h | 4 ++-- src/iceberg/util/temporal_util.cc | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/iceberg/test/math_util_internal_test.cc b/src/iceberg/test/math_util_internal_test.cc index 682c786e2..ac9ef36cb 100644 --- a/src/iceberg/test/math_util_internal_test.cc +++ b/src/iceberg/test/math_util_internal_test.cc @@ -25,7 +25,7 @@ #include "iceberg/test/matchers.h" -namespace iceberg::internal { +namespace iceberg { TEST(MathUtilInternalTest, FloorDiv) { EXPECT_EQ(0, FloorDiv(0, 1000)); @@ -53,4 +53,4 @@ TEST(MathUtilInternalTest, MultiplyExact) { IsError(ErrorKind::kInvalidArgument)); } -} // namespace iceberg::internal +} // namespace iceberg diff --git a/src/iceberg/util/math_util_internal.h b/src/iceberg/util/math_util_internal.h index 62d82a281..20d8ef898 100644 --- a/src/iceberg/util/math_util_internal.h +++ b/src/iceberg/util/math_util_internal.h @@ -25,7 +25,7 @@ #include "iceberg/result.h" #include "iceberg/util/int128.h" -namespace iceberg::internal { +namespace iceberg { inline constexpr int64_t 
FloorDiv(int64_t dividend, int64_t divisor) { const auto quotient = dividend / divisor; @@ -44,4 +44,4 @@ inline Result MultiplyExact(int64_t lhs, int64_t rhs) { return static_cast(result); } -} // namespace iceberg::internal +} // namespace iceberg diff --git a/src/iceberg/util/temporal_util.cc b/src/iceberg/util/temporal_util.cc index caaeb9156..e00ee7cf5 100644 --- a/src/iceberg/util/temporal_util.cc +++ b/src/iceberg/util/temporal_util.cc @@ -336,11 +336,11 @@ Result ExtractHourImpl(const Literal& literal) } // namespace int64_t TemporalUtils::NanosToMicros(int64_t nanos) { - return internal::FloorDiv(nanos, internal::kNanosPerMicro); + return FloorDiv(nanos, internal::kNanosPerMicro); } Result TemporalUtils::MicrosToNanos(int64_t micros) { - return internal::MultiplyExact(micros, internal::kNanosPerMicro); + return MultiplyExact(micros, internal::kNanosPerMicro); } Result TemporalUtils::ParseDay(std::string_view str) {