-
Notifications
You must be signed in to change notification settings - Fork 2k
feat: support nanosecond date_part #20674
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,7 @@ use std::str::FromStr; | |
| use std::sync::Arc; | ||
|
|
||
| use arrow::array::timezone::Tz; | ||
| use arrow::array::{Array, ArrayRef, Float64Array, Int32Array}; | ||
| use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, Int64Array}; | ||
| use arrow::compute::kernels::cast_utils::IntervalUnit; | ||
| use arrow::compute::{DatePart, binary, date_part}; | ||
| use arrow::datatypes::DataType::{ | ||
|
|
@@ -167,6 +167,9 @@ impl ScalarUDFImpl for DatePartFunc { | |
| .map(|part| { | ||
| if is_epoch(part) { | ||
| Field::new(self.name(), DataType::Float64, nullable) | ||
| } else if is_nanosecond(part) { | ||
| // See notes on [seconds_ns] for rationale | ||
| Field::new(self.name(), DataType::Int64, nullable) | ||
| } else { | ||
| Field::new(self.name(), DataType::Int32, nullable) | ||
| } | ||
|
|
@@ -218,7 +221,7 @@ impl ScalarUDFImpl for DatePartFunc { | |
| IntervalUnit::Second => seconds_as_i32(array.as_ref(), Second)?, | ||
| IntervalUnit::Millisecond => seconds_as_i32(array.as_ref(), Millisecond)?, | ||
| IntervalUnit::Microsecond => seconds_as_i32(array.as_ref(), Microsecond)?, | ||
| IntervalUnit::Nanosecond => seconds_as_i32(array.as_ref(), Nanosecond)?, | ||
| IntervalUnit::Nanosecond => seconds_ns(array.as_ref())?, | ||
| // century and decade are not supported by `DatePart`, although they are supported in postgres | ||
| _ => return exec_err!("Date part '{part}' not supported"), | ||
| } | ||
|
|
@@ -321,6 +324,12 @@ fn is_epoch(part: &str) -> bool { | |
| matches!(part.to_lowercase().as_str(), "epoch") | ||
| } | ||
|
|
||
| fn is_nanosecond(part: &str) -> bool { | ||
| IntervalUnit::from_str(part_normalization(part)) | ||
| .map(|p| matches!(p, IntervalUnit::Nanosecond)) | ||
| .unwrap_or(false) | ||
| } | ||
|
|
||
| fn date_to_scalar(date: NaiveDate, target_type: &DataType) -> Option<ScalarValue> { | ||
| Some(match target_type { | ||
| Date32 => ScalarValue::Date32(Some(Date32Type::from_naive_date(date))), | ||
|
|
@@ -517,3 +526,39 @@ fn epoch(array: &dyn Array) -> Result<ArrayRef> { | |
| }; | ||
| Ok(Arc::new(f)) | ||
| } | ||
|
|
||
| /// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the | ||
| /// result to a total number of nanoseconds as an Int64 array. | ||
| /// | ||
| /// This returns an Int64 rather than Int32 because there are 1 billion | ||
| /// `nanosecond`s in each second, so representing up to 60 seconds as | ||
| /// nanoseconds can be values up to 60 billion, which does not fit in Int32. | ||
| fn seconds_ns(array: &dyn Array) -> Result<ArrayRef> { | ||
| let secs = date_part(array, DatePart::Second)?; | ||
| // This assumes array is primitive and not a dictionary | ||
| let secs = as_int32_array(secs.as_ref())?; | ||
| let subsecs = date_part(array, DatePart::Nanosecond)?; | ||
| let subsecs = as_int32_array(subsecs.as_ref())?; | ||
|
|
||
| // Special case where there are no nulls. | ||
| if subsecs.null_count() == 0 { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be safer to check secs.nulls too? It does feel like any nulls in subsecs would also be in secs, and vice versa.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I re-reviewed the code and since they both come from calling arrow |
||
| let r: Int64Array = binary(secs, subsecs, |secs, subsecs| { | ||
| (secs as i64) * 1_000_000_000 + (subsecs as i64) | ||
| })?; | ||
| Ok(Arc::new(r)) | ||
| } else { | ||
| // Nulls in secs are preserved, nulls in subsecs are treated as zero to account for the case | ||
| // where the number of nanoseconds overflows. | ||
| let r: Int64Array = secs | ||
| .iter() | ||
| .zip(subsecs) | ||
| .map(|(secs, subsecs)| { | ||
| secs.map(|secs| { | ||
| let subsecs = subsecs.unwrap_or(0); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't see any test coverage of nulls 🤔 -- maybe we can add some |
||
| (secs as i64) * 1_000_000_000 + (subsecs as i64) | ||
| }) | ||
| }) | ||
| .collect(); | ||
| Ok(Arc::new(r)) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -212,6 +212,22 @@ SELECT date_part('microsecond', ts_nano_no_tz), date_part('microsecond', ts_nano | |
| 123456 123456 123456 123000 123000 123000 | ||
| 123456 123456 123456 123000 123000 123000 | ||
|
|
||
| # date_part (nanosecond) | ||
| query IIIIII | ||
| SELECT date_part('nanosecond', ts_nano_no_tz), date_part('nanosecond', ts_nano_utc), date_part('nanosecond', ts_nano_eastern), date_part('nanosecond', ts_milli_no_tz), date_part('nanosecond', ts_milli_utc), date_part('nanosecond', ts_milli_eastern) FROM source_ts; | ||
| ---- | ||
| 0 0 0 0 0 0 | ||
| 0 0 0 0 0 0 | ||
| 0 0 0 0 0 0 | ||
| 0 0 0 0 0 0 | ||
| 0 0 0 0 0 0 | ||
| 0 0 0 0 0 0 | ||
| 0 0 0 0 0 0 | ||
| 30000000000 30000000000 30000000000 30000000000 30000000000 30000000000 | ||
| 123000000 123000000 123000000 123000000 123000000 123000000 | ||
| 123456000 123456000 123456000 123000000 123000000 123000000 | ||
| 123456789 123456789 123456789 123000000 123000000 123000000 | ||
|
|
||
| ### Cleanup | ||
| statement ok | ||
| drop table source_ts; | ||
|
|
@@ -433,6 +449,12 @@ SELECT arrow_typeof(date_part('minute', to_timestamp('2020-09-08T12:12:00+00:00' | |
| ---- | ||
| Int32 | ||
|
|
||
| # nanosecond can exceed Int32 and returns Int64 | ||
| query T | ||
| SELECT arrow_typeof(date_part('nanosecond', to_timestamp('2020-09-08T12:12:00+00:00'))) | ||
| ---- | ||
| Int64 | ||
|
|
||
| query I | ||
| SELECT EXTRACT(second FROM timestamp '2020-09-08T12:00:12.12345678+00:00') | ||
| ---- | ||
|
|
@@ -448,8 +470,11 @@ SELECT EXTRACT(microsecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00') | |
| ---- | ||
| 12123456 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| # note the output is more than Int32 can store | ||
| query I | ||
| SELECT EXTRACT(nanosecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00') | ||
| ---- | ||
| 12123456780 | ||
|
|
||
| query I | ||
| SELECT EXTRACT("second" FROM timestamp '2020-09-08T12:00:12.12345678+00:00') | ||
|
|
@@ -466,8 +491,10 @@ SELECT EXTRACT("microsecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00' | |
| ---- | ||
| 12123456 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
| SELECT EXTRACT("nanosecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00') | ||
| ---- | ||
| 12123456780 | ||
|
|
||
| query I | ||
| SELECT EXTRACT('second' FROM timestamp '2020-09-08T12:00:12.12345678+00:00') | ||
|
|
@@ -484,9 +511,10 @@ SELECT EXTRACT('microsecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00' | |
| ---- | ||
| 12123456 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice |
||
| SELECT EXTRACT('nanosecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00') | ||
|
|
||
| ---- | ||
| 12123456780 | ||
|
|
||
| # Keep precision when coercing Utf8 to Timestamp | ||
| query I | ||
|
|
@@ -504,9 +532,10 @@ SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') | |
| ---- | ||
| 12123456 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
| SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') | ||
|
|
||
| ---- | ||
| 12123456780 | ||
|
|
||
| query I | ||
| SELECT date_part('second', '2020-09-08T12:00:12.12345678+00:00') | ||
|
|
@@ -523,8 +552,30 @@ SELECT date_part('microsecond', '2020-09-08T12:00:12.12345678+00:00') | |
| ---- | ||
| 12123456 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
| SELECT date_part('nanosecond', '2020-09-08T12:00:12.12345678+00:00') | ||
| ---- | ||
| 12123456780 | ||
|
|
||
| query I | ||
| SELECT EXTRACT(nanosecond FROM ts) | ||
| FROM (VALUES | ||
| (timestamp '2020-09-08T12:00:12.12345678+00:00'), | ||
| (NULL::timestamp) | ||
| ) AS t(ts) | ||
| ---- | ||
| 12123456780 | ||
| NULL | ||
|
|
||
| query I | ||
| SELECT date_part('nanosecond', ts) | ||
| FROM (VALUES | ||
| (timestamp '2020-09-08T12:00:12.12345678+00:00'), | ||
| (NULL::timestamp) | ||
| ) AS t(ts) | ||
| ---- | ||
| 12123456780 | ||
| NULL | ||
|
|
||
| # test_date_part_time | ||
|
|
||
|
|
@@ -579,8 +630,10 @@ SELECT extract(microsecond from arrow_cast('23:32:50'::time, 'Time32(Second)')) | |
| ---- | ||
| 50000000 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
| SELECT extract(nanosecond from arrow_cast('23:32:50'::time, 'Time32(Second)')) | ||
| ---- | ||
| 50000000000 | ||
|
|
||
| query R | ||
| SELECT date_part('epoch', arrow_cast('23:32:50'::time, 'Time32(Second)')) | ||
|
|
@@ -643,8 +696,10 @@ SELECT extract(microsecond from arrow_cast('23:32:50.123'::time, 'Time32(Millise | |
| ---- | ||
| 50123000 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
| SELECT extract(nanosecond from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) | ||
| ---- | ||
| 50123000000 | ||
|
|
||
| query R | ||
| SELECT date_part('epoch', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) | ||
|
|
@@ -707,8 +762,10 @@ SELECT extract(microsecond from arrow_cast('23:32:50.123456'::time, 'Time64(Micr | |
| ---- | ||
| 50123456 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
| SELECT extract(nanosecond from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) | ||
| ---- | ||
| 50123456000 | ||
|
|
||
| query R | ||
| SELECT date_part('epoch', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) | ||
|
|
@@ -797,8 +854,10 @@ SELECT extract(us from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond | |
| ---- | ||
| 50123456 | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query I | ||
| SELECT date_part('nanosecond', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) | ||
| ---- | ||
| 50123456789 | ||
|
|
||
| query R | ||
| SELECT date_part('epoch', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) | ||
|
|
@@ -1172,8 +1231,11 @@ SELECT (date_part('microsecond', now()) = EXTRACT(microsecond FROM now())) | |
| ---- | ||
| true | ||
|
|
||
| query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported | ||
| query B | ||
| SELECT (date_part('nanosecond', now()) = EXTRACT(nanosecond FROM now())) | ||
| ---- | ||
| true | ||
|
|
||
|
|
||
| query I | ||
| SELECT date_part('ISODOW', CAST('2000-01-01' AS DATE)) | ||
|
|
@@ -1211,7 +1273,7 @@ query D | |
| select c from t1 where extract(year from c) <> 2024; | ||
| ---- | ||
| 1990-01-01 | ||
| 2030-01-01 | ||
| 2030-01-01 | ||
|
|
||
| query D | ||
| select c from t1 where extract(year from c) > 2024; | ||
|
|
@@ -1495,7 +1557,7 @@ query TT | |
| explain select c from t1 where extract (nanosecond from c) = 2024 | ||
| ---- | ||
| logical_plan | ||
| 01)Filter: date_part(Utf8("NANOSECOND"), t1.c) = Int32(2024) | ||
| 01)Filter: date_part(Utf8("NANOSECOND"), t1.c) = Int64(2024) | ||
| 02)--TableScan: t1 projection=[c] | ||
| physical_plan | ||
| 01)FilterExec: date_part(NANOSECOND, c@0) = 2024 | ||
|
|
@@ -1582,7 +1644,7 @@ query D | |
| select c1_date32 from t2 where extract(year from c1_date32) <> 2024; | ||
| ---- | ||
| 1990-05-20 | ||
| 2030-12-31 | ||
| 2030-12-31 | ||
|
|
||
| query P | ||
| select c2_ts_sec from t2 where extract(year from c2_ts_sec) > 2024; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it worth adding a note that this is required because nanosecond needs a different return type?