Glue: support nanosecond-precision timestamp partition filtering (#5915)

This commit is contained in:
Robert Schmidtke 2023-02-10 23:09:56 +01:00 committed by GitHub
parent c6c0e50ee9
commit c9fe32520d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 8 additions and 8 deletions

View File

@@ -173,7 +173,6 @@ class GlueBackend(BaseBackend):
Expression caveats:
- Column names must consist of UPPERCASE, lowercase, dots and underscores only.
- Nanosecond expressions on timestamp columns are rounded to microseconds.
- Literal dates and timestamps must be valid, i.e. no support for February 31st.
- LIKE expressions are converted to Python regexes, escaping special characters.
Only % and _ wildcards are supported, and SQL escaping using [] does not work.

View File

@@ -2,7 +2,7 @@ import abc
import operator
import re
import warnings
from datetime import date, datetime, timedelta
from datetime import date, datetime
from itertools import repeat
from typing import Any, Dict, List, Optional, Union
@@ -74,15 +74,17 @@ def _cast(type_: str, value: Any) -> Union[date, datetime, float, int, str]:
f" {value} is not a timestamp."
)
# use nanosecond representation for timestamps
posix_nanoseconds = int(timestamp.timestamp() * 1_000_000_000)
nanos = match.group("nanos")
if nanos is not None:
# strip leading dot, reverse and left pad with zeros to nanoseconds
nanos = "".join(reversed(nanos[1:])).zfill(9)
for i, nanoseconds in enumerate(nanos):
microseconds = (int(nanoseconds) * 10**i) / 1000
timestamp += timedelta(microseconds=round(microseconds))
posix_nanoseconds += int(nanoseconds) * 10**i
return timestamp
return posix_nanoseconds
raise InvalidInputException("GetPartitions", f"Unknown type : '{type_}'")

View File

@@ -288,11 +288,10 @@ def test_get_partitions_expression_timestamp_column():
"timestamp_col between '2022-01-15 00:00:00' AND '2022-02-15 00:00:00'",
"timestamp_col > '2022-01-15 00:00:00' AND "
"timestamp_col < '2022-02-15 00:00:00'",
# these expressions only work because of rounding to microseconds
"timestamp_col = '2022-01-31 23:59:59.999999999'",
"timestamp_col = '2022-02-01 00:00:00.00000001'",
"timestamp_col > '2022-01-31 23:59:59.999999499' AND"
" timestamp_col < '2022-02-01 00:00:00.0000009'",
"timestamp_col > '2022-01-31 23:59:59.999999999' AND"
" timestamp_col < '2022-02-01 00:00:00.000000001'",
)
for expression in timestamp_col_is_february_expressions: