Glue: support nanosecond-precision timestamp partition filtering (#5915)

This commit is contained in:
Robert Schmidtke 2023-02-10 23:09:56 +01:00 committed by GitHub
parent c6c0e50ee9
commit c9fe32520d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 8 additions and 8 deletions

View File

@@ -173,7 +173,6 @@ class GlueBackend(BaseBackend):
Expression caveats: Expression caveats:
- Column names must consist of UPPERCASE, lowercase, dots and underscores only. - Column names must consist of UPPERCASE, lowercase, dots and underscores only.
- Nanosecond expressions on timestamp columns are rounded to microseconds.
- Literal dates and timestamps must be valid, i.e. no support for February 31st. - Literal dates and timestamps must be valid, i.e. no support for February 31st.
- LIKE expressions are converted to Python regexes, escaping special characters. - LIKE expressions are converted to Python regexes, escaping special characters.
Only % and _ wildcards are supported, and SQL escaping using [] does not work. Only % and _ wildcards are supported, and SQL escaping using [] does not work.

View File

@@ -2,7 +2,7 @@ import abc
import operator import operator
import re import re
import warnings import warnings
from datetime import date, datetime, timedelta from datetime import date, datetime
from itertools import repeat from itertools import repeat
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
@@ -74,15 +74,17 @@ def _cast(type_: str, value: Any) -> Union[date, datetime, float, int, str]:
f" {value} is not a timestamp." f" {value} is not a timestamp."
) )
# use nanosecond representation for timestamps
posix_nanoseconds = int(timestamp.timestamp() * 1_000_000_000)
nanos = match.group("nanos") nanos = match.group("nanos")
if nanos is not None: if nanos is not None:
# strip leading dot, reverse and left pad with zeros to nanoseconds # strip leading dot, reverse and left pad with zeros to nanoseconds
nanos = "".join(reversed(nanos[1:])).zfill(9) nanos = "".join(reversed(nanos[1:])).zfill(9)
for i, nanoseconds in enumerate(nanos): for i, nanoseconds in enumerate(nanos):
microseconds = (int(nanoseconds) * 10**i) / 1000 posix_nanoseconds += int(nanoseconds) * 10**i
timestamp += timedelta(microseconds=round(microseconds))
return timestamp return posix_nanoseconds
raise InvalidInputException("GetPartitions", f"Unknown type : '{type_}'") raise InvalidInputException("GetPartitions", f"Unknown type : '{type_}'")

View File

@@ -288,11 +288,10 @@ def test_get_partitions_expression_timestamp_column():
"timestamp_col between '2022-01-15 00:00:00' AND '2022-02-15 00:00:00'", "timestamp_col between '2022-01-15 00:00:00' AND '2022-02-15 00:00:00'",
"timestamp_col > '2022-01-15 00:00:00' AND " "timestamp_col > '2022-01-15 00:00:00' AND "
"timestamp_col < '2022-02-15 00:00:00'", "timestamp_col < '2022-02-15 00:00:00'",
# these expressions only work because of rounding to microseconds
"timestamp_col = '2022-01-31 23:59:59.999999999'",
"timestamp_col = '2022-02-01 00:00:00.00000001'",
"timestamp_col > '2022-01-31 23:59:59.999999499' AND" "timestamp_col > '2022-01-31 23:59:59.999999499' AND"
" timestamp_col < '2022-02-01 00:00:00.0000009'", " timestamp_col < '2022-02-01 00:00:00.0000009'",
"timestamp_col > '2022-01-31 23:59:59.999999999' AND"
" timestamp_col < '2022-02-01 00:00:00.000000001'",
) )
for expression in timestamp_col_is_february_expressions: for expression in timestamp_col_is_february_expressions: