S3: select_object_content() now takes RecordDelimiter into account (#6618)

This commit is contained in:
Bert Blommers 2023-08-09 10:49:01 +00:00 committed by GitHub
parent db87597018
commit db0bec1418
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 23 additions and 6 deletions

View File

@ -2522,7 +2522,6 @@ class S3Backend(BaseBackend, CloudWatchMetricProvider):
key_name: str,
select_query: str,
input_details: Dict[str, Any],
output_details: Dict[str, Any], # pylint: disable=unused-argument
) -> List[bytes]:
"""
Highly experimental. Please raise an issue if you find any inconsistencies/bugs.
@ -2531,7 +2530,7 @@ class S3Backend(BaseBackend, CloudWatchMetricProvider):
- Function aliases (count(*) as cnt)
- Most functions (only count() is supported)
- Result is always in JSON
- FieldDelimiters and RecordDelimiters are ignored
- FieldDelimiters are ignored
"""
self.get_bucket(bucket_name)
key = self.get_object(bucket_name, key_name)

View File

@ -2288,9 +2288,9 @@ class S3Response(BaseResponse):
input_details = request["InputSerialization"]
output_details = request["OutputSerialization"]
results = self.backend.select_object_content(
bucket_name, key_name, select_query, input_details, output_details
bucket_name, key_name, select_query, input_details
)
return 200, {}, serialize_select(results)
return 200, {}, serialize_select(results, output_details)
else:
raise NotImplementedError(

View File

@ -49,8 +49,11 @@ def _create_end_message() -> bytes:
return _create_message(content_type=None, event_type=b"End", payload=b"")
def serialize_select(data_list: List[bytes], output_details: Dict[str, Any]) -> bytes:
    """
    Serialize the rows of a select result into a single bytes response.

    Every entry in data_list is wrapped in its own data message and is
    terminated by the RecordDelimiter found under output_details["JSON"].
    If no delimiter is configured (missing, None or empty), a newline is used.
    The data messages are followed by the stats message and the end message.

    :param data_list: the already-encoded records to send back
    :param output_details: the OutputSerialization section of the request
    :return: the concatenated messages
    """
    json_details = output_details.get("JSON") or {}
    delimiter = (json_details.get("RecordDelimiter") or "\n").encode("utf-8")
    # Join once instead of growing a bytes object inside a loop,
    # which would copy the accumulated response on every iteration.
    records = b"".join(_create_data_message(data + delimiter) for data in data_list)
    return records + _create_stats_message() + _create_end_message()

View File

@ -133,6 +133,21 @@ class TestS3Select(TestCase):
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"_1":3},'}} in result
def test_default_record_delimiter(self):
    """Records are newline-terminated when the JSON output settings are empty."""
    input_serialization = {"CSV": {"FileHeaderInfo": "USE", "FieldDelimiter": ","}}
    # RecordDelimiter is not specified - should default to new line (\n)
    output_serialization = {"JSON": {}}
    response = self.client.select_object_content(
        Bucket=self.bucket_name,
        Key="simple_csv",
        Expression="SELECT count(*) FROM S3Object",
        ExpressionType="SQL",
        InputSerialization=input_serialization,
        OutputSerialization=output_serialization,
    )
    events = list(response["Payload"])
    expected_record = {"Records": {"Payload": b'{"_1":3}\n'}}
    assert expected_record in events
def test_extensive_json__select_list(self):
content = self.client.select_object_content(
Bucket=self.bucket_name,