moto/tests/test_s3/test_s3_select.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

360 lines
12 KiB
Python
Raw Normal View History

import bz2
import gzip
import json
import boto3
import pytest
from botocore.exceptions import ClientError
from . import s3_aws_verified
2023-05-25 16:37:45 +00:00
SIMPLE_JSON = {"a1": "b1", "a2": "b2", "a3": None}
SIMPLE_JSON2 = {"a1": "b2", "a3": "b3"}
NESTED_JSON = {"a1": {"b1": "b2"}, "a2": [True, False], "a3": True, "a4": [1, 5]}
EXTENSIVE_JSON = [
{
"staff": [
{
"name": "Janelyn M",
"city": "Chicago",
"kids": [{"Name": "Josh"}, {"Name": "Jay"}],
},
{"name": "Stacy P", "city": "Seattle", "kids": {"Name": "Josh"}},
],
"country": "USA",
}
]
SIMPLE_LIST = [SIMPLE_JSON, SIMPLE_JSON2]
SIMPLE_CSV = """a,b,c
e,r,f
y,u,i
q,w,y"""
def create_test_files(bucket_name):
client = boto3.client("s3", "us-east-1")
client.put_object(
Bucket=bucket_name, Key="simple.json", Body=json.dumps(SIMPLE_JSON)
)
client.put_object(Bucket=bucket_name, Key="list.json", Body=json.dumps(SIMPLE_LIST))
client.put_object(Bucket=bucket_name, Key="simple_csv", Body=SIMPLE_CSV)
client.put_object(
Bucket=bucket_name,
Key="extensive.json",
Body=json.dumps(EXTENSIVE_JSON),
)
client.put_object(
Bucket=bucket_name,
Key="nested.json",
Body=json.dumps(NESTED_JSON),
)
client.put_object(
Bucket=bucket_name,
Key="json.gzip",
Body=gzip.compress(json.dumps(NESTED_JSON).encode("utf-8")),
)
client.put_object(
Bucket=bucket_name,
Key="json.bz2",
Body=bz2.compress(json.dumps(NESTED_JSON).encode("utf-8")),
)
client.put_object(
Bucket=bucket_name,
Key="csv.gzip",
Body=gzip.compress(SIMPLE_CSV.encode("utf-8")),
)
client.put_object(
Bucket=bucket_name, Key="csv.bz2", Body=bz2.compress(SIMPLE_CSV.encode("utf-8"))
)
@pytest.mark.aws_verified
@s3_aws_verified
def test_query_all(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="simple.json",
Expression="SELECT * FROM S3Object",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"a1":"b1","a2":"b2","a3":null},'}} in result
# Verify result is valid JSON
json.loads(result[0]["Records"]["Payload"][0:-1].decode("utf-8"))
# Verify result contains metadata
stats = [res for res in result if "Stats" in res][0]["Stats"]
assert "BytesScanned" in stats["Details"]
assert "BytesProcessed" in stats["Details"]
assert "BytesReturned" in stats["Details"]
assert {"End": {}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_count_function(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="simple.json",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"_1":1},'}} in result
@pytest.mark.aws_verified
@s3_aws_verified
@pytest.mark.xfail(message="Not yet implement in our parser")
def test_count_as(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="simple.json",
Expression="SELECT count(*) as cnt FROM S3Object",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"cnt":1},'}} in result
@pytest.mark.aws_verified
@s3_aws_verified
@pytest.mark.xfail(message="Not yet implement in our parser")
def test_count_list_as(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="list.json",
Expression="SELECT count(*) as cnt FROM S3Object",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"cnt":1},'}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_count_csv(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="simple_csv",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"CSV": {"FileHeaderInfo": "USE", "FieldDelimiter": ","}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"_1":3},'}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_default_record_delimiter(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="simple_csv",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"CSV": {"FileHeaderInfo": "USE", "FieldDelimiter": ","}},
# RecordDelimiter is not specified - should default to new line (\n)
OutputSerialization={"JSON": {}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"_1":3}\n'}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_extensive_json__select_list(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="extensive.json",
Expression="select * from s3object[*].staff[*] s",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b"{},"}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_extensive_json__select_all(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="extensive.json",
Expression="select * from s3object s",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
records = [res for res in result if "Records" in res][0]["Records"][
"Payload"
].decode("utf-8")
# For some reason, AWS returns records with a comma at the end
assert records[-1] == ","
# Because the original doc is a list, it is returned like this
assert json.loads(records[:-1]) == {"_1": EXTENSIVE_JSON}
@pytest.mark.aws_verified
@s3_aws_verified
def test_nested_json__select_all(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="nested.json",
Expression="select * from s3object s",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
records = [res for res in result if "Records" in res][0]["Records"][
"Payload"
].decode("utf-8")
# For some reason, AWS returns records with a comma at the end
assert records[-1] == ","
assert json.loads(records[:-1]) == NESTED_JSON
@pytest.mark.aws_verified
@s3_aws_verified
def test_gzipped_json(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="json.gzip",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}, "CompressionType": "GZIP"},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"_1":1},'}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_bzipped_json(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
content = client.select_object_content(
Bucket=bucket_name,
Key="json.bz2",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"JSON": {"Type": "DOCUMENT"}, "CompressionType": "BZIP2"},
OutputSerialization={"JSON": {"RecordDelimiter": ","}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b'{"_1":1},'}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_bzipped_csv_to_csv(bucket_name=None):
client = boto3.client("s3", "us-east-1")
create_test_files(bucket_name)
# Count Records
content = client.select_object_content(
Bucket=bucket_name,
Key="csv.bz2",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"CSV": {}, "CompressionType": "BZIP2"},
OutputSerialization={"CSV": {"RecordDelimiter": "_", "FieldDelimiter": ":"}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b"4_"}} in result
# Count Records
content = client.select_object_content(
Bucket=bucket_name,
Key="csv.bz2",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"CSV": {}, "CompressionType": "BZIP2"},
OutputSerialization={"CSV": {}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b"4\n"}} in result
# Mirror records
content = client.select_object_content(
Bucket=bucket_name,
Key="csv.bz2",
Expression="SELECT * FROM S3Object",
ExpressionType="SQL",
InputSerialization={"CSV": {}, "CompressionType": "BZIP2"},
OutputSerialization={"CSV": {}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b"a,b,c\ne,r,f\ny,u,i\nq,w,y\n"}} in result
# Mirror records, specifying output format
content = client.select_object_content(
Bucket=bucket_name,
Key="csv.bz2",
Expression="SELECT * FROM S3Object",
ExpressionType="SQL",
InputSerialization={"CSV": {}, "CompressionType": "BZIP2"},
OutputSerialization={"CSV": {"RecordDelimiter": "\n", "FieldDelimiter": ":"}},
)
result = list(content["Payload"])
assert {"Records": {"Payload": b"a:b:c\ne:r:f\ny:u:i\nq:w:y\n"}} in result
@pytest.mark.aws_verified
@s3_aws_verified
def test_select_unknown_key(bucket_name=None):
client = boto3.client("s3", "us-east-1")
with pytest.raises(ClientError) as exc:
client.select_object_content(
Bucket=bucket_name,
Key="unknown",
Expression="SELECT count(*) FROM S3Object",
ExpressionType="SQL",
InputSerialization={"CSV": {}, "CompressionType": "BZIP2"},
OutputSerialization={
"CSV": {"RecordDelimiter": "\n", "FieldDelimiter": ":"}
},
)
err = exc.value.response["Error"]
assert err["Code"] == "NoSuchKey"
assert err["Message"] == "The specified key does not exist."
assert err["Key"] == "unknown"