2023-08-07 16:48:48 +00:00
|
|
|
import boto3
|
2023-10-10 15:44:00 +00:00
|
|
|
import json
|
2023-08-07 16:48:48 +00:00
|
|
|
import pytest
|
|
|
|
|
2023-10-10 15:44:00 +00:00
|
|
|
from . import s3_aws_verified
|
2023-08-07 16:48:48 +00:00
|
|
|
|
2023-03-21 16:55:19 +00:00
|
|
|
|
2023-05-25 16:37:45 +00:00
|
|
|
# Flat JSON document that includes a null value; uploaded as "simple.json".
SIMPLE_JSON = {
    "a1": "b1",
    "a2": "b2",
    "a3": None,
}

# Second flat document; combined with SIMPLE_JSON in SIMPLE_LIST.
SIMPLE_JSON2 = {
    "a1": "b2",
    "a3": "b3",
}

# Document mixing a nested object, lists and booleans; uploaded as "nested.json".
NESTED_JSON = {
    "a1": {"b1": "b2"},
    "a2": [True, False],
    "a3": True,
    "a4": [1, 5],
}

# Top-level list holding one object with nested staff records; uploaded as
# "extensive.json". Note that "kids" is a list for the first staff member and
# a single object for the second.
EXTENSIVE_JSON = [
    {
        "staff": [
            {
                "name": "Janelyn M",
                "city": "Chicago",
                "kids": [
                    {"Name": "Josh"},
                    {"Name": "Jay"},
                ],
            },
            {
                "name": "Stacy P",
                "city": "Seattle",
                "kids": {"Name": "Josh"},
            },
        ],
        "country": "USA",
    }
]

# Top-level list of the two flat documents; uploaded as "list.json".
SIMPLE_LIST = [
    SIMPLE_JSON,
    SIMPLE_JSON2,
]

# Four comma-separated lines with no trailing newline; uploaded under the key
# "simple_csv".
SIMPLE_CSV = "\n".join(
    (
        "a,b,c",
        "e,r,f",
        "y,u,i",
        "q,w,y",
    )
)
|
|
|
|
|
|
|
|
|
2023-10-10 15:44:00 +00:00
|
|
|
def create_test_files(bucket_name):
    """Upload the shared fixture objects into ``bucket_name``.

    Five objects are written through a boto3 S3 client created for
    ``us-east-1``: ``simple.json``, ``list.json``, ``simple_csv``,
    ``extensive.json`` and ``nested.json``. The JSON fixtures are serialised
    with ``json.dumps``; the CSV fixture is uploaded as-is.
    """
    s3 = boto3.client("s3", "us-east-1")

    # (object key, object body) pairs, in upload order.
    fixtures = (
        ("simple.json", json.dumps(SIMPLE_JSON)),
        ("list.json", json.dumps(SIMPLE_LIST)),
        ("simple_csv", SIMPLE_CSV),
        ("extensive.json", json.dumps(EXTENSIVE_JSON)),
        ("nested.json", json.dumps(NESTED_JSON)),
    )
    for object_key, object_body in fixtures:
        s3.put_object(Bucket=bucket_name, Key=object_key, Body=object_body)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
def test_query_all(bucket_name=None):
    """``SELECT *`` on simple.json yields the record, a Stats event and an End event."""
    s3 = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)

    query = {
        "Bucket": bucket_name,
        "Key": "simple.json",
        "Expression": "SELECT * FROM S3Object",
        "ExpressionType": "SQL",
        "InputSerialization": {"JSON": {"Type": "DOCUMENT"}},
        "OutputSerialization": {"JSON": {"RecordDelimiter": ","}},
    }
    response = s3.select_object_content(**query)
    events = list(response["Payload"])

    expected_record = {"Records": {"Payload": b'{"a1":"b1","a2":"b2","a3":null},'}}
    assert expected_record in events

    # Drop the trailing record delimiter; json.loads raises if what remains
    # is not valid JSON.
    first_payload = events[0]["Records"]["Payload"]
    json.loads(first_payload[0:-1].decode("utf-8"))

    # The event stream must also carry a Stats event with the byte counters.
    stats_events = [event for event in events if "Stats" in event]
    details = stats_events[0]["Stats"]["Details"]
    for counter in ("BytesScanned", "BytesProcessed", "BytesReturned"):
        assert counter in details

    assert {"End": {}} in events
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
def test_count_function(bucket_name=None):
    """``count(*)`` on simple.json is returned under the ``_1`` column with value 1."""
    s3 = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)

    query = {
        "Bucket": bucket_name,
        "Key": "simple.json",
        "Expression": "SELECT count(*) FROM S3Object",
        "ExpressionType": "SQL",
        "InputSerialization": {"JSON": {"Type": "DOCUMENT"}},
        "OutputSerialization": {"JSON": {"RecordDelimiter": ","}},
    }
    response = s3.select_object_content(**query)
    events = list(response["Payload"])

    expected_record = {"Records": {"Payload": b'{"_1":1},'}}
    assert expected_record in events
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
# pytest reads the xfail explanation from `reason`; the previous `message`
# keyword is not part of the xfail signature, so the text was never reported.
@pytest.mark.xfail(reason="Not yet implement in our parser")
def test_count_as(bucket_name=None):
    """``count(*) as cnt`` on simple.json should come back under the ``cnt`` alias.

    Marked as an expected failure: per the xfail reason, the alias is not yet
    handled by the parser.
    """
    client = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)
    content = client.select_object_content(
        Bucket=bucket_name,
        Key="simple.json",
        Expression="SELECT count(*) as cnt FROM S3Object",
        ExpressionType="SQL",
        InputSerialization={"JSON": {"Type": "DOCUMENT"}},
        OutputSerialization={"JSON": {"RecordDelimiter": ","}},
    )
    result = list(content["Payload"])
    # The aliased column name replaces the default "_1" in the output record.
    assert {"Records": {"Payload": b'{"cnt":1},'}} in result
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
# pytest reads the xfail explanation from `reason`; the previous `message`
# keyword is not part of the xfail signature, so the text was never reported.
@pytest.mark.xfail(reason="Not yet implement in our parser")
def test_count_list_as(bucket_name=None):
    """``count(*) as cnt`` on list.json should come back under the ``cnt`` alias.

    Marked as an expected failure: per the xfail reason, the alias is not yet
    handled by the parser.
    """
    client = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)
    content = client.select_object_content(
        Bucket=bucket_name,
        Key="list.json",
        Expression="SELECT count(*) as cnt FROM S3Object",
        ExpressionType="SQL",
        InputSerialization={"JSON": {"Type": "DOCUMENT"}},
        OutputSerialization={"JSON": {"RecordDelimiter": ","}},
    )
    result = list(content["Payload"])
    # The document is a top-level list; the test expects it to count as one record.
    assert {"Records": {"Payload": b'{"cnt":1},'}} in result
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
def test_count_csv(bucket_name=None):
    """``count(*)`` on the CSV fixture, read with ``FileHeaderInfo=USE``, yields 3.

    The fixture has four lines; the expected count of 3 is returned under the
    ``_1`` column.
    """
    s3 = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)

    query = {
        "Bucket": bucket_name,
        "Key": "simple_csv",
        "Expression": "SELECT count(*) FROM S3Object",
        "ExpressionType": "SQL",
        "InputSerialization": {
            "CSV": {"FileHeaderInfo": "USE", "FieldDelimiter": ","}
        },
        "OutputSerialization": {"JSON": {"RecordDelimiter": ","}},
    }
    response = s3.select_object_content(**query)
    events = list(response["Payload"])

    expected_record = {"Records": {"Payload": b'{"_1":3},'}}
    assert expected_record in events
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
def test_default_record_delimiter(bucket_name=None):
    """Without an explicit ``RecordDelimiter`` the JSON record ends in a newline."""
    s3 = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)

    query = {
        "Bucket": bucket_name,
        "Key": "simple_csv",
        "Expression": "SELECT count(*) FROM S3Object",
        "ExpressionType": "SQL",
        "InputSerialization": {
            "CSV": {"FileHeaderInfo": "USE", "FieldDelimiter": ","}
        },
        # RecordDelimiter is not specified - should default to new line (\n)
        "OutputSerialization": {"JSON": {}},
    }
    response = s3.select_object_content(**query)
    events = list(response["Payload"])

    expected_record = {"Records": {"Payload": b'{"_1":3}\n'}}
    assert expected_record in events
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
def test_extensive_json__select_list(bucket_name=None):
    """Selecting ``s3object[*].staff[*]`` from extensive.json yields an empty record."""
    s3 = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)

    query = {
        "Bucket": bucket_name,
        "Key": "extensive.json",
        "Expression": "select * from s3object[*].staff[*] s",
        "ExpressionType": "SQL",
        "InputSerialization": {"JSON": {"Type": "DOCUMENT"}},
        "OutputSerialization": {"JSON": {"RecordDelimiter": ","}},
    }
    response = s3.select_object_content(**query)
    events = list(response["Payload"])

    # The expected payload is one empty object followed by the delimiter.
    expected_record = {"Records": {"Payload": b"{},"}}
    assert expected_record in events
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
def test_extensive_json__select_all(bucket_name=None):
    """``select *`` on extensive.json returns the whole list wrapped under ``_1``."""
    s3 = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)

    query = {
        "Bucket": bucket_name,
        "Key": "extensive.json",
        "Expression": "select * from s3object s",
        "ExpressionType": "SQL",
        "InputSerialization": {"JSON": {"Type": "DOCUMENT"}},
        "OutputSerialization": {"JSON": {"RecordDelimiter": ","}},
    }
    response = s3.select_object_content(**query)
    events = list(response["Payload"])

    # Decode the payload of the first Records event in the stream.
    record_events = [event for event in events if "Records" in event]
    payload = record_events[0]["Records"]["Payload"].decode("utf-8")

    # For some reason, AWS returns records with a comma at the end
    assert payload[-1] == ","

    # Because the original doc is a list, it is returned like this
    document = json.loads(payload[:-1])
    assert document == {"_1": EXTENSIVE_JSON}
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.aws_verified
@s3_aws_verified
def test_nested_json__select_all(bucket_name=None):
    """``select *`` on nested.json returns the document unchanged."""
    s3 = boto3.client("s3", "us-east-1")
    create_test_files(bucket_name)

    query = {
        "Bucket": bucket_name,
        "Key": "nested.json",
        "Expression": "select * from s3object s",
        "ExpressionType": "SQL",
        "InputSerialization": {"JSON": {"Type": "DOCUMENT"}},
        "OutputSerialization": {"JSON": {"RecordDelimiter": ","}},
    }
    response = s3.select_object_content(**query)
    events = list(response["Payload"])

    # Decode the payload of the first Records event in the stream.
    record_events = [event for event in events if "Records" in event]
    payload = record_events[0]["Records"]["Payload"].decode("utf-8")

    # For some reason, AWS returns records with a comma at the end
    assert payload[-1] == ","

    document = json.loads(payload[:-1])
    assert document == NESTED_JSON
|