Databrew: Add datasets support (#5095)
Parent: 957b3148e0
Commit: babbd21814
IMPLEMENTATION_COVERAGE.md
@@ -1069,30 +1069,30 @@
 ## databrew
 <details>
-<summary>22% implemented</summary>
+<summary>27% implemented</summary>

 - [ ] batch_delete_recipe_version
-- [ ] create_dataset
+- [X] create_dataset
 - [ ] create_profile_job
 - [ ] create_project
 - [X] create_recipe
 - [ ] create_recipe_job
 - [X] create_ruleset
 - [ ] create_schedule
-- [ ] delete_dataset
+- [X] delete_dataset
 - [ ] delete_job
 - [ ] delete_project
 - [X] delete_recipe_version
 - [X] delete_ruleset
 - [ ] delete_schedule
-- [ ] describe_dataset
+- [X] describe_dataset
 - [ ] describe_job
 - [ ] describe_job_run
 - [ ] describe_project
 - [ ] describe_recipe
 - [ ] describe_ruleset
 - [ ] describe_schedule
-- [ ] list_datasets
+- [X] list_datasets
 - [ ] list_job_runs
 - [ ] list_jobs
 - [ ] list_projects
@@ -1108,7 +1108,7 @@
 - [ ] stop_job_run
 - [ ] tag_resource
 - [ ] untag_resource
-- [ ] update_dataset
+- [X] update_dataset
 - [ ] update_profile_job
 - [ ] update_project
 - [X] update_recipe
docs/docs/services/databrew.rst
@@ -26,27 +26,27 @@ databrew
 |start-h3| Implemented features for this service |end-h3|

 - [ ] batch_delete_recipe_version
-- [ ] create_dataset
+- [X] create_dataset
 - [ ] create_profile_job
 - [ ] create_project
 - [X] create_recipe
 - [ ] create_recipe_job
 - [X] create_ruleset
 - [ ] create_schedule
-- [ ] delete_dataset
+- [X] delete_dataset
 - [ ] delete_job
 - [ ] delete_project
 - [X] delete_recipe_version
 - [X] delete_ruleset
 - [ ] delete_schedule
-- [ ] describe_dataset
+- [X] describe_dataset
 - [ ] describe_job
 - [ ] describe_job_run
 - [ ] describe_project
 - [ ] describe_recipe
 - [ ] describe_ruleset
 - [ ] describe_schedule
-- [ ] list_datasets
+- [X] list_datasets
 - [ ] list_job_runs
 - [ ] list_jobs
 - [ ] list_projects
@@ -62,7 +62,7 @@ databrew
 - [ ] stop_job_run
 - [ ] tag_resource
 - [ ] untag_resource
-- [ ] update_dataset
+- [X] update_dataset
 - [ ] update_profile_job
 - [ ] update_project
 - [X] update_recipe
moto/backend_index.py
@@ -1,4 +1,4 @@
-# autogenerated by scripts/update_backend_index.py
+# autogenerated by ./scripts/update_backend_index.py
 import re

 backend_url_patterns = [
moto/databrew/exceptions.py
@@ -42,3 +42,12 @@ class ResourceNotFoundException(DataBrewClientError):
 class RulesetNotFoundException(EntityNotFoundException):
     def __init__(self, recipe_name):
         super().__init__("Ruleset %s not found." % recipe_name)
+
+
+class ServiceQuotaExceededException(JsonRESTError):
+    code = 402
+
+    def __init__(self):
+        super().__init__(
+            "ServiceQuotaExceededException", "A service quota is exceeded."
+        )
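For orientation (not part of the commit): moto surfaces these DataBrew exceptions to boto3 callers as ordinary botocore `ClientError`s carrying the code and message defined above. A minimal sketch, assuming the mock backend is active; the dataset name is made up, and the expected output mirrors the tests at the end of this diff:

```python
import boto3
from botocore.exceptions import ClientError
from moto import mock_databrew


@mock_databrew
def show_resource_not_found():
    client = boto3.client("databrew", region_name="us-east-1")
    try:
        # No dataset with this (made-up) name exists in the mock backend.
        client.describe_dataset(Name="no-such-dataset")
    except ClientError as err:
        # The backend's ResourceNotFoundException arrives as a parsed error.
        print(err.response["Error"]["Code"])     # ResourceNotFoundException
        print(err.response["Error"]["Message"])  # One or more resources can't be found.


show_resource_not_found()
```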
moto/databrew/models.py
@@ -6,12 +6,15 @@ from datetime import datetime
 from moto.core import BaseBackend, BaseModel
 from moto.core.utils import BackendDict
 from moto.utilities.paginator import paginate

 from .exceptions import (
     AlreadyExistsException,
     ConflictException,
-    ResourceNotFoundException,
     ValidationException,
+    RulesetAlreadyExistsException,
+    RulesetNotFoundException,
+    ResourceNotFoundException,
 )
-from .exceptions import RulesetAlreadyExistsException, RulesetNotFoundException


 class DataBrewBackend(BaseBackend):
@@ -34,12 +37,19 @@ class DataBrewBackend(BaseBackend):
             "limit_default": 100,
             "unique_attribute": "name",
         },
+        "list_datasets": {
+            "input_token": "next_token",
+            "limit_key": "max_results",
+            "limit_default": 100,
+            "unique_attribute": "name",
+        },
     }

     def __init__(self, region_name):
         self.region_name = region_name
         self.recipes = OrderedDict()
         self.rulesets = OrderedDict()
+        self.datasets = OrderedDict()

     def reset(self):
         """Re-initialize all attributes for this instance."""
@@ -221,6 +231,74 @@ class DataBrewBackend(BaseBackend):

         del self.rulesets[ruleset_name]

+    def create_dataset(
+        self,
+        dataset_name,
+        dataset_format,
+        dataset_format_options,
+        dataset_input,
+        dataset_path_options,
+        tags,
+    ):
+        if dataset_name in self.datasets:
+            raise AlreadyExistsException(dataset_name)
+
+        dataset = FakeDataset(
+            self.region_name,
+            dataset_name,
+            dataset_format,
+            dataset_format_options,
+            dataset_input,
+            dataset_path_options,
+            tags,
+        )
+        self.datasets[dataset_name] = dataset
+        return dataset
+
+    @paginate(pagination_model=PAGINATION_MODEL)
+    def list_datasets(self):
+        return list(self.datasets.values())
+
+    def update_dataset(
+        self,
+        dataset_name,
+        dataset_format,
+        dataset_format_options,
+        dataset_input,
+        dataset_path_options,
+        tags,
+    ):
+
+        if dataset_name not in self.datasets:
+            raise ResourceNotFoundException("One or more resources can't be found.")
+
+        dataset = self.datasets[dataset_name]
+
+        if dataset_format is not None:
+            dataset.format = dataset_format
+        if dataset_format_options is not None:
+            dataset.format_options = dataset_format_options
+        if dataset_input is not None:
+            dataset.input = dataset_input
+        if dataset_path_options is not None:
+            dataset.path_options = dataset_path_options
+        if tags is not None:
+            dataset.tags = tags
+
+        return dataset
+
+    def delete_dataset(self, dataset_name):
+        if dataset_name not in self.datasets:
+            raise ResourceNotFoundException("One or more resources can't be found.")
+
+        del self.datasets[dataset_name]
+
+    def describe_dataset(self, dataset_name):
+        if dataset_name not in self.datasets:
+            raise ResourceNotFoundException("One or more resources can't be found.")
+
+        return self.datasets[dataset_name]
+
+
 class FakeRecipe(BaseModel):
     INITIAL_VERSION = 0.1
@@ -355,4 +433,36 @@ class FakeRuleset(BaseModel):
         }


+class FakeDataset(BaseModel):
+    def __init__(
+        self,
+        region_name,
+        dataset_name,
+        dataset_format,
+        dataset_format_options,
+        dataset_input,
+        dataset_path_options,
+        tags,
+    ):
+        self.region_name = region_name
+        self.name = dataset_name
+        self.format = dataset_format
+        self.format_options = dataset_format_options
+        self.input = dataset_input
+        self.path_options = dataset_path_options
+        self.created_time = datetime.now()
+        self.tags = tags
+
+    def as_dict(self):
+        return {
+            "Name": self.name,
+            "Format": self.format,
+            "FormatOptions": self.format_options,
+            "Input": self.input,
+            "PathOptions": self.path_options,
+            "CreateTime": self.created_time.isoformat(),
+            "Tags": self.tags or dict(),
+        }
+
+
 databrew_backends = BackendDict(DataBrewBackend, "databrew")
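As a sketch of how the new backend methods fit together, here they are driven directly, the way the response handlers in responses.py do. This is illustrative only, not part of the commit; the region key and all argument values are assumptions:

```python
# Illustrative sketch: exercise the new dataset methods on the backend itself.
from moto.databrew.models import databrew_backends

backend = databrew_backends["us-east-1"]

dataset = backend.create_dataset(
    "demo-dataset",                                    # dataset_name
    "CSV",                                             # dataset_format
    None,                                              # dataset_format_options
    {"S3InputDefinition": {"Bucket": "some-bucket"}},  # dataset_input
    None,                                              # dataset_path_options
    None,                                              # tags
)
print(dataset.as_dict()["Name"])  # demo-dataset

# update_dataset only overwrites the fields that are not None.
backend.update_dataset("demo-dataset", "JSON", None, None, None, None)
print(backend.describe_dataset("demo-dataset").format)  # JSON

backend.delete_dataset("demo-dataset")  # raises ResourceNotFoundException if absent
```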
moto/databrew/responses.py
@@ -14,6 +14,7 @@ class DataBrewResponse(BaseResponse):
         """Return backend instance specific for this region."""
         return databrew_backends[self.region]

+    # region Recipes
     @property
     def parameters(self):
         return json.loads(self.body)
@@ -133,6 +134,10 @@ class DataBrewResponse(BaseResponse):
         elif request.method == "GET":
             return self.get_recipe_response(recipe_name)

+    # endregion
+
+    # region Rulesets
+
     @amzn_request_id
     def create_ruleset(self):
         ruleset_description = self.parameters.get("Description")
@@ -202,3 +207,94 @@ class DataBrewResponse(BaseResponse):
                 "NextToken": next_token,
             }
         )
+
+    # endregion
+
+    # region Datasets
+
+    @amzn_request_id
+    def create_dataset(self):
+        dataset_name = self.parameters.get("Name")
+        dataset_format = self.parameters.get("Format")
+        dataset_format_options = self.parameters.get("FormatOptions")
+        dataset_input = self.parameters.get("Input")
+        dataset_path_options = self.parameters.get("PathOptions")
+        dataset_tags = self.parameters.get("Tags")
+
+        return json.dumps(
+            self.databrew_backend.create_dataset(
+                dataset_name,
+                dataset_format,
+                dataset_format_options,
+                dataset_input,
+                dataset_path_options,
+                dataset_tags,
+            ).as_dict()
+        )
+
+    @amzn_request_id
+    def list_datasets(self):
+        next_token = self._get_param("NextToken", self._get_param("nextToken"))
+        max_results = self._get_int_param(
+            "MaxResults", self._get_int_param("maxResults")
+        )
+
+        # pylint: disable=unexpected-keyword-arg, unbalanced-tuple-unpacking
+        dataset_list, next_token = self.databrew_backend.list_datasets(
+            next_token=next_token, max_results=max_results
+        )
+
+        return json.dumps(
+            {
+                "Datasets": [dataset.as_dict() for dataset in dataset_list],
+                "NextToken": next_token,
+            }
+        )
+
+    @amzn_request_id
+    def update_dataset(self, dataset_name):
+        dataset_format = self.parameters.get("Format")
+        dataset_format_options = self.parameters.get("FormatOptions")
+        dataset_input = self.parameters.get("Input")
+        dataset_path_options = self.parameters.get("PathOptions")
+        dataset_tags = self.parameters.get("Tags")
+
+        dataset = self.databrew_backend.update_dataset(
+            dataset_name,
+            dataset_format,
+            dataset_format_options,
+            dataset_input,
+            dataset_path_options,
+            dataset_tags,
+        )
+        return 200, {}, json.dumps(dataset.as_dict())
+
+    @amzn_request_id
+    def delete_dataset(self, dataset_name):
+        self.databrew_backend.delete_dataset(dataset_name)
+        return 200, {}, json.dumps({"Name": dataset_name})
+
+    @amzn_request_id
+    def describe_dataset(self, dataset_name):
+        dataset = self.databrew_backend.describe_dataset(dataset_name)
+        return 200, {}, json.dumps(dataset.as_dict())
+
+    @amzn_request_id
+    def dataset_response(self, request, full_url, headers):
+        self.setup_class(request, full_url, headers)
+        parsed_url = urlparse(full_url)
+
+        dataset_name = parsed_url.path.split("/")[-1]
+
+        if request.method == "POST":
+            return self.create_dataset()
+        elif request.method == "GET" and dataset_name:
+            return self.describe_dataset(dataset_name)
+        elif request.method == "GET":
+            return self.list_datasets()
+        elif request.method == "DELETE":
+            return self.delete_dataset(dataset_name)
+        elif request.method == "PUT":
+            return self.update_dataset(dataset_name)
+
+    # endregion
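The `list_datasets` handler above threads `NextToken`/`MaxResults` through moto's generic paginator. A hedged sketch of consuming it page by page (the dataset names and bucket here are invented; the required/optional parameters follow the public DataBrew API):

```python
import boto3
from moto import mock_databrew


@mock_databrew
def list_all_dataset_names():
    client = boto3.client("databrew", region_name="us-east-1")
    for i in range(5):
        # Format/FormatOptions are optional; Input is required by the API.
        client.create_dataset(
            Name=f"ds-{i}",
            Input={"S3InputDefinition": {"Bucket": "some-bucket"}},
        )

    names, token = [], None
    while True:
        kwargs = {"MaxResults": 2}
        if token:
            kwargs["NextToken"] = token
        page = client.list_datasets(**kwargs)
        names.extend(d["Name"] for d in page["Datasets"])
        token = page.get("NextToken")
        if not token:  # the final page carries no NextToken
            return names


print(list_all_dataset_names())  # ['ds-0', 'ds-1', 'ds-2', 'ds-3', 'ds-4']
```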
moto/databrew/urls.py
@@ -10,4 +10,6 @@ url_paths = {
     "{0}/recipes/(?P<recipe_name>[^/]+)/publishRecipe$": DataBrewResponse().publish_recipe,
     "{0}/rulesets$": DataBrewResponse.dispatch,
     "{0}/rulesets/(?P<ruleset_name>[^/]+)$": DataBrewResponse().ruleset_response,
+    "{0}/datasets$": DataBrewResponse.dispatch,
+    "{0}/datasets/(?P<dataset_name>[^/]+)$": DataBrewResponse().dataset_response,
 }
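With these two routes registered, each boto3 dataset call maps onto an HTTP verb against `/datasets` (handled via `DataBrewResponse.dispatch`) or `/datasets/{name}` (handled by `dataset_response` above). A small end-to-end sketch, with invented names, before the new tests below:

```python
import boto3
from moto import mock_databrew


@mock_databrew
def dataset_lifecycle():
    client = boto3.client("databrew", region_name="us-east-1")
    source = {"S3InputDefinition": {"Bucket": "some-bucket"}}

    client.create_dataset(Name="demo", Format="CSV", Input=source)   # POST   /datasets
    assert client.describe_dataset(Name="demo")["Format"] == "CSV"   # GET    /datasets/demo
    client.update_dataset(Name="demo", Format="JSON", Input=source)  # PUT    /datasets/demo
    assert len(client.list_datasets()["Datasets"]) == 1              # GET    /datasets
    client.delete_dataset(Name="demo")                               # DELETE /datasets/demo


dataset_lifecycle()
```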
tests/test_databrew/test_databrew_datasets.py (new file)
@@ -0,0 +1,268 @@
import uuid

import boto3
import pytest
from botocore.exceptions import ClientError

from moto import mock_databrew


def _create_databrew_client():
    client = boto3.client("databrew", region_name="us-west-1")
    return client


def _create_test_dataset(
    client,
    tags=None,
    dataset_name=None,
    dataset_format="JSON",
    dataset_format_options=None,
):
    if dataset_name is None:
        dataset_name = str(uuid.uuid4())

    if not dataset_format_options:
        if dataset_format == "JSON":
            dataset_format_options = {"Json": {"MultiLine": True}}
        elif dataset_format == "CSV":
            dataset_format_options = {"Csv": {"Delimiter": ",", "HeaderRow": False}}
        elif dataset_format == "EXCEL":
            dataset_format_options = {
                "Excel": {
                    "SheetNames": [
                        "blaa",
                    ],
                    "SheetIndexes": [
                        123,
                    ],
                    "HeaderRow": True,
                }
            }

    return client.create_dataset(
        Name=dataset_name,
        Format=dataset_format,
        FormatOptions=dataset_format_options,
        Input={
            "S3InputDefinition": {
                "Bucket": "somerandombucketname",
            },
            "DataCatalogInputDefinition": {
                "DatabaseName": "somedbname",
                "TableName": "sometablename",
                "TempDirectory": {
                    "Bucket": "sometempbucketname",
                },
            },
            "DatabaseInputDefinition": {
                "GlueConnectionName": "someglueconnectionname",
                "TempDirectory": {
                    "Bucket": "sometempbucketname",
                },
            },
        },
        PathOptions={
            "LastModifiedDateCondition": {
                "Expression": "string",
                "ValuesMap": {"string": "string"},
            },
            "FilesLimit": {
                "MaxFiles": 123,
                "OrderedBy": "LAST_MODIFIED_DATE",
                "Order": "ASCENDING",
            },
            "Parameters": {
                "string": {
                    "Name": "string",
                    "Type": "string",
                    "CreateColumn": False,
                    "Filter": {
                        "Expression": "string",
                        "ValuesMap": {"string": "string"},
                    },
                }
            },
        },
        Tags=tags or {},
    )


def _create_test_datasets(client, count):
    for _ in range(count):
        _create_test_dataset(client)


@mock_databrew
def test_dataset_list_when_empty():
    client = _create_databrew_client()

    response = client.list_datasets()
    response.should.have.key("Datasets")
    response["Datasets"].should.have.length_of(0)


@mock_databrew
def test_list_datasets_with_max_results():
    client = _create_databrew_client()

    _create_test_datasets(client, 4)
    response = client.list_datasets(MaxResults=2)
    response["Datasets"].should.have.length_of(2)
    response.should.have.key("NextToken")


@mock_databrew
def test_list_datasets_from_next_token():
    client = _create_databrew_client()
    _create_test_datasets(client, 10)
    first_response = client.list_datasets(MaxResults=3)
    response = client.list_datasets(NextToken=first_response["NextToken"])
    response["Datasets"].should.have.length_of(7)


@mock_databrew
def test_list_datasets_with_max_results_greater_than_actual_results():
    client = _create_databrew_client()
    _create_test_datasets(client, 4)
    response = client.list_datasets(MaxResults=10)
    response["Datasets"].should.have.length_of(4)


@mock_databrew
def test_describe_dataset():
    client = _create_databrew_client()

    # region basic test
    response = _create_test_dataset(client)
    dataset = client.describe_dataset(Name=response["Name"])
    dataset["Name"].should.equal(response["Name"])
    # endregion

    # region CSV test
    response = _create_test_dataset(client, dataset_format="CSV")
    dataset = client.describe_dataset(Name=response["Name"])
    dataset["Format"].should.equal("CSV")
    # endregion


@mock_databrew
def test_describe_dataset_that_does_not_exist():
    client = _create_databrew_client()

    with pytest.raises(ClientError) as exc:
        client.describe_dataset(Name="DoesNotExist")
    err = exc.value.response["Error"]
    err["Code"].should.equal("ResourceNotFoundException")
    err["Message"].should.equal("One or more resources can't be found.")


@mock_databrew
def test_create_dataset_that_already_exists():
    client = _create_databrew_client()

    response = _create_test_dataset(client)

    with pytest.raises(ClientError) as exc:
        _create_test_dataset(client, dataset_name=response["Name"])
    err = exc.value.response["Error"]
    err["Code"].should.equal("AlreadyExistsException")
    err["Message"].should.equal(f"{response['Name']} already exists.")


@mock_databrew
def test_delete_dataset():
    client = _create_databrew_client()
    response = _create_test_dataset(client)

    # Check the dataset exists
    dataset = client.describe_dataset(Name=response["Name"])
    dataset["Name"].should.equal(response["Name"])

    # Delete the dataset
    client.delete_dataset(Name=response["Name"])

    # Check it no longer exists
    with pytest.raises(ClientError) as exc:
        client.describe_dataset(Name=response["Name"])

    err = exc.value.response["Error"]
    err["Code"].should.equal("ResourceNotFoundException")
    err["Message"].should.equal("One or more resources can't be found.")

    # Check that deleting a dataset that does not exist errors
    with pytest.raises(ClientError) as exc:
        client.delete_dataset(Name=response["Name"])
    err = exc.value.response["Error"]
    err["Code"].should.equal("ResourceNotFoundException")
    err["Message"].should.equal("One or more resources can't be found.")


@mock_databrew
def test_update_dataset():
    client = _create_databrew_client()
    response = _create_test_dataset(client)

    # Update the dataset and check the response
    dataset = client.update_dataset(
        Name=response["Name"],
        Format="TEST",
        Input={
            "S3InputDefinition": {
                "Bucket": "somerandombucketname",
            },
            "DataCatalogInputDefinition": {
                "DatabaseName": "somedbname",
                "TableName": "sometablename",
                "TempDirectory": {
                    "Bucket": "sometempbucketname",
                },
            },
            "DatabaseInputDefinition": {
                "GlueConnectionName": "someglueconnectionname",
                "TempDirectory": {
                    "Bucket": "sometempbucketname",
                },
            },
        },
    )
    dataset["Name"].should.equal(response["Name"])

    # Describe the dataset and check the changes
    dataset = client.describe_dataset(Name=response["Name"])
    dataset["Name"].should.equal(response["Name"])
    dataset["Format"].should.equal("TEST")


@mock_databrew
def test_update_dataset_that_does_not_exist():
    client = _create_databrew_client()

    # Attempt to update a dataset that does not exist and check the error
    with pytest.raises(ClientError) as exc:
        client.update_dataset(
            Name="RANDOMNAME",
            Format="TEST",
            Input={
                "S3InputDefinition": {
                    "Bucket": "somerandombucketname",
                },
                "DataCatalogInputDefinition": {
                    "DatabaseName": "somedbname",
                    "TableName": "sometablename",
                    "TempDirectory": {
                        "Bucket": "sometempbucketname",
                    },
                },
                "DatabaseInputDefinition": {
                    "GlueConnectionName": "someglueconnectionname",
                    "TempDirectory": {
                        "Bucket": "sometempbucketname",
                    },
                },
            },
        )

    err = exc.value.response["Error"]
    err["Code"].should.equal("ResourceNotFoundException")
    err["Message"].should.equal("One or more resources can't be found.")