From babbd218140324aef8364a2e2eb1407048846ded Mon Sep 17 00:00:00 2001 From: Steven Church Date: Fri, 13 May 2022 11:48:04 +0100 Subject: [PATCH] Databrew: Add datasets support (#5095) --- IMPLEMENTATION_COVERAGE.md | 12 +- docs/docs/services/databrew.rst | 10 +- moto/backend_index.py | 2 +- moto/databrew/exceptions.py | 9 + moto/databrew/models.py | 114 +++++++- moto/databrew/responses.py | 96 +++++++ moto/databrew/urls.py | 2 + tests/test_databrew/test_databrew_datasets.py | 268 ++++++++++++++++++ 8 files changed, 499 insertions(+), 14 deletions(-) create mode 100644 tests/test_databrew/test_databrew_datasets.py diff --git a/IMPLEMENTATION_COVERAGE.md b/IMPLEMENTATION_COVERAGE.md index 0d95af4cb..ab106b596 100644 --- a/IMPLEMENTATION_COVERAGE.md +++ b/IMPLEMENTATION_COVERAGE.md @@ -1069,30 +1069,30 @@ ## databrew
-22% implemented +27% implemented - [ ] batch_delete_recipe_version -- [ ] create_dataset +- [X] create_dataset - [ ] create_profile_job - [ ] create_project - [X] create_recipe - [ ] create_recipe_job - [X] create_ruleset - [ ] create_schedule -- [ ] delete_dataset +- [X] delete_dataset - [ ] delete_job - [ ] delete_project - [X] delete_recipe_version - [X] delete_ruleset - [ ] delete_schedule -- [ ] describe_dataset +- [X] describe_dataset - [ ] describe_job - [ ] describe_job_run - [ ] describe_project - [ ] describe_recipe - [ ] describe_ruleset - [ ] describe_schedule -- [ ] list_datasets +- [X] list_datasets - [ ] list_job_runs - [ ] list_jobs - [ ] list_projects @@ -1108,7 +1108,7 @@ - [ ] stop_job_run - [ ] tag_resource - [ ] untag_resource -- [ ] update_dataset +- [X] update_dataset - [ ] update_profile_job - [ ] update_project - [X] update_recipe diff --git a/docs/docs/services/databrew.rst b/docs/docs/services/databrew.rst index fca4b097d..8715cae99 100644 --- a/docs/docs/services/databrew.rst +++ b/docs/docs/services/databrew.rst @@ -26,27 +26,27 @@ databrew |start-h3| Implemented features for this service |end-h3| - [ ] batch_delete_recipe_version -- [ ] create_dataset +- [X] create_dataset - [ ] create_profile_job - [ ] create_project - [X] create_recipe - [ ] create_recipe_job - [X] create_ruleset - [ ] create_schedule -- [ ] delete_dataset +- [X] delete_dataset - [ ] delete_job - [ ] delete_project - [X] delete_recipe_version - [X] delete_ruleset - [ ] delete_schedule -- [ ] describe_dataset +- [X] describe_dataset - [ ] describe_job - [ ] describe_job_run - [ ] describe_project - [ ] describe_recipe - [ ] describe_ruleset - [ ] describe_schedule -- [ ] list_datasets +- [X] list_datasets - [ ] list_job_runs - [ ] list_jobs - [ ] list_projects @@ -62,7 +62,7 @@ databrew - [ ] stop_job_run - [ ] tag_resource - [ ] untag_resource -- [ ] update_dataset +- [X] update_dataset - [ ] update_profile_job - [ ] update_project - [X] update_recipe diff --git a/moto/backend_index.py b/moto/backend_index.py index d6348d149..e9e122a33 100644 --- a/moto/backend_index.py +++ b/moto/backend_index.py @@ -1,4 +1,4 @@ -# autogenerated by scripts/update_backend_index.py +# autogenerated by ./scripts/update_backend_index.py import re backend_url_patterns = [ diff --git a/moto/databrew/exceptions.py b/moto/databrew/exceptions.py index 6a3a839ab..187bd2c8f 100644 --- a/moto/databrew/exceptions.py +++ b/moto/databrew/exceptions.py @@ -42,3 +42,12 @@ class ResourceNotFoundException(DataBrewClientError): class RulesetNotFoundException(EntityNotFoundException): def __init__(self, recipe_name): super().__init__("Ruleset %s not found." % recipe_name) + + +class ServiceQuotaExceededException(JsonRESTError): + code = 402 + + def __init__(self): + super().__init__( + "ServiceQuotaExceededException", "A service quota is exceeded." 
+ ) diff --git a/moto/databrew/models.py b/moto/databrew/models.py index 0787cf2f3..da1291b0a 100644 --- a/moto/databrew/models.py +++ b/moto/databrew/models.py @@ -6,12 +6,15 @@ from datetime import datetime from moto.core import BaseBackend, BaseModel from moto.core.utils import BackendDict from moto.utilities.paginator import paginate + from .exceptions import ( + AlreadyExistsException, ConflictException, - ResourceNotFoundException, ValidationException, + RulesetAlreadyExistsException, + RulesetNotFoundException, + ResourceNotFoundException, ) -from .exceptions import RulesetAlreadyExistsException, RulesetNotFoundException class DataBrewBackend(BaseBackend): @@ -34,12 +37,19 @@ class DataBrewBackend(BaseBackend): "limit_default": 100, "unique_attribute": "name", }, + "list_datasets": { + "input_token": "next_token", + "limit_key": "max_results", + "limit_default": 100, + "unique_attribute": "name", + }, } def __init__(self, region_name): self.region_name = region_name self.recipes = OrderedDict() self.rulesets = OrderedDict() + self.datasets = OrderedDict() def reset(self): """Re-initialize all attributes for this instance.""" @@ -221,6 +231,74 @@ class DataBrewBackend(BaseBackend): del self.rulesets[ruleset_name] + def create_dataset( + self, + dataset_name, + dataset_format, + dataset_format_options, + dataset_input, + dataset_path_options, + tags, + ): + if dataset_name in self.datasets: + raise AlreadyExistsException(dataset_name) + + dataset = FakeDataset( + self.region_name, + dataset_name, + dataset_format, + dataset_format_options, + dataset_input, + dataset_path_options, + tags, + ) + self.datasets[dataset_name] = dataset + return dataset + + @paginate(pagination_model=PAGINATION_MODEL) + def list_datasets(self): + return list(self.datasets.values()) + + def update_dataset( + self, + dataset_name, + dataset_format, + dataset_format_options, + dataset_input, + dataset_path_options, + tags, + ): + + if dataset_name not in self.datasets: + raise ResourceNotFoundException("One or more resources can't be found.") + + dataset = self.datasets[dataset_name] + + if dataset_format is not None: + dataset.format = dataset_format + if dataset_format_options is not None: + dataset.format_options = dataset_format_options + if dataset_input is not None: + dataset.input = dataset_input + if dataset_path_options is not None: + dataset.path_options = dataset_path_options + if tags is not None: + dataset.tags = tags + + return dataset + + def delete_dataset(self, dataset_name): + if dataset_name not in self.datasets: + raise ResourceNotFoundException("One or more resources can't be found.") + + del self.datasets[dataset_name] + + def describe_dataset(self, dataset_name): + if dataset_name not in self.datasets: + raise ResourceNotFoundException("One or more resources can't be found.") + + return self.datasets[dataset_name] + class FakeRecipe(BaseModel): INITIAL_VERSION = 0.1 @@ -355,4 +433,36 @@ class FakeRuleset(BaseModel): } +class FakeDataset(BaseModel): + def __init__( + self, + region_name, + dataset_name, + dataset_format, + dataset_format_options, + dataset_input, + dataset_path_options, + tags, + ): + self.region_name = region_name + self.name = dataset_name + self.format = dataset_format + self.format_options = dataset_format_options + self.input = dataset_input + self.path_options = dataset_path_options + self.created_time = datetime.now() + self.tags = tags + + def as_dict(self): + return { + "Name": self.name, + "Format": self.format, + "FormatOptions": self.format_options, + "Input": 
self.input,
+            "PathOptions": self.path_options,
+            "CreateTime": self.created_time.isoformat(),
+            "Tags": self.tags or dict(),
+        }
+
+
 databrew_backends = BackendDict(DataBrewBackend, "databrew")
diff --git a/moto/databrew/responses.py b/moto/databrew/responses.py
index 89100e488..422dc74a5 100644
--- a/moto/databrew/responses.py
+++ b/moto/databrew/responses.py
@@ -14,6 +14,7 @@ class DataBrewResponse(BaseResponse):
         """Return backend instance specific for this region."""
         return databrew_backends[self.region]
 
+    # region Recipes
     @property
     def parameters(self):
         return json.loads(self.body)
@@ -133,6 +134,10 @@ class DataBrewResponse(BaseResponse):
         elif request.method == "GET":
             return self.get_recipe_response(recipe_name)
 
+    # endregion
+
+    # region Rulesets
+
     @amzn_request_id
     def create_ruleset(self):
         ruleset_description = self.parameters.get("Description")
@@ -202,3 +207,94 @@ class DataBrewResponse(BaseResponse):
                 "NextToken": next_token,
             }
         )
+
+    # endregion
+
+    # region Datasets
+
+    @amzn_request_id
+    def create_dataset(self):
+        dataset_name = self.parameters.get("Name")
+        dataset_format = self.parameters.get("Format")
+        dataset_format_options = self.parameters.get("FormatOptions")
+        dataset_input = self.parameters.get("Input")
+        dataset_path_options = self.parameters.get("PathOptions")
+        dataset_tags = self.parameters.get("Tags")
+
+        return json.dumps(
+            self.databrew_backend.create_dataset(
+                dataset_name,
+                dataset_format,
+                dataset_format_options,
+                dataset_input,
+                dataset_path_options,
+                dataset_tags,
+            ).as_dict()
+        )
+
+    @amzn_request_id
+    def list_datasets(self):
+        next_token = self._get_param("NextToken", self._get_param("nextToken"))
+        max_results = self._get_int_param(
+            "MaxResults", self._get_int_param("maxResults")
+        )
+
+        # pylint: disable=unexpected-keyword-arg, unbalanced-tuple-unpacking
+        dataset_list, next_token = self.databrew_backend.list_datasets(
+            next_token=next_token, max_results=max_results
+        )
+
+        return json.dumps(
+            {
+                "Datasets": [dataset.as_dict() for dataset in dataset_list],
+                "NextToken": next_token,
+            }
+        )
+
+    @amzn_request_id
+    def update_dataset(self, dataset_name):
+        dataset_format = self.parameters.get("Format")
+        dataset_format_options = self.parameters.get("FormatOptions")
+        dataset_input = self.parameters.get("Input")
+        dataset_path_options = self.parameters.get("PathOptions")
+        dataset_tags = self.parameters.get("Tags")
+
+        dataset = self.databrew_backend.update_dataset(
+            dataset_name,
+            dataset_format,
+            dataset_format_options,
+            dataset_input,
+            dataset_path_options,
+            dataset_tags,
+        )
+        return 200, {}, json.dumps(dataset.as_dict())
+
+    @amzn_request_id
+    def delete_dataset(self, dataset_name):
+        self.databrew_backend.delete_dataset(dataset_name)
+        return 200, {}, json.dumps({"Name": dataset_name})
+
+    @amzn_request_id
+    def describe_dataset(self, dataset_name):
+        dataset = self.databrew_backend.describe_dataset(dataset_name)
+        return 200, {}, json.dumps(dataset.as_dict())
+
+    @amzn_request_id
+    def dataset_response(self, request, full_url, headers):
+        self.setup_class(request, full_url, headers)
+        parsed_url = urlparse(full_url)
+
+        dataset_name = parsed_url.path.split("/")[-1]
+
+        if request.method == "POST":
+            return self.create_dataset()
+        elif request.method == "GET" and dataset_name:
+            return self.describe_dataset(dataset_name)
+        elif request.method == "GET":
+            return self.list_datasets()
+        elif request.method == "DELETE":
+            return self.delete_dataset(dataset_name)
+        elif request.method == "PUT":
+            return self.update_dataset(dataset_name)
+
+    # endregion
diff --git a/moto/databrew/urls.py b/moto/databrew/urls.py
index d3b441be3..d8508e55c 100644
--- a/moto/databrew/urls.py
+++ b/moto/databrew/urls.py
@@ -10,4 +10,6 @@ url_paths = {
     "{0}/recipes/(?P<recipe_name>[^/]+)/publishRecipe$": DataBrewResponse().publish_recipe,
     "{0}/rulesets$": DataBrewResponse.dispatch,
     "{0}/rulesets/(?P<ruleset_name>[^/]+)$": DataBrewResponse().ruleset_response,
+    "{0}/datasets$": DataBrewResponse.dispatch,
+    "{0}/datasets/(?P<dataset_name>[^/]+)$": DataBrewResponse().dataset_response,
 }
diff --git a/tests/test_databrew/test_databrew_datasets.py b/tests/test_databrew/test_databrew_datasets.py
new file mode 100644
index 000000000..f2c48880a
--- /dev/null
+++ b/tests/test_databrew/test_databrew_datasets.py
@@ -0,0 +1,268 @@
+import uuid
+
+import boto3
+import pytest
+from botocore.exceptions import ClientError
+
+from moto import mock_databrew
+
+
+def _create_databrew_client():
+    client = boto3.client("databrew", region_name="us-west-1")
+    return client
+
+
+def _create_test_dataset(
+    client,
+    tags=None,
+    dataset_name=None,
+    dataset_format="JSON",
+    dataset_format_options=None,
+):
+    if dataset_name is None:
+        dataset_name = str(uuid.uuid4())
+
+    if not dataset_format_options:
+        if dataset_format == "JSON":
+            dataset_format_options = {"Json": {"MultiLine": True}}
+        elif dataset_format == "CSV":
+            dataset_format_options = {"Csv": {"Delimiter": ",", "HeaderRow": False}}
+        elif dataset_format == "EXCEL":
+            dataset_format_options = {
+                "Excel": {
+                    "SheetNames": [
+                        "blaa",
+                    ],
+                    "SheetIndexes": [
+                        123,
+                    ],
+                    "HeaderRow": True,
+                }
+            }
+
+    return client.create_dataset(
+        Name=dataset_name,
+        Format=dataset_format,
+        FormatOptions=dataset_format_options,
+        Input={
+            "S3InputDefinition": {
+                "Bucket": "somerandombucketname",
+            },
+            "DataCatalogInputDefinition": {
+                "DatabaseName": "somedbname",
+                "TableName": "sometablename",
+                "TempDirectory": {
+                    "Bucket": "sometempbucketname",
+                },
+            },
+            "DatabaseInputDefinition": {
+                "GlueConnectionName": "someglueconnectionname",
+                "TempDirectory": {
+                    "Bucket": "sometempbucketname",
+                },
+            },
+        },
+        PathOptions={
+            "LastModifiedDateCondition": {
+                "Expression": "string",
+                "ValuesMap": {"string": "string"},
+            },
+            "FilesLimit": {
+                "MaxFiles": 123,
+                "OrderedBy": "LAST_MODIFIED_DATE",
+                "Order": "ASCENDING",
+            },
+            "Parameters": {
+                "string": {
+                    "Name": "string",
+                    "Type": "string",
+                    "CreateColumn": False,
+                    "Filter": {
+                        "Expression": "string",
+                        "ValuesMap": {"string": "string"},
+                    },
+                }
+            },
+        },
+        Tags=tags or {},
+    )
+
+
+def _create_test_datasets(client, count):
+    for _ in range(count):
+        _create_test_dataset(client)
+
+
+@mock_databrew
+def test_dataset_list_when_empty():
+    client = _create_databrew_client()
+
+    response = client.list_datasets()
+    response.should.have.key("Datasets")
+    response["Datasets"].should.have.length_of(0)
+
+
+@mock_databrew
+def test_list_datasets_with_max_results():
+    client = _create_databrew_client()
+
+    _create_test_datasets(client, 4)
+    response = client.list_datasets(MaxResults=2)
+    response["Datasets"].should.have.length_of(2)
+    response.should.have.key("NextToken")
+
+
+@mock_databrew
+def test_list_datasets_from_next_token():
+    client = _create_databrew_client()
+    _create_test_datasets(client, 10)
+    first_response = client.list_datasets(MaxResults=3)
+    response = client.list_datasets(NextToken=first_response["NextToken"])
+    response["Datasets"].should.have.length_of(7)
+
+
+@mock_databrew
+def test_list_datasets_with_max_results_greater_than_actual_results():
+    client = _create_databrew_client()
+
+    _create_test_datasets(client, 4)
+    response = client.list_datasets(MaxResults=10)
+    response["Datasets"].should.have.length_of(4)
+
+
+@mock_databrew
+def test_describe_dataset():
+    client = _create_databrew_client()
+
+    # region basic test
+    response = _create_test_dataset(client)
+    dataset = client.describe_dataset(Name=response["Name"])
+    dataset["Name"].should.equal(response["Name"])
+    # endregion
+
+    # region CSV test
+    response = _create_test_dataset(client, dataset_format="CSV")
+    dataset = client.describe_dataset(Name=response["Name"])
+    dataset["Format"].should.equal("CSV")
+    # endregion
+
+
+@mock_databrew
+def test_describe_dataset_that_does_not_exist():
+    client = _create_databrew_client()
+
+    with pytest.raises(ClientError) as exc:
+        client.describe_dataset(Name="DoesNotExist")
+    err = exc.value.response["Error"]
+    err["Code"].should.equal("ResourceNotFoundException")
+    err["Message"].should.equal("One or more resources can't be found.")
+
+
+@mock_databrew
+def test_create_dataset_that_already_exists():
+    client = _create_databrew_client()
+
+    response = _create_test_dataset(client)
+
+    with pytest.raises(ClientError) as exc:
+        _create_test_dataset(client, dataset_name=response["Name"])
+    err = exc.value.response["Error"]
+    err["Code"].should.equal("AlreadyExistsException")
+    err["Message"].should.equal(f"{response['Name']} already exists.")
+
+
+@mock_databrew
+def test_delete_dataset():
+    client = _create_databrew_client()
+    response = _create_test_dataset(client)
+
+    # Check dataset exists
+    dataset = client.describe_dataset(Name=response["Name"])
+    dataset["Name"].should.equal(response["Name"])
+
+    # Delete the dataset
+    client.delete_dataset(Name=response["Name"])
+
+    # Check it does not exist anymore
+    with pytest.raises(ClientError) as exc:
+        client.describe_dataset(Name=response["Name"])
+
+    err = exc.value.response["Error"]
+    err["Code"].should.equal("ResourceNotFoundException")
+    err["Message"].should.equal("One or more resources can't be found.")
+
+    # Check that deleting a dataset that does not exist errors
+    with pytest.raises(ClientError) as exc:
+        client.delete_dataset(Name=response["Name"])
+    err = exc.value.response["Error"]
+    err["Code"].should.equal("ResourceNotFoundException")
+    err["Message"].should.equal("One or more resources can't be found.")
+
+
+@mock_databrew
+def test_update_dataset():
+    client = _create_databrew_client()
+    response = _create_test_dataset(client)
+
+    # Update the dataset and check response
+    dataset = client.update_dataset(
+        Name=response["Name"],
+        Format="TEST",
+        Input={
+            "S3InputDefinition": {
+                "Bucket": "somerandombucketname",
+            },
+            "DataCatalogInputDefinition": {
+                "DatabaseName": "somedbname",
+                "TableName": "sometablename",
+                "TempDirectory": {
+                    "Bucket": "sometempbucketname",
+                },
+            },
+            "DatabaseInputDefinition": {
+                "GlueConnectionName": "someglueconnectionname",
+                "TempDirectory": {
+                    "Bucket": "sometempbucketname",
+                },
+            },
+        },
+    )
+    dataset["Name"].should.equal(response["Name"])
+
+    # Describe the dataset and check the changes
+    dataset = client.describe_dataset(Name=response["Name"])
+    dataset["Name"].should.equal(response["Name"])
+    dataset["Format"].should.equal("TEST")
+
+
+@mock_databrew
+def test_update_dataset_that_does_not_exist():
+    client = _create_databrew_client()
+
+    # Update the dataset and check response
+    with pytest.raises(ClientError) as exc:
+        client.update_dataset(
+            Name="RANDOMNAME",
+            Format="TEST",
+            Input={
+                "S3InputDefinition": {
+                    "Bucket": "somerandombucketname",
+                },
"DataCatalogInputDefinition": { + "DatabaseName": "somedbname", + "TableName": "sometablename", + "TempDirectory": { + "Bucket": "sometempbucketname", + }, + }, + "DatabaseInputDefinition": { + "GlueConnectionName": "someglueconnectionname", + "TempDirectory": { + "Bucket": "sometempbucketname", + }, + }, + }, + ) + + err = exc.value.response["Error"] + err["Code"].should.equal("ResourceNotFoundException") + err["Message"].should.equal("One or more resources can't be found.")