diff --git a/IMPLEMENTATION_COVERAGE.md b/IMPLEMENTATION_COVERAGE.md
index 1b8a47b0f..c9ae8b4af 100644
--- a/IMPLEMENTATION_COVERAGE.md
+++ b/IMPLEMENTATION_COVERAGE.md
@@ -1027,7 +1027,7 @@
 ## batch
-78% implemented
+84% implemented
 - [X] cancel_job
 - [X] create_compute_environment
@@ -3797,7 +3797,7 @@
 - [ ] modify_vpc_endpoint_connection_notification
 - [ ] modify_vpc_endpoint_service_configuration
 - [ ] modify_vpc_endpoint_service_permissions
-- [ ] modify_vpc_peering_connection_options
+- [X] modify_vpc_peering_connection_options
 - [X] modify_vpc_tenancy
 - [ ] modify_vpn_connection
 - [ ] modify_vpn_connection_options
@@ -4904,7 +4904,7 @@
 ## glue
-4% implemented
+7% implemented
 - [ ] batch_create_partition
 - [ ] batch_delete_connection
@@ -4923,7 +4923,7 @@
 - [ ] check_schema_version_validity
 - [ ] create_classifier
 - [ ] create_connection
-- [ ] create_crawler
+- [X] create_crawler
 - [X] create_database
 - [ ] create_dev_endpoint
 - [ ] create_job
@@ -4942,7 +4942,7 @@
 - [ ] delete_column_statistics_for_partition
 - [ ] delete_column_statistics_for_table
 - [ ] delete_connection
-- [ ] delete_crawler
+- [X] delete_crawler
 - [ ] delete_database
 - [ ] delete_dev_endpoint
 - [ ] delete_job
@@ -4966,9 +4966,9 @@
 - [ ] get_column_statistics_for_table
 - [ ] get_connection
 - [ ] get_connections
-- [ ] get_crawler
+- [X] get_crawler
 - [ ] get_crawler_metrics
-- [ ] get_crawlers
+- [X] get_crawlers
 - [ ] get_data_catalog_encryption_settings
 - [X] get_database
 - [X] get_databases
@@ -10589,7 +10589,7 @@
 ## ssm
-16% implemented
+17% implemented
 - [X] add_tags_to_resource
 - [ ] associate_ops_item_related_item
@@ -10626,7 +10626,7 @@
 - [ ] describe_automation_step_executions
 - [ ] describe_available_patches
 - [X] describe_document
-- [ ] describe_document_permission
+- [X] describe_document_permission
 - [ ] describe_effective_instance_associations
 - [ ] describe_effective_patches_for_patch_baseline
 - [ ] describe_instance_associations_status
@@ -10692,7 +10692,7 @@
 - [ ] list_resource_compliance_summaries
 - [ ] list_resource_data_sync
 - [X] list_tags_for_resource
-- [ ] modify_document_permission
+- [X] modify_document_permission
 - [ ] put_compliance_items
 - [ ] put_inventory
 - [X] put_parameter
diff --git a/moto/glue/exceptions.py b/moto/glue/exceptions.py
index c4b7048db..c45a8199d 100644
--- a/moto/glue/exceptions.py
+++ b/moto/glue/exceptions.py
@@ -28,6 +28,11 @@ class PartitionAlreadyExistsException(AlreadyExistsException):
         super(PartitionAlreadyExistsException, self).__init__("Partition")
 
 
+class CrawlerAlreadyExistsException(AlreadyExistsException):
+    def __init__(self):
+        super(CrawlerAlreadyExistsException, self).__init__("Crawler")
+
+
 class EntityNotFoundException(GlueClientError):
     def __init__(self, msg):
         super(GlueClientError, self).__init__("EntityNotFoundException", msg)
@@ -48,6 +53,13 @@ class PartitionNotFoundException(EntityNotFoundException):
         super(PartitionNotFoundException, self).__init__("Cannot find partition.")
 
 
+class CrawlerNotFoundException(EntityNotFoundException):
+    def __init__(self, crawler):
+        super(CrawlerNotFoundException, self).__init__(
+            "Crawler %s not found." % crawler
+        )
+
+
 class VersionNotFoundException(EntityNotFoundException):
     def __init__(self):
         super(VersionNotFoundException, self).__init__("Version not found.")
diff --git a/moto/glue/models.py b/moto/glue/models.py
index d3e730a5d..d11056115 100644
--- a/moto/glue/models.py
+++ b/moto/glue/models.py
@@ -7,6 +7,8 @@ from moto.core import BaseBackend, BaseModel
 from collections import OrderedDict
 from .exceptions import (
     JsonRESTError,
+    CrawlerAlreadyExistsException,
+    CrawlerNotFoundException,
     DatabaseAlreadyExistsException,
     DatabaseNotFoundException,
     TableAlreadyExistsException,
@@ -20,6 +22,7 @@ from .exceptions import (
 class GlueBackend(BaseBackend):
     def __init__(self):
         self.databases = OrderedDict()
+        self.crawlers = OrderedDict()
 
     def create_database(self, database_name, database_input):
         if database_name in self.databases:
@@ -67,6 +70,59 @@ class GlueBackend(BaseBackend):
             raise TableNotFoundException(table_name)
         return {}
 
+    def create_crawler(
+        self,
+        name,
+        role,
+        database_name,
+        description,
+        targets,
+        schedule,
+        classifiers,
+        table_prefix,
+        schema_change_policy,
+        recrawl_policy,
+        lineage_configuration,
+        configuration,
+        crawler_security_configuration,
+        tags,
+    ):
+        if name in self.crawlers:
+            raise CrawlerAlreadyExistsException()
+
+        crawler = FakeCrawler(
+            name=name,
+            role=role,
+            database_name=database_name,
+            description=description,
+            targets=targets,
+            schedule=schedule,
+            classifiers=classifiers,
+            table_prefix=table_prefix,
+            schema_change_policy=schema_change_policy,
+            recrawl_policy=recrawl_policy,
+            lineage_configuration=lineage_configuration,
+            configuration=configuration,
+            crawler_security_configuration=crawler_security_configuration,
+            tags=tags,
+        )
+        self.crawlers[name] = crawler
+
+    def get_crawler(self, name):
+        try:
+            return self.crawlers[name]
+        except KeyError:
+            raise CrawlerNotFoundException(name)
+
+    def get_crawlers(self):
+        return [self.crawlers[key] for key in self.crawlers] if self.crawlers else []
+
+    def delete_crawler(self, name):
+        try:
+            del self.crawlers[name]
+        except KeyError:
+            raise CrawlerNotFoundException(name)
+
 
 class FakeDatabase(BaseModel):
     def __init__(self, database_name, database_input):
@@ -177,4 +233,100 @@ class FakePartition(BaseModel):
         return obj
 
 
+class FakeCrawler(BaseModel):
+    def __init__(
+        self,
+        name,
+        role,
+        database_name,
+        description,
+        targets,
+        schedule,
+        classifiers,
+        table_prefix,
+        schema_change_policy,
+        recrawl_policy,
+        lineage_configuration,
+        configuration,
+        crawler_security_configuration,
+        tags,
+    ):
+        self.name = name
+        self.role = role
+        self.database_name = database_name
+        self.description = description
+        self.targets = targets
+        self.schedule = schedule
+        self.classifiers = classifiers
+        self.table_prefix = table_prefix
+        self.schema_change_policy = schema_change_policy
+        self.recrawl_policy = recrawl_policy
+        self.lineage_configuration = lineage_configuration
+        self.configuration = configuration
+        self.crawler_security_configuration = crawler_security_configuration
+        self.tags = tags
+        self.state = "READY"
+        self.creation_time = datetime.utcnow()
+        self.last_updated = self.creation_time
+        self.version = 1
+        self.crawl_elapsed_time = 0
+        self.last_crawl_info = None
+
+    def as_dict(self):
+        last_crawl = self.last_crawl_info.as_dict() if self.last_crawl_info else None
+        data = {
+            "Name": self.name,
+            "Role": self.role,
+            "Targets": self.targets,
+            "DatabaseName": self.database_name,
+            "Description": self.description,
+            "Classifiers": self.classifiers,
+            "RecrawlPolicy": self.recrawl_policy,
+            "SchemaChangePolicy": self.schema_change_policy,
+            "LineageConfiguration": self.lineage_configuration,
+            "State": self.state,
+            "TablePrefix": self.table_prefix,
+            "CrawlElapsedTime": self.crawl_elapsed_time,
+            "CreationTime": self.creation_time.isoformat(),
+            "LastUpdated": self.last_updated.isoformat(),
+            "LastCrawl": last_crawl,
+            "Version": self.version,
+            "Configuration": self.configuration,
+            "CrawlerSecurityConfiguration": self.crawler_security_configuration,
+        }
+
+        if self.schedule:
+            data["Schedule"] = {
+                "ScheduleExpression": self.schedule,
+                "State": "SCHEDULED",
+            }
+
+        if self.last_crawl_info:
+            data["LastCrawl"] = self.last_crawl_info.as_dict()
+
+        return data
+
+
+class LastCrawlInfo(BaseModel):
+    def __init__(
+        self, error_message, log_group, log_stream, message_prefix, start_time, status,
+    ):
+        self.error_message = error_message
+        self.log_group = log_group
+        self.log_stream = log_stream
+        self.message_prefix = message_prefix
+        self.start_time = start_time
+        self.status = status
+
+    def as_dict(self):
+        return {
+            "ErrorMessage": self.error_message,
+            "LogGroup": self.log_group,
+            "LogStream": self.log_stream,
+            "MessagePrefix": self.message_prefix,
+            "StartTime": self.start_time,
+            "Status": self.status,
+        }
+
+
 glue_backend = GlueBackend()
diff --git a/moto/glue/responses.py b/moto/glue/responses.py
index e3ec08dee..c4fd8bba3 100644
--- a/moto/glue/responses.py
+++ b/moto/glue/responses.py
@@ -274,3 +274,38 @@ class GlueResponse(BaseResponse):
         out["Errors"] = errors_output
 
         return json.dumps(out)
+
+    def create_crawler(self):
+        self.glue_backend.create_crawler(
+            name=self.parameters.get("Name"),
+            role=self.parameters.get("Role"),
+            database_name=self.parameters.get("DatabaseName"),
+            description=self.parameters.get("Description"),
+            targets=self.parameters.get("Targets"),
+            schedule=self.parameters.get("Schedule"),
+            classifiers=self.parameters.get("Classifiers"),
+            table_prefix=self.parameters.get("TablePrefix"),
+            schema_change_policy=self.parameters.get("SchemaChangePolicy"),
+            recrawl_policy=self.parameters.get("RecrawlPolicy"),
+            lineage_configuration=self.parameters.get("LineageConfiguration"),
+            configuration=self.parameters.get("Configuration"),
+            crawler_security_configuration=self.parameters.get(
+                "CrawlerSecurityConfiguration"
+            ),
+            tags=self.parameters.get("Tags"),
+        )
+        return ""
+
+    def get_crawler(self):
+        name = self.parameters.get("Name")
+        crawler = self.glue_backend.get_crawler(name)
+        return json.dumps({"Crawler": crawler.as_dict()})
+
+    def get_crawlers(self):
+        crawlers = self.glue_backend.get_crawlers()
+        return json.dumps({"Crawlers": [crawler.as_dict() for crawler in crawlers]})
+
+    def delete_crawler(self):
+        name = self.parameters.get("Name")
+        self.glue_backend.delete_crawler(name)
+        return ""
diff --git a/tests/test_glue/helpers.py b/tests/test_glue/helpers.py
index b0a602c75..5dc944fa9 100644
--- a/tests/test_glue/helpers.py
+++ b/tests/test_glue/helpers.py
@@ -106,3 +106,43 @@ def get_partition(client, database_name, table_name, values):
     return client.get_partition(
         DatabaseName=database_name, TableName=table_name, PartitionValues=values
     )
+
+
+def create_crawler(
+    client, crawler_name, crawler_role=None, crawler_targets=None, **kwargs
+):
+    optional_param_map = {
+        "database_name": "DatabaseName",
+        "description": "Description",
+        "schedule": "Schedule",
+        "classifiers": "Classifiers",
+        "table_prefix": "TablePrefix",
+        "schema_change_policy": "SchemaChangePolicy",
+        "recrawl_policy": "RecrawlPolicy",
+        "lineage_configuration": "LineageConfiguration",
+        "configuration": "Configuration",
+        "crawler_security_configuration": "CrawlerSecurityConfiguration",
+        "tags": "Tags",
+    }
+
+    params = {
+        boto3_key: kwargs.get(key)
+        for key, boto3_key in optional_param_map.items()
+        if kwargs.get(key) is not None
+    }
+
+    if crawler_role is None:
+        crawler_role = "arn:aws:iam::123456789012:role/Glue/Role"
+
+    if crawler_targets is None:
+        crawler_targets = {
+            "S3Targets": [],
+            "JdbcTargets": [],
+            "MongoDBTargets": [],
+            "DynamoDBTargets": [],
+            "CatalogTargets": [],
+        }
+
+    return client.create_crawler(
+        Name=crawler_name, Role=crawler_role, Targets=crawler_targets, **params,
+    )
diff --git a/tests/test_glue/test_datacatalog.py b/tests/test_glue/test_datacatalog.py
index 62b5cc443..43a2ae380 100644
--- a/tests/test_glue/test_datacatalog.py
+++ b/tests/test_glue/test_datacatalog.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import sure  # noqa
 import re
 import pytest
+import json
 
 import boto3
 from botocore.client import ClientError
@@ -905,3 +906,247 @@ def test_batch_delete_partition_with_bad_partitions():
     ["2018-11-01"].should.be.within(error_partitions)
     ["2018-11-02"].should.be.within(error_partitions)
     ["2018-11-03"].should.be.within(error_partitions)
+
+
+@mock_glue
+@freeze_time(FROZEN_CREATE_TIME)
+def test_create_crawler_scheduled():
+    client = boto3.client("glue", region_name="us-east-1")
+    name = "my_crawler_name"
+    role = "arn:aws:iam::123456789012:role/Glue/Role"
+    database_name = "my_database_name"
+    description = "my crawler description"
+    targets = {
+        "S3Targets": [{"Path": "s3://my-source-bucket/"}],
+        "JdbcTargets": [],
+        "MongoDBTargets": [],
+        "DynamoDBTargets": [],
+        "CatalogTargets": [],
+    }
+    schedule = "cron(15 12 * * ? *)"
+    classifiers = []
+    table_prefix = "my_table_prefix_"
+    schema_change_policy = {
+        "UpdateBehavior": "LOG",
+        "DeleteBehavior": "LOG",
+    }
+    recrawl_policy = {"RecrawlBehavior": "CRAWL_NEW_FOLDERS_ONLY"}
+    lineage_configuration = {"CrawlerLineageSettings": "DISABLE"}
+    configuration = json.dumps(
+        {
+            "Version": 1.0,
+            "CrawlerOutput": {
+                "Partitions": {"AddOrUpdateBehavior": "InheritFromTable"},
+            },
+            "Grouping": {"TableGroupingPolicy": "CombineCompatibleSchemas"},
+        }
+    )
+    crawler_security_configuration = "my_security_configuration"
+    tags = {"tag_key": "tag_value"}
+    helpers.create_crawler(
+        client,
+        name,
+        role,
+        targets,
+        database_name=database_name,
+        description=description,
+        schedule=schedule,
+        classifiers=classifiers,
+        table_prefix=table_prefix,
+        schema_change_policy=schema_change_policy,
+        recrawl_policy=recrawl_policy,
+        lineage_configuration=lineage_configuration,
+        configuration=configuration,
+        crawler_security_configuration=crawler_security_configuration,
+        tags=tags,
+    )
+
+    response = client.get_crawler(Name=name)
+    crawler = response["Crawler"]
+
+    crawler.get("Name").should.equal(name)
+    crawler.get("Role").should.equal(role)
+    crawler.get("DatabaseName").should.equal(database_name)
+    crawler.get("Description").should.equal(description)
+    crawler.get("Targets").should.equal(targets)
+    crawler.get("Schedule").should.equal(
+        {"ScheduleExpression": schedule, "State": "SCHEDULED"}
+    )
+    crawler.get("Classifiers").should.equal(classifiers)
+    crawler.get("TablePrefix").should.equal(table_prefix)
+    crawler.get("SchemaChangePolicy").should.equal(schema_change_policy)
+    crawler.get("RecrawlPolicy").should.equal(recrawl_policy)
+    crawler.get("LineageConfiguration").should.equal(lineage_configuration)
+    crawler.get("Configuration").should.equal(configuration)
+    crawler.get("CrawlerSecurityConfiguration").should.equal(
+        crawler_security_configuration
+    )
+
+    crawler.get("State").should.equal("READY")
+    crawler.get("CrawlElapsedTime").should.equal(0)
+    crawler.get("Version").should.equal(1)
+    if not settings.TEST_SERVER_MODE:
+        crawler.get("CreationTime").should.equal(FROZEN_CREATE_TIME)
+        crawler.get("LastUpdated").should.equal(FROZEN_CREATE_TIME)
+
+    crawler.should.not_have.key("LastCrawl")
+
+
+@mock_glue
+@freeze_time(FROZEN_CREATE_TIME)
+def test_create_crawler_unscheduled():
+    client = boto3.client("glue", region_name="us-east-1")
+    name = "my_crawler_name"
+    role = "arn:aws:iam::123456789012:role/Glue/Role"
+    database_name = "my_database_name"
+    description = "my crawler description"
+    targets = {
+        "S3Targets": [{"Path": "s3://my-source-bucket/"}],
+        "JdbcTargets": [],
+        "MongoDBTargets": [],
+        "DynamoDBTargets": [],
+        "CatalogTargets": [],
+    }
+    classifiers = []
+    table_prefix = "my_table_prefix_"
+    schema_change_policy = {
+        "UpdateBehavior": "LOG",
+        "DeleteBehavior": "LOG",
+    }
+    recrawl_policy = {"RecrawlBehavior": "CRAWL_NEW_FOLDERS_ONLY"}
+    lineage_configuration = {"CrawlerLineageSettings": "DISABLE"}
+    configuration = json.dumps(
+        {
+            "Version": 1.0,
+            "CrawlerOutput": {
+                "Partitions": {"AddOrUpdateBehavior": "InheritFromTable"},
+            },
+            "Grouping": {"TableGroupingPolicy": "CombineCompatibleSchemas"},
+        }
+    )
+    crawler_security_configuration = "my_security_configuration"
+    tags = {"tag_key": "tag_value"}
+    helpers.create_crawler(
+        client,
+        name,
+        role,
+        targets,
+        database_name=database_name,
+        description=description,
+        classifiers=classifiers,
+        table_prefix=table_prefix,
+        schema_change_policy=schema_change_policy,
+        recrawl_policy=recrawl_policy,
+        lineage_configuration=lineage_configuration,
+        configuration=configuration,
+        crawler_security_configuration=crawler_security_configuration,
+        tags=tags,
+    )
+
+    response = client.get_crawler(Name=name)
+    crawler = response["Crawler"]
+
+    crawler.get("Name").should.equal(name)
+    crawler.get("Role").should.equal(role)
+    crawler.get("DatabaseName").should.equal(database_name)
+    crawler.get("Description").should.equal(description)
+    crawler.get("Targets").should.equal(targets)
+    crawler.should.not_have.key("Schedule")
+    crawler.get("Classifiers").should.equal(classifiers)
+    crawler.get("TablePrefix").should.equal(table_prefix)
+    crawler.get("SchemaChangePolicy").should.equal(schema_change_policy)
+    crawler.get("RecrawlPolicy").should.equal(recrawl_policy)
+    crawler.get("LineageConfiguration").should.equal(lineage_configuration)
+    crawler.get("Configuration").should.equal(configuration)
+    crawler.get("CrawlerSecurityConfiguration").should.equal(
+        crawler_security_configuration
+    )
+
+    crawler.get("State").should.equal("READY")
+    crawler.get("CrawlElapsedTime").should.equal(0)
+    crawler.get("Version").should.equal(1)
+    if not settings.TEST_SERVER_MODE:
+        crawler.get("CreationTime").should.equal(FROZEN_CREATE_TIME)
+        crawler.get("LastUpdated").should.equal(FROZEN_CREATE_TIME)
+
+    crawler.should.not_have.key("LastCrawl")
+
+
+@mock_glue
+def test_create_crawler_already_exists():
+    client = boto3.client("glue", region_name="us-east-1")
+    name = "my_crawler_name"
+    helpers.create_crawler(client, name)
+
+    with pytest.raises(ClientError) as exc:
+        helpers.create_crawler(client, name)
+
+    exc.value.response["Error"]["Code"].should.equal("AlreadyExistsException")
+
+
+@mock_glue
+def test_get_crawler_not_exists():
+    client = boto3.client("glue", region_name="us-east-1")
+    name = "my_crawler_name"
+
+    with pytest.raises(ClientError) as exc:
+        client.get_crawler(Name=name)
+
+    exc.value.response["Error"]["Code"].should.equal("EntityNotFoundException")
+    exc.value.response["Error"]["Message"].should.match(
+        "Crawler my_crawler_name not found"
+    )
+
+
+@mock_glue
+def test_get_crawlers_empty():
+    client = boto3.client("glue", region_name="us-east-1")
+    response = client.get_crawlers()
+    response["Crawlers"].should.have.length_of(0)
+
+
+@mock_glue
+def test_get_crawlers_several_items():
+    client = boto3.client("glue", region_name="us-east-1")
+    name_1, name_2 = "my_crawler_name_1", "my_crawler_name_2"
+
+    helpers.create_crawler(client, name_1)
+    helpers.create_crawler(client, name_2)
+
+    crawlers = sorted(client.get_crawlers()["Crawlers"], key=lambda x: x["Name"])
+    crawlers.should.have.length_of(2)
+    crawlers[0].get("Name").should.equal(name_1)
+    crawlers[1].get("Name").should.equal(name_2)
+
+
+@mock_glue
+def test_delete_crawler():
+    client = boto3.client("glue", region_name="us-east-1")
+    name = "my_crawler_name"
+    helpers.create_crawler(client, name)
+
+    result = client.delete_crawler(Name=name)
+    result["ResponseMetadata"]["HTTPStatusCode"].should.equal(200)
+
+    # confirm crawler is deleted
+    with pytest.raises(ClientError) as exc:
+        client.get_crawler(Name=name)
+
+    exc.value.response["Error"]["Code"].should.equal("EntityNotFoundException")
+    exc.value.response["Error"]["Message"].should.match(
+        "Crawler my_crawler_name not found"
+    )
+
+
+@mock_glue
+def test_delete_crawler_not_exists():
+    client = boto3.client("glue", region_name="us-east-1")
+    name = "my_crawler_name"
+
+    with pytest.raises(ClientError) as exc:
+        client.delete_crawler(Name=name)
+
+    exc.value.response["Error"]["Code"].should.equal("EntityNotFoundException")
+    exc.value.response["Error"]["Message"].should.match(
+        "Crawler my_crawler_name not found"
+    )
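
For reference, a minimal usage sketch of the crawler endpoints mocked above (illustrative only, not part of the patch; the crawler name, role ARN, and bucket path are placeholder values):

import boto3
from moto import mock_glue


@mock_glue
def crawler_round_trip():
    # Uses only the operations implemented in this patch:
    # create_crawler, get_crawler, get_crawlers, delete_crawler.
    client = boto3.client("glue", region_name="us-east-1")
    client.create_crawler(
        Name="example_crawler",
        Role="arn:aws:iam::123456789012:role/Glue/Role",
        Targets={"S3Targets": [{"Path": "s3://example-bucket/"}]},
    )
    crawler = client.get_crawler(Name="example_crawler")["Crawler"]
    assert crawler["State"] == "READY"
    assert len(client.get_crawlers()["Crawlers"]) == 1
    client.delete_crawler(Name="example_crawler")
    assert client.get_crawlers()["Crawlers"] == []


crawler_round_trip()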