Glue - Implemented create_crawler, get_crawler, get_crawlers, delete_crawler. Updated IMPLEMENTATION_COVERAGE.md. (#4222)

Mark Woods 2021-08-26 10:49:41 +01:00 committed by GitHub
parent cbbeaff23e
commit 21021a6a03
6 changed files with 494 additions and 10 deletions

IMPLEMENTATION_COVERAGE.md

@@ -1027,7 +1027,7 @@
## batch
<details>
-<summary>78% implemented</summary>
+<summary>84% implemented</summary>
- [X] cancel_job
- [X] create_compute_environment
@@ -3797,7 +3797,7 @@
- [ ] modify_vpc_endpoint_connection_notification
- [ ] modify_vpc_endpoint_service_configuration
- [ ] modify_vpc_endpoint_service_permissions
-- [ ] modify_vpc_peering_connection_options
+- [X] modify_vpc_peering_connection_options
- [X] modify_vpc_tenancy
- [ ] modify_vpn_connection
- [ ] modify_vpn_connection_options
@@ -4904,7 +4904,7 @@
## glue
<details>
-<summary>4% implemented</summary>
+<summary>7% implemented</summary>
- [ ] batch_create_partition
- [ ] batch_delete_connection
@@ -4923,7 +4923,7 @@
- [ ] check_schema_version_validity
- [ ] create_classifier
- [ ] create_connection
-- [ ] create_crawler
+- [X] create_crawler
- [X] create_database
- [ ] create_dev_endpoint
- [ ] create_job
@@ -4942,7 +4942,7 @@
- [ ] delete_column_statistics_for_partition
- [ ] delete_column_statistics_for_table
- [ ] delete_connection
-- [ ] delete_crawler
+- [X] delete_crawler
- [ ] delete_database
- [ ] delete_dev_endpoint
- [ ] delete_job
@@ -4966,9 +4966,9 @@
- [ ] get_column_statistics_for_table
- [ ] get_connection
- [ ] get_connections
-- [ ] get_crawler
+- [X] get_crawler
- [ ] get_crawler_metrics
-- [ ] get_crawlers
+- [X] get_crawlers
- [ ] get_data_catalog_encryption_settings
- [X] get_database
- [X] get_databases
@@ -10589,7 +10589,7 @@
## ssm
<details>
-<summary>16% implemented</summary>
+<summary>17% implemented</summary>
- [X] add_tags_to_resource
- [ ] associate_ops_item_related_item
@@ -10626,7 +10626,7 @@
- [ ] describe_automation_step_executions
- [ ] describe_available_patches
- [X] describe_document
-- [ ] describe_document_permission
+- [X] describe_document_permission
- [ ] describe_effective_instance_associations
- [ ] describe_effective_patches_for_patch_baseline
- [ ] describe_instance_associations_status
@@ -10692,7 +10692,7 @@
- [ ] list_resource_compliance_summaries
- [ ] list_resource_data_sync
- [X] list_tags_for_resource
-- [ ] modify_document_permission
+- [X] modify_document_permission
- [ ] put_compliance_items
- [ ] put_inventory
- [X] put_parameter

moto/glue/exceptions.py

@@ -28,6 +28,11 @@ class PartitionAlreadyExistsException(AlreadyExistsException):
super(PartitionAlreadyExistsException, self).__init__("Partition")
class CrawlerAlreadyExistsException(AlreadyExistsException):
def __init__(self):
super(CrawlerAlreadyExistsException, self).__init__("Crawler")
class EntityNotFoundException(GlueClientError):
def __init__(self, msg):
super(GlueClientError, self).__init__("EntityNotFoundException", msg)
@@ -48,6 +53,13 @@ class PartitionNotFoundException(EntityNotFoundException):
super(PartitionNotFoundException, self).__init__("Cannot find partition.")
class CrawlerNotFoundException(EntityNotFoundException):
def __init__(self, crawler):
super(CrawlerNotFoundException, self).__init__(
"Crawler %s not found." % crawler
)
class VersionNotFoundException(EntityNotFoundException):
def __init__(self):
super(VersionNotFoundException, self).__init__("Version not found.")

moto/glue/models.py

@@ -7,6 +7,8 @@ from moto.core import BaseBackend, BaseModel
from collections import OrderedDict
from .exceptions import (
JsonRESTError,
CrawlerAlreadyExistsException,
CrawlerNotFoundException,
DatabaseAlreadyExistsException,
DatabaseNotFoundException,
TableAlreadyExistsException,
@@ -20,6 +22,7 @@ from .exceptions import (
class GlueBackend(BaseBackend):
def __init__(self):
self.databases = OrderedDict()
self.crawlers = OrderedDict()
def create_database(self, database_name, database_input):
if database_name in self.databases:
@@ -67,6 +70,59 @@ class GlueBackend(BaseBackend):
raise TableNotFoundException(table_name)
return {}
def create_crawler(
self,
name,
role,
database_name,
description,
targets,
schedule,
classifiers,
table_prefix,
schema_change_policy,
recrawl_policy,
lineage_configuration,
configuration,
crawler_security_configuration,
tags,
):
if name in self.crawlers:
raise CrawlerAlreadyExistsException()
crawler = FakeCrawler(
name=name,
role=role,
database_name=database_name,
description=description,
targets=targets,
schedule=schedule,
classifiers=classifiers,
table_prefix=table_prefix,
schema_change_policy=schema_change_policy,
recrawl_policy=recrawl_policy,
lineage_configuration=lineage_configuration,
configuration=configuration,
crawler_security_configuration=crawler_security_configuration,
tags=tags,
)
self.crawlers[name] = crawler
def get_crawler(self, name):
try:
return self.crawlers[name]
except KeyError:
raise CrawlerNotFoundException(name)
def get_crawlers(self):
return list(self.crawlers.values())
def delete_crawler(self, name):
try:
del self.crawlers[name]
except KeyError:
raise CrawlerNotFoundException(name)
class FakeDatabase(BaseModel):
def __init__(self, database_name, database_input):
@@ -177,4 +233,100 @@ class FakePartition(BaseModel):
return obj
class FakeCrawler(BaseModel):
def __init__(
self,
name,
role,
database_name,
description,
targets,
schedule,
classifiers,
table_prefix,
schema_change_policy,
recrawl_policy,
lineage_configuration,
configuration,
crawler_security_configuration,
tags,
):
self.name = name
self.role = role
self.database_name = database_name
self.description = description
self.targets = targets
self.schedule = schedule
self.classifiers = classifiers
self.table_prefix = table_prefix
self.schema_change_policy = schema_change_policy
self.recrawl_policy = recrawl_policy
self.lineage_configuration = lineage_configuration
self.configuration = configuration
self.crawler_security_configuration = crawler_security_configuration
self.tags = tags
self.state = "READY"
self.creation_time = datetime.utcnow()
self.last_updated = self.creation_time
self.version = 1
self.crawl_elapsed_time = 0
self.last_crawl_info = None
def as_dict(self):
data = {
"Name": self.name,
"Role": self.role,
"Targets": self.targets,
"DatabaseName": self.database_name,
"Description": self.description,
"Classifiers": self.classifiers,
"RecrawlPolicy": self.recrawl_policy,
"SchemaChangePolicy": self.schema_change_policy,
"LineageConfiguration": self.lineage_configuration,
"State": self.state,
"TablePrefix": self.table_prefix,
"CrawlElapsedTime": self.crawl_elapsed_time,
"CreationTime": self.creation_time.isoformat(),
"LastUpdated": self.last_updated.isoformat(),
"LastCrawl": last_crawl,
"Version": self.version,
"Configuration": self.configuration,
"CrawlerSecurityConfiguration": self.crawler_security_configuration,
}
if self.schedule:
data["Schedule"] = {
"ScheduleExpression": self.schedule,
"State": "SCHEDULED",
}
if self.last_crawl_info:
data["LastCrawl"] = self.last_crawl_info.as_dict()
return data
class LastCrawlInfo(BaseModel):
def __init__(
self, error_message, log_group, log_stream, message_prefix, start_time, status,
):
self.error_message = error_message
self.log_group = log_group
self.log_stream = log_stream
self.message_prefix = message_prefix
self.start_time = start_time
self.status = status
def as_dict(self):
return {
"ErrorMessage": self.error_message,
"LogGroup": self.log_group,
"LogStream": self.log_stream,
"MessagePrefix": self.message_prefix,
"StartTime": self.start_time,
"Status": self.status,
}
glue_backend = GlueBackend()
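
The backend keeps crawlers in an OrderedDict keyed by name, so create_crawler, get_crawler and delete_crawler reduce to guarded dict operations. A minimal sketch of driving the module-level glue_backend directly, using only the signatures visible in this file (every value below is illustrative):

from moto.glue.models import glue_backend

# create_crawler takes every parameter explicitly; the responses layer
# passes None for anything the client omitted, so we do the same here.
glue_backend.create_crawler(
    name="example",
    role="arn:aws:iam::123456789012:role/Glue/Role",
    database_name=None,
    description=None,
    targets={"S3Targets": [{"Path": "s3://example-bucket/"}]},
    schedule=None,
    classifiers=None,
    table_prefix=None,
    schema_change_policy=None,
    recrawl_policy=None,
    lineage_configuration=None,
    configuration=None,
    crawler_security_configuration=None,
    tags=None,
)

assert glue_backend.get_crawler("example").state == "READY"
assert [c.name for c in glue_backend.get_crawlers()] == ["example"]
glue_backend.delete_crawler("example")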

moto/glue/responses.py

@@ -274,3 +274,38 @@ class GlueResponse(BaseResponse):
out["Errors"] = errors_output
return json.dumps(out)
def create_crawler(self):
self.glue_backend.create_crawler(
name=self.parameters.get("Name"),
role=self.parameters.get("Role"),
database_name=self.parameters.get("DatabaseName"),
description=self.parameters.get("Description"),
targets=self.parameters.get("Targets"),
schedule=self.parameters.get("Schedule"),
classifiers=self.parameters.get("Classifiers"),
table_prefix=self.parameters.get("TablePrefix"),
schema_change_policy=self.parameters.get("SchemaChangePolicy"),
recrawl_policy=self.parameters.get("RecrawlPolicy"),
lineage_configuration=self.parameters.get("LineageConfiguration"),
configuration=self.parameters.get("Configuration"),
crawler_security_configuration=self.parameters.get(
"CrawlerSecurityConfiguration"
),
tags=self.parameters.get("Tags"),
)
return ""
def get_crawler(self):
name = self.parameters.get("Name")
crawler = self.glue_backend.get_crawler(name)
return json.dumps({"Crawler": crawler.as_dict()})
def get_crawlers(self):
crawlers = self.glue_backend.get_crawlers()
return json.dumps({"Crawlers": [crawler.as_dict() for crawler in crawlers]})
def delete_crawler(self):
name = self.parameters.get("Name")
self.glue_backend.delete_crawler(name)
return ""

tests/test_glue/helpers.py

@@ -106,3 +106,43 @@ def get_partition(client, database_name, table_name, values):
return client.get_partition(
DatabaseName=database_name, TableName=table_name, PartitionValues=values
)
def create_crawler(
client, crawler_name, crawler_role=None, crawler_targets=None, **kwargs
):
optional_param_map = {
"database_name": "DatabaseName",
"description": "Description",
"schedule": "Schedule",
"classifiers": "Classifiers",
"table_prefix": "TablePrefix",
"schema_change_policy": "SchemaChangePolicy",
"recrawl_policy": "RecrawlPolicy",
"lineage_configuration": "LineageConfiguration",
"configuration": "Configuration",
"crawler_security_configuration": "CrawlerSecurityConfiguration",
"tags": "Tags",
}
params = {
boto3_key: kwargs.get(key)
for key, boto3_key in optional_param_map.items()
if kwargs.get(key) is not None
}
if crawler_role is None:
crawler_role = "arn:aws:iam::123456789012:role/Glue/Role"
if crawler_targets is None:
crawler_targets = {
"S3Targets": [],
"JdbcTargets": [],
"MongoDBTargets": [],
"DynamoDBTargets": [],
"CatalogTargets": [],
}
return client.create_crawler(
Name=crawler_name, Role=crawler_role, Targets=crawler_targets, **params,
)

tests/test_glue/test_datacatalog.py

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import sure # noqa
import re
import pytest
import json
import boto3
from botocore.client import ClientError
@@ -905,3 +906,247 @@ def test_batch_delete_partition_with_bad_partitions():
["2018-11-01"].should.be.within(error_partitions)
["2018-11-02"].should.be.within(error_partitions)
["2018-11-03"].should.be.within(error_partitions)
@mock_glue
@freeze_time(FROZEN_CREATE_TIME)
def test_create_crawler_scheduled():
client = boto3.client("glue", region_name="us-east-1")
name = "my_crawler_name"
role = "arn:aws:iam::123456789012:role/Glue/Role"
database_name = "my_database_name"
description = "my crawler description"
targets = {
"S3Targets": [{"Path": "s3://my-source-bucket/"}],
"JdbcTargets": [],
"MongoDBTargets": [],
"DynamoDBTargets": [],
"CatalogTargets": [],
}
schedule = "cron(15 12 * * ? *)"
classifiers = []
table_prefix = "my_table_prefix_"
schema_change_policy = {
"UpdateBehavior": "LOG",
"DeleteBehavior": "LOG",
}
recrawl_policy = {"RecrawlBehavior": "CRAWL_NEW_FOLDERS_ONLY"}
lineage_configuration = {"CrawlerLineageSettings": "DISABLE"}
configuration = json.dumps(
{
"Version": 1.0,
"CrawlerOutput": {
"Partitions": {"AddOrUpdateBehavior": "InheritFromTable"},
},
"Grouping": {"TableGroupingPolicy": "CombineCompatibleSchemas"},
}
)
crawler_security_configuration = "my_security_configuration"
tags = {"tag_key": "tag_value"}
helpers.create_crawler(
client,
name,
role,
targets,
database_name=database_name,
description=description,
schedule=schedule,
classifiers=classifiers,
table_prefix=table_prefix,
schema_change_policy=schema_change_policy,
recrawl_policy=recrawl_policy,
lineage_configuration=lineage_configuration,
configuration=configuration,
crawler_security_configuration=crawler_security_configuration,
tags=tags,
)
response = client.get_crawler(Name=name)
crawler = response["Crawler"]
crawler.get("Name").should.equal(name)
crawler.get("Role").should.equal(role)
crawler.get("DatabaseName").should.equal(database_name)
crawler.get("Description").should.equal(description)
crawler.get("Targets").should.equal(targets)
crawler.get("Schedule").should.equal(
{"ScheduleExpression": schedule, "State": "SCHEDULED"}
)
crawler.get("Classifiers").should.equal(classifiers)
crawler.get("TablePrefix").should.equal(table_prefix)
crawler.get("SchemaChangePolicy").should.equal(schema_change_policy)
crawler.get("RecrawlPolicy").should.equal(recrawl_policy)
crawler.get("LineageConfiguration").should.equal(lineage_configuration)
crawler.get("Configuration").should.equal(configuration)
crawler.get("CrawlerSecurityConfiguration").should.equal(
crawler_security_configuration
)
crawler.get("State").should.equal("READY")
crawler.get("CrawlElapsedTime").should.equal(0)
crawler.get("Version").should.equal(1)
if not settings.TEST_SERVER_MODE:
crawler.get("CreationTime").should.equal(FROZEN_CREATE_TIME)
crawler.get("LastUpdated").should.equal(FROZEN_CREATE_TIME)
crawler.should.not_have.key("LastCrawl")
@mock_glue
@freeze_time(FROZEN_CREATE_TIME)
def test_create_crawler_unscheduled():
client = boto3.client("glue", region_name="us-east-1")
name = "my_crawler_name"
role = "arn:aws:iam::123456789012:role/Glue/Role"
database_name = "my_database_name"
description = "my crawler description"
targets = {
"S3Targets": [{"Path": "s3://my-source-bucket/"}],
"JdbcTargets": [],
"MongoDBTargets": [],
"DynamoDBTargets": [],
"CatalogTargets": [],
}
classifiers = []
table_prefix = "my_table_prefix_"
schema_change_policy = {
"UpdateBehavior": "LOG",
"DeleteBehavior": "LOG",
}
recrawl_policy = {"RecrawlBehavior": "CRAWL_NEW_FOLDERS_ONLY"}
lineage_configuration = {"CrawlerLineageSettings": "DISABLE"}
configuration = json.dumps(
{
"Version": 1.0,
"CrawlerOutput": {
"Partitions": {"AddOrUpdateBehavior": "InheritFromTable"},
},
"Grouping": {"TableGroupingPolicy": "CombineCompatibleSchemas"},
}
)
crawler_security_configuration = "my_security_configuration"
tags = {"tag_key": "tag_value"}
helpers.create_crawler(
client,
name,
role,
targets,
database_name=database_name,
description=description,
classifiers=classifiers,
table_prefix=table_prefix,
schema_change_policy=schema_change_policy,
recrawl_policy=recrawl_policy,
lineage_configuration=lineage_configuration,
configuration=configuration,
crawler_security_configuration=crawler_security_configuration,
tags=tags,
)
response = client.get_crawler(Name=name)
crawler = response["Crawler"]
crawler.get("Name").should.equal(name)
crawler.get("Role").should.equal(role)
crawler.get("DatabaseName").should.equal(database_name)
crawler.get("Description").should.equal(description)
crawler.get("Targets").should.equal(targets)
crawler.should.not_have.key("Schedule")
crawler.get("Classifiers").should.equal(classifiers)
crawler.get("TablePrefix").should.equal(table_prefix)
crawler.get("SchemaChangePolicy").should.equal(schema_change_policy)
crawler.get("RecrawlPolicy").should.equal(recrawl_policy)
crawler.get("LineageConfiguration").should.equal(lineage_configuration)
crawler.get("Configuration").should.equal(configuration)
crawler.get("CrawlerSecurityConfiguration").should.equal(
crawler_security_configuration
)
crawler.get("State").should.equal("READY")
crawler.get("CrawlElapsedTime").should.equal(0)
crawler.get("Version").should.equal(1)
if not settings.TEST_SERVER_MODE:
crawler.get("CreationTime").should.equal(FROZEN_CREATE_TIME)
crawler.get("LastUpdated").should.equal(FROZEN_CREATE_TIME)
crawler.should.not_have.key("LastCrawl")
@mock_glue
def test_create_crawler_already_exists():
client = boto3.client("glue", region_name="us-east-1")
name = "my_crawler_name"
helpers.create_crawler(client, name)
with pytest.raises(ClientError) as exc:
helpers.create_crawler(client, name)
exc.value.response["Error"]["Code"].should.equal("AlreadyExistsException")
@mock_glue
def test_get_crawler_not_exists():
client = boto3.client("glue", region_name="us-east-1")
name = "my_crawler_name"
with pytest.raises(ClientError) as exc:
client.get_crawler(Name=name)
exc.value.response["Error"]["Code"].should.equal("EntityNotFoundException")
exc.value.response["Error"]["Message"].should.match(
"Crawler my_crawler_name not found"
)
@mock_glue
def test_get_crawlers_empty():
client = boto3.client("glue", region_name="us-east-1")
response = client.get_crawlers()
response["Crawlers"].should.have.length_of(0)
@mock_glue
def test_get_crawlers_several_items():
client = boto3.client("glue", region_name="us-east-1")
name_1, name_2 = "my_crawler_name_1", "my_crawler_name_2"
helpers.create_crawler(client, name_1)
helpers.create_crawler(client, name_2)
crawlers = sorted(client.get_crawlers()["Crawlers"], key=lambda x: x["Name"])
crawlers.should.have.length_of(2)
crawlers[0].get("Name").should.equal(name_1)
crawlers[1].get("Name").should.equal(name_2)
@mock_glue
def test_delete_crawler():
client = boto3.client("glue", region_name="us-east-1")
name = "my_crawler_name"
helpers.create_crawler(client, name)
result = client.delete_crawler(Name=name)
result["ResponseMetadata"]["HTTPStatusCode"].should.equal(200)
# confirm crawler is deleted
with pytest.raises(ClientError) as exc:
client.get_crawler(Name=name)
exc.value.response["Error"]["Code"].should.equal("EntityNotFoundException")
exc.value.response["Error"]["Message"].should.match(
"Crawler my_crawler_name not found"
)
@mock_glue
def test_delete_crawler_not_exists():
client = boto3.client("glue", region_name="us-east-1")
name = "my_crawler_name"
with pytest.raises(ClientError) as exc:
client.delete_crawler(Name=name)
exc.value.response["Error"]["Code"].should.equal("EntityNotFoundException")
exc.value.response["Error"]["Message"].should.match(
"Crawler my_crawler_name not found"
)