list crawler implementation (#5123)

This commit is contained in:
joshuaghezzi 2022-05-12 22:17:17 +12:00 committed by GitHub
parent 31737bc81e
commit 1cb2c80bf2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 104 additions and 4 deletions

View File

@ -24,6 +24,12 @@ from ..utilities.paginator import paginate
class GlueBackend(BaseBackend): class GlueBackend(BaseBackend):
PAGINATION_MODEL = { PAGINATION_MODEL = {
"list_crawlers": {
"input_token": "next_token",
"limit_key": "max_results",
"limit_default": 100,
"unique_attribute": "name",
},
"list_jobs": { "list_jobs": {
"input_token": "next_token", "input_token": "next_token",
"limit_key": "max_results", "limit_key": "max_results",
@ -143,6 +149,10 @@ class GlueBackend(BaseBackend):
def get_crawlers(self):
    """Return the crawler objects stored in ``self.crawlers`` as a list.

    An empty list is returned when ``self.crawlers`` is empty or otherwise
    falsy.
    """
    if not self.crawlers:
        return []
    return [self.crawlers[crawler_key] for crawler_key in self.crawlers]
@paginate(pagination_model=PAGINATION_MODEL)
def list_crawlers(self):
    """Return every crawler object held in ``self.crawlers``.

    The method body returns the complete, unsliced list; the ``paginate``
    decorator wraps it using the ``"list_crawlers"`` entry of
    ``PAGINATION_MODEL``.
    """
    # NOTE(review): the response layer calls this with ``next_token`` and
    # ``max_results`` keywords and unpacks a ``(items, next_token)`` pair --
    # presumably supplied by the decorator; confirm against ``paginate``.
    return list(self.crawlers.values())
def start_crawler(self, name): def start_crawler(self, name):
crawler = self.get_crawler(name) crawler = self.get_crawler(name)
crawler.start_crawler() crawler.start_crawler()
@ -380,6 +390,9 @@ class FakeCrawler(BaseModel):
self.crawl_elapsed_time = 0 self.crawl_elapsed_time = 0
self.last_crawl_info = None self.last_crawl_info = None
def get_name(self):
    """Return this object's ``name`` attribute."""
    return self.name
def as_dict(self): def as_dict(self):
last_crawl = self.last_crawl_info.as_dict() if self.last_crawl_info else None last_crawl = self.last_crawl_info.as_dict() if self.last_crawl_info else None
data = { data = {

View File

@ -307,6 +307,30 @@ class GlueResponse(BaseResponse):
crawlers = self.glue_backend.get_crawlers() crawlers = self.glue_backend.get_crawlers()
return json.dumps({"Crawlers": [crawler.as_dict() for crawler in crawlers]}) return json.dumps({"Crawlers": [crawler.as_dict() for crawler in crawlers]})
def list_crawlers(self):
    """Handle a ListCrawlers request and return the JSON response body.

    Reads the ``NextToken``, ``MaxResults`` and ``Tags`` request parameters,
    asks the backend for one page of crawlers, filters that page by tags and
    serialises the resulting names together with the follow-up token.

    Returns:
        A JSON string with the keys ``CrawlerNames`` and ``NextToken``.
    """
    requested_token = self._get_param("NextToken")
    page_size = self._get_int_param("MaxResults")
    tag_filter = self._get_param("Tags")

    # The backend call yields the page of crawlers plus the follow-up token.
    page, following_token = self.glue_backend.list_crawlers(
        next_token=requested_token, max_results=page_size
    )

    # Tag filtering is applied to the page that was already fetched.
    matching_names = self.filter_crawlers_by_tags(page, tag_filter)
    payload = {
        "CrawlerNames": list(matching_names),
        "NextToken": following_token,
    }
    return json.dumps(payload)
def filter_crawlers_by_tags(self, crawlers, tags):
    """Return the names of the crawlers that pass the tag filter.

    Args:
        crawlers: Iterable of objects exposing ``get_name()`` and ``tags``.
        tags: Tag filter. When falsy, no filtering is done and every
            crawler's name is returned.

    Returns:
        A list of crawler names, in the order the crawlers were given.
    """
    return [
        candidate.get_name()
        for candidate in crawlers
        # ``not tags`` short-circuits, so ``candidate.tags`` is only read and
        # ``is_tags_match`` only called when a filter was actually supplied.
        if not tags or self.is_tags_match(candidate.tags, tags)
    ]
def start_crawler(self): def start_crawler(self):
name = self.parameters.get("Name") name = self.parameters.get("Name")
self.glue_backend.start_crawler(name) self.glue_backend.start_crawler(name)
@ -402,9 +426,9 @@ class GlueResponse(BaseResponse):
return [job.get_name() for job in jobs if self.is_tags_match(job.tags, tags)] return [job.get_name() for job in jobs if self.is_tags_match(job.tags, tags)]
@staticmethod @staticmethod
def is_tags_match(job_tags, tags): def is_tags_match(glue_resource_tags, tags):
mutual_keys = set(job_tags).intersection(tags) mutual_keys = set(glue_resource_tags).intersection(tags)
for key in mutual_keys: for key in mutual_keys:
if job_tags[key] == tags[key]: if glue_resource_tags[key] == tags[key]:
return True return True
return False return False

View File

@ -193,7 +193,7 @@ def test_list_jobs_with_tags():
@mock_glue @mock_glue
def test_next_token_logic_does_not_create_infinite_loop(): def test_list_jobs_next_token_logic_does_not_create_infinite_loop():
client = create_glue_client() client = create_glue_client()
create_test_jobs(client, 4) create_test_jobs(client, 4)
first_response = client.list_jobs(MaxResults=1) first_response = client.list_jobs(MaxResults=1)
@ -228,3 +228,66 @@ def create_test_job_w_all_attributes(client, **job_attributes):
def create_test_jobs(client, number_of_jobs):
    """Call ``create_test_job(client)`` ``number_of_jobs`` times."""
    for _unused in range(number_of_jobs):
        create_test_job(client)
def create_test_crawler(client, tags=None):
    """Create a crawler with a random UUID name and return that name.

    Args:
        client: Object exposing ``create_crawler`` (the Glue client in the
            tests).
        tags: Optional tag mapping; an empty dict is sent when it is falsy.

    Returns:
        The generated crawler name.
    """
    crawler_name = str(uuid4())
    s3_targets = {"S3Targets": [{"Path": "s3://tests3target"}]}
    client.create_crawler(
        Name=crawler_name,
        Role="test_role",
        Targets=s3_targets,
        Tags=tags or {},
    )
    return crawler_name
def create_test_crawlers(client, number_of_crawlers):
    """Call ``create_test_crawler(client)`` ``number_of_crawlers`` times."""
    for _unused in range(number_of_crawlers):
        create_test_crawler(client)
@mock_glue
def test_list_crawlers_with_max_results():
    """With four crawlers and MaxResults=2, expect two names and a NextToken."""
    glue = create_glue_client()
    create_test_crawlers(glue, 4)

    page = glue.list_crawlers(MaxResults=2)

    page["CrawlerNames"].should.have.length_of(2)
    page.should.have.key("NextToken")
@mock_glue
def test_list_crawlers_from_next_token():
    """After a first page of 3 out of 10, the follow-up call returns 7."""
    glue = create_glue_client()
    create_test_crawlers(glue, 10)

    first_page = glue.list_crawlers(MaxResults=3)
    # The follow-up call passes only the token, no MaxResults.
    second_page = glue.list_crawlers(NextToken=first_page["NextToken"])

    second_page["CrawlerNames"].should.have.length_of(7)
@mock_glue
def test_list_crawlers_with_max_results_greater_than_actual_results():
    """With four crawlers and MaxResults=10, all four names are returned."""
    glue = create_glue_client()
    create_test_crawlers(glue, 4)

    page = glue.list_crawlers(MaxResults=10)

    page["CrawlerNames"].should.have.length_of(4)
@mock_glue
def test_list_crawlers_with_tags():
    """Of one untagged and one tagged crawler, the tag filter returns one."""
    glue = create_glue_client()
    wanted_tags = {"string": "string"}
    create_test_crawler(glue)
    create_test_crawler(glue, {"string": "string"})

    page = glue.list_crawlers(Tags=wanted_tags)

    page["CrawlerNames"].should.have.length_of(1)
@mock_glue
def test_list_crawlers_next_token_logic_does_not_create_infinite_loop():
    """Following NextToken must terminate once every crawler has been listed.

    The loop is bounded by the number of crawlers created. If pagination
    ever keeps handing out tokens, the test fails on the final assertion
    instead of hanging the test run, which an unbounded ``while`` would do.
    """
    number_of_crawlers = 4
    client = create_glue_client()
    create_test_crawlers(client, number_of_crawlers)
    first_response = client.list_crawlers(MaxResults=1)
    next_token = first_response["NextToken"]

    # Each follow-up page should return at least one crawler, so fetching
    # more pages than there are crawlers means the token logic is looping.
    for _ in range(number_of_crawlers):
        if not next_token:
            break
        response = client.list_crawlers(NextToken=next_token)
        next_token = response.get("NextToken")

    assert not next_token