Batch - implement attemptDurationSeconds (#4636)

* Batch - implement attemptDurationSeconds

* Batch tests - make job def names unique
This commit is contained in:
Bert Blommers 2021-11-27 05:25:53 -01:00 committed by GitHub
parent 1ac9b9949d
commit 41de9b82ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 134 additions and 9 deletions

View File

@ -200,6 +200,7 @@ class JobDefinition(CloudFormationModel):
tags={},
revision=0,
retry_strategy=0,
timeout=None,
):
self.name = name
self.retries = retry_strategy
@ -210,9 +211,8 @@ class JobDefinition(CloudFormationModel):
self.arn = None
self.status = "ACTIVE"
self.tagger = TaggingService()
if parameters is None:
parameters = {}
self.parameters = parameters
self.parameters = parameters or {}
self.timeout = timeout
self._validate()
self._update_arn()
@ -295,7 +295,9 @@ class JobDefinition(CloudFormationModel):
if vcpus < 1:
raise ClientException("container vcpus limit must be greater than 0")
def update(self, parameters, _type, container_properties, retry_strategy, tags):
def update(
self, parameters, _type, container_properties, retry_strategy, tags, timeout
):
if parameters is None:
parameters = self.parameters
@ -317,6 +319,7 @@ class JobDefinition(CloudFormationModel):
revision=self.revision,
retry_strategy=retry_strategy,
tags=tags,
timeout=timeout,
)
def describe(self):
@ -333,6 +336,8 @@ class JobDefinition(CloudFormationModel):
result["containerProperties"] = self.container_properties
if self.retries is not None and self.retries > 0:
result["retryStrategy"] = {"attempts": self.retries}
if self.timeout:
result["timeout"] = self.timeout
return result
@ -362,6 +367,7 @@ class JobDefinition(CloudFormationModel):
tags=lowercase_first_key(properties.get("Tags", {})),
retry_strategy=lowercase_first_key(properties["RetryStrategy"]),
container_properties=lowercase_first_key(properties["ContainerProperties"]),
timeout=lowercase_first_key(properties.get("timeout", {})),
)
arn = res[1]
@ -378,6 +384,7 @@ class Job(threading.Thread, BaseModel, DockerModel):
container_overrides,
depends_on,
all_jobs,
timeout,
):
"""
Docker Job
@ -405,6 +412,7 @@ class Job(threading.Thread, BaseModel, DockerModel):
self.job_stopped = False
self.job_stopped_reason = None
self.depends_on = depends_on
self.timeout = timeout
self.all_jobs = all_jobs
self.stop = False
@ -447,6 +455,8 @@ class Job(threading.Thread, BaseModel, DockerModel):
result["container"]["logStreamName"] = self.log_stream_name
if self.job_stopped_reason is not None:
result["statusReason"] = self.job_stopped_reason
if self.timeout:
result["timeout"] = self.timeout
return result
def _get_container_property(self, p, default):
@ -474,6 +484,13 @@ class Job(threading.Thread, BaseModel, DockerModel):
p, self.job_definition.container_properties.get(p, default)
)
def _get_attempt_duration(self):
    """
    Return the effective ``attemptDurationSeconds`` for this job.

    The timeout stored on the job itself takes precedence over the
    timeout stored on its job definition.

    :return: the configured ``attemptDurationSeconds`` value, or None when
        neither the job nor its job definition provides one
    """
    # Check the job-level timeout first, then the job definition's.
    for timeout in (self.timeout, self.job_definition.timeout):
        # A timeout dict that lacks the key is treated as "not configured"
        # instead of raising KeyError while the job is running.
        if timeout and "attemptDurationSeconds" in timeout:
            return timeout["attemptDurationSeconds"]
    return None
def run(self):
"""
Run the container.
@ -546,10 +563,23 @@ class Job(threading.Thread, BaseModel, DockerModel):
self.job_state = "RUNNING"
try:
container.reload()
max_time = None
if self._get_attempt_duration():
attempt_duration = self._get_attempt_duration()
max_time = self.job_started_at + datetime.timedelta(
seconds=attempt_duration
)
while container.status == "running" and not self.stop:
container.reload()
time.sleep(0.5)
if max_time and datetime.datetime.now() > max_time:
raise Exception(
"Job time exceeded the configured attemptDurationSeconds"
)
# Container should be stopped by this point... unless asked to stop
if container.status == "running":
container.kill()
@ -1266,7 +1296,14 @@ class BatchBackend(BaseBackend):
del self._job_queues[job_queue.arn]
def register_job_definition(
self, def_name, parameters, _type, tags, retry_strategy, container_properties
self,
def_name,
parameters,
_type,
tags,
retry_strategy,
container_properties,
timeout,
):
if def_name is None:
raise ClientException("jobDefinitionName must be provided")
@ -1288,11 +1325,12 @@ class BatchBackend(BaseBackend):
tags=tags,
region_name=self.region_name,
retry_strategy=retry_strategy,
timeout=timeout,
)
else:
# Make new jobdef
job_def = job_def.update(
parameters, _type, container_properties, retry_strategy, tags
parameters, _type, container_properties, retry_strategy, tags, timeout
)
self._job_definitions[job_def.arn] = job_def
@ -1347,6 +1385,7 @@ class BatchBackend(BaseBackend):
retries=None,
depends_on=None,
container_overrides=None,
timeout=None,
):
# TODO parameters, retries (which is a dict raw from request), job dependencies and container overrides are ignored for now
@ -1369,6 +1408,7 @@ class BatchBackend(BaseBackend):
container_overrides=container_overrides,
depends_on=depends_on,
all_jobs=self._jobs,
timeout=timeout,
)
self._jobs[job.job_id] = job

View File

@ -179,6 +179,7 @@ class BatchResponse(BaseResponse):
tags = self._get_param("tags")
retry_strategy = self._get_param("retryStrategy")
_type = self._get_param("type")
timeout = self._get_param("timeout")
try:
name, arn, revision = self.batch_backend.register_job_definition(
def_name=def_name,
@ -187,6 +188,7 @@ class BatchResponse(BaseResponse):
tags=tags,
retry_strategy=retry_strategy,
container_properties=container_properties,
timeout=timeout,
)
except AWSError as err:
return err.response()
@ -231,6 +233,7 @@ class BatchResponse(BaseResponse):
job_queue = self._get_param("jobQueue")
parameters = self._get_param("parameters")
retries = self._get_param("retryStrategy")
timeout = self._get_param("timeout")
try:
name, job_id = self.batch_backend.submit_job(
@ -241,6 +244,7 @@ class BatchResponse(BaseResponse):
retries=retries,
depends_on=depends_on,
container_overrides=container_overrides,
timeout=timeout,
)
except AWSError as err:
return err.response()

View File

@ -684,8 +684,9 @@ def test_update_job_definition():
"vcpus": 2,
}
job_def_name = str(uuid4())[0:6]
batch_client.register_job_definition(
jobDefinitionName="test-job",
jobDefinitionName=job_def_name,
type="container",
tags=tags[0],
parameters={},
@ -694,20 +695,100 @@ def test_update_job_definition():
container_props["memory"] = 2048
batch_client.register_job_definition(
jobDefinitionName="test-job",
jobDefinitionName=job_def_name,
type="container",
tags=tags[1],
parameters={},
containerProperties=container_props,
)
job_defs = batch_client.describe_job_definitions(jobDefinitionName="test-job")[
job_defs = batch_client.describe_job_definitions(jobDefinitionName=job_def_name)[
"jobDefinitions"
]
job_defs.should.have.length_of(2)
job_defs[0]["containerProperties"]["memory"].should.equal(1024)
job_defs[0]["tags"].should.equal(tags[0])
job_defs[0].shouldnt.have.key("timeout")
job_defs[1]["containerProperties"]["memory"].should.equal(2048)
job_defs[1]["tags"].should.equal(tags[1])
@mock_batch
def test_register_job_definition_with_timeout():
    """A timeout supplied at registration is returned by describe_job_definitions."""
    _, _, _, _, batch_client = _get_clients()

    # Random short name so this definition does not clash with other tests.
    definition_name = str(uuid4())[:6]
    expected_timeout = {"attemptDurationSeconds": 3}

    batch_client.register_job_definition(
        jobDefinitionName=definition_name,
        type="container",
        parameters={},
        containerProperties={"image": "amazonlinux", "memory": 1024, "vcpus": 2},
        timeout=expected_timeout,
    )

    described = batch_client.describe_job_definitions(
        jobDefinitionName=definition_name
    )["jobDefinitions"]
    described[0].should.have.key("timeout").equals(expected_timeout)
@mock_batch
@mock_ec2
@mock_iam
def test_submit_job_with_timeout():
    """A job submitted with a timeout shorter than its command ends up FAILED."""
    ec2_client, iam_client, _, _, batch_client = _get_clients()
    _, _, _, iam_arn = _setup(ec2_client, iam_client)

    definition_name = str(uuid4())[:6]
    # The command sleeps for 3 seconds, longer than the 1 second allowed below.
    sleep_command = ["sleep", "3"]
    job_def_arn, queue_arn = prepare_job(
        batch_client, sleep_command, iam_arn, definition_name
    )

    submit_kwargs = {
        "jobName": str(uuid4())[:6],
        "jobQueue": queue_arn,
        "jobDefinition": job_def_arn,
        "timeout": {"attemptDurationSeconds": 1},
    }
    job_id = batch_client.submit_job(**submit_kwargs)["jobId"]

    # The job runs longer than attemptDurationSeconds, so it is expected to fail.
    _wait_for_job_status(batch_client, job_id, "FAILED")
@mock_batch
@mock_ec2
@mock_iam
def test_submit_job_with_timeout_set_at_definition():
    """A job inherits the timeout of its job definition and ends up FAILED."""
    ec2_client, iam_client, _, _, batch_client = _get_clients()
    _, _, _, iam_arn = _setup(ec2_client, iam_client)

    definition_name = str(uuid4())[:6]
    # The command sleeps for 3 seconds, longer than the 1 second allowed below.
    sleep_command = ["sleep", "3"]
    # Only the queue ARN is kept; the first value returned by prepare_job is discarded.
    _, queue_arn = prepare_job(batch_client, sleep_command, iam_arn, definition_name)

    # Register a definition under the same name, this time carrying a timeout.
    container_properties = {
        "image": "busybox:latest",
        "vcpus": 1,
        "memory": 128,
        "command": sleep_command,
    }
    registration = batch_client.register_job_definition(
        jobDefinitionName=definition_name,
        type="container",
        containerProperties=container_properties,
        timeout={"attemptDurationSeconds": 1},
    )
    job_def_arn = registration["jobDefinitionArn"]

    # No timeout is passed at submit time; only the definition provides one.
    submission = batch_client.submit_job(
        jobName=str(uuid4())[:6],
        jobQueue=queue_arn,
        jobDefinition=job_def_arn,
    )
    job_id = submission["jobId"]

    # The job runs longer than attemptDurationSeconds, so it is expected to fail.
    _wait_for_job_status(batch_client, job_id, "FAILED")