Batch - implement attemptDurationSeconds (#4636)
* Batch - implement attemptDurationSeconds
* Batch tests - make job def names unique
This commit is contained in:
parent
1ac9b9949d
commit
41de9b82ac
@ -200,6 +200,7 @@ class JobDefinition(CloudFormationModel):
|
|||||||
tags={},
|
tags={},
|
||||||
revision=0,
|
revision=0,
|
||||||
retry_strategy=0,
|
retry_strategy=0,
|
||||||
|
timeout=None,
|
||||||
):
|
):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.retries = retry_strategy
|
self.retries = retry_strategy
|
||||||
@ -210,9 +211,8 @@ class JobDefinition(CloudFormationModel):
|
|||||||
self.arn = None
|
self.arn = None
|
||||||
self.status = "ACTIVE"
|
self.status = "ACTIVE"
|
||||||
self.tagger = TaggingService()
|
self.tagger = TaggingService()
|
||||||
if parameters is None:
|
self.parameters = parameters or {}
|
||||||
parameters = {}
|
self.timeout = timeout
|
||||||
self.parameters = parameters
|
|
||||||
|
|
||||||
self._validate()
|
self._validate()
|
||||||
self._update_arn()
|
self._update_arn()
|
||||||
@ -295,7 +295,9 @@ class JobDefinition(CloudFormationModel):
|
|||||||
if vcpus < 1:
|
if vcpus < 1:
|
||||||
raise ClientException("container vcpus limit must be greater than 0")
|
raise ClientException("container vcpus limit must be greater than 0")
|
||||||
|
|
||||||
def update(self, parameters, _type, container_properties, retry_strategy, tags):
|
def update(
|
||||||
|
self, parameters, _type, container_properties, retry_strategy, tags, timeout
|
||||||
|
):
|
||||||
if parameters is None:
|
if parameters is None:
|
||||||
parameters = self.parameters
|
parameters = self.parameters
|
||||||
|
|
||||||
@ -317,6 +319,7 @@ class JobDefinition(CloudFormationModel):
|
|||||||
revision=self.revision,
|
revision=self.revision,
|
||||||
retry_strategy=retry_strategy,
|
retry_strategy=retry_strategy,
|
||||||
tags=tags,
|
tags=tags,
|
||||||
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
|
|
||||||
def describe(self):
|
def describe(self):
|
||||||
@ -333,6 +336,8 @@ class JobDefinition(CloudFormationModel):
|
|||||||
result["containerProperties"] = self.container_properties
|
result["containerProperties"] = self.container_properties
|
||||||
if self.retries is not None and self.retries > 0:
|
if self.retries is not None and self.retries > 0:
|
||||||
result["retryStrategy"] = {"attempts": self.retries}
|
result["retryStrategy"] = {"attempts": self.retries}
|
||||||
|
if self.timeout:
|
||||||
|
result["timeout"] = self.timeout
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@ -362,6 +367,7 @@ class JobDefinition(CloudFormationModel):
|
|||||||
tags=lowercase_first_key(properties.get("Tags", {})),
|
tags=lowercase_first_key(properties.get("Tags", {})),
|
||||||
retry_strategy=lowercase_first_key(properties["RetryStrategy"]),
|
retry_strategy=lowercase_first_key(properties["RetryStrategy"]),
|
||||||
container_properties=lowercase_first_key(properties["ContainerProperties"]),
|
container_properties=lowercase_first_key(properties["ContainerProperties"]),
|
||||||
|
timeout=lowercase_first_key(properties.get("timeout", {})),
|
||||||
)
|
)
|
||||||
arn = res[1]
|
arn = res[1]
|
||||||
|
|
||||||
@ -378,6 +384,7 @@ class Job(threading.Thread, BaseModel, DockerModel):
|
|||||||
container_overrides,
|
container_overrides,
|
||||||
depends_on,
|
depends_on,
|
||||||
all_jobs,
|
all_jobs,
|
||||||
|
timeout,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Docker Job
|
Docker Job
|
||||||
@ -405,6 +412,7 @@ class Job(threading.Thread, BaseModel, DockerModel):
|
|||||||
self.job_stopped = False
|
self.job_stopped = False
|
||||||
self.job_stopped_reason = None
|
self.job_stopped_reason = None
|
||||||
self.depends_on = depends_on
|
self.depends_on = depends_on
|
||||||
|
self.timeout = timeout
|
||||||
self.all_jobs = all_jobs
|
self.all_jobs = all_jobs
|
||||||
|
|
||||||
self.stop = False
|
self.stop = False
|
||||||
@ -447,6 +455,8 @@ class Job(threading.Thread, BaseModel, DockerModel):
|
|||||||
result["container"]["logStreamName"] = self.log_stream_name
|
result["container"]["logStreamName"] = self.log_stream_name
|
||||||
if self.job_stopped_reason is not None:
|
if self.job_stopped_reason is not None:
|
||||||
result["statusReason"] = self.job_stopped_reason
|
result["statusReason"] = self.job_stopped_reason
|
||||||
|
if self.timeout:
|
||||||
|
result["timeout"] = self.timeout
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _get_container_property(self, p, default):
|
def _get_container_property(self, p, default):
|
||||||
@ -474,6 +484,13 @@ class Job(threading.Thread, BaseModel, DockerModel):
|
|||||||
p, self.job_definition.container_properties.get(p, default)
|
p, self.job_definition.container_properties.get(p, default)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _get_attempt_duration(self):
    """Return the effective ``attemptDurationSeconds`` for this job.

    A timeout supplied on the job itself takes precedence over the one
    configured on the job definition.

    Returns:
        The configured number of seconds, or ``None`` when neither the job
        nor its job definition specifies ``attemptDurationSeconds``.
    """
    # Read the key tolerantly: a timeout dict that lacks
    # "attemptDurationSeconds" must not raise KeyError; it simply means
    # "nothing configured at this level".
    job_timeout = self.timeout or {}
    job_level_duration = job_timeout.get("attemptDurationSeconds")
    if job_level_duration is not None:
        return job_level_duration

    # Only consult the job definition when the job itself gave no value.
    definition_timeout = self.job_definition.timeout or {}
    return definition_timeout.get("attemptDurationSeconds")
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
"""
|
"""
|
||||||
Run the container.
|
Run the container.
|
||||||
@ -546,10 +563,23 @@ class Job(threading.Thread, BaseModel, DockerModel):
|
|||||||
self.job_state = "RUNNING"
|
self.job_state = "RUNNING"
|
||||||
try:
|
try:
|
||||||
container.reload()
|
container.reload()
|
||||||
|
|
||||||
|
max_time = None
|
||||||
|
if self._get_attempt_duration():
|
||||||
|
attempt_duration = self._get_attempt_duration()
|
||||||
|
max_time = self.job_started_at + datetime.timedelta(
|
||||||
|
seconds=attempt_duration
|
||||||
|
)
|
||||||
|
|
||||||
while container.status == "running" and not self.stop:
|
while container.status == "running" and not self.stop:
|
||||||
container.reload()
|
container.reload()
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
if max_time and datetime.datetime.now() > max_time:
|
||||||
|
raise Exception(
|
||||||
|
"Job time exceeded the configured attemptDurationSeconds"
|
||||||
|
)
|
||||||
|
|
||||||
# Container should be stopped by this point... unless asked to stop
|
# Container should be stopped by this point... unless asked to stop
|
||||||
if container.status == "running":
|
if container.status == "running":
|
||||||
container.kill()
|
container.kill()
|
||||||
@ -1266,7 +1296,14 @@ class BatchBackend(BaseBackend):
|
|||||||
del self._job_queues[job_queue.arn]
|
del self._job_queues[job_queue.arn]
|
||||||
|
|
||||||
def register_job_definition(
|
def register_job_definition(
|
||||||
self, def_name, parameters, _type, tags, retry_strategy, container_properties
|
self,
|
||||||
|
def_name,
|
||||||
|
parameters,
|
||||||
|
_type,
|
||||||
|
tags,
|
||||||
|
retry_strategy,
|
||||||
|
container_properties,
|
||||||
|
timeout,
|
||||||
):
|
):
|
||||||
if def_name is None:
|
if def_name is None:
|
||||||
raise ClientException("jobDefinitionName must be provided")
|
raise ClientException("jobDefinitionName must be provided")
|
||||||
@ -1288,11 +1325,12 @@ class BatchBackend(BaseBackend):
|
|||||||
tags=tags,
|
tags=tags,
|
||||||
region_name=self.region_name,
|
region_name=self.region_name,
|
||||||
retry_strategy=retry_strategy,
|
retry_strategy=retry_strategy,
|
||||||
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Make new jobdef
|
# Make new jobdef
|
||||||
job_def = job_def.update(
|
job_def = job_def.update(
|
||||||
parameters, _type, container_properties, retry_strategy, tags
|
parameters, _type, container_properties, retry_strategy, tags, timeout
|
||||||
)
|
)
|
||||||
|
|
||||||
self._job_definitions[job_def.arn] = job_def
|
self._job_definitions[job_def.arn] = job_def
|
||||||
@ -1347,6 +1385,7 @@ class BatchBackend(BaseBackend):
|
|||||||
retries=None,
|
retries=None,
|
||||||
depends_on=None,
|
depends_on=None,
|
||||||
container_overrides=None,
|
container_overrides=None,
|
||||||
|
timeout=None,
|
||||||
):
|
):
|
||||||
# TODO parameters, retries (which is a dict raw from request), job dependencies and container overrides are ignored for now
|
# TODO parameters, retries (which is a dict raw from request), job dependencies and container overrides are ignored for now
|
||||||
|
|
||||||
@ -1369,6 +1408,7 @@ class BatchBackend(BaseBackend):
|
|||||||
container_overrides=container_overrides,
|
container_overrides=container_overrides,
|
||||||
depends_on=depends_on,
|
depends_on=depends_on,
|
||||||
all_jobs=self._jobs,
|
all_jobs=self._jobs,
|
||||||
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
self._jobs[job.job_id] = job
|
self._jobs[job.job_id] = job
|
||||||
|
|
||||||
|
@ -179,6 +179,7 @@ class BatchResponse(BaseResponse):
|
|||||||
tags = self._get_param("tags")
|
tags = self._get_param("tags")
|
||||||
retry_strategy = self._get_param("retryStrategy")
|
retry_strategy = self._get_param("retryStrategy")
|
||||||
_type = self._get_param("type")
|
_type = self._get_param("type")
|
||||||
|
timeout = self._get_param("timeout")
|
||||||
try:
|
try:
|
||||||
name, arn, revision = self.batch_backend.register_job_definition(
|
name, arn, revision = self.batch_backend.register_job_definition(
|
||||||
def_name=def_name,
|
def_name=def_name,
|
||||||
@ -187,6 +188,7 @@ class BatchResponse(BaseResponse):
|
|||||||
tags=tags,
|
tags=tags,
|
||||||
retry_strategy=retry_strategy,
|
retry_strategy=retry_strategy,
|
||||||
container_properties=container_properties,
|
container_properties=container_properties,
|
||||||
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
except AWSError as err:
|
except AWSError as err:
|
||||||
return err.response()
|
return err.response()
|
||||||
@ -231,6 +233,7 @@ class BatchResponse(BaseResponse):
|
|||||||
job_queue = self._get_param("jobQueue")
|
job_queue = self._get_param("jobQueue")
|
||||||
parameters = self._get_param("parameters")
|
parameters = self._get_param("parameters")
|
||||||
retries = self._get_param("retryStrategy")
|
retries = self._get_param("retryStrategy")
|
||||||
|
timeout = self._get_param("timeout")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
name, job_id = self.batch_backend.submit_job(
|
name, job_id = self.batch_backend.submit_job(
|
||||||
@ -241,6 +244,7 @@ class BatchResponse(BaseResponse):
|
|||||||
retries=retries,
|
retries=retries,
|
||||||
depends_on=depends_on,
|
depends_on=depends_on,
|
||||||
container_overrides=container_overrides,
|
container_overrides=container_overrides,
|
||||||
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
except AWSError as err:
|
except AWSError as err:
|
||||||
return err.response()
|
return err.response()
|
||||||
|
@ -684,8 +684,9 @@ def test_update_job_definition():
|
|||||||
"vcpus": 2,
|
"vcpus": 2,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
job_def_name = str(uuid4())[0:6]
|
||||||
batch_client.register_job_definition(
|
batch_client.register_job_definition(
|
||||||
jobDefinitionName="test-job",
|
jobDefinitionName=job_def_name,
|
||||||
type="container",
|
type="container",
|
||||||
tags=tags[0],
|
tags=tags[0],
|
||||||
parameters={},
|
parameters={},
|
||||||
@ -694,20 +695,100 @@ def test_update_job_definition():
|
|||||||
|
|
||||||
container_props["memory"] = 2048
|
container_props["memory"] = 2048
|
||||||
batch_client.register_job_definition(
|
batch_client.register_job_definition(
|
||||||
jobDefinitionName="test-job",
|
jobDefinitionName=job_def_name,
|
||||||
type="container",
|
type="container",
|
||||||
tags=tags[1],
|
tags=tags[1],
|
||||||
parameters={},
|
parameters={},
|
||||||
containerProperties=container_props,
|
containerProperties=container_props,
|
||||||
)
|
)
|
||||||
|
|
||||||
job_defs = batch_client.describe_job_definitions(jobDefinitionName="test-job")[
|
job_defs = batch_client.describe_job_definitions(jobDefinitionName=job_def_name)[
|
||||||
"jobDefinitions"
|
"jobDefinitions"
|
||||||
]
|
]
|
||||||
job_defs.should.have.length_of(2)
|
job_defs.should.have.length_of(2)
|
||||||
|
|
||||||
job_defs[0]["containerProperties"]["memory"].should.equal(1024)
|
job_defs[0]["containerProperties"]["memory"].should.equal(1024)
|
||||||
job_defs[0]["tags"].should.equal(tags[0])
|
job_defs[0]["tags"].should.equal(tags[0])
|
||||||
|
job_defs[0].shouldnt.have.key("timeout")
|
||||||
|
|
||||||
job_defs[1]["containerProperties"]["memory"].should.equal(2048)
|
job_defs[1]["containerProperties"]["memory"].should.equal(2048)
|
||||||
job_defs[1]["tags"].should.equal(tags[1])
|
job_defs[1]["tags"].should.equal(tags[1])
|
||||||
|
|
||||||
|
|
||||||
|
@mock_batch
def test_register_job_definition_with_timeout():
    """A timeout given at registration is echoed back by describe_job_definitions."""
    *_, batch_client = _get_clients()

    # Short random name so this test does not collide with other job definitions.
    definition_name = str(uuid4())[0:6]

    batch_client.register_job_definition(
        jobDefinitionName=definition_name,
        type="container",
        parameters={},
        containerProperties={"image": "amazonlinux", "memory": 1024, "vcpus": 2},
        timeout={"attemptDurationSeconds": 3},
    )

    described = batch_client.describe_job_definitions(
        jobDefinitionName=definition_name
    )
    first_definition = described["jobDefinitions"][0]
    first_definition.should.have.key("timeout").equals({"attemptDurationSeconds": 3})
|
||||||
|
|
||||||
|
|
||||||
|
@mock_batch
@mock_ec2
@mock_iam
def test_submit_job_with_timeout():
    """A job submitted with a timeout shorter than its runtime ends up FAILED."""
    ec2_client, iam_client, _, _, batch_client = _get_clients()
    _, _, _, iam_arn = _setup(ec2_client, iam_client)

    definition_name = str(uuid4())[0:6]
    # The container sleeps for 3 seconds ...
    sleep_command = ["sleep", "3"]
    job_def_arn, queue_arn = prepare_job(
        batch_client, sleep_command, iam_arn, definition_name
    )

    # ... but each attempt is only allowed 1 second.
    submitted = batch_client.submit_job(
        jobName=str(uuid4())[0:6],
        jobQueue=queue_arn,
        jobDefinition=job_def_arn,
        timeout={"attemptDurationSeconds": 1},
    )

    # The job outlives attemptDurationSeconds, so it is expected to fail.
    _wait_for_job_status(batch_client, submitted["jobId"], "FAILED")
|
||||||
|
|
||||||
|
|
||||||
|
@mock_batch
@mock_ec2
@mock_iam
def test_submit_job_with_timeout_set_at_definition():
    """A timeout set only on the job definition still fails a long-running job."""
    ec2_client, iam_client, _, _, batch_client = _get_clients()
    _, _, _, iam_arn = _setup(ec2_client, iam_client)

    definition_name = str(uuid4())[0:6]
    sleep_command = ["sleep", "3"]
    # Only the queue from the prepared job is needed; the definition is
    # registered again below, this time carrying a timeout.
    _, queue_arn = prepare_job(batch_client, sleep_command, iam_arn, definition_name)

    container_properties = {
        "image": "busybox:latest",
        "vcpus": 1,
        "memory": 128,
        "command": sleep_command,
    }
    registered = batch_client.register_job_definition(
        jobDefinitionName=definition_name,
        type="container",
        containerProperties=container_properties,
        timeout={"attemptDurationSeconds": 1},
    )
    job_def_arn = registered["jobDefinitionArn"]

    # No timeout is passed to submit_job here, so any limit applied comes
    # from the job definition.
    submitted = batch_client.submit_job(
        jobName=str(uuid4())[0:6], jobQueue=queue_arn, jobDefinition=job_def_arn
    )

    # The job outlives attemptDurationSeconds, so it is expected to fail.
    _wait_for_job_status(batch_client, submitted["jobId"], "FAILED")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user