import re
from itertools import cycle
import datetime
import time
import uuid
import logging
import docker
import threading
import dateutil.parser
from sys import platform

from moto.core import BaseBackend, BaseModel, CloudFormationModel
from moto.iam import iam_backends
from moto.ec2 import ec2_backends
from moto.ecs import ecs_backends
from moto.logs import logs_backends

from .exceptions import InvalidParameterValueException, ClientException, ValidationError
from .utils import (
    make_arn_for_compute_env,
    make_arn_for_job_queue,
    make_arn_for_task_def,
    lowercase_first_key,
)
from moto.ec2.exceptions import InvalidSubnetIdError
from moto.ec2.models import INSTANCE_TYPES as EC2_INSTANCE_TYPES
from moto.iam.exceptions import IAMNotFoundException
from moto.core import ACCOUNT_ID as DEFAULT_ACCOUNT_ID
from moto.core.utils import unix_time_millis, BackendDict
from moto.utilities.docker_utilities import DockerModel
from moto import settings
from ..utilities.tagging_service import TaggingService

logger = logging.getLogger(__name__)

COMPUTE_ENVIRONMENT_NAME_REGEX = re.compile(
    r"^[A-Za-z0-9][A-Za-z0-9_-]{1,126}[A-Za-z0-9]$"
)
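# A quick sketch of what this pattern accepts (illustrative only):
#
#   COMPUTE_ENVIRONMENT_NAME_REGEX.match("my-env_01")  # matches
#   COMPUTE_ENVIRONMENT_NAME_REGEX.match("-leading")   # None: must start alphanumeric
#   COMPUTE_ENVIRONMENT_NAME_REGEX.match("ab")         # None: minimum three characters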


def datetime2int_milliseconds(date):
    """
    AWS returns timestamps in milliseconds.
    We don't use millisecond timestamps internally, so this method
    should only be used in describe() methods.
    """
    return int(date.timestamp() * 1000)


def datetime2int(date):
    return int(time.mktime(date.timetuple()))
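# Illustrative usage of the two helpers above (a naive datetime, so the exact
# values depend on the local timezone):
#
#   ts = datetime.datetime(2022, 1, 1, 12, 0, 0)
#   datetime2int(ts)               # whole seconds, for internal bookkeeping
#   datetime2int_milliseconds(ts)  # milliseconds, for describe() responses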


class ComputeEnvironment(CloudFormationModel):
    def __init__(
        self,
        compute_environment_name,
        _type,
        state,
        compute_resources,
        service_role,
        region_name,
    ):
        self.name = compute_environment_name
        self.env_type = _type
        self.state = state
        self.compute_resources = compute_resources
        self.service_role = service_role
        self.arn = make_arn_for_compute_env(
            DEFAULT_ACCOUNT_ID, compute_environment_name, region_name
        )

        self.instances = []
        self.ecs_arn = None
        self.ecs_name = None

    def add_instance(self, instance):
        self.instances.append(instance)

    def set_ecs(self, arn, name):
        self.ecs_arn = arn
        self.ecs_name = name

    @property
    def physical_resource_id(self):
        return self.arn

    @staticmethod
    def cloudformation_name_type():
        return "ComputeEnvironmentName"

    @staticmethod
    def cloudformation_type():
        # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-batch-computeenvironment.html
        return "AWS::Batch::ComputeEnvironment"

    @classmethod
    def create_from_cloudformation_json(
        cls, resource_name, cloudformation_json, region_name, **kwargs
    ):
        backend = batch_backends[region_name]
        properties = cloudformation_json["Properties"]

        env = backend.create_compute_environment(
            resource_name,
            properties["Type"],
            properties.get("State", "ENABLED"),
            lowercase_first_key(properties["ComputeResources"]),
            properties["ServiceRole"],
        )
        arn = env[1]

        return backend.get_compute_environment_by_arn(arn)


class JobQueue(CloudFormationModel):
    def __init__(
        self, name, priority, state, environments, env_order_json, region_name
    ):
        """
        :param name: Job queue name
        :type name: str
        :param priority: Job queue priority
        :type priority: int
        :param state: Either ENABLED or DISABLED
        :type state: str
        :param environments: Compute Environments
        :type environments: list of ComputeEnvironment
        :param env_order_json: Compute Environments JSON for use when describing
        :type env_order_json: list of dict
        :param region_name: Region name
        :type region_name: str
        """
        self.name = name
        self.priority = priority
        self.state = state
        self.environments = environments
        self.env_order_json = env_order_json
        self.arn = make_arn_for_job_queue(DEFAULT_ACCOUNT_ID, name, region_name)
        self.status = "VALID"

        self.jobs = []

    def describe(self):
        result = {
            "computeEnvironmentOrder": self.env_order_json,
            "jobQueueArn": self.arn,
            "jobQueueName": self.name,
            "priority": self.priority,
            "state": self.state,
            "status": self.status,
        }

        return result

    @property
    def physical_resource_id(self):
        return self.arn

    @staticmethod
    def cloudformation_name_type():
        return "JobQueueName"

    @staticmethod
    def cloudformation_type():
        # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-batch-jobqueue.html
        return "AWS::Batch::JobQueue"

    @classmethod
    def create_from_cloudformation_json(
        cls, resource_name, cloudformation_json, region_name, **kwargs
    ):
        backend = batch_backends[region_name]
        properties = cloudformation_json["Properties"]

        # CloudFormation uses a different casing than the API for compute
        # resources, e.g. InstanceRole vs instanceRole. Hacky fix to normalise
        # the keys; a case-insensitive dictionary would be a cleaner solution.
        compute_envs = [
            lowercase_first_key(dict_item)
            for dict_item in properties["ComputeEnvironmentOrder"]
        ]

        queue = backend.create_job_queue(
            queue_name=resource_name,
            priority=properties["Priority"],
            state=properties.get("State", "ENABLED"),
            compute_env_order=compute_envs,
        )
        arn = queue[1]

        return backend.get_job_queue_by_arn(arn)
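    # For reference, a trimmed sketch of the CloudFormation "Properties" shape
    # consumed above (names follow the AWS::Batch::JobQueue schema; values are
    # hypothetical):
    #
    #   {
    #       "Properties": {
    #           "JobQueueName": "my-queue",
    #           "Priority": 1,
    #           "State": "ENABLED",
    #           "ComputeEnvironmentOrder": [
    #               {"Order": 1, "ComputeEnvironment": "arn:aws:batch:us-east-1:123456789012:compute-environment/my-env"}
    #           ],
    #       }
    #   }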


class JobDefinition(CloudFormationModel):
    def __init__(
        self,
        name,
        parameters,
        _type,
        container_properties,
        region_name,
        tags=None,
        revision=0,
        retry_strategy=0,
        timeout=None,
    ):
        self.name = name
        self.retries = retry_strategy
        self.type = _type
        self.revision = revision
        self._region = region_name
        self.container_properties = container_properties
        self.arn = None
        self.status = "ACTIVE"
        self.tagger = TaggingService()
        self.parameters = parameters or {}
        self.timeout = timeout

        self._validate()
        self._update_arn()

        tags = self._format_tags(tags or {})
        # Validate the tags before proceeding.
        errmsg = self.tagger.validate_tags(tags)
        if errmsg:
            raise ValidationError(errmsg)

        self.tagger.tag_resource(self.arn, tags)

    def _format_tags(self, tags):
        return [{"Key": k, "Value": v} for k, v in tags.items()]
    def _update_arn(self):
        self.revision += 1
        self.arn = make_arn_for_task_def(
            DEFAULT_ACCOUNT_ID, self.name, self.revision, self._region
        )

    def _get_resource_requirement(self, req_type, default=None):
        """
        Get resource requirement from container properties.

        Resource requirements like "memory" and "vcpus" are now specified in
        "resourceRequirements". This function retrieves a resource requirement
        from either container_properties.resourceRequirements (preferred) or
        directly from container_properties (deprecated).

        :param req_type: The type of resource requirement to retrieve.
        :type req_type: ["gpu", "memory", "vcpus"]

        :param default: The default value to return if the resource requirement is not found.
        :type default: any, default=None

        :return: The value of the resource requirement, or the default.
        :rtype: any
        """
        resource_reqs = self.container_properties.get("resourceRequirements", [])

        # Filter the resource requirements by the specified type.
        # Note that VCPUS are specified in resourceRequirements without the
        # trailing "s", so we strip that off in the comparison below.
        required_resource = list(
            filter(
                lambda req: req["type"].lower() == req_type.lower().rstrip("s"),
                resource_reqs,
            )
        )

        if required_resource:
            if req_type == "vcpus":
                return float(required_resource[0]["value"])
            elif req_type == "memory":
                return int(required_resource[0]["value"])
            else:
                return required_resource[0]["value"]
        else:
            return self.container_properties.get(req_type, default)
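    # A sketch of the two containerProperties layouts that
    # _get_resource_requirement understands (values are hypothetical):
    #
    #   # Preferred: "resourceRequirements", with singular type names
    #   {"resourceRequirements": [{"type": "VCPU", "value": "2"},
    #                             {"type": "MEMORY", "value": "2048"}]}
    #
    #   # Deprecated: top-level keys on containerProperties
    #   {"vcpus": 2, "memory": 2048}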
    def _validate(self):
        # For future use when containers aren't the only thing in Batch
        if self.type not in ("container",):
            raise ClientException('type must be one of "container"')

        if not isinstance(self.parameters, dict):
            raise ClientException("parameters must be a string to string map")

        if "image" not in self.container_properties:
            raise ClientException("containerProperties must contain image")

        memory = self._get_resource_requirement("memory")
        if memory is None:
            raise ClientException("containerProperties must contain memory")
        if memory < 4:
            raise ClientException("container memory limit must be greater than 4")

        vcpus = self._get_resource_requirement("vcpus")
        if vcpus is None:
            raise ClientException("containerProperties must contain vcpus")
        if vcpus <= 0:
            raise ClientException("container vcpus limit must be greater than 0")
    def update(
        self, parameters, _type, container_properties, retry_strategy, tags, timeout
    ):
        if parameters is None:
            parameters = self.parameters

        if _type is None:
            _type = self.type

        if container_properties is None:
            container_properties = self.container_properties

        if retry_strategy is None:
            retry_strategy = self.retries

        return JobDefinition(
            self.name,
            parameters,
            _type,
            container_properties,
            region_name=self._region,
            revision=self.revision,
            retry_strategy=retry_strategy,
            tags=tags,
            timeout=timeout,
        )

    def describe(self):
        result = {
            "jobDefinitionArn": self.arn,
            "jobDefinitionName": self.name,
            "parameters": self.parameters,
            "revision": self.revision,
            "status": self.status,
            "type": self.type,
            "tags": self.tagger.get_tag_dict_for_resource(self.arn),
        }
        if self.container_properties is not None:
            result["containerProperties"] = self.container_properties
        if self.retries is not None and self.retries > 0:
            result["retryStrategy"] = {"attempts": self.retries}
        if self.timeout:
            result["timeout"] = self.timeout

        return result

    @property
    def physical_resource_id(self):
        return self.arn
    @staticmethod
    def cloudformation_name_type():
        return "JobDefinitionName"

    @staticmethod
    def cloudformation_type():
        # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-batch-jobdefinition.html
        return "AWS::Batch::JobDefinition"

    @classmethod
    def create_from_cloudformation_json(
        cls, resource_name, cloudformation_json, region_name, **kwargs
    ):
        backend = batch_backends[region_name]
        properties = cloudformation_json["Properties"]
        res = backend.register_job_definition(
            def_name=resource_name,
            parameters=lowercase_first_key(properties.get("Parameters", {})),
            _type="container",
            tags=lowercase_first_key(properties.get("Tags", {})),
            retry_strategy=lowercase_first_key(properties["RetryStrategy"]),
            container_properties=lowercase_first_key(properties["ContainerProperties"]),
            # CloudFormation properties are PascalCase, so look up "Timeout"
            timeout=lowercase_first_key(properties.get("Timeout", {})),
        )
        arn = res[1]

        return backend.get_job_definition_by_arn(arn)


class Job(threading.Thread, BaseModel, DockerModel):
    def __init__(
        self,
        name,
        job_def,
        job_queue,
        log_backend,
        container_overrides,
        depends_on,
        all_jobs,
        timeout,
    ):
        """
        Docker Job

        :param name: Job Name
        :param job_def: Job definition
        :type job_def: JobDefinition
        :param job_queue: Job Queue
        :param log_backend: Log backend
        :type log_backend: moto.logs.models.LogsBackend
        """
        threading.Thread.__init__(self)
        DockerModel.__init__(self)

        self.job_name = name
        self.job_id = str(uuid.uuid4())
        self.job_definition = job_def
        self.container_overrides = container_overrides or {}
        self.job_queue = job_queue
        self.job_state = "SUBMITTED"  # One of SUBMITTED | PENDING | RUNNABLE | STARTING | RUNNING | SUCCEEDED | FAILED
        self.job_queue.jobs.append(self)
        self.job_created_at = datetime.datetime.now()
        self.job_started_at = datetime.datetime(1970, 1, 1)
        self.job_stopped_at = datetime.datetime(1970, 1, 1)
        self.job_stopped = False
        self.job_stopped_reason = None
        self.depends_on = depends_on
        self.timeout = timeout
        self.all_jobs = all_jobs

        self.stop = False
        self.exit_code = None

        self.daemon = True
        self.name = "MOTO-BATCH-" + self.job_id

        self._log_backend = log_backend
        self.log_stream_name = None
    def describe_short(self):
        result = {
            "jobId": self.job_id,
            "jobName": self.job_name,
            "createdAt": datetime2int_milliseconds(self.job_created_at),
            "status": self.job_state,
            "jobDefinition": self.job_definition.arn,
        }
        if self.job_stopped_reason is not None:
            result["statusReason"] = self.job_stopped_reason
        if result["status"] not in ["SUBMITTED", "PENDING", "RUNNABLE", "STARTING"]:
            result["startedAt"] = datetime2int_milliseconds(self.job_started_at)
        if self.job_stopped:
            result["stoppedAt"] = datetime2int_milliseconds(self.job_stopped_at)
            if self.exit_code is not None:
                result["container"] = {"exitCode": self.exit_code}
        return result

    def describe(self):
        result = self.describe_short()
        result["jobQueue"] = self.job_queue.arn
        result["dependsOn"] = self.depends_on if self.depends_on else []
        if self.job_stopped:
            result["stoppedAt"] = datetime2int_milliseconds(self.job_stopped_at)
        result["container"] = {}
        result["container"]["command"] = self._get_container_property("command", [])
        result["container"]["privileged"] = self._get_container_property(
            "privileged", False
        )
        result["container"][
            "readonlyRootFilesystem"
        ] = self._get_container_property("readonlyRootFilesystem", False)
        result["container"]["ulimits"] = self._get_container_property("ulimits", {})
        result["container"]["vcpus"] = self._get_container_property("vcpus", 1)
        result["container"]["memory"] = self._get_container_property("memory", 512)
        result["container"]["volumes"] = self._get_container_property("volumes", [])
        result["container"]["environment"] = self._get_container_property(
            "environment", []
        )
        result["container"]["logStreamName"] = self.log_stream_name
        if self.timeout:
            result["timeout"] = self.timeout
        return result
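    # A trimmed sketch of what describe_short() can return once a job has
    # stopped (IDs and timestamps are hypothetical):
    #
    #   {"jobId": "...", "jobName": "sleep10", "createdAt": 1640995200000,
    #    "status": "SUCCEEDED",
    #    "jobDefinition": "arn:aws:batch:...:job-definition/sleep10:1",
    #    "startedAt": 1640995201000, "stoppedAt": 1640995211000,
    #    "container": {"exitCode": 0}}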
    def _get_container_property(self, p, default):
        if p == "environment":
            job_env = self.container_overrides.get(p, default)
            jd_env = self.job_definition.container_properties.get(p, default)

            job_env_dict = {_env["name"]: _env["value"] for _env in job_env}
            jd_env_dict = {_env["name"]: _env["value"] for _env in jd_env}

            for key in jd_env_dict.keys():
                if key not in job_env_dict.keys():
                    job_env.append({"name": key, "value": jd_env_dict[key]})

            job_env.append({"name": "AWS_BATCH_JOB_ID", "value": self.job_id})

            return job_env

        if p in ["vcpus", "memory"]:
            return self.container_overrides.get(
                p, self.job_definition._get_resource_requirement(p, default)
            )

        return self.container_overrides.get(
            p, self.job_definition.container_properties.get(p, default)
        )

    def _get_attempt_duration(self):
        if self.timeout:
            return self.timeout["attemptDurationSeconds"]
        if self.job_definition.timeout:
            return self.job_definition.timeout["attemptDurationSeconds"]
        return None
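    # A sketch of the environment merge above (hypothetical values):
    #
    #   job definition: [{"name": "A", "value": "1"}, {"name": "B", "value": "2"}]
    #   overrides:      [{"name": "B", "value": "9"}]
    #   result:         B=9 (override wins), A=1 (inherited),
    #                   plus AWS_BATCH_JOB_ID appended last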
    def run(self):
        """
        Run the container.

        Logic is as follows:
        Generate container info (eventually from task definition)
        Start container
        Loop whilst not asked to stop and the container is running.
        Get all logs from container between the last time I checked and now.
        Convert logs into cloudwatch format
        Put logs into cloudwatch

        :return:
        """
        try:
            self.job_state = "PENDING"

            if self.depends_on and not self._wait_for_dependencies():
                return

            image = self.job_definition.container_properties.get(
                "image", "alpine:latest"
            )
            privileged = self.job_definition.container_properties.get(
                "privileged", False
            )
            cmd = self._get_container_property(
                "command",
                '/bin/sh -c "for a in `seq 1 10`; do echo Hello World; sleep 1; done"',
            )
            environment = {
                e["name"]: e["value"]
                for e in self._get_container_property("environment", [])
            }
            volumes = {
                v["name"]: v["host"]
                for v in self._get_container_property("volumes", [])
            }
            mounts = [
                docker.types.Mount(
                    m["containerPath"],
                    volumes[m["sourceVolume"]]["sourcePath"],
                    type="bind",
                    read_only=m["readOnly"],
                )
                for m in self._get_container_property("mountPoints", [])
            ]
            name = "{0}-{1}".format(self.job_name, self.job_id)

            self.job_state = "RUNNABLE"
            # TODO setup ecs container instance

            self.job_started_at = datetime.datetime.now()

            # add host.docker.internal host on linux to emulate Mac + Windows behavior
            # for communication with other mock AWS services running on localhost
            extra_hosts = (
                {"host.docker.internal": "host-gateway"}
                if platform == "linux" or platform == "linux2"
                else {}
            )

            environment["MOTO_HOST"] = settings.moto_server_host()
            environment["MOTO_PORT"] = settings.moto_server_port()
            environment[
                "MOTO_HTTP_ENDPOINT"
            ] = f'{environment["MOTO_HOST"]}:{environment["MOTO_PORT"]}'

            run_kwargs = dict()
            network_name = settings.moto_network_name()
            network_mode = settings.moto_network_mode()
            if network_name:
                run_kwargs["network"] = network_name
            elif network_mode:
                run_kwargs["network_mode"] = network_mode
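            # The network settings above come from moto's settings module, so a
            # moto server and the job containers can share a Docker network.
            # Sketch, assuming a pre-created network named "moto-net" has been
            # configured via settings:
            #   run_kwargs["network"] = "moto-net"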
            log_config = docker.types.LogConfig(type=docker.types.LogConfig.types.JSON)
            self.job_state = "STARTING"
            container = self.docker_client.containers.run(
                image,
                cmd,
                detach=True,
                name=name,
                log_config=log_config,
                environment=environment,
                mounts=mounts,
                privileged=privileged,
                extra_hosts=extra_hosts,
                **run_kwargs,
            )
            self.job_state = "RUNNING"
            try:
                container.reload()

                max_time = None
                if self._get_attempt_duration():
                    attempt_duration = self._get_attempt_duration()
                    max_time = self.job_started_at + datetime.timedelta(
                        seconds=attempt_duration
                    )

                while container.status == "running" and not self.stop:
                    container.reload()
                    time.sleep(0.5)

                    if max_time and datetime.datetime.now() > max_time:
                        raise Exception(
                            "Job time exceeded the configured attemptDurationSeconds"
                        )

                # Container should be stopped by this point... unless asked to stop
                if container.status == "running":
                    container.kill()
                # Log collection
                logs_stdout = []
                logs_stderr = []
                logs_stderr.extend(
                    container.logs(
                        stdout=False,
                        stderr=True,
                        timestamps=True,
                        since=datetime2int(self.job_started_at),
                    )
                    .decode()
                    .split("\n")
                )
                logs_stdout.extend(
                    container.logs(
                        stdout=True,
                        stderr=False,
                        timestamps=True,
                        since=datetime2int(self.job_started_at),
                    )
                    .decode()
                    .split("\n")
                )

                # Process logs
                logs_stdout = [x for x in logs_stdout if len(x) > 0]
                logs_stderr = [x for x in logs_stderr if len(x) > 0]
                logs = []
                for line in logs_stdout + logs_stderr:
                    date, line = line.split(" ", 1)
                    date_obj = (
                        dateutil.parser.parse(date)
                        .astimezone(datetime.timezone.utc)
                        .replace(tzinfo=None)
                    )
                    date = unix_time_millis(date_obj)
                    logs.append({"timestamp": date, "message": line.strip()})
                logs = sorted(logs, key=lambda l: l["timestamp"])

                # Send to cloudwatch
                log_group = "/aws/batch/job"
                stream_name = "{0}/default/{1}".format(
                    self.job_definition.name, self.job_id
                )
                self.log_stream_name = stream_name
                self._log_backend.ensure_log_group(log_group, None)
                self._log_backend.create_log_stream(log_group, stream_name)
                self._log_backend.put_log_events(log_group, stream_name, logs, None)
                result = container.wait() or {}
                self.exit_code = result.get("StatusCode", 0)
                job_failed = self.stop or self.exit_code > 0
                self._mark_stopped(success=not job_failed)

            except Exception as err:
                logger.error(
                    "Failed to run AWS Batch container {0}. Error {1}".format(
                        self.name, err
                    )
                )
                self._mark_stopped(success=False)
                container.kill()
            finally:
                container.remove()
        except Exception as err:
            logger.error(
                "Failed to run AWS Batch container {0}. Error {1}".format(
                    self.name, err
                )
            )
            self._mark_stopped(success=False)
    def _mark_stopped(self, success=True):
        # Ensure that the job_stopped/job_stopped_at attributes are set first;
        # the describe method needs them immediately when job_state is set
        self.job_stopped = True
        self.job_stopped_at = datetime.datetime.now()
        self.job_state = "SUCCEEDED" if success else "FAILED"

    def terminate(self, reason):
        if not self.stop:
            self.stop = True
            self.job_stopped_reason = reason

    def _wait_for_dependencies(self):
        dependent_ids = [dependency["jobId"] for dependency in self.depends_on]
        successful_dependencies = set()
        while len(successful_dependencies) != len(dependent_ids):
            for dependent_id in dependent_ids:
                if dependent_id in self.all_jobs:
                    dependent_job = self.all_jobs[dependent_id]
                    if dependent_job.job_state == "SUCCEEDED":
                        successful_dependencies.add(dependent_id)
                    if dependent_job.job_state == "FAILED":
                        logger.error(
                            "Terminating job {0} due to failed dependency {1}".format(
                                self.name, dependent_job.name
                            )
                        )
                        self._mark_stopped(success=False)
                        return False

            time.sleep(1)
            if self.stop:
                # This job has been cancelled while it was waiting for a dependency
                self._mark_stopped(success=False)
                return False

        return True
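    # For reference, depends_on carries the SubmitJob "dependsOn" shape, e.g.
    # (hypothetical ID):
    #
    #   [{"jobId": "11111111-2222-3333-4444-555555555555"}]
    #
    # _wait_for_dependencies polls all_jobs until every listed job has
    # SUCCEEDED, and bails out early on any FAILED dependency.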


class BatchBackend(BaseBackend):
    def __init__(self, region_name=None):
        super().__init__()
        self.region_name = region_name

        self._compute_environments = {}
        self._job_queues = {}
        self._job_definitions = {}
        self._jobs = {}

    @property
    def iam_backend(self):
        """
        :return: IAM Backend
        :rtype: moto.iam.models.IAMBackend
        """
        return iam_backends["global"]

    @property
    def ec2_backend(self):
        """
        :return: EC2 Backend
        :rtype: moto.ec2.models.EC2Backend
        """
        return ec2_backends[self.region_name]

    @property
    def ecs_backend(self):
        """
        :return: ECS Backend
        :rtype: moto.ecs.models.EC2ContainerServiceBackend
        """
        return ecs_backends[self.region_name]

    @property
    def logs_backend(self):
        """
        :return: Logs Backend
        :rtype: moto.logs.models.LogsBackend
        """
        return logs_backends[self.region_name]

    def reset(self):
        region_name = self.region_name

        for job in self._jobs.values():
            if job.job_state not in ("FAILED", "SUCCEEDED"):
                job.stop = True
                # Try to join
                job.join(0.2)

        self.__dict__ = {}
        self.__init__(region_name)
    def get_compute_environment_by_arn(self, arn):
        return self._compute_environments.get(arn)

    def get_compute_environment_by_name(self, name):
        for comp_env in self._compute_environments.values():
            if comp_env.name == name:
                return comp_env
        return None

    def get_compute_environment(self, identifier):
        """
        Get compute environment by name or ARN
        :param identifier: Name or ARN
        :type identifier: str

        :return: Compute Environment or None
        :rtype: ComputeEnvironment or None
        """
        env = self.get_compute_environment_by_arn(identifier)
        if env is None:
            env = self.get_compute_environment_by_name(identifier)
        return env

    def get_job_queue_by_arn(self, arn):
        return self._job_queues.get(arn)

    def get_job_queue_by_name(self, name):
        for job_queue in self._job_queues.values():
            if job_queue.name == name:
                return job_queue
        return None

    def get_job_queue(self, identifier):
        """
        Get job queue by name or ARN
        :param identifier: Name or ARN
        :type identifier: str

        :return: Job Queue or None
        :rtype: JobQueue or None
        """
        queue = self.get_job_queue_by_arn(identifier)
        if queue is None:
            queue = self.get_job_queue_by_name(identifier)
        return queue
    def get_job_definition_by_arn(self, arn):
        return self._job_definitions.get(arn)

    def get_job_definition_by_name(self, name):
        latest_revision = -1
        latest_job = None
        for job_def in self._job_definitions.values():
            if job_def.name == name and job_def.revision > latest_revision:
                latest_job = job_def
                latest_revision = job_def.revision
        return latest_job

    def get_job_definition_by_name_revision(self, name, revision):
        for job_def in self._job_definitions.values():
            if job_def.name == name and job_def.revision == int(revision):
                return job_def
        return None

    def get_job_definition(self, identifier):
        """
        Get a job definition by name or ARN
        :param identifier: Name or ARN
        :type identifier: str

        :return: Job definition or None
        :rtype: JobDefinition or None
        """
        job_def = self.get_job_definition_by_arn(identifier)
        if job_def is None:
            if ":" in identifier:
                job_def = self.get_job_definition_by_name_revision(
                    *identifier.split(":", 1)
                )
            else:
                job_def = self.get_job_definition_by_name(identifier)
        return job_def
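    # The identifier forms accepted above, illustrated (hypothetical values):
    #
    #   "arn:aws:batch:us-east-1:123456789012:job-definition/sleep10:1"  # ARN lookup
    #   "sleep10:1"   # name:revision lookup
    #   "sleep10"     # latest revision by name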
    def get_job_definitions(self, identifier):
        """
        Get job definitions by name or ARN
        :param identifier: Name or ARN
        :type identifier: str

        :return: Job definitions matching the identifier
        :rtype: list of JobDefinition
        """
        result = []
        env = self.get_job_definition_by_arn(identifier)
        if env is not None:
            result.append(env)
        else:
            for value in self._job_definitions.values():
                if value.name == identifier:
                    result.append(value)

        return result

    def get_job_by_id(self, identifier):
        """
        Get job by id
        :param identifier: Job ID
        :type identifier: str

        :return: Job
        :rtype: Job
        """
        try:
            return self._jobs[identifier]
        except KeyError:
            return None
    def describe_compute_environments(
        self, environments=None, max_results=None, next_token=None
    ):
        envs = set()
        if environments is not None:
            envs = set(environments)

        result = []
        for arn, environment in self._compute_environments.items():
            # Filter shortcut
            if len(envs) > 0 and arn not in envs and environment.name not in envs:
                continue

            json_part = {
                "computeEnvironmentArn": arn,
                "computeEnvironmentName": environment.name,
                "ecsClusterArn": environment.ecs_arn,
                "serviceRole": environment.service_role,
                "state": environment.state,
                "type": environment.env_type,
                "status": "VALID",
                "statusReason": "Compute environment is available",
            }
            if environment.env_type == "MANAGED":
                json_part["computeResources"] = environment.compute_resources

            result.append(json_part)

        return result
2019-10-31 15:44:26 +00:00
|
|
|
def create_compute_environment(
|
|
|
|
self, compute_environment_name, _type, state, compute_resources, service_role
|
|
|
|
):
|
2017-09-26 21:22:59 +00:00
|
|
|
# Validate
|
|
|
|
if COMPUTE_ENVIRONMENT_NAME_REGEX.match(compute_environment_name) is None:
|
2019-10-31 15:44:26 +00:00
|
|
|
raise InvalidParameterValueException(
|
|
|
|
"Compute environment name does not match ^[A-Za-z0-9][A-Za-z0-9_-]{1,126}[A-Za-z0-9]$"
|
|
|
|
)
|
2017-09-26 21:22:59 +00:00
|
|
|
|
|
|
|
if self.get_compute_environment_by_name(compute_environment_name) is not None:
|
2019-10-31 15:44:26 +00:00
|
|
|
raise InvalidParameterValueException(
|
|
|
|
"A compute environment already exists with the name {0}".format(
|
|
|
|
compute_environment_name
|
|
|
|
)
|
|
|
|
)
|
2017-09-26 21:22:59 +00:00
|
|
|
|
|
|
|
# Look for IAM role
|
|
|
|
try:
|
|
|
|
self.iam_backend.get_role_by_arn(service_role)
|
|
|
|
except IAMNotFoundException:
|
2019-10-31 15:44:26 +00:00
|
|
|
raise InvalidParameterValueException(
|
|
|
|
"Could not find IAM role {0}".format(service_role)
|
|
|
|
)
|
2017-09-26 21:22:59 +00:00
|
|
|
|
2019-10-31 15:44:26 +00:00
|
|
|
if _type not in ("MANAGED", "UNMANAGED"):
|
|
|
|
raise InvalidParameterValueException(
|
|
|
|
"type {0} must be one of MANAGED | UNMANAGED".format(service_role)
|
|
|
|
)
|
2017-09-26 21:22:59 +00:00
|
|
|
|
2019-10-31 15:44:26 +00:00
|
|
|
if state is not None and state not in ("ENABLED", "DISABLED"):
|
|
|
|
raise InvalidParameterValueException(
|
|
|
|
"state {0} must be one of ENABLED | DISABLED".format(state)
|
|
|
|
)
|
2017-09-26 21:22:59 +00:00
|
|
|
|
2019-10-31 15:44:26 +00:00
|
|
|

        if compute_resources is None and _type == "MANAGED":
            raise InvalidParameterValueException(
                "computeResources must be specified when creating a {0} environment".format(
                    _type
                )
            )
        elif compute_resources is not None:
            self._validate_compute_resources(compute_resources)

        # By here, all values except SPOT ones have been validated
        new_comp_env = ComputeEnvironment(
            compute_environment_name,
            _type,
            state,
            compute_resources,
            service_role,
            region_name=self.region_name,
        )
        self._compute_environments[new_comp_env.arn] = new_comp_env

        # OK, by this point everything is legit, so if it's MANAGED then start some instances
        if _type == "MANAGED" and "FARGATE" not in compute_resources["type"]:
            cpus = int(
                compute_resources.get("desiredvCpus", compute_resources["minvCpus"])
            )
            instance_types = compute_resources["instanceTypes"]
            needed_instance_types = self.find_min_instances_to_meet_vcpus(
                instance_types, cpus
            )
            # Create instances

            # Will loop over and over so we get decent subnet coverage
            subnet_cycle = cycle(compute_resources["subnets"])

            for instance_type in needed_instance_types:
                reservation = self.ec2_backend.add_instances(
                    image_id="ami-03cf127a",  # TODO: import AMIs
                    count=1,
                    user_data=None,
                    security_group_names=[],
                    instance_type=instance_type,
                    region_name=self.region_name,
                    subnet_id=next(subnet_cycle),
                    key_name=compute_resources.get("ec2KeyPair", "AWS_OWNED"),
                    security_group_ids=compute_resources["securityGroupIds"],
                )

                new_comp_env.add_instance(reservation.instances[0])

            # Create ECS cluster
            # Should be of format P2OnDemand_Batch_UUID
            cluster_name = "OnDemand_Batch_" + str(uuid.uuid4())
            ecs_cluster = self.ecs_backend.create_cluster(cluster_name)
            new_comp_env.set_ecs(ecs_cluster.arn, cluster_name)

        return compute_environment_name, new_comp_env.arn
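
    # A hedged usage sketch (assumes a BatchBackend instance `backend` and an
    # existing IAM role ARN in the mocked account; values are illustrative):
    #
    #     name, arn = backend.create_compute_environment(
    #         compute_environment_name="my-env",
    #         _type="UNMANAGED",
    #         state="ENABLED",
    #         compute_resources=None,
    #         service_role="arn:aws:iam::123456789012:role/BatchServiceRole",
    #     )
    #
    # For a MANAGED environment, compute_resources is required, and EC2 instances
    # plus an ECS cluster are spun up as shown above.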

    def _validate_compute_resources(self, cr):
        """
        Checks the contents of the computeResources sub-dictionary for managed clusters

        :param cr: computeResources
        :type cr: dict
        """
        if int(cr["maxvCpus"]) < 0:
            raise InvalidParameterValueException("maxVCpus must be positive")
        if "FARGATE" not in cr["type"]:
            # Most parameters are not applicable to jobs that are running on Fargate resources:
            # non-exhaustive list: minvCpus, instanceTypes, imageId, ec2KeyPair, instanceRole, tags
            for profile in self.iam_backend.get_instance_profiles():
                if profile.arn == cr["instanceRole"]:
                    break
            else:
                raise InvalidParameterValueException(
                    "could not find instanceRole {0}".format(cr["instanceRole"])
                )

            if int(cr["minvCpus"]) < 0:
                raise InvalidParameterValueException("minvCpus must be positive")
            if int(cr["maxvCpus"]) < int(cr["minvCpus"]):
                raise InvalidParameterValueException(
                    "maxVCpus must be greater than minvCpus"
                )

            if len(cr["instanceTypes"]) == 0:
                raise InvalidParameterValueException(
                    "At least 1 instance type must be provided"
                )
            for instance_type in cr["instanceTypes"]:
                if instance_type == "optimal":
                    pass  # Optimal should pick from latest of current gen
                elif instance_type not in EC2_INSTANCE_TYPES:
                    raise InvalidParameterValueException(
                        "Instance type {0} does not exist".format(instance_type)
                    )

        for sec_id in cr["securityGroupIds"]:
            if self.ec2_backend.get_security_group_from_id(sec_id) is None:
                raise InvalidParameterValueException(
                    "security group {0} does not exist".format(sec_id)
                )
        if len(cr["securityGroupIds"]) == 0:
            raise InvalidParameterValueException(
                "At least 1 security group must be provided"
            )

        for subnet_id in cr["subnets"]:
            try:
                self.ec2_backend.get_subnet(subnet_id)
            except InvalidSubnetIdError:
                raise InvalidParameterValueException(
                    "subnet {0} does not exist".format(subnet_id)
                )
        if len(cr["subnets"]) == 0:
            raise InvalidParameterValueException("At least 1 subnet must be provided")

        if cr["type"] not in {"EC2", "SPOT", "FARGATE", "FARGATE_SPOT"}:
            raise InvalidParameterValueException(
                "computeResources.type must be either EC2 | SPOT | FARGATE | FARGATE_SPOT"
            )
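
    # For reference, a computeResources dict that passes the checks above might
    # look like this (IDs are illustrative and must exist in the mocked account):
    #
    #     cr = {
    #         "type": "EC2",
    #         "minvCpus": 0,
    #         "maxvCpus": 4,
    #         "instanceTypes": ["t2.medium"],
    #         "instanceRole": "arn:aws:iam::123456789012:instance-profile/ecsInstanceRole",
    #         "securityGroupIds": ["sg-12345678"],
    #         "subnets": ["subnet-12345678"],
    #     }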

    @staticmethod
    def find_min_instances_to_meet_vcpus(instance_types, target):
        """
        Finds the minimum needed instances to meet a vcpu target

        :param instance_types: Instance types, like ['t2.medium', 't2.small']
        :type instance_types: list of str
        :param target: VCPU target
        :type target: float
        :return: List of instance types
        :rtype: list of str
        """
        # vcpus = [ (vcpus, instance_type), (vcpus, instance_type), ... ]
        instance_vcpus = []
        instances = []

        for instance_type in instance_types:
            if instance_type == "optimal":
                instance_type = "m4.4xlarge"

            instance_vcpus.append(
                (
                    EC2_INSTANCE_TYPES[instance_type]["VCpuInfo"]["DefaultVCpus"],
                    instance_type,
                )
            )

        instance_vcpus = sorted(instance_vcpus, key=lambda item: item[0], reverse=True)
        # Loop through:
        # - if the biggest instance type is smaller than the target and len(instance_types) > 1, use the biggest type
        # - if the biggest instance type is bigger than the target and len(instance_types) > 1, remove it and move on
        # - if the biggest instance type is bigger than the target and len(instance_types) == 1, add the instance and finish
        # - if the biggest instance type is smaller than the target and len(instance_types) == 1, keep adding instances until target <= 0
        # ^^ boils down to: keep adding the last type until the target vcpus is non-positive
        # Algorithm ;-) ... could probably be done better with some quality lambdas
        while target > 0:
            current_vcpu, current_instance = instance_vcpus[0]

            if len(instance_vcpus) > 1:
                if current_vcpu <= target:
                    target -= current_vcpu
                    instances.append(current_instance)
                else:
                    # try the next biggest instance
                    instance_vcpus.pop(0)
            else:
                # We're on the last instance type
                target -= current_vcpu
                instances.append(current_instance)

        return instances
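
    # Worked example of the greedy selection above (vCPU counts assume the
    # standard EC2 definitions: t2.small = 1 vCPU, t2.medium = 2 vCPUs):
    #
    #     find_min_instances_to_meet_vcpus(["t2.medium", "t2.small"], 5)
    #     # sorted desc -> [(2, "t2.medium"), (1, "t2.small")]
    #     # 5 -> take t2.medium (3 left) -> take t2.medium (1 left)
    #     # t2.medium (2) > 1, so pop it; only t2.small remains -> take it (0 left)
    #     # returns ["t2.medium", "t2.medium", "t2.small"]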

    def delete_compute_environment(self, compute_environment_name):
        if compute_environment_name is None:
            raise InvalidParameterValueException("Missing computeEnvironment parameter")

        compute_env = self.get_compute_environment(compute_environment_name)

        if compute_env is not None:
            # Pop ComputeEnvironment
            self._compute_environments.pop(compute_env.arn)

            # Delete ECS cluster
            self.ecs_backend.delete_cluster(compute_env.ecs_name)

            if compute_env.env_type == "MANAGED":
                # Terminate the instances belonging to this environment
                instance_ids = [instance.id for instance in compute_env.instances]
                if instance_ids:
                    self.ec2_backend.terminate_instances(instance_ids)

    def update_compute_environment(
        self, compute_environment_name, state, compute_resources, service_role
    ):
        # Validate
        compute_env = self.get_compute_environment(compute_environment_name)
        if compute_env is None:
            raise ClientException(
                "Compute environment {0} does not exist".format(
                    compute_environment_name
                )
            )

        # Look for IAM role
        if service_role is not None:
            try:
                self.iam_backend.get_role_by_arn(service_role)
            except IAMNotFoundException:
                raise InvalidParameterValueException(
                    "Could not find IAM role {0}".format(service_role)
                )

            compute_env.service_role = service_role

        if state is not None:
            if state not in ("ENABLED", "DISABLED"):
                raise InvalidParameterValueException(
                    "state {0} must be one of ENABLED | DISABLED".format(state)
                )

            compute_env.state = state

        if compute_resources is not None:
            # TODO: implement resizing of instances based on changing vCpus
            # compute_resources CAN contain desiredvCpus, maxvCpus, minvCpus, and can contain none of them.
            pass

        return compute_env.name, compute_env.arn

    def create_job_queue(self, queue_name, priority, state, compute_env_order):
        """
        Create a job queue

        :param queue_name: Queue name
        :type queue_name: str
        :param priority: Queue priority
        :type priority: int
        :param state: Queue state
        :type state: string
        :param compute_env_order: Compute environment list
        :type compute_env_order: list of dict
        :return: Tuple of Name, ARN
        :rtype: tuple of str
        """
        for variable, var_name in (
            (queue_name, "jobQueueName"),
            (priority, "priority"),
            (state, "state"),
            (compute_env_order, "computeEnvironmentOrder"),
        ):
            if variable is None:
                raise ClientException("{0} must be provided".format(var_name))

        if state not in ("ENABLED", "DISABLED"):
            raise ClientException(
                "state {0} must be one of ENABLED | DISABLED".format(state)
            )
        if self.get_job_queue_by_name(queue_name) is not None:
            raise ClientException("Job queue {0} already exists".format(queue_name))

        if len(compute_env_order) == 0:
            raise ClientException("At least 1 compute environment must be provided")
        try:
            # orders and extracts computeEnvironment names
            ordered_compute_environments = [
                item["computeEnvironment"]
                for item in sorted(compute_env_order, key=lambda x: x["order"])
            ]
            env_objects = []
            # Check each ARN exists, then make a list of compute envs
            for arn in ordered_compute_environments:
                env = self.get_compute_environment_by_arn(arn)
                if env is None:
                    raise ClientException(
                        "Compute environment {0} does not exist".format(arn)
                    )
                env_objects.append(env)
        except (KeyError, TypeError):
            raise ClientException("computeEnvironmentOrder is malformed")

        # Create new Job Queue
        queue = JobQueue(
            queue_name,
            priority,
            state,
            env_objects,
            compute_env_order,
            self.region_name,
        )
        self._job_queues[queue.arn] = queue

        return queue_name, queue.arn
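
    # A minimal usage sketch (assumes `env_arn` is the ARN returned by
    # create_compute_environment; the dict shape mirrors the AWS API):
    #
    #     name, arn = backend.create_job_queue(
    #         queue_name="my-queue",
    #         priority=1,
    #         state="ENABLED",
    #         compute_env_order=[{"order": 1, "computeEnvironment": env_arn}],
    #     )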

    def describe_job_queues(self, job_queues=None, max_results=None, next_token=None):
        envs = set()
        if job_queues is not None:
            envs = set(job_queues)

        result = []
        for arn, job_queue in self._job_queues.items():
            # Filter shortcut
            if len(envs) > 0 and arn not in envs and job_queue.name not in envs:
                continue

            result.append(job_queue.describe())

        return result

    def update_job_queue(self, queue_name, priority, state, compute_env_order):
        """
        Update a job queue

        :param queue_name: Queue name
        :type queue_name: str
        :param priority: Queue priority
        :type priority: int
        :param state: Queue state
        :type state: string
        :param compute_env_order: Compute environment list
        :type compute_env_order: list of dict
        :return: Tuple of Name, ARN
        :rtype: tuple of str
        """
        if queue_name is None:
            raise ClientException("jobQueueName must be provided")

        job_queue = self.get_job_queue(queue_name)
        if job_queue is None:
            raise ClientException("Job queue {0} does not exist".format(queue_name))

        if state is not None:
            if state not in ("ENABLED", "DISABLED"):
                raise ClientException(
                    "state {0} must be one of ENABLED | DISABLED".format(state)
                )

            job_queue.state = state

        if compute_env_order is not None:
            if len(compute_env_order) == 0:
                raise ClientException("At least 1 compute environment must be provided")
            try:
                # orders and extracts computeEnvironment names
                ordered_compute_environments = [
                    item["computeEnvironment"]
                    for item in sorted(compute_env_order, key=lambda x: x["order"])
                ]
                env_objects = []
                # Check each ARN exists, then make a list of compute envs
                for arn in ordered_compute_environments:
                    env = self.get_compute_environment_by_arn(arn)
                    if env is None:
                        raise ClientException(
                            "Compute environment {0} does not exist".format(arn)
                        )
                    env_objects.append(env)
            except (KeyError, TypeError):
                raise ClientException("computeEnvironmentOrder is malformed")

            job_queue.env_order_json = compute_env_order
            job_queue.environments = env_objects

        if priority is not None:
            job_queue.priority = priority

        return queue_name, job_queue.arn

    def delete_job_queue(self, queue_name):
        job_queue = self.get_job_queue(queue_name)

        if job_queue is not None:
            del self._job_queues[job_queue.arn]

    def register_job_definition(
        self,
        def_name,
        parameters,
        _type,
        tags,
        retry_strategy,
        container_properties,
        timeout,
    ):
        if def_name is None:
            raise ClientException("jobDefinitionName must be provided")

        job_def = self.get_job_definition_by_name(def_name)
        if retry_strategy is not None:
            try:
                retry_strategy = retry_strategy["attempts"]
            except (KeyError, TypeError):
                raise ClientException("retryStrategy is malformed")
        if not tags:
            tags = {}
        if job_def is None:
            job_def = JobDefinition(
                def_name,
                parameters,
                _type,
                container_properties,
                tags=tags,
                region_name=self.region_name,
                retry_strategy=retry_strategy,
                timeout=timeout,
            )
        else:
            # Make a new revision of the existing job definition
            job_def = job_def.update(
                parameters, _type, container_properties, retry_strategy, tags, timeout
            )

        self._job_definitions[job_def.arn] = job_def

        return def_name, job_def.arn, job_def.revision
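
    # A hedged usage sketch (containerProperties keys follow the AWS Batch API;
    # the image name and values are illustrative):
    #
    #     name, arn, revision = backend.register_job_definition(
    #         def_name="my-job-def",
    #         parameters=None,
    #         _type="container",
    #         tags={},
    #         retry_strategy={"attempts": 2},
    #         container_properties={
    #             "image": "busybox:latest",
    #             "vcpus": 1,
    #             "memory": 128,
    #             "command": ["echo", "hello"],
    #         },
    #         timeout=None,
    #     )
    #
    # Registering the same name again produces a new revision of the definition.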

    def deregister_job_definition(self, def_name):
        job_def = self.get_job_definition_by_arn(def_name)
        if job_def is None and ":" in def_name:
            name, revision = def_name.split(":", 1)
            job_def = self.get_job_definition_by_name_revision(name, revision)

        if job_def is not None:
            del self._job_definitions[job_def.arn]

    def describe_job_definitions(
        self,
        job_def_name=None,
        job_def_list=None,
        status=None,
        max_results=None,
        next_token=None,
    ):
        jobs = []

        # As a job name can reference multiple revisions, we get a list of them
        if job_def_name is not None:
            job_def = self.get_job_definitions(job_def_name)
            if job_def is not None:
                jobs.extend(job_def)
        elif job_def_list is not None:
            for job in job_def_list:
                job_def = self.get_job_definitions(job)
                if job_def is not None:
                    jobs.extend(job_def)
        else:
            jobs.extend(self._job_definitions.values())

        # Got all the job defs we're after; filter them by status
        if status is not None:
            return [job for job in jobs if job.status == status]
        for job in jobs:
            job.describe()
        return jobs

    def submit_job(
        self,
        job_name,
        job_def_id,
        job_queue,
        parameters=None,
        retries=None,
        depends_on=None,
        container_overrides=None,
        timeout=None,
    ):
        # TODO: parameters and retries (a raw dict from the request) are ignored for now

        # Look for job definition
        job_def = self.get_job_definition(job_def_id)
        if job_def is None:
            raise ClientException(
                "Job definition {0} does not exist".format(job_def_id)
            )

        queue = self.get_job_queue(job_queue)
        if queue is None:
            raise ClientException("Job queue {0} does not exist".format(job_queue))

        job = Job(
            job_name,
            job_def,
            queue,
            log_backend=self.logs_backend,
            container_overrides=container_overrides,
            depends_on=depends_on,
            all_jobs=self._jobs,
            timeout=timeout,
        )
        self._jobs[job.job_id] = job

        # Here comes the fun
        job.start()

        return job_name, job.job_id
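
    # A minimal usage sketch (the job definition and queue must already exist;
    # identifiers are illustrative):
    #
    #     job_name, job_id = backend.submit_job(
    #         job_name="my-job",
    #         job_def_id="my-job-def",   # name, name:revision, or full ARN
    #         job_queue="my-queue",
    #     )
    #
    # job.start() kicks off the simulated run on a background thread, so job
    # state transitions happen asynchronously after submit_job returns.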

    def describe_jobs(self, jobs):
        job_filter = set()
        if jobs is not None:
            job_filter = set(jobs)

        result = []
        for key, job in self._jobs.items():
            if len(job_filter) > 0 and key not in job_filter:
                continue

            result.append(job.describe())

        return result

    def list_jobs(self, job_queue, job_status=None, max_results=None, next_token=None):
        jobs = []

        queue = self.get_job_queue(job_queue)
        if queue is None:
            raise ClientException("Job queue {0} does not exist".format(job_queue))

        if job_status is not None and job_status not in (
            "SUBMITTED",
            "PENDING",
            "RUNNABLE",
            "STARTING",
            "RUNNING",
            "SUCCEEDED",
            "FAILED",
        ):
            raise ClientException(
                "Job status is not one of SUBMITTED | PENDING | RUNNABLE | STARTING | RUNNING | SUCCEEDED | FAILED"
            )

        for job in queue.jobs:
            if job_status is not None and job.job_state != job_status:
                continue

            jobs.append(job)

        return jobs
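
    # Usage sketch: list everything in a queue, or only jobs in a given state
    # (queue and status values are illustrative):
    #
    #     backend.list_jobs("my-queue")                          # all jobs
    #     backend.list_jobs("my-queue", job_status="SUCCEEDED")  # filtered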

    def cancel_job(self, job_id, reason):
        job = self.get_job_by_id(job_id)
        if job is None:
            raise ClientException("Job not found")

        if job.job_state in ["SUBMITTED", "PENDING", "RUNNABLE"]:
            job.terminate(reason)
        # No-op for jobs that have already started - the user has to explicitly terminate those

    def terminate_job(self, job_id, reason):
        if job_id is None:
            raise ClientException("Job ID does not exist")
        if reason is None:
            raise ClientException("Reason does not exist")

        job = self.get_job_by_id(job_id)
        if job is None:
            raise ClientException("Job not found")

        job.terminate(reason)


batch_backends = BackendDict(BatchBackend, "batch")