Better EMR coverage and boto3 request/response handling

This revision includes:

- A handler for requests whose Content-Type is JSON (as sent by boto3).

- A decorator (generate_boto3_response) to convert XML responses to
  JSON (for boto3). This way, the existing response templates for boto
  can be shared to generate boto3 responses.

- Utility class/functions that use botocore's service specification
  data (accessible under botocore.data) for type casting, from query
  parameters to Python objects and from XML to JSON (see the sketch
  after this list).

- Updates to response handlers/models to cover more EMR endpoints and
  mockable parameters.
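
For orientation, a minimal sketch of the request-side conversion,
mirroring the new test added in this commit (the body values below are
illustrative; the botocore data path is the one that test uses):

    from moto.core.responses import AWSServiceSpec, flatten_json_request_body

    # Input spec for EMR's RunJobFlow, read from botocore's bundled data.
    spec = AWSServiceSpec('data/emr/2009-03-31/service-2.json').input_spec('RunJobFlow')

    # A boto3-style JSON body...
    body = {
        'Name': 'cluster',
        'Instances': {
            'InstanceGroups': [
                {'InstanceRole': 'MASTER', 'InstanceType': 'm1.small'},
            ],
        },
    }

    # ...flattened into boto-style query parameters:
    flat = flatten_json_request_body('', body, spec)
    # {'Name': 'cluster',
    #  'Instances.InstanceGroups.member.1.InstanceRole': 'MASTER',
    #  'Instances.InstanceGroups.member.1.InstanceType': 'm1.small'}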
Taro Sato 2016-09-21 20:59:19 -07:00
parent 4157abe8de
commit 7cd404808b
10 changed files with 2442 additions and 884 deletions


@@ -3,3 +3,8 @@ try:
 except ImportError:
     # python 2.6 or earlier, use backport
     from ordereddict import OrderedDict  # flake8: noqa
+
+try:
+    from urlparse import urlparse  # flake8: noqa
+except ImportError:
+    from urllib.parse import urlparse  # flake8: noqa


@@ -8,7 +8,10 @@ from jinja2 import Environment, DictLoader, TemplateNotFound
 import six
 from six.moves.urllib.parse import parse_qs, urlparse
+import xmltodict
+from pkg_resources import resource_filename
 from werkzeug.exceptions import HTTPException

+from moto.compat import OrderedDict
 from moto.core.utils import camelcase_to_underscores, method_names_from_class
@@ -90,6 +93,7 @@ class BaseResponse(_TemplateEnvironmentMixin):

     default_region = 'us-east-1'
     region_regex = r'\.(.+?)\.amazonaws\.com'
+    aws_service_spec = None

     @classmethod
     def dispatch(cls, *args, **kwargs):
@@ -115,7 +119,20 @@ class BaseResponse(_TemplateEnvironmentMixin):
         if not querystring:
             querystring.update(parse_qs(urlparse(full_url).query, keep_blank_values=True))
         if not querystring:
-            querystring.update(parse_qs(self.body, keep_blank_values=True))
+            if 'json' in request.headers.get('content-type', []) and self.aws_service_spec:
+                if isinstance(self.body, six.binary_type):
+                    decoded = json.loads(self.body.decode('utf-8'))
+                else:
+                    decoded = json.loads(self.body)
+
+                target = request.headers.get('x-amz-target') or request.headers.get('X-Amz-Target')
+                service, method = target.split('.')
+                input_spec = self.aws_service_spec.input_spec(method)
+                flat = flatten_json_request_body('', decoded, input_spec)
+                for key, value in flat.items():
+                    querystring[key] = [value]
+            else:
+                querystring.update(parse_qs(self.body, keep_blank_values=True))
         if not querystring:
             querystring.update(headers)
@@ -125,15 +142,19 @@ class BaseResponse(_TemplateEnvironmentMixin):
         self.path = urlparse(full_url).path
         self.querystring = querystring
         self.method = request.method
-        region = re.search(self.region_regex, full_url)
-        if region:
-            self.region = region.group(1)
-        else:
-            self.region = self.default_region
+        self.region = self.get_region_from_url(full_url)

         self.headers = request.headers
         self.response_headers = headers

+    def get_region_from_url(self, full_url):
+        match = re.search(self.region_regex, full_url)
+        if match:
+            region = match.group(1)
+        else:
+            region = self.default_region
+        return region
+
     def _dispatch(self, request, full_url, headers):
         self.setup_class(request, full_url, headers)
         return self.call_action()
@@ -164,21 +185,26 @@ class BaseResponse(_TemplateEnvironmentMixin):
             return status, headers, body
         raise NotImplementedError("The {0} action has not been implemented".format(action))

-    def _get_param(self, param_name):
-        return self.querystring.get(param_name, [None])[0]
+    def _get_param(self, param_name, if_none=None):
+        val = self.querystring.get(param_name)
+        if val is not None:
+            return val[0]
+        return if_none

-    def _get_int_param(self, param_name):
+    def _get_int_param(self, param_name, if_none=None):
         val = self._get_param(param_name)
         if val is not None:
             return int(val)
+        return if_none

-    def _get_bool_param(self, param_name):
+    def _get_bool_param(self, param_name, if_none=None):
         val = self._get_param(param_name)
         if val is not None:
             if val.lower() == 'true':
                 return True
             elif val.lower() == 'false':
                 return False
+        return if_none

     def _get_multi_param(self, param_prefix):
         """
@@ -257,6 +283,28 @@ class BaseResponse(_TemplateEnvironmentMixin):
             param_index += 1
         return results

+    def _get_map_prefix(self, param_prefix):
+        results = {}
+        param_index = 1
+        while 1:
+            index_prefix = '{0}.{1}.'.format(param_prefix, param_index)
+
+            k, v = None, None
+            for key, value in self.querystring.items():
+                if key.startswith(index_prefix):
+                    if key.endswith('.key'):
+                        k = value[0]
+                    elif key.endswith('.value'):
+                        v = value[0]
+
+            if not (k and v):
+                break
+
+            results[k] = v
+            param_index += 1
+
+        return results
+
     @property
     def request_json(self):
         return 'JSON' in self.querystring.get('ContentType', [])
@@ -299,3 +347,227 @@ def metadata_response(request, full_url, headers):
     else:
         raise NotImplementedError("The {0} metadata path has not been implemented".format(path))
     return 200, headers, result
+
+
+class _RecursiveDictRef(object):
+    """Store a recursive reference to dict."""
+    def __init__(self):
+        self.key = None
+        self.dic = {}
+
+    def __repr__(self):
+        return '{!r}'.format(self.dic)
+
+    def __getattr__(self, key):
+        return self.dic.__getattr__(key)
+
+    def set_reference(self, key, dic):
+        """Set the RecursiveDictRef object to keep reference to dict object
+        (dic) at the key.
+        """
+        self.key = key
+        self.dic = dic
+
+
+class AWSServiceSpec(object):
+    """Parse data model from botocore. This is used to recover type info
+    for fields in AWS API XML response.
+    """
+    def __init__(self, path):
+        self.path = resource_filename('botocore', path)
+        with open(self.path) as f:
+            spec = json.load(f)
+        self.metadata = spec['metadata']
+        self.operations = spec['operations']
+        self.shapes = spec['shapes']
+
+    def input_spec(self, operation):
+        try:
+            op = self.operations[operation]
+        except KeyError:
+            raise ValueError('Invalid operation: {}'.format(operation))
+        if 'input' not in op:
+            return {}
+        shape = self.shapes[op['input']['shape']]
+        return self._expand(shape)
+
+    def output_spec(self, operation):
+        """Produce a JSON with a valid API response syntax for operation, but
+        with type information. Each node represented by a key has the
+        value containing field type, e.g.,
+
+          output_spec["SomeBooleanNode"] => {"type": "boolean"}
+        """
+        try:
+            op = self.operations[operation]
+        except KeyError:
+            raise ValueError('Invalid operation: {}'.format(operation))
+        if 'output' not in op:
+            return {}
+        shape = self.shapes[op['output']['shape']]
+        return self._expand(shape)
+
+    def _expand(self, shape):
+        def expand(dic, seen=None):
+            seen = seen or {}
+            if dic['type'] == 'structure':
+                nodes = {}
+                for k, v in dic['members'].items():
+                    seen_till_here = dict(seen)
+                    if k in seen_till_here:
+                        nodes[k] = seen_till_here[k]
+                        continue
+                    seen_till_here[k] = _RecursiveDictRef()
+                    nodes[k] = expand(self.shapes[v['shape']], seen_till_here)
+                    seen_till_here[k].set_reference(k, nodes[k])
+                nodes['type'] = 'structure'
+                return nodes
+
+            elif dic['type'] == 'list':
+                seen_till_here = dict(seen)
+                shape = dic['member']['shape']
+                if shape in seen_till_here:
+                    return seen_till_here[shape]
+                seen_till_here[shape] = _RecursiveDictRef()
+                expanded = expand(self.shapes[shape], seen_till_here)
+                seen_till_here[shape].set_reference(shape, expanded)
+                return {'type': 'list', 'member': expanded}
+
+            elif dic['type'] == 'map':
+                seen_till_here = dict(seen)
+                node = {'type': 'map'}
+
+                if 'shape' in dic['key']:
+                    shape = dic['key']['shape']
+                    seen_till_here[shape] = _RecursiveDictRef()
+                    node['key'] = expand(self.shapes[shape], seen_till_here)
+                    seen_till_here[shape].set_reference(shape, node['key'])
+                else:
+                    node['key'] = dic['key']['type']
+
+                if 'shape' in dic['value']:
+                    shape = dic['value']['shape']
+                    seen_till_here[shape] = _RecursiveDictRef()
+                    node['value'] = expand(self.shapes[shape], seen_till_here)
+                    seen_till_here[shape].set_reference(shape, node['value'])
+                else:
+                    node['value'] = dic['value']['type']
+
+                return node
+
+            else:
+                return {'type': dic['type']}
+
+        return expand(shape)
+
+
+def to_str(value, spec):
+    vtype = spec['type']
+    if vtype == 'boolean':
+        return 'true' if value else 'false'
+    elif vtype == 'integer':
+        return str(value)
+    elif vtype == 'string':
+        return str(value)
+    elif value is None:
+        return 'null'
+    else:
+        raise TypeError('Unknown type {}'.format(vtype))
+
+
+def from_str(value, spec):
+    vtype = spec['type']
+    if vtype == 'boolean':
+        return True if value == 'true' else False
+    elif vtype == 'integer':
+        return int(value)
+    elif vtype == 'float':
+        return float(value)
+    elif vtype == 'timestamp':
+        return value
+    elif vtype == 'string':
+        return value
+    raise TypeError('Unknown type {}'.format(vtype))
+
+
+def flatten_json_request_body(prefix, dict_body, spec):
+    """Convert a JSON request body into query params."""
+    if len(spec) == 1 and 'type' in spec:
+        return {prefix: to_str(dict_body, spec)}
+
+    flat = {}
+    for key, value in dict_body.items():
+        node_type = spec[key]['type']
+        if node_type == 'list':
+            for idx, v in enumerate(value, 1):
+                pref = key + '.member.' + str(idx)
+                flat.update(flatten_json_request_body(pref, v, spec[key]['member']))
+        elif node_type == 'map':
+            for idx, (k, v) in enumerate(value.items(), 1):
+                pref = key + '.entry.' + str(idx)
+                flat.update(flatten_json_request_body(pref + '.key', k, spec[key]['key']))
+                flat.update(flatten_json_request_body(pref + '.value', v, spec[key]['value']))
+        else:
+            flat.update(flatten_json_request_body(key, value, spec[key]))
+
+    if prefix:
+        prefix = prefix + '.'
+    return dict((prefix + k, v) for k, v in flat.items())
+
+
+def xml_to_json_response(service_spec, operation, xml, result_node=None):
+    """Convert rendered XML response to JSON for use with boto3."""
+
+    def transform(value, spec):
+        """Apply transformations to make the output JSON comply with the
+        expected form. This function applies:
+
+        (1) Type cast to nodes with "type" property (e.g., 'true' to
+            True). XML field values are all in text so this step is
+            necessary to convert it to valid JSON objects.
+
+        (2) Squashes "member" nodes to lists.
+        """
+        if len(spec) == 1:
+            return from_str(value, spec)
+
+        od = OrderedDict()
+        for k, v in value.items():
+            if k.startswith('@') or v is None:
+                continue
+
+            if spec[k]['type'] == 'list':
+                if len(spec[k]['member']) == 1:
+                    if isinstance(v['member'], list):
+                        od[k] = transform(v['member'], spec[k]['member'])
+                    else:
+                        od[k] = [transform(v['member'], spec[k]['member'])]
+                elif isinstance(v['member'], list):
+                    od[k] = [transform(o, spec[k]['member']) for o in v['member']]
+                elif isinstance(v['member'], OrderedDict):
+                    od[k] = [transform(v['member'], spec[k]['member'])]
+                else:
+                    raise ValueError('Malformatted input')
+            elif spec[k]['type'] == 'map':
+                key = from_str(v['entry']['key'], spec[k]['key'])
+                val = from_str(v['entry']['value'], spec[k]['value'])
+                od[k] = {key: val}
+            else:
+                od[k] = transform(v, spec[k])
+        return od
+
+    dic = xmltodict.parse(xml)
+    output_spec = service_spec.output_spec(operation)
+    try:
+        for k in (result_node or (operation + 'Response', operation + 'Result')):
+            dic = dic[k]
+    except KeyError:
+        return None
+    else:
+        return transform(dic, output_spec)
+    return None
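
The response-side helper can be exercised the same way. A minimal
sketch; the XML below is hand-written for illustration, and the field
names (Id, Name, NormalizedInstanceHours) are assumed to match
botocore's bundled EMR output spec for ListClusters:

    from moto.core.responses import AWSServiceSpec, xml_to_json_response

    spec = AWSServiceSpec('data/emr/2009-03-31/service-2.json')
    xml = """<ListClustersResponse>
      <ListClustersResult>
        <Clusters>
          <member>
            <Id>j-ABC123</Id>
            <Name>cluster1</Name>
            <NormalizedInstanceHours>0</NormalizedInstanceHours>
          </member>
        </Clusters>
      </ListClustersResult>
    </ListClustersResponse>"""

    # Walks ListClustersResponse -> ListClustersResult, casts leaf text
    # via the output spec (NormalizedInstanceHours -> int) and squashes
    # the single <member> node into a one-element list:
    result = xml_to_json_response(spec, 'ListClusters', xml)
    # OrderedDict equivalent to:
    # {'Clusters': [{'Id': 'j-ABC123', 'Name': 'cluster1',
    #                'NormalizedInstanceHours': 0}]}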


@@ -1,47 +1,227 @@
 from __future__ import unicode_literals
+from datetime import datetime
+
 import boto.emr
+import pytz
+
 from moto.core import BaseBackend
-from .utils import random_instance_group_id, random_job_id
-
-DEFAULT_JOB_FLOW_ROLE = 'EMRJobflowDefault'
+from .utils import random_instance_group_id, random_cluster_id, random_step_id
+
+
+class FakeApplication(object):
+    def __init__(self, name, version, args=None, additional_info=None):
+        self.additional_info = additional_info or {}
+        self.args = args or []
+        self.name = name
+        self.version = version
+
+
+class FakeBootstrapAction(object):
+    def __init__(self, args, name, script_path):
+        self.args = args or []
+        self.name = name
+        self.script_path = script_path


 class FakeInstanceGroup(object):
-    def __init__(self, id, instance_count, instance_role, instance_type, market, name, bid_price=None):
-        self.id = id
+    def __init__(self, instance_count, instance_role, instance_type, market, name,
+                 id=None, bid_price=None):
+        self.id = id or random_instance_group_id()
+
+        self.bid_price = bid_price
+        self.market = market
+        self.name = name
         self.num_instances = instance_count
         self.role = instance_role
         self.type = instance_type
-        self.market = market
-        self.name = name
-        self.bid_price = bid_price
+
+        self.creation_datetime = datetime.now(pytz.utc)
+        self.start_datetime = datetime.now(pytz.utc)
+        self.ready_datetime = datetime.now(pytz.utc)
+        self.end_datetime = None
+        self.state = 'RUNNING'

     def set_instance_count(self, instance_count):
         self.num_instances = instance_count


-class Cluster(object):
-    def __init__(self, id, name, availability_zone, ec2_key_name, subnet_id,
-                 ec2_iam_profile, log_uri):
-        self.id = id
-        self.name = name
-        self.applications = []
-        self.auto_terminate = "false"
-        self.availability_zone = availability_zone
-        self.ec2_key_name = ec2_key_name
-        self.subnet_id = subnet_id
-        self.ec2_iam_profile = ec2_iam_profile
-        self.log_uri = log_uri
-        self.master_public_dns_name = ""
-        self.normalized_instance_hours = 0
-        self.requested_ami_version = "2.4.2"
-        self.running_ami_version = "2.4.2"
-        self.service_role = "my-service-role"
-        self.state = "RUNNING"
-        self.tags = {}
-        self.termination_protected = "false"
-        self.visible_to_all_users = "false"
+class FakeStep(object):
+    def __init__(self,
+                 state,
+                 name='',
+                 jar='',
+                 args=None,
+                 properties=None,
+                 action_on_failure='TERMINATE_CLUSTER'):
+        self.id = random_step_id()
+
+        self.action_on_failure = action_on_failure
+        self.args = args or []
+        self.name = name
+        self.jar = jar
+        self.properties = properties or {}
+
+        self.creation_datetime = datetime.now(pytz.utc)
+        self.end_datetime = None
+        self.ready_datetime = None
+        self.start_datetime = None
+        self.state = state
+
+
+class FakeCluster(object):
+    def __init__(self,
+                 emr_backend,
+                 name,
+                 log_uri,
+                 job_flow_role,
+                 service_role,
+                 steps,
+                 instance_attrs,
+                 bootstrap_actions=None,
+                 configurations=None,
+                 cluster_id=None,
+                 visible_to_all_users='false',
+                 release_label=None,
+                 requested_ami_version=None,
+                 running_ami_version=None):
+        self.id = cluster_id or random_cluster_id()
+        emr_backend.clusters[self.id] = self
+        self.emr_backend = emr_backend
+
+        self.applications = []
+
+        self.bootstrap_actions = []
+        for bootstrap_action in (bootstrap_actions or []):
+            self.add_bootstrap_action(bootstrap_action)
+
+        self.configurations = configurations or []
+
+        self.tags = {}
+
+        self.log_uri = log_uri
+        self.name = name
+        self.normalized_instance_hours = 0
+
+        self.steps = []
+        self.add_steps(steps)
+
+        self.set_visibility(visible_to_all_users)
+
+        self.instance_group_ids = []
+        self.master_instance_group_id = None
+        self.core_instance_group_id = None
+        if 'master_instance_type' in instance_attrs and instance_attrs['master_instance_type']:
+            self.emr_backend.add_instance_groups(
+                self.id,
+                [{'instance_count': 1,
+                  'instance_role': 'MASTER',
+                  'instance_type': instance_attrs['master_instance_type'],
+                  'market': 'ON_DEMAND',
+                  'name': 'master'}])
+        if 'slave_instance_type' in instance_attrs and instance_attrs['slave_instance_type']:
+            self.emr_backend.add_instance_groups(
+                self.id,
+                [{'instance_count': instance_attrs['instance_count'] - 1,
+                  'instance_role': 'CORE',
+                  'instance_type': instance_attrs['slave_instance_type'],
+                  'market': 'ON_DEMAND',
+                  'name': 'slave'}])
+
+        self.additional_master_security_groups = instance_attrs.get('additional_master_security_groups')
+        self.additional_slave_security_groups = instance_attrs.get('additional_slave_security_groups')
+        self.availability_zone = instance_attrs.get('availability_zone')
+        self.ec2_key_name = instance_attrs.get('ec2_key_name')
+        self.ec2_subnet_id = instance_attrs.get('ec2_subnet_id')
+        self.hadoop_version = instance_attrs.get('hadoop_version')
+        self.keep_job_flow_alive_when_no_steps = instance_attrs.get('keep_job_flow_alive_when_no_steps')
+        self.master_security_group = instance_attrs.get('emr_managed_master_security_group')
+        self.service_access_security_group = instance_attrs.get('service_access_security_group')
+        self.slave_security_group = instance_attrs.get('emr_managed_slave_security_group')
+        self.termination_protected = instance_attrs.get('termination_protected')
+
+        self.release_label = release_label
+        self.requested_ami_version = requested_ami_version
+        self.running_ami_version = running_ami_version
+
+        self.role = job_flow_role or 'EMRJobflowDefault'
+        self.service_role = service_role
+
+        self.creation_datetime = datetime.now(pytz.utc)
+        self.start_datetime = None
+        self.ready_datetime = None
+        self.end_datetime = None
+        self.state = None
+
+        self.start_cluster()
+        self.run_bootstrap_actions()
+
+    @property
+    def instance_groups(self):
+        return self.emr_backend.get_instance_groups(self.instance_group_ids)
+
+    @property
+    def master_instance_type(self):
+        return self.emr_backend.instance_groups[self.master_instance_group_id].type
+
+    @property
+    def slave_instance_type(self):
+        return self.emr_backend.instance_groups[self.core_instance_group_id].type
+
+    @property
+    def instance_count(self):
+        return sum(group.num_instances for group in self.instance_groups)
+
+    def start_cluster(self):
+        self.state = 'STARTING'
+        self.start_datetime = datetime.now(pytz.utc)
+
+    def run_bootstrap_actions(self):
+        self.state = 'BOOTSTRAPPING'
+        self.ready_datetime = datetime.now(pytz.utc)
+        self.state = 'WAITING'
+        if not self.steps:
+            if not self.keep_job_flow_alive_when_no_steps:
+                self.terminate()
+
+    def terminate(self):
+        self.state = 'TERMINATING'
+        self.end_datetime = datetime.now(pytz.utc)
+        self.state = 'TERMINATED'
+
+    def add_applications(self, applications):
+        self.applications.extend([
+            FakeApplication(
+                name=app.get('name', ''),
+                version=app.get('version', ''),
+                args=app.get('args', []),
+                additional_info=app.get('additiona_info', {}))
+            for app in applications])
+
+    def add_bootstrap_action(self, bootstrap_action):
+        self.bootstrap_actions.append(FakeBootstrapAction(**bootstrap_action))
+
+    def add_instance_group(self, instance_group):
+        if instance_group.role == 'MASTER':
+            if self.master_instance_group_id:
+                raise Exception('Cannot add another master instance group')
+            self.master_instance_group_id = instance_group.id
+        if instance_group.role == 'CORE':
+            if self.core_instance_group_id:
+                raise Exception('Cannot add another core instance group')
+            self.core_instance_group_id = instance_group.id
+        self.instance_group_ids.append(instance_group.id)
+
+    def add_steps(self, steps):
+        added_steps = []
+        for step in steps:
+            if self.steps:
+                # If we already have other steps, this one is pending
+                fake = FakeStep(state='PENDING', **step)
+            else:
+                fake = FakeStep(state='STARTING', **step)
+            self.steps.append(fake)
+            added_steps.append(fake)
+        self.state = 'RUNNING'
+        return added_steps

     def add_tags(self, tags):
         self.tags.update(tags)
@@ -50,166 +230,61 @@ class Cluster(object):
         for key in tag_keys:
             self.tags.pop(key, None)

-class FakeStep(object):
-    def __init__(self, state, **kwargs):
-        # 'Steps.member.1.HadoopJarStep.Jar': ['/home/hadoop/contrib/streaming/hadoop-streaming.jar'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.1': ['-mapper'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.2': ['s3n://elasticmapreduce/samples/wordcount/wordSplitter.py'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.3': ['-reducer'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.4': ['aggregate'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.5': ['-input'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.6': ['s3n://elasticmapreduce/samples/wordcount/input'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.7': ['-output'],
-        # 'Steps.member.1.HadoopJarStep.Args.member.8': ['s3n://<my output bucket>/output/wordcount_output'],
-        # 'Steps.member.1.ActionOnFailure': ['TERMINATE_JOB_FLOW'],
-        # 'Steps.member.1.Name': ['My wordcount example']}
-        self.action_on_failure = kwargs['action_on_failure']
-        self.name = kwargs['name']
-        self.jar = kwargs['hadoop_jar_step._jar']
-        self.args = []
-        self.state = state
-
-        arg_index = 1
-        while True:
-            arg = kwargs.get('hadoop_jar_step._args.member.{0}'.format(arg_index))
-            if arg:
-                self.args.append(arg)
-                arg_index += 1
-            else:
-                break
-
-
-class FakeJobFlow(object):
-    def __init__(self, job_id, name, log_uri, job_flow_role, visible_to_all_users, steps, instance_attrs, emr_backend):
-        self.id = job_id
-        self.name = name
-        self.log_uri = log_uri
-        self.role = job_flow_role or DEFAULT_JOB_FLOW_ROLE
-        self.state = "STARTING"
-        self.steps = []
-        self.add_steps(steps)
-
-        self.initial_instance_count = instance_attrs.get('instance_count', 0)
-        self.initial_master_instance_type = instance_attrs.get('master_instance_type')
-        self.initial_slave_instance_type = instance_attrs.get('slave_instance_type')
-
-        self.set_visibility(visible_to_all_users)
-        self.normalized_instance_hours = 0
-        self.ec2_key_name = instance_attrs.get('ec2_key_name')
-        self.availability_zone = instance_attrs.get('placement.availability_zone')
-        self.subnet_id = instance_attrs.get('ec2_subnet_id')
-        self.keep_job_flow_alive_when_no_steps = instance_attrs.get('keep_job_flow_alive_when_no_steps')
-        self.termination_protected = instance_attrs.get('termination_protected')
-
-        self.instance_group_ids = []
-        self.emr_backend = emr_backend
-
-    def create_cluster(self):
-        cluster = Cluster(
-            id=self.id,
-            name=self.name,
-            availability_zone=self.availability_zone,
-            ec2_key_name=self.ec2_key_name,
-            subnet_id=self.subnet_id,
-            ec2_iam_profile=self.role,
-            log_uri=self.log_uri,
-        )
-        return cluster
-
-    def terminate(self):
-        self.state = 'TERMINATED'
-
-    def set_visibility(self, visibility):
-        if visibility == 'true':
-            self.visible_to_all_users = True
-        else:
-            self.visible_to_all_users = False
-
     def set_termination_protection(self, value):
         self.termination_protected = value

-    def add_steps(self, steps):
-        for index, step in enumerate(steps):
-            if self.steps:
-                # If we already have other steps, this one is pending
-                self.steps.append(FakeStep(state='PENDING', **step))
-            else:
-                self.steps.append(FakeStep(state='STARTING', **step))
-
-    def add_instance_group(self, instance_group_id):
-        self.instance_group_ids.append(instance_group_id)
-
-    @property
-    def instance_groups(self):
-        return self.emr_backend.get_instance_groups(self.instance_group_ids)
-
-    @property
-    def master_instance_type(self):
-        groups = self.instance_groups
-        if groups:
-            return groups[0].type
-        else:
-            return self.initial_master_instance_type
-
-    @property
-    def slave_instance_type(self):
-        groups = self.instance_groups
-        if groups:
-            return groups[0].type
-        else:
-            return self.initial_slave_instance_type
-
-    @property
-    def instance_count(self):
-        groups = self.instance_groups
-        if not groups:
-            # No groups,return initial instance count
-            return self.initial_instance_count
-        count = 0
-        for group in groups:
-            count += int(group.num_instances)
-        return count
+    def set_visibility(self, visibility):
+        self.visible_to_all_users = visibility


 class ElasticMapReduceBackend(BaseBackend):

-    def __init__(self):
-        self.job_flows = {}
+    def __init__(self, region_name):
+        super(ElasticMapReduceBackend, self).__init__()
+        self.region_name = region_name
         self.clusters = {}
         self.instance_groups = {}

-    def run_job_flow(self, name, log_uri, job_flow_role, visible_to_all_users, steps, instance_attrs):
-        job_id = random_job_id()
-        job_flow = FakeJobFlow(
-            job_id, name, log_uri, job_flow_role, visible_to_all_users, steps, instance_attrs, self)
-        self.job_flows[job_id] = job_flow
-        cluster = job_flow.create_cluster()
-        self.clusters[cluster.id] = cluster
-        return job_flow
+    def reset(self):
+        region_name = self.region_name
+        self.__dict__ = {}
+        self.__init__(region_name)
+
+    def add_applications(self, cluster_id, applications):
+        cluster = self.get_cluster(cluster_id)
+        cluster.add_applications(applications)
+
+    def add_instance_groups(self, cluster_id, instance_groups):
+        cluster = self.clusters[cluster_id]
+        result_groups = []
+        for instance_group in instance_groups:
+            group = FakeInstanceGroup(**instance_group)
+            self.instance_groups[group.id] = group
+            cluster.add_instance_group(group)
+            result_groups.append(group)
+        return result_groups

     def add_job_flow_steps(self, job_flow_id, steps):
-        job_flow = self.job_flows[job_flow_id]
-        job_flow.add_steps(steps)
-        return job_flow
+        cluster = self.clusters[job_flow_id]
+        steps = cluster.add_steps(steps)
+        return steps
+
+    def add_tags(self, cluster_id, tags):
+        cluster = self.get_cluster(cluster_id)
+        cluster.add_tags(tags)

     def describe_job_flows(self, job_flow_ids=None):
-        jobs = self.job_flows.values()
+        clusters = self.clusters.values()
         if job_flow_ids:
-            return [job for job in jobs if job.id in job_flow_ids]
+            return [cluster for cluster in clusters if cluster.id in job_flow_ids]
         else:
-            return jobs
+            return clusters

-    def terminate_job_flows(self, job_ids):
-        flows = [flow for flow in self.describe_job_flows() if flow.id in job_ids]
-        for flow in flows:
-            flow.terminate()
-        return flows
-
-    def list_clusters(self):
-        return self.clusters.values()
+    def describe_step(self, cluster_id, step_id):
+        cluster = self.clusters[cluster_id]
+        for step in cluster.steps:
+            if step.id == step_id:
+                return step

     def get_cluster(self, cluster_id):
         return self.clusters[cluster_id]
@@ -221,43 +296,50 @@ class ElasticMapReduceBackend(BaseBackend):
             if group_id in instance_group_ids
         ]

-    def add_instance_groups(self, job_flow_id, instance_groups):
-        job_flow = self.job_flows[job_flow_id]
-        result_groups = []
-        for instance_group in instance_groups:
-            instance_group_id = random_instance_group_id()
-            group = FakeInstanceGroup(instance_group_id, **instance_group)
-            self.instance_groups[instance_group_id] = group
-            job_flow.add_instance_group(instance_group_id)
-            result_groups.append(group)
-        return result_groups
+    def list_bootstrap_actions(self, cluster_id):
+        return self.clusters[cluster_id].bootstrap_actions
+
+    def list_clusters(self):
+        return self.clusters.values()
+
+    def list_instance_groups(self, cluster_id):
+        return self.clusters[cluster_id].instance_groups
+
+    def list_steps(self, cluster_id, step_states=None):
+        return self.clusters[cluster_id].steps

     def modify_instance_groups(self, instance_groups):
         result_groups = []
         for instance_group in instance_groups:
             group = self.instance_groups[instance_group['instance_group_id']]
-            group.set_instance_count(instance_group['instance_count'])
+            group.set_instance_count(int(instance_group['instance_count']))
         return result_groups

-    def set_visible_to_all_users(self, job_ids, visible_to_all_users):
-        for job_id in job_ids:
-            job = self.job_flows[job_id]
-            job.set_visibility(visible_to_all_users)
-
-    def set_termination_protection(self, job_ids, value):
-        for job_id in job_ids:
-            job = self.job_flows[job_id]
-            job.set_termination_protection(value)
-
-    def add_tags(self, cluster_id, tags):
-        cluster = self.get_cluster(cluster_id)
-        cluster.add_tags(tags)
-
     def remove_tags(self, cluster_id, tag_keys):
         cluster = self.get_cluster(cluster_id)
         cluster.remove_tags(tag_keys)

+    def run_job_flow(self, **kwargs):
+        return FakeCluster(self, **kwargs)
+
+    def set_visible_to_all_users(self, job_flow_ids, visible_to_all_users):
+        for job_flow_id in job_flow_ids:
+            cluster = self.clusters[job_flow_id]
+            cluster.set_visibility(visible_to_all_users)
+
+    def set_termination_protection(self, job_flow_ids, value):
+        for job_flow_id in job_flow_ids:
+            cluster = self.clusters[job_flow_id]
+            cluster.set_termination_protection(value)
+
+    def terminate_job_flows(self, job_flow_ids):
+        clusters = [cluster for cluster in self.describe_job_flows()
+                    if cluster.id in job_flow_ids]
+        for cluster in clusters:
+            cluster.terminate()
+        return clusters
+

 emr_backends = {}
 for region in boto.emr.regions():
-    emr_backends[region.name] = ElasticMapReduceBackend()
+    emr_backends[region.name] = ElasticMapReduceBackend(region.name)
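
A minimal sketch of how the reworked models fit together (argument
values are illustrative): FakeCluster registers itself with the backend
passed to it, so run_job_flow only constructs one, and the STARTING ->
BOOTSTRAPPING -> WAITING transitions happen inside __init__:

    from moto.emr.models import emr_backends

    backend = emr_backends['us-east-1']
    cluster = backend.run_job_flow(
        name='test-cluster',
        log_uri='s3://bucket/logs',
        job_flow_role=None,            # falls back to 'EMRJobflowDefault'
        service_role='EMR_DefaultRole',
        steps=[],
        instance_attrs={'keep_job_flow_alive_when_no_steps': True},
    )
    assert backend.clusters[cluster.id] is cluster
    assert cluster.state == 'WAITING'  # no steps, but kept alive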

File diff suppressed because it is too large


@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 from .responses import ElasticMapReduceResponse

 url_bases = [
+    "https?://(.+).elasticmapreduce.amazonaws.com",
     "https?://elasticmapreduce.(.+).amazonaws.com",
 ]


@@ -1,19 +1,25 @@
 from __future__ import unicode_literals
 import random
 import string

 import six


-def random_job_id(size=13):
+def random_id(size=13):
     chars = list(range(10)) + list(string.ascii_uppercase)
-    job_tag = ''.join(six.text_type(random.choice(chars)) for x in range(size))
-    return 'j-{0}'.format(job_tag)
+    return ''.join(six.text_type(random.choice(chars)) for x in range(size))
+
+
+def random_cluster_id(size=13):
+    return 'j-{0}'.format(random_id())
+
+
+def random_step_id(size=13):
+    return 's-{0}'.format(random_id())


 def random_instance_group_id(size=13):
-    chars = list(range(10)) + list(string.ascii_uppercase)
-    job_tag = ''.join(six.text_type(random.choice(chars)) for x in range(size))
-    return 'i-{0}'.format(job_tag)
+    return 'i-{0}'.format(random_id())


 def tags_from_query_string(querystring_dict):

@@ -30,3 +36,18 @@ def tags_from_query_string(querystring_dict):
         else:
             response_values[tag_key] = None
     return response_values
+
+
+def steps_from_query_string(querystring_dict):
+    steps = []
+    for step in querystring_dict:
+        step['jar'] = step.pop('hadoop_jar_step._jar')
+        step['properties'] = dict((o['Key'], o['Value']) for o in step.get('properties', []))
+        step['args'] = []
+        idx = 1
+        keyfmt = 'hadoop_jar_step._args.member.{0}'
+        while keyfmt.format(idx) in step:
+            step['args'].append(step.pop(keyfmt.format(idx)))
+            idx += 1
+        steps.append(step)
+    return steps
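
A minimal sketch of the step parser above; the input mimics the step
dicts parsed out of 'Steps.member.N.*' query parameters (the values are
illustrative):

    from moto.emr.utils import steps_from_query_string

    raw_steps = [{
        'name': 'My wordcount example',
        'action_on_failure': 'TERMINATE_JOB_FLOW',
        'hadoop_jar_step._jar': '/home/hadoop/hadoop-streaming.jar',
        'hadoop_jar_step._args.member.1': '-mapper',
        'hadoop_jar_step._args.member.2': 'wordSplitter.py',
    }]

    steps = steps_from_query_string(raw_steps)
    # [{'name': 'My wordcount example',
    #   'action_on_failure': 'TERMINATE_JOB_FLOW',
    #   'jar': '/home/hadoop/hadoop-streaming.jar',
    #   'properties': {},
    #   'args': ['-mapper', 'wordSplitter.py']}]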


@@ -10,6 +10,7 @@ install_requires = [
     "xmltodict",
     "six",
     "werkzeug",
+    "pytz"
 ]

 extras_require = {


@@ -0,0 +1,73 @@
+from __future__ import unicode_literals
+import sure  # noqa
+
+from moto.core.responses import AWSServiceSpec
+from moto.core.responses import flatten_json_request_body
+
+
+def test_flatten_json_request_body():
+    spec = AWSServiceSpec('data/emr/2009-03-31/service-2.json').input_spec('RunJobFlow')
+
+    body = {
+        'Name': 'cluster',
+        'Instances': {
+            'Ec2KeyName': 'ec2key',
+            'InstanceGroups': [
+                {'InstanceRole': 'MASTER',
+                 'InstanceType': 'm1.small'},
+                {'InstanceRole': 'CORE',
+                 'InstanceType': 'm1.medium'},
+            ],
+            'Placement': {'AvailabilityZone': 'us-east-1'},
+        },
+        'Steps': [
+            {'HadoopJarStep': {
+                'Properties': [
+                    {'Key': 'k1', 'Value': 'v1'},
+                    {'Key': 'k2', 'Value': 'v2'}
+                ],
+                'Args': ['arg1', 'arg2']}},
+        ],
+        'Configurations': [
+            {'Classification': 'class',
+             'Properties': {'propkey1': 'propkey1',
+                            'propkey2': 'propkey2'}},
+            {'Classification': 'anotherclass',
+             'Properties': {'propkey3': 'propkey3'}},
+        ]
+    }
+
+    flat = flatten_json_request_body('', body, spec)
+    flat['Name'].should.equal(body['Name'])
+    flat['Instances.Ec2KeyName'].should.equal(body['Instances']['Ec2KeyName'])
+    for idx in range(2):
+        flat['Instances.InstanceGroups.member.' + str(idx + 1) + '.InstanceRole'].should.equal(body['Instances']['InstanceGroups'][idx]['InstanceRole'])
+        flat['Instances.InstanceGroups.member.' + str(idx + 1) + '.InstanceType'].should.equal(body['Instances']['InstanceGroups'][idx]['InstanceType'])
+    flat['Instances.Placement.AvailabilityZone'].should.equal(body['Instances']['Placement']['AvailabilityZone'])
+
+    for idx in range(1):
+        prefix = 'Steps.member.' + str(idx + 1) + '.HadoopJarStep'
+        step = body['Steps'][idx]['HadoopJarStep']
+        i = 0
+        while prefix + '.Properties.member.' + str(i + 1) + '.Key' in flat:
+            flat[prefix + '.Properties.member.' + str(i + 1) + '.Key'].should.equal(step['Properties'][i]['Key'])
+            flat[prefix + '.Properties.member.' + str(i + 1) + '.Value'].should.equal(step['Properties'][i]['Value'])
+            i += 1
+        i = 0
+        while prefix + '.Args.member.' + str(i + 1) in flat:
+            flat[prefix + '.Args.member.' + str(i + 1)].should.equal(step['Args'][i])
+            i += 1
+
+    for idx in range(2):
+        flat['Configurations.member.' + str(idx + 1) + '.Classification'].should.equal(body['Configurations'][idx]['Classification'])
+        props = {}
+        i = 1
+        keyfmt = 'Configurations.member.{0}.Properties.entry.{1}'
+        key = keyfmt.format(idx + 1, i)
+        while key + '.key' in flat:
+            props[flat[key + '.key']] = flat[key + '.value']
+            i += 1
+            key = keyfmt.format(idx + 1, i)
+        props.should.equal(body['Configurations'][idx]['Properties'])


@@ -1,197 +1,111 @@
 from __future__ import unicode_literals
 import boto
+from boto.emr.bootstrap_action import BootstrapAction
 from boto.emr.instance_group import InstanceGroup
 from boto.emr.step import StreamingStep
+import six
 import sure  # noqa

 from moto import mock_emr
 from tests.helpers import requires_boto_gte


-@mock_emr
-def test_create_job_flow_in_multiple_regions():
-    step = StreamingStep(
-        name='My wordcount example',
-        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
-        reducer='aggregate',
-        input='s3n://elasticmapreduce/samples/wordcount/input',
-        output='s3n://output_bucket/output/wordcount_output'
-    )
-
-    west1_conn = boto.emr.connect_to_region('us-east-1')
-    west1_job_id = west1_conn.run_jobflow(
-        name='us-east-1',
-        log_uri='s3://some_bucket/jobflow_logs',
-        master_instance_type='m1.medium',
-        slave_instance_type='m1.small',
-        steps=[step],
-    )
-
-    west2_conn = boto.emr.connect_to_region('eu-west-1')
-    west2_job_id = west2_conn.run_jobflow(
-        name='eu-west-1',
-        log_uri='s3://some_bucket/jobflow_logs',
-        master_instance_type='m1.medium',
-        slave_instance_type='m1.small',
-        steps=[step],
-    )
-
-    west1_job_flow = west1_conn.describe_jobflow(west1_job_id)
-    west1_job_flow.name.should.equal('us-east-1')
-    west2_job_flow = west2_conn.describe_jobflow(west2_job_id)
-    west2_job_flow.name.should.equal('eu-west-1')
-
-
-@mock_emr
-def test_create_job_flow():
-    conn = boto.connect_emr()
-
-    step1 = StreamingStep(
-        name='My wordcount example',
-        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
-        reducer='aggregate',
-        input='s3n://elasticmapreduce/samples/wordcount/input',
-        output='s3n://output_bucket/output/wordcount_output'
-    )
-
-    step2 = StreamingStep(
-        name='My wordcount example2',
-        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
-        reducer='aggregate',
-        input='s3n://elasticmapreduce/samples/wordcount/input2',
-        output='s3n://output_bucket/output/wordcount_output2'
-    )
-
-    job_id = conn.run_jobflow(
-        name='My jobflow',
-        log_uri='s3://some_bucket/jobflow_logs',
-        master_instance_type='m1.medium',
-        slave_instance_type='m1.small',
-        steps=[step1, step2],
-    )
-
-    job_flow = conn.describe_jobflow(job_id)
-    job_flow.state.should.equal('STARTING')
-    job_flow.jobflowid.should.equal(job_id)
-    job_flow.name.should.equal('My jobflow')
-    job_flow.masterinstancetype.should.equal('m1.medium')
-    job_flow.slaveinstancetype.should.equal('m1.small')
-    job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs')
-    job_flow.visibletoallusers.should.equal('False')
-    int(job_flow.normalizedinstancehours).should.equal(0)
-    job_step = job_flow.steps[0]
-    job_step.name.should.equal('My wordcount example')
-    job_step.state.should.equal('STARTING')
-    args = [arg.value for arg in job_step.args]
-    args.should.equal([
-        '-mapper',
-        's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
-        '-reducer',
-        'aggregate',
-        '-input',
-        's3n://elasticmapreduce/samples/wordcount/input',
-        '-output',
-        's3n://output_bucket/output/wordcount_output',
-    ])
-    job_step2 = job_flow.steps[1]
-    job_step2.name.should.equal('My wordcount example2')
-    job_step2.state.should.equal('PENDING')
-    args = [arg.value for arg in job_step2.args]
-    args.should.equal([
-        '-mapper',
-        's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
-        '-reducer',
-        'aggregate',
-        '-input',
-        's3n://elasticmapreduce/samples/wordcount/input2',
-        '-output',
-        's3n://output_bucket/output/wordcount_output2',
-    ])
-
-
-@requires_boto_gte("2.8")
-@mock_emr
-def test_create_job_flow_with_new_params():
-    # Test that run_jobflow works with newer params
-    conn = boto.connect_emr()
-    conn.run_jobflow(
-        name='My jobflow',
-        log_uri='s3://some_bucket/jobflow_logs',
-        master_instance_type='m1.medium',
-        slave_instance_type='m1.small',
-        job_flow_role='some-role-arn',
-        steps=[],
-    )
-
-
-@requires_boto_gte("2.8")
-@mock_emr
-def test_create_job_flow_visible_to_all_users():
-    conn = boto.connect_emr()
-    job_id = conn.run_jobflow(
-        name='My jobflow',
-        log_uri='s3://some_bucket/jobflow_logs',
-        steps=[],
-        visible_to_all_users=True,
-    )
-    job_flow = conn.describe_jobflow(job_id)
-    job_flow.visibletoallusers.should.equal('True')
-
-
-@requires_boto_gte("2.8")
-@mock_emr
-def test_create_job_flow_with_instance_groups():
-    conn = boto.connect_emr()
-
-    instance_groups = [InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07'),
-                       InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')]
-    job_id = conn.run_jobflow(
-        name='My jobflow',
-        log_uri='s3://some_bucket/jobflow_logs',
-        steps=[],
-        instance_groups=instance_groups
-    )
-
-    job_flow = conn.describe_jobflow(job_id)
-    int(job_flow.instancecount).should.equal(12)
-    instance_group = job_flow.instancegroups[0]
-    int(instance_group.instancerunningcount).should.equal(6)
-
-
-@mock_emr
-def test_terminate_job_flow():
-    conn = boto.connect_emr()
-    job_id = conn.run_jobflow(
-        name='My jobflow',
-        log_uri='s3://some_bucket/jobflow_logs',
-        steps=[]
-    )
-
-    flow = conn.describe_jobflows()[0]
-    flow.state.should.equal('STARTING')
-    conn.terminate_jobflow(job_id)
-    flow = conn.describe_jobflows()[0]
-    flow.state.should.equal('TERMINATED')
-
-
-@mock_emr
-def test_describe_job_flows():
-    conn = boto.connect_emr()
-    job1_id = conn.run_jobflow(
-        name='My jobflow',
-        log_uri='s3://some_bucket/jobflow_logs',
-        steps=[]
-    )
-    job2_id = conn.run_jobflow(
-        name='My jobflow',
-        log_uri='s3://some_bucket/jobflow_logs',
-        steps=[]
-    )
+run_jobflow_args = dict(
+    job_flow_role='EMR_EC2_DefaultRole',
+    keep_alive=True,
+    log_uri='s3://some_bucket/jobflow_logs',
+    master_instance_type='c1.medium',
+    name='My jobflow',
+    num_instances=2,
+    service_role='EMR_DefaultRole',
+    slave_instance_type='c1.medium',
+)
+
+
+input_instance_groups = [
+    InstanceGroup(1, 'MASTER', 'c1.medium', 'ON_DEMAND', 'master'),
+    InstanceGroup(3, 'CORE', 'c1.medium', 'ON_DEMAND', 'core'),
+    InstanceGroup(6, 'TASK', 'c1.large', 'SPOT', 'task-1', '0.07'),
+    InstanceGroup(10, 'TASK', 'c1.xlarge', 'SPOT', 'task-2', '0.05'),
+]
+
+
+@mock_emr
+def test_describe_cluster():
+    conn = boto.connect_emr()
+    args = run_jobflow_args.copy()
+    args.update(dict(
+        api_params={
+            'Applications.member.1.Name': 'Spark',
+            'Applications.member.1.Version': '2.4.2',
+            'Configurations.member.1.Classification': 'yarn-site',
+            'Configurations.member.1.Properties.entry.1.key': 'someproperty',
+            'Configurations.member.1.Properties.entry.1.value': 'somevalue',
+            'Instances.EmrManagedMasterSecurityGroup': 'master-security-group',
+            'Instances.Ec2SubnetId': 'subnet-8be41cec',
+        },
+        availability_zone='us-east-2b',
+        ec2_keyname='mykey',
+        job_flow_role='EMR_EC2_DefaultRole',
+        keep_alive=False,
+        log_uri='s3://some_bucket/jobflow_logs',
+        name='My jobflow',
+        service_role='EMR_DefaultRole',
+        visible_to_all_users=True,
+    ))
+    cluster_id = conn.run_jobflow(**args)
+    input_tags = {'tag1': 'val1', 'tag2': 'val2'}
+    conn.add_tags(cluster_id, input_tags)
+
+    cluster = conn.describe_cluster(cluster_id)
+    cluster.applications[0].name.should.equal('Spark')
+    cluster.applications[0].version.should.equal('2.4.2')
+    cluster.autoterminate.should.equal('true')
+
+    # configurations appear not be supplied as attributes?
+
+    attrs = cluster.ec2instanceattributes
+    # AdditionalMasterSecurityGroups
+    # AdditionalSlaveSecurityGroups
+    attrs.ec2availabilityzone.should.equal(args['availability_zone'])
+    attrs.ec2keyname.should.equal(args['ec2_keyname'])
+    attrs.ec2subnetid.should.equal(args['api_params']['Instances.Ec2SubnetId'])
+    # EmrManagedMasterSecurityGroups
+    # EmrManagedSlaveSecurityGroups
+    attrs.iaminstanceprofile.should.equal(args['job_flow_role'])
+    # ServiceAccessSecurityGroup
+
+    cluster.id.should.equal(cluster_id)
+    cluster.loguri.should.equal(args['log_uri'])
+    cluster.masterpublicdnsname.should.be.a(six.string_types)
+    cluster.name.should.equal(args['name'])
+    int(cluster.normalizedinstancehours).should.equal(0)
+    # cluster.release_label
+    cluster.shouldnt.have.property('requestedamiversion')
+    cluster.runningamiversion.should.equal('1.0.0')
+    # cluster.securityconfiguration
+    cluster.servicerole.should.equal(args['service_role'])
+    cluster.status.state.should.equal('TERMINATED')
+    cluster.status.statechangereason.message.should.be.a(six.string_types)
+    cluster.status.statechangereason.code.should.be.a(six.string_types)
+    cluster.status.timeline.creationdatetime.should.be.a(six.string_types)
+    # cluster.status.timeline.enddatetime.should.be.a(six.string_types)
+    # cluster.status.timeline.readydatetime.should.be.a(six.string_types)
+    dict((item.key, item.value) for item in cluster.tags).should.equal(input_tags)
+    cluster.terminationprotected.should.equal('false')
+    cluster.visibletoallusers.should.equal('true')
+
+
+@mock_emr
+def test_describe_jobflows():
+    conn = boto.connect_emr()
+    job1_id = conn.run_jobflow(**run_jobflow_args)
+    job2_id = conn.run_jobflow(**run_jobflow_args)

     jobs = conn.describe_jobflows()
     jobs.should.have.length_of(2)
@ -205,252 +119,454 @@ def test_describe_job_flows():
@mock_emr @mock_emr
def test_add_steps_to_flow(): def test_describe_jobflow():
conn = boto.connect_emr() conn = boto.connect_emr()
args = run_jobflow_args.copy()
args.update(dict(
ami_version='3.8.1',
api_params={
#'Applications.member.1.Name': 'Spark',
#'Applications.member.1.Version': '2.4.2',
#'Configurations.member.1.Classification': 'yarn-site',
#'Configurations.member.1.Properties.entry.1.key': 'someproperty',
#'Configurations.member.1.Properties.entry.1.value': 'somevalue',
#'Instances.EmrManagedMasterSecurityGroup': 'master-security-group',
'Instances.Ec2SubnetId': 'subnet-8be41cec',
},
ec2_keyname='mykey',
hadoop_version='2.4.0',
step1 = StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
job_id = conn.run_jobflow(
name='My jobflow', name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs', log_uri='s3://some_bucket/jobflow_logs',
steps=[step1] keep_alive=True,
) master_instance_type='c1.medium',
slave_instance_type='c1.medium',
num_instances=2,
availability_zone='us-west-2b',
job_flow_role='EMR_EC2_DefaultRole',
service_role='EMR_DefaultRole',
visible_to_all_users=True,
))
cluster_id = conn.run_jobflow(**args)
jf = conn.describe_jobflow(cluster_id)
jf.amiversion.should.equal(args['ami_version'])
jf.bootstrapactions.should.equal(None)
jf.creationdatetime.should.be.a(six.string_types)
jf.should.have.property('laststatechangereason')
jf.readydatetime.should.be.a(six.string_types)
jf.startdatetime.should.be.a(six.string_types)
jf.state.should.equal('WAITING')
jf.ec2keyname.should.equal(args['ec2_keyname'])
# Ec2SubnetId
jf.hadoopversion.should.equal(args['hadoop_version'])
int(jf.instancecount).should.equal(2)
for ig in jf.instancegroups:
ig.creationdatetime.should.be.a(six.string_types)
# ig.enddatetime.should.be.a(six.string_types)
ig.should.have.property('instancegroupid').being.a(six.string_types)
int(ig.instancerequestcount).should.equal(1)
ig.instancerole.should.be.within(['MASTER', 'CORE'])
int(ig.instancerunningcount).should.equal(1)
ig.instancetype.should.equal('c1.medium')
ig.laststatechangereason.should.be.a(six.string_types)
ig.market.should.equal('ON_DEMAND')
ig.name.should.be.a(six.string_types)
ig.readydatetime.should.be.a(six.string_types)
ig.startdatetime.should.be.a(six.string_types)
ig.state.should.equal('RUNNING')
jf.keepjobflowalivewhennosteps.should.equal('true')
jf.masterinstanceid.should.be.a(six.string_types)
jf.masterinstancetype.should.equal(args['master_instance_type'])
jf.masterpublicdnsname.should.be.a(six.string_types)
int(jf.normalizedinstancehours).should.equal(0)
jf.availabilityzone.should.equal(args['availability_zone'])
jf.slaveinstancetype.should.equal(args['slave_instance_type'])
jf.terminationprotected.should.equal('false')
jf.jobflowid.should.equal(cluster_id)
# jf.jobflowrole.should.equal(args['job_flow_role'])
jf.loguri.should.equal(args['log_uri'])
jf.name.should.equal(args['name'])
# jf.servicerole.should.equal(args['service_role'])
jf.steps.should.have.length_of(0)
list(i.value for i in jf.supported_products).should.equal([])
jf.visibletoallusers.should.equal('true')
@mock_emr
def test_list_clusters():
conn = boto.connect_emr()
args = run_jobflow_args.copy()
args['name'] = 'jobflow1'
cluster1_id = conn.run_jobflow(**args)
args['name'] = 'jobflow2'
cluster2_id = conn.run_jobflow(**args)
conn.terminate_jobflow(cluster2_id)
summary = conn.list_clusters()
clusters = summary.clusters
clusters.should.have.length_of(2)
expected = {
cluster1_id: {
'id': cluster1_id,
'name': 'jobflow1',
'normalizedinstancehours': 0,
'state': 'WAITING'},
cluster2_id: {
'id': cluster2_id,
'name': 'jobflow2',
'normalizedinstancehours': 0,
'state': 'TERMINATED'},
}
for x in clusters:
y = expected[x.id]
x.id.should.equal(y['id'])
x.name.should.equal(y['name'])
int(x.normalizedinstancehours).should.equal(y['normalizedinstancehours'])
x.status.state.should.equal(y['state'])
x.status.timeline.creationdatetime.should.be.a(six.string_types)
if y['state'] == 'TERMINATED':
x.status.timeline.enddatetime.should.be.a(six.string_types)
else:
x.status.timeline.shouldnt.have.property('enddatetime')
x.status.timeline.readydatetime.should.be.a(six.string_types)
@mock_emr
def test_run_jobflow():
conn = boto.connect_emr()
args = run_jobflow_args.copy()
job_id = conn.run_jobflow(**args)
job_flow = conn.describe_jobflow(job_id) job_flow = conn.describe_jobflow(job_id)
job_flow.state.should.equal('STARTING') job_flow.state.should.equal('WAITING')
job_flow.jobflowid.should.equal(job_id) job_flow.jobflowid.should.equal(job_id)
job_flow.name.should.equal('My jobflow') job_flow.name.should.equal(args['name'])
job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs') job_flow.masterinstancetype.should.equal(args['master_instance_type'])
job_flow.slaveinstancetype.should.equal(args['slave_instance_type'])
step2 = StreamingStep( job_flow.loguri.should.equal(args['log_uri'])
name='My wordcount example2', job_flow.visibletoallusers.should.equal('false')
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py', int(job_flow.normalizedinstancehours).should.equal(0)
reducer='aggregate', job_flow.steps.should.have.length_of(0)
input='s3n://elasticmapreduce/samples/wordcount/input2',
output='s3n://output_bucket/output/wordcount_output2'
)
conn.add_jobflow_steps(job_id, [step2])
job_flow = conn.describe_jobflow(job_id)
job_step = job_flow.steps[0]
job_step.name.should.equal('My wordcount example')
job_step.state.should.equal('STARTING')
args = [arg.value for arg in job_step.args]
args.should.equal([
'-mapper',
's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
'-reducer',
'aggregate',
'-input',
's3n://elasticmapreduce/samples/wordcount/input',
'-output',
's3n://output_bucket/output/wordcount_output',
])
job_step2 = job_flow.steps[1]
job_step2.name.should.equal('My wordcount example2')
job_step2.state.should.equal('PENDING')
args = [arg.value for arg in job_step2.args]
args.should.equal([
'-mapper',
's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
'-reducer',
'aggregate',
'-input',
's3n://elasticmapreduce/samples/wordcount/input2',
'-output',
's3n://output_bucket/output/wordcount_output2',
])
@mock_emr @mock_emr
def test_create_instance_groups(): def test_run_jobflow_in_multiple_regions():
conn = boto.connect_emr() regions = {}
for region in ['us-east-1', 'eu-west-1']:
conn = boto.emr.connect_to_region(region)
args = run_jobflow_args.copy()
args['name'] = region
cluster_id = conn.run_jobflow(**args)
regions[region] = {'conn': conn, 'cluster_id': cluster_id}
step1 = StreamingStep( for region in regions.keys():
name='My wordcount example', conn = regions[region]['conn']
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py', jf = conn.describe_jobflow(regions[region]['cluster_id'])
reducer='aggregate', jf.name.should.equal(region)
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[step1],
)
instance_group = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
instance_group = conn.add_instance_groups(job_id, [instance_group])
instance_group_id = instance_group.instancegroupids
job_flow = conn.describe_jobflows()[0]
int(job_flow.instancecount).should.equal(6)
instance_group = job_flow.instancegroups[0]
instance_group.instancegroupid.should.equal(instance_group_id)
int(instance_group.instancerunningcount).should.equal(6)
instance_group.instancerole.should.equal('TASK')
instance_group.instancetype.should.equal('c1.medium')
instance_group.market.should.equal('SPOT')
instance_group.name.should.equal('spot-0.07')
instance_group.bidprice.should.equal('0.07')
@mock_emr
def test_modify_instance_groups():
conn = boto.connect_emr()
step1 = StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[step1]
)
instance_group1 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
instance_group2 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
instance_group = conn.add_instance_groups(job_id, [instance_group1, instance_group2])
instance_group_ids = instance_group.instancegroupids.split(",")
job_flow = conn.describe_jobflows()[0]
int(job_flow.instancecount).should.equal(12)
instance_group = job_flow.instancegroups[0]
int(instance_group.instancerunningcount).should.equal(6)
conn.modify_instance_groups(instance_group_ids, [2, 3])
job_flow = conn.describe_jobflows()[0]
int(job_flow.instancecount).should.equal(5)
instance_group1 = [
group for group
in job_flow.instancegroups
if group.instancegroupid == instance_group_ids[0]
][0]
int(instance_group1.instancerunningcount).should.equal(2)
instance_group2 = [
group for group
in job_flow.instancegroups
if group.instancegroupid == instance_group_ids[1]
][0]
int(instance_group2.instancerunningcount).should.equal(3)
@requires_boto_gte("2.8") @requires_boto_gte("2.8")
@mock_emr @mock_emr
def test_set_visible_to_all_users(): def test_run_jobflow_with_new_params():
# Test that run_jobflow works with newer params
conn = boto.connect_emr() conn = boto.connect_emr()
conn.run_jobflow(**run_jobflow_args)
job_id = conn.run_jobflow(
name='My jobflow', @requires_boto_gte("2.8")
log_uri='s3://some_bucket/jobflow_logs', @mock_emr
steps=[], def test_run_jobflow_with_visible_to_all_users():
visible_to_all_users=False, conn = boto.connect_emr()
) for expected in (True, False):
job_id = conn.run_jobflow(
visible_to_all_users=expected,
**run_jobflow_args
)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal(str(expected).lower())
@requires_boto_gte("2.8")
@mock_emr
def test_run_jobflow_with_instance_groups():
input_groups = dict((g.name, g) for g in input_instance_groups)
conn = boto.connect_emr()
job_id = conn.run_jobflow(instance_groups=input_instance_groups,
**run_jobflow_args)
job_flow = conn.describe_jobflow(job_id) job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('False') int(job_flow.instancecount).should.equal(sum(g.num_instances for g in input_instance_groups))
for instance_group in job_flow.instancegroups:
conn.set_visible_to_all_users(job_id, True) expected = input_groups[instance_group.name]
instance_group.should.have.property('instancegroupid')
job_flow = conn.describe_jobflow(job_id) int(instance_group.instancerunningcount).should.equal(expected.num_instances)
job_flow.visibletoallusers.should.equal('True') instance_group.instancerole.should.equal(expected.role)
instance_group.instancetype.should.equal(expected.type)
conn.set_visible_to_all_users(job_id, False) instance_group.market.should.equal(expected.market)
if hasattr(expected, 'bidprice'):
job_flow = conn.describe_jobflow(job_id) instance_group.bidprice.should.equal(expected.bidprice)
job_flow.visibletoallusers.should.equal('False')
@requires_boto_gte("2.8") @requires_boto_gte("2.8")
@mock_emr @mock_emr
def test_set_termination_protection(): def test_set_termination_protection():
conn = boto.connect_emr() conn = boto.connect_emr()
job_id = conn.run_jobflow(**run_jobflow_args)
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[]
)
job_flow = conn.describe_jobflow(job_id) job_flow = conn.describe_jobflow(job_id)
job_flow.terminationprotected.should.equal(u'None') job_flow.terminationprotected.should.equal('false')
conn.set_termination_protection(job_id, True) conn.set_termination_protection(job_id, True)
job_flow = conn.describe_jobflow(job_id) job_flow = conn.describe_jobflow(job_id)
job_flow.terminationprotected.should.equal('true') job_flow.terminationprotected.should.equal('true')
conn.set_termination_protection(job_id, False) conn.set_termination_protection(job_id, False)
job_flow = conn.describe_jobflow(job_id) job_flow = conn.describe_jobflow(job_id)
job_flow.terminationprotected.should.equal('false') job_flow.terminationprotected.should.equal('false')
@requires_boto_gte("2.8")
@mock_emr @mock_emr
def test_list_clusters(): def test_set_visible_to_all_users():
conn = boto.connect_emr() conn = boto.connect_emr()
conn.run_jobflow( args = run_jobflow_args.copy()
name='My jobflow', args['visible_to_all_users'] = False
log_uri='s3://some_bucket/jobflow_logs', job_id = conn.run_jobflow(**args)
steps=[], job_flow = conn.describe_jobflow(job_id)
) job_flow.visibletoallusers.should.equal('false')
summary = conn.list_clusters() conn.set_visible_to_all_users(job_id, True)
clusters = summary.clusters job_flow = conn.describe_jobflow(job_id)
clusters.should.have.length_of(1) job_flow.visibletoallusers.should.equal('true')
cluster = clusters[0]
cluster.name.should.equal("My jobflow") conn.set_visible_to_all_users(job_id, False)
cluster.normalizedinstancehours.should.equal('0') job_flow = conn.describe_jobflow(job_id)
cluster.status.state.should.equal("RUNNING") job_flow.visibletoallusers.should.equal('false')
@mock_emr @mock_emr
def test_describe_cluster(): def test_terminate_jobflow():
conn = boto.connect_emr() conn = boto.connect_emr()
job_id = conn.run_jobflow( job_id = conn.run_jobflow(**run_jobflow_args)
name='My jobflow', flow = conn.describe_jobflows()[0]
log_uri='s3://some_bucket/jobflow_logs', flow.state.should.equal('WAITING')
steps=[],
conn.terminate_jobflow(job_id)
flow = conn.describe_jobflows()[0]
flow.state.should.equal('TERMINATED')
# testing multiple end points for each feature
@mock_emr
def test_bootstrap_actions():
bootstrap_actions = [
BootstrapAction(
name='bs1',
path='path/to/script',
bootstrap_action_args=['arg1', 'arg2']),
BootstrapAction(
name='bs2',
path='path/to/anotherscript',
bootstrap_action_args=[])
]
conn = boto.connect_emr()
cluster_id = conn.run_jobflow(
bootstrap_actions=bootstrap_actions,
**run_jobflow_args
)
jf = conn.describe_jobflow(cluster_id)
for x, y in zip(jf.bootstrapactions, bootstrap_actions):
x.name.should.equal(y.name)
x.path.should.equal(y.path)
list(o.value for o in x.args).should.equal(y.args())
resp = conn.list_bootstrap_actions(cluster_id)
for i, y in enumerate(bootstrap_actions):
x = resp.actions[i]
x.name.should.equal(y.name)
x.scriptpath.should.equal(y.path)
list(arg.value for arg in x.args).should.equal(y.args())
@mock_emr
def test_instance_groups():
input_groups = dict((g.name, g) for g in input_instance_groups)
conn = boto.connect_emr()
args = run_jobflow_args.copy()
for key in ['master_instance_type', 'slave_instance_type', 'num_instances']:
del args[key]
args['instance_groups'] = input_instance_groups[:2]
job_id = conn.run_jobflow(**args)
jf = conn.describe_jobflow(job_id)
base_instance_count = int(jf.instancecount)
conn.add_instance_groups(job_id, input_instance_groups[2:])
jf = conn.describe_jobflow(job_id)
int(jf.instancecount).should.equal(sum(g.num_instances for g in input_instance_groups))
for x in jf.instancegroups:
y = input_groups[x.name]
if hasattr(y, 'bidprice'):
x.bidprice.should.equal(y.bidprice)
x.creationdatetime.should.be.a(six.string_types)
# x.enddatetime.should.be.a(six.string_types)
x.should.have.property('instancegroupid')
int(x.instancerequestcount).should.equal(y.num_instances)
x.instancerole.should.equal(y.role)
int(x.instancerunningcount).should.equal(y.num_instances)
x.instancetype.should.equal(y.type)
x.laststatechangereason.should.be.a(six.string_types)
x.market.should.equal(y.market)
x.name.should.be.a(six.string_types)
x.readydatetime.should.be.a(six.string_types)
x.startdatetime.should.be.a(six.string_types)
x.state.should.equal('RUNNING')
for x in conn.list_instance_groups(job_id).instancegroups:
y = input_groups[x.name]
if hasattr(y, 'bidprice'):
x.bidprice.should.equal(y.bidprice)
# Configurations
# EbsBlockDevices
# EbsOptimized
x.should.have.property('id')
x.instancegrouptype.should.equal(y.role)
x.instancetype.should.equal(y.type)
x.market.should.equal(y.market)
x.name.should.equal(y.name)
int(x.requestedinstancecount).should.equal(y.num_instances)
int(x.runninginstancecount).should.equal(y.num_instances)
# ShrinkPolicy
x.status.state.should.equal('RUNNING')
x.status.statechangereason.code.should.be.a(six.string_types)
x.status.statechangereason.message.should.be.a(six.string_types)
x.status.timeline.creationdatetime.should.be.a(six.string_types)
# x.status.timeline.enddatetime.should.be.a(six.string_types)
x.status.timeline.readydatetime.should.be.a(six.string_types)
igs = dict((g.name, g) for g in jf.instancegroups)
conn.modify_instance_groups(
[igs['task-1'].instancegroupid, igs['task-2'].instancegroupid],
[2, 3])
jf = conn.describe_jobflow(job_id)
int(jf.instancecount).should.equal(base_instance_count + 5)
igs = dict((g.name, g) for g in jf.instancegroups)
int(igs['task-1'].instancerunningcount).should.equal(2)
int(igs['task-2'].instancerunningcount).should.equal(3)
@mock_emr
def test_steps():
input_steps = [
StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'),
StreamingStep(
name='My wordcount example2',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input2',
output='s3n://output_bucket/output/wordcount_output2')
]
# TODO: implementation and test for cancel_steps
conn = boto.connect_emr()
cluster_id = conn.run_jobflow(
steps=[input_steps[0]],
**run_jobflow_args)
jf = conn.describe_jobflow(cluster_id)
jf.steps.should.have.length_of(1)
conn.add_jobflow_steps(cluster_id, [input_steps[1]])
jf = conn.describe_jobflow(cluster_id)
jf.steps.should.have.length_of(2)
for step in jf.steps:
step.actiononfailure.should.equal('TERMINATE_JOB_FLOW')
list(arg.value for arg in step.args).should.have.length_of(8)
step.creationdatetime.should.be.a(six.string_types)
# step.enddatetime.should.be.a(six.string_types)
step.jar.should.equal('/home/hadoop/contrib/streaming/hadoop-streaming.jar')
step.laststatechangereason.should.be.a(six.string_types)
step.mainclass.should.equal('')
step.name.should.be.a(six.string_types)
# step.readydatetime.should.be.a(six.string_types)
# step.startdatetime.should.be.a(six.string_types)
step.state.should.be.within(['STARTING', 'PENDING'])
expected = dict((s.name, s) for s in input_steps)
for x in conn.list_steps(cluster_id).steps:
y = expected[x.name]
# actiononfailure
list(arg.value for arg in x.config.args).should.equal([
'-mapper', y.mapper,
'-reducer', y.reducer,
'-input', y.input,
'-output', y.output,
])
x.config.jar.should.equal('/home/hadoop/contrib/streaming/hadoop-streaming.jar')
x.config.mainclass.should.equal('')
# properties
x.should.have.property('id').should.be.a(six.string_types)
x.name.should.equal(y.name)
x.status.state.should.be.within(['STARTING', 'PENDING'])
# x.status.statechangereason
x.status.timeline.creationdatetime.should.be.a(six.string_types)
# x.status.timeline.enddatetime.should.be.a(six.string_types)
# x.status.timeline.startdatetime.should.be.a(six.string_types)
x = conn.describe_step(cluster_id, x.id)
list(arg.value for arg in x.config.args).should.equal([
'-mapper', y.mapper,
'-reducer', y.reducer,
'-input', y.input,
'-output', y.output,
])
x.config.jar.should.equal('/home/hadoop/contrib/streaming/hadoop-streaming.jar')
x.config.mainclass.should.equal('')
# properties
x.should.have.property('id').should.be.a(six.string_types)
x.name.should.equal(y.name)
x.status.state.should.be.within(['STARTING', 'PENDING'])
# x.status.statechangereason
x.status.timeline.creationdatetime.should.be.a(six.string_types)
# x.status.timeline.enddatetime.should.be.a(six.string_types)
# x.status.timeline.startdatetime.should.be.a(six.string_types)
@mock_emr
def test_tags():
input_tags = {"tag1": "val1", "tag2": "val2"}
conn = boto.connect_emr()
cluster_id = conn.run_jobflow(**run_jobflow_args)
conn.add_tags(cluster_id, input_tags)
cluster = conn.describe_cluster(cluster_id)
cluster.tags.should.have.length_of(2)
dict((t.key, t.value) for t in cluster.tags).should.equal(input_tags)
conn.remove_tags(cluster_id, list(input_tags.keys()))
cluster = conn.describe_cluster(cluster_id)
cluster.tags.should.have.length_of(0)

View File

@ -1,46 +1,586 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from copy import deepcopy
import boto3
import six
import sure  # noqa
from botocore.exceptions import ClientError
from nose.tools import assert_raises
from moto import mock_emr
run_job_flow_args = dict(
Instances={
'InstanceCount': 3,
'KeepJobFlowAliveWhenNoSteps': True,
'MasterInstanceType': 'c3.medium',
'Placement': {'AvailabilityZone': 'us-east-1a'},
'SlaveInstanceType': 'c3.xlarge',
},
JobFlowRole='EMR_EC2_DefaultRole',
LogUri='s3://mybucket/log',
Name='cluster',
ServiceRole='EMR_DefaultRole',
VisibleToAllUsers=True)
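# The tests below deepcopy this shared baseline before mutating it, so one
# test's tweaks never leak into another. A minimal sketch of the pattern
# (the 'client' name is assumed from the test bodies):
#
#     args = deepcopy(run_job_flow_args)
#     args['Name'] = 'my-test-cluster'
#     cluster_id = client.run_job_flow(**args)['JobFlowId']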
input_instance_groups = [
{'InstanceCount': 1,
'InstanceRole': 'MASTER',
'InstanceType': 'c1.medium',
'Market': 'ON_DEMAND',
'Name': 'master'},
{'InstanceCount': 3,
'InstanceRole': 'CORE',
'InstanceType': 'c1.medium',
'Market': 'ON_DEMAND',
'Name': 'core'},
{'InstanceCount': 6,
'InstanceRole': 'TASK',
'InstanceType': 'c1.large',
'Market': 'SPOT',
'Name': 'task-1',
'BidPrice': '0.07'},
{'InstanceCount': 10,
'InstanceRole': 'TASK',
'InstanceType': 'c1.xlarge',
'Market': 'SPOT',
'Name': 'task-2',
'BidPrice': '0.05'},
]
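# Only the SPOT task groups above carry the optional BidPrice key. A sketch
# of how a test hands this list to run_job_flow in place of the Master/Slave
# shorthand (mirroring test_run_job_flow_with_instance_groups below):
#
#     args = deepcopy(run_job_flow_args)
#     args['Instances'] = {'InstanceGroups': input_instance_groups}
#     cluster_id = client.run_job_flow(**args)['JobFlowId']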
@mock_emr
def test_describe_cluster():
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['Applications'] = [{'Name': 'Spark', 'Version': '2.4.2'}]
args['Configurations'] = [
{'Classification': 'yarn-site',
'Properties': {'someproperty': 'somevalue'}}]
args['Instances']['AdditionalMasterSecurityGroups'] = ['additional-master']
args['Instances']['AdditionalSlaveSecurityGroups'] = ['additional-slave']
args['Instances']['Ec2KeyName'] = 'mykey'
args['Instances']['Ec2SubnetId'] = 'subnet-8be41cec'
args['Instances']['EmrManagedMasterSecurityGroup'] = 'master-security-group'
args['Instances']['EmrManagedSlaveSecurityGroup'] = 'slave-security-group'
args['Instances']['KeepJobFlowAliveWhenNoSteps'] = False
args['Instances']['ServiceAccessSecurityGroup'] = 'service-access-security-group'
args['Tags'] = [{'Key': 'tag1', 'Value': 'val1'},
{'Key': 'tag2', 'Value': 'val2'}]
cluster_id = client.run_job_flow(**args)['JobFlowId']
cl = client.describe_cluster(ClusterId=cluster_id)['Cluster']
cl['Applications'][0]['Name'].should.equal('Spark')
cl['Applications'][0]['Version'].should.equal('2.4.2')
cl['AutoTerminate'].should.equal(True)
config = cl['Configurations'][0]
config['Classification'].should.equal('yarn-site')
config['Properties'].should.equal(args['Configurations'][0]['Properties'])
attrs = cl['Ec2InstanceAttributes']
attrs['AdditionalMasterSecurityGroups'].should.equal(args['Instances']['AdditionalMasterSecurityGroups'])
attrs['AdditionalSlaveSecurityGroups'].should.equal(args['Instances']['AdditionalSlaveSecurityGroups'])
attrs['Ec2AvailabilityZone'].should.equal('us-east-1a')
attrs['Ec2KeyName'].should.equal(args['Instances']['Ec2KeyName'])
attrs['Ec2SubnetId'].should.equal(args['Instances']['Ec2SubnetId'])
attrs['EmrManagedMasterSecurityGroup'].should.equal(args['Instances']['EmrManagedMasterSecurityGroup'])
attrs['EmrManagedSlaveSecurityGroup'].should.equal(args['Instances']['EmrManagedSlaveSecurityGroup'])
attrs['IamInstanceProfile'].should.equal(args['JobFlowRole'])
attrs['ServiceAccessSecurityGroup'].should.equal(args['Instances']['ServiceAccessSecurityGroup'])
cl['Id'].should.equal(cluster_id)
cl['LogUri'].should.equal(args['LogUri'])
cl['MasterPublicDnsName'].should.be.a(six.string_types)
cl['Name'].should.equal(args['Name'])
cl['NormalizedInstanceHours'].should.equal(0)
# cl['ReleaseLabel'].should.equal('emr-5.0.0')
cl.shouldnt.have.key('RequestedAmiVersion')
cl['RunningAmiVersion'].should.equal('1.0.0')
# cl['SecurityConfiguration'].should.be.a(six.string_types)
cl['ServiceRole'].should.equal(args['ServiceRole'])
status = cl['Status']
status['State'].should.equal('TERMINATED')
# cluster['Status']['StateChangeReason']
status['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
# status['Timeline']['EndDateTime'].should.equal(datetime(2014, 1, 24, 2, 19, 46, tzinfo=pytz.utc))
status['Timeline']['ReadyDateTime'].should.be.a('datetime.datetime')
dict((t['Key'], t['Value']) for t in cl['Tags']).should.equal(
dict((t['Key'], t['Value']) for t in args['Tags']))
cl['TerminationProtected'].should.equal(False)
cl['VisibleToAllUsers'].should.equal(True)
@mock_emr
def test_describe_job_flows():
client = boto3.client('emr', region_name='us-east-1')
cluster1_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']
cluster2_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']
resp = client.describe_job_flows()
resp['JobFlows'].should.have.length_of(2)
resp = client.describe_job_flows(JobFlowIds=[cluster2_id])
resp['JobFlows'].should.have.length_of(1)
resp['JobFlows'][0]['JobFlowId'].should.equal(cluster2_id)
resp = client.describe_job_flows(JobFlowIds=[cluster1_id])
resp['JobFlows'].should.have.length_of(1)
resp['JobFlows'][0]['JobFlowId'].should.equal(cluster1_id)
@mock_emr
def test_describe_job_flow():
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['AmiVersion'] = '3.8.1'
args['Instances'].update(
{'Ec2KeyName': 'ec2keyname',
'Ec2SubnetId': 'subnet-8be41cec',
'HadoopVersion': '2.4.0'})
args['VisibleToAllUsers'] = True
cluster_id = client.run_job_flow(**args)['JobFlowId']
jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
jf['AmiVersion'].should.equal(args['AmiVersion'])
jf.shouldnt.have.key('BootstrapActions')
esd = jf['ExecutionStatusDetail']
esd['CreationDateTime'].should.be.a('datetime.datetime')
# esd['EndDateTime'].should.be.a('datetime.datetime')
# esd['LastStateChangeReason'].should.be.a(six.string_types)
esd['ReadyDateTime'].should.be.a('datetime.datetime')
esd['StartDateTime'].should.be.a('datetime.datetime')
esd['State'].should.equal('WAITING')
attrs = jf['Instances']
attrs['Ec2KeyName'].should.equal(args['Instances']['Ec2KeyName'])
attrs['Ec2SubnetId'].should.equal(args['Instances']['Ec2SubnetId'])
attrs['HadoopVersion'].should.equal(args['Instances']['HadoopVersion'])
attrs['InstanceCount'].should.equal(args['Instances']['InstanceCount'])
for ig in attrs['InstanceGroups']:
# ig['BidPrice']
ig['CreationDateTime'].should.be.a('datetime.datetime')
# ig['EndDateTime'].should.be.a('datetime.datetime')
ig['InstanceGroupId'].should.be.a(six.string_types)
ig['InstanceRequestCount'].should.be.a(int)
ig['InstanceRole'].should.be.within(['MASTER', 'CORE'])
ig['InstanceRunningCount'].should.be.a(int)
ig['InstanceType'].should.be.within(['c3.medium', 'c3.xlarge'])
# ig['LastStateChangeReason'].should.be.a(six.string_types)
ig['Market'].should.equal('ON_DEMAND')
ig['Name'].should.be.a(six.string_types)
ig['ReadyDateTime'].should.be.a('datetime.datetime')
ig['StartDateTime'].should.be.a('datetime.datetime')
ig['State'].should.equal('RUNNING')
attrs['KeepJobFlowAliveWhenNoSteps'].should.equal(True)
# attrs['MasterInstanceId'].should.be.a(six.string_types)
attrs['MasterInstanceType'].should.equal(args['Instances']['MasterInstanceType'])
attrs['MasterPublicDnsName'].should.be.a(six.string_types)
attrs['NormalizedInstanceHours'].should.equal(0)
attrs['Placement']['AvailabilityZone'].should.equal(args['Instances']['Placement']['AvailabilityZone'])
attrs['SlaveInstanceType'].should.equal(args['Instances']['SlaveInstanceType'])
attrs['TerminationProtected'].should.equal(False)
jf['JobFlowId'].should.equal(cluster_id)
jf['JobFlowRole'].should.equal(args['JobFlowRole'])
jf['LogUri'].should.equal(args['LogUri'])
jf['Name'].should.equal(args['Name'])
jf['ServiceRole'].should.equal(args['ServiceRole'])
jf.shouldnt.have.key('Steps')
jf.shouldnt.have.key('SupportedProducts')
jf['VisibleToAllUsers'].should.equal(True)
@mock_emr
def test_list_clusters():
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['Name'] = 'jobflow1'
cluster1_id = client.run_job_flow(**args)['JobFlowId']
args['Name'] = 'jobflow2'
cluster2_id = client.run_job_flow(**args)['JobFlowId']
client.terminate_job_flows(JobFlowIds=[cluster2_id])
summary = client.list_clusters()
clusters = summary['Clusters']
clusters.should.have.length_of(2)
expected = {
cluster1_id: {
'Id': cluster1_id,
'Name': 'jobflow1',
'NormalizedInstanceHours': 0,
'State': 'WAITING'},
cluster2_id: {
'Id': cluster2_id,
'Name': 'jobflow2',
'NormalizedInstanceHours': 0,
'State': 'TERMINATED'},
}
for x in clusters:
y = expected[x['Id']]
x['Id'].should.equal(y['Id'])
x['Name'].should.equal(y['Name'])
x['NormalizedInstanceHours'].should.equal(y['NormalizedInstanceHours'])
x['Status']['State'].should.equal(y['State'])
x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
if y['State'] == 'TERMINATED':
x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
else:
x['Status']['Timeline'].shouldnt.have.key('EndDateTime')
x['Status']['Timeline']['ReadyDateTime'].should.be.a('datetime.datetime')
@mock_emr
def test_run_job_flow():
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
cluster_id = client.run_job_flow(**args)['JobFlowId']
resp = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
resp['ExecutionStatusDetail']['State'].should.equal('WAITING')
resp['JobFlowId'].should.equal(cluster_id)
resp['Name'].should.equal(args['Name'])
resp['Instances']['MasterInstanceType'].should.equal(args['Instances']['MasterInstanceType'])
resp['Instances']['SlaveInstanceType'].should.equal(args['Instances']['SlaveInstanceType'])
resp['LogUri'].should.equal(args['LogUri'])
resp['VisibleToAllUsers'].should.equal(args['VisibleToAllUsers'])
resp['Instances']['NormalizedInstanceHours'].should.equal(0)
resp.shouldnt.have.key('Steps')
@mock_emr
def test_run_job_flow_with_invalid_params():
client = boto3.client('emr', region_name='us-east-1')
with assert_raises(ClientError) as e:
# cannot set both AmiVersion and ReleaseLabel
args = deepcopy(run_job_flow_args)
args['AmiVersion'] = '2.4'
args['ReleaseLabel'] = 'emr-5.0.0'
client.run_job_flow(**args)
e.exception.response['Error']['Code'].should.equal('ValidationException')
@mock_emr
def test_run_job_flow_in_multiple_regions():
regions = {}
for region in ['us-east-1', 'eu-west-1']:
client = boto3.client('emr', region_name=region)
args = deepcopy(run_job_flow_args)
args['Name'] = region
cluster_id = client.run_job_flow(**args)['JobFlowId']
regions[region] = {'client': client, 'cluster_id': cluster_id}
for region in regions.keys():
client = regions[region]['client']
resp = client.describe_cluster(ClusterId=regions[region]['cluster_id'])
resp['Cluster']['Name'].should.equal(region)
@mock_emr
def test_run_job_flow_with_new_params():
client = boto3.client('emr', region_name='us-east-1')
resp = client.run_job_flow(**run_job_flow_args)
resp.should.have.key('JobFlowId')
@mock_emr
def test_run_job_flow_with_visible_to_all_users():
client = boto3.client('emr', region_name='us-east-1')
for expected in (True, False):
args = deepcopy(run_job_flow_args)
args['VisibleToAllUsers'] = expected
resp = client.run_job_flow(**args)
cluster_id = resp['JobFlowId']
resp = client.describe_cluster(ClusterId=cluster_id)
resp['Cluster']['VisibleToAllUsers'].should.equal(expected)
@mock_emr
def test_run_job_flow_with_instance_groups():
input_groups = dict((g['Name'], g) for g in input_instance_groups)
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['Instances'] = {'InstanceGroups': input_instance_groups}
cluster_id = client.run_job_flow(**args)['JobFlowId']
groups = client.list_instance_groups(ClusterId=cluster_id)['InstanceGroups']
for x in groups:
y = input_groups[x['Name']]
x.should.have.key('Id')
x['RequestedInstanceCount'].should.equal(y['InstanceCount'])
x['InstanceGroupType'].should.equal(y['InstanceRole'])
x['InstanceType'].should.equal(y['InstanceType'])
x['Market'].should.equal(y['Market'])
if 'BidPrice' in y:
x['BidPrice'].should.equal(y['BidPrice'])
@mock_emr
def test_set_termination_protection():
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['Instances']['TerminationProtected'] = False
resp = client.run_job_flow(**args)
cluster_id = resp['JobFlowId']
resp = client.describe_cluster(ClusterId=cluster_id)
resp['Cluster']['TerminationProtected'].should.equal(False)
for expected in (True, False):
resp = client.set_termination_protection(JobFlowIds=[cluster_id],
TerminationProtected=expected)
resp = client.describe_cluster(ClusterId=cluster_id)
resp['Cluster']['TerminationProtected'].should.equal(expected)
@mock_emr
def test_set_visible_to_all_users():
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['VisibleToAllUsers'] = False
resp = client.run_job_flow(**args)
cluster_id = resp['JobFlowId']
resp = client.describe_cluster(ClusterId=cluster_id)
resp['Cluster']['VisibleToAllUsers'].should.equal(False)
for expected in (True, False):
resp = client.set_visible_to_all_users(JobFlowIds=[cluster_id],
VisibleToAllUsers=expected)
resp = client.describe_cluster(ClusterId=cluster_id)
resp['Cluster']['VisibleToAllUsers'].should.equal(expected)
@mock_emr
def test_terminate_job_flows():
client = boto3.client('emr', region_name='us-east-1')
resp = client.run_job_flow(**run_job_flow_args)
cluster_id = resp['JobFlowId']
resp = client.describe_cluster(ClusterId=cluster_id)
resp['Cluster']['Status']['State'].should.equal('WAITING')
resp = client.terminate_job_flows(JobFlowIds=[cluster_id])
resp = client.describe_cluster(ClusterId=cluster_id)
resp['Cluster']['Status']['State'].should.equal('TERMINATED')
# testing multiple end points for each feature
@mock_emr
def test_bootstrap_actions():
bootstrap_actions = [
{'Name': 'bs1',
'ScriptBootstrapAction': {
'Args': ['arg1', 'arg2'],
'Path': 'path/to/script'}},
{'Name': 'bs2',
'ScriptBootstrapAction': {
'Path': 'path/to/anotherscript'}}
]
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['BootstrapActions'] = bootstrap_actions
cluster_id = client.run_job_flow(**args)['JobFlowId']
cl = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
for x, y in zip(cl['BootstrapActions'], bootstrap_actions):
x['BootstrapActionConfig'].should.equal(y)
resp = client.list_bootstrap_actions(ClusterId=cluster_id)
for x, y in zip(resp['BootstrapActions'], bootstrap_actions):
x['Name'].should.equal(y['Name'])
if 'Args' in y['ScriptBootstrapAction']:
x['Args'].should.equal(y['ScriptBootstrapAction']['Args'])
x['ScriptPath'].should.equal(y['ScriptBootstrapAction']['Path'])
@mock_emr
def test_instance_groups():
input_groups = dict((g['Name'], g) for g in input_instance_groups)
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
for key in ['MasterInstanceType', 'SlaveInstanceType', 'InstanceCount']:
del args['Instances'][key]
args['Instances']['InstanceGroups'] = input_instance_groups[:2]
cluster_id = client.run_job_flow(**args)['JobFlowId']
jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
base_instance_count = jf['Instances']['InstanceCount']
client.add_instance_groups(JobFlowId=cluster_id, InstanceGroups=input_instance_groups[2:])
jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
jf['Instances']['InstanceCount'].should.equal(sum(g['InstanceCount'] for g in input_instance_groups))
for x in jf['Instances']['InstanceGroups']:
y = input_groups[x['Name']]
if 'BidPrice' in y:
x['BidPrice'].should.equal(y['BidPrice'])
x['CreationDateTime'].should.be.a('datetime.datetime')
# x['EndDateTime'].should.be.a('datetime.datetime')
x.should.have.key('InstanceGroupId')
x['InstanceRequestCount'].should.equal(y['InstanceCount'])
x['InstanceRole'].should.equal(y['InstanceRole'])
x['InstanceRunningCount'].should.equal(y['InstanceCount'])
x['InstanceType'].should.equal(y['InstanceType'])
# x['LastStateChangeReason'].should.equal(y['LastStateChangeReason'])
x['Market'].should.equal(y['Market'])
x['Name'].should.equal(y['Name'])
x['ReadyDateTime'].should.be.a('datetime.datetime')
x['StartDateTime'].should.be.a('datetime.datetime')
x['State'].should.equal('RUNNING')
groups = client.list_instance_groups(ClusterId=cluster_id)['InstanceGroups']
for x in groups:
y = input_groups[x['Name']]
if 'BidPrice' in y:
x['BidPrice'].should.equal(y['BidPrice'])
# Configurations
# EbsBlockDevices
# EbsOptimized
x.should.have.key('Id')
x['InstanceGroupType'].should.equal(y['InstanceRole'])
x['InstanceType'].should.equal(y['InstanceType'])
x['Market'].should.equal(y['Market'])
x['Name'].should.equal(y['Name'])
x['RequestedInstanceCount'].should.equal(y['InstanceCount'])
x['RunningInstanceCount'].should.equal(y['InstanceCount'])
# ShrinkPolicy
x['Status']['State'].should.equal('RUNNING')
x['Status']['StateChangeReason']['Code'].should.be.a(six.string_types)
# x['Status']['StateChangeReason']['Message'].should.be.a(six.string_types)
x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
# x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
x['Status']['Timeline']['ReadyDateTime'].should.be.a('datetime.datetime')
igs = dict((g['Name'], g) for g in groups)
client.modify_instance_groups(
InstanceGroups=[
{'InstanceGroupId': igs['task-1']['Id'],
'InstanceCount': 2},
{'InstanceGroupId': igs['task-2']['Id'],
'InstanceCount': 3}])
jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
jf['Instances']['InstanceCount'].should.equal(base_instance_count + 5)
igs = dict((g['Name'], g) for g in jf['Instances']['InstanceGroups'])
igs['task-1']['InstanceRunningCount'].should.equal(2)
igs['task-2']['InstanceRunningCount'].should.equal(3)
@mock_emr
def test_steps():
input_steps = [{
'HadoopJarStep': {
'Args': [
'hadoop-streaming',
'-files', 's3://elasticmapreduce/samples/wordcount/wordSplitter.py#wordSplitter.py',
'-mapper', 'python wordSplitter.py',
'-input', 's3://elasticmapreduce/samples/wordcount/input',
'-output', 's3://output_bucket/output/wordcount_output',
'-reducer', 'aggregate'
],
'Jar': 'command-runner.jar',
},
'Name': 'My wordcount example',
}, {
'HadoopJarStep': {
'Args': [
'hadoop-streaming',
'-files', 's3://elasticmapreduce/samples/wordcount/wordSplitter2.py#wordSplitter2.py',
'-mapper', 'python wordSplitter2.py',
'-input', 's3://elasticmapreduce/samples/wordcount/input2',
'-output', 's3://output_bucket/output/wordcount_output2',
'-reducer', 'aggregate'
],
'Jar': 'command-runner.jar',
},
'Name': 'My wordcount example2',
}]
# TODO: implementation and test for cancel_steps
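# A sketch of what a cancel_steps test might assert once moto mocks it;
# the call shape follows boto3's EMR client, but the 'SUBMITTED' status
# and response layout here are assumptions, not moto behavior:
#
#     resp = client.cancel_steps(ClusterId=cluster_id, StepIds=[step_id])
#     resp['CancelStepsInfoList'][0]['Status'].should.equal('SUBMITTED')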
client = boto3.client('emr', region_name='us-east-1')
args = deepcopy(run_job_flow_args)
args['Steps'] = [input_steps[0]]
cluster_id = client.run_job_flow(**args)['JobFlowId']
jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
jf['Steps'].should.have.length_of(1)
client.add_job_flow_steps(JobFlowId=cluster_id, Steps=[input_steps[1]])
jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
jf['Steps'].should.have.length_of(2)
for idx, (x, y) in enumerate(zip(jf['Steps'], input_steps)):
x['ExecutionStatusDetail'].should.have.key('CreationDateTime')
# x['ExecutionStatusDetail'].should.have.key('EndDateTime')
# x['ExecutionStatusDetail'].should.have.key('LastStateChangeReason')
# x['ExecutionStatusDetail'].should.have.key('StartDateTime')
x['ExecutionStatusDetail']['State'].should.equal('STARTING' if idx == 0 else 'PENDING')
x['StepConfig']['ActionOnFailure'].should.equal('TERMINATE_CLUSTER')
x['StepConfig']['HadoopJarStep']['Args'].should.equal(y['HadoopJarStep']['Args'])
x['StepConfig']['HadoopJarStep']['Jar'].should.equal(y['HadoopJarStep']['Jar'])
if 'MainClass' in y['HadoopJarStep']:
x['StepConfig']['HadoopJarStep']['MainClass'].should.equal(y['HadoopJarStep']['MainClass'])
if 'Properties' in y['HadoopJarStep']:
x['StepConfig']['HadoopJarStep']['Properties'].should.equal(y['HadoopJarStep']['Properties'])
x['StepConfig']['Name'].should.equal(y['Name'])
expected = dict((s['Name'], s) for s in input_steps)
steps = client.list_steps(ClusterId=cluster_id)['Steps']
steps.should.have.length_of(2)
for x in steps:
y = expected[x['Name']]
x['ActionOnFailure'].should.equal('TERMINATE_CLUSTER')
x['Config']['Args'].should.equal(y['HadoopJarStep']['Args'])
x['Config']['Jar'].should.equal(y['HadoopJarStep']['Jar'])
# x['Config']['MainClass'].should.equal(y['HadoopJarStep']['MainClass'])
# Properties
x['Id'].should.be.a(six.string_types)
x['Name'].should.equal(y['Name'])
x['Status']['State'].should.be.within(['STARTING', 'PENDING'])
# StateChangeReason
x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
# x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
# x['Status']['Timeline']['StartDateTime'].should.be.a('datetime.datetime')
x = client.describe_step(ClusterId=cluster_id, StepId=x['Id'])['Step']
x['ActionOnFailure'].should.equal('TERMINATE_CLUSTER')
x['Config']['Args'].should.equal(y['HadoopJarStep']['Args'])
x['Config']['Jar'].should.equal(y['HadoopJarStep']['Jar'])
# x['Config']['MainClass'].should.equal(y['HadoopJarStep']['MainClass'])
# Properties
x['Id'].should.be.a(six.string_types)
x['Name'].should.equal(y['Name'])
x['Status']['State'].should.be.within(['STARTING', 'PENDING'])
# StateChangeReason
x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
# x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
# x['Status']['Timeline']['StartDateTime'].should.be.a('datetime.datetime')
@mock_emr
def test_tags():
input_tags = [{'Key': 'newkey1', 'Value': 'newval1'},
{'Key': 'newkey2', 'Value': 'newval2'}]
client = boto3.client('emr', region_name='us-east-1')
cluster_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']
client.add_tags(ResourceId=cluster_id, Tags=input_tags)
resp = client.describe_cluster(ClusterId=cluster_id)['Cluster']
resp['Tags'].should.have.length_of(2)
dict((t['Key'], t['Value']) for t in resp['Tags']).should.equal(dict((t['Key'], t['Value']) for t in input_tags))
client.remove_tags(ResourceId=cluster_id, TagKeys=[t['Key'] for t in input_tags])
resp = client.describe_cluster(ClusterId=cluster_id)['Cluster']
resp.shouldnt.have.key('Tags')