from __future__ import unicode_literals import time from datetime import datetime import boto import pytz from boto.emr.bootstrap_action import BootstrapAction from boto.emr.instance_group import InstanceGroup from boto.emr.step import StreamingStep import sure # noqa from moto import mock_emr_deprecated from tests.helpers import requires_boto_gte run_jobflow_args = dict( job_flow_role="EMR_EC2_DefaultRole", keep_alive=True, log_uri="s3://some_bucket/jobflow_logs", master_instance_type="c1.medium", name="My jobflow", num_instances=2, service_role="EMR_DefaultRole", slave_instance_type="c1.medium", ) input_instance_groups = [ InstanceGroup(1, "MASTER", "c1.medium", "ON_DEMAND", "master"), InstanceGroup(3, "CORE", "c1.medium", "ON_DEMAND", "core"), InstanceGroup(6, "TASK", "c1.large", "SPOT", "task-1", "0.07"), InstanceGroup(10, "TASK", "c1.xlarge", "SPOT", "task-2", "0.05"), ] @mock_emr_deprecated def test_describe_cluster(): conn = boto.connect_emr() args = run_jobflow_args.copy() args.update( dict( api_params={ "Applications.member.1.Name": "Spark", "Applications.member.1.Version": "2.4.2", "Configurations.member.1.Classification": "yarn-site", "Configurations.member.1.Properties.entry.1.key": "someproperty", "Configurations.member.1.Properties.entry.1.value": "somevalue", "Configurations.member.1.Properties.entry.2.key": "someotherproperty", "Configurations.member.1.Properties.entry.2.value": "someothervalue", "Instances.EmrManagedMasterSecurityGroup": "master-security-group", "Instances.Ec2SubnetId": "subnet-8be41cec", }, availability_zone="us-east-2b", ec2_keyname="mykey", job_flow_role="EMR_EC2_DefaultRole", keep_alive=False, log_uri="s3://some_bucket/jobflow_logs", name="My jobflow", service_role="EMR_DefaultRole", visible_to_all_users=True, ) ) cluster_id = conn.run_jobflow(**args) input_tags = {"tag1": "val1", "tag2": "val2"} conn.add_tags(cluster_id, input_tags) cluster = conn.describe_cluster(cluster_id) cluster.applications[0].name.should.equal("Spark") cluster.applications[0].version.should.equal("2.4.2") cluster.autoterminate.should.equal("true") # configurations appear not be supplied as attributes? attrs = cluster.ec2instanceattributes # AdditionalMasterSecurityGroups # AdditionalSlaveSecurityGroups attrs.ec2availabilityzone.should.equal(args["availability_zone"]) attrs.ec2keyname.should.equal(args["ec2_keyname"]) attrs.ec2subnetid.should.equal(args["api_params"]["Instances.Ec2SubnetId"]) # EmrManagedMasterSecurityGroups # EmrManagedSlaveSecurityGroups attrs.iaminstanceprofile.should.equal(args["job_flow_role"]) # ServiceAccessSecurityGroup cluster.id.should.equal(cluster_id) cluster.loguri.should.equal(args["log_uri"]) cluster.masterpublicdnsname.should.be.a(str) cluster.name.should.equal(args["name"]) int(cluster.normalizedinstancehours).should.equal(0) # cluster.release_label cluster.shouldnt.have.property("requestedamiversion") cluster.runningamiversion.should.equal("1.0.0") # cluster.securityconfiguration cluster.servicerole.should.equal(args["service_role"]) cluster.status.state.should.equal("TERMINATED") cluster.status.statechangereason.message.should.be.a(str) cluster.status.statechangereason.code.should.be.a(str) cluster.status.timeline.creationdatetime.should.be.a(str) # cluster.status.timeline.enddatetime.should.be.a(str) # cluster.status.timeline.readydatetime.should.be.a(str) dict((item.key, item.value) for item in cluster.tags).should.equal(input_tags) cluster.terminationprotected.should.equal("false") cluster.visibletoallusers.should.equal("true") @mock_emr_deprecated def test_describe_jobflows(): conn = boto.connect_emr() args = run_jobflow_args.copy() expected = {} for idx in range(4): cluster_name = "cluster" + str(idx) args["name"] = cluster_name cluster_id = conn.run_jobflow(**args) expected[cluster_id] = { "id": cluster_id, "name": cluster_name, "state": "WAITING", } # need sleep since it appears the timestamp is always rounded to # the nearest second internally time.sleep(1) timestamp = datetime.now(pytz.utc) time.sleep(1) for idx in range(4, 6): cluster_name = "cluster" + str(idx) args["name"] = cluster_name cluster_id = conn.run_jobflow(**args) conn.terminate_jobflow(cluster_id) expected[cluster_id] = { "id": cluster_id, "name": cluster_name, "state": "TERMINATED", } jobs = conn.describe_jobflows() jobs.should.have.length_of(6) for cluster_id, y in expected.items(): resp = conn.describe_jobflows(jobflow_ids=[cluster_id]) resp.should.have.length_of(1) resp[0].jobflowid.should.equal(cluster_id) resp = conn.describe_jobflows(states=["WAITING"]) resp.should.have.length_of(4) for x in resp: x.state.should.equal("WAITING") resp = conn.describe_jobflows(created_before=timestamp) resp.should.have.length_of(4) resp = conn.describe_jobflows(created_after=timestamp) resp.should.have.length_of(2) @mock_emr_deprecated def test_describe_jobflow(): conn = boto.connect_emr() args = run_jobflow_args.copy() args.update( dict( ami_version="3.8.1", api_params={ #'Applications.member.1.Name': 'Spark', #'Applications.member.1.Version': '2.4.2', #'Configurations.member.1.Classification': 'yarn-site', #'Configurations.member.1.Properties.entry.1.key': 'someproperty', #'Configurations.member.1.Properties.entry.1.value': 'somevalue', #'Instances.EmrManagedMasterSecurityGroup': 'master-security-group', "Instances.Ec2SubnetId": "subnet-8be41cec" }, ec2_keyname="mykey", hadoop_version="2.4.0", name="My jobflow", log_uri="s3://some_bucket/jobflow_logs", keep_alive=True, master_instance_type="c1.medium", slave_instance_type="c1.medium", num_instances=2, availability_zone="us-west-2b", job_flow_role="EMR_EC2_DefaultRole", service_role="EMR_DefaultRole", visible_to_all_users=True, ) ) cluster_id = conn.run_jobflow(**args) jf = conn.describe_jobflow(cluster_id) jf.amiversion.should.equal(args["ami_version"]) jf.bootstrapactions.should.equal(None) jf.creationdatetime.should.be.a(str) jf.should.have.property("laststatechangereason") jf.readydatetime.should.be.a(str) jf.startdatetime.should.be.a(str) jf.state.should.equal("WAITING") jf.ec2keyname.should.equal(args["ec2_keyname"]) # Ec2SubnetId jf.hadoopversion.should.equal(args["hadoop_version"]) int(jf.instancecount).should.equal(2) for ig in jf.instancegroups: ig.creationdatetime.should.be.a(str) # ig.enddatetime.should.be.a(str) ig.should.have.property("instancegroupid").being.a(str) int(ig.instancerequestcount).should.equal(1) ig.instancerole.should.be.within(["MASTER", "CORE"]) int(ig.instancerunningcount).should.equal(1) ig.instancetype.should.equal("c1.medium") ig.laststatechangereason.should.be.a(str) ig.market.should.equal("ON_DEMAND") ig.name.should.be.a(str) ig.readydatetime.should.be.a(str) ig.startdatetime.should.be.a(str) ig.state.should.equal("RUNNING") jf.keepjobflowalivewhennosteps.should.equal("true") jf.masterinstanceid.should.be.a(str) jf.masterinstancetype.should.equal(args["master_instance_type"]) jf.masterpublicdnsname.should.be.a(str) int(jf.normalizedinstancehours).should.equal(0) jf.availabilityzone.should.equal(args["availability_zone"]) jf.slaveinstancetype.should.equal(args["slave_instance_type"]) jf.terminationprotected.should.equal("false") jf.jobflowid.should.equal(cluster_id) # jf.jobflowrole.should.equal(args['job_flow_role']) jf.loguri.should.equal(args["log_uri"]) jf.name.should.equal(args["name"]) # jf.servicerole.should.equal(args['service_role']) jf.steps.should.have.length_of(0) list(i.value for i in jf.supported_products).should.equal([]) jf.visibletoallusers.should.equal("true") @mock_emr_deprecated def test_list_clusters(): conn = boto.connect_emr() args = run_jobflow_args.copy() expected = {} for idx in range(40): cluster_name = "jobflow" + str(idx) args["name"] = cluster_name cluster_id = conn.run_jobflow(**args) expected[cluster_id] = { "id": cluster_id, "name": cluster_name, "normalizedinstancehours": "0", "state": "WAITING", } # need sleep since it appears the timestamp is always rounded to # the nearest second internally time.sleep(1) timestamp = datetime.now(pytz.utc) time.sleep(1) for idx in range(40, 70): cluster_name = "jobflow" + str(idx) args["name"] = cluster_name cluster_id = conn.run_jobflow(**args) conn.terminate_jobflow(cluster_id) expected[cluster_id] = { "id": cluster_id, "name": cluster_name, "normalizedinstancehours": "0", "state": "TERMINATED", } args = {} while 1: resp = conn.list_clusters(**args) clusters = resp.clusters len(clusters).should.be.lower_than_or_equal_to(50) for x in clusters: y = expected[x.id] x.id.should.equal(y["id"]) x.name.should.equal(y["name"]) x.normalizedinstancehours.should.equal(y["normalizedinstancehours"]) x.status.state.should.equal(y["state"]) x.status.timeline.creationdatetime.should.be.a(str) if y["state"] == "TERMINATED": x.status.timeline.enddatetime.should.be.a(str) else: x.status.timeline.shouldnt.have.property("enddatetime") x.status.timeline.readydatetime.should.be.a(str) if not hasattr(resp, "marker"): break args = {"marker": resp.marker} resp = conn.list_clusters(cluster_states=["TERMINATED"]) resp.clusters.should.have.length_of(30) for x in resp.clusters: x.status.state.should.equal("TERMINATED") resp = conn.list_clusters(created_before=timestamp) resp.clusters.should.have.length_of(40) resp = conn.list_clusters(created_after=timestamp) resp.clusters.should.have.length_of(30) @mock_emr_deprecated def test_run_jobflow(): conn = boto.connect_emr() args = run_jobflow_args.copy() job_id = conn.run_jobflow(**args) job_flow = conn.describe_jobflow(job_id) job_flow.state.should.equal("WAITING") job_flow.jobflowid.should.equal(job_id) job_flow.name.should.equal(args["name"]) job_flow.masterinstancetype.should.equal(args["master_instance_type"]) job_flow.slaveinstancetype.should.equal(args["slave_instance_type"]) job_flow.loguri.should.equal(args["log_uri"]) job_flow.visibletoallusers.should.equal("false") int(job_flow.normalizedinstancehours).should.equal(0) job_flow.steps.should.have.length_of(0) @mock_emr_deprecated def test_run_jobflow_in_multiple_regions(): regions = {} for region in ["us-east-1", "eu-west-1"]: conn = boto.emr.connect_to_region(region) args = run_jobflow_args.copy() args["name"] = region cluster_id = conn.run_jobflow(**args) regions[region] = {"conn": conn, "cluster_id": cluster_id} for region in regions.keys(): conn = regions[region]["conn"] jf = conn.describe_jobflow(regions[region]["cluster_id"]) jf.name.should.equal(region) @requires_boto_gte("2.8") @mock_emr_deprecated def test_run_jobflow_with_new_params(): # Test that run_jobflow works with newer params conn = boto.connect_emr() conn.run_jobflow(**run_jobflow_args) @requires_boto_gte("2.8") @mock_emr_deprecated def test_run_jobflow_with_visible_to_all_users(): conn = boto.connect_emr() for expected in (True, False): job_id = conn.run_jobflow(visible_to_all_users=expected, **run_jobflow_args) job_flow = conn.describe_jobflow(job_id) job_flow.visibletoallusers.should.equal(str(expected).lower()) @requires_boto_gte("2.8") @mock_emr_deprecated def test_run_jobflow_with_instance_groups(): input_groups = dict((g.name, g) for g in input_instance_groups) conn = boto.connect_emr() job_id = conn.run_jobflow(instance_groups=input_instance_groups, **run_jobflow_args) job_flow = conn.describe_jobflow(job_id) int(job_flow.instancecount).should.equal( sum(g.num_instances for g in input_instance_groups) ) for instance_group in job_flow.instancegroups: expected = input_groups[instance_group.name] instance_group.should.have.property("instancegroupid") int(instance_group.instancerunningcount).should.equal(expected.num_instances) instance_group.instancerole.should.equal(expected.role) instance_group.instancetype.should.equal(expected.type) instance_group.market.should.equal(expected.market) if hasattr(expected, "bidprice"): instance_group.bidprice.should.equal(expected.bidprice) @requires_boto_gte("2.8") @mock_emr_deprecated def test_set_termination_protection(): conn = boto.connect_emr() job_id = conn.run_jobflow(**run_jobflow_args) job_flow = conn.describe_jobflow(job_id) job_flow.terminationprotected.should.equal("false") conn.set_termination_protection(job_id, True) job_flow = conn.describe_jobflow(job_id) job_flow.terminationprotected.should.equal("true") conn.set_termination_protection(job_id, False) job_flow = conn.describe_jobflow(job_id) job_flow.terminationprotected.should.equal("false") @requires_boto_gte("2.8") @mock_emr_deprecated def test_set_visible_to_all_users(): conn = boto.connect_emr() args = run_jobflow_args.copy() args["visible_to_all_users"] = False job_id = conn.run_jobflow(**args) job_flow = conn.describe_jobflow(job_id) job_flow.visibletoallusers.should.equal("false") conn.set_visible_to_all_users(job_id, True) job_flow = conn.describe_jobflow(job_id) job_flow.visibletoallusers.should.equal("true") conn.set_visible_to_all_users(job_id, False) job_flow = conn.describe_jobflow(job_id) job_flow.visibletoallusers.should.equal("false") @mock_emr_deprecated def test_terminate_jobflow(): conn = boto.connect_emr() job_id = conn.run_jobflow(**run_jobflow_args) flow = conn.describe_jobflows()[0] flow.state.should.equal("WAITING") conn.terminate_jobflow(job_id) flow = conn.describe_jobflows()[0] flow.state.should.equal("TERMINATED") # testing multiple end points for each feature @mock_emr_deprecated def test_bootstrap_actions(): bootstrap_actions = [ BootstrapAction( name="bs1", path="path/to/script", bootstrap_action_args=["arg1", "arg2&arg3"], ), BootstrapAction( name="bs2", path="path/to/anotherscript", bootstrap_action_args=[] ), ] conn = boto.connect_emr() cluster_id = conn.run_jobflow( bootstrap_actions=bootstrap_actions, **run_jobflow_args ) jf = conn.describe_jobflow(cluster_id) for x, y in zip(jf.bootstrapactions, bootstrap_actions): x.name.should.equal(y.name) x.path.should.equal(y.path) list(o.value for o in x.args).should.equal(y.args()) resp = conn.list_bootstrap_actions(cluster_id) for i, y in enumerate(bootstrap_actions): x = resp.actions[i] x.name.should.equal(y.name) x.scriptpath.should.equal(y.path) list(arg.value for arg in x.args).should.equal(y.args()) @mock_emr_deprecated def test_instance_groups(): input_groups = dict((g.name, g) for g in input_instance_groups) conn = boto.connect_emr() args = run_jobflow_args.copy() for key in ["master_instance_type", "slave_instance_type", "num_instances"]: del args[key] args["instance_groups"] = input_instance_groups[:2] job_id = conn.run_jobflow(**args) jf = conn.describe_jobflow(job_id) base_instance_count = int(jf.instancecount) conn.add_instance_groups(job_id, input_instance_groups[2:]) jf = conn.describe_jobflow(job_id) int(jf.instancecount).should.equal( sum(g.num_instances for g in input_instance_groups) ) for x in jf.instancegroups: y = input_groups[x.name] if hasattr(y, "bidprice"): x.bidprice.should.equal(y.bidprice) x.creationdatetime.should.be.a(str) # x.enddatetime.should.be.a(str) x.should.have.property("instancegroupid") int(x.instancerequestcount).should.equal(y.num_instances) x.instancerole.should.equal(y.role) int(x.instancerunningcount).should.equal(y.num_instances) x.instancetype.should.equal(y.type) x.laststatechangereason.should.be.a(str) x.market.should.equal(y.market) x.name.should.be.a(str) x.readydatetime.should.be.a(str) x.startdatetime.should.be.a(str) x.state.should.equal("RUNNING") for x in conn.list_instance_groups(job_id).instancegroups: y = input_groups[x.name] if hasattr(y, "bidprice"): x.bidprice.should.equal(y.bidprice) # Configurations # EbsBlockDevices # EbsOptimized x.should.have.property("id") x.instancegrouptype.should.equal(y.role) x.instancetype.should.equal(y.type) x.market.should.equal(y.market) x.name.should.equal(y.name) int(x.requestedinstancecount).should.equal(y.num_instances) int(x.runninginstancecount).should.equal(y.num_instances) # ShrinkPolicy x.status.state.should.equal("RUNNING") x.status.statechangereason.code.should.be.a(str) x.status.statechangereason.message.should.be.a(str) x.status.timeline.creationdatetime.should.be.a(str) # x.status.timeline.enddatetime.should.be.a(str) x.status.timeline.readydatetime.should.be.a(str) igs = dict((g.name, g) for g in jf.instancegroups) conn.modify_instance_groups( [igs["task-1"].instancegroupid, igs["task-2"].instancegroupid], [2, 3] ) jf = conn.describe_jobflow(job_id) int(jf.instancecount).should.equal(base_instance_count + 5) igs = dict((g.name, g) for g in jf.instancegroups) int(igs["task-1"].instancerunningcount).should.equal(2) int(igs["task-2"].instancerunningcount).should.equal(3) @mock_emr_deprecated def test_steps(): input_steps = [ StreamingStep( name="My wordcount example", mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter.py", reducer="aggregate", input="s3n://elasticmapreduce/samples/wordcount/input", output="s3n://output_bucket/output/wordcount_output", ), StreamingStep( name="My wordcount example & co.", mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py", reducer="aggregate", input="s3n://elasticmapreduce/samples/wordcount/input2", output="s3n://output_bucket/output/wordcount_output2", ), ] # TODO: implementation and test for cancel_steps conn = boto.connect_emr() cluster_id = conn.run_jobflow(steps=[input_steps[0]], **run_jobflow_args) jf = conn.describe_jobflow(cluster_id) jf.steps.should.have.length_of(1) conn.add_jobflow_steps(cluster_id, [input_steps[1]]) jf = conn.describe_jobflow(cluster_id) jf.steps.should.have.length_of(2) for step in jf.steps: step.actiononfailure.should.equal("TERMINATE_JOB_FLOW") list(arg.value for arg in step.args).should.have.length_of(8) step.creationdatetime.should.be.a(str) # step.enddatetime.should.be.a(str) step.jar.should.equal("/home/hadoop/contrib/streaming/hadoop-streaming.jar") step.laststatechangereason.should.be.a(str) step.mainclass.should.equal("") step.name.should.be.a(str) # step.readydatetime.should.be.a(str) # step.startdatetime.should.be.a(str) step.state.should.be.within(["STARTING", "PENDING"]) expected = dict((s.name, s) for s in input_steps) steps = conn.list_steps(cluster_id).steps for x in steps: y = expected[x.name] # actiononfailure list(arg.value for arg in x.config.args).should.equal( [ "-mapper", y.mapper, "-reducer", y.reducer, "-input", y.input, "-output", y.output, ] ) x.config.jar.should.equal("/home/hadoop/contrib/streaming/hadoop-streaming.jar") x.config.mainclass.should.equal("") # properties x.should.have.property("id").should.be.a(str) x.name.should.equal(y.name) x.status.state.should.be.within(["STARTING", "PENDING"]) # x.status.statechangereason x.status.timeline.creationdatetime.should.be.a(str) # x.status.timeline.enddatetime.should.be.a(str) # x.status.timeline.startdatetime.should.be.a(str) x = conn.describe_step(cluster_id, x.id) list(arg.value for arg in x.config.args).should.equal( [ "-mapper", y.mapper, "-reducer", y.reducer, "-input", y.input, "-output", y.output, ] ) x.config.jar.should.equal("/home/hadoop/contrib/streaming/hadoop-streaming.jar") x.config.mainclass.should.equal("") # properties x.should.have.property("id").should.be.a(str) x.name.should.equal(y.name) x.status.state.should.be.within(["STARTING", "PENDING"]) # x.status.statechangereason x.status.timeline.creationdatetime.should.be.a(str) # x.status.timeline.enddatetime.should.be.a(str) # x.status.timeline.startdatetime.should.be.a(str) @requires_boto_gte("2.39") def test_list_steps_with_states(): # boto's list_steps prior to 2.39 has a bug that ignores # step_states argument. steps = conn.list_steps(cluster_id).steps step_id = steps[0].id steps = conn.list_steps(cluster_id, step_states=["STARTING"]).steps steps.should.have.length_of(1) steps[0].id.should.equal(step_id) test_list_steps_with_states() @mock_emr_deprecated def test_tags(): input_tags = {"tag1": "val1", "tag2": "val2"} conn = boto.connect_emr() cluster_id = conn.run_jobflow(**run_jobflow_args) conn.add_tags(cluster_id, input_tags) cluster = conn.describe_cluster(cluster_id) cluster.tags.should.have.length_of(2) dict((t.key, t.value) for t in cluster.tags).should.equal(input_tags) conn.remove_tags(cluster_id, list(input_tags.keys())) cluster = conn.describe_cluster(cluster_id) cluster.tags.should.have.length_of(0)