moto/tests/test_emr/test_emr_boto3.py

import json
import time
from copy import deepcopy
from datetime import datetime, timezone

import boto3
import pytest
from botocore.exceptions import ClientError

from moto import mock_emr
from moto.core import DEFAULT_ACCOUNT_ID as ACCOUNT_ID

# Baseline keyword arguments for ``client.run_job_flow``.  Tests that need to
# tailor the request take a ``deepcopy`` of this mapping first.
run_job_flow_args = {
    "Instances": {
        "InstanceCount": 3,
        "KeepJobFlowAliveWhenNoSteps": True,
        "MasterInstanceType": "c3.medium",
        "Placement": {"AvailabilityZone": "us-east-1a"},
        "SlaveInstanceType": "c3.xlarge",
    },
    "JobFlowRole": "EMR_EC2_DefaultRole",
    "LogUri": "s3://mybucket/log",
    "Name": "cluster",
    "ServiceRole": "EMR_DefaultRole",
    "VisibleToAllUsers": True,
}


# Instance-group definitions passed as ``Instances["InstanceGroups"]`` by the
# instance-group tests: one MASTER group, one CORE group and two SPOT TASK
# groups (20 instances in total).  Tests look groups up by their "Name".
# This list is shared module state, so copy it before mutating any entry.
input_instance_groups = [
    {
        "InstanceCount": 1,
        "InstanceRole": "MASTER",
        "InstanceType": "c1.medium",
        "Market": "ON_DEMAND",
        "Name": "master",
    },
    {
        "InstanceCount": 3,
        "InstanceRole": "CORE",
        "InstanceType": "c1.medium",
        "Market": "ON_DEMAND",
        "Name": "core",
    },
    {
        "InstanceCount": 6,
        "InstanceRole": "TASK",
        "InstanceType": "c3.large",
        "Market": "SPOT",
        "Name": "task-1",
        "BidPrice": "0.07",
    },
    {
        # The only group carrying an EBS configuration; it is what
        # ``_do_assertion_ebs_configuration`` gets exercised against.
        "InstanceCount": 10,
        "InstanceRole": "TASK",
        "InstanceType": "c1.xlarge",
        "Market": "SPOT",
        "Name": "task-2",
        "BidPrice": "0.05",
        "EbsConfiguration": {
            "EbsBlockDeviceConfigs": [
                {
                    "VolumeSpecification": {"VolumeType": "gp2", "SizeInGB": 800},
                    "VolumesPerInstance": 6,
                },
            ],
            "EbsOptimized": True,
        },
    },
]


@mock_emr
@pytest.mark.filterwarnings("ignore")
def test_describe_cluster():
    """Launch a richly-configured cluster and verify describe_cluster echoes it.

    The cluster is requested with ``KeepJobFlowAliveWhenNoSteps=False``, so it
    is expected to be reported as auto-terminating and already TERMINATED.
    """
    region_name = "us-east-1"
    client = boto3.client("emr", region_name=region_name)

    args = deepcopy(run_job_flow_args)
    args["Instances"].update(
        {
            "AdditionalMasterSecurityGroups": ["additional-master"],
            "AdditionalSlaveSecurityGroups": ["additional-slave"],
            "Ec2KeyName": "mykey",
            "Ec2SubnetId": "subnet-8be41cec",
            "EmrManagedMasterSecurityGroup": "master-security-group",
            "EmrManagedSlaveSecurityGroup": "slave-security-group",
            "KeepJobFlowAliveWhenNoSteps": False,
            "ServiceAccessSecurityGroup": "service-access-security-group",
        }
    )
    args.update(
        {
            "Applications": [{"Name": "Spark", "Version": "2.4.2"}],
            "Configurations": [
                {
                    "Classification": "yarn-site",
                    "Properties": {
                        "someproperty": "somevalue",
                        "someotherproperty": "someothervalue",
                    },
                },
                {
                    "Classification": "nested-configs",
                    "Properties": {},
                    "Configurations": [
                        {
                            "Classification": "nested-config",
                            "Properties": {"nested-property": "nested-value"},
                        }
                    ],
                },
            ],
            "KerberosAttributes": {
                "Realm": "MY-REALM.COM",
                "KdcAdminPassword": "SuperSecretPassword2",
                "CrossRealmTrustPrincipalPassword": "SuperSecretPassword3",
                "ADDomainJoinUser": "Bob",
                "ADDomainJoinPassword": "SuperSecretPassword4",
            },
            "Tags": [
                {"Key": "tag1", "Value": "val1"},
                {"Key": "tag2", "Value": "val2"},
            ],
            "SecurityConfiguration": "my-security-configuration",
            "AutoScalingRole": "EMR_AutoScaling_DefaultRole",
            "AutoTerminationPolicy": {"IdleTimeout": 123},
        }
    )
    requested_instances = args["Instances"]

    cluster_id = client.run_job_flow(**args)["JobFlowId"]
    cl = client.describe_cluster(ClusterId=cluster_id)["Cluster"]

    application = cl["Applications"][0]
    assert application["Name"] == "Spark"
    assert application["Version"] == "2.4.2"
    assert cl["AutoTerminate"] is True

    # Both top-level configurations come back in request order.
    for position, classification in enumerate(("yarn-site", "nested-configs")):
        returned_config = cl["Configurations"][position]
        assert returned_config["Classification"] == classification
        assert (
            returned_config["Properties"]
            == args["Configurations"][position]["Properties"]
        )

    attrs = cl["Ec2InstanceAttributes"]
    # Attributes that are echoed back under the same key they were sent with.
    for key in (
        "AdditionalMasterSecurityGroups",
        "AdditionalSlaveSecurityGroups",
        "Ec2KeyName",
        "Ec2SubnetId",
        "EmrManagedMasterSecurityGroup",
        "EmrManagedSlaveSecurityGroup",
        "ServiceAccessSecurityGroup",
    ):
        assert attrs[key] == requested_instances[key]
    assert attrs["Ec2AvailabilityZone"] == "us-east-1a"
    assert attrs["IamInstanceProfile"] == args["JobFlowRole"]

    assert cl["Id"] == cluster_id
    for key in (
        "KerberosAttributes",
        "LogUri",
        "Name",
        "SecurityConfiguration",
        "ServiceRole",
        "AutoScalingRole",
    ):
        assert cl[key] == args[key]
    assert isinstance(cl["MasterPublicDnsName"], str)
    assert cl["NormalizedInstanceHours"] == 0
    assert "RequestedAmiVersion" not in cl
    assert cl["RunningAmiVersion"] == "1.0.0"
    assert isinstance(cl["SecurityConfiguration"], str)

    # Not checked here: ReleaseLabel, Status.StateChangeReason and
    # Status.Timeline.EndDateTime.
    status = cl["Status"]
    assert status["State"] == "TERMINATED"
    timeline = status["Timeline"]
    for stamp in ("CreationDateTime", "ReadyDateTime"):
        assert isinstance(timeline[stamp], datetime)

    returned_tags = {tag["Key"]: tag["Value"] for tag in cl["Tags"]}
    requested_tags = {tag["Key"]: tag["Value"] for tag in args["Tags"]}
    assert returned_tags == requested_tags

    assert cl["TerminationProtected"] is False
    assert cl["VisibleToAllUsers"] is True
    expected_arn = (
        f"arn:aws:elasticmapreduce:{region_name}:{ACCOUNT_ID}:cluster/{cluster_id}"
    )
    assert cl["ClusterArn"] == expected_arn


@mock_emr
def test_describe_cluster_not_found():
    """describe_cluster for an unknown id fails with ResourceNotFoundException."""
    emr = boto3.client("emr", region_name="us-east-1")
    with pytest.raises(ClientError) as exc_info:
        emr.describe_cluster(ClusterId="DummyId")

    error_code = exc_info.value.response["Error"]["Code"]
    assert error_code == "ResourceNotFoundException"


@mock_emr
def test_describe_job_flows():
    """Exercise describe_job_flows with id, state and creation-time filters.

    Four clusters are started and left running, a timestamp is captured, then
    two more are started and terminated immediately.  The filters are expected
    to split the six clusters accordingly.
    """
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    # Keyed by cluster id; only the keys are consulted by the assertions below.
    expected = {}

    # First batch: four clusters expected to be reported as WAITING.
    for idx in range(4):
        cluster_name = "cluster" + str(idx)
        args["Name"] = cluster_name
        cluster_id = client.run_job_flow(**args)["JobFlowId"]
        expected[cluster_id] = {
            "Id": cluster_id,
            "Name": cluster_name,
            "State": "WAITING",
        }

    # need sleep since it appears the timestamp is always rounded to
    # the nearest second internally
    time.sleep(1)
    # Dividing line used by the CreatedBefore / CreatedAfter filters below.
    timestamp = datetime.now(timezone.utc)
    time.sleep(1)

    # Second batch: two clusters created after the timestamp and terminated.
    for idx in range(4, 6):
        cluster_name = "cluster" + str(idx)
        args["Name"] = cluster_name
        cluster_id = client.run_job_flow(**args)["JobFlowId"]
        client.terminate_job_flows(JobFlowIds=[cluster_id])
        expected[cluster_id] = {
            "Id": cluster_id,
            "Name": cluster_name,
            "State": "TERMINATED",
        }

    # No filter: every cluster is returned.
    resp = client.describe_job_flows()
    assert len(resp["JobFlows"]) == 6

    # Filtering by a single id returns exactly that job flow.
    for cluster_id in expected:
        resp = client.describe_job_flows(JobFlowIds=[cluster_id])
        assert len(resp["JobFlows"]) == 1
        assert resp["JobFlows"][0]["JobFlowId"] == cluster_id

    # State filter: only the first batch is still WAITING.
    resp = client.describe_job_flows(JobFlowStates=["WAITING"])
    assert len(resp["JobFlows"]) == 4
    for x in resp["JobFlows"]:
        assert x["ExecutionStatusDetail"]["State"] == "WAITING"

    # Creation-time filters split the clusters around the captured timestamp.
    resp = client.describe_job_flows(CreatedBefore=timestamp)
    assert len(resp["JobFlows"]) == 4

    resp = client.describe_job_flows(CreatedAfter=timestamp)
    assert len(resp["JobFlows"]) == 2


@mock_emr
@pytest.mark.filterwarnings("ignore")
def test_describe_job_flow():
    """Start one AMI-versioned job flow and check its describe_job_flows entry."""
    client = boto3.client("emr", region_name="us-east-1")

    args = deepcopy(run_job_flow_args)
    args["AmiVersion"] = "3.8.1"
    requested_instances = args["Instances"]
    requested_instances["Ec2KeyName"] = "ec2keyname"
    requested_instances["Ec2SubnetId"] = "subnet-8be41cec"
    requested_instances["HadoopVersion"] = "2.4.0"
    args["VisibleToAllUsers"] = True

    cluster_id = client.run_job_flow(**args)["JobFlowId"]
    described = client.describe_job_flows(JobFlowIds=[cluster_id])
    jf = described["JobFlows"][0]

    assert jf["AmiVersion"] == args["AmiVersion"]
    assert "BootstrapActions" not in jf

    # Not checked: EndDateTime and LastStateChangeReason.
    esd = jf["ExecutionStatusDetail"]
    lifecycle_stamps = ("CreationDateTime", "ReadyDateTime", "StartDateTime")
    for stamp in lifecycle_stamps:
        assert isinstance(esd[stamp], datetime)
    assert esd["State"] == "WAITING"

    attrs = jf["Instances"]
    # Values that must round-trip unchanged from the request.
    for key in (
        "Ec2KeyName",
        "Ec2SubnetId",
        "HadoopVersion",
        "InstanceCount",
        "MasterInstanceType",
        "SlaveInstanceType",
    ):
        assert attrs[key] == requested_instances[key]

    # Groups are derived from the master/slave instance types in the request,
    # so only MASTER and CORE on-demand groups are expected.
    for ig in attrs["InstanceGroups"]:
        for stamp in lifecycle_stamps:
            assert isinstance(ig[stamp], datetime)
        for text_field in ("InstanceGroupId", "Name"):
            assert isinstance(ig[text_field], str)
        for count_field in ("InstanceRequestCount", "InstanceRunningCount"):
            assert isinstance(ig[count_field], int)
        assert ig["InstanceRole"] in ["MASTER", "CORE"]
        assert ig["InstanceType"] in ["c3.medium", "c3.xlarge"]
        assert ig["Market"] == "ON_DEMAND"
        assert ig["State"] == "RUNNING"

    assert attrs["KeepJobFlowAliveWhenNoSteps"] is True
    assert isinstance(attrs["MasterPublicDnsName"], str)
    assert attrs["NormalizedInstanceHours"] == 0
    requested_zone = requested_instances["Placement"]["AvailabilityZone"]
    assert attrs["Placement"]["AvailabilityZone"] == requested_zone
    assert attrs["TerminationProtected"] is False

    assert jf["JobFlowId"] == cluster_id
    for key in ("JobFlowRole", "LogUri", "Name", "ServiceRole"):
        assert jf[key] == args[key]
    assert jf["Steps"] == []
    assert jf["SupportedProducts"] == []
    assert jf["VisibleToAllUsers"] is True


@mock_emr
def test_list_clusters():
    """Exercise list_clusters pagination plus state and creation-time filters.

    Forty clusters are started and left running, a timestamp is captured, then
    thirty more are started and terminated immediately.
    """
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    # Expected summary fields for every cluster, keyed by cluster id.
    expected = {}

    # First batch: forty clusters expected to be reported as WAITING.
    for idx in range(40):
        cluster_name = "jobflow" + str(idx)
        args["Name"] = cluster_name
        cluster_id = client.run_job_flow(**args)["JobFlowId"]
        expected[cluster_id] = {
            "Id": cluster_id,
            "Name": cluster_name,
            "NormalizedInstanceHours": 0,
            "State": "WAITING",
        }

    # need sleep since it appears the timestamp is always rounded to
    # the nearest second internally
    time.sleep(1)
    # Dividing line used by the CreatedBefore / CreatedAfter filters below.
    timestamp = datetime.now(timezone.utc)
    time.sleep(1)

    # Second batch: thirty clusters created after the timestamp and terminated.
    for idx in range(40, 70):
        cluster_name = "jobflow" + str(idx)
        args["Name"] = cluster_name
        cluster_id = client.run_job_flow(**args)["JobFlowId"]
        client.terminate_job_flows(JobFlowIds=[cluster_id])
        expected[cluster_id] = {
            "Id": cluster_id,
            "Name": cluster_name,
            "NormalizedInstanceHours": 0,
            "State": "TERMINATED",
        }

    # ``args`` is reused from here on as the list_clusters keyword arguments;
    # it starts empty and later carries the pagination marker.
    args = {}
    while 1:
        resp = client.list_clusters(**args)
        clusters = resp["Clusters"]
        # No page may hold more than 50 clusters.
        assert len(clusters) <= 50
        for x in clusters:
            y = expected[x["Id"]]
            assert x["Id"] == y["Id"]
            assert x["Name"] == y["Name"]
            assert x["NormalizedInstanceHours"] == y["NormalizedInstanceHours"]
            assert x["Status"]["State"] == y["State"]
            assert isinstance(x["Status"]["Timeline"]["CreationDateTime"], datetime)
            # Only terminated clusters are expected to report an end time.
            if y["State"] == "TERMINATED":
                assert isinstance(x["Status"]["Timeline"]["EndDateTime"], datetime)
            else:
                assert "EndDateTime" not in x["Status"]["Timeline"]
            assert isinstance(x["Status"]["Timeline"]["ReadyDateTime"], datetime)
        # A missing marker means the last page has been read.
        marker = resp.get("Marker")
        if marker is None:
            break
        args = {"Marker": marker}

    # State filter: only the second batch was terminated.
    resp = client.list_clusters(ClusterStates=["TERMINATED"])
    assert len(resp["Clusters"]) == 30
    for x in resp["Clusters"]:
        assert x["Status"]["State"] == "TERMINATED"

    # Creation-time filters split the clusters around the captured timestamp.
    resp = client.list_clusters(CreatedBefore=timestamp)
    assert len(resp["Clusters"]) == 40

    resp = client.list_clusters(CreatedAfter=timestamp)
    assert len(resp["Clusters"]) == 30


@mock_emr
def test_run_job_flow():
    """run_job_flow returns a cluster ARN and the job flow is described as requested."""
    region_name = "us-east-1"
    client = boto3.client("emr", region_name=region_name)
    args = deepcopy(run_job_flow_args)
    resp = client.run_job_flow(**args)
    # The prefix check has to be asserted explicitly: a bare ``startswith()``
    # call is evaluated and discarded, which would verify nothing.
    assert resp["ClusterArn"].startswith(
        f"arn:aws:elasticmapreduce:{region_name}:{ACCOUNT_ID}:cluster/"
    )
    job_flow_id = resp["JobFlowId"]
    resp = client.describe_job_flows(JobFlowIds=[job_flow_id])["JobFlows"][0]
    assert resp["ExecutionStatusDetail"]["State"] == "WAITING"
    assert resp["JobFlowId"] == job_flow_id
    assert resp["Name"] == args["Name"]
    assert (
        resp["Instances"]["MasterInstanceType"]
        == args["Instances"]["MasterInstanceType"]
    )
    assert (
        resp["Instances"]["SlaveInstanceType"] == args["Instances"]["SlaveInstanceType"]
    )
    assert resp["LogUri"] == args["LogUri"]
    assert resp["VisibleToAllUsers"] == args["VisibleToAllUsers"]
    assert resp["Instances"]["NormalizedInstanceHours"] == 0
    assert resp["Steps"] == []


@mock_emr
def test_run_job_flow_with_invalid_params():
    """Supplying both AmiVersion and ReleaseLabel is rejected as invalid."""
    client = boto3.client("emr", region_name="us-east-1")

    # cannot set both AmiVersion and ReleaseLabel
    conflicting_args = deepcopy(run_job_flow_args)
    conflicting_args["AmiVersion"] = "2.4"
    conflicting_args["ReleaseLabel"] = "emr-5.0.0"

    with pytest.raises(ClientError) as exc_info:
        client.run_job_flow(**conflicting_args)

    error_code = exc_info.value.response["Error"]["Code"]
    assert error_code == "ValidationException"


@mock_emr
def test_run_job_flow_in_multiple_regions():
    """Clusters started in different regions are each visible via their own client."""
    launched = []
    for region in ("us-east-1", "eu-west-1"):
        regional_client = boto3.client("emr", region_name=region)
        args = deepcopy(run_job_flow_args)
        # Name each cluster after its region so it can be recognised later.
        args["Name"] = region
        job_flow_id = regional_client.run_job_flow(**args)["JobFlowId"]
        launched.append((region, regional_client, job_flow_id))

    for region, regional_client, job_flow_id in launched:
        described = regional_client.describe_cluster(ClusterId=job_flow_id)
        assert described["Cluster"]["Name"] == region


@mock_emr
def test_run_job_flow_with_new_params():
    """The baseline arguments are accepted and yield a JobFlowId."""
    emr = boto3.client("emr", region_name="us-east-1")
    response = emr.run_job_flow(**run_job_flow_args)
    assert "JobFlowId" in response


@mock_emr
def test_run_job_flow_with_visible_to_all_users():
    """The VisibleToAllUsers flag given at launch is reflected by describe_cluster."""
    client = boto3.client("emr", region_name="us-east-1")
    for visibility in (True, False):
        args = deepcopy(run_job_flow_args)
        args["VisibleToAllUsers"] = visibility
        job_flow_id = client.run_job_flow(**args)["JobFlowId"]
        cluster = client.describe_cluster(ClusterId=job_flow_id)["Cluster"]
        assert cluster["VisibleToAllUsers"] == visibility


def _do_assertion_ebs_configuration(x, y):
    """Check the EBS devices reported for a group against the requested config.

    Args:
        x: Instance group as returned by the API; its ``EbsBlockDevices`` list
            is read.
        y: Instance group as requested; its
            ``EbsConfiguration["EbsBlockDeviceConfigs"]`` list is read.

    Raises:
        AssertionError: if the number of reported devices or their combined
            size differs from what the request implies.
    """
    requested_configs = y["EbsConfiguration"]["EbsBlockDeviceConfigs"]
    expected_volumes = sum(cfg["VolumesPerInstance"] for cfg in requested_configs)
    # Each config contributes ``VolumesPerInstance`` volumes of its own size,
    # so sizes are weighted per config rather than summed and then multiplied
    # by the overall volume count.
    expected_size = sum(
        cfg["VolumeSpecification"]["SizeInGB"] * cfg["VolumesPerInstance"]
        for cfg in requested_configs
    )

    reported_devices = x["EbsBlockDevices"]
    reported_size = sum(
        device["VolumeSpecification"]["SizeInGB"] for device in reported_devices
    )

    assert len(reported_devices) == expected_volumes
    # Compare against the expected total, not against itself.
    assert reported_size == expected_size


@mock_emr
def test_run_job_flow_with_instance_groups():
    """Instance groups given at launch are listed back with matching settings."""
    requested_by_name = {group["Name"]: group for group in input_instance_groups}
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["Instances"] = {"InstanceGroups": input_instance_groups}
    cluster_id = client.run_job_flow(**args)["JobFlowId"]
    listing = client.list_instance_groups(ClusterId=cluster_id)

    # (key in the listing, key in the request) pairs that must agree.
    paired_fields = (
        ("RequestedInstanceCount", "InstanceCount"),
        ("InstanceGroupType", "InstanceRole"),
        ("InstanceType", "InstanceType"),
        ("Market", "Market"),
    )
    for returned in listing["InstanceGroups"]:
        requested = requested_by_name[returned["Name"]]
        assert "Id" in returned
        for returned_key, requested_key in paired_fields:
            assert returned[returned_key] == requested[requested_key]
        if "BidPrice" in requested:
            assert returned["BidPrice"] == requested["BidPrice"]
        if "EbsConfiguration" in requested:
            _do_assertion_ebs_configuration(returned, requested)


# Auto-scaling policy fixture shared by the auto-scaling tests: capacity
# constraints of 2..10 and a single scale-out rule that adds one unit of
# capacity when the YARNMemoryAvailablePercentage alarm fires.
# The "JobFlowId" dimension holds the ``${emr.clusterId}`` placeholder; tests
# compare API output against a copy of this policy in which the placeholder
# has been replaced by the real cluster id.
auto_scaling_policy = {
    "Constraints": {"MinCapacity": 2, "MaxCapacity": 10},
    "Rules": [
        {
            "Name": "Default-scale-out",
            "Description": "Replicates the default scale-out rule in the console for YARN memory.",
            "Action": {
                "SimpleScalingPolicyConfiguration": {
                    "AdjustmentType": "CHANGE_IN_CAPACITY",
                    "ScalingAdjustment": 1,
                    "CoolDown": 300,
                }
            },
            "Trigger": {
                "CloudWatchAlarmDefinition": {
                    "ComparisonOperator": "LESS_THAN",
                    "EvaluationPeriods": 1,
                    "MetricName": "YARNMemoryAvailablePercentage",
                    "Namespace": "AWS/ElasticMapReduce",
                    "Period": 300,
                    "Threshold": 15.0,
                    "Statistic": "AVERAGE",
                    "Unit": "PERCENT",
                    "Dimensions": [{"Key": "JobFlowId", "Value": "${emr.clusterId}"}],
                }
            },
        }
    ],
}


@mock_emr
def test_run_job_flow_with_instance_groups_with_autoscaling():
    """Policies attached to groups at launch are listed back as ATTACHED.

    The "core" and "task-1" groups are launched with ``auto_scaling_policy``;
    the listed policy (minus its ``Status``) must equal the requested one with
    the cluster-id placeholder substituted.
    """
    # Work on a private copy: adding "AutoScalingPolicy" to the module-level
    # ``input_instance_groups`` dicts would leak into every test that runs
    # afterwards and make results depend on test order.
    instance_groups = deepcopy(input_instance_groups)
    input_groups = {g["Name"]: g for g in instance_groups}

    input_groups["core"]["AutoScalingPolicy"] = auto_scaling_policy
    input_groups["task-1"]["AutoScalingPolicy"] = auto_scaling_policy

    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["Instances"] = {"InstanceGroups": instance_groups}
    cluster_id = client.run_job_flow(**args)["JobFlowId"]
    groups = client.list_instance_groups(ClusterId=cluster_id)["InstanceGroups"]
    for x in groups:
        y = input_groups[x["Name"]]
        if "AutoScalingPolicy" not in y:
            continue
        assert x["AutoScalingPolicy"]["Status"]["State"] == "ATTACHED"
        # Status is added by the service, so drop it before comparing.
        returned_policy = deepcopy(x["AutoScalingPolicy"])
        del returned_policy["Status"]
        auto_scaling_policy_with_cluster_id = (
            _patch_cluster_id_placeholder_in_autoscaling_policy(
                y["AutoScalingPolicy"], cluster_id
            )
        )
        assert returned_policy == auto_scaling_policy_with_cluster_id


@mock_emr
def test_put_remove_auto_scaling_policy():
    """A policy can be put on the CORE group and then removed again."""
    region_name = "us-east-1"
    client = boto3.client("emr", region_name=region_name)
    args = deepcopy(run_job_flow_args)
    args["Instances"] = {"InstanceGroups": input_instance_groups}
    cluster_id = client.run_job_flow(**args)["JobFlowId"]

    def fetch_core_group():
        # Re-list on every call so the caller always sees the current state.
        listing = client.list_instance_groups(ClusterId=cluster_id)
        core_groups = [
            group
            for group in listing["InstanceGroups"]
            if group["InstanceGroupType"] == "CORE"
        ]
        return core_groups[0]

    core_instance_group = fetch_core_group()
    resp = client.put_auto_scaling_policy(
        ClusterId=cluster_id,
        InstanceGroupId=core_instance_group["Id"],
        AutoScalingPolicy=auto_scaling_policy,
    )

    expected_policy = _patch_cluster_id_placeholder_in_autoscaling_policy(
        auto_scaling_policy, cluster_id
    )
    # Status is added by the service, so drop it before comparing.
    del resp["AutoScalingPolicy"]["Status"]
    assert resp["AutoScalingPolicy"] == expected_policy
    expected_arn = (
        f"arn:aws:elasticmapreduce:{region_name}:{ACCOUNT_ID}:cluster/{cluster_id}"
    )
    assert resp["ClusterArn"] == expected_arn

    core_instance_group = fetch_core_group()
    assert "AutoScalingPolicy" in core_instance_group

    client.remove_auto_scaling_policy(
        ClusterId=cluster_id, InstanceGroupId=core_instance_group["Id"]
    )

    core_instance_group = fetch_core_group()
    assert "AutoScalingPolicy" not in core_instance_group


def _patch_cluster_id_placeholder_in_autoscaling_policy(policy, cluster_id):
    """Return a deep copy of ``policy`` with every alarm dimension value replaced.

    The "Value" of every entry under
    ``Rules[*].Trigger.CloudWatchAlarmDefinition.Dimensions`` is overwritten
    with ``cluster_id``; ``policy`` itself is left untouched.
    """
    patched = deepcopy(policy)
    alarm_definitions = (
        rule["Trigger"]["CloudWatchAlarmDefinition"] for rule in patched["Rules"]
    )
    for alarm in alarm_definitions:
        for dimension in alarm["Dimensions"]:
            dimension["Value"] = cluster_id
    return patched


@mock_emr
def test_run_job_flow_with_custom_ami():
    """CustomAmiId is validated against the release label / AMI version in use."""
    client = boto3.client("emr", region_name="us-east-1")

    def launch(**overrides):
        # Start a cluster from the baseline arguments plus the given extras.
        args = deepcopy(run_job_flow_args)
        args.update(overrides)
        return client.run_job_flow(**args)

    # CustomAmiId available in Amazon EMR 5.7.0 and later
    with pytest.raises(ClientError) as exc_info:
        launch(CustomAmiId="MyEmrCustomId", ReleaseLabel="emr-5.6.0")
    error = exc_info.value.response["Error"]
    assert error["Code"] == "ValidationException"
    assert error["Message"] == "Custom AMI is not allowed"

    with pytest.raises(ClientError) as exc_info:
        launch(CustomAmiId="MyEmrCustomId", AmiVersion="3.8.1")
    error = exc_info.value.response["Error"]
    assert error["Code"] == "ValidationException"
    assert error["Message"] == "Custom AMI is not supported in this version of EMR"

    # The AMI-version/release-label conflict is raised before the CustomAmi check
    with pytest.raises(ClientError) as exc_info:
        launch(
            CustomAmiId="MyEmrCustomId",
            ReleaseLabel="emr-5.6.0",
            AmiVersion="3.8.1",
        )
    error = exc_info.value.response["Error"]
    assert error["Code"] == "ValidationException"
    assert (
        "Only one AMI version and release label may be specified."
        in error["Message"]
    )

    # A sufficiently recent release label accepts the custom AMI.
    cluster_id = launch(CustomAmiId="MyEmrCustomAmi", ReleaseLabel="emr-5.31.0")[
        "JobFlowId"
    ]
    cluster = client.describe_cluster(ClusterId=cluster_id)["Cluster"]
    assert cluster["CustomAmiId"] == "MyEmrCustomAmi"


@mock_emr
def test_run_job_flow_with_step_concurrency():
    """StepConcurrencyLevel given at launch is reported by describe_cluster."""
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["StepConcurrencyLevel"] = 2
    job_flow_id = client.run_job_flow(**args)["JobFlowId"]

    cluster = client.describe_cluster(ClusterId=job_flow_id)["Cluster"]
    observed = (
        cluster["Name"],
        cluster["Status"]["State"],
        cluster["StepConcurrencyLevel"],
    )
    assert observed == (args["Name"], "WAITING", 2)


@mock_emr
def test_modify_cluster():
    """modify_cluster changes StepConcurrencyLevel and the change persists."""
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["StepConcurrencyLevel"] = 2
    cluster_id = client.run_job_flow(**args)["JobFlowId"]

    def current_cluster():
        # Fetch a fresh description of the cluster under test.
        return client.describe_cluster(ClusterId=cluster_id)["Cluster"]

    before = current_cluster()
    assert before["Name"] == args["Name"]
    assert before["Status"]["State"] == "WAITING"
    assert before["StepConcurrencyLevel"] == 2

    modified = client.modify_cluster(ClusterId=cluster_id, StepConcurrencyLevel=4)
    assert modified["StepConcurrencyLevel"] == 4

    assert current_cluster()["StepConcurrencyLevel"] == 4


@mock_emr
def test_set_termination_protection():
    """Termination protection can be switched on and off after launch."""
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["Instances"]["TerminationProtected"] = False
    cluster_id = client.run_job_flow(**args)["JobFlowId"]

    def is_protected():
        described = client.describe_cluster(ClusterId=cluster_id)
        return described["Cluster"]["TerminationProtected"]

    assert is_protected() is False

    for desired in (True, False):
        client.set_termination_protection(
            JobFlowIds=[cluster_id], TerminationProtected=desired
        )
        assert is_protected() == desired


@mock_emr
def test_terminate_protected_job_flow_raises_error():
    """Terminating a termination-protected job flow is rejected."""
    client = boto3.client("emr", region_name="us-east-1")
    cluster_id = client.run_job_flow(**run_job_flow_args)["JobFlowId"]
    client.set_termination_protection(
        JobFlowIds=[cluster_id], TerminationProtected=True
    )

    with pytest.raises(ClientError) as exc_info:
        client.terminate_job_flows(JobFlowIds=[cluster_id])

    error = exc_info.value.response["Error"]
    expected_message = (
        "Could not shut down one or more job flows since they are "
        "termination protected."
    )
    assert error["Code"] == "ValidationException"
    assert error["Message"] == expected_message


@mock_emr
def test_set_visible_to_all_users():
    """Visibility to all users can be switched on and off after launch."""
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["VisibleToAllUsers"] = False
    cluster_id = client.run_job_flow(**args)["JobFlowId"]

    def is_visible():
        described = client.describe_cluster(ClusterId=cluster_id)
        return described["Cluster"]["VisibleToAllUsers"]

    assert is_visible() is False

    for desired in (True, False):
        client.set_visible_to_all_users(
            JobFlowIds=[cluster_id], VisibleToAllUsers=desired
        )
        assert is_visible() == desired


@mock_emr
def test_terminate_job_flows():
    """A WAITING cluster becomes TERMINATED after terminate_job_flows."""
    client = boto3.client("emr", region_name="us-east-1")
    cluster_id = client.run_job_flow(**run_job_flow_args)["JobFlowId"]

    def cluster_state():
        described = client.describe_cluster(ClusterId=cluster_id)
        return described["Cluster"]["Status"]["State"]

    assert cluster_state() == "WAITING"

    client.terminate_job_flows(JobFlowIds=[cluster_id])
    assert cluster_state() == "TERMINATED"


# The tests below exercise multiple endpoints for each feature.


@mock_emr
def test_bootstrap_actions():
    """Bootstrap actions given at launch are returned by both read endpoints.

    Checks ``describe_job_flows`` (full ``BootstrapActionConfig`` echo) and
    ``list_bootstrap_actions`` (flattened Name / Args / ScriptPath form).
    """
    bootstrap_actions = [
        {
            "Name": "bs1",
            "ScriptBootstrapAction": {
                "Args": ["arg1", "arg2"],
                "Path": "s3://path/to/script",
            },
        },
        {
            "Name": "bs2",
            "ScriptBootstrapAction": {"Args": [], "Path": "s3://path/to/anotherscript"},
        },
    ]

    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["BootstrapActions"] = bootstrap_actions
    cluster_id = client.run_job_flow(**args)["JobFlowId"]

    cl = client.describe_job_flows(JobFlowIds=[cluster_id])["JobFlows"][0]
    # zip() stops at the shorter input, so check the count first; otherwise a
    # response with missing actions would pass without any comparison.
    assert len(cl["BootstrapActions"]) == len(bootstrap_actions)
    for x, y in zip(cl["BootstrapActions"], bootstrap_actions):
        assert x["BootstrapActionConfig"] == y

    resp = client.list_bootstrap_actions(ClusterId=cluster_id)
    assert len(resp["BootstrapActions"]) == len(bootstrap_actions)
    for x, y in zip(resp["BootstrapActions"], bootstrap_actions):
        assert x["Name"] == y["Name"]
        if "Args" in y["ScriptBootstrapAction"]:
            assert x["Args"] == y["ScriptBootstrapAction"]["Args"]
        assert x["ScriptPath"] == y["ScriptBootstrapAction"]["Path"]


@mock_emr
def test_instances():
    """list_instances returns one running instance per requested group member."""
    requested_by_name = {group["Name"]: group for group in input_instance_groups}
    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["Instances"] = {"InstanceGroups": input_instance_groups}
    cluster_id = client.run_job_flow(**args)["JobFlowId"]
    jf = client.describe_job_flows(JobFlowIds=[cluster_id])["JobFlows"][0]
    instances = client.list_instances(ClusterId=cluster_id)["Instances"]

    total_requested = sum(g["InstanceCount"] for g in input_instance_groups)
    assert len(instances) == total_requested

    described_groups = jf["Instances"]["InstanceGroups"]
    mandatory_keys = (
        "Id",
        "Ec2InstanceId",
        "PublicDnsName",
        "PublicIpAddress",
        "PrivateDnsName",
        "PrivateIpAddress",
        "InstanceFleetId",
    )
    for instance in instances:
        assert "InstanceGroupId" in instance
        # Map the instance back to its group, then to the requested settings.
        owning_groups = [
            group
            for group in described_groups
            if group["InstanceGroupId"] == instance["InstanceGroupId"]
        ]
        assert len(owning_groups) == 1
        requested = requested_by_name[owning_groups[0]["Name"]]

        for key in mandatory_keys:
            assert key in instance
        assert instance["InstanceType"] == requested["InstanceType"]
        assert instance["Market"] == requested["Market"]

        status = instance["Status"]
        assert isinstance(status["Timeline"]["ReadyDateTime"], datetime)
        assert isinstance(status["Timeline"]["CreationDateTime"], datetime)
        assert status["State"] == "RUNNING"

    # Filtering by group type returns only the instances of those roles.
    for roles in (["MASTER"], ["CORE"], ["TASK"], ["MASTER", "TASK"]):
        filtered = client.list_instances(
            ClusterId=cluster_id, InstanceGroupTypes=roles
        )["Instances"]
        expected_count = sum(
            g["InstanceCount"]
            for g in input_instance_groups
            if g["InstanceRole"] in roles
        )
        assert len(filtered) == expected_count


@mock_emr
def test_instance_groups():
    """Check adding, listing and resizing instance groups on a cluster.

    The cluster starts with the first two entries of
    ``input_instance_groups``; the remaining entries are added afterwards
    with an auto scaling policy attached.  The groups are verified through
    ``describe_job_flows`` and ``list_instance_groups``, then the two task
    groups are resized with ``modify_instance_groups``.
    """
    expected_by_name = {group["Name"]: group for group in input_instance_groups}

    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    # Drop the flat instance settings; this cluster is described purely by
    # explicit instance groups.
    for flat_key in ("MasterInstanceType", "SlaveInstanceType", "InstanceCount"):
        args["Instances"].pop(flat_key)
    args["Instances"]["InstanceGroups"] = input_instance_groups[:2]
    cluster_id = client.run_job_flow(**args)["JobFlowId"]

    def describe_job_flow():
        """Return the current job flow description of the test cluster."""
        return client.describe_job_flows(JobFlowIds=[cluster_id])["JobFlows"][0]

    base_instance_count = describe_job_flow()["Instances"]["InstanceCount"]

    # Work on a deep copy so the module-level fixture is not mutated.
    extra_groups = deepcopy(input_instance_groups[2:])
    for extra_group in extra_groups:
        extra_group["AutoScalingPolicy"] = auto_scaling_policy
    client.add_instance_groups(JobFlowId=cluster_id, InstanceGroups=extra_groups)

    job_flow = describe_job_flow()
    requested_total = sum(group["InstanceCount"] for group in input_instance_groups)
    assert job_flow["Instances"]["InstanceCount"] == requested_total
    for described in job_flow["Instances"]["InstanceGroups"]:
        expected = expected_by_name[described["Name"]]
        if "BidPrice" in expected:
            assert described["BidPrice"] == expected["BidPrice"]
        assert isinstance(described["CreationDateTime"], datetime)
        # EndDateTime is not asserted here.
        assert "InstanceGroupId" in described
        assert described["InstanceRequestCount"] == expected["InstanceCount"]
        assert described["InstanceRole"] == expected["InstanceRole"]
        assert described["InstanceRunningCount"] == expected["InstanceCount"]
        assert described["InstanceType"] == expected["InstanceType"]
        # LastStateChangeReason is not asserted here.
        assert described["Market"] == expected["Market"]
        assert described["Name"] == expected["Name"]
        assert isinstance(described["ReadyDateTime"], datetime)
        assert isinstance(described["StartDateTime"], datetime)
        assert described["State"] == "RUNNING"

    groups = client.list_instance_groups(ClusterId=cluster_id)["InstanceGroups"]
    for listed in groups:
        expected = deepcopy(expected_by_name[listed["Name"]])
        if "BidPrice" in expected:
            assert listed["BidPrice"] == expected["BidPrice"]
        # NOTE(review): the expected entries come from input_instance_groups,
        # which carries no "AutoScalingPolicy" key (the policy is only set on
        # the deep copy sent to add_instance_groups), so this branch looks
        # unreachable - confirm whether that is intended.
        if "AutoScalingPolicy" in expected:
            assert listed["AutoScalingPolicy"]["Status"]["State"] == "ATTACHED"
            returned_policy = {
                key: value
                for key, value in listed["AutoScalingPolicy"].items()
                if key != "Status"
            }
            # Substitute the cluster id placeholder before comparing.
            serialized_policy = json.dumps(expected["AutoScalingPolicy"])
            policy = json.loads(
                serialized_policy.replace("${emr.clusterId}", cluster_id)
            )
            assert returned_policy == policy
        if "EbsConfiguration" in expected:
            _do_assertion_ebs_configuration(listed, expected)
        # Not asserted: Configurations, EbsBlockDevices, EbsOptimized.
        assert "Id" in listed
        assert listed["InstanceGroupType"] == expected["InstanceRole"]
        assert listed["InstanceType"] == expected["InstanceType"]
        assert listed["Market"] == expected["Market"]
        assert listed["Name"] == expected["Name"]
        assert listed["RequestedInstanceCount"] == expected["InstanceCount"]
        assert listed["RunningInstanceCount"] == expected["InstanceCount"]
        # Not asserted: ShrinkPolicy.
        status = listed["Status"]
        assert status["State"] == "RUNNING"
        assert isinstance(status["StateChangeReason"]["Code"], str)
        # Not asserted: StateChangeReason Message, Timeline EndDateTime.
        assert isinstance(status["Timeline"]["CreationDateTime"], datetime)
        assert isinstance(status["Timeline"]["ReadyDateTime"], datetime)

    listed_by_name = {group["Name"]: group for group in groups}
    resize_requests = [
        {"InstanceGroupId": listed_by_name["task-1"]["Id"], "InstanceCount": 2},
        {"InstanceGroupId": listed_by_name["task-2"]["Id"], "InstanceCount": 3},
    ]
    client.modify_instance_groups(InstanceGroups=resize_requests)

    job_flow = describe_job_flow()
    # The two task groups now hold 2 + 3 instances on top of the base groups.
    assert job_flow["Instances"]["InstanceCount"] == base_instance_count + 5
    described_by_name = {
        group["Name"]: group for group in job_flow["Instances"]["InstanceGroups"]
    }
    assert described_by_name["task-1"]["InstanceRunningCount"] == 2
    assert described_by_name["task-2"]["InstanceRunningCount"] == 3


@mock_emr
def test_steps():
    """Check step submission and the three ways of reading steps back.

    One step is submitted with ``run_job_flow`` and a second one with
    ``add_job_flow_steps``.  Both are verified through
    ``describe_job_flows``, ``list_steps`` and ``describe_step``; finally
    ``list_steps`` is filtered by ``StepIds`` and by ``StepStates``.
    """
    input_steps = [
        {
            "HadoopJarStep": {
                "Args": [
                    "hadoop-streaming",
                    "-files",
                    "s3://elasticmapreduce/samples/wordcount/wordSplitter.py#wordSplitter.py",
                    "-mapper",
                    "python wordSplitter.py",
                    "-input",
                    "s3://elasticmapreduce/samples/wordcount/input",
                    "-output",
                    "s3://output_bucket/output/wordcount_output",
                    "-reducer",
                    "aggregate",
                ],
                "Jar": "command-runner.jar",
                "Properties": [
                    {"Key": "mapred.tasktracker.map.tasks.maximum", "Value": "2"}
                ],
            },
            "Name": "My wordcount example",
        },
        {
            "HadoopJarStep": {
                "Args": [
                    "hadoop-streaming",
                    "-files",
                    "s3://elasticmapreduce/samples/wordcount/wordSplitter2.py#wordSplitter2.py",
                    "-mapper",
                    "python wordSplitter2.py",
                    "-input",
                    "s3://elasticmapreduce/samples/wordcount/input2",
                    "-output",
                    "s3://output_bucket/output/wordcount_output2",
                    "-reducer",
                    "aggregate",
                ],
                "Jar": "command-runner.jar",
                "Properties": [
                    {"Key": "mapred.reduce.tasks", "Value": "0"},
                    {"Key": "stream.map.output.field.separator", "Value": "."},
                ],
            },
            "Name": "My wordcount example2",
        },
    ]

    # TODO: implementation and test for cancel_steps

    def assert_step_matches(step, expected_step):
        """Assert the fields shared by list_steps and describe_step results."""
        # input_steps sets no ActionOnFailure; TERMINATE_CLUSTER is expected.
        assert step["ActionOnFailure"] == "TERMINATE_CLUSTER"
        assert step["Config"]["Args"] == expected_step["HadoopJarStep"]["Args"]
        assert step["Config"]["Jar"] == expected_step["HadoopJarStep"]["Jar"]
        # Not asserted: Config MainClass and Properties.
        assert isinstance(step["Id"], str)
        assert step["Name"] == expected_step["Name"]
        assert step["Status"]["State"] in ["RUNNING", "PENDING"]
        # Not asserted: StateChangeReason, Timeline EndDateTime.
        assert isinstance(step["Status"]["Timeline"]["CreationDateTime"], datetime)

    client = boto3.client("emr", region_name="us-east-1")
    args = deepcopy(run_job_flow_args)
    args["Steps"] = [input_steps[0]]
    cluster_id = client.run_job_flow(**args)["JobFlowId"]

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])["JobFlows"][0]
    assert len(jf["Steps"]) == 1

    client.add_job_flow_steps(JobFlowId=cluster_id, Steps=[input_steps[1]])

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])["JobFlows"][0]
    assert len(jf["Steps"]) == 2
    for idx, (x, y) in enumerate(zip(jf["Steps"], input_steps)):
        assert "CreationDateTime" in x["ExecutionStatusDetail"]
        # Not asserted: EndDateTime, LastStateChangeReason, StartDateTime.

        # Compute the expected state first.  Written inline as
        # ``state == "RUNNING" if idx == 0 else "PENDING"`` the conditional
        # expression binds looser than ``==``, so for idx > 0 the assert
        # would only test the truthy string "PENDING" and always pass.
        expected_state = "RUNNING" if idx == 0 else "PENDING"
        assert x["ExecutionStatusDetail"]["State"] == expected_state

        assert x["StepConfig"]["ActionOnFailure"] == "TERMINATE_CLUSTER"
        assert x["StepConfig"]["HadoopJarStep"]["Args"] == y["HadoopJarStep"]["Args"]
        assert x["StepConfig"]["HadoopJarStep"]["Jar"] == y["HadoopJarStep"]["Jar"]
        if "MainClass" in y["HadoopJarStep"]:
            assert (
                x["StepConfig"]["HadoopJarStep"]["MainClass"]
                == y["HadoopJarStep"]["MainClass"]
            )
        if "Properties" in y["HadoopJarStep"]:
            assert (
                x["StepConfig"]["HadoopJarStep"]["Properties"]
                == y["HadoopJarStep"]["Properties"]
            )
        assert x["StepConfig"]["Name"] == y["Name"]

    expected = {s["Name"]: s for s in input_steps}

    steps = client.list_steps(ClusterId=cluster_id)["Steps"]
    assert len(steps) == 2
    # Steps should be returned in reverse order.
    assert (
        sorted(
            steps,
            key=lambda o: o["Status"]["Timeline"]["CreationDateTime"],
            reverse=True,
        )
        == steps
    )
    for listed in steps:
        y = expected[listed["Name"]]
        assert_step_matches(listed, y)
        # Only the first step will have started - we don't know anything about
        # when it finishes, so the second step never starts.
        if listed["Name"] == "My wordcount example":
            assert isinstance(listed["Status"]["Timeline"]["StartDateTime"], datetime)

        # describe_step must agree with the list_steps entry of the same id.
        described = client.describe_step(ClusterId=cluster_id, StepId=listed["Id"])[
            "Step"
        ]
        assert_step_matches(described, y)
        # Not asserted for describe_step: Timeline StartDateTime.

    step_id = steps[-1]["Id"]  # Last step is first created step.
    steps = client.list_steps(ClusterId=cluster_id, StepIds=[step_id])["Steps"]
    assert len(steps) == 1
    assert steps[0]["Id"] == step_id

    # Only the first created step is expected to be RUNNING.
    steps = client.list_steps(ClusterId=cluster_id, StepStates=["RUNNING"])["Steps"]
    assert len(steps) == 1
    assert steps[0]["Id"] == step_id


@mock_emr
def test_tags():
    """Check that tags added to a cluster are reported and can be removed."""
    input_tags = [
        {"Key": "newkey1", "Value": "newval1"},
        {"Key": "newkey2", "Value": "newval2"},
    ]
    expected_tags = {tag["Key"]: tag["Value"] for tag in input_tags}

    client = boto3.client("emr", region_name="us-east-1")
    cluster_id = client.run_job_flow(**run_job_flow_args)["JobFlowId"]

    client.add_tags(ResourceId=cluster_id, Tags=input_tags)
    cluster = client.describe_cluster(ClusterId=cluster_id)["Cluster"]
    assert len(cluster["Tags"]) == 2
    # Compare as a mapping so the order of the returned tags is irrelevant.
    reported_tags = {tag["Key"]: tag["Value"] for tag in cluster["Tags"]}
    assert reported_tags == expected_tags

    tag_keys = [tag["Key"] for tag in input_tags]
    client.remove_tags(ResourceId=cluster_id, TagKeys=tag_keys)
    cluster = client.describe_cluster(ClusterId=cluster_id)["Cluster"]
    assert cluster["Tags"] == []


@mock_emr
def test_security_configurations():
    """Check the create / describe / delete cycle of a security configuration.

    After deletion, both describing and deleting the configuration again
    must fail with ``InvalidRequestException`` and a "does not exist"
    message.
    """
    client = boto3.client("emr", region_name="us-east-1")

    security_configuration_name = "MySecurityConfiguration"

    # Stripped so the text sent equals the text compared on the way back.
    security_configuration = """
{
  "EncryptionConfiguration": {
    "AtRestEncryptionConfiguration": {
      "S3EncryptionConfiguration": {
        "EncryptionMode": "SSE-S3"
      }
    },
    "EnableInTransitEncryption": false,
    "EnableAtRestEncryption": true
  }
}
    """.strip()

    created = client.create_security_configuration(
        Name=security_configuration_name, SecurityConfiguration=security_configuration
    )
    assert created["Name"] == security_configuration_name
    assert isinstance(created["CreationDateTime"], datetime)

    described = client.describe_security_configuration(
        Name=security_configuration_name
    )
    assert described["Name"] == security_configuration_name
    assert described["SecurityConfiguration"] == security_configuration
    assert isinstance(described["CreationDateTime"], datetime)

    client.delete_security_configuration(Name=security_configuration_name)

    # Once deleted, describing first and deleting second must both fail the
    # same way.
    missing_message = (
        "Security configuration with name 'MySecurityConfiguration' does not exist."
    )
    operations_on_missing = (
        client.describe_security_configuration,
        client.delete_security_configuration,
    )
    for operation in operations_on_missing:
        with pytest.raises(ClientError) as exc_info:
            operation(Name=security_configuration_name)
        error = exc_info.value.response["Error"]
        assert error["Code"] == "InvalidRequestException"
        assert error["Message"] == missing_message


@mock_emr
def test_run_job_flow_with_invalid_number_of_master_nodes_raises_error():
    """Check that a MASTER group of two instances is rejected.

    ``run_job_flow`` must raise a ``ValidationException`` whose message
    states that the master group needs exactly 3 instances for HA clusters.
    """
    two_node_master_group = {
        "InstanceCount": 2,
        "InstanceRole": "MASTER",
        "InstanceType": "c1.medium",
        "Market": "ON_DEMAND",
        "Name": "master",
    }
    client = boto3.client("emr", region_name="us-east-1")

    with pytest.raises(ClientError) as exc_info:
        client.run_job_flow(
            Name="test-cluster",
            Instances={"InstanceGroups": [two_node_master_group]},
        )

    failure = exc_info.value.response["Error"]
    assert failure["Code"] == "ValidationException"
    expected_message = (
        "Master instance group must have exactly 3 instances for HA clusters."
    )
    assert failure["Message"] == expected_message


@mock_emr
def test_run_job_flow_with_multiple_master_nodes():
    # Starts a cluster whose only instance group is a MASTER group of three
    # instances, then checks the cluster flags and the master group counts.
    client = boto3.client("emr", region_name="us-east-1")
    params = dict(
        Name="test-cluster",
        Instances={
            "InstanceGroups": [
                {
                    "InstanceCount": 3,
                    "InstanceRole": "MASTER",
                    "InstanceType": "c1.medium",
                    "Market": "ON_DEMAND",
                    "Name": "master",
                }
            ],
            "KeepJobFlowAliveWhenNoSteps": False,
            "TerminationProtected": False,
        },
    )
    cluster_id = client.run_job_flow(**params)["JobFlowId"]
    cluster = client.describe_cluster(ClusterId=cluster_id)["Cluster"]
    # The request sets KeepJobFlowAliveWhenNoSteps and TerminationProtected to
    # False, yet the cluster is expected to report AutoTerminate False and
    # TerminationProtected True.
    # NOTE(review): presumably the backend overrides both settings for
    # clusters with several master nodes - confirm against the EMR model.
    assert cluster["AutoTerminate"] is False
    assert cluster["TerminationProtected"] is True
    groups = client.list_instance_groups(ClusterId=cluster_id)["InstanceGroups"]
    # Pick the MASTER group; next() raises StopIteration if none is listed.
    master_instance_group = next(
        group for group in groups if group["InstanceGroupType"] == "MASTER"
    )
    assert master_instance_group["RequestedInstanceCount"] == 3
    assert master_instance_group["RunningInstanceCount"] == 3