Better EMR coverage and boto3 request/response handling

This revision includes:

- A handler for requests for which content-type is JSON (from boto3).

- A decorator (generate_boto3_response) to convert XML responses to
  JSON (for boto3). This way, existing response templates for boto can
  be shared for generating boto3 responses.

- Utility class/functions to use botocore's service specification data
  (accessible under botocore.data) for type casting, from query
  parameters to Python objects and XML to JSON.

- Updates to response handlers/models to cover more EMR endpoints and
  mockable parameters.
This commit is contained in:
Taro Sato 2016-09-21 20:59:19 -07:00
commit 7cd404808b
10 changed files with 2399 additions and 841 deletions

View file

@ -0,0 +1,73 @@
from __future__ import unicode_literals
import sure # noqa
from moto.core.responses import AWSServiceSpec
from moto.core.responses import flatten_json_request_body
def test_flatten_json_request_body():
    """Verify that flatten_json_request_body expands a nested JSON request
    body into the flat ``member.N`` / ``entry.N`` query-parameter form
    described by the botocore service spec for RunJobFlow."""
    spec = AWSServiceSpec('data/emr/2009-03-31/service-2.json').input_spec('RunJobFlow')
    body = {
        'Name': 'cluster',
        'Instances': {
            'Ec2KeyName': 'ec2key',
            'InstanceGroups': [
                {'InstanceRole': 'MASTER',
                 'InstanceType': 'm1.small'},
                {'InstanceRole': 'CORE',
                 'InstanceType': 'm1.medium'},
            ],
            'Placement': {'AvailabilityZone': 'us-east-1'},
        },
        'Steps': [
            {'HadoopJarStep': {
                'Properties': [
                    {'Key': 'k1', 'Value': 'v1'},
                    {'Key': 'k2', 'Value': 'v2'}
                ],
                'Args': ['arg1', 'arg2']}},
        ],
        'Configurations': [
            {'Classification': 'class',
             'Properties': {'propkey1': 'propkey1',
                            'propkey2': 'propkey2'}},
            {'Classification': 'anotherclass',
             'Properties': {'propkey3': 'propkey3'}},
        ]
    }

    flat = flatten_json_request_body('', body, spec)

    # Scalars (and scalars nested in structures) map straight through
    # to dotted keys.
    flat['Name'].should.equal(body['Name'])
    flat['Instances.Ec2KeyName'].should.equal(body['Instances']['Ec2KeyName'])
    flat['Instances.Placement.AvailabilityZone'].should.equal(
        body['Instances']['Placement']['AvailabilityZone'])

    # Lists of structures become 1-based ``member.N`` entries.
    for pos, group in enumerate(body['Instances']['InstanceGroups'], 1):
        prefix = 'Instances.InstanceGroups.member.{0}'.format(pos)
        flat[prefix + '.InstanceRole'].should.equal(group['InstanceRole'])
        flat[prefix + '.InstanceType'].should.equal(group['InstanceType'])

    for pos, raw_step in enumerate(body['Steps'], 1):
        prefix = 'Steps.member.{0}.HadoopJarStep'.format(pos)
        step = raw_step['HadoopJarStep']
        # Walk the flattened Properties members until the keys run out,
        # so the test also fails if extra members appear.
        member = 0
        while prefix + '.Properties.member.{0}.Key'.format(member + 1) in flat:
            member_key = prefix + '.Properties.member.{0}'.format(member + 1)
            flat[member_key + '.Key'].should.equal(step['Properties'][member]['Key'])
            flat[member_key + '.Value'].should.equal(step['Properties'][member]['Value'])
            member += 1
        member = 0
        while prefix + '.Args.member.{0}'.format(member + 1) in flat:
            flat[prefix + '.Args.member.{0}'.format(member + 1)].should.equal(step['Args'][member])
            member += 1

    # Maps become ``entry.N.key`` / ``entry.N.value`` pairs; rebuild each
    # Properties dict from the flat form and compare it to the input.
    for pos, config in enumerate(body['Configurations'], 1):
        flat['Configurations.member.{0}.Classification'.format(pos)].should.equal(
            config['Classification'])
        rebuilt = {}
        entry = 1
        keyfmt = 'Configurations.member.{0}.Properties.entry.{1}'
        key = keyfmt.format(pos, entry)
        while key + '.key' in flat:
            rebuilt[flat[key + '.key']] = flat[key + '.value']
            entry += 1
            key = keyfmt.format(pos, entry)
        rebuilt.should.equal(config['Properties'])

View file

@ -1,197 +1,111 @@
from __future__ import unicode_literals
import boto
from boto.emr.bootstrap_action import BootstrapAction
from boto.emr.instance_group import InstanceGroup
from boto.emr.step import StreamingStep
import six
import sure # noqa
from moto import mock_emr
from tests.helpers import requires_boto_gte
@mock_emr
def test_create_job_flow_in_multiple_regions():
step = StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
run_jobflow_args = dict(
job_flow_role='EMR_EC2_DefaultRole',
keep_alive=True,
log_uri='s3://some_bucket/jobflow_logs',
master_instance_type='c1.medium',
name='My jobflow',
num_instances=2,
service_role='EMR_DefaultRole',
slave_instance_type='c1.medium',
)
west1_conn = boto.emr.connect_to_region('us-east-1')
west1_job_id = west1_conn.run_jobflow(
name='us-east-1',
log_uri='s3://some_bucket/jobflow_logs',
master_instance_type='m1.medium',
slave_instance_type='m1.small',
steps=[step],
)
west2_conn = boto.emr.connect_to_region('eu-west-1')
west2_job_id = west2_conn.run_jobflow(
name='eu-west-1',
log_uri='s3://some_bucket/jobflow_logs',
master_instance_type='m1.medium',
slave_instance_type='m1.small',
steps=[step],
)
west1_job_flow = west1_conn.describe_jobflow(west1_job_id)
west1_job_flow.name.should.equal('us-east-1')
west2_job_flow = west2_conn.describe_jobflow(west2_job_id)
west2_job_flow.name.should.equal('eu-west-1')
input_instance_groups = [
InstanceGroup(1, 'MASTER', 'c1.medium', 'ON_DEMAND', 'master'),
InstanceGroup(3, 'CORE', 'c1.medium', 'ON_DEMAND', 'core'),
InstanceGroup(6, 'TASK', 'c1.large', 'SPOT', 'task-1', '0.07'),
InstanceGroup(10, 'TASK', 'c1.xlarge', 'SPOT', 'task-2', '0.05'),
]
@mock_emr
def test_create_job_flow():
def test_describe_cluster():
conn = boto.connect_emr()
step1 = StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
step2 = StreamingStep(
name='My wordcount example2',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input2',
output='s3n://output_bucket/output/wordcount_output2'
)
job_id = conn.run_jobflow(
name='My jobflow',
args = run_jobflow_args.copy()
args.update(dict(
api_params={
'Applications.member.1.Name': 'Spark',
'Applications.member.1.Version': '2.4.2',
'Configurations.member.1.Classification': 'yarn-site',
'Configurations.member.1.Properties.entry.1.key': 'someproperty',
'Configurations.member.1.Properties.entry.1.value': 'somevalue',
'Instances.EmrManagedMasterSecurityGroup': 'master-security-group',
'Instances.Ec2SubnetId': 'subnet-8be41cec',
},
availability_zone='us-east-2b',
ec2_keyname='mykey',
job_flow_role='EMR_EC2_DefaultRole',
keep_alive=False,
log_uri='s3://some_bucket/jobflow_logs',
master_instance_type='m1.medium',
slave_instance_type='m1.small',
steps=[step1, step2],
)
job_flow = conn.describe_jobflow(job_id)
job_flow.state.should.equal('STARTING')
job_flow.jobflowid.should.equal(job_id)
job_flow.name.should.equal('My jobflow')
job_flow.masterinstancetype.should.equal('m1.medium')
job_flow.slaveinstancetype.should.equal('m1.small')
job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs')
job_flow.visibletoallusers.should.equal('False')
int(job_flow.normalizedinstancehours).should.equal(0)
job_step = job_flow.steps[0]
job_step.name.should.equal('My wordcount example')
job_step.state.should.equal('STARTING')
args = [arg.value for arg in job_step.args]
args.should.equal([
'-mapper',
's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
'-reducer',
'aggregate',
'-input',
's3n://elasticmapreduce/samples/wordcount/input',
'-output',
's3n://output_bucket/output/wordcount_output',
])
job_step2 = job_flow.steps[1]
job_step2.name.should.equal('My wordcount example2')
job_step2.state.should.equal('PENDING')
args = [arg.value for arg in job_step2.args]
args.should.equal([
'-mapper',
's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
'-reducer',
'aggregate',
'-input',
's3n://elasticmapreduce/samples/wordcount/input2',
'-output',
's3n://output_bucket/output/wordcount_output2',
])
@requires_boto_gte("2.8")
@mock_emr
def test_create_job_flow_with_new_params():
# Test that run_jobflow works with newer params
conn = boto.connect_emr()
conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
master_instance_type='m1.medium',
slave_instance_type='m1.small',
job_flow_role='some-role-arn',
steps=[],
)
@requires_boto_gte("2.8")
@mock_emr
def test_create_job_flow_visible_to_all_users():
conn = boto.connect_emr()
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[],
service_role='EMR_DefaultRole',
visible_to_all_users=True,
)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('True')
))
cluster_id = conn.run_jobflow(**args)
input_tags = {'tag1': 'val1', 'tag2': 'val2'}
conn.add_tags(cluster_id, input_tags)
cluster = conn.describe_cluster(cluster_id)
cluster.applications[0].name.should.equal('Spark')
cluster.applications[0].version.should.equal('2.4.2')
cluster.autoterminate.should.equal('true')
@requires_boto_gte("2.8")
@mock_emr
def test_create_job_flow_with_instance_groups():
conn = boto.connect_emr()
# configurations appear not be supplied as attributes?
instance_groups = [InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07'),
InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')]
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[],
instance_groups=instance_groups
)
attrs = cluster.ec2instanceattributes
# AdditionalMasterSecurityGroups
# AdditionalSlaveSecurityGroups
attrs.ec2availabilityzone.should.equal(args['availability_zone'])
attrs.ec2keyname.should.equal(args['ec2_keyname'])
attrs.ec2subnetid.should.equal(args['api_params']['Instances.Ec2SubnetId'])
# EmrManagedMasterSecurityGroups
# EmrManagedSlaveSecurityGroups
attrs.iaminstanceprofile.should.equal(args['job_flow_role'])
# ServiceAccessSecurityGroup
job_flow = conn.describe_jobflow(job_id)
int(job_flow.instancecount).should.equal(12)
instance_group = job_flow.instancegroups[0]
int(instance_group.instancerunningcount).should.equal(6)
cluster.id.should.equal(cluster_id)
cluster.loguri.should.equal(args['log_uri'])
cluster.masterpublicdnsname.should.be.a(six.string_types)
cluster.name.should.equal(args['name'])
int(cluster.normalizedinstancehours).should.equal(0)
# cluster.release_label
cluster.shouldnt.have.property('requestedamiversion')
cluster.runningamiversion.should.equal('1.0.0')
# cluster.securityconfiguration
cluster.servicerole.should.equal(args['service_role'])
cluster.status.state.should.equal('TERMINATED')
cluster.status.statechangereason.message.should.be.a(six.string_types)
cluster.status.statechangereason.code.should.be.a(six.string_types)
cluster.status.timeline.creationdatetime.should.be.a(six.string_types)
# cluster.status.timeline.enddatetime.should.be.a(six.string_types)
# cluster.status.timeline.readydatetime.should.be.a(six.string_types)
dict((item.key, item.value) for item in cluster.tags).should.equal(input_tags)
cluster.terminationprotected.should.equal('false')
cluster.visibletoallusers.should.equal('true')
@mock_emr
def test_terminate_job_flow():
def test_describe_jobflows():
conn = boto.connect_emr()
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[]
)
flow = conn.describe_jobflows()[0]
flow.state.should.equal('STARTING')
conn.terminate_jobflow(job_id)
flow = conn.describe_jobflows()[0]
flow.state.should.equal('TERMINATED')
@mock_emr
def test_describe_job_flows():
conn = boto.connect_emr()
job1_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[]
)
job2_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[]
)
job1_id = conn.run_jobflow(**run_jobflow_args)
job2_id = conn.run_jobflow(**run_jobflow_args)
jobs = conn.describe_jobflows()
jobs.should.have.length_of(2)
@ -205,252 +119,454 @@ def test_describe_job_flows():
@mock_emr
def test_add_steps_to_flow():
def test_describe_jobflow():
conn = boto.connect_emr()
args = run_jobflow_args.copy()
args.update(dict(
ami_version='3.8.1',
api_params={
#'Applications.member.1.Name': 'Spark',
#'Applications.member.1.Version': '2.4.2',
#'Configurations.member.1.Classification': 'yarn-site',
#'Configurations.member.1.Properties.entry.1.key': 'someproperty',
#'Configurations.member.1.Properties.entry.1.value': 'somevalue',
#'Instances.EmrManagedMasterSecurityGroup': 'master-security-group',
'Instances.Ec2SubnetId': 'subnet-8be41cec',
},
ec2_keyname='mykey',
hadoop_version='2.4.0',
step1 = StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[step1]
)
keep_alive=True,
master_instance_type='c1.medium',
slave_instance_type='c1.medium',
num_instances=2,
availability_zone='us-west-2b',
job_flow_role='EMR_EC2_DefaultRole',
service_role='EMR_DefaultRole',
visible_to_all_users=True,
))
cluster_id = conn.run_jobflow(**args)
jf = conn.describe_jobflow(cluster_id)
jf.amiversion.should.equal(args['ami_version'])
jf.bootstrapactions.should.equal(None)
jf.creationdatetime.should.be.a(six.string_types)
jf.should.have.property('laststatechangereason')
jf.readydatetime.should.be.a(six.string_types)
jf.startdatetime.should.be.a(six.string_types)
jf.state.should.equal('WAITING')
jf.ec2keyname.should.equal(args['ec2_keyname'])
# Ec2SubnetId
jf.hadoopversion.should.equal(args['hadoop_version'])
int(jf.instancecount).should.equal(2)
for ig in jf.instancegroups:
ig.creationdatetime.should.be.a(six.string_types)
# ig.enddatetime.should.be.a(six.string_types)
ig.should.have.property('instancegroupid').being.a(six.string_types)
int(ig.instancerequestcount).should.equal(1)
ig.instancerole.should.be.within(['MASTER', 'CORE'])
int(ig.instancerunningcount).should.equal(1)
ig.instancetype.should.equal('c1.medium')
ig.laststatechangereason.should.be.a(six.string_types)
ig.market.should.equal('ON_DEMAND')
ig.name.should.be.a(six.string_types)
ig.readydatetime.should.be.a(six.string_types)
ig.startdatetime.should.be.a(six.string_types)
ig.state.should.equal('RUNNING')
jf.keepjobflowalivewhennosteps.should.equal('true')
jf.masterinstanceid.should.be.a(six.string_types)
jf.masterinstancetype.should.equal(args['master_instance_type'])
jf.masterpublicdnsname.should.be.a(six.string_types)
int(jf.normalizedinstancehours).should.equal(0)
jf.availabilityzone.should.equal(args['availability_zone'])
jf.slaveinstancetype.should.equal(args['slave_instance_type'])
jf.terminationprotected.should.equal('false')
jf.jobflowid.should.equal(cluster_id)
# jf.jobflowrole.should.equal(args['job_flow_role'])
jf.loguri.should.equal(args['log_uri'])
jf.name.should.equal(args['name'])
# jf.servicerole.should.equal(args['service_role'])
jf.steps.should.have.length_of(0)
list(i.value for i in jf.supported_products).should.equal([])
jf.visibletoallusers.should.equal('true')
@mock_emr
def test_list_clusters():
    """ListClusters should report both running and terminated clusters
    with the expected state and timeline fields."""
    conn = boto.connect_emr()

    # Launch two clusters, then terminate the second one so both the
    # WAITING and TERMINATED paths are covered.
    params = run_jobflow_args.copy()
    params['name'] = 'jobflow1'
    cluster1_id = conn.run_jobflow(**params)
    params['name'] = 'jobflow2'
    cluster2_id = conn.run_jobflow(**params)
    conn.terminate_jobflow(cluster2_id)

    clusters = conn.list_clusters().clusters
    clusters.should.have.length_of(2)

    expected = {
        cluster1_id: {
            'id': cluster1_id,
            'name': 'jobflow1',
            'normalizedinstancehours': 0,
            'state': 'WAITING'},
        cluster2_id: {
            'id': cluster2_id,
            'name': 'jobflow2',
            'normalizedinstancehours': 0,
            'state': 'TERMINATED'},
    }

    for cluster in clusters:
        want = expected[cluster.id]
        cluster.id.should.equal(want['id'])
        cluster.name.should.equal(want['name'])
        int(cluster.normalizedinstancehours).should.equal(want['normalizedinstancehours'])
        cluster.status.state.should.equal(want['state'])
        cluster.status.timeline.creationdatetime.should.be.a(six.string_types)
        # Only a terminated cluster carries an end timestamp.
        if want['state'] == 'TERMINATED':
            cluster.status.timeline.enddatetime.should.be.a(six.string_types)
        else:
            cluster.status.timeline.shouldnt.have.property('enddatetime')
        cluster.status.timeline.readydatetime.should.be.a(six.string_types)
@mock_emr
def test_run_jobflow():
conn = boto.connect_emr()
args = run_jobflow_args.copy()
job_id = conn.run_jobflow(**args)
job_flow = conn.describe_jobflow(job_id)
job_flow.state.should.equal('STARTING')
job_flow.state.should.equal('WAITING')
job_flow.jobflowid.should.equal(job_id)
job_flow.name.should.equal('My jobflow')
job_flow.loguri.should.equal('s3://some_bucket/jobflow_logs')
step2 = StreamingStep(
name='My wordcount example2',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input2',
output='s3n://output_bucket/output/wordcount_output2'
)
conn.add_jobflow_steps(job_id, [step2])
job_flow = conn.describe_jobflow(job_id)
job_step = job_flow.steps[0]
job_step.name.should.equal('My wordcount example')
job_step.state.should.equal('STARTING')
args = [arg.value for arg in job_step.args]
args.should.equal([
'-mapper',
's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
'-reducer',
'aggregate',
'-input',
's3n://elasticmapreduce/samples/wordcount/input',
'-output',
's3n://output_bucket/output/wordcount_output',
])
job_step2 = job_flow.steps[1]
job_step2.name.should.equal('My wordcount example2')
job_step2.state.should.equal('PENDING')
args = [arg.value for arg in job_step2.args]
args.should.equal([
'-mapper',
's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
'-reducer',
'aggregate',
'-input',
's3n://elasticmapreduce/samples/wordcount/input2',
'-output',
's3n://output_bucket/output/wordcount_output2',
])
job_flow.name.should.equal(args['name'])
job_flow.masterinstancetype.should.equal(args['master_instance_type'])
job_flow.slaveinstancetype.should.equal(args['slave_instance_type'])
job_flow.loguri.should.equal(args['log_uri'])
job_flow.visibletoallusers.should.equal('false')
int(job_flow.normalizedinstancehours).should.equal(0)
job_flow.steps.should.have.length_of(0)
@mock_emr
def test_create_instance_groups():
conn = boto.connect_emr()
def test_run_jobflow_in_multiple_regions():
regions = {}
for region in ['us-east-1', 'eu-west-1']:
conn = boto.emr.connect_to_region(region)
args = run_jobflow_args.copy()
args['name'] = region
cluster_id = conn.run_jobflow(**args)
regions[region] = {'conn': conn, 'cluster_id': cluster_id}
step1 = StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[step1],
)
instance_group = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
instance_group = conn.add_instance_groups(job_id, [instance_group])
instance_group_id = instance_group.instancegroupids
job_flow = conn.describe_jobflows()[0]
int(job_flow.instancecount).should.equal(6)
instance_group = job_flow.instancegroups[0]
instance_group.instancegroupid.should.equal(instance_group_id)
int(instance_group.instancerunningcount).should.equal(6)
instance_group.instancerole.should.equal('TASK')
instance_group.instancetype.should.equal('c1.medium')
instance_group.market.should.equal('SPOT')
instance_group.name.should.equal('spot-0.07')
instance_group.bidprice.should.equal('0.07')
@mock_emr
def test_modify_instance_groups():
conn = boto.connect_emr()
step1 = StreamingStep(
name='My wordcount example',
mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
reducer='aggregate',
input='s3n://elasticmapreduce/samples/wordcount/input',
output='s3n://output_bucket/output/wordcount_output'
)
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[step1]
)
instance_group1 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
instance_group2 = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
instance_group = conn.add_instance_groups(job_id, [instance_group1, instance_group2])
instance_group_ids = instance_group.instancegroupids.split(",")
job_flow = conn.describe_jobflows()[0]
int(job_flow.instancecount).should.equal(12)
instance_group = job_flow.instancegroups[0]
int(instance_group.instancerunningcount).should.equal(6)
conn.modify_instance_groups(instance_group_ids, [2, 3])
job_flow = conn.describe_jobflows()[0]
int(job_flow.instancecount).should.equal(5)
instance_group1 = [
group for group
in job_flow.instancegroups
if group.instancegroupid == instance_group_ids[0]
][0]
int(instance_group1.instancerunningcount).should.equal(2)
instance_group2 = [
group for group
in job_flow.instancegroups
if group.instancegroupid == instance_group_ids[1]
][0]
int(instance_group2.instancerunningcount).should.equal(3)
for region in regions.keys():
conn = regions[region]['conn']
jf = conn.describe_jobflow(regions[region]['cluster_id'])
jf.name.should.equal(region)
@requires_boto_gte("2.8")
@mock_emr
def test_set_visible_to_all_users():
def test_run_jobflow_with_new_params():
# Test that run_jobflow works with newer params
conn = boto.connect_emr()
conn.run_jobflow(**run_jobflow_args)
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[],
visible_to_all_users=False,
)
@requires_boto_gte("2.8")
@mock_emr
def test_run_jobflow_with_visible_to_all_users():
conn = boto.connect_emr()
for expected in (True, False):
job_id = conn.run_jobflow(
visible_to_all_users=expected,
**run_jobflow_args
)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal(str(expected).lower())
@requires_boto_gte("2.8")
@mock_emr
def test_run_jobflow_with_instance_groups():
input_groups = dict((g.name, g) for g in input_instance_groups)
conn = boto.connect_emr()
job_id = conn.run_jobflow(instance_groups=input_instance_groups,
**run_jobflow_args)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('False')
conn.set_visible_to_all_users(job_id, True)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('True')
conn.set_visible_to_all_users(job_id, False)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('False')
int(job_flow.instancecount).should.equal(sum(g.num_instances for g in input_instance_groups))
for instance_group in job_flow.instancegroups:
expected = input_groups[instance_group.name]
instance_group.should.have.property('instancegroupid')
int(instance_group.instancerunningcount).should.equal(expected.num_instances)
instance_group.instancerole.should.equal(expected.role)
instance_group.instancetype.should.equal(expected.type)
instance_group.market.should.equal(expected.market)
if hasattr(expected, 'bidprice'):
instance_group.bidprice.should.equal(expected.bidprice)
@requires_boto_gte("2.8")
@mock_emr
def test_set_termination_protection():
conn = boto.connect_emr()
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[]
)
job_id = conn.run_jobflow(**run_jobflow_args)
job_flow = conn.describe_jobflow(job_id)
job_flow.terminationprotected.should.equal(u'None')
job_flow.terminationprotected.should.equal('false')
conn.set_termination_protection(job_id, True)
job_flow = conn.describe_jobflow(job_id)
job_flow.terminationprotected.should.equal('true')
conn.set_termination_protection(job_id, False)
job_flow = conn.describe_jobflow(job_id)
job_flow.terminationprotected.should.equal('false')
@requires_boto_gte("2.8")
@mock_emr
def test_list_clusters():
def test_set_visible_to_all_users():
conn = boto.connect_emr()
conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[],
)
args = run_jobflow_args.copy()
args['visible_to_all_users'] = False
job_id = conn.run_jobflow(**args)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('false')
summary = conn.list_clusters()
clusters = summary.clusters
clusters.should.have.length_of(1)
cluster = clusters[0]
cluster.name.should.equal("My jobflow")
cluster.normalizedinstancehours.should.equal('0')
cluster.status.state.should.equal("RUNNING")
conn.set_visible_to_all_users(job_id, True)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('true')
conn.set_visible_to_all_users(job_id, False)
job_flow = conn.describe_jobflow(job_id)
job_flow.visibletoallusers.should.equal('false')
@mock_emr
def test_describe_cluster():
def test_terminate_jobflow():
conn = boto.connect_emr()
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[],
job_id = conn.run_jobflow(**run_jobflow_args)
flow = conn.describe_jobflows()[0]
flow.state.should.equal('WAITING')
conn.terminate_jobflow(job_id)
flow = conn.describe_jobflows()[0]
flow.state.should.equal('TERMINATED')
# testing multiple end points for each feature
@mock_emr
def test_bootstrap_actions():
bootstrap_actions = [
BootstrapAction(
name='bs1',
path='path/to/script',
bootstrap_action_args=['arg1', 'arg2']),
BootstrapAction(
name='bs2',
path='path/to/anotherscript',
bootstrap_action_args=[])
]
conn = boto.connect_emr()
cluster_id = conn.run_jobflow(
bootstrap_actions=bootstrap_actions,
**run_jobflow_args
)
cluster = conn.describe_cluster(job_id)
cluster.name.should.equal("My jobflow")
cluster.normalizedinstancehours.should.equal('0')
cluster.status.state.should.equal("RUNNING")
jf = conn.describe_jobflow(cluster_id)
for x, y in zip(jf.bootstrapactions, bootstrap_actions):
x.name.should.equal(y.name)
x.path.should.equal(y.path)
list(o.value for o in x.args).should.equal(y.args())
resp = conn.list_bootstrap_actions(cluster_id)
for i, y in enumerate(bootstrap_actions):
x = resp.actions[i]
x.name.should.equal(y.name)
x.scriptpath.should.equal(y.path)
list(arg.value for arg in x.args).should.equal(y.args())
@mock_emr
def test_cluster_tagging():
conn = boto.connect_emr()
job_id = conn.run_jobflow(
name='My jobflow',
log_uri='s3://some_bucket/jobflow_logs',
steps=[],
)
cluster_id = job_id
conn.add_tags(cluster_id, {"tag1": "val1", "tag2": "val2"})
def test_instance_groups():
    """Exercise instance groups across several EMR endpoints: RunJobFlow,
    AddInstanceGroups, DescribeJobFlow(s), ListInstanceGroups and
    ModifyInstanceGroups."""
    # Index the expected groups by name for lookup inside the assert loops.
    input_groups = dict((g.name, g) for g in input_instance_groups)
    conn = boto.connect_emr()
    args = run_jobflow_args.copy()
    # Drop the scalar instance parameters; instance_groups replaces them.
    for key in ['master_instance_type', 'slave_instance_type', 'num_instances']:
        del args[key]
    # Launch with only the first two groups (MASTER + CORE) ...
    args['instance_groups'] = input_instance_groups[:2]
    job_id = conn.run_jobflow(**args)
    jf = conn.describe_jobflow(job_id)
    base_instance_count = int(jf.instancecount)
    # ... then add the remaining groups after the cluster is up.
    conn.add_instance_groups(job_id, input_instance_groups[2:])
    jf = conn.describe_jobflow(job_id)
    # DescribeJobFlow view: total count and per-group attributes.
    int(jf.instancecount).should.equal(sum(g.num_instances for g in input_instance_groups))
    for x in jf.instancegroups:
        y = input_groups[x.name]
        if hasattr(y, 'bidprice'):
            x.bidprice.should.equal(y.bidprice)
        x.creationdatetime.should.be.a(six.string_types)
        # x.enddatetime.should.be.a(six.string_types)
        x.should.have.property('instancegroupid')
        int(x.instancerequestcount).should.equal(y.num_instances)
        x.instancerole.should.equal(y.role)
        int(x.instancerunningcount).should.equal(y.num_instances)
        x.instancetype.should.equal(y.type)
        x.laststatechangereason.should.be.a(six.string_types)
        x.market.should.equal(y.market)
        x.name.should.be.a(six.string_types)
        x.readydatetime.should.be.a(six.string_types)
        x.startdatetime.should.be.a(six.string_types)
        x.state.should.equal('RUNNING')
    # ListInstanceGroups view: same groups exposed with boto3-style fields.
    for x in conn.list_instance_groups(job_id).instancegroups:
        y = input_groups[x.name]
        if hasattr(y, 'bidprice'):
            x.bidprice.should.equal(y.bidprice)
        # Configurations
        # EbsBlockDevices
        # EbsOptimized
        x.should.have.property('id')
        x.instancegrouptype.should.equal(y.role)
        x.instancetype.should.equal(y.type)
        x.market.should.equal(y.market)
        x.name.should.equal(y.name)
        int(x.requestedinstancecount).should.equal(y.num_instances)
        int(x.runninginstancecount).should.equal(y.num_instances)
        # ShrinkPolicy
        x.status.state.should.equal('RUNNING')
        x.status.statechangereason.code.should.be.a(six.string_types)
        x.status.statechangereason.message.should.be.a(six.string_types)
        x.status.timeline.creationdatetime.should.be.a(six.string_types)
        # x.status.timeline.enddatetime.should.be.a(six.string_types)
        x.status.timeline.readydatetime.should.be.a(six.string_types)
    # Resize the two TASK groups (to 2 and 3 instances) and check that the
    # new counts are reflected in DescribeJobFlow.
    igs = dict((g.name, g) for g in jf.instancegroups)
    conn.modify_instance_groups(
        [igs['task-1'].instancegroupid, igs['task-2'].instancegroupid],
        [2, 3])
    jf = conn.describe_jobflow(job_id)
    int(jf.instancecount).should.equal(base_instance_count + 5)
    igs = dict((g.name, g) for g in jf.instancegroups)
    int(igs['task-1'].instancerunningcount).should.equal(2)
    int(igs['task-2'].instancerunningcount).should.equal(3)
@mock_emr
def test_steps():
    """Exercise step handling across DescribeJobFlow, AddJobFlowSteps,
    ListSteps and DescribeStep."""
    input_steps = [
        StreamingStep(
            name='My wordcount example',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input',
            output='s3n://output_bucket/output/wordcount_output'),
        StreamingStep(
            name='My wordcount example2',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input2',
            output='s3n://output_bucket/output/wordcount_output2')
    ]
    # TODO: implementation and test for cancel_steps
    conn = boto.connect_emr()
    # Launch with the first step only, then add the second afterwards.
    cluster_id = conn.run_jobflow(
        steps=[input_steps[0]],
        **run_jobflow_args)
    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(1)
    conn.add_jobflow_steps(cluster_id, [input_steps[1]])
    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(2)
    # DescribeJobFlow (boto/legacy) view of the steps.
    for step in jf.steps:
        step.actiononfailure.should.equal('TERMINATE_JOB_FLOW')
        # 8 args: -mapper/-reducer/-input/-output plus their four values.
        list(arg.value for arg in step.args).should.have.length_of(8)
        step.creationdatetime.should.be.a(six.string_types)
        # step.enddatetime.should.be.a(six.string_types)
        step.jar.should.equal('/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        step.laststatechangereason.should.be.a(six.string_types)
        step.mainclass.should.equal('')
        step.name.should.be.a(six.string_types)
        # step.readydatetime.should.be.a(six.string_types)
        # step.startdatetime.should.be.a(six.string_types)
        step.state.should.be.within(['STARTING', 'PENDING'])
    # ListSteps / DescribeStep (boto3-style) view of the same steps.
    expected = dict((s.name, s) for s in input_steps)
    for x in conn.list_steps(cluster_id).steps:
        y = expected[x.name]
        # actiononfailure
        list(arg.value for arg in x.config.args).should.equal([
            '-mapper', y.mapper,
            '-reducer', y.reducer,
            '-input', y.input,
            '-output', y.output,
        ])
        x.config.jar.should.equal('/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        x.config.mainclass.should.equal('')
        # properties
        x.should.have.property('id').should.be.a(six.string_types)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(['STARTING', 'PENDING'])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(six.string_types)
        # x.status.timeline.enddatetime.should.be.a(six.string_types)
        # x.status.timeline.startdatetime.should.be.a(six.string_types)
        # DescribeStep must agree with the ListSteps entry.
        x = conn.describe_step(cluster_id, x.id)
        list(arg.value for arg in x.config.args).should.equal([
            '-mapper', y.mapper,
            '-reducer', y.reducer,
            '-input', y.input,
            '-output', y.output,
        ])
        x.config.jar.should.equal('/home/hadoop/contrib/streaming/hadoop-streaming.jar')
        x.config.mainclass.should.equal('')
        # properties
        x.should.have.property('id').should.be.a(six.string_types)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(['STARTING', 'PENDING'])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(six.string_types)
        # x.status.timeline.enddatetime.should.be.a(six.string_types)
        # x.status.timeline.startdatetime.should.be.a(six.string_types)
@mock_emr
def test_tags():
input_tags = {"tag1": "val1", "tag2": "val2"}
conn = boto.connect_emr()
cluster_id = conn.run_jobflow(**run_jobflow_args)
conn.add_tags(cluster_id, input_tags)
cluster = conn.describe_cluster(cluster_id)
cluster.tags.should.have.length_of(2)
tags = dict((tag.key, tag.value) for tag in cluster.tags)
tags['tag1'].should.equal('val1')
tags['tag2'].should.equal('val2')
dict((t.key, t.value) for t in cluster.tags).should.equal(input_tags)
# Remove a tag
conn.remove_tags(cluster_id, ["tag1"])
conn.remove_tags(cluster_id, list(input_tags.keys()))
cluster = conn.describe_cluster(cluster_id)
cluster.tags.should.have.length_of(1)
tags = dict((tag.key, tag.value) for tag in cluster.tags)
tags['tag2'].should.equal('val2')
cluster.tags.should.have.length_of(0)

View file

@ -1,46 +1,586 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from copy import deepcopy
import boto3
import six
import sure # noqa
from botocore.exceptions import ClientError
from nose.tools import assert_raises
from moto import mock_emr
# Baseline keyword arguments reused (via deepcopy) across the boto3 EMR tests.
run_job_flow_args = {
    'Instances': {
        'InstanceCount': 3,
        'KeepJobFlowAliveWhenNoSteps': True,
        'MasterInstanceType': 'c3.medium',
        'Placement': {'AvailabilityZone': 'us-east-1a'},
        'SlaveInstanceType': 'c3.xlarge',
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'LogUri': 's3://mybucket/log',
    'Name': 'cluster',
    'ServiceRole': 'EMR_DefaultRole',
    'VisibleToAllUsers': True,
}

# One instance group per role; the SPOT (TASK) groups also carry a bid price.
input_instance_groups = [
    {
        'InstanceCount': 1,
        'InstanceRole': 'MASTER',
        'InstanceType': 'c1.medium',
        'Market': 'ON_DEMAND',
        'Name': 'master',
    },
    {
        'InstanceCount': 3,
        'InstanceRole': 'CORE',
        'InstanceType': 'c1.medium',
        'Market': 'ON_DEMAND',
        'Name': 'core',
    },
    {
        'InstanceCount': 6,
        'InstanceRole': 'TASK',
        'InstanceType': 'c1.large',
        'Market': 'SPOT',
        'Name': 'task-1',
        'BidPrice': '0.07',
    },
    {
        'InstanceCount': 10,
        'InstanceRole': 'TASK',
        'InstanceType': 'c1.xlarge',
        'Market': 'SPOT',
        'Name': 'task-2',
        'BidPrice': '0.05',
    },
]
@mock_emr
def test_describe_cluster():
    """describe_cluster echoes back every mockable RunJobFlow parameter.

    The flattened diff had fused the removed test_run_job_flow body into
    this function (a second `def` line plus its call), which is a syntax
    error; the stale lines are dropped here.
    """
    client = boto3.client('emr', region_name='us-east-1')

    args = deepcopy(run_job_flow_args)
    args['Applications'] = [{'Name': 'Spark', 'Version': '2.4.2'}]
    args['Configurations'] = [
        {'Classification': 'yarn-site',
         'Properties': {'someproperty': 'somevalue'}}]
    args['Instances']['AdditionalMasterSecurityGroups'] = ['additional-master']
    args['Instances']['AdditionalSlaveSecurityGroups'] = ['additional-slave']
    args['Instances']['Ec2KeyName'] = 'mykey'
    args['Instances']['Ec2SubnetId'] = 'subnet-8be41cec'
    args['Instances']['EmrManagedMasterSecurityGroup'] = 'master-security-group'
    args['Instances']['EmrManagedSlaveSecurityGroup'] = 'slave-security-group'
    # Auto-terminate so the cluster is TERMINATED by the time we describe it.
    args['Instances']['KeepJobFlowAliveWhenNoSteps'] = False
    args['Instances']['ServiceAccessSecurityGroup'] = 'service-access-security-group'
    args['Tags'] = [{'Key': 'tag1', 'Value': 'val1'},
                    {'Key': 'tag2', 'Value': 'val2'}]

    cluster_id = client.run_job_flow(**args)['JobFlowId']

    cl = client.describe_cluster(ClusterId=cluster_id)['Cluster']
    cl['Applications'][0]['Name'].should.equal('Spark')
    cl['Applications'][0]['Version'].should.equal('2.4.2')
    cl['AutoTerminate'].should.equal(True)

    config = cl['Configurations'][0]
    config['Classification'].should.equal('yarn-site')
    config['Properties'].should.equal(args['Configurations'][0]['Properties'])

    attrs = cl['Ec2InstanceAttributes']
    attrs['AdditionalMasterSecurityGroups'].should.equal(args['Instances']['AdditionalMasterSecurityGroups'])
    attrs['AdditionalSlaveSecurityGroups'].should.equal(args['Instances']['AdditionalSlaveSecurityGroups'])
    attrs['Ec2AvailabilityZone'].should.equal('us-east-1a')
    attrs['Ec2KeyName'].should.equal(args['Instances']['Ec2KeyName'])
    attrs['Ec2SubnetId'].should.equal(args['Instances']['Ec2SubnetId'])
    attrs['EmrManagedMasterSecurityGroup'].should.equal(args['Instances']['EmrManagedMasterSecurityGroup'])
    attrs['EmrManagedSlaveSecurityGroup'].should.equal(args['Instances']['EmrManagedSlaveSecurityGroup'])
    attrs['IamInstanceProfile'].should.equal(args['JobFlowRole'])
    attrs['ServiceAccessSecurityGroup'].should.equal(args['Instances']['ServiceAccessSecurityGroup'])

    cl['Id'].should.equal(cluster_id)
    cl['LogUri'].should.equal(args['LogUri'])
    cl['MasterPublicDnsName'].should.be.a(six.string_types)
    cl['Name'].should.equal(args['Name'])
    cl['NormalizedInstanceHours'].should.equal(0)
    # cl['ReleaseLabel'].should.equal('emr-5.0.0')
    cl.shouldnt.have.key('RequestedAmiVersion')
    cl['RunningAmiVersion'].should.equal('1.0.0')
    # cl['SecurityConfiguration'].should.be.a(six.string_types)
    cl['ServiceRole'].should.equal(args['ServiceRole'])

    status = cl['Status']
    status['State'].should.equal('TERMINATED')
    # cluster['Status']['StateChangeReason']
    status['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
    # status['Timeline']['EndDateTime'].should.equal(datetime(2014, 1, 24, 2, 19, 46, tzinfo=pytz.utc))
    status['Timeline']['ReadyDateTime'].should.be.a('datetime.datetime')

    dict((t['Key'], t['Value']) for t in cl['Tags']).should.equal(
        dict((t['Key'], t['Value']) for t in args['Tags']))

    cl['TerminationProtected'].should.equal(False)
    cl['VisibleToAllUsers'].should.equal(True)
@mock_emr
def test_describe_job_flows():
    """describe_job_flows returns every flow, or only the ids requested."""
    client = boto3.client('emr', region_name='us-east-1')
    first_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']
    second_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']

    # With no filter, both job flows come back.
    client.describe_job_flows()['JobFlows'].should.have.length_of(2)

    # Filtering by id narrows the result to the single matching flow.
    for job_flow_id in (second_id, first_id):
        flows = client.describe_job_flows(JobFlowIds=[job_flow_id])['JobFlows']
        flows.should.have.length_of(1)
        flows[0]['JobFlowId'].should.equal(job_flow_id)
@mock_emr
def test_describe_job_flow():
    """Checks the DescribeJobFlows response fields for a mocked cluster,
    including execution status, instance attributes and instance groups."""
    client = boto3.client('emr', region_name='us-east-1')

    args = deepcopy(run_job_flow_args)
    args['AmiVersion'] = '3.8.1'
    args['Instances'].update(
        {'Ec2KeyName': 'ec2keyname',
         'Ec2SubnetId': 'subnet-8be41cec',
         'HadoopVersion': '2.4.0'})
    args['VisibleToAllUsers'] = True

    cluster_id = client.run_job_flow(**args)['JobFlowId']

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
    jf['AmiVersion'].should.equal(args['AmiVersion'])
    # No bootstrap actions were configured, so the key must be absent.
    jf.shouldnt.have.key('BootstrapActions')

    esd = jf['ExecutionStatusDetail']
    esd['CreationDateTime'].should.be.a('datetime.datetime')
    # esd['EndDateTime'].should.be.a('datetime.datetime')
    # esd['LastStateChangeReason'].should.be.a(six.string_types)
    esd['ReadyDateTime'].should.be.a('datetime.datetime')
    esd['StartDateTime'].should.be.a('datetime.datetime')
    # KeepJobFlowAliveWhenNoSteps is True, so the flow idles in WAITING.
    esd['State'].should.equal('WAITING')

    attrs = jf['Instances']
    attrs['Ec2KeyName'].should.equal(args['Instances']['Ec2KeyName'])
    attrs['Ec2SubnetId'].should.equal(args['Instances']['Ec2SubnetId'])
    attrs['HadoopVersion'].should.equal(args['Instances']['HadoopVersion'])
    attrs['InstanceCount'].should.equal(args['Instances']['InstanceCount'])
    for ig in attrs['InstanceGroups']:
        # ig['BidPrice']
        ig['CreationDateTime'].should.be.a('datetime.datetime')
        # ig['EndDateTime'].should.be.a('datetime.datetime')
        ig['InstanceGroupId'].should.be.a(six.string_types)
        ig['InstanceRequestCount'].should.be.a(int)
        ig['InstanceRole'].should.be.within(['MASTER', 'CORE'])
        ig['InstanceRunningCount'].should.be.a(int)
        ig['InstanceType'].should.be.within(['c3.medium', 'c3.xlarge'])
        # ig['LastStateChangeReason'].should.be.a(six.string_types)
        ig['Market'].should.equal('ON_DEMAND')
        ig['Name'].should.be.a(six.string_types)
        ig['ReadyDateTime'].should.be.a('datetime.datetime')
        ig['StartDateTime'].should.be.a('datetime.datetime')
        ig['State'].should.equal('RUNNING')
    attrs['KeepJobFlowAliveWhenNoSteps'].should.equal(True)
    # attrs['MasterInstanceId'].should.be.a(six.string_types)
    attrs['MasterInstanceType'].should.equal(args['Instances']['MasterInstanceType'])
    attrs['MasterPublicDnsName'].should.be.a(six.string_types)
    attrs['NormalizedInstanceHours'].should.equal(0)
    attrs['Placement']['AvailabilityZone'].should.equal(args['Instances']['Placement']['AvailabilityZone'])
    attrs['SlaveInstanceType'].should.equal(args['Instances']['SlaveInstanceType'])
    attrs['TerminationProtected'].should.equal(False)

    jf['JobFlowId'].should.equal(cluster_id)
    jf['JobFlowRole'].should.equal(args['JobFlowRole'])
    jf['LogUri'].should.equal(args['LogUri'])
    jf['Name'].should.equal(args['Name'])
    jf['ServiceRole'].should.equal(args['ServiceRole'])
    # No steps or supported products were configured on this flow.
    jf.shouldnt.have.key('Steps')
    jf.shouldnt.have.key('SupportedProducts')
    jf['VisibleToAllUsers'].should.equal(True)
@mock_emr
def test_list_clusters():
    """list_clusters summarizes both running and terminated clusters.

    The flattened diff had fused the removed single-cluster version of
    this test into the new one (a second run_job_flow call plus a
    contradictory length_of(1) assertion); the stale lines are dropped.
    """
    client = boto3.client('emr', region_name='us-east-1')

    args = deepcopy(run_job_flow_args)
    args['Name'] = 'jobflow1'
    cluster1_id = client.run_job_flow(**args)['JobFlowId']
    args['Name'] = 'jobflow2'
    cluster2_id = client.run_job_flow(**args)['JobFlowId']
    # Terminate the second cluster so both lifecycle states are listed.
    client.terminate_job_flows(JobFlowIds=[cluster2_id])

    summary = client.list_clusters()
    clusters = summary['Clusters']
    clusters.should.have.length_of(2)

    expected = {
        cluster1_id: {
            'Id': cluster1_id,
            'Name': 'jobflow1',
            'NormalizedInstanceHours': 0,
            'State': 'WAITING'},
        cluster2_id: {
            'Id': cluster2_id,
            'Name': 'jobflow2',
            'NormalizedInstanceHours': 0,
            'State': 'TERMINATED'},
    }
    for x in clusters:
        y = expected[x['Id']]
        x['Id'].should.equal(y['Id'])
        x['Name'].should.equal(y['Name'])
        x['NormalizedInstanceHours'].should.equal(y['NormalizedInstanceHours'])
        x['Status']['State'].should.equal(y['State'])
        x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
        # Only a terminated cluster carries an EndDateTime.
        if y['State'] == 'TERMINATED':
            x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
        else:
            x['Status']['Timeline'].shouldnt.have.key('EndDateTime')
        x['Status']['Timeline']['ReadyDateTime'].should.be.a('datetime.datetime')
@mock_emr
def test_run_job_flow():
    """A freshly started job flow is WAITING and mirrors its input arguments."""
    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    job_flow_id = client.run_job_flow(**args)['JobFlowId']

    flow = client.describe_job_flows(JobFlowIds=[job_flow_id])['JobFlows'][0]
    flow['ExecutionStatusDetail']['State'].should.equal('WAITING')
    flow['JobFlowId'].should.equal(job_flow_id)
    flow['Name'].should.equal(args['Name'])
    flow['LogUri'].should.equal(args['LogUri'])
    flow['VisibleToAllUsers'].should.equal(args['VisibleToAllUsers'])

    instances = flow['Instances']
    instances['MasterInstanceType'].should.equal(args['Instances']['MasterInstanceType'])
    instances['SlaveInstanceType'].should.equal(args['Instances']['SlaveInstanceType'])
    instances['NormalizedInstanceHours'].should.equal(0)

    # No steps were submitted, so the key must be absent.
    flow.shouldnt.have.key('Steps')
@mock_emr
def test_run_job_flow_with_invalid_params():
    """Specifying both AmiVersion and ReleaseLabel raises ValidationException."""
    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    # AmiVersion and ReleaseLabel are mutually exclusive parameters.
    args['AmiVersion'] = '2.4'
    args['ReleaseLabel'] = 'emr-5.0.0'
    with assert_raises(ClientError) as e:
        client.run_job_flow(**args)
    e.exception.response['Error']['Code'].should.equal('ValidationException')
@mock_emr
def test_run_job_flow_in_multiple_regions():
    """Clusters created in different regions are tracked independently."""
    launched = {}
    for region_name in ['us-east-1', 'eu-west-1']:
        emr = boto3.client('emr', region_name=region_name)
        args = deepcopy(run_job_flow_args)
        # Name each cluster after its region so we can tell them apart.
        args['Name'] = region_name
        launched[region_name] = {
            'client': emr,
            'cluster_id': emr.run_job_flow(**args)['JobFlowId'],
        }

    for region_name in launched.keys():
        info = launched[region_name]
        resp = info['client'].describe_cluster(ClusterId=info['cluster_id'])
        resp['Cluster']['Name'].should.equal(region_name)
@mock_emr
def test_run_job_flow_with_new_params():
    """run_job_flow accepts the extended argument set and returns a flow id."""
    client = boto3.client('emr', region_name='us-east-1')
    client.run_job_flow(**run_job_flow_args).should.have.key('JobFlowId')
@mock_emr
def test_run_job_flow_with_visible_to_all_users():
    """VisibleToAllUsers round-trips through run_job_flow/describe_cluster."""
    client = boto3.client('emr', region_name='us-east-1')
    for visibility in (True, False):
        args = deepcopy(run_job_flow_args)
        args['VisibleToAllUsers'] = visibility
        cluster_id = client.run_job_flow(**args)['JobFlowId']
        described = client.describe_cluster(ClusterId=cluster_id)
        described['Cluster']['VisibleToAllUsers'].should.equal(visibility)
@mock_emr
def test_run_job_flow_with_instance_groups():
    """Instance groups given to run_job_flow appear in list_instance_groups."""
    expected_groups = {group['Name']: group for group in input_instance_groups}
    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    args['Instances'] = {'InstanceGroups': input_instance_groups}
    cluster_id = client.run_job_flow(**args)['JobFlowId']

    listed = client.list_instance_groups(ClusterId=cluster_id)['InstanceGroups']
    for group in listed:
        source = expected_groups[group['Name']]
        group.should.have.key('Id')
        group['RequestedInstanceCount'].should.equal(source['InstanceCount'])
        group['InstanceGroupType'].should.equal(source['InstanceRole'])
        group['InstanceType'].should.equal(source['InstanceType'])
        group['Market'].should.equal(source['Market'])
        # Only the SPOT groups in the fixture define a bid price.
        if 'BidPrice' in source:
            group['BidPrice'].should.equal(source['BidPrice'])
@mock_emr
def test_set_termination_protection():
    """set_termination_protection toggles the flag on and off again."""
    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    args['Instances']['TerminationProtected'] = False
    cluster_id = client.run_job_flow(**args)['JobFlowId']

    described = client.describe_cluster(ClusterId=cluster_id)
    described['Cluster']['TerminationProtected'].should.equal(False)

    for protected in (True, False):
        client.set_termination_protection(JobFlowIds=[cluster_id],
                                          TerminationProtected=protected)
        described = client.describe_cluster(ClusterId=cluster_id)
        described['Cluster']['TerminationProtected'].should.equal(protected)
@mock_emr
def test_set_visible_to_all_users():
    """set_visible_to_all_users flips cluster visibility both ways."""
    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    args['VisibleToAllUsers'] = False
    cluster_id = client.run_job_flow(**args)['JobFlowId']

    described = client.describe_cluster(ClusterId=cluster_id)
    described['Cluster']['VisibleToAllUsers'].should.equal(False)

    for visibility in (True, False):
        client.set_visible_to_all_users(JobFlowIds=[cluster_id],
                                        VisibleToAllUsers=visibility)
        described = client.describe_cluster(ClusterId=cluster_id)
        described['Cluster']['VisibleToAllUsers'].should.equal(visibility)
@mock_emr
def test_terminate_job_flows():
    """terminate_job_flows moves a WAITING cluster to TERMINATED."""
    client = boto3.client('emr', region_name='us-east-1')
    cluster_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']

    state = client.describe_cluster(ClusterId=cluster_id)['Cluster']['Status']['State']
    state.should.equal('WAITING')

    client.terminate_job_flows(JobFlowIds=[cluster_id])

    state = client.describe_cluster(ClusterId=cluster_id)['Cluster']['Status']['State']
    state.should.equal('TERMINATED')
# The tests below exercise multiple EMR endpoints for each feature.
@mock_emr
def test_bootstrap_actions():
    """Bootstrap actions appear in both describe_job_flows and
    list_bootstrap_actions responses."""
    bootstrap_actions = [
        {'Name': 'bs1',
         'ScriptBootstrapAction': {
             'Args': ['arg1', 'arg2'],
             'Path': 'path/to/script'}},
        {'Name': 'bs2',
         'ScriptBootstrapAction': {
             'Path': 'path/to/anotherscript'}}
    ]

    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    args['BootstrapActions'] = bootstrap_actions
    cluster_id = client.run_job_flow(**args)['JobFlowId']

    # describe_job_flows embeds each full action config verbatim.
    job_flow = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
    for reported, configured in zip(job_flow['BootstrapActions'], bootstrap_actions):
        reported['BootstrapActionConfig'].should.equal(configured)

    # list_bootstrap_actions flattens the script path and args.
    listed = client.list_bootstrap_actions(ClusterId=cluster_id)
    for reported, configured in zip(listed['BootstrapActions'], bootstrap_actions):
        reported['Name'].should.equal(configured['Name'])
        if 'Args' in configured['ScriptBootstrapAction']:
            reported['Args'].should.equal(configured['ScriptBootstrapAction']['Args'])
        reported['ScriptPath'].should.equal(configured['ScriptBootstrapAction']['Path'])
@mock_emr
def test_instance_groups():
    """Exercises add/modify/list instance group endpoints end to end.

    Bug fix: the original guarded the BidPrice assertions with
    hasattr(y, 'BidPrice') on a plain dict (always False, so the checks
    never ran) and compared against the literal string 'BidPrice'
    instead of the configured value.
    """
    input_groups = dict((g['Name'], g) for g in input_instance_groups)

    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    # Launch with explicit instance groups instead of the master/slave shorthand.
    for key in ['MasterInstanceType', 'SlaveInstanceType', 'InstanceCount']:
        del args['Instances'][key]
    args['Instances']['InstanceGroups'] = input_instance_groups[:2]
    cluster_id = client.run_job_flow(**args)['JobFlowId']

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
    base_instance_count = jf['Instances']['InstanceCount']

    client.add_instance_groups(JobFlowId=cluster_id,
                               InstanceGroups=input_instance_groups[2:])

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
    jf['Instances']['InstanceCount'].should.equal(
        sum(g['InstanceCount'] for g in input_instance_groups))
    for x in jf['Instances']['InstanceGroups']:
        y = input_groups[x['Name']]
        if 'BidPrice' in y:
            x['BidPrice'].should.equal(y['BidPrice'])
        x['CreationDateTime'].should.be.a('datetime.datetime')
        # x['EndDateTime'].should.be.a('datetime.datetime')
        x.should.have.key('InstanceGroupId')
        x['InstanceRequestCount'].should.equal(y['InstanceCount'])
        x['InstanceRole'].should.equal(y['InstanceRole'])
        x['InstanceRunningCount'].should.equal(y['InstanceCount'])
        x['InstanceType'].should.equal(y['InstanceType'])
        # x['LastStateChangeReason'].should.equal(y['LastStateChangeReason'])
        x['Market'].should.equal(y['Market'])
        x['Name'].should.equal(y['Name'])
        x['ReadyDateTime'].should.be.a('datetime.datetime')
        x['StartDateTime'].should.be.a('datetime.datetime')
        x['State'].should.equal('RUNNING')

    groups = client.list_instance_groups(ClusterId=cluster_id)['InstanceGroups']
    for x in groups:
        y = input_groups[x['Name']]
        if 'BidPrice' in y:
            x['BidPrice'].should.equal(y['BidPrice'])
        # Configurations
        # EbsBlockDevices
        # EbsOptimized
        x.should.have.key('Id')
        x['InstanceGroupType'].should.equal(y['InstanceRole'])
        x['InstanceType'].should.equal(y['InstanceType'])
        x['Market'].should.equal(y['Market'])
        x['Name'].should.equal(y['Name'])
        x['RequestedInstanceCount'].should.equal(y['InstanceCount'])
        x['RunningInstanceCount'].should.equal(y['InstanceCount'])
        # ShrinkPolicy
        x['Status']['State'].should.equal('RUNNING')
        x['Status']['StateChangeReason']['Code'].should.be.a(six.string_types)
        # x['Status']['StateChangeReason']['Message'].should.be.a(six.string_types)
        x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
        # x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
        x['Status']['Timeline']['ReadyDateTime'].should.be.a('datetime.datetime')

    igs = dict((g['Name'], g) for g in groups)
    client.modify_instance_groups(
        InstanceGroups=[
            {'InstanceGroupId': igs['task-1']['Id'],
             'InstanceCount': 2},
            {'InstanceGroupId': igs['task-2']['Id'],
             'InstanceCount': 3}])

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
    jf['Instances']['InstanceCount'].should.equal(base_instance_count + 5)
    igs = dict((g['Name'], g) for g in jf['Instances']['InstanceGroups'])
    igs['task-1']['InstanceRunningCount'].should.equal(2)
    igs['task-2']['InstanceRunningCount'].should.equal(3)
@mock_emr
def test_steps():
    """Steps supplied at launch and via add_job_flow_steps are visible
    through DescribeJobFlows, ListSteps and DescribeStep."""
    input_steps = [{
        'HadoopJarStep': {
            'Args': [
                'hadoop-streaming',
                '-files', 's3://elasticmapreduce/samples/wordcount/wordSplitter.py#wordSplitter.py',
                '-mapper', 'python wordSplitter.py',
                '-input', 's3://elasticmapreduce/samples/wordcount/input',
                '-output', 's3://output_bucket/output/wordcount_output',
                '-reducer', 'aggregate'
            ],
            'Jar': 'command-runner.jar',
        },
        'Name': 'My wordcount example',
    }, {
        'HadoopJarStep': {
            'Args': [
                'hadoop-streaming',
                '-files', 's3://elasticmapreduce/samples/wordcount/wordSplitter2.py#wordSplitter2.py',
                '-mapper', 'python wordSplitter2.py',
                '-input', 's3://elasticmapreduce/samples/wordcount/input2',
                '-output', 's3://output_bucket/output/wordcount_output2',
                '-reducer', 'aggregate'
            ],
            'Jar': 'command-runner.jar',
        },
        'Name': 'My wordcount example2',
    }]

    # TODO: implementation and test for cancel_steps
    client = boto3.client('emr', region_name='us-east-1')
    args = deepcopy(run_job_flow_args)
    # Submit the first step with the job flow, the second one afterwards.
    args['Steps'] = [input_steps[0]]
    cluster_id = client.run_job_flow(**args)['JobFlowId']

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
    jf['Steps'].should.have.length_of(1)

    client.add_job_flow_steps(JobFlowId=cluster_id, Steps=[input_steps[1]])

    jf = client.describe_job_flows(JobFlowIds=[cluster_id])['JobFlows'][0]
    jf['Steps'].should.have.length_of(2)
    for idx, (x, y) in enumerate(zip(jf['Steps'], input_steps)):
        x['ExecutionStatusDetail'].should.have.key('CreationDateTime')
        # x['ExecutionStatusDetail'].should.have.key('EndDateTime')
        # x['ExecutionStatusDetail'].should.have.key('LastStateChangeReason')
        # x['ExecutionStatusDetail'].should.have.key('StartDateTime')
        # Only the first step is STARTING; later submissions queue as PENDING.
        x['ExecutionStatusDetail']['State'].should.equal('STARTING' if idx == 0 else 'PENDING')
        x['StepConfig']['ActionOnFailure'].should.equal('TERMINATE_CLUSTER')
        x['StepConfig']['HadoopJarStep']['Args'].should.equal(y['HadoopJarStep']['Args'])
        x['StepConfig']['HadoopJarStep']['Jar'].should.equal(y['HadoopJarStep']['Jar'])
        if 'MainClass' in y['HadoopJarStep']:
            x['StepConfig']['HadoopJarStep']['MainClass'].should.equal(y['HadoopJarStep']['MainClass'])
        if 'Properties' in y['HadoopJarStep']:
            x['StepConfig']['HadoopJarStep']['Properties'].should.equal(y['HadoopJarStep']['Properties'])
        x['StepConfig']['Name'].should.equal(y['Name'])

    expected = dict((s['Name'], s) for s in input_steps)

    steps = client.list_steps(ClusterId=cluster_id)['Steps']
    steps.should.have.length_of(2)
    for x in steps:
        y = expected[x['Name']]
        x['ActionOnFailure'].should.equal('TERMINATE_CLUSTER')
        x['Config']['Args'].should.equal(y['HadoopJarStep']['Args'])
        x['Config']['Jar'].should.equal(y['HadoopJarStep']['Jar'])
        # x['Config']['MainClass'].should.equal(y['HadoopJarStep']['MainClass'])
        # Properties
        x['Id'].should.be.a(six.string_types)
        x['Name'].should.equal(y['Name'])
        x['Status']['State'].should.be.within(['STARTING', 'PENDING'])
        # StateChangeReason
        x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
        # x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
        # x['Status']['Timeline']['StartDateTime'].should.be.a('datetime.datetime')
        # describe_step is expected to report the same data as list_steps.
        x = client.describe_step(ClusterId=cluster_id, StepId=x['Id'])['Step']
        x['ActionOnFailure'].should.equal('TERMINATE_CLUSTER')
        x['Config']['Args'].should.equal(y['HadoopJarStep']['Args'])
        x['Config']['Jar'].should.equal(y['HadoopJarStep']['Jar'])
        # x['Config']['MainClass'].should.equal(y['HadoopJarStep']['MainClass'])
        # Properties
        x['Id'].should.be.a(six.string_types)
        x['Name'].should.equal(y['Name'])
        x['Status']['State'].should.be.within(['STARTING', 'PENDING'])
        # StateChangeReason
        x['Status']['Timeline']['CreationDateTime'].should.be.a('datetime.datetime')
        # x['Status']['Timeline']['EndDateTime'].should.be.a('datetime.datetime')
        # x['Status']['Timeline']['StartDateTime'].should.be.a('datetime.datetime')
@mock_emr
def test_tags():
    """add_tags attaches tags to a cluster; remove_tags clears them all."""
    input_tags = [{'Key': 'newkey1', 'Value': 'newval1'},
                  {'Key': 'newkey2', 'Value': 'newval2'}]

    client = boto3.client('emr', region_name='us-east-1')
    cluster_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']

    client.add_tags(ResourceId=cluster_id, Tags=input_tags)
    cluster = client.describe_cluster(ClusterId=cluster_id)['Cluster']
    cluster['Tags'].should.have.length_of(2)
    {t['Key']: t['Value'] for t in cluster['Tags']}.should.equal(
        {t['Key']: t['Value'] for t in input_tags})

    # Removing every key leaves the cluster with no Tags entry at all.
    client.remove_tags(ResourceId=cluster_id,
                       TagKeys=[t['Key'] for t in input_tags])
    cluster = client.describe_cluster(ClusterId=cluster_id)['Cluster']
    cluster.shouldnt.have.key('Tags')