AWS EMR Terraform module

Terraform module which creates AWS EMR resources.

This module supports the creation of:

EMR clusters using instance fleets or instance groups deployed in public or private subnets
EMR Virtual clusters that run on Amazon EKS
EMR Serverless clusters
EMR Studios
Security groups for master, core, and task nodes
Security group for EMR service to support private clusters
IAM roles for autoscaling, EMR service, and EC2 instance profiles

ℹ️ The appropriate resources have been tagged with { "for-use-with-amazon-emr-managed-policies" = true } to support the use of the recommended IAM policy "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2". Users are required to tag the appropriate VPC resources (VPC and subnets) as needed. See the "Amazon EMR managed policies" section of the Amazon EMR Management Guide for more details regarding v2 of managed EMR policies and their usage requirements.

Usage

Private Cluster w/ Instance Fleet

module "emr" {
  source = "terraform-aws-modules/emr/aws"

  name = "example-instance-fleet"

  release_label = "emr-6.9.0"
  applications  = ["spark", "trino"]
  auto_termination_policy = {
    idle_timeout = 3600
  }

  bootstrap_action = {
    example = {
      path = "file:/bin/echo",
      name = "Just an example",
      args = ["Hello World!"]
    }
  }

  configurations_json = jsonencode([
    {
      "Classification" : "spark-env",
      "Configurations" : [
        {
          "Classification" : "export",
          "Properties" : {
            "JAVA_HOME" : "/usr/lib/jvm/java-1.8.0"
          }
        }
      ],
      "Properties" : {}
    }
  ])

  master_instance_fleet = {
    name                      = "master-fleet"
    target_on_demand_capacity = 1
    instance_type_configs = [
      {
        instance_type = "m5.xlarge"
      }
    ]
  }

  core_instance_fleet = {
    name                      = "core-fleet"
    target_on_demand_capacity = 2
    target_spot_capacity      = 2
    instance_type_configs = [
      {
        instance_type     = "c4.large"
        weighted_capacity = 1
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        ebs_config = [{
          size                 = 256
          type                 = "gp3"
          volumes_per_instance = 1
        }]
        instance_type     = "c5.xlarge"
        weighted_capacity = 2
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        instance_type                              = "c6i.xlarge"
        weighted_capacity                          = 2
      }
    ]
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
      }
    }
  }

  task_instance_fleet = {
    name                      = "task-fleet"
    target_on_demand_capacity = 1
    target_spot_capacity      = 2
    instance_type_configs = [
      {
        instance_type     = "c4.large"
        weighted_capacity = 1
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        ebs_config = [{
          size                 = 256
          type                 = "gp3"
          volumes_per_instance = 1
        }]
        instance_type     = "c5.xlarge"
        weighted_capacity = 2
      }
    ]
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
      }
    }
  }

  ebs_root_volume_size = 64
  ec2_attributes = {
    # Subnets should be private subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_ids = ["subnet-abcde012", "subnet-bcde012a", "subnet-fghi345a"]
  }
  vpc_id = "vpc-1234556abcdef"

  list_steps_states  = ["PENDING", "RUNNING", "FAILED", "INTERRUPTED"]
  log_uri            = "s3://my-elasticmapreduce-bucket/"

  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level = 3
  termination_protection = false
  visible_to_all_users   = true

  tags = {
    Terraform   = "true"
    Environment = "dev"
  }
}

Public Cluster w/ Instance Fleet

Configuration is the same as the private version shown above except for the following changes noted below. Users should utilize S3 and EMR VPC endpoints for private connectivity and to avoid data transfer charges across NAT gateways.

...
  ec2_attributes = {
    # Subnets should be public subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_ids = ["subnet-xyzde987", "subnet-slkjf456", "subnet-qeiru789"]
  }

  # Required for creating public cluster
  is_private_cluster = false
...

Private Cluster w/ Instance Group

module "emr" {
  source = "terraform-aws-modules/emr/aws"

  name = "example-instance-group"

  release_label = "emr-6.9.0"
  applications  = ["spark", "trino"]
  auto_termination_policy = {
    idle_timeout = 3600
  }

  bootstrap_action = {
    example = {
      name = "Just an example",
      path = "file:/bin/echo",
      args = ["Hello World!"]
    }
  }

  configurations_json = jsonencode([
    {
      "Classification" : "spark-env",
      "Configurations" : [
        {
          "Classification" : "export",
          "Properties" : {
            "JAVA_HOME" : "/usr/lib/jvm/java-1.8.0"
          }
        }
      ],
      "Properties" : {}
    }
  ])

  master_instance_group = {
    name           = "master-group"
    instance_count = 1
    instance_type  = "m5.xlarge"
  }

  core_instance_group = {
    name           = "core-group"
    instance_count = 2
    instance_type  = "c4.large"
  }

  task_instance_group = {
    name           = "task-group"
    instance_count = 2
    instance_type  = "c5.xlarge"
    bid_price      = "0.1"

    ebs_config = [{
      size                 = 256
      type                 = "gp3"
      volumes_per_instance = 1
    }]
    ebs_optimized = true
  }

  ebs_root_volume_size = 64
  ec2_attributes = {
    # Instance groups only support one Subnet/AZ
    # Subnets should be private subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_id = "subnet-abcde012"
  }
  vpc_id = "vpc-1234556abcdef"

  list_steps_states  = ["PENDING", "RUNNING", "FAILED", "INTERRUPTED"]
  log_uri            = "s3://my-elasticmapreduce-bucket/"

  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level = 3
  termination_protection = false
  visible_to_all_users   = true

  tags = {
    Terraform   = "true"
    Environment = "dev"
  }
}

Public Cluster w/ Instance Group

...
  ec2_attributes = {
    # Instance groups only support one Subnet/AZ
    # Subnets should be public subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_id = "subnet-xyzde987"
  }

  # Required for creating public cluster
  is_private_cluster = false
...

Conditional Creation

The following values are provided to toggle on/off creation of the associated resources as desired:

module "emr" {
  source = "terraform-aws-modules/emr/aws"

  # Disables all resources from being created
  create = false

  # Enables the creation of a security configuration for the cluster
  # Configuration should be supplied via the `security_configuration` variable
  create_security_configuration = true

  # Disables the creation of the role used by the service
  # An externally created role must be supplied via the `service_iam_role_arn` variable
  create_service_iam_role = false

  # Disables the creation of the role used for autoscaling
  # An externally created role can be supplied via the `autoscaling_iam_role_arn` variable
  create_autoscaling_iam_role = false

  # Disables the creation of the IAM role/instance profile used by the EC2 instances
  # An externally created IAM instance profile must be supplied
  # via the `iam_instance_profile_name` variable
  create_iam_instance_profile = false

  # Disables the creation of the security groups used by the EC2 instances. Users can supply
  # security groups for `master`, `slave`, and `service` security groups via the
  # `ec2_attributes` map variable. If not, the EMR service will create and associate
  # the necessary security groups. Note - the VPC will need to be tagged with
  # { "for-use-with-amazon-emr-managed-policies" = true } for EMR to create security groups
  # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html
  create_managed_security_groups = false

  is_private_cluster = false
}

Examples

Examples codified under the `examples` directory are intended to give users references for how to use the module(s) as well as testing/validating changes to the source code of the module. If contributing to the project, please be sure to make any appropriate updates to the relevant examples to allow maintainers to test your changes and to keep the examples up to date for users. Thank you!

Private clusters using instance fleet or instance group
Public clusters using instance fleet or instance group
Serverless clusters running Spark or Hive
Studios with either IAM or SSO authentication
Virtual cluster running on Amazon EKS

Requirements

Name	Version
terraform	>= 1.0
aws	>= 5.44

Providers

Name	Version
aws	>= 5.44

Modules

No modules.

Resources

Name	Type
aws_emr_cluster.this	resource
aws_emr_instance_fleet.this	resource
aws_emr_instance_group.this	resource
aws_emr_managed_scaling_policy.this	resource
aws_emr_security_configuration.this	resource
aws_iam_instance_profile.this	resource
aws_iam_policy.service_pass_role	resource
aws_iam_role.autoscaling	resource
aws_iam_role.instance_profile	resource
aws_iam_role.service	resource
aws_iam_role_policy_attachment.autoscaling	resource
aws_iam_role_policy_attachment.instance_profile	resource
aws_iam_role_policy_attachment.service	resource
aws_iam_role_policy_attachment.service_pass_role	resource
aws_security_group.master	resource
aws_security_group.service	resource
aws_security_group.slave	resource
aws_security_group_rule.master	resource
aws_security_group_rule.service	resource
aws_security_group_rule.slave	resource
aws_caller_identity.current	data source
aws_emr_release_labels.this	data source
aws_iam_policy_document.autoscaling	data source
aws_iam_policy_document.instance_profile	data source
aws_iam_policy_document.service	data source
aws_iam_policy_document.service_pass_role	data source
aws_partition.current	data source
aws_region.current	data source

Inputs

Name	Description	Type	Default	Required
additional_info	JSON string for selecting additional features such as adding proxy information. Note: Currently there is no API to retrieve the value of this argument after EMR cluster creation from provider, therefore Terraform cannot detect drift from the actual EMR cluster if its value is changed outside Terraform	`string`	`null`	no
applications	A case-insensitive list of applications for Amazon EMR to install and configure when launching the cluster	`list(string)`	`[]`	no
auto_termination_policy	An auto-termination policy for an Amazon EMR cluster. An auto-termination policy defines the amount of idle time in seconds after which a cluster automatically terminates	`any`	`{}`	no
autoscaling_iam_role_arn	The ARN of an existing IAM role to use for autoscaling	`string`	`null`	no
autoscaling_iam_role_description	Description of the role	`string`	`null`	no
autoscaling_iam_role_name	Name to use on IAM role created	`string`	`null`	no
bootstrap_action	Ordered list of bootstrap actions that will be run before Hadoop is started on the cluster nodes	`any`	`{}`	no
configurations	List of configurations supplied for the EMR cluster you are creating. Supply a configuration object for applications to override their default configuration	`string`	`null`	no
configurations_json	JSON string for supplying list of configurations for the EMR cluster	`string`	`null`	no
core_instance_fleet	Configuration block to use an Instance Fleet for the core node type. Cannot be specified if any `core_instance_group` configuration blocks are set	`any`	`{}`	no
core_instance_group	Configuration block to use an Instance Group for the core node type	`any`	`{}`	no
create	Controls if resources should be created (affects nearly all resources)	`bool`	`true`	no
create_autoscaling_iam_role	Determines whether the autoscaling IAM role should be created	`bool`	`true`	no
create_iam_instance_profile	Determines whether the EC2 IAM role/instance profile should be created	`bool`	`true`	no
create_managed_security_groups	Determines whether managed security groups are created	`bool`	`true`	no
create_security_configuration	Determines whether a security configuration is created	`bool`	`false`	no
create_service_iam_role	Determines whether the service IAM role should be created	`bool`	`true`	no
custom_ami_id	Custom Amazon Linux AMI for the cluster (instead of an EMR-owned AMI). Available in Amazon EMR version 5.7.0 and later	`string`	`null`	no
ebs_root_volume_size	Size in GiB of the EBS root device volume of the Linux AMI that is used for each EC2 instance. Available in Amazon EMR version 4.x and later	`number`	`null`	no
ec2_attributes	Attributes for the EC2 instances running the job flow	`any`	`{}`	no
iam_instance_profile_description	Description of the EC2 IAM role/instance profile	`string`	`null`	no
iam_instance_profile_name	Name to use on EC2 IAM role/instance profile created	`string`	`null`	no
iam_instance_profile_policies	Map of IAM policies to attach to the EC2 IAM role/instance profile	`map(string)`	{ "AmazonElasticMapReduceforEC2Role": "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role" }	no
iam_role_path	IAM role path	`string`	`null`	no
iam_role_permissions_boundary	ARN of the policy that is used to set the permissions boundary for the IAM role	`string`	`null`	no
iam_role_tags	A map of additional tags to add to the IAM role created	`map(string)`	`{}`	no
iam_role_use_name_prefix	Determines whether the IAM role name is used as a prefix	`bool`	`true`	no
is_private_cluster	Identifies whether the cluster is created in a private subnet	`bool`	`true`	no
keep_job_flow_alive_when_no_steps	Whether to keep the cluster running when there are no steps or when all steps are complete (default is on)	`bool`	`null`	no
kerberos_attributes	Kerberos configuration for the cluster	`any`	`{}`	no
list_steps_states	List of step states used to filter returned steps	`list(string)`	`[]`	no
log_encryption_kms_key_id	AWS KMS customer master key (CMK) key ID or arn used for encrypting log files. This attribute is only available with EMR version 5.30.0 and later, excluding EMR 6.0.0	`string`	`null`	no
log_uri	S3 bucket to write the log files of the job flow. If a value is not provided, logs are not created	`string`	`null`	no
managed_scaling_policy	Compute limit configuration for a Managed Scaling Policy	`any`	`{}`	no
managed_security_group_name	Name to use on managed security group created. Note - `-master`, `-slave`, and `-service` will be appended to this name to distinguish	`string`	`null`	no
managed_security_group_tags	A map of additional tags to add to the security group created	`map(string)`	`{}`	no
managed_security_group_use_name_prefix	Determines whether the security group name (`managed_security_group_name`) is used as a prefix	`bool`	`true`	no
master_instance_fleet	Configuration block to use an Instance Fleet for the master node type. Cannot be specified if any `master_instance_group` configuration blocks are set	`any`	`{}`	no
master_instance_group	Configuration block to use an Instance Group for the master node type	`any`	`{}`	no
master_security_group_description	Description of the security group created	`string`	`"Managed master security group"`	no
master_security_group_rules	Security group rules to add to the security group created	`any`	{ "default": { "cidr_blocks": [ "0.0.0.0/0" ], "description": "Allow all egress traffic", "from_port": 0, "ipv6_cidr_blocks": [ "::/0" ], "protocol": "-1", "to_port": 0, "type": "egress" } }	no
name	Name of the job flow	`string`	`""`	no
placement_group_config	The specified placement group configuration	`any`	`{}`	no
release_label	Release label for the Amazon EMR release	`string`	`null`	no
release_label_filters	Map of release label filters used to look up a release label	`any`	{ "default": { "prefix": "emr-6" } }	no
scale_down_behavior	Way that individual Amazon EC2 instances terminate when an automatic scale-in activity occurs or an instance group is resized	`string`	`"TERMINATE_AT_TASK_COMPLETION"`	no
security_configuration	Security configuration to create, or attach if `create_security_configuration` is `false`. Only valid for EMR clusters with `release_label` 4.8.0 or greater	`string`	`null`	no
security_configuration_name	Name of the security configuration to create, or attach if `create_security_configuration` is `false`. Only valid for EMR clusters with `release_label` 4.8.0 or greater	`string`	`null`	no
security_configuration_use_name_prefix	Determines whether `security_configuration_name` is used as a prefix	`bool`	`true`	no
service_iam_role_arn	The ARN of an existing IAM role to use for the service	`string`	`null`	no
service_iam_role_description	Description of the role	`string`	`null`	no
service_iam_role_name	Name to use on IAM role created	`string`	`null`	no
service_iam_role_policies	Map of IAM policies to attach to the service role	`map(string)`	{ "AmazonEMRServicePolicy_v2": "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2" }	no
service_pass_role_policy_description	Description of the policy	`string`	`null`	no
service_pass_role_policy_name	Name to use on IAM policy created	`string`	`null`	no
service_security_group_description	Description of the security group created	`string`	`"Managed service access security group"`	no
service_security_group_rules	Security group rules to add to the security group created	`any`	`{}`	no
slave_security_group_description	Description of the security group created	`string`	`"Managed slave security group"`	no
slave_security_group_rules	Security group rules to add to the security group created	`any`	{ "default": { "cidr_blocks": [ "0.0.0.0/0" ], "description": "Allow all egress traffic", "from_port": 0, "ipv6_cidr_blocks": [ "::/0" ], "protocol": "-1", "to_port": 0, "type": "egress" } }	no
step	Steps to run when creating the cluster	`any`	`{}`	no
step_concurrency_level	Number of steps that can be executed concurrently. You can specify a maximum of 256 steps. Only valid for EMR clusters with `release_label` 5.28.0 or greater (default is 1)	`number`	`null`	no
tags	A map of tags to add to all resources	`map(string)`	`{}`	no
task_instance_fleet	Configuration block to use an Instance Fleet for the task node type. Cannot be specified if any `task_instance_group` configuration blocks are set	`any`	`{}`	no
task_instance_group	Configuration block to use an Instance Group for the task node type	`any`	`{}`	no
termination_protection	Switch on/off termination protection (default is `false`, except when using multiple master nodes). Before attempting to destroy the resource when termination protection is enabled, this configuration must be applied with its value set to `false`	`bool`	`null`	no
unhealthy_node_replacement	Whether Amazon EMR should gracefully replace core nodes that have degraded within the cluster. Default value is `false`	`bool`	`null`	no
visible_to_all_users	Whether the job flow is visible to all IAM users of the AWS account associated with the job flow. Default value is `true`	`bool`	`null`	no
vpc_id	The ID of the Amazon Virtual Private Cloud (Amazon VPC) where the security groups will be created	`string`	`""`	no

Outputs

Name	Description
autoscaling_iam_role_arn	Autoscaling IAM role ARN
autoscaling_iam_role_name	Autoscaling IAM role name
autoscaling_iam_role_unique_id	Stable and unique string identifying the autoscaling IAM role
cluster_arn	The ARN of the cluster
cluster_core_instance_group_id	Core node type Instance Group ID, if using Instance Group for this node type
cluster_id	The ID of the cluster
cluster_master_instance_group_id	Master node type Instance Group ID, if using Instance Group for this node type
cluster_master_public_dns	The DNS name of the master node. If the cluster is on a private subnet, this is the private DNS name. On a public subnet, this is the public DNS name
iam_instance_profile_arn	ARN assigned by AWS to the instance profile
iam_instance_profile_iam_role_arn	Instance profile IAM role ARN
iam_instance_profile_iam_role_name	Instance profile IAM role name
iam_instance_profile_iam_role_unique_id	Stable and unique string identifying the instance profile IAM role
iam_instance_profile_id	Instance profile's ID
iam_instance_profile_unique	Stable and unique string identifying the IAM instance profile
managed_master_security_group_arn	Amazon Resource Name (ARN) of the managed master security group
managed_master_security_group_id	ID of the managed master security group
managed_service_access_security_group_arn	Amazon Resource Name (ARN) of the managed service access security group
managed_service_access_security_group_id	ID of the managed service access security group
managed_slave_security_group_arn	Amazon Resource Name (ARN) of the managed slave security group
managed_slave_security_group_id	ID of the managed slave security group
security_configuration_id	The ID of the security configuration
security_configuration_name	The name of the security configuration
service_iam_role_arn	Service IAM role ARN
service_iam_role_name	Service IAM role name
service_iam_role_unique_id	Stable and unique string identifying the service IAM role

License

Apache-2.0 Licensed. See LICENSE.

Sudokamikaze/terraform-aws-emr