rancher/terraform-provider-rke

AWS health checks consistently failing when creating an HA with rke provider v1.3.3 and up

Closed this issue · 7 comments

Description
When I attempt to use RKE provider version 1.3.3 or higher to create an HA cluster using AWS resources, RKE provisioning on the cluster fails with an error about health checks failing. If I use RKE provider version 1.3.2 or lower, provisioning completes without error. Running rke up from the CLI also works just fine, so it seems to be something specific to either my Terraform configs or the RKE provider.

I originally thought the issue was timing-related, but even after forcing a three-minute wait to ensure the instances had enough time to fully come up, the issue persisted.

Provider Versions

  • RKE Provider version: v1.3.3 (or higher)
  • AWS Provider version: v5.4.0
TF Setup
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "5.4.0"
    }
    rke = {
      source  = "rancher/rke"
      version = "1.4.1"
    }
  }
}

provider "aws" {
  region     = var.aws_region
  access_key = var.aws_access_key
  secret_key = var.aws_secret_key
}

provider "rke" {
  # Configuration options
}

############################# E C 2   I N F R A S T R U C T U R E #############################
############################# I N S T A N C E S #############################
# create 3 instances 
resource "aws_instance" "cluster" {
  count                  = 3
  ami                    = var.aws_ami
  instance_type          = var.aws_instance_type
  subnet_id              = var.aws_subnet_a
  vpc_security_group_ids = [var.aws_security_group]
  key_name               = var.aws_key_name

  root_block_device {
    volume_size = var.aws_instance_size
  }

  tags = {
    Name        = "${var.aws_prefix}-${count.index}"
    Owner       = var.aws_owner_tag
    DoNotDelete = var.aws_do_not_delete_tag
  }
}

# print the instance info
output "instance_public_ip" {
  value = [for instance in aws_instance.cluster : instance.public_ip]
}
output "instance_private_ip" {
  value = [for instance in aws_instance.cluster : instance.private_ip]
}

############################# L O A D   B A L A N C I N G #############################
# create a target group for 80
resource "aws_lb_target_group" "aws_lb_target_group_80" {
  name        = "${var.aws_prefix}-80"
  port        = 80
  protocol    = "TCP"
  target_type = "instance"
  vpc_id      = var.aws_vpc
  health_check {
    interval            = 10
    timeout             = 6
    path                = "/healthz"
    port                = 80
    protocol            = "HTTP"
    healthy_threshold   = 3
    unhealthy_threshold = 3
    matcher             = "200-399"
  }
}

# create a target group for 443
resource "aws_lb_target_group" "aws_lb_target_group_443" {
  name        = "${var.aws_prefix}-443"
  port        = 443
  protocol    = "TCP"
  target_type = "instance"
  vpc_id      = var.aws_vpc
  health_check {
    interval            = 10
    timeout             = 6
    path                = "/healthz"
    port                = 443
    protocol            = "HTTP"
    healthy_threshold   = 3
    unhealthy_threshold = 3
    matcher             = "200-399"
  }
}

# attach instances to the target group 80
resource "aws_lb_target_group_attachment" "attach_tg_80" {
  depends_on       = [aws_lb.aws_lb]
  count            = length(aws_instance.cluster)
  target_group_arn = aws_lb_target_group.aws_lb_target_group_80.arn
  target_id        = aws_instance.cluster[count.index].id
  port             = 80
}

# attach instances to the target group 443
resource "aws_lb_target_group_attachment" "attach_tg_443" {
  depends_on       = [aws_lb.aws_lb]
  count            = length(aws_instance.cluster)
  target_group_arn = aws_lb_target_group.aws_lb_target_group_443.arn
  target_id        = aws_instance.cluster[count.index].id
  port             = 443
}

# create a load balancer
resource "aws_lb" "aws_lb" {
  load_balancer_type = "network"
  name               = "${var.aws_prefix}-lb"
  internal           = false
  ip_address_type    = "ipv4"
  subnets            = [var.aws_subnet_a, var.aws_subnet_b, var.aws_subnet_c]
}

# add a listener for port 80
resource "aws_lb_listener" "aws_lb_listener_80" {
  load_balancer_arn = aws_lb.aws_lb.arn
  port              = "80"
  protocol          = "TCP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.aws_lb_target_group_80.arn
  }
}

# add a listener for port 443
resource "aws_lb_listener" "aws_lb_listener_443" {
  load_balancer_arn = aws_lb.aws_lb.arn
  port              = "443"
  protocol          = "TCP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.aws_lb_target_group_443.arn
  }
}

############################# R O U T E   5 3 #############################
# find route 53 zone id 
data "aws_route53_zone" "zone" {
  name = var.aws_route_zone_name
}

# create a route53 record using the aws_instance
resource "aws_route53_record" "route_53_record" {
  zone_id = data.aws_route53_zone.zone.zone_id
  name    = var.aws_prefix
  type    = "CNAME"
  ttl     = "300"
  records = [aws_lb.aws_lb.dns_name]
}

# print route53 full record
output "route_53_record" {
  value = aws_route53_record.route_53_record.fqdn
}

resource "time_sleep" "wait_for_cluster_ready" {
  create_duration = "180s"

  depends_on = [aws_instance.cluster]
}

############################# K U B E R N E T E S #############################
############################# R K E   C L U S T E R #############################
# create an rke cluster
resource "rke_cluster" "cluster" {
  depends_on         = [time_sleep.wait_for_cluster_ready]
  ssh_key_path       = var.ssh_private_key_path
  kubernetes_version = var.k8s_version
  delay_on_creation  = 180

  nodes {
    address          = aws_instance.cluster[0].public_ip
    internal_address = aws_instance.cluster[0].private_ip
    user             = "ubuntu"
    role             = ["controlplane", "worker", "etcd"]
  }
  nodes {
    address          = aws_instance.cluster[1].public_ip
    internal_address = aws_instance.cluster[1].private_ip
    user             = "ubuntu"
    role             = ["controlplane", "worker", "etcd"]
  }
  nodes {
    address          = aws_instance.cluster[2].public_ip
    internal_address = aws_instance.cluster[2].private_ip
    user             = "ubuntu"
    role             = ["controlplane", "worker", "etcd"]
  }
}

Redacted RKE Logs:
rke_logs.txt

@nickwsuse:

The default value of the argument enable_cri_dockerd is false. However, for k8s v1.24 and later, that value must be set to true.

Adding it should fix the problem.

# create an rke cluster
resource "rke_cluster" "cluster" {
  kubernetes_version = var.k8s_version
  enable_cri_dockerd = true
  ...
}


I'll talk with the team to see if we can change that default behavior.

@nickwsuse Is the customer still blocked on this? Are they able to use TFP RKE 1.3.2 until k8s 1.23 is fully deprecated? Thanks so much.

After taking a deeper look into this issue and talking with the team, we decided to postpone the fix for this issue until RKE v1.23 is deprecated.

The issue is caused by the fact that k8s 1.24+ requires the argument enable_cri_dockerd to be set to true, while the default value of the argument is false.

Due to limitations in the terraform-sdk, we can't have a dynamic default value based on another argument.

We found two solutions:
a) Change the default value of enable_cri_dockerd to true

  • This would change the behavior of clusters using k8s 1.23 or lower.

b) Set enable_cri_dockerd to true at execution time:

  • This would generate inconsistencies in Terraform plans and would result in the resource being re-applied every single time.

Considering the consequences of both fixes, we decided it would be safer to keep this as it is and just change the default value once RKE v1.23 is deprecated.
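In the meantime, a possible user-side workaround is to derive enable_cri_dockerd from the chosen Kubernetes version in your own configuration instead of relying on the provider default. A minimal sketch, assuming the var.k8s_version format used above (e.g. "v1.24.4-rancher1-1"):

locals {
  # Parse the minor version out of strings like "v1.24.4-rancher1-1"
  k8s_minor = tonumber(regex("^v1\\.(\\d+)\\.", var.k8s_version)[0])
}

resource "rke_cluster" "cluster" {
  kubernetes_version = var.k8s_version
  # true for k8s 1.24+, false for 1.23 and lower
  enable_cri_dockerd = local.k8s_minor >= 24
  # ... nodes and other arguments as in the config above ...
}

Because the value is computed from a variable at plan time, it stays consistent across plans and avoids the re-apply behavior described for option (b).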

@a-blender this was actually my own issue; I was learning some Terraform and trying to have a backup script for making HA clusters for when the QA Jenkins job(s) have issues, so there's no customer impact that I'm currently aware of.

Waiting for deprecation of k8s 1.23 makes sense though considering the consequences of the fixes.

QA Test Template

Root Cause

k8s 1.24+ requires the argument enable_cri_dockerd to be set to true. The way the provider is built, we can't just pass it as nil and let RKE take care of it. Changing it to a new default or changing it at execution time would mess up plans.

What was fixed, or what changes have occurred

Documentation was added, along with a validation, so errors are returned if you try to use K8s >= 1.24 without enable_cri_dockerd set to true.
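The same guard can also be sketched on the user side with a resource precondition (a sketch only, mirroring the validation described above rather than the provider's actual implementation; requires Terraform 1.2+ and assumes a hypothetical var.enable_cri_dockerd):

resource "rke_cluster" "cluster" {
  kubernetes_version = var.k8s_version
  enable_cri_dockerd = var.enable_cri_dockerd # hypothetical variable, for illustration
  # ... nodes omitted ...

  lifecycle {
    precondition {
      # Fail at plan time if k8s is 1.24+ and cri-dockerd is not enabled
      condition     = tonumber(regex("^v1\\.(\\d+)\\.", var.k8s_version)[0]) < 24 || var.enable_cri_dockerd
      error_message = "Kubernetes >= 1.24 requires enable_cri_dockerd = true."
    }
  }
}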

Areas or cases that should be tested

Provisioning an RKE cluster.

RC: https://github.com/rancher/terraform-provider-rke/releases/tag/v1.4.3-rc2

What areas could experience regressions

None.

Tf example:

resource "rke_cluster" "default_values" {
  delay_on_creation  = 20
  kubernetes_version = "v1.24.4-rancher1-1"
  enable_cri_dockerd = false
  nodes {
    address          = data.aws_instance.instance-1.public_ip
    internal_address = data.aws_instance.instance-1.private_ip
    user             = "ubuntu"
    role             = ["controlplane", "worker", "etcd"]
    ssh_key          = file("~/.ssh/felipe.pem")
  }
}

QA TEST PLAN

Scenarios

Scenario | Test Case
1 | Provision a cluster via TFP RKE w/highest K8s version
2 | Provision a cluster via TFP RKE w/K8s version v1.25, upgrade the cluster to highest K8s version
3 | Provision a cluster via TFP RKE w/K8s version v1.23 and cri-dockerd not set
4 | Provision a cluster via TFP RKE w/K8s version v1.23 and cri-dockerd not set, upgrade to latest K8s version w/cri-dockerd set to true

Validated this against v1.4.3-rc2. See details below:

ENVIRONMENT DETAILS

  • TFP-RKE version: v1.4.3-rc2

TEST RESULT

Scenario | Test Case | Result
1 | Provision a cluster via TFP RKE w/highest K8s version | ✅
2 | Provision a cluster via TFP RKE w/K8s version v1.25, upgrade the cluster to highest K8s version | ✅
3 | Provision a cluster via TFP RKE w/K8s version v1.23 and cri-dockerd not set | ✅
4 | Provision a cluster via TFP RKE w/K8s version v1.23 and cri-dockerd not set, upgrade to latest K8s version w/cri-dockerd set to true | ✅

VALIDATION STEPS

Scenario 1

  1. Navigated to a client machine with Terraform and kubectl installed to install my cluster (3 etcd, 3 cp, 3 worker).
  2. Created a main.tf file with the following configuration:
terraform {
  required_providers {
    rke = {
      source  = "terraform.local/local/rke"
      version = "1.4.3-rc2"
    }
  }
}

provider "rke" {
  log_file = "rke_debug.log"
}

resource "rke_cluster" "main" {
  nodes {
    address = "<retracted>"
    user    = "ec2-user"
    role    = ["controlplane", "worker", "etcd"]
    ssh_key = file("<retracted>")
  }
  nodes {
    address = "<retracted>"
    user    = "ec2-user"
    role    = ["controlplane", "worker", "etcd"]
    ssh_key = file("<retracted>")
  }
  nodes {
    address = "<retracted>"
    user    = "ec2-user"
    role    = ["controlplane", "worker", "etcd"]
    ssh_key = file("<retracted>")
  }
}

output "kube_config" {
  value     = rke_cluster.main.kube_config_yaml
  sensitive = true
}
  3. Ran terraform apply --auto-approve; validated that the cluster was successfully created:
> kubectl get nodes
NAME             STATUS   ROLES                      AGE     VERSION
<retracted>   Ready    controlplane,etcd,worker   73s   v1.26.9
<retracted>    Ready    controlplane,etcd,worker   73s   v1.26.9
<retracted>   Ready    controlplane,etcd,worker   73s   v1.26.9

Scenario 2

  1. Repeated scenario 1, but ensured that I provisioned the cluster with the added field kubernetes_version set to v1.25.14-rancher1-1:
> kubectl get nodes
NAME             STATUS   ROLES                      AGE    VERSION
<retracted>   Ready    controlplane,etcd,worker   110s   v1.25.14
<retracted>    Ready    controlplane,etcd,worker   110s   v1.25.14
<retracted>    Ready    controlplane,etcd,worker   110s   v1.25.14
  2. Updated the main.tf to use v1.26.9-rancher1-1 as the latest K8s version; ran terraform apply --auto-approve with successful results:
> kubectl get nodes
NAME             STATUS   ROLES                      AGE     VERSION
<retracted>   Ready    controlplane,etcd,worker   6m10s   v1.26.9
<retracted>   Ready    controlplane,etcd,worker   6m10s   v1.26.9
<retracted>    Ready    controlplane,etcd,worker   6m10s   v1.26.9

Scenario 3

  1. Navigated to a client machine with Terraform and kubectl installed to install my cluster (3 etcd, 3 cp, 3 worker).
  2. Created a main.tf file with the following configuration:
terraform {
  required_providers {
    rke = {
      source  = "terraform.local/local/rke"
      version = "1.4.3-rc2"
    }
  }
}

provider "rke" {
  log_file = "rke_debug.log"
}

resource "rke_cluster" "main" {
  nodes {
    address = "<retracted>"
    user    = "ec2-user"
    role    = ["controlplane", "worker", "etcd"]
    ssh_key = file("<retracted>")
  }
  nodes {
    address = "<retracted>"
    user    = "ec2-user"
    role    = ["controlplane", "worker", "etcd"]
    ssh_key = file("<retracted>")
  }
  nodes {
    address = "<retracted>"
    user    = "ec2-user"
    role    = ["controlplane", "worker", "etcd"]
    ssh_key = file("<retracted>")
  }
  kubernetes_version = "v1.23.16-rancher2-3"
}

output "kube_config" {
  value     = rke_cluster.main.kube_config_yaml
  sensitive = true
}
  3. Ran terraform apply --auto-approve; validated that the cluster was successfully created:
> kubectl get nodes
NAME             STATUS   ROLES                      AGE   VERSION
<retracted>   Ready    controlplane,etcd,worker   80s   v1.23.16
<retracted>   Ready    controlplane,etcd,worker   80s   v1.23.16
<retracted>    Ready    controlplane,etcd,worker   80s   v1.23.16

Scenario 4

  1. Repeated scenario 3, steps 1-3.
  2. Modified the main.tf file to set kubernetes_version to v1.26.9-rancher1-1 and set enable_cri_dockerd to true (the resulting arguments are sketched after the output below); validated that the cluster upgraded:
> kubectl get nodes
NAME             STATUS   ROLES                      AGE   VERSION
<retracted>   Ready    controlplane,etcd,worker   10m   v1.26.9
<retracted>    Ready    controlplane,etcd,worker   10m   v1.26.9
<retracted>    Ready    controlplane,etcd,worker   10m   v1.26.9
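
For reference, the Scenario 4 upgrade in step 2 amounts to adding these two arguments to the rke_cluster block from Scenario 3 (nodes unchanged; values taken from the step above):

resource "rke_cluster" "main" {
  # ... nodes unchanged from Scenario 3 ...
  kubernetes_version = "v1.26.9-rancher1-1"
  enable_cri_dockerd = true
}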