terraform-google-modules/terraform-google-lb-internal

Failed Healthcheck

moos3 opened this issue · 3 comments

moos3 commented

So using the latest version of the module in the following:

// Client internal client load balancer IP address
resource "google_compute_address" "es_client_ilb" {
  name         = "${var.cluster_name}-client-ilb"
  address_type = "INTERNAL"
  subnetwork    = data.google_compute_subnetwork.default.self_link
  project      = var.project
}

// Client internal load balancer
module "es_client_ilb" {
  source     = "GoogleCloudPlatform/lb-internal/google"
  version    = "~> 2.0"
  project    = var.project
  region     = var.region
  name       = "${var.cluster_name}-client-ilb"
  ip_address = google_compute_address.es_client_ilb.address
  ports      = ["9200", "9300"]
  health_check = {
    type                = "http"
    check_interval_sec  = 1
    healthy_threshold   = 4
    timeout_sec         = 1
    unhealthy_threshold = 5
    proxy_header        = "NONE"
    port                = 9200
    port_name           = "health-check-port"
    request_path        = "/"
  }
  source_tags = ["${var.cluster_name}-kibana", "${var.cluster_name}-external"]
  target_tags = ["${var.cluster_name}-client"]
  network     = "default"
  subnetwork  = "default"

  backends = [
    {
      group = module.es_client.instance_group
      description = "elasticsearch-clients"
    },
  ]
}

When I go into the console it's showing that no healthly nodes. My MIG looks like this

module "es_client" {
  source        = "../../../modules/terraform-elasticsearch"
  cluster_name  = var.cluster_name
  name          = "${var.cluster_name}-client"
  region        = var.region
  zones         = var.zones
  num_nodes     = var.client_num_nodes
  machine_type  = var.client_machine_type
  heap_size     = var.client_heap_size
  masters_count = format("%d", floor(var.master_num_nodes / 2 + 1))
  master_node   = false
  data_node     = false
  access_config = []
  network       = data.google_compute_network.default.self_link
  subnetwork    = data.google_compute_subnetwork.default.self_link
  subnetwork_project = var.subnetwork_project_id
  project       = var.project
  node_tags     = [var.cluster_name]
  hostname      = "es-client"

  source_image_family = "debian-9"
  source_image_project =  "debian-cloud"

  node_labels   = {
    environment = "staging"
    department  = "engineering"
    application = "elasticsearch"
    terraform_created = "true"
  }  
  named_ports        = local.named_ports
  service_account = {
    email  = "1048047287745-compute@developer.gserviceaccount.com"
    scopes = ["https://www.googleapis.com/auth/cloud-platform"]
  }
}

Which is calling in the terraform-elasticsearch module.

data "template_file" "node-startup-script" {
  template = file("${path.module}/config/user_data.sh")

  vars = {
    project_id             = var.project
    zones                  = join(",", var.zones)
    elasticsearch_data_dir = var.elasticsearch_data_dir
    elasticsearch_logs_dir = var.elasticsearch_logs_dir
    heap_size              = var.heap_size
    cluster_name           = var.cluster_name
    minimum_master_nodes   = var.masters_count
    master                 = var.master_node ? "true" : "false"
    data                   = var.data_node ? "true" : "false"
    ingest                 = var.ingest_node ? "true" : "false"
    http_enabled           = var.http_enabled ? "true" : "false"
    security_enabled       = var.security_enabled ? "true" : "false"
    monitoring_enabled     = var.monitoring_enabled ? "true" : "false"
  }
}


module "instance_template" {
  source                  = "terraform-google-modules/vm/google//modules/instance_template"
  version                 = "1.3.0"

  project_id              = var.project
  machine_type            = var.machine_type
  tags                    = var.node_tags
  labels                  = var.node_labels
  startup_script          = data.template_file.node-startup-script.rendered

  /* network */
  network                 = var.network
  subnetwork              = var.subnetwork
  subnetwork_project      = var.subnetwork_project
  can_ip_forward          = var.can_ip_forward

  /* image */
  source_image            = var.source_image
  source_image_family     = var.source_image_family
  source_image_project    = var.source_image_project

  /* disks */
  disk_size_gb            = var.disk_size_gb
  disk_type               = var.disk_type
  auto_delete             = var.auto_delete
  additional_disks        = var.additional_disks

  service_account         = var.service_account
  
}


module "node" {
  source                  = "terraform-google-modules/vm/google//modules/mig"
  version                 = "1.3.0"
  project_id                = var.project
  network                   = var.network
 /* subnetwork                = var.subnetwork
  subnetwork_project        = var.subnetwork_project */
  hostname                  = var.hostname
  region                    = var.region
  instance_template         = module.instance_template.self_link
  target_size               = var.num_nodes
  target_pools              = var.target_pools
  distribution_policy_zones = var.distribution_policy_zones
  update_policy             = var.update_policy
  named_ports               = var.named_ports
  min_replicas              = var.num_nodes
}


data "google_compute_region_instance_group" "default" {
  self_link = module.node.self_link
}

Cluster firewall rule is as such

// Cluster firewall
resource "google_compute_firewall" "cluster" {
  name    = var.cluster_name
  network = data.google_compute_network.default.self_link
  project = var.project

  allow {
    protocol = "tcp"
    ports    = ["9200", "9300"]
  }

  source_tags = [var.cluster_name, "${var.cluster_name}-external",var.k8s_cluster_tag,"elasticsearch"]
  target_tags = [var.cluster_name, "elasticsearch"]
}

I'm at a lost on why this wouldn't work. I have confirmed that the nodes are recieving traffic if call them by IP address from other nodes in my project.

moos3 commented

This makes me wonder if my firewall rule isn't correct or my MIG needs its own health check.

Transferring this to the lb-internal repo since it looks like that's what you're using.

In the console, can you check if the health check looks to be correctly configured?

Looking at your config in more detail, I think I see a potential issue with your firewall config:

In the LB config, you specify network tags as:

module "es_client_ilb" {
  source     = "GoogleCloudPlatform/lb-internal/google"
...
  target_tags = ["${var.cluster_name}-client"]
}

But in the MIG config, they're set as:

module "es_client" {
  source        = "../../../modules/terraform-elasticsearch"
...
  node_tags     = [var.cluster_name]
}