elasticsearchv8 cluster_settings unmarshal error

Related issue: #509

I was testing elasticsearch_exporter against Elasticsearch 8.x and hit an unmarshal error when the --collector.clustersettings flag is enabled.

Environment
- elasticsearch_exporter: v1.7.0
- elasticsearch: 8.11.3 (and all versions that include the new setting "cluster.routing.allocation.disk.watermark.low.max_headroom")
- golang: 1.21.3
Steps to reproduce

# Prepare elasticsearch_exporter
go build .
./elasticsearch_exporter --es.uri http://localhost:9200 --log.level debug --collector.clustersettings --es.all --es.indices_settings --es.shards

# run elasticsearchv8 instance
docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e"xpack.security.enabled=false" elasticsearch:8.11.3

# Change default settings. This removes the "cluster.routing.allocation.disk.watermark.low" field in the Defaults section
curl -XPUT -H"Content-Type: application/json" http://localhost:9200/_cluster/settings -d'{"transient":{"cluster.routing.allocation.disk.watermark.low": "90%"}}'

Expected result: "elasticsearch_clustersettings_allocation_watermark_low_ratio" metric in /metrics

# HELP elasticsearch_clustersettings_allocation_watermark_low_ratio Low watermark for disk usage as a ratio.
# TYPE elasticsearch_clustersettings_allocation_watermark_low_ratio gauge
elasticsearch_clustersettings_allocation_watermark_low_ratio 0.9

Actual result: unmarshal error shown in elasticsearch_exporter logs and the expected metric is not exported in /metrics

level=error ts=2023-12-25T09:56:11.492789Z caller=collector.go:189 msg="collector failed" name=clustersettings duration_seconds=0.015642208 err="json: cannot unmarshal object into Go struct field clusterSettingsWatermark.defaults.cluster.routing.allocation.disk.watermark.low of type string"

Possible solution: apply flat_settings: true when invoking "GET _cluster/settings" and refactor the struct to match the flat_settings format

Example:

elasticsearch_exporter/collector/cluster_settings.go

Lines 113 to 147 in b24d0ac

    
           // clusterSettingsSection models a single section of the nested (non-flat)
           // cluster settings document, such as "defaults" or "persistent".
           type clusterSettingsSection struct {
           	// Cluster holds everything found under the section's "cluster" key.
           	Cluster clusterSettingsCluster `json:"cluster"`
           }
        
           // clusterSettingsCluster models the "cluster" object of a settings section.
           type clusterSettingsCluster struct {
           	// Routing holds the "routing" sub-object.
           	Routing clusterSettingsRouting `json:"routing"`
           	// MaxShardsPerNode is deliberately untyped: the JSON value is either a
           	// string or an object, and the object form does not contain the value
           	// we are interested in.
           	MaxShardsPerNode interface{} `json:"max_shards_per_node"`
           }
        
           // clusterSettingsRouting models the "routing" object, i.e. the cluster's
           // shard routing configuration.
           type clusterSettingsRouting struct {
           	// Allocation holds the "allocation" sub-object.
           	Allocation clusterSettingsAllocation `json:"allocation"`
           }
        
           // clusterSettingsAllocation models the "allocation" object, i.e. the shard
           // routing allocation settings.
           type clusterSettingsAllocation struct {
           	// Enabled is the raw string stored under the "enable" key.
           	Enabled string `json:"enable"`
           	// Disk holds the "disk" sub-object.
           	Disk clusterSettingsDisk `json:"disk"`
           }
        
           // clusterSettingsDisk models the "disk" object, i.e. the disk-based shard
           // allocation settings.
           type clusterSettingsDisk struct {
           	// ThresholdEnabled is the raw string stored under "threshold_enabled".
           	ThresholdEnabled string `json:"threshold_enabled"`
           	// Watermark holds the "watermark" sub-object.
           	Watermark clusterSettingsWatermark `json:"watermark"`
           }
        
           // clusterSettingsWatermark represents the disk allocation watermark settings
           // found under cluster.routing.allocation.disk.watermark.
           //
           // Each watermark is normally a plain string such as "90%". With
           // include_defaults=true, however, the defaults section can report a
           // watermark key as a nested object instead (for example
           // "low": {"max_headroom": "-1"} or "flood_stage": {"frozen": "95%"}). A plain
           // string field cannot decode that shape and would fail the whole response,
           // so the type implements UnmarshalJSON and leaves such fields empty.
           type clusterSettingsWatermark struct {
           	FloodStage string `json:"flood_stage"`
           	High       string `json:"high"`
           	Low        string `json:"low"`
           }

           // UnmarshalJSON decodes a watermark object. Values of flood_stage, high and
           // low that are JSON strings are stored; absent keys, nulls and non-string
           // values (such as nested objects) leave the corresponding field unchanged.
           // It returns an error only when data itself is not a valid JSON object.
           func (w *clusterSettingsWatermark) UnmarshalJSON(data []byte) error {
           	// Decode into untyped fields first so that an object value does not
           	// abort decoding of the entire settings response.
           	var raw struct {
           		FloodStage interface{} `json:"flood_stage"`
           		High       interface{} `json:"high"`
           		Low        interface{} `json:"low"`
           	}
           	if err := json.Unmarshal(data, &raw); err != nil {
           		return err
           	}

           	// Keep only the values that really are strings.
           	if s, ok := raw.FloodStage.(string); ok {
           		w.FloodStage = s
           	}
           	if s, ok := raw.High.(string); ok {
           		w.High = s
           	}
           	if s, ok := raw.Low.(string); ok {
           		w.Low = s
           	}
           	return nil
           }

// clusterSettingsResponse is the top-level document decoded from the
// cluster settings endpoint; each field is one section of that document.
type clusterSettingsResponse struct {
	// Defaults is the "defaults" section.
	Defaults clusterSettingsSection `json:"defaults"`

	// Persistent is the "persistent" section.
	Persistent clusterSettingsSection `json:"persistent"`

	// Transient is the "transient" section.
	Transient clusterSettingsSection `json:"transient"`
}

// clusterSettingsSection holds one section of a cluster settings response in
// its flat form, where every setting is a top-level key named by its full
// dotted path. Keys without a matching field are ignored when decoding.
type clusterSettingsSection struct {
	// cluster.max_shards_per_node
	ClusterMaxShardsPerNode string `json:"cluster.max_shards_per_node"`

	// cluster.routing.allocation.balance.*
	ClusterRoutingAllocationBalanceDiskUsage string `json:"cluster.routing.allocation.balance.disk_usage"`
	ClusterRoutingAllocationBalanceIndex     string `json:"cluster.routing.allocation.balance.index"`
	ClusterRoutingAllocationBalanceShard     string `json:"cluster.routing.allocation.balance.shard"`
	ClusterRoutingAllocationBalanceThreshold string `json:"cluster.routing.allocation.balance.threshold"`
	ClusterRoutingAllocationBalanceWriteLoad string `json:"cluster.routing.allocation.balance.write_load"`

	// cluster.routing.allocation.enable
	ClusterRoutingAllocationEnable string `json:"cluster.routing.allocation.enable"`

	// cluster.routing.allocation.disk.*
	ClusterRoutingAllocationDiskThresholdEnabled    string `json:"cluster.routing.allocation.disk.threshold_enabled"`
	ClusterRoutingAllocationDiskWatermarkFloodStage string `json:"cluster.routing.allocation.disk.watermark.flood_stage"`
	ClusterRoutingAllocationDiskWatermarkHigh       string `json:"cluster.routing.allocation.disk.watermark.high"`
	ClusterRoutingAllocationDiskWatermarkLow        string `json:"cluster.routing.allocation.disk.watermark.low"`
}

elasticsearch_exporter/collector/cluster_settings.go

Line 152 in b24d0ac

q.Set("include_defaults", "true")

...
u := c.u.ResolveReference(&url.URL{Path: "_cluster/settings"})
q := u.Query()
q.Set("flat_settings", "true")
q.Set("include_defaults", "true")
...

I can reproduce the problem, on elasticsearch_exporter 1.7.0, with ES 8.12.0 and ES 7.17;

Setting only cluster.routing.allocation.disk.watermark.low triggers the unmarshal error on one cluster, but on another cluster that setting is fine and it fails on cluster.routing.allocation.disk.watermark.flood_stage instead 🤔

It would be really helpful to have an example of the API response from elasticsearch to use in our tests. cluster_settings_test.go has tests to cover the cluster settings endpoint including the watermark metrics. From what I understand, based on the provided error message, the problem is that instead of a string, there is now a nested json object being returned for the watermark metrics.

This could be a scenario where we need to use different structs based on elasticsearch version or customize the unmarshal.

Hello! Here is an extract of a simple _cluster/settings route on a ES 7.17 cluster where I have the problem:

{                        
  "persistent" : {                     
    "cluster" : {
      "routing" : {
        "allocation" : {
          "disk" : {                  
            "watermark" : {               
              "low" : "88%",
              "flood_stage" : "100%",
              "high" : "93%"
            }                
          }
        }
      }
    }
  }
}

AFAIK, the format/hierarchy is the same in ES 8 and hasn't changed for ages 🤔

Ooooh, by querying it with include_defaults=true (like the exporter), we can see the following in the defaults section:
ES 7.17:

          "disk" : {
            "threshold_enabled" : "true",
            "watermark" : {
              "enable_for_single_data_node" : "false",
              "flood_stage" : {
                "frozen" : "95%",
                "frozen.max_headroom" : "20GB"
              }
            },
            "include_relocations" : "true",
            "reroute_interval" : "60s"

ES 8.12:

          "disk" : {
            "threshold_enabled" : "true",
            "reroute_interval" : "60s",
            "watermark" : {
              "flood_stage" : {
                "frozen" : "95%",
                "frozen.max_headroom" : "20GB",
                "max_headroom" : "-1"
              },
              "high" : {
                "max_headroom" : "-1"
              },
              "low" : {
                "max_headroom" : "-1"
              },
              "enable_for_single_data_node" : "true"
            }
          },

So we can have an entry in persistent for cluster.routing.allocation.disk.watermark.low (a string) and an entry in defaults for cluster.routing.allocation.disk.watermark.low.max_headroom (which makes "low" an object there). Could the problem come from here?

I suspect this is the same bug here, but we are seeing it with another metric in the clustersettings collector:

json: cannot unmarshal object into Go struct field clusterSettingsWatermark.defaults.cluster.routing.allocation.disk.watermark.flood_stage of type string

We are on version ES 7.17.6 and the exporter version is 1.7.0. And this is the JSON response to /_cluster/settings:

{
  "persistent" : {
    "cluster" : {
      "routing" : {
        "allocation" : {
          "disk" : {
            "watermark" : {
              "low" : "92%",
              "flood_stage" : "97%",
              "high" : "95%"
            }
          }
        }
      },
      "max_shards_per_node" : "2000"
    },
    "xpack" : {
      "monitoring" : {
        "collection" : {
          "enabled" : "true"
        }
      }
    }
  },
  "transient" : { }
}

	// clusterSettingsSection models a single section of the nested (non-flat)
	// cluster settings document, such as "persistent" or "transient".
	type clusterSettingsSection struct {
		// Cluster holds everything found under the section's "cluster" key.
		Cluster clusterSettingsCluster `json:"cluster"`
	}

	// clusterSettingsCluster models the "cluster" object of a settings section.
	type clusterSettingsCluster struct {
		// Routing holds the "routing" sub-object.
		Routing clusterSettingsRouting `json:"routing"`
		// MaxShardsPerNode is deliberately untyped: the JSON value is either a
		// string or an object, and the object form does not contain the value
		// we are interested in.
		MaxShardsPerNode interface{} `json:"max_shards_per_node"`
	}

	// clusterSettingsRouting models the "routing" object, i.e. the cluster's
	// shard routing configuration.
	type clusterSettingsRouting struct {
		// Allocation holds the "allocation" sub-object.
		Allocation clusterSettingsAllocation `json:"allocation"`
	}

	// clusterSettingsAllocation models the "allocation" object, i.e. the shard
	// routing allocation settings.
	type clusterSettingsAllocation struct {
		// Enabled is the raw string stored under the "enable" key.
		Enabled string `json:"enable"`
		// Disk holds the "disk" sub-object.
		Disk clusterSettingsDisk `json:"disk"`
	}

	// clusterSettingsDisk models the "disk" object, i.e. the disk-based shard
	// allocation settings.
	type clusterSettingsDisk struct {
		// ThresholdEnabled is the raw string stored under "threshold_enabled".
		ThresholdEnabled string `json:"threshold_enabled"`
		// Watermark holds the "watermark" sub-object.
		Watermark clusterSettingsWatermark `json:"watermark"`
	}

	// clusterSettingsWatermark represents the disk allocation watermark settings
	// found under cluster.routing.allocation.disk.watermark.
	//
	// Each watermark is normally a plain string such as "92%". With
	// include_defaults=true, however, the defaults section can report a
	// watermark key as a nested object instead (for example
	// "low": {"max_headroom": "-1"} or "flood_stage": {"frozen": "95%"}). A plain
	// string field cannot decode that shape and would fail the whole response,
	// so the type implements UnmarshalJSON and leaves such fields empty.
	type clusterSettingsWatermark struct {
		FloodStage string `json:"flood_stage"`
		High       string `json:"high"`
		Low        string `json:"low"`
	}

	// UnmarshalJSON decodes a watermark object. Values of flood_stage, high and
	// low that are JSON strings are stored; absent keys, nulls and non-string
	// values (such as nested objects) leave the corresponding field unchanged.
	// It returns an error only when data itself is not a valid JSON object.
	func (w *clusterSettingsWatermark) UnmarshalJSON(data []byte) error {
		// Decode into untyped fields first so that an object value does not
		// abort decoding of the entire settings response.
		var raw struct {
			FloodStage interface{} `json:"flood_stage"`
			High       interface{} `json:"high"`
			Low        interface{} `json:"low"`
		}
		if err := json.Unmarshal(data, &raw); err != nil {
			return err
		}

		// Keep only the values that really are strings.
		if s, ok := raw.FloodStage.(string); ok {
			w.FloodStage = s
		}
		if s, ok := raw.High.(string); ok {
			w.High = s
		}
		if s, ok := raw.Low.(string); ok {
			w.Low = s
		}
		return nil
	}