Query frontend: too many unhealthy instances in the ring
ahsifer commented
Describe the bug
Once in a while, Loki becomes very slow and even the labels query fails. I have attached log lines from the query frontend, querier, and ingester.
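For context, the labels call that fails is the standard Loki API endpoint, queried through the gateway; something like the following (the node address is a placeholder for one of my worker IPs, and the basic-auth credentials come from the gateway section of the config below):

# Reproduce the failing labels query through the NodePort gateway
# (-k because the gateway serves a self-signed certificate in my setup)
curl -sk -u admin:admin "https://<worker-node>:31000/loki/api/v1/labels"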
To Reproduce
Steps to reproduce the behavior:
helm ls
NAME  NAMESPACE  REVISION  UPDATED                                  STATUS    CHART        APP VERSION
loki  default    1         2024-11-14 03:01:14.312731379 +0300 +03  deployed  loki-6.19.0  3.2.0
Environment:
- Infrastructure: Kubernetes (10 nodes: 1 master and 9 workers; each node has 4 CPUs, 8 GB RAM, and 70 GB of local storage used for the pods' persistent volumes)
- Deployment tool: Helm
Screenshots, Promtail config, or terminal output
These are the logs from the query-frontend:
{"caller":"retry.go:107","code":"Code(500)","end":"2024-11-14T18:15:45.436Z","end_delta":"-57.250601969s","err":"rpc error: code = Code(500) desc = too many unhealthy instances in the ring","length":"5m0s","level":"error","msg":"error processing request","org_id":"fake","query":"{log_level=\"Verbose\"} |= \"\"","query_hash":3288896298,"retry_in":"4.596118935s","start":"2024-11-14T18:10:45.436Z","start_delta":"4m2.749397134s","traceID":"1fdbad8d04dbd3a4","try":2,"ts":"2024-11-14T18:14:48.185404545Z","type":"queryrange.LokiRequest"}
{"caller":"retry.go:107","code":"Code(500)","end":"2024-11-14T18:15:45.414Z","end_delta":"-52.786385373s","err":"rpc error: code = Code(500) desc = too many unhealthy instances in the ring","length":"5m0s","level":"error","msg":"error processing request","org_id":"fake","query":"{log_level=\"Verbose\"} |= \"\"","query_hash":3288896298,"retry_in":"4.464963017s","start":"2024-11-14T18:10:45.414Z","start_delta":"4m7.213613697s","traceID":"29cec51b93debe20","try":3,"ts":"2024-11-14T18:14:52.627635884Z","type":"queryrange.LokiRequest"}
{"cache_chunk_bytes_fetched":0,"cache_chunk_bytes_stored":0,"cache_chunk_download_time":"0s","cache_chunk_hit":0,"cache_chunk_req":0,"cache_index_download_time":"0s","cache_index_hit":0,"cache_index_req":0,"cache_result_download_time":"0s","cache_result_hit":0,"cache_result_query_length_served":"0s","cache_result_req":0,"cache_stats_results_download_time":"0s","cache_stats_results_hit":0,"cache_stats_results_req":0,"cache_volume_results_download_time":"0s","cache_volume_results_hit":0,"cache_volume_results_req":0,"caller":"metrics.go:223","chunk_refs_fetch_time":"2.837299ms","component":"frontend","congestion_control_latency":"0s","disable_pipeline_wrappers":"false","duration":"8.963556604s","end_delta":"-52.791194475s","index_bloom_filter_ratio":"0.00","index_post_bloom_filter_chunks":0,"index_shard_resolver_duration":"0s","index_total_chunks":0,"ingester_chunk_compressed_bytes":"24kB","ingester_chunk_decompressed_bytes":"268kB","ingester_chunk_downloaded":0,"ingester_chunk_head_bytes":"23kB","ingester_chunk_matches":1,"ingester_chunk_refs":0,"ingester_post_filter_lines":197,"ingester_requests":3,"latency":"fast","length":"5m0s","level":"info","limit":500,"lines_per_second":21,"org_id":"fake","pipeline_wrapper_filtered_lines":0,"post_filter_lines":197,"query":"{log_level=\"Verbose\"} |= ``","query_hash":2248683790,"query_referenced_structured_metadata":false,"query_type":"limited","queue_time":"227µs","range_type":"range","returned_lines":0,"shards":1,"splits":0,"start_delta":"4m7.208805422s","status":"200","step":"200ms","store_chunks_download_time":"0s","throughput":"32kB","total_bytes":"291kB","total_bytes_structured_metadata":"1.8kB","total_entries":84,"total_lines":197,"traceID":"1fdbad8d04dbd3a4","ts":"2024-11-14T18:14:52.644858332Z"}
The following attached images are from the distributor pods' ring status page.
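In case the screenshots don't capture everything, this is roughly how I pull the same ring status from a distributor (the service name is an assumption based on the chart's naming for a release called loki):

# Forward the distributor's HTTP port and fetch the ingester ring status page
kubectl port-forward svc/loki-distributor 3100:3100 &
curl -s http://127.0.0.1:3100/ring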
The following is my Loki configuration:
global:
  dnsService: "kube-dns"
  dnsNamespace: "kube-system"
deploymentMode: Distributed
loki:
  auth_enabled: false
  schemaConfig:
    configs:
      - from: "2024-10-25"
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  storage_config:
    tsdb_shipper:
      active_index_directory: /var/loki/tsdb-shipper-active
      cache_location: /var/loki/tsdb-shipper-cache
      cache_ttl: 48h0m0s
      resync_interval: 5m
      index_gateway_client:
        server_address: '{{ include "loki.indexGatewayAddress" . }}'
    hedging:
      at: "300ms"
      up_to: 2
      max_per_second: 15
  server:
    http_listen_port: 3100
    grpc_listen_port: 9095
    http_server_read_timeout: 5m
    http_server_write_timeout: 5m
    http_listen_conn_limit: 0
    register_instrumentation: true
    http_server_idle_timeout: 5m
    log_level: debug
    log_format: "json"
    log_source_ips_enabled: true
    log_request_headers: true
    grpc_server_max_concurrent_streams: 256
  limits_config:
    discover_service_name: []
    unordered_writes: true
    reject_old_samples: true
    reject_old_samples_max_age: 1w
    max_cache_freshness_per_query: 5m
    split_queries_by_interval: 3h
    tsdb_max_query_parallelism: 256
    query_timeout: 5m
    volume_enabled: true
    max_entries_limit_per_query: 10000
    retention_period: 4320h0m0s
    max_query_lookback: 4320h0m0s
    max_global_streams_per_user: 0
    max_streams_per_user: 0
    per_stream_rate_limit: 16MB
    per_stream_rate_limit_burst: 64MB
    max_streams_matchers_per_query: 300
    max_query_series: 300
    max_query_length: 31d1h
    tsdb_max_bytes_per_shard: 600MB
    ingestion_burst_size_mb: 100
    ingestion_rate_mb: 100000
    ingestion_rate_strategy: local
    max_line_size: 256KB
    max_line_size_truncate: true
  commonConfig:
    replication_factor: 1
  ingester:
    chunk_encoding: snappy
    chunk_retain_period: 10s
    chunk_idle_period: 1h
    max_chunk_age: 2h0m0s
    wal:
      enabled: true
      checkpoint_duration: 5m0s
      dir: /var/loki/wal
      flush_on_shutdown: true
      replay_memory_ceiling: 3GB
  frontend:
    log_queries_longer_than: 10s
  querier:
    max_concurrent: 16
    query_ingesters_within: 3h0m0s
  compactor:
    retention_enabled: true
    working_directory: /tmp/loki/compactor
    delete_request_store: s3
    compaction_interval: 10m
    retention_delete_delay: 1h
    retention_delete_worker_count: 32
  tracing:
    enabled: true
  storage:
    type: s3
    bucketNames:
      chunks: "loki-test"
    s3:
      endpoint: s3.net
      secretAccessKey: ****
      accessKeyId: ****
      signatureVersion: v4
      s3ForcePathStyle: true
      insecure: false
      http_config:
        timeout: 15s
        insecure_skip_verify: true
      backoff_config:
        min_period: 100ms
        max_period: 3s
        max_retries: 5
ingester:
  replicas: 3
  maxUnavailable: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
  zoneAwareReplication:
    enabled: false
  persistence:
    enabled: true
    claims:
      - name: data
        size: 25G
        mountPath: /var/loki/
        storageClass: local-storage
querier:
  replicas: 3
  maxUnavailable: 2
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 6
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 50
  extraArgs:
gateway:
  enabled: true
  replicas: 2
  maxUnavailable: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
  containerPort: 443
  basicAuth:
    enabled: true
    username: admin
    password: admin
  service:
    type: NodePort
    nodePort: 31000
    port: 80
  extraVolumes:
    - name: loki-gateway-tls
      secret:
        secretName: loki-gateway-tls
  extraVolumeMounts:
    - name: loki-gateway-tls
      mountPath: /etc/nginx/tls
      readOnly: true
  readinessProbe:
    failureThreshold: 3
    httpGet:
      path: /
      port: 443
      scheme: HTTPS
  nginxConfig:
    schema: http
    enableIPv6: false
    logFormat: |-
      main '$remote_addr - $remote_user [$time_local] $status '
           '"$request" $body_bytes_sent "$http_referer" '
           '"$http_user_agent" "$http_x_forwarded_for"';
    serverSnippet: |
      listen 443 ssl;
      ssl_certificate /etc/nginx/tls/tls.crt;
      ssl_certificate_key /etc/nginx/tls/tls.key;
      ssl_protocols TLSv1.2 TLSv1.3;
      ssl_ciphers 'TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256';
      ssl_prefer_server_ciphers on;
    httpSnippet: >-
      {{ if .Values.loki.tenants }}proxy_set_header X-Scope-OrgID $remote_user;{{ end }}
    ssl: true
queryFrontend:
  replicas: 2
  maxUnavailable: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
queryScheduler:
  replicas: 2
  maxUnavailable: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
distributor:
  replicas: 3
  maxUnavailable: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
compactor:
  replicas: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
indexGateway:
  replicas: 2
  maxUnavailable: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
  persistence:
    enabled: true
    inMemory: false
    storageClass: local-storage
    size: 24G
test:
  enabled: false
lokiCanary:
  enabled: false
  extraArgs:
    - "-user=admin"
    - "-pass=admin"
memcached:
  image:
    repository: memcached
    tag: 1.6.32
resultsCache:
  enabled: true
  defaultValidity: 12h
  timeout: 500ms
  replicas: 2
  port: 11211
  allocatedMemory: 1024
  maxItemMemory: 20
  connectionLimit: 16384
  writebackSizeLimit: 500MB
  writebackBuffer: 500000
  writebackParallelism: 8
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "9150"
chunksCache:
  enabled: true
  batchSize: 8
  parallelism: 5
  timeout: 2000ms
  defaultValidity: 0s
  replicas: 2
  port: 11211
  allocatedMemory: 1024
  maxItemMemory: 16
  connectionLimit: 16384
  writebackSizeLimit: 500MB
  writebackBuffer: 500000
  writebackParallelism: 8
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "9150"
  persistence:
    enabled: true
    storageClass: local-storage
    storageSize: 25G
    mountPath: /data
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0
ruler:
  replicas: 0
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
singleBinary:
  replicas: 0
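For what it's worth, when the error shows up I also check the ingester pods directly, since unhealthy ring members usually mean an ingester restarted or stopped heartbeating. A rough sketch of what I run (the label selector and pod name are assumptions based on the chart's conventions; /ready is the standard Loki readiness endpoint):

# List ingester pods with restart counts (selector assumed from chart labels)
kubectl get pods -l app.kubernetes.io/component=ingester

# Ask one ingester directly whether it considers itself ready
kubectl port-forward pod/loki-ingester-0 3100:3100 &
curl -s http://127.0.0.1:3100/ready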