openshift/openshift-ansible

Version 3.11 Wait for heketi pod fails retrying

kb7791 opened this issue · 1 comments

Description

FAILED - RETRYING: Wait for heketi pod (1 retries left).
[WARNING]: Module invocation had junk after the JSON data: Last login: Wed Sep 16 14:53:54 EDT 2020
fatal: [master0.gluster-staging-xxxxx.com]: FAILED! => {"attempts": 30, "changed": false, "module_results": {"cmd": "/bin/oc get pod --selector=glusterfs=heketi-storage-pod -o json -n glusterfs", "results": [{"apiVersion": "v1", "items": [{"apiVersion": "v1", "kind": "Pod", "metadata": {"annotations": {"openshift.io/deployment-config.latest-version": "1", "openshift.io/deployment-config.name": "heketi-storage", "openshift.io/deployment.name": "heketi-storage-1", "openshift.io/scc": "privileged"}, "creationTimestamp": "2020-09-16T18:48:41Z", "generateName": "heketi-storage-1-", "labels": {"deployment": "heketi-storage-1", "deploymentconfig": "heketi-storage", "glusterfs": "heketi-storage-pod", "heketi": "storage-pod"}, "name": "heketi-storage-1-x729j", "namespace": "glusterfs", "ownerReferences": [{"apiVersion": "v1", "blockOwnerDeletion": true, "controller": true, "kind": "ReplicationController", "name": "heketi-storage-1", "uid": "3b7227d2-f84d-11ea-931a-005056b42cf4"}], "resourceVersion": "3830", "selfLink": "/api/v1/namespaces/glusterfs/pods/heketi-storage-1-x729j", "uid": "3d560f5b-f84d-11ea-aa78-005056b4152a"}, "spec": {"containers": [{"env": [{"name": "HEKETI_USER_KEY", "value": "fbJkWCozh/I8+m5wq0OeUx/B7fhao+ogL4HxAh8RCVk="}, {"name": "HEKETI_ADMIN_KEY", "value": "NNnkf72k9FvdXXmfbnSGKDgjirxNz93J6pIhTnoV1m0="}, {"name": "HEKETI_CLI_USER", "value": "admin"}, {"name": "HEKETI_CLI_KEY", "value": "NNnkf72k9FvdXXmfbnSGKDgjirxNz93J6pIhTnoV1m0="}, {"name": "HEKETI_EXECUTOR", "value": "ssh"}, {"name": "HEKETI_FSTAB", "value": "/etc/fstab"}, {"name": "HEKETI_SNAPSHOT_LIMIT", "value": "14"}, {"name": "HEKETI_KUBE_GLUSTER_DAEMONSET", "value": "1"}, {"name": "HEKETI_IGNORE_STALE_OPERATIONS", "value": "true"}, {"name": "HEKETI_DEBUG_UMOUNT_FAILURES", "value": "true"}, {"name": "HEKETI_LVM_WRAPPER"}], "image": "10.22.0.227:5000/heketi/heketi:latest", "imagePullPolicy": "IfNotPresent", "livenessProbe": {"failureThreshold": 3, "httpGet": {"path": "/hello", "port": 8080, 
"scheme": "HTTP"}, "initialDelaySeconds": 30, "periodSeconds": 10, "successThreshold": 1, "timeoutSeconds": 3}, "name": "heketi", "ports": [{"containerPort": 8080, "protocol": "TCP"}], "readinessProbe": {"failureThreshold": 3, "httpGet": {"path": "/hello", "port": 8080, "scheme": "HTTP"}, "initialDelaySeconds": 3, "periodSeconds": 10, "successThreshold": 1, "timeoutSeconds": 3}, "resources": {}, "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "volumeMounts": [{"mountPath": "/var/lib/heketi", "name": "db"}, {"mountPath": "/etc/heketi", "name": "config"}, {"mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", "name": "heketi-storage-service-account-token-gf9jp", "readOnly": true}]}], "dnsPolicy": "ClusterFirst", "imagePullSecrets": [{"name": "heketi-storage-service-account-dockercfg-bzfbl"}], "nodeName": "infra1.gluster-staging-xxxxx.com", "priority": 0, "restartPolicy": "Always", "schedulerName": "default-scheduler", "securityContext": {}, "serviceAccount": "heketi-storage-service-account", "serviceAccountName": "heketi-storage-service-account", "terminationGracePeriodSeconds": 30, "volumes": [{"glusterfs": {"endpoints": "heketi-db-storage-endpoints", "path": "heketidbstorage"}, "name": "db"}, {"name": "config", "secret": {"defaultMode": 420, "secretName": "heketi-storage-config-secret"}}, {"name": "heketi-storage-service-account-token-gf9jp", "secret": {"defaultMode": 420, "secretName": "heketi-storage-service-account-token-gf9jp"}}]}, "status": {"conditions": [{"lastProbeTime": null, "lastTransitionTime": "2020-09-16T18:48:41Z", "status": "True", "type": "Initialized"}, {"lastProbeTime": null, "lastTransitionTime": "2020-09-16T18:48:41Z", "message": "containers with unready status: [heketi]", "reason": "ContainersNotReady", "status": "False", "type": "Ready"}, {"lastProbeTime": null, "lastTransitionTime": null, "message": "containers with unready status: [heketi]", "reason": "ContainersNotReady", "status": "False", 
"type": "ContainersReady"}, {"lastProbeTime": null, "lastTransitionTime": "2020-09-16T18:48:41Z", "status": "True", "type": "PodScheduled"}], "containerStatuses": [{"image": "10.22.0.227:5000/heketi/heketi:latest", "imageID": "", "lastState": {}, "name": "heketi", "ready": false, "restartCount": 0, "state": {"waiting": {"reason": "ContainerCreating"}}}], "hostIP": "10.22.0.232", "phase": "Pending", "qosClass": "BestEffort", "startTime": "2020-09-16T18:48:41Z"}}], "kind": "List", "metadata": {"resourceVersion": "", "selfLink": ""}}], "returncode": 0}, "state": "list"}

PLAY RECAP *************************************************************************************************************************************************************************************************************************************
glusterfs0.gluster-staging-xxxxx.com : ok=3    changed=2    unreachable=0    failed=0    skipped=6    rescued=0    ignored=0   
glusterfs1.gluster-staging-xxxxx.com : ok=3    changed=2    unreachable=0    failed=0    skipped=6    rescued=0    ignored=0   
glusterfs2.gluster-staging-xxxxx.com : ok=3    changed=2    unreachable=0    failed=0    skipped=6    rescued=0    ignored=0   
infra0.gluster-staging-xxxxx.com : ok=165  changed=82   unreachable=0    failed=0    skipped=259  rescued=0    ignored=0   
infra1.gluster-staging-xxxxx.com : ok=165  changed=82   unreachable=0    failed=0    skipped=259  rescued=0    ignored=0   
infra2.gluster-staging-xxxxx.com : ok=165  changed=82   unreachable=0    failed=0    skipped=259  rescued=0    ignored=0   
lb.gluster-staging-xxxxx.com : ok=86   changed=28   unreachable=0    failed=0    skipped=74   rescued=0    ignored=0   
localhost                  : ok=28   changed=0    unreachable=0    failed=0    skipped=4    rescued=0    ignored=0   
master0.gluster-staging-xxxxx.com : ok=519  changed=253  unreachable=0    failed=1    skipped=705  rescued=0    ignored=0   
master1.gluster-staging-xxxxx.com : ok=334  changed=165  unreachable=0    failed=0    skipped=535  rescued=0    ignored=0   
master2.gluster-staging-xxxxx.com : ok=334  changed=165  unreachable=0    failed=0    skipped=535  rescued=0    ignored=0   
node0.gluster-staging-xxxxx.com : ok=165  changed=82   unreachable=0    failed=0    skipped=259  rescued=0    ignored=0   
node1.gluster-staging-xxxxx.com : ok=165  changed=82   unreachable=0    failed=0    skipped=259  rescued=0    ignored=0   
node2.gluster-staging-xxxxx.com : ok=165  changed=82   unreachable=0    failed=0    skipped=259  rescued=0    ignored=0   


INSTALLER STATUS *******************************************************************************************************************************************************************************************************************************
Initialization              : Complete (0:00:35)
Health Check                : Complete (0:00:30)
Node Bootstrap Preparation  : Complete (0:08:17)
etcd Install                : Complete (0:01:55)
NFS Install                 : Complete (0:00:23)
Load Balancer Install       : Complete (0:00:32)
Master Install              : Complete (0:08:46)
Master Additional Install   : Complete (0:01:42)
Node Join                   : Complete (0:01:26)
GlusterFS Install           : In Progress (0:12:58)
        This phase can be restarted by running: playbooks/openshift-glusterfs/new_install.yml
Wednesday 16 September 2020  14:54:05 -0400 (0:05:27.268)       0:39:58.381 *** 
=============================================================================== 
openshift_storage_glusterfs : Wait for heketi pod ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 327.27s
openshift_storage_glusterfs : Wait for copy job to finish ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 325.60s
openshift_node : Install node, clients, and conntrack packages ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 127.41s
openshift_control_plane : Wait for all control plane pods to come up and become ready ------------------------------------------------------------------------------------------------------------------------------------------------- 119.33s
openshift_master_certificates : copy --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 39.19s
openshift_storage_glusterfs : Wait for deploy-heketi pod ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 33.39s
Approve node certificates when bootstrapping ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 31.91s
Run health checks (install) - EL ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 29.36s
openshift_node : install needed rpm(s) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 28.94s
tuned : Ensure files are populated from templates -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 18.85s
openshift_cloud_provider : verify API server ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 15.86s
openshift_sdn : Copy templates to temp directory --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 15.75s
openshift_node : Update journald setup ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 14.53s
openshift_node : pre-pull pod image ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 14.33s
openshift_examples : Unarchive the OpenShift examples on the remote -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 13.75s
tuned : Ensure files are populated from templates -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 13.44s
openshift_node_group : Copy templates to temp directory -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 12.24s
openshift_node : restart services ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 11.80s
openshift_control_plane : Copy static master scripts ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 11.67s
openshift_node : Add firewalld allow rules --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 11.27s


Failure summary:


  1. Hosts:    master0.gluster-staging-xxxxx.com
     Play:     Configure GlusterFS
     Task:     Wait for heketi pod
     Message:  Failed without returning a message.

Version

Please put the following version information in the code block
indicated below.

  • [root@master0 ~]# ansible --version
    ansible 2.9.10
    config file = /etc/ansible/ansible.cfg
    configured module search path = [u'/root/.ansible/plugins/modules', u'/usr/share/ansible/plugins/modules']
    ansible python module location = /usr/lib/python2.7/site-packages/ansible
    executable location = /bin/ansible
    python version = 2.7.5 (default, Aug 7 2019, 00:51:29) [GCC 4.8.5 20150623 (Red Hat 4.8.5-39)]

If you're operating from a git clone:

  • openshift-ansible-3.11.213-1
Steps To Reproduce
  1. Running deploy-cluster.yml
Expected Results

The GlusterFS playbook should complete the installation successfully.

Observed Results

Describe what is actually happening.

[root@master0 ~]# oc logs heketi-storage-3-deploy 
--> Scaling heketi-storage-3 to 1
error: update acceptor rejected heketi-storage-3: pods for rc 'glusterfs/heketi-storage-3' took longer than 600 seconds to become available

oc describe

the following error information was pulled from the glusterfs log to help diagnose this issue: 
[2020-09-18 15:13:31.596342] E [MSGID: 101019] [graph.y:321:volume_end] 0-parser: "type" not specified for volume heketidbstorage-utime
[2020-09-18 15:13:31.596435] E [MSGID: 100026] [glusterfsd.c:2473:glusterfs_process_volfp] 0-: failed to construct the graph
  Warning  FailedMount  13s  kubelet, infra1.gluster-staging-xxxxx.com  MountVolume.SetUp failed for volume "db" : mount failed: mount failed: exit status 1
Mounting command: systemd-run
Mounting arguments: --description=Kubernetes transient mount for /var/lib/origin/openshift.local.volumes/pods/7e483db6-f9c1-11ea-84c6-005056b4376d/volumes/kubernetes.io~glusterfs/db --scope -- mount -t glusterfs -o log-level=ERROR,log-file=/var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/glusterfs/db/heketi-storage-3-tgl9c-glusterfs.log,backup-volfile-servers=10.22.0.237:10.22.0.238:10.22.0.239 10.22.0.237:heketidbstorage /var/lib/origin/openshift.local.volumes/pods/7e483db6-f9c1-11ea-84c6-005056b4376d/volumes/kubernetes.io~glusterfs/db
Output: Running scope as unit run-86508.scope.
Mount failed. Please check the log file for more details.

glusterfs namespace oc events

https://pastebin.com/xvEVSw5X

For long output or logs, consider using a gist

Additional Information

Provide any additional information which may help us diagnose the
issue.

os:

 [root@master0 ~]# cat /etc/redhat-release 
Red Hat Enterprise Linux Server release 7.6 (Maipo)

inventory file
https://pastebin.com/XnEym3n6

Resolution: the issue was that other nodes in the cluster (non-GlusterFS nodes) were using GlusterFS 3.12 client packages, which caused a compatibility issue with the GlusterFS nodes running GlusterFS 7.