Version 3.11: "Wait for heketi pod" fails after retrying
kb7791 opened this issue · 1 comment
kb7791 commented
Description
FAILED - RETRYING: Wait for heketi pod (1 retries left).
[WARNING]: Module invocation had junk after the JSON data: Last login: Wed Sep 16 14:53:54 EDT 2020
fatal: [master0.gluster-staging-xxxxx.com]: FAILED! => {"attempts": 30, "changed": false, "module_results": {"cmd": "/bin/oc get pod --selector=glusterfs=heketi-storage-pod -o json -n glusterfs", "results": [{"apiVersion": "v1", "items": [{"apiVersion": "v1", "kind": "Pod", "metadata": {"annotations": {"openshift.io/deployment-config.latest-version": "1", "openshift.io/deployment-config.name": "heketi-storage", "openshift.io/deployment.name": "heketi-storage-1", "openshift.io/scc": "privileged"}, "creationTimestamp": "2020-09-16T18:48:41Z", "generateName": "heketi-storage-1-", "labels": {"deployment": "heketi-storage-1", "deploymentconfig": "heketi-storage", "glusterfs": "heketi-storage-pod", "heketi": "storage-pod"}, "name": "heketi-storage-1-x729j", "namespace": "glusterfs", "ownerReferences": [{"apiVersion": "v1", "blockOwnerDeletion": true, "controller": true, "kind": "ReplicationController", "name": "heketi-storage-1", "uid": "3b7227d2-f84d-11ea-931a-005056b42cf4"}], "resourceVersion": "3830", "selfLink": "/api/v1/namespaces/glusterfs/pods/heketi-storage-1-x729j", "uid": "3d560f5b-f84d-11ea-aa78-005056b4152a"}, "spec": {"containers": [{"env": [{"name": "HEKETI_USER_KEY", "value": "fbJkWCozh/I8+m5wq0OeUx/B7fhao+ogL4HxAh8RCVk="}, {"name": "HEKETI_ADMIN_KEY", "value": "NNnkf72k9FvdXXmfbnSGKDgjirxNz93J6pIhTnoV1m0="}, {"name": "HEKETI_CLI_USER", "value": "admin"}, {"name": "HEKETI_CLI_KEY", "value": "NNnkf72k9FvdXXmfbnSGKDgjirxNz93J6pIhTnoV1m0="}, {"name": "HEKETI_EXECUTOR", "value": "ssh"}, {"name": "HEKETI_FSTAB", "value": "/etc/fstab"}, {"name": "HEKETI_SNAPSHOT_LIMIT", "value": "14"}, {"name": "HEKETI_KUBE_GLUSTER_DAEMONSET", "value": "1"}, {"name": "HEKETI_IGNORE_STALE_OPERATIONS", "value": "true"}, {"name": "HEKETI_DEBUG_UMOUNT_FAILURES", "value": "true"}, {"name": "HEKETI_LVM_WRAPPER"}], "image": "10.22.0.227:5000/heketi/heketi:latest", "imagePullPolicy": "IfNotPresent", "livenessProbe": {"failureThreshold": 3, "httpGet": {"path": "/hello", "port": 8080, "scheme": "HTTP"}, "initialDelaySeconds": 30, "periodSeconds": 10, "successThreshold": 1, "timeoutSeconds": 3}, "name": "heketi", "ports": [{"containerPort": 8080, "protocol": "TCP"}], "readinessProbe": {"failureThreshold": 3, "httpGet": {"path": "/hello", "port": 8080, "scheme": "HTTP"}, "initialDelaySeconds": 3, "periodSeconds": 10, "successThreshold": 1, "timeoutSeconds": 3}, "resources": {}, "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "volumeMounts": [{"mountPath": "/var/lib/heketi", "name": "db"}, {"mountPath": "/etc/heketi", "name": "config"}, {"mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", "name": "heketi-storage-service-account-token-gf9jp", "readOnly": true}]}], "dnsPolicy": "ClusterFirst", "imagePullSecrets": [{"name": "heketi-storage-service-account-dockercfg-bzfbl"}], "nodeName": "infra1.gluster-staging-xxxxx.com", "priority": 0, "restartPolicy": "Always", "schedulerName": "default-scheduler", "securityContext": {}, "serviceAccount": "heketi-storage-service-account", "serviceAccountName": "heketi-storage-service-account", "terminationGracePeriodSeconds": 30, "volumes": [{"glusterfs": {"endpoints": "heketi-db-storage-endpoints", "path": "heketidbstorage"}, "name": "db"}, {"name": "config", "secret": {"defaultMode": 420, "secretName": "heketi-storage-config-secret"}}, {"name": "heketi-storage-service-account-token-gf9jp", "secret": {"defaultMode": 420, "secretName": "heketi-storage-service-account-token-gf9jp"}}]}, "status": {"conditions": [{"lastProbeTime": null, 
"lastTransitionTime": "2020-09-16T18:48:41Z", "status": "True", "type": "Initialized"}, {"lastProbeTime": null, "lastTransitionTime": "2020-09-16T18:48:41Z", "message": "containers with unready status: [heketi]", "reason": "ContainersNotReady", "status": "False", "type": "Ready"}, {"lastProbeTime": null, "lastTransitionTime": null, "message": "containers with unready status: [heketi]", "reason": "ContainersNotReady", "status": "False", "type": "ContainersReady"}, {"lastProbeTime": null, "lastTransitionTime": "2020-09-16T18:48:41Z", "status": "True", "type": "PodScheduled"}], "containerStatuses": [{"image": "10.22.0.227:5000/heketi/heketi:latest", "imageID": "", "lastState": {}, "name": "heketi", "ready": false, "restartCount": 0, "state": {"waiting": {"reason": "ContainerCreating"}}}], "hostIP": "10.22.0.232", "phase": "Pending", "qosClass": "BestEffort", "startTime": "2020-09-16T18:48:41Z"}}], "kind": "List", "metadata": {"resourceVersion": "", "selfLink": ""}}], "returncode": 0}, "state": "list"}
PLAY RECAP *************************************************************************************************************************************************************************************************************************************
glusterfs0.gluster-staging-xxxxx.com : ok=3 changed=2 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0
glusterfs1.gluster-staging-xxxxx.com : ok=3 changed=2 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0
glusterfs2.gluster-staging-xxxxx.com : ok=3 changed=2 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0
infra0.gluster-staging-xxxxx.com : ok=165 changed=82 unreachable=0 failed=0 skipped=259 rescued=0 ignored=0
infra1.gluster-staging-xxxxx.com : ok=165 changed=82 unreachable=0 failed=0 skipped=259 rescued=0 ignored=0
infra2.gluster-staging-xxxxx.com : ok=165 changed=82 unreachable=0 failed=0 skipped=259 rescued=0 ignored=0
lb.gluster-staging-xxxxx.com : ok=86 changed=28 unreachable=0 failed=0 skipped=74 rescued=0 ignored=0
localhost : ok=28 changed=0 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
master0.gluster-staging-xxxxx.com : ok=519 changed=253 unreachable=0 failed=1 skipped=705 rescued=0 ignored=0
master1.gluster-staging-xxxxx.com : ok=334 changed=165 unreachable=0 failed=0 skipped=535 rescued=0 ignored=0
master2.gluster-staging-xxxxx.com : ok=334 changed=165 unreachable=0 failed=0 skipped=535 rescued=0 ignored=0
node0.gluster-staging-xxxxx.com : ok=165 changed=82 unreachable=0 failed=0 skipped=259 rescued=0 ignored=0
node1.gluster-staging-xxxxx.com : ok=165 changed=82 unreachable=0 failed=0 skipped=259 rescued=0 ignored=0
node2.gluster-staging-xxxxx.com : ok=165 changed=82 unreachable=0 failed=0 skipped=259 rescued=0 ignored=0
INSTALLER STATUS *******************************************************************************************************************************************************************************************************************************
Initialization : Complete (0:00:35)
Health Check : Complete (0:00:30)
Node Bootstrap Preparation : Complete (0:08:17)
etcd Install : Complete (0:01:55)
NFS Install : Complete (0:00:23)
Load Balancer Install : Complete (0:00:32)
Master Install : Complete (0:08:46)
Master Additional Install : Complete (0:01:42)
Node Join : Complete (0:01:26)
GlusterFS Install : In Progress (0:12:58)
This phase can be restarted by running: playbooks/openshift-glusterfs/new_install.yml
Wednesday 16 September 2020 14:54:05 -0400 (0:05:27.268) 0:39:58.381 ***
===============================================================================
openshift_storage_glusterfs : Wait for heketi pod ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 327.27s
openshift_storage_glusterfs : Wait for copy job to finish ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 325.60s
openshift_node : Install node, clients, and conntrack packages ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 127.41s
openshift_control_plane : Wait for all control plane pods to come up and become ready ------------------------------------------------------------------------------------------------------------------------------------------------- 119.33s
openshift_master_certificates : copy --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 39.19s
openshift_storage_glusterfs : Wait for deploy-heketi pod ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 33.39s
Approve node certificates when bootstrapping ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 31.91s
Run health checks (install) - EL ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 29.36s
openshift_node : install needed rpm(s) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 28.94s
tuned : Ensure files are populated from templates -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 18.85s
openshift_cloud_provider : verify API server ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 15.86s
openshift_sdn : Copy templates to temp directory --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 15.75s
openshift_node : Update journald setup ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 14.53s
openshift_node : pre-pull pod image ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 14.33s
openshift_examples : Unarchive the OpenShift examples on the remote -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 13.75s
tuned : Ensure files are populated from templates -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 13.44s
openshift_node_group : Copy templates to temp directory -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 12.24s
openshift_node : restart services ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 11.80s
openshift_control_plane : Copy static master scripts ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 11.67s
openshift_node : Add firewalld allow rules --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 11.27s
Failure summary:
1. Hosts: master0.gluster-staging-xxxxx.com
Play: Configure GlusterFS
Task: Wait for heketi pod
Message: Failed without returning a message.
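Since the task failed without returning a message, rerunning just the failed phase with extra verbosity usually exposes the raw module output; the installer already names the entry point above (inventory path below is a placeholder):
[root@master0 ~]# ansible-playbook -vv -i /path/to/inventory playbooks/openshift-glusterfs/new_install.yml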
Version
- [root@master0 ~]# ansible --version
ansible 2.9.10
config file = /etc/ansible/ansible.cfg
configured module search path = [u'/root/.ansible/plugins/modules', u'/usr/share/ansible/plugins/modules']
ansible python module location = /usr/lib/python2.7/site-packages/ansible
executable location = /bin/ansible
python version = 2.7.5 (default, Aug 7 2019, 00:51:29) [GCC 4.8.5 20150623 (Red Hat 4.8.5-39)]
If you're operating from a git clone:
- openshift-ansible-3.11.213-1
Steps To Reproduce
- Running deploy-cluster.yml
Expected Results
The GlusterFS playbook completes successfully and the heketi pod comes up.
Observed Results
[root@master0 ~]# oc logs heketi-storage-3-deploy
--> Scaling heketi-storage-3 to 1
error: update acceptor rejected heketi-storage-3: pods for rc 'glusterfs/heketi-storage-3' took longer than 600 seconds to become available
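The deployer pod hit the default 600-second availability timeout. Once the underlying mount problem is fixed, the rollout can be retried directly instead of rerunning the whole playbook, for example:
[root@master0 ~]# oc rollout retry dc/heketi-storage -n glusterfs
[root@master0 ~]# oc rollout status dc/heketi-storage -n glusterfs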
oc describe pod output (events):
the following error information was pulled from the glusterfs log to help diagnose this issue:
[2020-09-18 15:13:31.596342] E [MSGID: 101019] [graph.y:321:volume_end] 0-parser: "type" not specified for volume heketidbstorage-utime
[2020-09-18 15:13:31.596435] E [MSGID: 100026] [glusterfsd.c:2473:glusterfs_process_volfp] 0-: failed to construct the graph
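The "type" not specified for volume heketidbstorage-utime parse error usually means the FUSE client on the node is too old to understand an xlator that newer gluster servers emit in the volfile (consistent with the version mismatch found below). Comparing client and server versions on the failing node makes this easy to confirm:
[root@infra1 ~]# rpm -qa 'glusterfs*'
[root@infra1 ~]# glusterfs --version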
Warning FailedMount 13s kubelet, infra1.gluster-staging-xxxxx.com MountVolume.SetUp failed for volume "db" : mount failed: mount failed: exit status 1
Mounting command: systemd-run
Mounting arguments: --description=Kubernetes transient mount for /var/lib/origin/openshift.local.volumes/pods/7e483db6-f9c1-11ea-84c6-005056b4376d/volumes/kubernetes.io~glusterfs/db --scope -- mount -t glusterfs -o log-level=ERROR,log-file=/var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/glusterfs/db/heketi-storage-3-tgl9c-glusterfs.log,backup-volfile-servers=10.22.0.237:10.22.0.238:10.22.0.239 10.22.0.237:heketidbstorage /var/lib/origin/openshift.local.volumes/pods/7e483db6-f9c1-11ea-84c6-005056b4376d/volumes/kubernetes.io~glusterfs/db
Output: Running scope as unit run-86508.scope.
Mount failed. Please check the log file for more details.
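The actual mount error goes to the log file named in the mount arguments; reading it on the node, and trying the same mount by hand, narrows things down quickly (log path copied from the event above; /mnt/test is just an arbitrary test mountpoint):
[root@infra1 ~]# tail /var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/glusterfs/db/heketi-storage-3-tgl9c-glusterfs.log
[root@infra1 ~]# mkdir -p /mnt/test && mount -t glusterfs 10.22.0.237:heketidbstorage /mnt/test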
glusterfs namespace oc events
Additional Information
os:
[root@master0 ~]# cat /etc/redhat-release
Red Hat Enterprise Linux Server release 7.6 (Maipo)
inventory file
https://pastebin.com/XnEym3n6
kb7791 commented
The issue was that the non-GlusterFS nodes in the cluster were running gluster 3.12, which caused a compatibility issue with the GlusterFS nodes running gluster 7.
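For anyone hitting the same thing, a quick sketch to verify the gluster client packages are aligned across the cluster before rerunning the playbook ('nodes' is the standard openshift-ansible inventory group; adjust to your hosts file):
[root@master0 ~]# ansible nodes -m shell -a "rpm -q glusterfs glusterfs-fuse"
All nodes that mount gluster volumes should report the same major version as the GlusterFS server nodes.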