openshift/lvm-operator

components stuck in init


Hello, after deploying the operator and an LVMCluster, the pods fail to spawn:

[root@cnf10-worker-0 ~]# oc get pod -n odf-lvm
NAME                                  READY   STATUS             RESTARTS        AGE
controller-manager-765f44745b-hgcpn   3/3     Running            0               31m
topolvm-controller-5ffdc8cd9f-sktg9   4/4     Running            8 (7m41s ago)   31m
topolvm-node-8ffm7                    0/4     Init:0/1           0               31m
topolvm-node-w2rvd                    0/4     Pending            0               31m
topolvm-node-w5s7w                    0/4     Init:0/1           0               31m
vg-manager-8mvn8                      0/1     CrashLoopBackOff   7 (4m4s ago)    31m
vg-manager-mg9xj                      0/1     CrashLoopBackOff   7 (4m19s ago)   31m
vg-manager-wvhbv                      0/1     CrashLoopBackOff   7 (3m53s ago)   31m
[root@cnf10-worker-0 ~]# oc describe pod  -n odf-lvm topolvm-node-8ffm7
Name:         topolvm-node-8ffm7
Namespace:    odf-lvm
Priority:     0
Node:         ci-ovirt-master-0.karmalabs.com/10.19.135.249
Start Time:   Wed, 20 Apr 2022 13:53:03 -0400
Labels:       app=topolvm-node
              controller-revision-hash=5685697cf9
              pod-template-generation=1
Annotations:  k8s.v1.cni.cncf.io/network-status:
                [{
                    "name": "openshift-sdn",
                    "interface": "eth0",
                    "ips": [
                        "10.133.0.246"
                    ],
                    "default": true,
                    "dns": {}
                }]
              k8s.v1.cni.cncf.io/networks-status:
                [{
                    "name": "openshift-sdn",
                    "interface": "eth0",
                    "ips": [
                        "10.133.0.246"
                    ],
                    "default": true,
                    "dns": {}
                }]
              openshift.io/scc: odf-lvm-topolvm-node
Status:       Pending
IP:           10.133.0.246
IPs:
  IP:           10.133.0.246
Controlled By:  DaemonSet/topolvm-node
Init Containers:
  file-checker:
    Container ID:  cri-o://a8a17b40bc03851f13063e7bb245e4a0214b39411a54ab1ebfabec0b634ef14b
    Image:         registry.redhat.io/odf4/odf-lvm-rhel8-operator@sha256:2bad9a3ab52faf43f8f5258c64ea6734ab40114addfdde116c0bd27d9088bf49
    Image ID:      registry.redhat.io/odf4/odf-lvm-rhel8-operator@sha256:2bad9a3ab52faf43f8f5258c64ea6734ab40114addfdde116c0bd27d9088bf49
    Port:          <none>
    Host Port:     <none>
    Command:
      /usr/bin/bash
      -c
      until [ -f /etc/topolvm/lvmd.yaml ]; do echo waiting for lvmd config file; sleep 5; done
    State:          Running
      Started:      Wed, 20 Apr 2022 13:53:14 -0400
    Ready:          False
    Restart Count:  0
    Environment:    <none>
    Mounts:
      /etc/topolvm from lvmd-config-dir (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-pck6f (ro)
Containers:
  lvmd:
    Container ID:
    Image:         registry.redhat.io/odf4/odf-topolvm-rhel8@sha256:4fb7b673d4a14021df0ad89cd99eed68dd837163bfc32aa8dc8b3eb10d60acee
    Image ID:
    Port:          <none>
    Host Port:     <none>
    Command:
      /lvmd
      --config=/etc/topolvm/lvmd.yaml
      --container=true
    State:          Waiting
      Reason:       PodInitializing
    Ready:          False
    Restart Count:  0
    Limits:
      cpu:     250m
      memory:  250Mi
    Requests:
      cpu:        250m
      memory:     250Mi
    Environment:  <none>
    Mounts:
      /etc/topolvm from lvmd-config-dir (rw)
      /run/lvmd from lvmd-socket-dir (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-pck6f (ro)
  topolvm-node:
    Container ID:
    Image:         registry.redhat.io/odf4/odf-topolvm-rhel8@sha256:4fb7b673d4a14021df0ad89cd99eed68dd837163bfc32aa8dc8b3eb10d60acee
    Image ID:
    Port:          9808/TCP
    Host Port:     0/TCP
    Command:
      /topolvm-node
      --lvmd-socket=/run/lvmd/lvmd.sock
    State:          Waiting
      Reason:       PodInitializing
    Ready:          False
    Restart Count:  0
    Limits:
      cpu:     250m
      memory:  250Mi
    Requests:
      cpu:     250m
      memory:  250Mi
    Liveness:  http-get http://:healthz/healthz delay=10s timeout=3s period=60s #success=1 #failure=3
    Environment:
      NODE_NAME:   (v1:spec.nodeName)
    Mounts:
      /run/lvmd from lvmd-socket-dir (rw)
      /run/topolvm from node-plugin-dir (rw)
      /var/lib/kubelet/plugins/kubernetes.io/csi from csi-plugin-dir (rw)
      /var/lib/kubelet/pods from pod-volumes-dir (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-pck6f (ro)
  csi-registrar:
    Container ID:
    Image:         registry.redhat.io/openshift4/ose-csi-node-driver-registrar@sha256:3308ef98afab494b80aa1a702924407cf114bce6e0ad92436e508d7dc951521c
    Image ID:
    Port:          <none>
    Host Port:     <none>
    Args:
      --csi-address=/run/topolvm/csi-topolvm.sock
      --kubelet-registration-path=/var/lib/kubelet/plugins/topolvm.cybozu.com/node/csi-topolvm.sock
    State:          Waiting
      Reason:       PodInitializing
    Ready:          False
    Restart Count:  0
    Environment:    <none>
    Mounts:
      /registration from registration-dir (rw)
      /run/topolvm from node-plugin-dir (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-pck6f (ro)
  liveness-probe:
    Container ID:
    Image:         registry.redhat.io/openshift4/ose-csi-livenessprobe@sha256:6b40bb1cb5bffc8e8689b8d01e43096a2d57981aa20ae7859618054ed3800bd7
    Image ID:
    Port:          <none>
    Host Port:     <none>
    Args:
      --csi-address=/run/topolvm/csi-topolvm.sock
    State:          Waiting
      Reason:       PodInitializing
    Ready:          False
    Restart Count:  0
    Environment:    <none>
    Mounts:
      /run/topolvm from node-plugin-dir (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-pck6f (ro)
Conditions:
  Type              Status
  Initialized       False
  Ready             False
  ContainersReady   False
  PodScheduled      True
Volumes:
  registration-dir:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/kubelet/plugins_registry/
    HostPathType:  Directory
  node-plugin-dir:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/kubelet/plugins/topolvm.cybozu.com/node
    HostPathType:  DirectoryOrCreate
  csi-plugin-dir:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/kubelet/plugins/kubernetes.io/csi
    HostPathType:  DirectoryOrCreate
  pod-volumes-dir:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/kubelet/pods/
    HostPathType:  DirectoryOrCreate
  lvmd-config-dir:
    Type:          HostPath (bare host directory volume)
    Path:          /etc/topolvm
    HostPathType:  Directory
  lvmd-socket-dir:
    Type:       EmptyDir (a temporary directory that shares a pod's lifetime)
    Medium:     Memory
    SizeLimit:  <unset>
  kube-api-access-pck6f:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
    ConfigMapName:           openshift-service-ca.crt
    ConfigMapOptional:       <nil>
QoS Class:                   Burstable
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/disk-pressure:NoSchedule op=Exists
                             node.kubernetes.io/memory-pressure:NoSchedule op=Exists
                             node.kubernetes.io/not-ready:NoExecute op=Exists
                             node.kubernetes.io/pid-pressure:NoSchedule op=Exists
                             node.kubernetes.io/unreachable:NoExecute op=Exists
                             node.kubernetes.io/unschedulable:NoSchedule op=Exists
Events:
  Type    Reason          Age   From               Message
  ----    ------          ----  ----               -------
  Normal  Scheduled       31m   default-scheduler  Successfully assigned odf-lvm/topolvm-node-8ffm7 to ci-ovirt-master-0.karmalabs.com
  Normal  AddedInterface  31m   multus             Add eth0 [10.133.0.246/23] from openshift-sdn
  Normal  Pulling         31m   kubelet            Pulling image "registry.redhat.io/odf4/odf-lvm-rhel8-operator@sha256:2bad9a3ab52faf43f8f5258c64ea6734ab40114addfdde116c0bd27d9088bf49"
  Normal  Pulled          30m   kubelet            Successfully pulled image "registry.redhat.io/odf4/odf-lvm-rhel8-operator@sha256:2bad9a3ab52faf43f8f5258c64ea6734ab40114addfdde116c0bd27d9088bf49" in 7.936196491s
  Normal  Created         30m   kubelet            Created container file-checker
  Normal  Started         30m   kubelet            Started container file-checker
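The file-checker init container just loops until /etc/topolvm/lvmd.yaml shows up (see its Command above), so the pod sits in Init:0/1 for as long as that file is never written on the host. A quick way to confirm the file really is missing on the node (a sketch, assuming cluster-admin access; the node name is taken from the describe output above):

oc debug node/ci-ovirt-master-0.karmalabs.com -- chroot /host ls -l /etc/topolvm/
# if the directory is empty or lvmd.yaml is absent, file-checker waits forever

The vg-manager pods below are what would normally generate that config, so the two failures are presumably linked.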

The issue with the vg-manager pods is the following:

I0420 19:13:21.737554 2360300 request.go:665] Waited for 1.026197652s due to client-side throttling, not priority and fairness, request: GET:https://172.30.0.1:443/apis/snapshot.kubevirt.io/v1alpha1?timeout=32s
{"level":"info","ts":1650482004.4948282,"logger":"controller-runtime.metrics","msg":"metrics server is starting to listen","addr":":8080"}
{"level":"info","ts":1650482004.4951057,"logger":"setup","msg":"starting manager"}
{"level":"info","ts":1650482004.4952693,"msg":"starting metrics server","path":"/metrics"}
{"level":"info","ts":1650482004.4954143,"logger":"controller.lvmvolumegroup","msg":"Starting EventSource","reconciler group":"lvm.topolvm.io","reconciler kind":"LVMVolumeGroup","source":"kind source: /, Kind="}
{"level":"info","ts":1650482004.4955132,"logger":"controller.lvmvolumegroup","msg":"Starting Controller","reconciler group":"lvm.topolvm.io","reconciler kind":"LVMVolumeGroup"}
E0420 19:13:24.498715 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
E0420 19:13:25.647058 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
E0420 19:13:28.740610 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
E0420 19:13:33.447567 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
E0420 19:13:43.227955 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
E0420 19:14:03.151838 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
E0420 19:14:42.636428 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
E0420 19:15:21.048957 2360300 reflector.go:138] sigs.k8s.io/controller-runtime/pkg/cache/internal/informers_map.go:250: Failed to watch *v1alpha1.LVMVolumeGroup: failed to list *v1alpha1.LVMVolumeGroup: lvmvolumegroups.lvm.topolvm.io is forbidden: User "system:serviceaccount:odf-lvm:vg-manager" cannot list resource "lvmvolumegroups" in API group "lvm.topolvm.io" in the namespace "odf-lvm"
{"level":"error","ts":1650482124.502881,"logger":"controller.lvmvolumegroup","msg":"Could not wait for Cache to sync","reconciler group":"lvm.topolvm.io","reconciler kind":"LVMVolumeGroup","error":"failed to wait for lvmvolumegroup caches to sync: timed out waiting for cache to be synced","stacktrace":"sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).Start\n\t/remote-source/app/vendor/sigs.k8s.io/controller-runtime/pkg/internal/controller/controller.go:234\nsigs.k8s.io/controller-runtime/pkg/manager.(*controllerManager).startRunnable.func1\n\t/remote-source/app/vendor/sigs.k8s.io/controller-runtime/pkg/manager/internal.go:696"}
{"level":"error","ts":1650482124.5031688,"logger":"setup","msg":"problem running manager","error":"failed to wait for lvmvolumegroup caches to sync: timed out waiting for cache to be synced"}

The following commands did the trick as a workaround:

oc adm policy add-cluster-role-to-user cluster-admin -z vg-manager -n odf-lvm
oc adm policy add-cluster-role-to-user cluster-admin -z topolvm-controller -n odf-lvm
oc adm policy add-cluster-role-to-user cluster-admin -z topolvm-node -n odf-lvm
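If handing out cluster-admin feels too broad, a narrower grant covering just what the log complains about should also unblock vg-manager (a sketch, not taken from this thread; the other two service accounts would need equivalent grants for their own resources):

# hypothetical role name; grants only the verbs the reflector needs
oc create role vg-manager-lvmvolumegroups -n odf-lvm \
  --verb=get,list,watch --resource=lvmvolumegroups.lvm.topolvm.io
oc adm policy add-role-to-user vg-manager-lvmvolumegroups -z vg-manager \
  --role-namespace=odf-lvm -n odf-lvm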
sp98 commented

@karmab our RBACs are restricted to the openshift-storage namespace only as of now. Please try with the openshift-storage namespace instead of the odf-lvm namespace for now and let us know if that fixes the issue for you.

We will work on making the operator deployable in other namespaces.

OK, could you please populate operatorframework.io/suggested-namespace in the CSV of your OLM metadata to reflect that?
That's what I use to target a given namespace.
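For reference, that annotation lives in the ClusterServiceVersion metadata and is what OLM-aware tooling reads when picking an install namespace; whether the shipped CSV sets it can be checked with something like (a sketch, run against whichever namespace holds the CSV):

oc get csv -n odf-lvm -o yaml | grep 'operatorframework.io/suggested-namespace'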

Fixed in #171