kubernetes-csi/external-provisioner

clone controller shallow copy bug

1978629634 opened this issue · 4 comments

What happened:

If the cloner controller directly updates the PVC reference without using a deep copy and then calls the Update() method, it can lead to a situation where the cached PVC is updated (removing the finalizer), while the actual PVC remains unmodified in case the Update() operation fails.

// syncClaimHandler gets the claim from informer's cache then calls syncClaim
func (p *CloningProtectionController) syncClaimHandler(ctx context.Context, key string) error {
	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("invalid resource key: %s", key))
		return nil
	}

        // shallow copy
	claim, err := p.claimLister.PersistentVolumeClaims(namespace).Get(name)
	if err != nil {
		if apierrs.IsNotFound(err) {
			utilruntime.HandleError(fmt.Errorf("Item '%s' in work queue no longer exists", key))
			return nil
		}

		return err
	}

	return p.syncClaim(ctx, claim)
}

// syncClaim removes finalizers from a PVC, when cloning is finished
func (p *CloningProtectionController) syncClaim(ctx context.Context, claim *v1.PersistentVolumeClaim) error {
	if !checkFinalizer(claim, pvcCloneFinalizer) {
		return nil
	}

	// Checking for PVCs in the same namespace to have other states aside from Pending, which means that cloning is still in progress
	pvcList, err := p.claimLister.PersistentVolumeClaims(claim.Namespace).List(labels.Everything())
	if err != nil {
		return err
	}

	// Check for pvc state with DataSource pointing to claim
	for _, pvc := range pvcList {
		if pvc.Spec.DataSource == nil {
			continue
		}

		// Requeue when at least one PVC is still works on cloning
		if pvc.Spec.DataSource.Kind == pvcKind &&
			pvc.Spec.DataSource.Name == claim.Name &&
			pvc.Status.Phase == v1.ClaimPending {
			return fmt.Errorf("PVC '%s' is in 'Pending' state, cloning in progress", pvc.Name)
		}
	}

	// Remove clone finalizer
	finalizers := make([]string, 0)
	for _, finalizer := range claim.ObjectMeta.Finalizers {
		if finalizer != pvcCloneFinalizer {
			finalizers = append(finalizers, finalizer)
		}
	}
	claim.ObjectMeta.Finalizers = finalizers

	if _, err = p.client.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil {
		if !apierrs.IsNotFound(err) {
			// Couldn't remove finalizer and the object still exists, the controller may
			// try to remove the finalizer again on the next update
			klog.Infof("failed to remove clone finalizer from PVC %v", claim.Name)
			return err
		}
	}

	return nil
}

What you expected to happen:

deep copy before update()

How to reproduce it:

Anything else we need to know?:

Environment:

  • Driver version:
  • Kubernetes version (use kubectl version):
  • OS (e.g. from /etc/os-release):
  • Kernel (e.g. uname -a):
  • Install tools:
  • Others:

@pohly I'm facing an issue and would appreciate if you could take a look when you have a chance.

Here's a link to the relevant source code:

claim.ObjectMeta.Finalizers = finalizers
if _, err = p.client.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil {

@1978629634: your analysis seems correct to me, a DeepCopy seems appropriate here. Care to submit a PR?

Here's a link to the relevant source code:

claim.ObjectMeta.Finalizers = finalizers
if _, err = p.client.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil {

@1978629634: your analysis seems correct to me, a DeepCopy seems appropriate here. Care to submit a PR?

Sure, I'd be happy to submit a PR for that fix

/assign