一、几个存储的概念

1. pv

A PersistentVolume (PV) is a piece of storage in the cluster that has been provisioned by an administrator or dynamically provisioned using a StorageClass.

2. pvc

A PersistentVolumeClaim (PVC) is a request for storage by a user.

3. storageclass

A StorageClass provides a way for administrators to describe the "classes" of storage they offer.

pv 支持两种方式的 provision,静态和动态

3.1 Static

A cluster administrator creates a number of PVs. 管理员先创建一堆 PVs,然后提供给 PVC 绑定

3.2 Dynamic

Based on StorageClasses: when none of the static PVs the administrator created matches a user’s PersistentVolumeClaim, the cluster may try to dynamically provision a volume specially for the PVC.

二、StorageClass 是怎么完成 Dynamic provisioning 的

# k get sc rbd -o yaml --export=true
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: rbd
  selfLink: /apis/storage.k8s.io/v1/storageclasses/rbd
parameters:
  adminId: ***
  adminSecretName: ***
  adminSecretNamespace: ***
  imageFeatures: ***
  imageFormat: "***"
  monitors: ***
  pool: ***
  userId: ***
  userSecretName: ***
  userSecretNamespace: ***
provisioner: ceph.com/rbd
reclaimPolicy: Delete

我们来重点关注这个 provisioner

在代码 https://github.com/kubernetes-incubator/external-storage/blob/v5.1.0/ceph/rbd/pkg/provision/provision.go#L114 中我们可以看到

// Build the PersistentVolume object that describes the RBD-backed volume.
pv := &v1.PersistentVolume{
	ObjectMeta: metav1.ObjectMeta{
		// The PV name is taken from the provisioning options.
		Name: options.PVName,
		Annotations: map[string]string{
			// Presumably tags the PV with the identity of the provisioner
			// instance that created it — confirm against the provisioner code.
			provisionerIDAnn: p.identity,
		},
	},
	Spec: v1.PersistentVolumeSpec{
		// Reclaim policy, access modes and mount options are copied from the
		// provisioning options / the claim's spec.
		PersistentVolumeReclaimPolicy: options.PersistentVolumeReclaimPolicy,
		AccessModes:                   options.PVC.Spec.AccessModes,
		MountOptions:                  options.MountOptions,
		Capacity: v1.ResourceList{
			// Capacity is expressed in Mi, formatted from sizeMB.
			v1.ResourceName(v1.ResourceStorage): resource.MustParse(fmt.Sprintf("%dMi", sizeMB)),
		},
		// The volume source is RBD; rbd is built outside this excerpt.
		PersistentVolumeSource: v1.PersistentVolumeSource{
			RBD: rbd,
		},
	},
}

我们的 provisioner 会根据请求自动创建一个 pv 然后返回

三、pv 的生命周期

  • Pending 不可用
  • Available 可用
  • Bound 已绑定
  • Released pvc 已经删除,要先回收才能再次使用
  • Failed 不能正确回收,或者在 pvc 释放之后不能删除

四、三种PV的访问模式

  • ReadWriteOnce:是最基本的方式,可读可写,但只支持被单个节点(Node)挂载;同一节点上的多个 Pod 仍可同时使用该卷。
  • ReadOnlyMany:可以以只读的方式被多个节点挂载。
  • ReadWriteMany:这种存储可以以读写的方式被多个节点挂载。

五、PersistentVolumeController 部分代码分析

// provisionClaimOperation provisions a volume. This method is running in
// standalone goroutine and already has all necessary locks.
//
// claimObj is expected to be a *v1.PersistentVolumeClaim; any other type is
// logged as an error and ignored. Nothing is returned: failures are logged
// and, in most cases, recorded as events on the claim.
//
// Overall flow:
//  1. Look up the volume plugin and StorageClass for the claim.
//  2. Save the provisioner annotation on the claim (setClaimProvisioner).
//  3. If there is no internal plugin, emit an ExternalProvisioning event and
//     return, leaving the work to an external provisioner or an administrator.
//  4. Otherwise, unless a PV with the deterministic name already exists, call
//     the plugin's provisioner, bind the resulting PV to the claim and create
//     the PV object, retrying up to createProvisionedPVRetryCount times.
//  5. If the PV object cannot be saved, try to delete the provisioned storage
//     asset again.
func (ctrl *PersistentVolumeController) provisionClaimOperation(claimObj interface{}) {
	// Type-assert the argument to the claim (PVC) being provisioned.
	claim, ok := claimObj.(*v1.PersistentVolumeClaim)
	if !ok {
		glog.Errorf("Cannot convert provisionClaimOperation argument to claim, got %#v", claimObj)
		return
	}

	// Read the StorageClass name requested by the claim (presumably from its
	// annotation or spec; the helper is not shown here — verify).
	claimClass := v1helper.GetPersistentVolumeClaimClass(claim)
	glog.V(4).Infof("provisionClaimOperation [%s] started, class: %q", claimToClaimKey(claim), claimClass)

	// Look up the volume plugin and the StorageClass object for this claim.
	plugin, storageClass, err := ctrl.findProvisionablePlugin(claim)
	if err != nil {
		ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, err.Error())
		glog.V(2).Infof("error finding provisioning plugin for claim %s: %v", claimToClaimKey(claim), err)
		// The controller will retry provisioning the volume in every
		// syncVolume() call.
		return
	}

	// Add provisioner annotation so external provisioners know when to start
	// (setClaimProvisioner is not shown here; per the comment above it stores
	// the provisioner in the claim's annotations — verify).
	newClaim, err := ctrl.setClaimProvisioner(claim, storageClass)
	if err != nil {
		// Save failed, the controller will retry in the next sync
		glog.V(2).Infof("error saving claim %s: %v", claimToClaimKey(claim), err)
		return
	}
	// Continue with the claim object returned by the save.
	claim = newClaim

	if plugin == nil {
		// findProvisionablePlugin returned no error nor plugin.
		// This means that an unknown provisioner is requested. Report an event
		// and wait for the external provisioner
		msg := fmt.Sprintf("waiting for a volume to be created, either by external provisioner %q or manually created by system administrator", storageClass.Provisioner)
		ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.ExternalProvisioning, msg)
		glog.V(3).Infof("provisioning claim %q: %s", claimToClaimKey(claim), msg)
		return
	}

	// internal provisioning

	//  A previous doProvisionClaim may just have finished while we were waiting for
	//  the locks. Check that PV (with deterministic name) hasn't been provisioned
	//  yet.
	// Presumably pvName is derived from the claim UID (e.g. "pvc-" + UID); the
	// helper is not shown here — verify.
	pvName := ctrl.getProvisionedVolumeNameForClaim(claim)
	volume, err := ctrl.kubeClient.CoreV1().PersistentVolumes().Get(pvName, metav1.GetOptions{})
	// The PV already exists: return without provisioning again. Note that any
	// Get error (not only "not found") falls through to provisioning below.
	if err == nil && volume != nil {
		// Volume has been already provisioned, nothing to do.
		glog.V(4).Infof("provisionClaimOperation [%s]: volume already exists, skipping", claimToClaimKey(claim))
		return
	}

	// Prepare a claimRef to the claim early (to fail before a volume is
	// provisioned)
	claimRef, err := ref.GetReference(scheme.Scheme, claim)
	if err != nil {
		glog.V(3).Infof("unexpected error getting claim reference: %v", err)
		return
	}

	// Gather provisioning options
	// The tags record the claim namespace, claim name and PV name; they are
	// handed to the plugin through options.CloudTags.
	tags := make(map[string]string)
	tags[CloudVolumeCreatedForClaimNamespaceTag] = claim.Namespace
	tags[CloudVolumeCreatedForClaimNameTag] = claim.Name
	tags[CloudVolumeCreatedForVolumeNameTag] = pvName

	// NOTE(review): ReclaimPolicy is dereferenced without a nil check —
	// presumably it is always defaulted before reaching here; confirm.
	options := vol.VolumeOptions{
		PersistentVolumeReclaimPolicy: *storageClass.ReclaimPolicy,
		MountOptions:                  storageClass.MountOptions,
		CloudTags:                     &tags,
		ClusterName:                   ctrl.clusterName,
		PVName:                        pvName,
		PVC:                           claim,
		Parameters:                    storageClass.Parameters,
	}

	// Refuse to provision if the plugin doesn't support mount options, creation
	// of PV would be rejected by validation anyway
	if !plugin.SupportsMountOption() && len(options.MountOptions) > 0 {
		strerr := fmt.Sprintf("Mount options are not supported by the provisioner but StorageClass %q has mount options %v", storageClass.Name, options.MountOptions)
		glog.V(2).Infof("Mount options are not supported by the provisioner but claim %q's StorageClass %q has mount options %v", claimToClaimKey(claim), storageClass.Name, options.MountOptions)
		ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
		return
	}

	// Provision the volume
	provisioner, err := plugin.NewProvisioner(options)
	if err != nil {
		strerr := fmt.Sprintf("Failed to create provisioner: %v", err)
		glog.V(2).Infof("failed to create provisioner for claim %q with StorageClass %q: %v", claimToClaimKey(claim), storageClass.Name, err)
		ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
		return
	}

	// opComplete is invoked with the Provision result right after the call.
	opComplete := util.OperationCompleteHook(plugin.GetPluginName(), "volume_provision")
	// For an example Provision implementation, see the external ceph rbd
	// provisioner:
	// https://github.com/kubernetes-incubator/external-storage/blob/v5.1.0/ceph/rbd/pkg/provision/provision.go#L114
	volume, err = provisioner.Provision()
	opComplete(err)
	if err != nil {
		strerr := fmt.Sprintf("Failed to provision volume with StorageClass %q: %v", storageClass.Name, err)
		glog.V(2).Infof("failed to provision volume for claim %q with StorageClass %q: %v", claimToClaimKey(claim), storageClass.Name, err)
		ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)
		return
	}

	glog.V(3).Infof("volume %q for claim %q created", volume.Name, claimToClaimKey(claim))

	// Create Kubernetes PV object for the volume.
	// Fall back to the deterministic name if the plugin did not set one.
	if volume.Name == "" {
		volume.Name = pvName
	}
	// Bind it to the claim
	// Fill in the PV fields that tie it to this claim: the claim reference,
	// the Bound phase and the claim's StorageClass name.
	volume.Spec.ClaimRef = claimRef
	volume.Status.Phase = v1.VolumeBound
	volume.Spec.StorageClassName = claimClass

	// Add annBoundByController (used in deleting the volume)
	metav1.SetMetaDataAnnotation(&volume.ObjectMeta, annBoundByController, "yes")
	metav1.SetMetaDataAnnotation(&volume.ObjectMeta, annDynamicallyProvisioned, plugin.GetPluginName())

	// Try to create the PV object several times
	for i := 0; i < ctrl.createProvisionedPVRetryCount; i++ {
		glog.V(4).Infof("provisionClaimOperation [%s]: trying to save volume %s", claimToClaimKey(claim), volume.Name)
		var newVol *v1.PersistentVolume
		// Both a successful create and an "already exists" error end the
		// retry loop; err is assigned to the outer variable so the check
		// after the loop sees the final outcome.
		if newVol, err = ctrl.kubeClient.CoreV1().PersistentVolumes().Create(volume); err == nil || apierrs.IsAlreadyExists(err) {
			// Save succeeded.
			if err != nil {
				glog.V(3).Infof("volume %q for claim %q already exists, reusing", volume.Name, claimToClaimKey(claim))
				// Clear the error so the already-existing PV counts as success.
				err = nil
			} else {
				glog.V(3).Infof("volume %q for claim %q saved", volume.Name, claimToClaimKey(claim))

				_, updateErr := ctrl.storeVolumeUpdate(newVol)
				if updateErr != nil {
					// We will get an "volume added" event soon, this is not a big error
					glog.V(4).Infof("provisionClaimOperation [%s]: cannot update internal cache: %v", volume.Name, updateErr)
				}
			}
			break
		}
		// Save failed, try again after a while.
		glog.V(3).Infof("failed to save volume %q for claim %q: %v", volume.Name, claimToClaimKey(claim), err)
		time.Sleep(ctrl.createProvisionedPVInterval)
	}

	if err != nil {
		// Save failed. Now we have a storage asset outside of Kubernetes,
		// but we don't have appropriate PV object for it.
		// Emit some event here and try to delete the storage asset several
		// times.
		strerr := fmt.Sprintf("Error creating provisioned PV object for claim %s: %v. Deleting the volume.", claimToClaimKey(claim), err)
		glog.V(3).Info(strerr)
		ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningFailed, strerr)

		var deleteErr error
		var deleted bool
		for i := 0; i < ctrl.createProvisionedPVRetryCount; i++ {
			deleted, deleteErr = ctrl.doDeleteVolume(volume)
			if deleteErr == nil && deleted {
				// Delete succeeded
				glog.V(4).Infof("provisionClaimOperation [%s]: cleaning volume %s succeeded", claimToClaimKey(claim), volume.Name)
				break
			}
			// NOTE(review): this branch is taken whenever deleted is false,
			// including when doDeleteVolume also returned an error, so the
			// retry below only happens for deleted == true with a non-nil
			// deleteErr — confirm this matches doDeleteVolume's contract.
			if !deleted {
				// This is unreachable code, the volume was provisioned by an
				// internal plugin and therefore there MUST be an internal
				// plugin that deletes it.
				glog.Errorf("Error finding internal deleter for volume plugin %q", plugin.GetPluginName())
				break
			}
			// Delete failed, try again after a while.
			glog.V(3).Infof("failed to delete volume %q: %v", volume.Name, deleteErr)
			time.Sleep(ctrl.createProvisionedPVInterval)
		}

		if deleteErr != nil {
			// Delete failed several times. There is an orphaned volume and there
			// is nothing we can do about it.
			strerr := fmt.Sprintf("Error cleaning provisioned volume for claim %s: %v. Please delete manually.", claimToClaimKey(claim), deleteErr)
			glog.V(2).Info(strerr)
			ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, events.ProvisioningCleanupFailed, strerr)
		}
	} else {
		// The PV object was saved (or already existed): report success.
		glog.V(2).Infof("volume %q provisioned for claim %q", volume.Name, claimToClaimKey(claim))
		msg := fmt.Sprintf("Successfully provisioned volume %s using %s", volume.Name, plugin.GetPluginName())
		ctrl.eventRecorder.Event(claim, v1.EventTypeNormal, events.ProvisioningSucceeded, msg)
	}
}