kubernetes: when the VM is rebooted, a pod (restartPolicy == Never) gets stuck in Pending

sz81bmfz posted 4 months ago in Kubernetes

What happened?

  1. Create a Job and set the Job's restart policy to Never;
apiVersion: batch/v1
kind: Job
metadata:
  annotations:
    description: ''
  enable: true
  name: test-created
  namespace: default
  lables: {}
spec:
  completions: 1
  parallelism: 1
  activeDeadlineSeconds: null
  template:
    metadata:
      enable: true
      name: test-created
      labels:
        app: test-created
        version: v1
    spec:
      containers:
        - name: container-1
          image: ubuntu:xenial-20190610
          imagePullPolicy: IfNotPresent
          command:
            - /bin/bash
            - '-c'
            - sleep 60
          resources: {}
      imagePullSecrets:
        - name: default-secret
      restartPolicy: Never
      volumes: []
      dnsConfig:
        options:
          - name: single-request-reopen
      initContainers: []
  completionMode: NonIndexed
  2. The pod is scheduled to Node A, and then Node A is rebooted. After the reboot, the pod stays in Pending. Looking at the kubelet log, we only see "No ready sandbox for pod can be found. Need to start a new one" pod="default/test-created-w2bk5"
    After reviewing the code, this problem can occur if the reboot happens while the container has been created successfully but has not yet been started;
    2.1 After the reboot, the pod's sandbox has exited, so PodSandboxChanged returns true and sets the new attempt count to 1; the kubelet log "No ready sandbox for pod can be found. Need to start a new one" confirms this (a simplified sketch of this branch follows the quoted code below).
// PodSandboxChanged checks whether the spec of the pod is changed and returns
// (changed, new attempt, original sandboxID if exist).
func PodSandboxChanged(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, uint32, string) {
	if len(podStatus.SandboxStatuses) == 0 {
		klog.V(2).InfoS("No sandbox for pod can be found. Need to start a new one", "pod", klog.KObj(pod))
		return true, 0, ""
	}

	readySandboxCount := 0
	for _, s := range podStatus.SandboxStatuses {
		if s.State == runtimeapi.PodSandboxState_SANDBOX_READY {
			readySandboxCount++
		}
	}

	// Needs to create a new sandbox when readySandboxCount > 1 or the ready sandbox is not the latest one.
	sandboxStatus := podStatus.SandboxStatuses[0]
	if readySandboxCount > 1 {
		klog.V(2).InfoS("Multiple sandboxes are ready for Pod. Need to reconcile them", "pod", klog.KObj(pod))
		return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
	}
	if sandboxStatus.State != runtimeapi.PodSandboxState_SANDBOX_READY {
		klog.V(2).InfoS("No ready sandbox for pod can be found. Need to start a new one", "pod", klog.KObj(pod))
		return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
	}

	// Needs to create a new sandbox when network namespace changed.
	if sandboxStatus.GetLinux().GetNamespaces().GetOptions().GetNetwork() != NetworkNamespaceForPod(pod) {
		klog.V(2).InfoS("Sandbox for pod has changed. Need to start a new one", "pod", klog.KObj(pod))
		return true, sandboxStatus.Metadata.Attempt + 1, ""
	}

	// Needs to create a new sandbox when the sandbox does not have an IP address.
	if !kubecontainer.IsHostNetworkPod(pod) && sandboxStatus.Network != nil && sandboxStatus.Network.Ip == "" {
		klog.V(2).InfoS("Sandbox for pod has no IP address. Need to start a new one", "pod", klog.KObj(pod))
		return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
	}

	return false, sandboxStatus.Metadata.Attempt, sandboxStatus.Id
}
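To make the branch that fires here easier to follow, here is a minimal, self-contained sketch in Go. The types and names (sandboxStatus, podSandboxChanged, etc.) are illustrative, not the kubelet's own, and only the single-sandbox case is modeled: after the reboot the only sandbox is no longer ready, so a new one is requested with attempt+1.

// sandbox_changed_sketch.go -- a simplified stand-in for the decision above;
// NOT the kubelet's real types, only the single-sandbox branch is modeled.
package main

import "fmt"

type sandboxState int

const (
	sandboxReady sandboxState = iota
	sandboxNotReady
)

type sandboxStatus struct {
	id      string
	attempt uint32
	state   sandboxState
}

// podSandboxChanged mirrors the quoted logic for the single-sandbox case:
// a missing sandbox starts attempt 0, a not-ready sandbox bumps the attempt.
func podSandboxChanged(sandboxes []sandboxStatus) (changed bool, attempt uint32, id string) {
	if len(sandboxes) == 0 {
		return true, 0, ""
	}
	latest := sandboxes[0]
	if latest.state != sandboxReady {
		// The branch hit after the node reboot: the old sandbox has exited,
		// so a new sandbox is requested with attempt+1.
		return true, latest.attempt + 1, latest.id
	}
	return false, latest.attempt, latest.id
}

func main() {
	// After the reboot the only sandbox (attempt 0) is NOTREADY.
	fmt.Println(podSandboxChanged([]sandboxStatus{{id: "sb-0", attempt: 0, state: sandboxNotReady}}))
	// Output: true 1 sb-0
}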

2.2 After the reboot, the kubelet fetches the container status from the CRI, and the container is in ContainerStateCreated (a container that has been created, e.g. with docker create, but has not yet been started). The kubelet now decides that it should not create a sandbox for a pod that is already done; but this container was only created and never started, so shouldn't the pod be recovered? (See the simplified sketch after the quoted code below.)

// If we need to (re-)create the pod sandbox, everything will need to be
	// killed and recreated, and init containers should be purged.
	if createPodSandbox {
		if !shouldRestartOnFailure(pod) && attempt != 0 && len(podStatus.ContainerStatuses) != 0 {
			// Should not restart the pod, just return.
			// we should not create a sandbox for a pod if it is already done.
			// if all containers are done and should not be started, there is no need to create a new sandbox.
			// this stops confusing logs on pods whose containers all have exit codes, but we recreate a sandbox before terminating it.
			//
			// If ContainerStatuses is empty, we assume that we've never
			// successfully created any containers. In this case, we should
			// retry creating the sandbox.
			changes.CreateSandbox = false
			return changes
		}

		// Get the containers to start, excluding the ones that succeeded if RestartPolicy is OnFailure.
		var containersToStart []int
		for idx, c := range pod.Spec.Containers {
			if pod.Spec.RestartPolicy == v1.RestartPolicyOnFailure && containerSucceeded(&c, podStatus) {
				continue
			}
			containersToStart = append(containersToStart, idx)
		}
		// We should not create a sandbox for a Pod if initialization is done and there is no container to start.
		if len(containersToStart) == 0 {
			_, _, done := findNextInitContainerToRun(pod, podStatus)
			if done {
				changes.CreateSandbox = false
				return changes
			}
		}

		if len(pod.Spec.InitContainers) != 0 {
			// Pod has init containers, return the first one.
			changes.NextInitContainerToStart = &pod.Spec.InitContainers[0]
			return changes
		}
		changes.ContainersToStart = containersToStart
		return changes
	}
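To illustrate why the created-but-never-started container trips this check, here is a minimal, self-contained sketch in Go. The names (shouldCreateSandbox, containerStatus, etc.) are illustrative, not the real computePodActions: with restartPolicy Never, any recorded container status, even one that is merely "created", is enough to skip recreating the sandbox.

// create_sandbox_sketch.go -- a simplified model of the quoted check;
// names and types are illustrative only.
package main

import "fmt"

type containerState int

const (
	stateCreated containerState = iota // created (e.g. "docker create") but never started
	stateRunning
	stateExited
)

type containerStatus struct{ state containerState }

// shouldCreateSandbox mirrors the quoted condition: with restartPolicy Never,
// a non-zero sandbox attempt, and *any* recorded container status, the kubelet
// decides the pod is "done" and does not recreate the sandbox -- even if the
// only status belongs to a container that was created but never started.
func shouldCreateSandbox(restartPolicyNever bool, attempt uint32, statuses []containerStatus) bool {
	if restartPolicyNever && attempt != 0 && len(statuses) != 0 {
		return false
	}
	return true
}

func main() {
	// attempt == 1 (see the sandbox sketch above) and one container stuck in
	// "created": no new sandbox is created, yet nothing ever ran.
	fmt.Println(shouldCreateSandbox(true, 1, []containerStatus{{state: stateCreated}})) // false
}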

2.3 The kubelet then generates the container status: kubecontainer.ContainerStateCreated is converted to ContainerStateWaiting (introduced by "kubelet: If the container status is created, we are waiting"), and ContainerStateWaiting can cause the pod phase to become Pending. (See the simplified sketch after the quoted code below.)

convertContainerStatus := func(cs *kubecontainer.Status, oldStatus *v1.ContainerStatus) *v1.ContainerStatus {
		cid := cs.ID.String()
		status := &v1.ContainerStatus{
			Name:         cs.Name,
			RestartCount: int32(cs.RestartCount),
			Image:        cs.Image,
			ImageID:      cs.ImageID,
			ContainerID:  cid,
		}
		switch {
		case cs.State == kubecontainer.ContainerStateRunning:
			status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
		case cs.State == kubecontainer.ContainerStateCreated:
			// containers that are created but not running are "waiting to be running"
			status.State.Waiting = &v1.ContainerStateWaiting{}
		case cs.State == kubecontainer.ContainerStateExited:
			status.State.Terminated = &v1.ContainerStateTerminated{
				ExitCode:    int32(cs.ExitCode),
				Reason:      cs.Reason,
				Message:     cs.Message,
				StartedAt:   metav1.NewTime(cs.StartedAt),
				FinishedAt:  metav1.NewTime(cs.FinishedAt),
				ContainerID: cid,
			}

		case cs.State == kubecontainer.ContainerStateUnknown &&
			oldStatus != nil && // we have an old status
			oldStatus.State.Running != nil: // our previous status was running
			// if this happens, then we know that this container was previously running and isn't anymore (assuming the CRI isn't failing to return running containers).
			// you can imagine this happening in cases where a container failed and the kubelet didn't ask about it in time to see the result.
			// in this case, the container should not to into waiting state immediately because that can make cases like runonce pods actually run
			// twice. "container never ran" is different than "container ran and failed".  This is handled differently in the kubelet
			// and it is handled differently in higher order logic like crashloop detection and handling
			status.State.Terminated = &v1.ContainerStateTerminated{
				Reason:   "ContainerStatusUnknown",
				Message:  "The container could not be located when the pod was terminated",
				ExitCode: 137, // this code indicates an error
			}
			// the restart count normally comes from the CRI (see near the top of this method), but since this is being added explicitly
			// for the case where the CRI did not return a status, we need to manually increment the restart count to be accurate.
			status.RestartCount = oldStatus.RestartCount + 1

		default:
			// this collapses any unknown state to container waiting.  If any container is waiting, then the pod status moves to pending even if it is running.
			// if I'm reading this correctly, then any failure to read status on any container results in the entire pod going pending even if the containers
			// are actually running.
			// see https://github.com/kubernetes/kubernetes/blob/5d1b3e26af73dde33ecb6a3e69fb5876ceab192f/pkg/kubelet/kuberuntime/kuberuntime_container.go#L497 to
			// https://github.com/kubernetes/kubernetes/blob/8976e3620f8963e72084971d9d4decbd026bf49f/pkg/kubelet/kuberuntime/helpers.go#L58-L71
			// and interpreted here https://github.com/kubernetes/kubernetes/blob/b27e78f590a0d43e4a23ca3b2bf1739ca4c6e109/pkg/kubelet/kubelet_pods.go#L1434-L1439
			status.State.Waiting = &v1.ContainerStateWaiting{}
		}
		return status
	}
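Finally, a minimal, self-contained sketch in Go of why the resulting Waiting status pins the pod phase at Pending. The names (podPhase, containerView) are illustrative; the real logic is getPhase in kubelet_pods.go, which, as the comment above notes, moves the pod to Pending whenever any container is waiting.

// pod_phase_sketch.go -- a heavily simplified stand-in for the kubelet's phase
// computation; illustrative only.
package main

import "fmt"

type containerView struct {
	waiting bool
	running bool
}

// podPhase: if any container is still Waiting, the pod reports Pending; it can
// only reach a final phase once every container has actually run and finished.
func podPhase(containers []containerView) string {
	for _, c := range containers {
		if c.waiting {
			return "Pending"
		}
	}
	for _, c := range containers {
		if c.running {
			return "Running"
		}
	}
	return "Succeeded"
}

func main() {
	// The created-but-never-started container is reported as Waiting forever,
	// so the pod never leaves Pending and the Job never completes.
	fmt.Println(podPhase([]containerView{{waiting: true}})) // Pending
}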

What did you expect to happen?

The pod reaches a final state (Succeeded or Failed).

How can we reproduce it (as minimally and precisely as possible)?

  1. Add some pseudo-code to the kubelet to simulate this scenario:
// Pseudo-code added to the kubelet: for pods whose name contains "test-created",
// stop the pod sandbox and crash the kubelet once, using a marker file so the
// fault is injected only on the first pass.
if strings.Contains(pod.Name, "test-created") {
	if _, err := m.osInterface.Stat("/var/lib/kubelet/test-created"); os.IsNotExist(err) {
		klog.V(2).InfoS("==== restart kubelet")
		m.osInterface.Create("/var/lib/kubelet/test-created")
		// Stop the sandbox so it is NOTREADY when the kubelet comes back.
		err = m.runtimeService.StopPodSandbox(podSandboxID)
		if err != nil {
			klog.ErrorS(err, "== StopPodSandbox failed", "sandboxid", podSandboxID)
		}
		klog.V(2).InfoS("=== touch file")
		// Crash the kubelet to simulate the node reboot.
		klog.Fatalf("=== panic")
	}
}
  2. Create a Job named test-created, with the restart policy set to Never
  3. Observe that the pod is stuck in Pending and cannot recover

Anything else we need to know?

  • No response

Kubernetes version

$ kubectl version
# paste output here

1.25

Cloud provider

Huawei Cloud

OS version

# On Linux:
$ cat /etc/os-release
# paste output here
$ uname -a
# paste output here

# On Windows:
C:\> wmic os get Caption, Version, BuildNumber, OSArchitecture
# paste output here

Ubuntu

Install tools

Container runtime (CRI) and version (if applicable)

Related plugins (CNI, CSI, ...) and versions (if applicable)

eanckbw9 2#

@ehashman @SergeyKanzhelev

gupuwyp2 4#

Looks like the issue has enough details to reproduce it.

ncecgwcz 5#

/priority important-longerm
Doesn't sound like a regression.

tf7tbtn2 6#

@SergeyKanzhelev: the label(s) (priority/important-longerm) cannot be applied, because the repository doesn't have them.
In response to this:
/priority important-longerm
Doesn't sound like a regression.
Instructions for interacting with me using PR comments are available here. If you have questions or suggestions about my behavior, please file an issue against the kubernetes/test-infra repository.

vd2z7a6w 7#

@Dingshujie, can you reproduce this issue on 1.27? Not sure if it would help, but I'm asking because a few pod lifecycle fixes went into 1.27, including #115331

kzipqqlq 8#

@Dingshujie I tried reproducing the problem in v1.29.0 and the pod doesn't seem to get stuck in Pending anymore; but does this resolve your current problem?

  1. Using the job config YAML from the issue description, I created a job in a single-worker-node cluster
# kubectl get no,po -o wide
NAME                             STATUS     ROLES           AGE   VERSION   INTERNAL-IP       EXTERNAL-IP   OS-IMAGE          KERNEL-VERSION          CONTAINER-RUNTIME
node/localhost.localdomain       Ready      control-plane   47m   v1.29.0   XXX.XXX.XXX.XXX   <none>        CentOS Stream 8   4.18.0-544.el8.x86_64   containerd://1.6.28
node/localmachine2.localdomain   Ready      <none>          45m   v1.29.0   XXX.XXX.XXX.XXX   <none>        CentOS Stream 8   4.18.0-544.el8.x86_64   containerd://1.6.28

NAME                     READY   STATUS    RESTARTS   AGE   IP            NODE                        NOMINATED NODE   READINESS GATES
pod/test-created-2mfq7   1/1     Running   0          6s    10.244.1.33   localmachine2.localdomain   <none>           <none>

2.a [With Graceful Node Shutdown enabled] After rebooting the worker node and waiting for a while, the old pod ends up in Error and the Job starts a new pod, which ends up Completed. Zero restarts.

NAME                     READY   STATUS      RESTARTS   AGE     IP            NODE                        NOMINATED NODE   READINESS GATES
pod/test-created-2mfq7   0/1     Error       0          10m     <none>        localmachine2.localdomain   <none>           <none>
pod/test-created-5g5cc   0/1     Completed   0          9m22s   10.244.1.34   localmachine2.localdomain   <none>           <none>

2.b [Without Graceful Node Shutdown] After rebooting the worker node and waiting for a while, the old pod ends up in Unknown and the Job starts a new pod, which ends up Completed. Zero restarts.

NAME                     READY   STATUS      RESTARTS   AGE     IP            NODE                        NOMINATED NODE   READINESS GATES
pod/test-created-7lx7h   0/1     Unknown     0          3m12s   <none>        localmachine2.localdomain   <none>           <none>
pod/test-created-bfh98   0/1     Completed   0          102s    10.244.1.39   localmachine2.localdomain   <none>           <none>

Also, a fix for the job config file, since enable is not a valid metadata field:

What happened?

  1. create a job, set job restartPolicy to Never;
apiVersion: batch/v1
kind: Job
metadata:
  annotations:
    description: ''
  # enable: true
   ...
  labels: {}  # lables: {}
spec:
  ...
  template:
    metadata:
      # enable: true
      ...

