- 创建一个作业,将作业的重启策略设置为Never;
apiVersion: batch/v1
kind: Job
description: ''
enable: true
name: test-created
namespace: default
lables: {}
completions: 1
parallelism: 1
activeDeadlineSeconds: null
enable: true
name: test-created
app: test-created
version: v1
- name: container-1
image: ubuntu:xenial-20190610
imagePullPolicy: IfNotPresent
- /bin/bash
- '-c'
- sleep 60
resources: {}
- name: default-secret
restartPolicy: Never
volumes: []
- name: single-request-reopen
initContainers: []
completionMode: NonIndexed
- 将pod调度到Node A,然后Node A重启。重启后,pod一直处于Pending(挂起)状态。查看kubelet日志,只能看到 "No ready sandbox for pod can be found. Need to start a new one" pod="default/test-created-w2bk5"
经过代码审查,发现该问题出现在容器已创建成功但尚未启动(the container is created successfully, but it is not started)的情况下:
2.1 在重启后,pod的沙箱已退出,因此PodSandboxChanged返回true,并将新的尝试计数(attempt)设置为1;kubelet日志 "No ready sandbox for pod can be found. Need to start a new one" 可以确认这一点。
// PodSandboxChanged checks whether the spec of the pod is changed and returns
// (changed, new attempt, original sandboxID if exist).
//
// When podStatus carries no sandbox statuses the result is (true, 0, "").
// In every other case the returned attempt and sandbox ID are derived from
// podStatus.SandboxStatuses[0]: a "changed" result reports that sandbox's
// Metadata.Attempt + 1, and the unchanged result reports its current
// Metadata.Attempt together with its Id.
func PodSandboxChanged(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, uint32, string) {
if len(podStatus.SandboxStatuses) == 0 {
klog.V(2).InfoS("No sandbox for pod can be found. Need to start a new one", "pod", klog.KObj(pod))
return true, 0, ""
readySandboxCount := 0
for _, s := range podStatus.SandboxStatuses {
if s.State == runtimeapi.PodSandboxState_SANDBOX_READY {
// Needs to create a new sandbox when readySandboxCount > 1 or the ready sandbox is not the latest one.
sandboxStatus := podStatus.SandboxStatuses[0]
if readySandboxCount > 1 {
klog.V(2).InfoS("Multiple sandboxes are ready for Pod. Need to reconcile them", "pod", klog.KObj(pod))
return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
// This is the branch that emits the "No ready sandbox ..." log line: the
// first sandbox status exists but is not in the READY state.
if sandboxStatus.State != runtimeapi.PodSandboxState_SANDBOX_READY {
klog.V(2).InfoS("No ready sandbox for pod can be found. Need to start a new one", "pod", klog.KObj(pod))
return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
// Needs to create a new sandbox when network namespace changed.
// Note: unlike the other "changed" branches, this one returns an empty sandbox ID.
if sandboxStatus.GetLinux().GetNamespaces().GetOptions().GetNetwork() != NetworkNamespaceForPod(pod) {
klog.V(2).InfoS("Sandbox for pod has changed. Need to start a new one", "pod", klog.KObj(pod))
return true, sandboxStatus.Metadata.Attempt + 1, ""
// Needs to create a new sandbox when the sandbox does not have an IP address.
// Host-network pods are exempt from this check, and it only applies when
// the sandbox reports a non-nil Network.
if !kubecontainer.IsHostNetworkPod(pod) && sandboxStatus.Network != nil && sandboxStatus.Network.Ip == "" {
klog.V(2).InfoS("Sandbox for pod has no IP address. Need to start a new one", "pod", klog.KObj(pod))
return true, sandboxStatus.Metadata.Attempt + 1, sandboxStatus.Id
return false, sandboxStatus.Metadata.Attempt, sandboxStatus.Id
2.2 在重启后,kubelet从cri获取容器状态,容器处于ContainerStateCreated状态(表示已创建(例如使用docker create)但尚未启动的容器)。现在kubelet认为在此情况下,它不应该为已经完成的pod创建沙箱;但是容器只是创建了但尚未启动,我们应该恢复这个pod吗?
// If we need to (re-)create the pod sandbox, everything will need to be
// killed and recreated, and init containers should be purged.
if createPodSandbox {
if !shouldRestartOnFailure(pod) && attempt != 0 && len(podStatus.ContainerStatuses) != 0 {
// Should not restart the pod, just return.
// we should not create a sandbox for a pod if it is already done.
// if all containers are done and should not be started, there is no need to create a new sandbox.
// this stops confusing logs on pods whose containers all have exit codes, but we recreate a sandbox before terminating it.
// If ContainerStatuses is empty, we assume that we've never
// successfully created any containers. In this case, we should
// retry creating the sandbox.
changes.CreateSandbox = false
return changes
// Get the containers to start, excluding the ones that succeeded if RestartPolicy is OnFailure.
var containersToStart []int
for idx, c := range pod.Spec.Containers {
if pod.Spec.RestartPolicy == v1.RestartPolicyOnFailure && containerSucceeded(&c, podStatus) {
containersToStart = append(containersToStart, idx)
// We should not create a sandbox for a Pod if initialization is done and there is no container to start.
if len(containersToStart) == 0 {
_, _, done := findNextInitContainerToRun(pod, podStatus)
if done {
changes.CreateSandbox = false
return changes
if len(pod.Spec.InitContainers) != 0 {
// Pod has init containers, return the first one.
changes.NextInitContainerToStart = &pod.Spec.InitContainers[0]
return changes
changes.ContainersToStart = containersToStart
return changes
2.3 kubelet生成容器状态时,kubecontainer.ContainerStateCreated会被转换为ContainerStateWaiting(该行为由提交 "kubelet: If the container status is created, we are waiting" 引入),而ContainerStateWaiting可能导致pod阶段变为Pending。
// convertContainerStatus builds a v1.ContainerStatus from the runtime-reported
// status cs. oldStatus is only consulted in the ContainerStateUnknown case
// below, where a previously running container is reported as terminated.
convertContainerStatus := func(cs *kubecontainer.Status, oldStatus *v1.ContainerStatus) *v1.ContainerStatus {
cid := cs.ID.String()
status := &v1.ContainerStatus{
Name: cs.Name,
RestartCount: int32(cs.RestartCount),
Image: cs.Image,
ImageID: cs.ImageID,
ContainerID: cid,
switch {
case cs.State == kubecontainer.ContainerStateRunning:
status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
case cs.State == kubecontainer.ContainerStateCreated:
// containers that are created but not running are "waiting to be running"
status.State.Waiting = &v1.ContainerStateWaiting{}
case cs.State == kubecontainer.ContainerStateExited:
status.State.Terminated = &v1.ContainerStateTerminated{
ExitCode: int32(cs.ExitCode),
Reason: cs.Reason,
Message: cs.Message,
StartedAt: metav1.NewTime(cs.StartedAt),
FinishedAt: metav1.NewTime(cs.FinishedAt),
ContainerID: cid,
case cs.State == kubecontainer.ContainerStateUnknown &&
oldStatus != nil && // we have an old status
oldStatus.State.Running != nil: // our previous status was running
// if this happens, then we know that this container was previously running and isn't anymore (assuming the CRI isn't failing to return running containers).
// you can imagine this happening in cases where a container failed and the kubelet didn't ask about it in time to see the result.
// in this case, the container should not go into waiting state immediately because that can make cases like runonce pods actually run
// twice. "container never ran" is different than "container ran and failed". This is handled differently in the kubelet
// and it is handled differently in higher order logic like crashloop detection and handling
status.State.Terminated = &v1.ContainerStateTerminated{
Reason: "ContainerStatusUnknown",
Message: "The container could not be located when the pod was terminated",
ExitCode: 137, // this code indicates an error
// the restart count normally comes from the CRI (see near the top of this method), but since this is being added explicitly
// for the case where the CRI did not return a status, we need to manually increment the restart count to be accurate.
status.RestartCount = oldStatus.RestartCount + 1
// this collapses any unknown state to container waiting. If any container is waiting, then the pod status moves to pending even if it is running.
// if I'm reading this correctly, then any failure to read status on any container results in the entire pod going pending even if the containers
// are actually running.
// see https://github.com/kubernetes/kubernetes/blob/5d1b3e26af73dde33ecb6a3e69fb5876ceab192f/pkg/kubelet/kuberuntime/kuberuntime_container.go#L497 to
// https://github.com/kubernetes/kubernetes/blob/8976e3620f8963e72084971d9d4decbd026bf49f/pkg/kubelet/kuberuntime/helpers.go#L58-L71
// and interpreted here https://github.com/kubernetes/kubernetes/blob/b27e78f590a0d43e4a23ca3b2bf1739ca4c6e109/pkg/kubelet/kubelet_pods.go#L1434-L1439
status.State.Waiting = &v1.ContainerStateWaiting{}
return status
- 向kubelet添加一些伪代码以模拟此场景
if strings.Contains(pod.Name, "test-created") {
if _, err := m.osInterface.Stat("/var/lib/kubelet/test-created"); os.IsNotExist(err) {
klog.V(2).InfoS("==== restart kubelet")
err = m.runtimeService.StopPodSandbox(podSandboxID)
if err != nil {
klog.ErrorS(err, "== StopPodSandbox failed", "sandboxid", podSandboxID)
klog.V(2).InfoS("=== touch file")
klog.Fatalf("=== panic")
- 创建名为test-created的作业,将重启策略设置为Never
- 查看pod状态卡在pending,无法恢复
- 无响应
$ kubectl version
# paste output here
# On Linux:
$ cat /etc/os-release
# paste output here
$ uname -a
# paste output here
# On Windows:
C:\> wmic os get Caption, Version, BuildNumber, OSArchitecture
# paste output here
/sig node
@ehashman @SergeyKanzhelev
/cc @ffromani
/priority important-longerm
@SergeyKanzhelev: 以下标签无法应用:
/priority important-longerm
有关如何使用PR评论与我互动的说明,请参见此处。如果您对我的行为有任何疑问或建议,请在 kubernetes/test-infra 仓库中提交 issue。
@Dingshujie I tried reproducing the problem in
and the pod doesn't seem to get stuck in Pending
anymore; but does this resolve your current problem?
2.a [With Graceful Node Shutdown enabled] After rebooting the worker node and waiting for a while, the old pod ends up in an
and starts a new pod which ends up Completed. Zero Restarts.
2.b [Without Graceful Node Shutdown] After rebooting the worker node and waiting for a while, the old pod ends up with
and starts a new pod which ends up Completed. Zero Restarts.
Also a fix from the job config file since
is not a valid metadata
field:
What happened?