// internal/controller/podcrash_controller.go

package controller

import (
	"context"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"

	configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
)

// PodCrashReconciler reconciles Pods to detect and handle CrashLoopBackOff state.
type PodCrashReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;delete
// +kubebuilder:rbac:groups=operator.andy.vendetti.ru,resources=nodetainterconfigs,verbs=get;list;watch
// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
// +kubebuilder:rbac:groups=apps,resources=replicasets,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch

func (r *PodCrashReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := log.FromContext(ctx).WithValues("pod", req.NamespacedName)

	var config configv1alpha1.NodeTainterConfig
	configKey := types.NamespacedName{Name: GlobalTaintConfigName}
	if err := r.Get(ctx, configKey, &config); err != nil {
		if !errors.IsNotFound(err) {
			log.Error(err, "Failed to get NodeTainterConfig for crash loop policy", "configName", GlobalTaintConfigName)
			return ctrl.Result{}, err // Requeue on real error
		}
		log.V(1).Info("Global NodeTainterConfig not found, crash loop handling skipped.", "configName", GlobalTaintConfigName)
		return ctrl.Result{}, nil
	}

	if config.Spec.CrashLoopPolicy == nil || !config.Spec.CrashLoopPolicy.Enabled {
		log.V(1).Info("Crash loop policy is disabled in NodeTainterConfig.")
		return ctrl.Result{}, nil
	}
	policy := config.Spec.CrashLoopPolicy

	if len(policy.MonitoredDeployments) == 0 {
		log.V(1).Info("No monitored deployments configured in CrashLoopPolicy.")
		return ctrl.Result{}, nil
	}
	monitoredSet := make(map[string]struct{}, len(policy.MonitoredDeployments))
	for _, item := range policy.MonitoredDeployments {
		monitoredSet[item] = struct{}{}
	}

	var pod corev1.Pod
	if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
		if errors.IsNotFound(err) {
			log.Info("Pod not found. Ignoring.")
			return ctrl.Result{}, nil
		}
		log.Error(err, "Failed to get Pod")
		return ctrl.Result{}, err // Requeue on error
	}

	ownerDeploymentName, isOwnedByMonitoredDeployment := r.getOwnerDeploymentIfMonitored(ctx, &pod, monitoredSet)
	if !isOwnedByMonitoredDeployment {
		log.V(1).Info("Pod is not owned by a monitored Deployment, skipping.")
		return ctrl.Result{}, nil
	}
	log = log.WithValues("deployment", ownerDeploymentName)

	podShouldBeDeleted := false
	var crashingContainerName string
	var restartCount int32

	for _, status := range pod.Status.ContainerStatuses {
		if status.State.Waiting != nil && status.State.Waiting.Reason == "CrashLoopBackOff" {
			if status.RestartCount >= policy.RestartThreshold {
				podShouldBeDeleted = true
				crashingContainerName = status.Name
				restartCount = status.RestartCount
				log.Info("Pod needs deletion due to CrashLoopBackOff threshold",
					"container", crashingContainerName,
					"restarts", restartCount,
					"threshold", policy.RestartThreshold)
				break
			} else {
				log.V(1).Info("Container in CrashLoopBackOff but restart count below threshold",
					"container", status.Name,
					"restarts", status.RestartCount,
					"threshold", policy.RestartThreshold)
			}
		}
	}

	if podShouldBeDeleted {
		log.Info("Deleting pod to attempt rescheduling", "reason", "CrashLoopBackOff threshold reached")
		err := r.Delete(ctx, &pod)
		if err != nil {
			if errors.IsNotFound(err) || errors.IsConflict(err) {
				log.Info("Pod likely already deleted or being deleted.")
				return ctrl.Result{}, nil
			}
			log.Error(err, "Failed to delete pod in CrashLoopBackOff")
			r.Recorder.Eventf(&pod, corev1.EventTypeWarning, "DeleteFailed",
				"Failed to delete pod (%s/%s) stuck in CrashLoopBackOff: %v", pod.Namespace, pod.Name, err)
			return ctrl.Result{}, err // Requeue on deletion error
		}
		log.Info("Pod deleted successfully.")
		r.Recorder.Eventf(&pod, corev1.EventTypeNormal, "PodDeleted",
			"Deleted pod (%s/%s) stuck in CrashLoopBackOff (container: %s, restarts: %d)",
			pod.Namespace, pod.Name, crashingContainerName, restartCount)
	}

	return ctrl.Result{}, nil
}

func (r *PodCrashReconciler) getOwnerDeploymentIfMonitored(ctx context.Context, pod *corev1.Pod, monitoredSet map[string]struct{}) (string, bool) {
	log := log.FromContext(ctx).WithValues("pod", client.ObjectKeyFromObject(pod))

	rsOwnerRef := metav1.GetControllerOf(pod)
	if rsOwnerRef == nil || rsOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || rsOwnerRef.Kind != "ReplicaSet" {
		return "", false
	}

	var rs appsv1.ReplicaSet
	rsKey := types.NamespacedName{Namespace: pod.Namespace, Name: rsOwnerRef.Name}
	if err := r.Get(ctx, rsKey, &rs); err != nil {
		log.V(1).Error(err, "Failed to get owner ReplicaSet", "replicaset", rsKey)
		return "", false
	}

	depOwnerRef := metav1.GetControllerOf(&rs)
	if depOwnerRef == nil || depOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || depOwnerRef.Kind != "Deployment" {
		return "", false
	}

	deploymentName := fmt.Sprintf("%s/%s", pod.Namespace, depOwnerRef.Name)
	if _, exists := monitoredSet[deploymentName]; exists {
		return deploymentName, true
	}
	return deploymentName, false
}

// SetupWithManager sets up the controller with the Manager.
func (r *PodCrashReconciler) SetupWithManager(mgr ctrl.Manager) error {
	r.Recorder = mgr.GetEventRecorderFor("podcrash-controller")
	return ctrl.NewControllerManagedBy(mgr).
		Named("podcrash").
		For(&corev1.Pod{}).
		// Watches(
		// 	&configv1alpha1.NodeTainterConfig{},
		// 	handler.EnqueueRequestsFromMapFunc(r.mapConfigToPods),
		// ).
		Complete(r)
}