pod autoremoval feature added
internal/controller/podcrash_controller.go (new file, 163 lines)
@@ -0,0 +1,163 @@
// internal/controller/podcrash_controller.go

package controller

import (
	"context"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"

	configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
)
// PodCrashReconciler reconciles Pods to detect and handle CrashLoopBackOff state.
type PodCrashReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder
}

// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;delete
// +kubebuilder:rbac:groups=operator.andy.vendetti.ru,resources=nodetainterconfigs,verbs=get;list;watch
// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
// +kubebuilder:rbac:groups=apps,resources=replicasets,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch
func (r *PodCrashReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := log.FromContext(ctx).WithValues("pod", req.NamespacedName)

	// Fetch the global NodeTainterConfig that carries the crash loop policy.
	var config configv1alpha1.NodeTainterConfig
	configKey := types.NamespacedName{Name: GlobalTaintConfigName}
	if err := r.Get(ctx, configKey, &config); err != nil {
		if !errors.IsNotFound(err) {
			log.Error(err, "Failed to get NodeTainterConfig for crash loop policy", "configName", GlobalTaintConfigName)
			return ctrl.Result{}, err // Requeue on real error
		}
		log.V(1).Info("Global NodeTainterConfig not found, crash loop handling skipped.", "configName", GlobalTaintConfigName)
		return ctrl.Result{}, nil
	}

	if config.Spec.CrashLoopPolicy == nil || !config.Spec.CrashLoopPolicy.Enabled {
		log.V(1).Info("Crash loop policy is disabled in NodeTainterConfig.")
		return ctrl.Result{}, nil
	}

	policy := config.Spec.CrashLoopPolicy
	if len(policy.MonitoredDeployments) == 0 {
		log.V(1).Info("No monitored deployments configured in CrashLoopPolicy.")
		return ctrl.Result{}, nil
	}

	// Build a lookup set of "namespace/name" Deployment keys for fast membership checks.
	monitoredSet := make(map[string]struct{}, len(policy.MonitoredDeployments))
	for _, item := range policy.MonitoredDeployments {
		monitoredSet[item] = struct{}{}
	}

	var pod corev1.Pod
	if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
		if errors.IsNotFound(err) {
			log.Info("Pod not found. Ignoring.")
			return ctrl.Result{}, nil
		}
		log.Error(err, "Failed to get Pod")
		return ctrl.Result{}, err // Requeue on error
	}

	ownerDeploymentName, isOwnedByMonitoredDeployment := r.getOwnerDeploymentIfMonitored(ctx, &pod, monitoredSet)
	if !isOwnedByMonitoredDeployment {
		log.V(1).Info("Pod is not owned by a monitored Deployment, skipping.")
		return ctrl.Result{}, nil
	}
	log = log.WithValues("deployment", ownerDeploymentName)

	// Check container statuses for a CrashLoopBackOff that has crossed the restart threshold.
	podShouldBeDeleted := false
	var crashingContainerName string
	var restartCount int32

	for _, status := range pod.Status.ContainerStatuses {
		if status.State.Waiting != nil && status.State.Waiting.Reason == "CrashLoopBackOff" {
			if status.RestartCount >= policy.RestartThreshold {
				podShouldBeDeleted = true
				crashingContainerName = status.Name
				restartCount = status.RestartCount
				log.Info("Pod needs deletion due to CrashLoopBackOff threshold",
					"container", crashingContainerName,
					"restarts", restartCount,
					"threshold", policy.RestartThreshold)
				break
			}
			log.V(1).Info("Container in CrashLoopBackOff but restart count below threshold",
				"container", status.Name,
				"restarts", status.RestartCount,
				"threshold", policy.RestartThreshold)
		}
	}

	if podShouldBeDeleted {
		log.Info("Deleting pod to attempt rescheduling", "reason", "CrashLoopBackOff threshold reached")
		if err := r.Delete(ctx, &pod); err != nil {
			if errors.IsNotFound(err) || errors.IsConflict(err) {
				log.Info("Pod likely already deleted or being deleted.")
				return ctrl.Result{}, nil
			}
			log.Error(err, "Failed to delete pod in CrashLoopBackOff")
			r.Recorder.Eventf(&pod, corev1.EventTypeWarning, "DeleteFailed", "Failed to delete pod (%s/%s) stuck in CrashLoopBackOff: %v", pod.Namespace, pod.Name, err)
			return ctrl.Result{}, err // Requeue on deletion error
		}
		log.Info("Pod deleted successfully.")
		r.Recorder.Eventf(&pod, corev1.EventTypeNormal, "PodDeleted", "Deleted pod (%s/%s) stuck in CrashLoopBackOff (container: %s, restarts: %d)", pod.Namespace, pod.Name, crashingContainerName, restartCount)
	}

	return ctrl.Result{}, nil
}
// getOwnerDeploymentIfMonitored resolves the Deployment that owns the pod (via its
// ReplicaSet) and reports whether that Deployment is in the monitored set. The
// returned name is in "namespace/name" form.
func (r *PodCrashReconciler) getOwnerDeploymentIfMonitored(ctx context.Context, pod *corev1.Pod, monitoredSet map[string]struct{}) (string, bool) {
	log := log.FromContext(ctx).WithValues("pod", client.ObjectKeyFromObject(pod))

	rsOwnerRef := metav1.GetControllerOf(pod)
	if rsOwnerRef == nil || rsOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || rsOwnerRef.Kind != "ReplicaSet" {
		return "", false
	}

	var rs appsv1.ReplicaSet
	rsKey := types.NamespacedName{Namespace: pod.Namespace, Name: rsOwnerRef.Name}
	if err := r.Get(ctx, rsKey, &rs); err != nil {
		log.V(1).Error(err, "Failed to get owner ReplicaSet", "replicaset", rsKey)
		return "", false
	}

	depOwnerRef := metav1.GetControllerOf(&rs)
	if depOwnerRef == nil || depOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || depOwnerRef.Kind != "Deployment" {
		return "", false
	}

	deploymentName := fmt.Sprintf("%s/%s", pod.Namespace, depOwnerRef.Name)
	if _, exists := monitoredSet[deploymentName]; exists {
		return deploymentName, true
	}

	return deploymentName, false
}
// SetupWithManager sets up the controller with the Manager.
func (r *PodCrashReconciler) SetupWithManager(mgr ctrl.Manager) error {
	r.Recorder = mgr.GetEventRecorderFor("podcrash-controller")

	return ctrl.NewControllerManagedBy(mgr).
		Named("podcrash").
		For(&corev1.Pod{}).
		// Watches(
		// 	&configv1alpha1.NodeTainterConfig{},
		// 	handler.EnqueueRequestsFromMapFunc(r.mapConfigToPods),
		// ).
		Complete(r)
}
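The controller reads config.Spec.CrashLoopPolicy, but the api/v1alpha1 type itself is not part of this diff. A minimal sketch of the fields the code above assumes (field names and JSON tags are illustrative, not the committed definition):

// Sketch of the CrashLoopPolicy fields assumed by podcrash_controller.go;
// the authoritative definition lives in api/v1alpha1 and is not in this commit.
type CrashLoopPolicy struct {
	// Enabled toggles crash loop handling globally.
	Enabled bool `json:"enabled"`

	// RestartThreshold is the container restart count at or above which a
	// CrashLoopBackOff pod is deleted.
	RestartThreshold int32 `json:"restartThreshold"`

	// MonitoredDeployments lists Deployments in "namespace/name" form; only
	// pods owned by these Deployments are eligible for deletion.
	MonitoredDeployments []string `json:"monitoredDeployments,omitempty"`
}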
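SetupWithManager leaves a Watches(...) on NodeTainterConfig commented out, referencing an r.mapConfigToPods helper that is not in this file. A rough sketch of what such a mapping function might look like (the name, and the choice to enqueue every Pod on a policy change, are assumptions; it would also need the handler and reconcile packages from controller-runtime imported):

// Hypothetical mapping function for the commented-out Watches call; it enqueues
// every Pod so a policy change is re-evaluated against the whole cluster.
func (r *PodCrashReconciler) mapConfigToPods(ctx context.Context, obj client.Object) []reconcile.Request {
	var pods corev1.PodList
	if err := r.List(ctx, &pods); err != nil {
		return nil
	}
	requests := make([]reconcile.Request, 0, len(pods.Items))
	for i := range pods.Items {
		requests = append(requests, reconcile.Request{
			NamespacedName: client.ObjectKeyFromObject(&pods.Items[i]),
		})
	}
	return requests
}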
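The commit also does not include the cmd/main.go change that registers the reconciler. A hypothetical wiring snippet, assuming the usual kubebuilder scaffolding (mgr, setupLog, os.Exit); Recorder is populated inside SetupWithManager, so it does not need to be set here:

// Hypothetical registration in cmd/main.go (not part of this commit).
if err := (&controller.PodCrashReconciler{
	Client: mgr.GetClient(),
	Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
	setupLog.Error(err, "unable to create controller", "controller", "PodCrash")
	os.Exit(1)
}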