Compare commits: 8192f888f7...main

3 commits:
- bad5b2ba1c
- 56b7b7a0b6
- 823a3a0a4d
@@ -4,7 +4,7 @@ A Kubernetes operator that can perform various actions:
 - Place taints on nodes based on their names.
 - Place default resource requests/limits on deployments that lack them (but you can also use exclusion words to ignore deployment requests/limits checks).
 - Upgrade deployment images based on a predefined list of periodically checked tags (such as :latest, :master, etc.).
-- Attempt to fix CrashLoopBackOff by recreating linked resources (such as secrets, if they have not updated from a third-party provider (e.g., Vault)).
+- Attempt to fix CrashLoopBackOff by deleting the pod (so it can be rescheduled, possibly onto another node) for deployments defined in the CRD config.

 Note: This is a college graduation project, so it may contain bugs and may not follow best practices.
@@ -63,6 +63,26 @@ type ImageUpdatePolicy struct {
     RestartAnnotation string `json:"restartAnnotation,omitempty"`
 }
 
+// CrashLoopPolicy defines the policy for handling pods in CrashLoopBackOff.
+type CrashLoopPolicy struct {
+    // Enabled toggles the CrashLoopBackOff handling feature.
+    // +optional
+    Enabled bool `json:"enabled,omitempty"`
+
+    // MonitoredDeployments is a list of Deployments (in "namespace/name" format)
+    // whose pods should be monitored for CrashLoopBackOff.
+    // +optional
+    MonitoredDeployments []string `json:"monitoredDeployments,omitempty"`
+
+    // RestartThreshold is the number of container restarts after which
+    // a pod in CrashLoopBackOff will be deleted to attempt rescheduling.
+    // Minimum recommended value: 3 or 5.
+    // +kubebuilder:validation:Minimum=1
+    // +kubebuilder:default=5
+    // +optional
+    RestartThreshold int32 `json:"restartThreshold,omitempty"`
+}
+
 // NodeTainterConfigSpec defines the desired state of NodeTainterConfig.
 type NodeTainterConfigSpec struct {
     // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
@@ -95,6 +115,9 @@ type NodeTainterConfigSpec struct {
 
     // +optional
     ImageUpdatePolicy *ImageUpdatePolicy `json:"imageUpdatePolicy,omitempty"`
+
+    // +optional
+    CrashLoopPolicy *CrashLoopPolicy `json:"crashLoopPolicy,omitempty"`
 }
 
 // NodeTainterConfigStatus defines the observed state of NodeTainterConfig.
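Taken together, the two hunks above add the CrashLoopPolicy type and wire it into NodeTainterConfigSpec. A minimal sketch of how the new fields might be populated from Go (the object name and the embedded ObjectMeta follow the usual kubebuilder scaffolding; both are assumptions, not shown in this diff):

package main

import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
)

func main() {
    // Hypothetical object; the controller actually fetches the config by
    // GlobalTaintConfigName, a constant defined elsewhere in the repo.
    cfg := configv1alpha1.NodeTainterConfig{
        ObjectMeta: metav1.ObjectMeta{Name: "nodetainterconfig-sample"},
        Spec: configv1alpha1.NodeTainterConfigSpec{
            CrashLoopPolicy: &configv1alpha1.CrashLoopPolicy{
                Enabled:              true,
                RestartThreshold:     5, // matches the +kubebuilder:default marker
                MonitoredDeployments: []string{"default/hello-updater-test"},
            },
        },
    }
    fmt.Printf("crash loop handling enabled: %v\n", cfg.Spec.CrashLoopPolicy.Enabled)
}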
@@ -25,6 +25,26 @@ import (
     runtime "k8s.io/apimachinery/pkg/runtime"
 )
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CrashLoopPolicy) DeepCopyInto(out *CrashLoopPolicy) {
+    *out = *in
+    if in.MonitoredDeployments != nil {
+        in, out := &in.MonitoredDeployments, &out.MonitoredDeployments
+        *out = make([]string, len(*in))
+        copy(*out, *in)
+    }
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CrashLoopPolicy.
+func (in *CrashLoopPolicy) DeepCopy() *CrashLoopPolicy {
+    if in == nil {
+        return nil
+    }
+    out := new(CrashLoopPolicy)
+    in.DeepCopyInto(out)
+    return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ImageUpdatePolicy) DeepCopyInto(out *ImageUpdatePolicy) {
     *out = *in
@@ -144,6 +164,11 @@ func (in *NodeTainterConfigSpec) DeepCopyInto(out *NodeTainterConfigSpec) {
         *out = new(ImageUpdatePolicy)
         (*in).DeepCopyInto(*out)
     }
+    if in.CrashLoopPolicy != nil {
+        in, out := &in.CrashLoopPolicy, &out.CrashLoopPolicy
+        *out = new(CrashLoopPolicy)
+        (*in).DeepCopyInto(*out)
+    }
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeTainterConfigSpec.
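The zz_generated.deepcopy.go additions are standard controller-gen output for the new type; the only non-trivial part is the element-wise copy of the MonitoredDeployments slice. A quick sanity-check sketch (hypothetical test, placed alongside the types in package v1alpha1):

package v1alpha1

import "testing"

// Verifies that DeepCopy does not share the MonitoredDeployments backing array.
func TestCrashLoopPolicyDeepCopy(t *testing.T) {
    in := &CrashLoopPolicy{
        Enabled:              true,
        RestartThreshold:     5,
        MonitoredDeployments: []string{"default/hello-updater-test"},
    }
    out := in.DeepCopy()
    out.MonitoredDeployments[0] = "other/name"
    if in.MonitoredDeployments[0] != "default/hello-updater-test" {
        t.Fatal("DeepCopy shared the MonitoredDeployments slice with the original")
    }
}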
@@ -230,6 +230,15 @@ func main() {
         os.Exit(1)
     }
 
+    if err = (&controller.PodCrashReconciler{
+        Client:   mgr.GetClient(),
+        Scheme:   mgr.GetScheme(),
+        Recorder: mgr.GetEventRecorderFor("podcrash-controller"),
+    }).SetupWithManager(mgr); err != nil {
+        setupLog.Error(err, "unable to create controller", "controller", "PodCrash")
+        os.Exit(1)
+    }
+
     // +kubebuilder:scaffold:builder
 
     if metricsCertWatcher != nil {
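Note: main.go sets Recorder when constructing PodCrashReconciler, and SetupWithManager (in the new controller file below) assigns it again from mgr.GetEventRecorderFor with the same name; the second assignment wins, so one of the two could be dropped.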
@@ -41,6 +41,30 @@ spec:
           spec:
             description: NodeTainterConfigSpec defines the desired state of NodeTainterConfig.
             properties:
+              crashLoopPolicy:
+                description: CrashLoopPolicy defines the policy for handling pods
+                  in CrashLoopBackOff.
+                properties:
+                  enabled:
+                    description: Enabled toggles the CrashLoopBackOff handling feature.
+                    type: boolean
+                  monitoredDeployments:
+                    description: |-
+                      MonitoredDeployments is a list of Deployments (in "namespace/name" format)
+                      whose pods should be monitored for CrashLoopBackOff.
+                    items:
+                      type: string
+                    type: array
+                  restartThreshold:
+                    default: 5
+                    description: |-
+                      RestartThreshold is the number of container restarts after which
+                      a pod in CrashLoopBackOff will be deleted to attempt rescheduling.
+                      Minimum recommended value: 3 or 5.
+                    format: int32
+                    minimum: 1
+                    type: integer
+                type: object
               imageUpdatePolicy:
                 description: ImageUpdatePolicy defines the policy for automatic image
                   updates.
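This CRD hunk is the controller-gen rendering of the kubebuilder markers on CrashLoopPolicy: +kubebuilder:validation:Minimum=1 becomes minimum: 1, +kubebuilder:default=5 becomes default: 5, and the +optional markers keep all three fields out of any required list. Regenerating with the scaffold's usual `make manifests` target should reproduce it.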
@@ -26,6 +26,7 @@ rules:
   resources:
   - pods
   verbs:
+  - delete
   - get
   - list
   - watch

@@ -39,6 +40,14 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - apps
+  resources:
+  - replicasets
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - operator.andy.vendetti.ru
   resources:
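The RBAC additions line up with what the new controller needs: the delete verb on pods backs the r.Delete call in Reconcile, and read access to apps/replicasets backs the ReplicaSet lookup in getOwnerDeploymentIfMonitored. The controller also declares a deployments get;list;watch marker, which presumably maps to a rule already present in the role from the image-update feature, since this diff adds no deployments rule.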
@@ -19,5 +19,11 @@ spec:
   imageUpdatePolicy:
     enabled: true
     checkInterval: "5m"
-    monitoredTags: ["latest", "dev"]
+    monitoredTags: ["latest", "dev", "master"]
     # restartAnnotation: "andy.vendetti.ru/restartedAt"
+  crashLoopPolicy:
+    enabled: true
+    restartThreshold: 5
+    monitoredDeployments:
+    - "default/hello-updater-test"
+    - "app-namespace/critical-app-deployment"
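With this sample applied, only pods belonging to default/hello-updater-test and app-namespace/critical-app-deployment are eligible: once a container in one of them sits in CrashLoopBackOff with at least 5 restarts, the pod is deleted. A restartThreshold of 0 would be rejected by the API server because of the minimum: 1 schema constraint above, and omitting the field yields the default of 5.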
internal/controller/podcrash_controller.go (new file, 163 lines)
@@ -0,0 +1,163 @@
+// internal/controller/podcrash_controller.go
+package controller
+
+import (
+    "context"
+    "fmt"
+
+    appsv1 "k8s.io/api/apps/v1"
+    corev1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+    "k8s.io/apimachinery/pkg/types"
+    "k8s.io/client-go/tools/record"
+    ctrl "sigs.k8s.io/controller-runtime"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+    "sigs.k8s.io/controller-runtime/pkg/log"
+
+    configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
+)
+
+// PodCrashReconciler reconciles Pods to detect and handle CrashLoopBackOff state.
+type PodCrashReconciler struct {
+    client.Client
+    Scheme   *runtime.Scheme
+    Recorder record.EventRecorder
+}
+
+// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;delete
+// +kubebuilder:rbac:groups=operator.andy.vendetti.ru,resources=nodetainterconfigs,verbs=get;list;watch
+// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
+// +kubebuilder:rbac:groups=apps,resources=replicasets,verbs=get;list;watch
+// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch
+
+func (r *PodCrashReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+    log := log.FromContext(ctx).WithValues("pod", req.NamespacedName)
+
+    var config configv1alpha1.NodeTainterConfig
+    configKey := types.NamespacedName{Name: GlobalTaintConfigName}
+    if err := r.Get(ctx, configKey, &config); err != nil {
+        if !errors.IsNotFound(err) {
+            log.Error(err, "Failed to get NodeTainterConfig for crash loop policy", "configName", GlobalTaintConfigName)
+            return ctrl.Result{}, err // Requeue on real error
+        }
+        log.V(1).Info("Global NodeTainterConfig not found, crash loop handling skipped.", "configName", GlobalTaintConfigName)
+        return ctrl.Result{}, nil
+    }
+
+    if config.Spec.CrashLoopPolicy == nil || !config.Spec.CrashLoopPolicy.Enabled {
+        log.V(1).Info("Crash loop policy is disabled in NodeTainterConfig.")
+        return ctrl.Result{}, nil
+    }
+    policy := config.Spec.CrashLoopPolicy
+    if len(policy.MonitoredDeployments) == 0 {
+        log.V(1).Info("No monitored deployments configured in CrashLoopPolicy.")
+        return ctrl.Result{}, nil
+    }
+    monitoredSet := make(map[string]struct{}, len(policy.MonitoredDeployments))
+    for _, item := range policy.MonitoredDeployments {
+        monitoredSet[item] = struct{}{}
+    }
+
+    var pod corev1.Pod
+    if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
+        if errors.IsNotFound(err) {
+            log.Info("Pod not found. Ignoring.")
+            return ctrl.Result{}, nil
+        }
+        log.Error(err, "Failed to get Pod")
+        return ctrl.Result{}, err // Requeue on error
+    }
+
+    ownerDeploymentName, isOwnedByMonitoredDeployment := r.getOwnerDeploymentIfMonitored(ctx, &pod, monitoredSet)
+    if !isOwnedByMonitoredDeployment {
+        log.V(1).Info("Pod is not owned by a monitored Deployment, skipping.")
+        return ctrl.Result{}, nil
+    }
+    log = log.WithValues("deployment", ownerDeploymentName)
+
+    podShouldBeDeleted := false
+    var crashingContainerName string
+    var restartCount int32
+
+    for _, status := range pod.Status.ContainerStatuses {
+        if status.State.Waiting != nil && status.State.Waiting.Reason == "CrashLoopBackOff" {
+            if status.RestartCount >= policy.RestartThreshold {
+                podShouldBeDeleted = true
+                crashingContainerName = status.Name
+                restartCount = status.RestartCount
+                log.Info("Pod needs deletion due to CrashLoopBackOff threshold",
+                    "container", crashingContainerName,
+                    "restarts", restartCount,
+                    "threshold", policy.RestartThreshold)
+                break
+            } else {
+                log.V(1).Info("Container in CrashLoopBackOff but restart count below threshold",
+                    "container", status.Name,
+                    "restarts", status.RestartCount,
+                    "threshold", policy.RestartThreshold)
+            }
+        }
+    }
+
+    if podShouldBeDeleted {
+        log.Info("Deleting pod to attempt rescheduling", "reason", "CrashLoopBackOff threshold reached")
+        err := r.Delete(ctx, &pod)
+        if err != nil {
+            if errors.IsNotFound(err) || errors.IsConflict(err) {
+                log.Info("Pod likely already deleted or being deleted.")
+                return ctrl.Result{}, nil
+            }
+            log.Error(err, "Failed to delete pod in CrashLoopBackOff")
+            r.Recorder.Eventf(&pod, corev1.EventTypeWarning, "DeleteFailed", "Failed to delete pod (%s/%s) stuck in CrashLoopBackOff: %v", pod.Namespace, pod.Name, err)
+            return ctrl.Result{}, err // Requeue on deletion error
+        }
+        log.Info("Pod deleted successfully.")
+        r.Recorder.Eventf(&pod, corev1.EventTypeNormal, "PodDeleted", "Deleted pod (%s/%s) stuck in CrashLoopBackOff (container: %s, restarts: %d)", pod.Namespace, pod.Name, crashingContainerName, restartCount)
+    }
+
+    return ctrl.Result{}, nil
+}
+
+func (r *PodCrashReconciler) getOwnerDeploymentIfMonitored(ctx context.Context, pod *corev1.Pod, monitoredSet map[string]struct{}) (string, bool) {
+    log := log.FromContext(ctx).WithValues("pod", client.ObjectKeyFromObject(pod))
+
+    rsOwnerRef := metav1.GetControllerOf(pod)
+    if rsOwnerRef == nil || rsOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || rsOwnerRef.Kind != "ReplicaSet" {
+        return "", false
+    }
+
+    var rs appsv1.ReplicaSet
+    rsKey := types.NamespacedName{Namespace: pod.Namespace, Name: rsOwnerRef.Name}
+    if err := r.Get(ctx, rsKey, &rs); err != nil {
+        log.V(1).Error(err, "Failed to get owner ReplicaSet", "replicaset", rsKey)
+        return "", false
+    }
+
+    depOwnerRef := metav1.GetControllerOf(&rs)
+    if depOwnerRef == nil || depOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || depOwnerRef.Kind != "Deployment" {
+        return "", false
+    }
+
+    deploymentName := fmt.Sprintf("%s/%s", pod.Namespace, depOwnerRef.Name)
+    if _, exists := monitoredSet[deploymentName]; exists {
+        return deploymentName, true
+    }
+
+    return deploymentName, false
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *PodCrashReconciler) SetupWithManager(mgr ctrl.Manager) error {
+    r.Recorder = mgr.GetEventRecorderFor("podcrash-controller")
+
+    return ctrl.NewControllerManagedBy(mgr).
+        Named("podcrash").
+        For(&corev1.Pod{}).
+        // Watches(
+        //     &configv1alpha1.NodeTainterConfig{},
+        //     handler.EnqueueRequestsFromMapFunc(r.mapConfigToPods),
+        // ).
+        Complete(r)
+}
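The controller's behavior is easiest to see end to end. Below is a sketch of a unit test using controller-runtime's fake client; it assumes the scaffold-generated configv1alpha1.AddToScheme exists, that NodeTainterConfig is cluster-scoped (as the name-only lookup in Reconcile suggests), and that GlobalTaintConfigName is the shared constant referenced above.

package controller

import (
    "context"
    "testing"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes/scheme"
    "k8s.io/client-go/tools/record"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client/fake"

    configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
)

func TestReconcileDeletesCrashLoopingPod(t *testing.T) {
    s := runtime.NewScheme()
    _ = scheme.AddToScheme(s)         // core + apps types
    _ = configv1alpha1.AddToScheme(s) // assumed scaffold-generated registration

    isController := true
    // Ownership chain Deployment -> ReplicaSet -> Pod, matching what
    // getOwnerDeploymentIfMonitored walks.
    dep := &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{
        Namespace: "default", Name: "hello-updater-test", UID: "dep-uid",
    }}
    rs := &appsv1.ReplicaSet{ObjectMeta: metav1.ObjectMeta{
        Namespace: "default", Name: "hello-updater-test-abc", UID: "rs-uid",
        OwnerReferences: []metav1.OwnerReference{{
            APIVersion: "apps/v1", Kind: "Deployment",
            Name: dep.Name, UID: dep.UID, Controller: &isController,
        }},
    }}
    pod := &corev1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Namespace: "default", Name: "hello-updater-test-abc-xyz",
            OwnerReferences: []metav1.OwnerReference{{
                APIVersion: "apps/v1", Kind: "ReplicaSet",
                Name: rs.Name, UID: rs.UID, Controller: &isController,
            }},
        },
        Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{
            Name:         "app",
            RestartCount: 6, // above the threshold of 5
            State: corev1.ContainerState{
                Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
            },
        }}},
    }
    cfg := &configv1alpha1.NodeTainterConfig{
        ObjectMeta: metav1.ObjectMeta{Name: GlobalTaintConfigName},
        Spec: configv1alpha1.NodeTainterConfigSpec{
            CrashLoopPolicy: &configv1alpha1.CrashLoopPolicy{
                Enabled:              true,
                RestartThreshold:     5,
                MonitoredDeployments: []string{"default/hello-updater-test"},
            },
        },
    }

    c := fake.NewClientBuilder().WithScheme(s).WithObjects(dep, rs, pod, cfg).Build()
    r := &PodCrashReconciler{Client: c, Scheme: s, Recorder: record.NewFakeRecorder(10)}

    req := ctrl.Request{NamespacedName: types.NamespacedName{Namespace: "default", Name: pod.Name}}
    if _, err := r.Reconcile(context.Background(), req); err != nil {
        t.Fatalf("reconcile: %v", err)
    }

    // The crash-looping pod should have been deleted so it can be rescheduled.
    var got corev1.Pod
    if err := c.Get(context.Background(), req.NamespacedName, &got); !errors.IsNotFound(err) {
        t.Fatalf("expected pod to be deleted, got err=%v", err)
    }
}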