Compare commits: 8192f888f7...main

3 commits:
- bad5b2ba1c
- 56b7b7a0b6
- 823a3a0a4d
@@ -4,7 +4,7 @@ A Kubernetes operator that can perform various actions:
 - Place taints on nodes based on their names.
 - Place default resource requests/limits on deployments that lack them (but you can also use exclusion words to ignore deployment requests/limits checks).
 - Upgrade deployment images based on a predefined list of periodically checked tags (such as :latest, :master, etc.).
-- Attempt to fix CrashLoopBackOff by recreating linked resources (such as secrets, if they have not updated from a third-party provider (e.g., Vault)).
+- Attempt to fix CrashLoopBackOff by deleting the pod (so it can be rescheduled, possibly onto another node) for deployments defined in the CRD config.

 Note: This is a college graduation project, so it may contain bugs and may not follow best practices.
@@ -63,6 +63,26 @@ type ImageUpdatePolicy struct {
     RestartAnnotation string `json:"restartAnnotation,omitempty"`
 }
 
+// CrashLoopPolicy defines the policy for handling pods in CrashLoopBackOff.
+type CrashLoopPolicy struct {
+    // Enabled toggles the CrashLoopBackOff handling feature.
+    // +optional
+    Enabled bool `json:"enabled,omitempty"`
+
+    // MonitoredDeployments is a list of Deployments (in "namespace/name" format)
+    // whose pods should be monitored for CrashLoopBackOff.
+    // +optional
+    MonitoredDeployments []string `json:"monitoredDeployments,omitempty"`
+
+    // RestartThreshold is the number of container restarts after which
+    // a pod in CrashLoopBackOff will be deleted to attempt rescheduling.
+    // Minimum recommended value: 3 or 5.
+    // +kubebuilder:validation:Minimum=1
+    // +kubebuilder:default=5
+    // +optional
+    RestartThreshold int32 `json:"restartThreshold,omitempty"`
+}
+
 // NodeTainterConfigSpec defines the desired state of NodeTainterConfig.
 type NodeTainterConfigSpec struct {
     // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
@@ -95,6 +115,9 @@ type NodeTainterConfigSpec struct {
 
     // +optional
     ImageUpdatePolicy *ImageUpdatePolicy `json:"imageUpdatePolicy,omitempty"`
+
+    // +optional
+    CrashLoopPolicy *CrashLoopPolicy `json:"crashLoopPolicy,omitempty"`
 }
 
 // NodeTainterConfigStatus defines the observed state of NodeTainterConfig.
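Taken together, the two hunks above add the CrashLoopPolicy type and wire it into NodeTainterConfigSpec. A minimal sketch of how the new fields might be populated from Go (the object name and the embedded ObjectMeta follow the usual kubebuilder scaffolding; both are assumptions, not shown in this diff):

package main

import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
)

func main() {
    // Hypothetical object; the controller actually fetches the config by
    // GlobalTaintConfigName, a constant defined elsewhere in the repo.
    cfg := configv1alpha1.NodeTainterConfig{
        ObjectMeta: metav1.ObjectMeta{Name: "nodetainterconfig-sample"},
        Spec: configv1alpha1.NodeTainterConfigSpec{
            CrashLoopPolicy: &configv1alpha1.CrashLoopPolicy{
                Enabled:              true,
                RestartThreshold:     5, // matches the +kubebuilder:default marker
                MonitoredDeployments: []string{"default/hello-updater-test"},
            },
        },
    }
    fmt.Printf("crash loop handling enabled: %v\n", cfg.Spec.CrashLoopPolicy.Enabled)
}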
@@ -25,6 +25,26 @@ import (
     runtime "k8s.io/apimachinery/pkg/runtime"
 )
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CrashLoopPolicy) DeepCopyInto(out *CrashLoopPolicy) {
+    *out = *in
+    if in.MonitoredDeployments != nil {
+        in, out := &in.MonitoredDeployments, &out.MonitoredDeployments
+        *out = make([]string, len(*in))
+        copy(*out, *in)
+    }
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CrashLoopPolicy.
+func (in *CrashLoopPolicy) DeepCopy() *CrashLoopPolicy {
+    if in == nil {
+        return nil
+    }
+    out := new(CrashLoopPolicy)
+    in.DeepCopyInto(out)
+    return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ImageUpdatePolicy) DeepCopyInto(out *ImageUpdatePolicy) {
     *out = *in
@@ -144,6 +164,11 @@ func (in *NodeTainterConfigSpec) DeepCopyInto(out *NodeTainterConfigSpec) {
         *out = new(ImageUpdatePolicy)
         (*in).DeepCopyInto(*out)
     }
+    if in.CrashLoopPolicy != nil {
+        in, out := &in.CrashLoopPolicy, &out.CrashLoopPolicy
+        *out = new(CrashLoopPolicy)
+        (*in).DeepCopyInto(*out)
+    }
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeTainterConfigSpec.
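The zz_generated.deepcopy.go additions are standard controller-gen output for the new type; the only non-trivial part is the element-wise copy of the MonitoredDeployments slice. A quick sanity-check sketch (hypothetical test, placed alongside the types in package v1alpha1):

package v1alpha1

import "testing"

// Verifies that DeepCopy does not share the MonitoredDeployments backing array.
func TestCrashLoopPolicyDeepCopy(t *testing.T) {
    in := &CrashLoopPolicy{
        Enabled:              true,
        RestartThreshold:     5,
        MonitoredDeployments: []string{"default/hello-updater-test"},
    }
    out := in.DeepCopy()
    out.MonitoredDeployments[0] = "other/name"
    if in.MonitoredDeployments[0] != "default/hello-updater-test" {
        t.Fatal("DeepCopy shared the MonitoredDeployments slice with the original")
    }
}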
@@ -230,6 +230,15 @@ func main() {
         os.Exit(1)
     }
 
+    if err = (&controller.PodCrashReconciler{
+        Client:   mgr.GetClient(),
+        Scheme:   mgr.GetScheme(),
+        Recorder: mgr.GetEventRecorderFor("podcrash-controller"),
+    }).SetupWithManager(mgr); err != nil {
+        setupLog.Error(err, "unable to create controller", "controller", "PodCrash")
+        os.Exit(1)
+    }
+
     // +kubebuilder:scaffold:builder
 
     if metricsCertWatcher != nil {
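Note: main.go sets Recorder when constructing PodCrashReconciler, and SetupWithManager (in the new controller file below) assigns it again from mgr.GetEventRecorderFor with the same name; the second assignment wins, so one of the two could be dropped.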
@@ -41,6 +41,30 @@ spec:
           spec:
             description: NodeTainterConfigSpec defines the desired state of NodeTainterConfig.
             properties:
+              crashLoopPolicy:
+                description: CrashLoopPolicy defines the policy for handling pods
+                  in CrashLoopBackOff.
+                properties:
+                  enabled:
+                    description: Enabled toggles the CrashLoopBackOff handling feature.
+                    type: boolean
+                  monitoredDeployments:
+                    description: |-
+                      MonitoredDeployments is a list of Deployments (in "namespace/name" format)
+                      whose pods should be monitored for CrashLoopBackOff.
+                    items:
+                      type: string
+                    type: array
+                  restartThreshold:
+                    default: 5
+                    description: |-
+                      RestartThreshold is the number of container restarts after which
+                      a pod in CrashLoopBackOff will be deleted to attempt rescheduling.
+                      Minimum recommended value: 3 or 5.
+                    format: int32
+                    minimum: 1
+                    type: integer
+                type: object
               imageUpdatePolicy:
                 description: ImageUpdatePolicy defines the policy for automatic image
                   updates.
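This CRD hunk is the controller-gen rendering of the kubebuilder markers on CrashLoopPolicy: +kubebuilder:validation:Minimum=1 becomes minimum: 1, +kubebuilder:default=5 becomes default: 5, and the +optional markers keep all three fields out of any required list. Regenerating with the scaffold's usual `make manifests` target should reproduce it.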
@@ -26,6 +26,7 @@ rules:
   resources:
   - pods
   verbs:
+  - delete
   - get
   - list
   - watch

@@ -39,6 +40,14 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - apps
+  resources:
+  - replicasets
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - operator.andy.vendetti.ru
   resources:
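The RBAC additions line up with what the new controller needs: the delete verb on pods backs the r.Delete call in Reconcile, and read access to apps/replicasets backs the ReplicaSet lookup in getOwnerDeploymentIfMonitored. The controller also declares a deployments get;list;watch marker, which presumably maps to a rule already present in the role from the image-update feature, since this diff adds no deployments rule.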
@@ -19,5 +19,11 @@ spec:
   imageUpdatePolicy:
     enabled: true
     checkInterval: "5m"
-    monitoredTags: ["latest", "dev"]
+    monitoredTags: ["latest", "dev", "master"]
     # restartAnnotation: "andy.vendetti.ru/restartedAt"
+  crashLoopPolicy:
+    enabled: true
+    restartThreshold: 5
+    monitoredDeployments:
+    - "default/hello-updater-test"
+    - "app-namespace/critical-app-deployment"
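With this sample applied, only pods belonging to default/hello-updater-test and app-namespace/critical-app-deployment are eligible: once a container in one of them sits in CrashLoopBackOff with at least 5 restarts, the pod is deleted. A restartThreshold of 0 would be rejected by the API server because of the minimum: 1 schema constraint above, and omitting the field yields the default of 5.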
internal/controller/podcrash_controller.go (new file, 163 lines)
@@ -0,0 +1,163 @@
+// internal/controller/podcrash_controller.go
+package controller
+
+import (
+    "context"
+    "fmt"
+
+    appsv1 "k8s.io/api/apps/v1"
+    corev1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+    "k8s.io/apimachinery/pkg/types"
+    "k8s.io/client-go/tools/record"
+    ctrl "sigs.k8s.io/controller-runtime"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+    "sigs.k8s.io/controller-runtime/pkg/log"
+
+    configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
+)
+
+// PodCrashReconciler reconciles Pods to detect and handle CrashLoopBackOff state.
+type PodCrashReconciler struct {
+    client.Client
+    Scheme   *runtime.Scheme
+    Recorder record.EventRecorder
+}
+
+// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;delete
+// +kubebuilder:rbac:groups=operator.andy.vendetti.ru,resources=nodetainterconfigs,verbs=get;list;watch
+// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
+// +kubebuilder:rbac:groups=apps,resources=replicasets,verbs=get;list;watch
+// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch
+
+func (r *PodCrashReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+    log := log.FromContext(ctx).WithValues("pod", req.NamespacedName)
+
+    var config configv1alpha1.NodeTainterConfig
+    configKey := types.NamespacedName{Name: GlobalTaintConfigName}
+    if err := r.Get(ctx, configKey, &config); err != nil {
+        if !errors.IsNotFound(err) {
+            log.Error(err, "Failed to get NodeTainterConfig for crash loop policy", "configName", GlobalTaintConfigName)
+            return ctrl.Result{}, err // Requeue on real error
+        }
+        log.V(1).Info("Global NodeTainterConfig not found, crash loop handling skipped.", "configName", GlobalTaintConfigName)
+        return ctrl.Result{}, nil
+    }
+
+    if config.Spec.CrashLoopPolicy == nil || !config.Spec.CrashLoopPolicy.Enabled {
+        log.V(1).Info("Crash loop policy is disabled in NodeTainterConfig.")
+        return ctrl.Result{}, nil
+    }
+    policy := config.Spec.CrashLoopPolicy
+    if len(policy.MonitoredDeployments) == 0 {
+        log.V(1).Info("No monitored deployments configured in CrashLoopPolicy.")
+        return ctrl.Result{}, nil
+    }
+    monitoredSet := make(map[string]struct{}, len(policy.MonitoredDeployments))
+    for _, item := range policy.MonitoredDeployments {
+        monitoredSet[item] = struct{}{}
+    }
+
+    var pod corev1.Pod
+    if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
+        if errors.IsNotFound(err) {
+            log.Info("Pod not found. Ignoring.")
+            return ctrl.Result{}, nil
+        }
+        log.Error(err, "Failed to get Pod")
+        return ctrl.Result{}, err // Requeue on error
+    }
+
+    ownerDeploymentName, isOwnedByMonitoredDeployment := r.getOwnerDeploymentIfMonitored(ctx, &pod, monitoredSet)
+    if !isOwnedByMonitoredDeployment {
+        log.V(1).Info("Pod is not owned by a monitored Deployment, skipping.")
+        return ctrl.Result{}, nil
+    }
+    log = log.WithValues("deployment", ownerDeploymentName)
+
+    podShouldBeDeleted := false
+    var crashingContainerName string
+    var restartCount int32
+
+    for _, status := range pod.Status.ContainerStatuses {
+        if status.State.Waiting != nil && status.State.Waiting.Reason == "CrashLoopBackOff" {
+            if status.RestartCount >= policy.RestartThreshold {
+                podShouldBeDeleted = true
+                crashingContainerName = status.Name
+                restartCount = status.RestartCount
+                log.Info("Pod needs deletion due to CrashLoopBackOff threshold",
+                    "container", crashingContainerName,
+                    "restarts", restartCount,
+                    "threshold", policy.RestartThreshold)
+                break
+            } else {
+                log.V(1).Info("Container in CrashLoopBackOff but restart count below threshold",
+                    "container", status.Name,
+                    "restarts", status.RestartCount,
+                    "threshold", policy.RestartThreshold)
+            }
+        }
+    }
+
+    if podShouldBeDeleted {
+        log.Info("Deleting pod to attempt rescheduling", "reason", "CrashLoopBackOff threshold reached")
+        err := r.Delete(ctx, &pod)
+        if err != nil {
+            if errors.IsNotFound(err) || errors.IsConflict(err) {
+                log.Info("Pod likely already deleted or being deleted.")
+                return ctrl.Result{}, nil
+            }
+            log.Error(err, "Failed to delete pod in CrashLoopBackOff")
+            r.Recorder.Eventf(&pod, corev1.EventTypeWarning, "DeleteFailed", "Failed to delete pod (%s/%s) stuck in CrashLoopBackOff: %v", pod.Namespace, pod.Name, err)
+            return ctrl.Result{}, err // Requeue on deletion error
+        }
+        log.Info("Pod deleted successfully.")
+        r.Recorder.Eventf(&pod, corev1.EventTypeNormal, "PodDeleted", "Deleted pod (%s/%s) stuck in CrashLoopBackOff (container: %s, restarts: %d)", pod.Namespace, pod.Name, crashingContainerName, restartCount)
+    }
+
+    return ctrl.Result{}, nil
+}
+
+func (r *PodCrashReconciler) getOwnerDeploymentIfMonitored(ctx context.Context, pod *corev1.Pod, monitoredSet map[string]struct{}) (string, bool) {
+    log := log.FromContext(ctx).WithValues("pod", client.ObjectKeyFromObject(pod))
+
+    rsOwnerRef := metav1.GetControllerOf(pod)
+    if rsOwnerRef == nil || rsOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || rsOwnerRef.Kind != "ReplicaSet" {
+        return "", false
+    }
+
+    var rs appsv1.ReplicaSet
+    rsKey := types.NamespacedName{Namespace: pod.Namespace, Name: rsOwnerRef.Name}
+    if err := r.Get(ctx, rsKey, &rs); err != nil {
+        log.V(1).Error(err, "Failed to get owner ReplicaSet", "replicaset", rsKey)
+        return "", false
+    }
+
+    depOwnerRef := metav1.GetControllerOf(&rs)
+    if depOwnerRef == nil || depOwnerRef.APIVersion != appsv1.SchemeGroupVersion.String() || depOwnerRef.Kind != "Deployment" {
+        return "", false
+    }
+
+    deploymentName := fmt.Sprintf("%s/%s", pod.Namespace, depOwnerRef.Name)
+    if _, exists := monitoredSet[deploymentName]; exists {
+        return deploymentName, true
+    }
+
+    return deploymentName, false
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *PodCrashReconciler) SetupWithManager(mgr ctrl.Manager) error {
+    r.Recorder = mgr.GetEventRecorderFor("podcrash-controller")
+
+    return ctrl.NewControllerManagedBy(mgr).
+        Named("podcrash").
+        For(&corev1.Pod{}).
+        // Watches(
+        //     &configv1alpha1.NodeTainterConfig{},
+        //     handler.EnqueueRequestsFromMapFunc(r.mapConfigToPods),
+        // ).
+        Complete(r)
+}
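The controller's behavior is easiest to see end to end. Below is a sketch of a unit test using controller-runtime's fake client; it assumes the scaffold-generated configv1alpha1.AddToScheme exists, that NodeTainterConfig is cluster-scoped (as the name-only lookup in Reconcile suggests), and that GlobalTaintConfigName is the shared constant referenced above.

package controller

import (
    "context"
    "testing"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes/scheme"
    "k8s.io/client-go/tools/record"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client/fake"

    configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1"
)

func TestReconcileDeletesCrashLoopingPod(t *testing.T) {
    s := runtime.NewScheme()
    _ = scheme.AddToScheme(s)         // core + apps types
    _ = configv1alpha1.AddToScheme(s) // assumed scaffold-generated registration

    isController := true
    // Ownership chain Deployment -> ReplicaSet -> Pod, matching what
    // getOwnerDeploymentIfMonitored walks.
    dep := &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{
        Namespace: "default", Name: "hello-updater-test", UID: "dep-uid",
    }}
    rs := &appsv1.ReplicaSet{ObjectMeta: metav1.ObjectMeta{
        Namespace: "default", Name: "hello-updater-test-abc", UID: "rs-uid",
        OwnerReferences: []metav1.OwnerReference{{
            APIVersion: "apps/v1", Kind: "Deployment",
            Name: dep.Name, UID: dep.UID, Controller: &isController,
        }},
    }}
    pod := &corev1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Namespace: "default", Name: "hello-updater-test-abc-xyz",
            OwnerReferences: []metav1.OwnerReference{{
                APIVersion: "apps/v1", Kind: "ReplicaSet",
                Name: rs.Name, UID: rs.UID, Controller: &isController,
            }},
        },
        Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{
            Name:         "app",
            RestartCount: 6, // above the threshold of 5
            State: corev1.ContainerState{
                Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
            },
        }}},
    }
    cfg := &configv1alpha1.NodeTainterConfig{
        ObjectMeta: metav1.ObjectMeta{Name: GlobalTaintConfigName},
        Spec: configv1alpha1.NodeTainterConfigSpec{
            CrashLoopPolicy: &configv1alpha1.CrashLoopPolicy{
                Enabled:              true,
                RestartThreshold:     5,
                MonitoredDeployments: []string{"default/hello-updater-test"},
            },
        },
    }

    c := fake.NewClientBuilder().WithScheme(s).WithObjects(dep, rs, pod, cfg).Build()
    r := &PodCrashReconciler{Client: c, Scheme: s, Recorder: record.NewFakeRecorder(10)}

    req := ctrl.Request{NamespacedName: types.NamespacedName{Namespace: "default", Name: pod.Name}}
    if _, err := r.Reconcile(context.Background(), req); err != nil {
        t.Fatalf("reconcile: %v", err)
    }

    // The crash-looping pod should have been deleted so it can be rescheduled.
    var got corev1.Pod
    if err := c.Get(context.Background(), req.NamespacedName, &got); !errors.IsNotFound(err) {
        t.Fatalf("expected pod to be deleted, got err=%v", err)
    }
}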