From 23b333c07dff2605c49b360a6d7b73d8c12c692e Mon Sep 17 00:00:00 2001 From: Andy Kolibri Vendetti Date: Thu, 1 May 2025 18:00:10 +0500 Subject: [PATCH] alpha ver. of deployments resource controller --- api/v1alpha1/nodetainterconfig_types.go | 29 +++ api/v1alpha1/zz_generated.deepcopy.go | 20 ++ cmd/main.go | 12 + ...r.andy.vendetti.ru_nodetainterconfigs.yaml | 29 +++ config/rbac/role.yaml | 10 + .../operator_v1alpha1_nodetainterconfig.yaml | 6 + .../deploymentdefaults_controller.go | 209 ++++++++++++++++++ 7 files changed, 315 insertions(+) create mode 100644 internal/controller/deploymentdefaults_controller.go diff --git a/api/v1alpha1/nodetainterconfig_types.go b/api/v1alpha1/nodetainterconfig_types.go index 010c8e4..8b70a46 100644 --- a/api/v1alpha1/nodetainterconfig_types.go +++ b/api/v1alpha1/nodetainterconfig_types.go @@ -23,6 +23,22 @@ import ( // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. +// ResourceDefaults defines the default resource requests and limits. +type ResourceDefaults struct { + // Default CPU request (e.g., "100m"). Applied if a container has no CPU request. + // +optional + CPURequest string `json:"cpuRequest,omitempty"` + // Default Memory request (e.g., "128Mi"). Applied if a container has no Memory request. + // +optional + MemoryRequest string `json:"memoryRequest,omitempty"` + // Default CPU limit (e.g., "500m"). Applied if a container has no CPU limit. + // +optional + CPULimit string `json:"cpuLimit,omitempty"` + // Default Memory limit (e.g., "512Mi"). Applied if a container has no Memory limit. + // +optional + MemoryLimit string `json:"memoryLimit,omitempty"` +} + // NodeTainterConfigSpec defines the desired state of NodeTainterConfig. 
type NodeTainterConfigSpec struct { // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster @@ -38,7 +54,20 @@ type NodeTainterConfigSpec struct { // +kubebuilder:validation:Optional // +kubebuilder:validation:MinProperties=1 + // +optional LabelRules map[string]string `json:"labelRules,omitempty"` + + // ResourceDefaults contains the default requests/limits to apply. + // If this section is omitted, resource defaulting is disabled. + // +optional + ResourceDefaults *ResourceDefaults `json:"resourceDefaults,omitempty"` + + // OptOutLabelKey is the label key used to exempt Deployments from resource defaulting. + // If a Deployment has a label with this key (any value), defaults won't be applied. + // If empty or omitted, the opt-out mechanism is disabled. + // Example: "my-operator.example.com/skip-resource-defaults" + // +optional + OptOutLabelKey string `json:"optOutLabelKey,omitempty"` } // NodeTainterConfigStatus defines the observed state of NodeTainterConfig. diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index cd10c2f..1f315da 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -114,6 +114,11 @@ func (in *NodeTainterConfigSpec) DeepCopyInto(out *NodeTainterConfigSpec) { (*out)[key] = val } } + if in.ResourceDefaults != nil { + in, out := &in.ResourceDefaults, &out.ResourceDefaults + *out = new(ResourceDefaults) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeTainterConfigSpec. @@ -154,3 +159,18 @@ func (in *NodeTainterConfigStatus) DeepCopy() *NodeTainterConfigStatus { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ResourceDefaults) DeepCopyInto(out *ResourceDefaults) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceDefaults. +func (in *ResourceDefaults) DeepCopy() *ResourceDefaults { + if in == nil { + return nil + } + out := new(ResourceDefaults) + in.DeepCopyInto(out) + return out +} diff --git a/cmd/main.go b/cmd/main.go index 69545ec..7e82a14 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -27,6 +27,7 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" "k8s.io/apimachinery/pkg/runtime" + // "k8s.io/client-go/kubernetes" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" @@ -205,10 +206,21 @@ func main() { if err = (&controller.NodeTainterConfigReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("nodetainter-controller"), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "NodeTainterConfig") os.Exit(1) } + + if err = (&controller.DeploymentDefaultsReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("deploymentdefaults-controller"), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "DeploymentDefaults") + os.Exit(1) + } + // +kubebuilder:scaffold:builder if metricsCertWatcher != nil { diff --git a/config/crd/bases/operator.andy.vendetti.ru_nodetainterconfigs.yaml b/config/crd/bases/operator.andy.vendetti.ru_nodetainterconfigs.yaml index 358ebd6..408cdcf 100644 --- a/config/crd/bases/operator.andy.vendetti.ru_nodetainterconfigs.yaml +++ b/config/crd/bases/operator.andy.vendetti.ru_nodetainterconfigs.yaml @@ -46,6 +46,35 @@ spec: type: string minProperties: 1 type: object + optOutLabelKey: + description: |- + OptOutLabelKey is the label key used to exempt Deployments from resource defaulting. 
+ If a Deployment has a label with this key (any value), defaults won't be applied. + If empty or omitted, the opt-out mechanism is disabled. + Example: "my-operator.example.com/skip-resource-defaults" + type: string + resourceDefaults: + description: |- + ResourceDefaults contains the default requests/limits to apply. + If this section is omitted, resource defaulting is disabled. + properties: + cpuLimit: + description: Default CPU limit (e.g., "500m"). Applied if a container + has no CPU limit. + type: string + cpuRequest: + description: Default CPU request (e.g., "100m"). Applied if a + container has no CPU request. + type: string + memoryLimit: + description: Default Memory limit (e.g., "512Mi"). Applied if + a container has no Memory limit. + type: string + memoryRequest: + description: Default Memory request (e.g., "128Mi"). Applied if + a container has no Memory request. + type: string + type: object type: object status: description: NodeTainterConfigStatus defines the observed state of NodeTainterConfig. 
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 43c0e17..05e712c 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -21,6 +21,16 @@ rules: - patch - update - watch +- apiGroups: + - apps + resources: + - deployments + verbs: + - get + - list + - patch + - update + - watch - apiGroups: - operator.andy.vendetti.ru resources: diff --git a/config/samples/operator_v1alpha1_nodetainterconfig.yaml b/config/samples/operator_v1alpha1_nodetainterconfig.yaml index 7df3384..6b55b84 100644 --- a/config/samples/operator_v1alpha1_nodetainterconfig.yaml +++ b/config/samples/operator_v1alpha1_nodetainterconfig.yaml @@ -10,3 +10,9 @@ spec: "andy.vendetti.ru/category=priority": "workload/priority=high:NoSchedule" "andy.vendetti.ru/category=gpu": "nvidia.com/gpu=present:NoSchedule" "andy.vendetti.ru/category=svc": "workload/type=service:NoSchedule" + resourceDefaults: + cpuRequest: "100m" + memoryRequest: "128Mi" + cpuLimit: "500m" + memoryLimit: "512Mi" + optOutLabelKey: "andy.vendetti.ru/skip-resource-defaults" diff --git a/internal/controller/deploymentdefaults_controller.go b/internal/controller/deploymentdefaults_controller.go new file mode 100644 index 0000000..98b4d44 --- /dev/null +++ b/internal/controller/deploymentdefaults_controller.go @@ -0,0 +1,209 @@ +// internal/controller/deploymentdefaults_controller.go +package controller + +import ( + "context" + "fmt" + "strings" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + configv1alpha1 "git.vendetti.ru/andy/operator/api/v1alpha1" +) + +// DeploymentDefaultsReconciler 
reconciles Deployment objects to apply default resources.
+// Containers in the pod template that leave a CPU/memory request or limit
+// unset receive the value configured in spec.resourceDefaults of the global
+// NodeTainterConfig. Only spec.template.spec.containers is processed.
+type DeploymentDefaultsReconciler struct {
+	client.Client
+	Scheme   *runtime.Scheme
+	Recorder record.EventRecorder
+}
+
+// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups=operator.andy.vendetti.ru,resources=nodetainterconfigs,verbs=get;list;watch
+// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
+
+// Reconcile applies the resource defaults from the global NodeTainterConfig to
+// the Deployment named by req.
+//
+// It returns without changes (and without an error) when the Deployment or the
+// config does not exist, when spec.resourceDefaults is unset, when the
+// Deployment carries the configured opt-out label, or when a configured
+// quantity does not parse. A parse failure is reported through the log and a
+// Warning event only: retrying cannot repair a bad config.
+// Errors from the API server are returned so the request is retried.
+func (r *DeploymentDefaultsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx).WithValues("deployment", req.NamespacedName)
+
+	var deployment appsv1.Deployment
+	if err := r.Get(ctx, req.NamespacedName, &deployment); err != nil {
+		if errors.IsNotFound(err) {
+			logger.Info("Deployment not found. Ignoring.")
+			return ctrl.Result{}, nil
+		}
+		logger.Error(err, "Failed to get Deployment")
+		return ctrl.Result{}, err // Requeue on error
+	}
+
+	var config configv1alpha1.NodeTainterConfig
+	configKey := types.NamespacedName{Name: GlobalTaintConfigName}
+	if err := r.Get(ctx, configKey, &config); err != nil {
+		if errors.IsNotFound(err) {
+			logger.Info("Global NodeTainterConfig not found, skipping resource defaulting", "configName", GlobalTaintConfigName)
+			return ctrl.Result{}, nil
+		}
+		logger.Error(err, "Failed to get NodeTainterConfig for defaults", "configName", GlobalTaintConfigName)
+		r.Recorder.Eventf(&deployment, corev1.EventTypeWarning, "ConfigError", "Failed to get config %s: %v", GlobalTaintConfigName, err)
+		return ctrl.Result{}, err
+	}
+
+	if config.Spec.ResourceDefaults == nil {
+		logger.V(1).Info("Resource defaulting is disabled in NodeTainterConfig.")
+		return ctrl.Result{}, nil
+	}
+
+	if optOutKey := strings.TrimSpace(config.Spec.OptOutLabelKey); optOutKey != "" {
+		if _, exists := deployment.GetLabels()[optOutKey]; exists {
+			logger.Info("Deployment has opt-out label, skipping resource defaulting", "labelKey", optOutKey)
+			r.Recorder.Eventf(&deployment, corev1.EventTypeNormal, "OptedOut", "Skipping resource defaulting due to label %s", optOutKey)
+			return ctrl.Result{}, nil
+		}
+	}
+
+	defaults, parseErrors := parseResourceDefaults(config.Spec.ResourceDefaults)
+	if len(parseErrors) > 0 {
+		joined := strings.Join(parseErrors, "; ")
+		// The details are passed as arguments, never as the format string:
+		// they echo user input, and a value such as "50%" would otherwise be
+		// interpreted as a formatting verb.
+		parseErr := fmt.Errorf("Invalid resource quantity format in NodeTainterConfig %s: %s", config.Name, joined)
+		logger.Error(parseErr, "Default resource parsing failed")
+		r.Recorder.Eventf(&deployment, corev1.EventTypeWarning, "ConfigError", "Invalid defaults in config %s: %s", config.Name, joined)
+		return ctrl.Result{}, nil
+	}
+
+	// Work on a copy so the object read above stays available as the patch base.
+	updated := deployment.DeepCopy()
+	helperCtx := log.IntoContext(ctx, logger)
+	containers := updated.Spec.Template.Spec.Containers
+	mutated := false
+	for i := range containers {
+		if applyContainerDefaults(helperCtx, &containers[i], defaults) {
+			mutated = true
+		}
+	}
+
+	if !mutated {
+		logger.V(1).Info("Deployment already has necessary resource requests/limits or no defaults configured.")
+		return ctrl.Result{}, nil
+	}
+
+	logger.Info("Applying default resource requests/limits to Deployment")
+	// A strategic merge patch merges the containers list by container name,
+	// so only the added resource entries are sent. A JSON merge patch would
+	// resend the whole containers array from the cached copy and could revert
+	// a concurrent edit (for example an image bump).
+	if err := r.Patch(ctx, updated, client.StrategicMergeFrom(&deployment)); err != nil {
+		logger.Error(err, "Failed to patch Deployment with default resources")
+		r.Recorder.Eventf(&deployment, corev1.EventTypeWarning, "UpdateFailed", "Failed to apply default resources: %v", err)
+		return ctrl.Result{}, err
+	}
+	logger.Info("Successfully applied default resources")
+	r.Recorder.Eventf(&deployment, corev1.EventTypeNormal, "DefaultsApplied", "Default resource requests/limits applied")
+
+	return ctrl.Result{}, nil
+}
+
+// resourceDefault holds the parsed default request and limit for one resource.
+// A nil quantity means that no default is configured for that slot.
+type resourceDefault struct {
+	name    corev1.ResourceName
+	label   string // name used in log messages: "CPU" or "Memory"
+	request *resource.Quantity
+	limit   *resource.Quantity
+}
+
+// parseResourceDefaults parses the four quantity strings of d. It returns the
+// parsed defaults (CPU first, then memory) and one "<Field>: <error>" entry
+// per field that failed to parse, in the order CPURequest, MemoryRequest,
+// CPULimit, MemoryLimit. d must be non-nil.
+func parseResourceDefaults(d *configv1alpha1.ResourceDefaults) ([]resourceDefault, []string) {
+	cpu := resourceDefault{name: corev1.ResourceCPU, label: "CPU"}
+	mem := resourceDefault{name: corev1.ResourceMemory, label: "Memory"}
+
+	fields := []struct {
+		name  string
+		value string
+		dst   **resource.Quantity
+	}{
+		{"CPURequest", d.CPURequest, &cpu.request},
+		{"MemoryRequest", d.MemoryRequest, &mem.request},
+		{"CPULimit", d.CPULimit, &cpu.limit},
+		{"MemoryLimit", d.MemoryLimit, &mem.limit},
+	}
+
+	var problems []string
+	for _, f := range fields {
+		q, err := parseQuantity(f.value)
+		if err != nil {
+			problems = append(problems, fmt.Sprintf("%s: %v", f.name, err))
+			continue
+		}
+		*f.dst = q
+	}
+	return []resourceDefault{cpu, mem}, problems
+}
+
+// applyContainerDefaults fills in the requests and limits that c leaves unset
+// and reports whether it changed c. The logger is taken from ctx.
+//
+// A default is skipped when applying it would contradict a value the container
+// already has (request above the limit, or limit below the request): the API
+// server rejects such a spec, so the patch would fail on every retry.
+func applyContainerDefaults(ctx context.Context, c *corev1.Container, defaults []resourceDefault) bool {
+	logger := log.FromContext(ctx).WithValues("container", c.Name)
+	changed := false
+
+	for _, d := range defaults {
+		// Reading from a nil ResourceList is safe; the maps are only
+		// allocated when a value is actually written.
+		request, hasRequest := c.Resources.Requests[d.name]
+		limit, hasLimit := c.Resources.Limits[d.name]
+
+		if !hasRequest && d.request != nil {
+			if hasLimit && d.request.Cmp(limit) > 0 {
+				logger.V(1).Info("Skipping default "+d.label+" request: it exceeds the container's limit",
+					"value", d.request.String(), "limit", limit.String())
+			} else {
+				if c.Resources.Requests == nil {
+					c.Resources.Requests = corev1.ResourceList{}
+				}
+				c.Resources.Requests[d.name] = *d.request
+				// The limit check below must see the request just applied.
+				request, hasRequest = *d.request, true
+				logger.V(1).Info("Applied default "+d.label+" request", "value", d.request.String())
+				changed = true
+			}
+		}
+
+		if !hasLimit && d.limit != nil {
+			if hasRequest && d.limit.Cmp(request) < 0 {
+				logger.V(1).Info("Skipping default "+d.label+" limit: it is below the container's request",
+					"value", d.limit.String(), "request", request.String())
+			} else {
+				if c.Resources.Limits == nil {
+					c.Resources.Limits = corev1.ResourceList{}
+				}
+				c.Resources.Limits[d.name] = *d.limit
+				logger.V(1).Info("Applied default "+d.label+" limit", "value", d.limit.String())
+				changed = true
+			}
+		}
+	}
+	return changed
+}
+
+// parseQuantity parses s as a Kubernetes resource quantity. Surrounding
+// whitespace is ignored. An empty string yields (nil, nil), meaning that no
+// default is configured; a malformed string yields a wrapped parse error.
+func parseQuantity(s string) (*resource.Quantity, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return nil, nil
+	}
+	q, err := resource.ParseQuantity(s)
+	if err != nil {
+		return nil, fmt.Errorf("invalid quantity format '%s': %w", s, err)
+	}
+	return &q, nil
+}
+
+// mapConfigToDeployments is the map function for the NodeTainterConfig watch.
+// When the global config (the one named GlobalTaintConfigName) changes, it
+// returns one reconcile request per Deployment returned by List, leaving out
+// Deployments that carry the config's opt-out label. It returns nil for any
+// other object and when listing Deployments fails.
+func (r *DeploymentDefaultsReconciler) mapConfigToDeployments(ctx context.Context, obj client.Object) []reconcile.Request {
+	config, ok := obj.(*configv1alpha1.NodeTainterConfig)
+	if !ok || config.Name != GlobalTaintConfigName {
+		return nil
+	}
+
+	logger := log.FromContext(ctx)
+	logger.Info("Global NodeTainterConfig changed, queuing reconciliation for all deployments potentially affected by resource defaults", "configName", config.Name)
+
+	// No namespace option: List is not restricted to a single namespace.
+	var deploymentList appsv1.DeploymentList
+	if err := r.List(ctx, &deploymentList); err != nil {
+		logger.Error(err, "Failed to list deployments for config change")
+		return nil
+	}
+
+	optOutKey := strings.TrimSpace(config.Spec.OptOutLabelKey)
+	requests := make([]reconcile.Request, 0, len(deploymentList.Items))
+
+	// Index instead of ranging by value: a Deployment is a large struct and
+	// only its name, namespace and labels are needed here.
+	for i := range deploymentList.Items {
+		deployment := &deploymentList.Items[i]
+		if optOutKey != "" {
+			if _, exists := deployment.GetLabels()[optOutKey]; exists {
+				continue
+			}
+		}
+		requests = append(requests, reconcile.Request{
+			NamespacedName: types.NamespacedName{
+				Name:      deployment.Name,
+				Namespace: deployment.Namespace,
+			},
+		})
+	}
+	logger.Info("Queued deployment reconcile requests", "count", len(requests))
+	return requests
+}
+
+// SetupWithManager registers the reconciler with mgr. The controller
+// reconciles Deployments and additionally watches NodeTainterConfig, mapping
+// config changes to Deployments through mapConfigToDeployments.
+func (r *DeploymentDefaultsReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	// Keep a recorder supplied by the caller; only fall back to the manager's
+	// recorder when none was injected.
+	if r.Recorder == nil {
+		r.Recorder = mgr.GetEventRecorderFor("deploymentdefaults-controller")
+	}
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&appsv1.Deployment{}).
+		Watches(
+			&configv1alpha1.NodeTainterConfig{},
+			handler.EnqueueRequestsFromMapFunc(r.mapConfigToDeployments),
+		).
+		Complete(r)
+}