From 26db0d19e63f088b4517aee2339690fb5d432c67 Mon Sep 17 00:00:00 2001 From: Andreas Sommer Date: Thu, 3 Jul 2025 18:47:13 +0200 Subject: [PATCH] Remediate unhealthy MachinePool machines (backport) --- .../bases/cluster.x-k8s.io_machinepools.yaml | 37 +++++ exp/api/v1beta1/machinepool_types.go | 19 +++ exp/api/v1beta1/zz_generated.deepcopy.go | 25 +++ .../machinepool_controller_phases.go | 154 ++++++++++++++++-- .../machinepool_controller_phases_test.go | 115 ++++++++++++- internal/apis/core/exp/v1alpha3/conversion.go | 9 + .../apis/core/exp/v1alpha3/conversion_test.go | 23 ++- .../exp/v1alpha3/zz_generated.conversion.go | 29 +++- internal/apis/core/exp/v1alpha4/conversion.go | 4 + .../apis/core/exp/v1alpha4/conversion_test.go | 19 ++- .../exp/v1alpha4/zz_generated.conversion.go | 16 +- util/collections/machine_filters.go | 10 ++ 12 files changed, 427 insertions(+), 33 deletions(-) diff --git a/config/crd/bases/cluster.x-k8s.io_machinepools.yaml b/config/crd/bases/cluster.x-k8s.io_machinepools.yaml index ba81835620a1..74daec89a718 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinepools.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinepools.yaml @@ -1133,6 +1133,43 @@ spec: This is a pointer to distinguish between explicit zero and not specified. format: int32 type: integer + strategy: + description: strategy defines how to replace existing machines with + new ones. + properties: + remediation: + description: |- + remediation controls the strategy of remediating unhealthy machines + as marked by a MachineHealthCheck. This only applies to infrastructure + providers supporting "MachinePool Machines". For other providers, + no remediation is done. + properties: + maxInFlight: + anyOf: + - type: integer + - type: string + description: |- + MaxInFlight determines how many in flight remediations should happen at the same time. + + + Remediation only happens on the MachineSet with the most current revision, while + older MachineSets (usually present during rollout operations) aren't allowed to remediate. + + + Note: In general (independent of remediations), unhealthy machines are always + prioritized during scale down operations over healthy ones. + + + MaxInFlight can be set to a fixed number or a percentage. + Example: when this is set to 20%, the MachineSet controller deletes at most 20% of + the desired replicas. + + + If not set, remediation is limited to all machines (bounded by replicas) + under the active MachineSet's management. + x-kubernetes-int-or-string: true + type: object + type: object template: description: Template describes the machines that will be created. properties: diff --git a/exp/api/v1beta1/machinepool_types.go b/exp/api/v1beta1/machinepool_types.go index 0c7839998afa..1122bfc7e091 100644 --- a/exp/api/v1beta1/machinepool_types.go +++ b/exp/api/v1beta1/machinepool_types.go @@ -60,10 +60,29 @@ type MachinePoolSpec struct { // FailureDomains is the list of failure domains this MachinePool should be attached to. // +optional FailureDomains []string `json:"failureDomains,omitempty"` + + // strategy defines how to replace existing machines with new ones. + // +optional + Strategy *MachinePoolStrategy `json:"strategy,omitempty"` } // ANCHOR_END: MachinePoolSpec +// ANCHOR: MachinePoolStrategy + +// MachinePoolStrategy describes how to replace existing machines +// with new ones. +type MachinePoolStrategy struct { + // remediation controls the strategy of remediating unhealthy machines + // as marked by a MachineHealthCheck. 
This only applies to infrastructure + // providers supporting "MachinePool Machines". For other providers, + // no remediation is done. + // +optional + Remediation *clusterv1.RemediationStrategy `json:"remediation,omitempty"` +} + +// ANCHOR_END: MachinePoolStrategy + // ANCHOR: MachinePoolStatus // MachinePoolStatus defines the observed state of MachinePool. diff --git a/exp/api/v1beta1/zz_generated.deepcopy.go b/exp/api/v1beta1/zz_generated.deepcopy.go index 68cfe8fc8c3c..03455ce846ae 100644 --- a/exp/api/v1beta1/zz_generated.deepcopy.go +++ b/exp/api/v1beta1/zz_generated.deepcopy.go @@ -110,6 +110,11 @@ func (in *MachinePoolSpec) DeepCopyInto(out *MachinePoolSpec) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.Strategy != nil { + in, out := &in.Strategy, &out.Strategy + *out = new(MachinePoolStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachinePoolSpec. @@ -158,3 +163,23 @@ func (in *MachinePoolStatus) DeepCopy() *MachinePoolStatus { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MachinePoolStrategy) DeepCopyInto(out *MachinePoolStrategy) { + *out = *in + if in.Remediation != nil { + in, out := &in.Remediation, &out.Remediation + *out = new(apiv1beta1.RemediationStrategy) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachinePoolStrategy. +func (in *MachinePoolStrategy) DeepCopy() *MachinePoolStrategy { + if in == nil { + return nil + } + out := new(MachinePoolStrategy) + in.DeepCopyInto(out) + return out +} diff --git a/exp/internal/controllers/machinepool_controller_phases.go b/exp/internal/controllers/machinepool_controller_phases.go index e46b58f8818c..b24e943f9bc6 100644 --- a/exp/internal/controllers/machinepool_controller_phases.go +++ b/exp/internal/controllers/machinepool_controller_phases.go @@ -20,6 +20,8 @@ import ( "context" "fmt" "reflect" + "slices" + "sort" "time" "github.com/pkg/errors" @@ -28,6 +30,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" "k8s.io/utils/ptr" @@ -44,6 +47,7 @@ import ( "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" + "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" utilconversion "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/labels" @@ -294,7 +298,10 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s * // Get the nodeRefsMap from the cluster. 
s.nodeRefMap, getNodeRefsErr = r.getNodeRefMap(ctx, clusterClient) - err = r.reconcileMachines(ctx, s, infraConfig) + res := ctrl.Result{} + + reconcileMachinesRes, err := r.reconcileMachines(ctx, s, infraConfig) + res = util.LowestNonZeroResult(res, reconcileMachinesRes) if err != nil || getNodeRefsErr != nil { return ctrl.Result{}, kerrors.NewAggregate([]error{errors.Wrapf(err, "failed to reconcile Machines for MachinePool %s", klog.KObj(mp)), errors.Wrapf(getNodeRefsErr, "failed to get nodeRefs for MachinePool %s", klog.KObj(mp))}) @@ -302,7 +309,7 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s * if !mp.Status.InfrastructureReady { log.Info("Infrastructure provider is not yet ready", infraConfig.GetKind(), klog.KObj(infraConfig)) - return ctrl.Result{}, nil + return res, nil } var providerIDList []string @@ -321,7 +328,7 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s * if len(providerIDList) == 0 && mp.Status.Replicas != 0 { log.Info("Retrieved empty spec.providerIDList from infrastructure provider but status.replicas is not zero.", "replicas", mp.Status.Replicas) - return ctrl.Result{}, nil + return res, nil } if !reflect.DeepEqual(mp.Spec.ProviderIDList, providerIDList) { @@ -331,7 +338,7 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s * mp.Status.UnavailableReplicas = mp.Status.Replicas } - return ctrl.Result{}, nil + return res, nil } // reconcileMachines reconciles Machines associated with a MachinePool. @@ -341,7 +348,7 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s * // infrastructure is created accordingly. // Note: When supported by the cloud provider implementation of the MachinePool, machines will provide a means to interact // with the corresponding infrastructure (e.g. delete a specific machine in case MachineHealthCheck detects it is unhealthy). 
-func (r *MachinePoolReconciler) reconcileMachines(ctx context.Context, s *scope, infraMachinePool *unstructured.Unstructured) error { +func (r *MachinePoolReconciler) reconcileMachines(ctx context.Context, s *scope, infraMachinePool *unstructured.Unstructured) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx) mp := s.machinePool @@ -349,10 +356,10 @@ func (r *MachinePoolReconciler) reconcileMachines(ctx context.Context, s *scope, if err := util.UnstructuredUnmarshalField(infraMachinePool, &infraMachineKind, "status", "infrastructureMachineKind"); err != nil { if errors.Is(err, util.ErrUnstructuredFieldNotFound) { log.V(4).Info("MachinePool Machines not supported, no infraMachineKind found") - return nil + return ctrl.Result{}, nil } - return errors.Wrapf(err, "failed to retrieve infraMachineKind from infrastructure provider for MachinePool %s", klog.KObj(mp)) + return ctrl.Result{}, errors.Wrapf(err, "failed to retrieve infraMachineKind from infrastructure provider for MachinePool %s", klog.KObj(mp)) } infraMachineSelector := metav1.LabelSelector{ @@ -369,7 +376,7 @@ func (r *MachinePoolReconciler) reconcileMachines(ctx context.Context, s *scope, infraMachineList.SetAPIVersion(infraMachinePool.GetAPIVersion()) infraMachineList.SetKind(infraMachineKind + "List") if err := r.Client.List(ctx, &infraMachineList, client.InNamespace(mp.Namespace), client.MatchingLabels(infraMachineSelector.MatchLabels)); err != nil { - return errors.Wrapf(err, "failed to list infra machines for MachinePool %q in namespace %q", mp.Name, mp.Namespace) + return ctrl.Result{}, errors.Wrapf(err, "failed to list infra machines for MachinePool %q in namespace %q", mp.Name, mp.Namespace) } // Add watcher for infraMachine, if there isn't one already; this will allow this controller to reconcile @@ -380,21 +387,26 @@ func (r *MachinePoolReconciler) reconcileMachines(ctx context.Context, s *scope, // Add watcher for infraMachine, if there isn't one already. if err := r.externalTracker.Watch(log, sampleInfraMachine, handler.EnqueueRequestsFromMapFunc(r.infraMachineToMachinePoolMapper)); err != nil { - return err + return ctrl.Result{}, err } // Get the list of machines managed by this controller, and align it with the infra machines managed by // the InfraMachinePool controller. machineList := &clusterv1.MachineList{} if err := r.Client.List(ctx, machineList, client.InNamespace(mp.Namespace), client.MatchingLabels(infraMachineSelector.MatchLabels)); err != nil { - return err + return ctrl.Result{}, err } if err := r.createOrUpdateMachines(ctx, s, machineList.Items, infraMachineList.Items); err != nil { - return errors.Wrapf(err, "failed to create machines for MachinePool %q in namespace %q", mp.Name, mp.Namespace) + return ctrl.Result{}, errors.Wrapf(err, "failed to create machines for MachinePool %q in namespace %q", mp.Name, mp.Namespace) } - return nil + res, err := r.reconcileUnhealthyMachines(ctx, s, machineList.Items) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile unhealthy machines for MachinePool %s", klog.KObj(mp)) + } + + return res, nil } // createOrUpdateMachines creates a MachinePool Machine for each infraMachine if it doesn't already exist and sets the owner reference and infraRef. 
@@ -594,3 +606,121 @@ func (r *MachinePoolReconciler) getNodeRefMap(ctx context.Context, c client.Clie return nodeRefsMap, nil } + +func (r *MachinePoolReconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope, machines []clusterv1.Machine) (ctrl.Result, error) { + if len(machines) == 0 { + return ctrl.Result{}, nil + } + + log := ctrl.LoggerFrom(ctx) + mp := s.machinePool + + machinesWithHealthCheck := slices.DeleteFunc(slices.Clone(machines), func(machine clusterv1.Machine) bool { + return !conditions.Has(&machine, clusterv1.MachineHealthCheckSucceededCondition) + }) + if len(machinesWithHealthCheck) == 0 { + // This means there is no MachineHealthCheck selecting any machines + // of this machine pool. In this case, do not requeue so often, + // but still check regularly in case a MachineHealthCheck became + // deployed or activated. This long interval shouldn't be a problem + // at cluster creation, since newly-created nodes should anyway + // trigger MachinePool reconciliation as the infrastructure provider + // creates the InfraMachines. + log.V(4).Info("Skipping reconciliation of unhealthy MachinePool machines because there are no health-checked machines") + return ctrl.Result{RequeueAfter: 10 * time.Minute}, nil + } + + unhealthyMachines := slices.DeleteFunc(slices.Clone(machines), func(machine clusterv1.Machine) bool { + return !collections.IsUnhealthyAndOwnerRemediated(&machine) + }) + log.V(4).Info("Reconciling unhealthy MachinePool machines", "unhealthyMachines", len(unhealthyMachines)) + + // Calculate how many in flight machines we should remediate. + // By default, we allow all machines to be remediated at the same time. + maxInFlight := len(unhealthyMachines) + if mp.Spec.Strategy != nil && mp.Spec.Strategy.Remediation != nil { + if mp.Spec.Strategy.Remediation.MaxInFlight != nil { + var err error + replicas := int(ptr.Deref(mp.Spec.Replicas, 1)) + maxInFlight, err = intstr.GetScaledValueFromIntOrPercent(mp.Spec.Strategy.Remediation.MaxInFlight, replicas, true) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to calculate maxInFlight to remediate machines: %v", err) + } + log = log.WithValues("maxInFlight", maxInFlight, "replicas", replicas) + } + } + + machinesToRemediate := make([]*clusterv1.Machine, 0, len(unhealthyMachines)) + inFlight := 0 + for _, m := range unhealthyMachines { + if !m.DeletionTimestamp.IsZero() { + if conditions.IsTrue(&m, clusterv1.MachineOwnerRemediatedCondition) { + // Machine has been remediated by this controller and still in flight. + inFlight++ + } + continue + } + if conditions.IsFalse(&m, clusterv1.MachineOwnerRemediatedCondition) { + machinesToRemediate = append(machinesToRemediate, &m) + } + } + log = log.WithValues("inFlight", inFlight) + + if len(machinesToRemediate) == 0 { + // There's a MachineHealthCheck monitoring the machines, but currently + // no action to be taken. A machine could require remediation at any + // time, so use a short interval until next reconciliation. + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + + if inFlight >= maxInFlight { + log.V(3).Info("Remediation strategy is set, and maximum in flight has been reached", "machinesToBeRemediated", len(machinesToRemediate)) + + // Check soon again whether the already-remediating (= deleting) machines are gone + // so that more machines can be remediated + return ctrl.Result{RequeueAfter: 15 * time.Second}, nil + } + + // Sort the machines from newest to oldest. 
+ // We are trying to remediate machines failing to come up first because + // there is a chance that they are not hosting any workloads (minimize disruption). + sort.SliceStable(machinesToRemediate, func(i, j int) bool { + return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time) + }) + + haveMoreMachinesToRemediate := false + if len(machinesToRemediate) > (maxInFlight - inFlight) { + haveMoreMachinesToRemediate = true + log.V(5).Info("Remediation strategy is set, limiting in flight operations", "machinesToBeRemediated", len(machinesToRemediate)) + machinesToRemediate = machinesToRemediate[:(maxInFlight - inFlight)] + } + + // Remediate unhealthy machines by deleting them + var errs []error + for _, m := range machinesToRemediate { + log.Info("Deleting unhealthy Machine", "Machine", klog.KObj(m)) + patch := client.MergeFrom(m.DeepCopy()) + if err := r.Client.Delete(ctx, m); err != nil { + if apierrors.IsNotFound(err) { + continue + } + errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m))) + continue + } + conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition) + if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) { + errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m))) + } + } + + if len(errs) > 0 { + return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines") + } + + if haveMoreMachinesToRemediate { + // More machines need remediation, so reconcile again sooner + return ctrl.Result{RequeueAfter: 15 * time.Second}, nil + } + + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil +} diff --git a/exp/internal/controllers/machinepool_controller_phases_test.go b/exp/internal/controllers/machinepool_controller_phases_test.go index a995c2a95838..de976b593860 100644 --- a/exp/internal/controllers/machinepool_controller_phases_test.go +++ b/exp/internal/controllers/machinepool_controller_phases_test.go @@ -41,8 +41,10 @@ import ( expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" "sigs.k8s.io/cluster-api/internal/test/builder" "sigs.k8s.io/cluster-api/internal/util/ssa" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/kubeconfig" "sigs.k8s.io/cluster-api/util/labels/format" + "sigs.k8s.io/cluster-api/util/patch" ) const ( @@ -1374,7 +1376,7 @@ func TestReconcileMachinePoolMachines(t *testing.T) { scope := &scope{ machinePool: &machinePool, } - err = r.reconcileMachines(ctx, scope, &unstructured.Unstructured{Object: infraConfig}) + _, err = r.reconcileMachines(ctx, scope, &unstructured.Unstructured{Object: infraConfig}) r.reconcilePhase(&machinePool) g.Expect(err).ToNot(HaveOccurred()) @@ -1439,7 +1441,7 @@ func TestReconcileMachinePoolMachines(t *testing.T) { machinePool: &machinePool, } - err = r.reconcileMachines(ctx, scope, &unstructured.Unstructured{Object: infraConfig}) + _, err = r.reconcileMachines(ctx, scope, &unstructured.Unstructured{Object: infraConfig}) r.reconcilePhase(&machinePool) g.Expect(err).ToNot(HaveOccurred()) @@ -1498,9 +1500,12 @@ func TestReconcileMachinePoolMachines(t *testing.T) { machinePool: &machinePool, } - err = r.reconcileMachines(ctx, scope, &unstructured.Unstructured{Object: infraConfig}) + res, err := r.reconcileMachines(ctx, scope, &unstructured.Unstructured{Object: infraConfig}) r.reconcilePhase(&machinePool) g.Expect(err).ToNot(HaveOccurred()) + // Regular reconciliation makes no sense if infra provider + // doesn't support 
MachinePool machines + g.Expect(res.RequeueAfter).To(BeZero()) machineList := &clusterv1.MachineList{} labels := map[string]string{ @@ -1510,6 +1515,110 @@ func TestReconcileMachinePoolMachines(t *testing.T) { g.Expect(env.GetAPIReader().List(ctx, machineList, client.InNamespace(cluster.Namespace), client.MatchingLabels(labels))).To(Succeed()) g.Expect(machineList.Items).To(BeEmpty()) }) + + t.Run("Should delete unhealthy machines", func(*testing.T) { + machinePool := getMachinePool(3, "machinepool-test-4", clusterName, ns.Name) + g.Expect(env.CreateAndWait(ctx, &machinePool)).To(Succeed()) + + infraMachines := getInfraMachines(3, machinePool.Name, clusterName, ns.Name) + for i := range infraMachines { + g.Expect(env.CreateAndWait(ctx, &infraMachines[i])).To(Succeed()) + } + + machines := getMachines(3, machinePool.Name, clusterName, ns.Name) + for i := range machines { + g.Expect(env.CreateAndWait(ctx, &machines[i])).To(Succeed()) + } + + // machines[0] isn't changed here (no conditions = considered healthy). + + // machines[1] is marked as unhealthy by conditions + patchHelper, err := patch.NewHelper(&machines[1], env) + unhealthyMachineName := machines[1].Name + conditions.MarkFalse(&machines[1], clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") + conditions.MarkFalse(&machines[1], clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(patchHelper.Patch(ctx, &machines[1], patch.WithStatusObservedGeneration{}, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineHealthCheckSucceededCondition, + clusterv1.MachineOwnerRemediatedCondition, + }}, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineHealthCheckSucceededCondition, + clusterv1.MachineOwnerRemediatedCondition, + }})).To(Succeed()) + + // machines[2] is marked as healthy by conditions + patchHelper, err = patch.NewHelper(&machines[2], env) + conditions.MarkTrue(&machines[2], clusterv1.MachineHealthCheckSucceededCondition) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(patchHelper.Patch(ctx, &machines[2], patch.WithStatusObservedGeneration{}, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineHealthCheckSucceededCondition, + clusterv1.MachineOwnerRemediatedCondition, + }}, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineHealthCheckSucceededCondition, + clusterv1.MachineOwnerRemediatedCondition, + }})).To(Succeed()) + + infraConfig := map[string]interface{}{ + "kind": builder.GenericInfrastructureMachinePoolKind, + "apiVersion": builder.InfrastructureGroupVersion.String(), + "metadata": map[string]interface{}{ + "name": "infra-config4", + "namespace": ns.Name, + }, + "spec": map[string]interface{}{ + "providerIDList": []interface{}{ + "test://id-1", + }, + }, + "status": map[string]interface{}{ + "ready": true, + "addresses": []interface{}{ + map[string]interface{}{ + "type": "InternalIP", + "address": "10.0.0.1", + }, + map[string]interface{}{ + "type": "InternalIP", + "address": "10.0.0.2", + }, + }, + "infrastructureMachineKind": builder.GenericInfrastructureMachineKind, + }, + } + g.Expect(env.CreateAndWait(ctx, &unstructured.Unstructured{Object: infraConfig})).To(Succeed()) + + r := &MachinePoolReconciler{ + Client: env, + ssaCache: ssa.NewCache(), + } + scope := &scope{ + machinePool: &machinePool, + } 
+ res, err := r.reconcileMachines(ctx, scope, &unstructured.Unstructured{Object: infraConfig}) + r.reconcilePhase(&machinePool) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter).To(BeNumerically(">=", 0)) + + machineList := &clusterv1.MachineList{} + labels := map[string]string{ + clusterv1.ClusterNameLabel: clusterName, + clusterv1.MachinePoolNameLabel: machinePool.Name, + } + g.Expect(env.GetAPIReader().List(ctx, machineList, client.InNamespace(cluster.Namespace), client.MatchingLabels(labels))).To(Succeed()) + + // The unhealthy machine should have been remediated (= deleted) + g.Expect(machineList.Items).To(HaveLen(2)) + + for i := range machineList.Items { + machine := &machineList.Items[i] + + // Healthy machines should remain + g.Expect(machine.Name).ToNot(Equal(unhealthyMachineName)) + + _, err := external.Get(ctx, r.Client, &machine.Spec.InfrastructureRef, machine.Namespace) + g.Expect(err).ToNot(HaveOccurred()) + } + }) }) } diff --git a/internal/apis/core/exp/v1alpha3/conversion.go b/internal/apis/core/exp/v1alpha3/conversion.go index 1aae52b47bee..909a37abec1d 100644 --- a/internal/apis/core/exp/v1alpha3/conversion.go +++ b/internal/apis/core/exp/v1alpha3/conversion.go @@ -27,6 +27,15 @@ import ( utilconversion "sigs.k8s.io/cluster-api/util/conversion" ) +func Convert_v1alpha3_MachineDeploymentStrategy_To_v1beta1_MachinePoolStrategy(in *clusterv1alpha3.MachineDeploymentStrategy, out *expv1.MachinePoolStrategy, _ apimachineryconversion.Scope) error { + out.Remediation = nil + return nil +} + +func Convert_v1beta1_MachinePoolStrategy_To_v1alpha3_MachineDeploymentStrategy(in *expv1.MachinePoolStrategy, out *clusterv1alpha3.MachineDeploymentStrategy, _ apimachineryconversion.Scope) error { + return nil +} + // Convert_v1alpha3_MachinePoolSpec_To_v1beta1_MachinePoolSpec is an autogenerated conversion function. func Convert_v1alpha3_MachinePoolSpec_To_v1beta1_MachinePoolSpec(in *MachinePoolSpec, out *expv1.MachinePoolSpec, s apimachineryconversion.Scope) error { return autoConvert_v1alpha3_MachinePoolSpec_To_v1beta1_MachinePoolSpec(in, out, s) diff --git a/internal/apis/core/exp/v1alpha3/conversion_test.go b/internal/apis/core/exp/v1alpha3/conversion_test.go index 6df28a82fba7..0c0630633fbf 100644 --- a/internal/apis/core/exp/v1alpha3/conversion_test.go +++ b/internal/apis/core/exp/v1alpha3/conversion_test.go @@ -26,13 +26,15 @@ import ( expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" clusterv1alpha3 "sigs.k8s.io/cluster-api/internal/apis/core/v1alpha3" utilconversion "sigs.k8s.io/cluster-api/util/conversion" + "sigs.k8s.io/controller-runtime/pkg/conversion" ) func TestFuzzyConversion(t *testing.T) { t.Run("for MachinePool", utilconversion.FuzzTestFunc(utilconversion.FuzzTestFuncInput{ - Hub: &expv1.MachinePool{}, - Spoke: &MachinePool{}, - FuzzerFuncs: []fuzzer.FuzzerFuncs{fuzzFuncs}, + Hub: &expv1.MachinePool{}, + HubAfterMutation: machinePoolHubAfterMutation, + Spoke: &MachinePool{}, + FuzzerFuncs: []fuzzer.FuzzerFuncs{fuzzFuncs}, })) } @@ -41,6 +43,7 @@ func fuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} { BootstrapFuzzer, MachinePoolSpecFuzzer, ObjectMetaFuzzer, + hubMachinePoolSpec, } } @@ -65,7 +68,17 @@ func ObjectMetaFuzzer(in *clusterv1alpha3.ObjectMeta, c fuzz.Continue) { func MachinePoolSpecFuzzer(in *MachinePoolSpec, c fuzz.Continue) { c.Fuzz(in) - // These fields have been removed in v1beta1 - // data is going to be lost, so we're forcing zero values here. 
+ in.Strategy = nil +} + +func machinePoolHubAfterMutation(c conversion.Hub) { + mp := c.(*expv1.MachinePool) + + mp.Spec.Strategy = nil +} + +func hubMachinePoolSpec(in *expv1.MachinePoolSpec, c fuzz.Continue) { + c.Fuzz(in) + in.Strategy = nil } diff --git a/internal/apis/core/exp/v1alpha3/zz_generated.conversion.go b/internal/apis/core/exp/v1alpha3/zz_generated.conversion.go index 35410aaf1175..d3e477200445 100644 --- a/internal/apis/core/exp/v1alpha3/zz_generated.conversion.go +++ b/internal/apis/core/exp/v1alpha3/zz_generated.conversion.go @@ -65,6 +65,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*corev1alpha3.MachineDeploymentStrategy)(nil), (*v1beta1.MachinePoolStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1alpha3_MachineDeploymentStrategy_To_v1beta1_MachinePoolStrategy(a.(*corev1alpha3.MachineDeploymentStrategy), b.(*v1beta1.MachinePoolStrategy), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*MachinePoolSpec)(nil), (*v1beta1.MachinePoolSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha3_MachinePoolSpec_To_v1beta1_MachinePoolSpec(a.(*MachinePoolSpec), b.(*v1beta1.MachinePoolSpec), scope) }); err != nil { @@ -80,6 +85,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.MachinePoolStrategy)(nil), (*corev1alpha3.MachineDeploymentStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_MachinePoolStrategy_To_v1alpha3_MachineDeploymentStrategy(a.(*v1beta1.MachinePoolStrategy), b.(*corev1alpha3.MachineDeploymentStrategy), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*v1beta1.MachinePool)(nil), (*MachinePool)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_MachinePool_To_v1alpha3_MachinePool(a.(*v1beta1.MachinePool), b.(*MachinePool), scope) }); err != nil { @@ -163,7 +173,15 @@ func autoConvert_v1alpha3_MachinePoolSpec_To_v1beta1_MachinePoolSpec(in *Machine if err := Convert_v1alpha3_MachineTemplateSpec_To_v1beta1_MachineTemplateSpec(&in.Template, &out.Template, s); err != nil { return err } - // WARNING: in.Strategy requires manual conversion: does not exist in peer-type + if in.Strategy != nil { + in, out := &in.Strategy, &out.Strategy + *out = new(v1beta1.MachinePoolStrategy) + if err := Convert_v1alpha3_MachineDeploymentStrategy_To_v1beta1_MachinePoolStrategy(*in, *out, s); err != nil { + return err + } + } else { + out.Strategy = nil + } out.MinReadySeconds = (*int32)(unsafe.Pointer(in.MinReadySeconds)) out.ProviderIDList = *(*[]string)(unsafe.Pointer(&in.ProviderIDList)) out.FailureDomains = *(*[]string)(unsafe.Pointer(&in.FailureDomains)) @@ -179,6 +197,15 @@ func autoConvert_v1beta1_MachinePoolSpec_To_v1alpha3_MachinePoolSpec(in *v1beta1 out.MinReadySeconds = (*int32)(unsafe.Pointer(in.MinReadySeconds)) out.ProviderIDList = *(*[]string)(unsafe.Pointer(&in.ProviderIDList)) out.FailureDomains = *(*[]string)(unsafe.Pointer(&in.FailureDomains)) + if in.Strategy != nil { + in, out := &in.Strategy, &out.Strategy + *out = new(corev1alpha3.MachineDeploymentStrategy) + if err := Convert_v1beta1_MachinePoolStrategy_To_v1alpha3_MachineDeploymentStrategy(*in, *out, s); err != nil { + return err + } + } else { + out.Strategy = nil + } return nil } diff --git a/internal/apis/core/exp/v1alpha4/conversion.go 
b/internal/apis/core/exp/v1alpha4/conversion.go index 70d9bde60f00..dce332118efe 100644 --- a/internal/apis/core/exp/v1alpha4/conversion.go +++ b/internal/apis/core/exp/v1alpha4/conversion.go @@ -71,3 +71,7 @@ func Convert_v1alpha4_MachineTemplateSpec_To_v1beta1_MachineTemplateSpec(in *clu func Convert_v1beta1_MachineTemplateSpec_To_v1alpha4_MachineTemplateSpec(in *clusterv1.MachineTemplateSpec, out *clusterv1alpha4.MachineTemplateSpec, s apimachineryconversion.Scope) error { return clusterv1alpha4.Convert_v1beta1_MachineTemplateSpec_To_v1alpha4_MachineTemplateSpec(in, out, s) } + +func Convert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec(in *expv1.MachinePoolSpec, out *MachinePoolSpec, s apimachineryconversion.Scope) error { + return autoConvert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec(in, out, s) +} diff --git a/internal/apis/core/exp/v1alpha4/conversion_test.go b/internal/apis/core/exp/v1alpha4/conversion_test.go index bfba3b1fb598..bf0e6dacacaf 100644 --- a/internal/apis/core/exp/v1alpha4/conversion_test.go +++ b/internal/apis/core/exp/v1alpha4/conversion_test.go @@ -19,7 +19,9 @@ package v1alpha4 import ( "testing" + fuzz "github.com/google/gofuzz" "k8s.io/apimachinery/pkg/api/apitesting/fuzzer" + runtimeserializer "k8s.io/apimachinery/pkg/runtime/serializer" expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1" utilconversion "sigs.k8s.io/cluster-api/util/conversion" @@ -27,8 +29,21 @@ import ( func TestFuzzyConversion(t *testing.T) { t.Run("for MachinePool", utilconversion.FuzzTestFunc(utilconversion.FuzzTestFuncInput{ - Hub: &expv1.MachinePool{}, + Hub: &expv1.MachinePool{}, + // HubAfterMutation: machinePoolHubAfterMutation, Spoke: &MachinePool{}, - FuzzerFuncs: []fuzzer.FuzzerFuncs{}, + FuzzerFuncs: []fuzzer.FuzzerFuncs{fuzzFuncs}, })) } + +func fuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} { + return []interface{}{ + hubMachinePoolSpec, + } +} + +func hubMachinePoolSpec(in *expv1.MachinePoolSpec, c fuzz.Continue) { + c.Fuzz(in) + + in.Strategy = nil +} diff --git a/internal/apis/core/exp/v1alpha4/zz_generated.conversion.go b/internal/apis/core/exp/v1alpha4/zz_generated.conversion.go index 2d0dff177b01..99611b89152d 100644 --- a/internal/apis/core/exp/v1alpha4/zz_generated.conversion.go +++ b/internal/apis/core/exp/v1alpha4/zz_generated.conversion.go @@ -65,11 +65,6 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } - if err := s.AddGeneratedConversionFunc((*v1beta1.MachinePoolSpec)(nil), (*MachinePoolSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { - return Convert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec(a.(*v1beta1.MachinePoolSpec), b.(*MachinePoolSpec), scope) - }); err != nil { - return err - } if err := s.AddGeneratedConversionFunc((*MachinePoolStatus)(nil), (*v1beta1.MachinePoolStatus)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha4_MachinePoolStatus_To_v1beta1_MachinePoolStatus(a.(*MachinePoolStatus), b.(*v1beta1.MachinePoolStatus), scope) }); err != nil { @@ -85,6 +80,11 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddConversionFunc((*v1beta1.MachinePoolSpec)(nil), (*MachinePoolSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec(a.(*v1beta1.MachinePoolSpec), b.(*MachinePoolSpec), scope) + }); err != nil { + return err + } if err := s.AddConversionFunc((*apiv1beta1.MachineTemplateSpec)(nil), 
(*corev1alpha4.MachineTemplateSpec)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1beta1_MachineTemplateSpec_To_v1alpha4_MachineTemplateSpec(a.(*apiv1beta1.MachineTemplateSpec), b.(*corev1alpha4.MachineTemplateSpec), scope) }); err != nil { @@ -193,14 +193,10 @@ func autoConvert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec(in *v1beta1 out.MinReadySeconds = (*int32)(unsafe.Pointer(in.MinReadySeconds)) out.ProviderIDList = *(*[]string)(unsafe.Pointer(&in.ProviderIDList)) out.FailureDomains = *(*[]string)(unsafe.Pointer(&in.FailureDomains)) + // WARNING: in.Strategy requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec is an autogenerated conversion function. -func Convert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec(in *v1beta1.MachinePoolSpec, out *MachinePoolSpec, s conversion.Scope) error { - return autoConvert_v1beta1_MachinePoolSpec_To_v1alpha4_MachinePoolSpec(in, out, s) -} - func autoConvert_v1alpha4_MachinePoolStatus_To_v1beta1_MachinePoolStatus(in *MachinePoolStatus, out *v1beta1.MachinePoolStatus, s conversion.Scope) error { out.NodeRefs = *(*[]v1.ObjectReference)(unsafe.Pointer(&in.NodeRefs)) out.Replicas = in.Replicas diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 6c0c7813ad9a..8b1c48c0b747 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -194,6 +194,16 @@ func HasUnhealthyControlPlaneComponents(isEtcdManaged bool) Func { } } +// IsUnhealthyAndOwnerRemediated returns a filter to find all machines that have a MachineHealthCheckSucceeded condition set to False, +// indicating a problem was detected on the machine, and the MachineOwnerRemediated condition set to False, indicating that the machine owner is +// responsible for performing remediation for the machine. +func IsUnhealthyAndOwnerRemediated(machine *clusterv1.Machine) bool { + if machine == nil { + return false + } + return conditions.IsFalse(machine, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) +} + // IsReady returns a filter to find all machines with the ReadyCondition equals to True. func IsReady() Func { return func(machine *clusterv1.Machine) bool {
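
For reference, a minimal sketch of how a user would consume the field added by this backport: the MachinePool spec gains strategy.remediation.maxInFlight (a fixed number or a percentage), as described in the CRD change above. Object names, the namespace, and the replica count below are illustrative, not taken from the patch.

apiVersion: cluster.x-k8s.io/v1beta1
kind: MachinePool
metadata:
  name: my-machine-pool            # illustrative name
  namespace: default
spec:
  clusterName: my-cluster          # illustrative name
  replicas: 10
  strategy:
    remediation:
      # Keep at most 20% of desired replicas (here: 2 machines) in flight
      # for remediation (deletion) at any one time when a MachineHealthCheck
      # marks them unhealthy. Accepts a fixed number or a percentage; if
      # omitted, all unhealthy machines under the pool may be remediated at once.
      maxInFlight: "20%"
  # template (bootstrap config and infrastructureRef) omitted for brevity

As the field description states, remediation only applies to infrastructure providers that support "MachinePool Machines"; other providers ignore the setting.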