diff --git a/.gitignore b/.gitignore index 30a43a92..8680e16b 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,4 @@ dist/ build/ # Entrypoint for the application -!/cmd/d8 \ No newline at end of file +!/cmd/d8 diff --git a/cmd/commands/cni.go b/cmd/commands/cni.go new file mode 100644 index 00000000..8d1ac8ab --- /dev/null +++ b/cmd/commands/cni.go @@ -0,0 +1,159 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package commands + +import ( + "fmt" + "log" + "strings" + "time" + + "github.com/go-logr/logr" + "github.com/spf13/cobra" + "k8s.io/kubectl/pkg/util/templates" + ctrllog "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/deckhouse/deckhouse-cli/internal/cni" +) + +var ( + cniSwitchLong = templates.LongDesc(` +A group of commands to switch the CNI (Container Network Interface) provider in the Deckhouse cluster. + +This process is divided into several steps: + + - 'd8 cni-switch prepare' - STEP 1. Prepares the cluster for CNI migration. + - 'd8 cni-switch switch' - STEP 2. Performs the actual CNI switch. + - 'd8 cni-switch cleanup' - STEP 3. Cleans up resources if the switch is aborted. + - 'd8 cni-switch rollback' - (Optional) Rollback CNI if the switch is aborted.`) + + cniPrepareExample = templates.Examples(` + # Prepare to switch to Cilium CNI + d8 cni-switch prepare --to-cni cilium`) + + cniSwitchExample = templates.Examples(` + # Perform the CNI switch after the prepare step is complete + d8 cni-switch switch`) + + cniCleanupExample = templates.Examples(` + # Cleanup resources created by the 'prepare' command + d8 cni-switch cleanup`) + + cniRollbackExample = templates.Examples(` + # Rollback changes, restore previous CNI, cleanup resources created by the 'prepare' command + d8 cni-switch rollback`) + + supportedCNIs = []string{"cilium", "flannel", "simple-bridge"} +) + +func NewCniSwitchCommand() *cobra.Command { + log.SetFlags(0) + ctrllog.SetLogger(logr.Discard()) + + cmd := &cobra.Command{ + Use: "cni-switch", + Short: "A group of commands to switch CNI in the cluster", + Long: cniSwitchLong, + } + cmd.PersistentFlags().Duration("timeout", 1*time.Hour, "The timeout for the entire operation (e.g., 40m, 1h)") + cmd.AddCommand(NewCmdCniPrepare()) + cmd.AddCommand(NewCmdCniSwitch()) + cmd.AddCommand(NewCmdCniCleanup()) + cmd.AddCommand(NewCmdCniRollback()) + return cmd +} + +func NewCmdCniPrepare() *cobra.Command { + cmd := &cobra.Command{ + Use: "prepare", + Short: "Prepares the cluster for CNI switching", + Example: cniPrepareExample, + PreRunE: func(cmd *cobra.Command, _ []string) error { + targetCNI, _ := cmd.Flags().GetString("to-cni") + for _, supported := range supportedCNIs { + if strings.ToLower(targetCNI) == supported { + return nil + } + } + return fmt.Errorf( + "invalid --to-cni value %q. Supported values are: %s", + targetCNI, + strings.Join(supportedCNIs, ", "), + ) + }, + + Run: func(cmd *cobra.Command, _ []string) { + targetCNI, _ := cmd.Flags().GetString("to-cni") + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunPrepare(targetCNI, timeout); err != nil { + log.Fatalf("āŒ Error running prepare command: %v", err) + } + }, + } + cmd.Flags().String("to-cni", "", fmt.Sprintf( + "Target CNI provider to switch to. Supported values: %s", + strings.Join(supportedCNIs, ", "), + )) + _ = cmd.MarkFlagRequired("to-cni") + + return cmd +} + +func NewCmdCniSwitch() *cobra.Command { + cmd := &cobra.Command{ + Use: "switch", + Short: "Performs the CNI switching", + Example: cniSwitchExample, + Run: func(cmd *cobra.Command, _ []string) { + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunSwitch(timeout); err != nil { + log.Fatalf("āŒ Error running switch command: %v", err) + } + }, + } + return cmd +} + +func NewCmdCniCleanup() *cobra.Command { + cmd := &cobra.Command{ + Use: "cleanup", + Short: "Cleans up resources created during CNI switching", + Example: cniCleanupExample, + Run: func(cmd *cobra.Command, _ []string) { + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunCleanup(timeout); err != nil { + log.Fatalf("āŒ Error running cleanup command: %v", err) + } + }, + } + return cmd +} + +func NewCmdCniRollback() *cobra.Command { // TDEN It needs to be done! + cmd := &cobra.Command{ + Use: "rollback", + Short: "Rollback all changes and restore previous CNI", + Example: cniRollbackExample, + Run: func(cmd *cobra.Command, _ []string) { + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunRollback(timeout); err != nil { + log.Fatalf("āŒ Error running rollback command: %v", err) + } + }, + } + return cmd +} diff --git a/cmd/commands/kubectl.go b/cmd/commands/kubectl.go index d029fdc0..5f14c148 100644 --- a/cmd/commands/kubectl.go +++ b/cmd/commands/kubectl.go @@ -32,12 +32,12 @@ import ( cliflag "k8s.io/component-base/cli/flag" "k8s.io/component-base/logs" kubecmd "k8s.io/kubectl/pkg/cmd" + + "github.com/deckhouse/deckhouse-cli/internal/cni" ) const ( - cmNamespace = "d8-system" - cmName = "debug-container" - cmImageKey = "image" + cmImageKey = "debug-container-image" ) var d8CommandRegex = regexp.MustCompile("([\"'`])d8 (\\w+)") @@ -112,7 +112,7 @@ func getDebugImage(cmd *cobra.Command) (string, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - configMap, err := kubeCl.CoreV1().ConfigMaps(cmNamespace).Get(ctx, cmName, v1.GetOptions{}) + configMap, err := kubeCl.CoreV1().ConfigMaps(cni.CMDataNameSpace).Get(ctx, cni.CMDataName, v1.GetOptions{}) if err != nil { return "", ErrGenericImageFetch } diff --git a/cmd/d8/root.go b/cmd/d8/root.go index 46ccce7d..946c7936 100644 --- a/cmd/d8/root.go +++ b/cmd/d8/root.go @@ -98,6 +98,7 @@ func (r *RootCommand) registerCommands() { r.cmd.AddCommand(commands.NewKubectlCommand()) r.cmd.AddCommand(commands.NewLoginCommand()) r.cmd.AddCommand(commands.NewStrongholdCommand()) + r.cmd.AddCommand(commands.NewCniSwitchCommand()) r.cmd.AddCommand(commands.NewHelpJSONCommand(r.cmd)) r.cmd.AddCommand(plugins.NewPluginsCommand(r.logger.Named("plugins-command"))) diff --git a/internal/cni/api/v1alpha1/cni_migration_types.go b/internal/cni/api/v1alpha1/cni_migration_types.go new file mode 100644 index 00000000..5cf9a76d --- /dev/null +++ b/internal/cni/api/v1alpha1/cni_migration_types.go @@ -0,0 +1,75 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +genclient +// +genclient:nonNamespaced +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +k8s:openapi-gen=true + +// CNIMigration is the schema for the CNIMigration API. +// It is a cluster-level resource that serves as the "single source of truth" +// for the entire migration process. It defines the goal (targetCNI) +// and tracks the overall progress across all nodes. +type CNIMigration struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Spec defines the desired state of CNIMigration. + Spec CNIMigrationSpec `json:"spec,omitempty"` + // Status defines the observed state of CNIMigration. + Status CNIMigrationStatus `json:"status,omitempty"` +} + +// CNIMigrationSpec defines the desired state of CNIMigration. +// +k8s:deepcopy-gen=true +type CNIMigrationSpec struct { + // TargetCNI is the CNI to switch to. + // Set by the d8 cli utility when starting Phase 1. + TargetCNI string `json:"targetCNI"` + // Phase is the phase controlled by the d8 cli to command the agents. + // Possible values: Prepare, Migrate, Cleanup, Abort. + Phase string `json:"phase"` +} + +// CNIMigrationStatus defines the observed state of CNIMigration. +// +k8s:deepcopy-gen=true +type CNIMigrationStatus struct { + // CurrentCNI is the detected CNI from which the switch is being made. + CurrentCNI string `json:"currentCNI,omitempty"` + // NodesTotal is the total number of nodes involved in the migration. + NodesTotal int `json:"nodesTotal,omitempty"` + // NodesSucceeded is the number of nodes that have successfully completed the current phase. + NodesSucceeded int `json:"nodesSucceeded,omitempty"` + // NodesFailed is the number of nodes where an error occurred. + NodesFailed int `json:"nodesFailed,omitempty"` + // Conditions reflect the state of the migration as a whole. + // The d8 cli aggregates statuses from all CNINodeMigrations here. + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// CNIMigrationList contains a list of CNIMigration. +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type CNIMigrationList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CNIMigration `json:"items"` +} diff --git a/internal/cni/api/v1alpha1/cni_node_migration_types.go b/internal/cni/api/v1alpha1/cni_node_migration_types.go new file mode 100644 index 00000000..b8536266 --- /dev/null +++ b/internal/cni/api/v1alpha1/cni_node_migration_types.go @@ -0,0 +1,63 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +genclient +// +genclient:nonNamespaced +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +k8s:openapi-gen=true + +// CNINodeMigration is the schema for the CNINodeMigration API. +// This resource is created for each node in the cluster. The Helper +// agent running on the node updates this resource to report its local progress. +// The d8 cli reads these resources to display detailed status. +type CNINodeMigration struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Spec can be empty, as all configuration is taken from the parent CNIMigration resource. + Spec CNINodeMigrationSpec `json:"spec,omitempty"` + // Status defines the observed state of CNINodeMigration. + Status CNINodeMigrationStatus `json:"status,omitempty"` +} + +// CNINodeMigrationSpec defines the desired state of CNINodeMigration. +// +k8s:deepcopy-gen=true +type CNINodeMigrationSpec struct { + // The spec can be empty, as all configuration is taken from the parent CNIMigration resource. +} + +// CNINodeMigrationStatus defines the observed state of CNINodeMigration. +// +k8s:deepcopy-gen=true +type CNINodeMigrationStatus struct { + // Phase is the phase of this particular node. + Phase string `json:"phase,omitempty"` + // Conditions are the detailed conditions reflecting the steps performed on the node. + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// CNINodeMigrationList contains a list of CNINodeMigration. +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type CNINodeMigrationList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CNINodeMigration `json:"items"` +} diff --git a/internal/cni/api/v1alpha1/register.go b/internal/cni/api/v1alpha1/register.go new file mode 100644 index 00000000..958d272f --- /dev/null +++ b/internal/cni/api/v1alpha1/register.go @@ -0,0 +1,50 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +const ( + APIGroup = "network.deckhouse.io" + APIVersion = "v1alpha1" +) + +// SchemeGroupVersion is group version used to register these objects +var ( + SchemeGroupVersion = schema.GroupVersion{ + Group: APIGroup, + Version: APIVersion, + } + SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) + AddToScheme = SchemeBuilder.AddToScheme +) + +// Adds the list of known types to Scheme. +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &CNIMigration{}, + &CNIMigrationList{}, + &CNINodeMigration{}, + &CNINodeMigrationList{}, + ) + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/internal/cni/api/v1alpha1/zz_generated.deepcopy.go b/internal/cni/api/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 00000000..495df5c2 --- /dev/null +++ b/internal/cni/api/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,218 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNIMigration) DeepCopyInto(out *CNIMigration) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigration. +func (in *CNIMigration) DeepCopy() *CNIMigration { + if in == nil { + return nil + } + out := new(CNIMigration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CNIMigration) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNIMigrationList) DeepCopyInto(out *CNIMigrationList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CNIMigration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigrationList. +func (in *CNIMigrationList) DeepCopy() *CNIMigrationList { + if in == nil { + return nil + } + out := new(CNIMigrationList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CNIMigrationList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNIMigrationSpec) DeepCopyInto(out *CNIMigrationSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigrationSpec. +func (in *CNIMigrationSpec) DeepCopy() *CNIMigrationSpec { + if in == nil { + return nil + } + out := new(CNIMigrationSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNIMigrationStatus) DeepCopyInto(out *CNIMigrationStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigrationStatus. +func (in *CNIMigrationStatus) DeepCopy() *CNIMigrationStatus { + if in == nil { + return nil + } + out := new(CNIMigrationStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigration) DeepCopyInto(out *CNINodeMigration) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigration. +func (in *CNINodeMigration) DeepCopy() *CNINodeMigration { + if in == nil { + return nil + } + out := new(CNINodeMigration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CNINodeMigration) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigrationList) DeepCopyInto(out *CNINodeMigrationList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CNINodeMigration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigrationList. +func (in *CNINodeMigrationList) DeepCopy() *CNINodeMigrationList { + if in == nil { + return nil + } + out := new(CNINodeMigrationList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CNINodeMigrationList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigrationSpec) DeepCopyInto(out *CNINodeMigrationSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigrationSpec. +func (in *CNINodeMigrationSpec) DeepCopy() *CNINodeMigrationSpec { + if in == nil { + return nil + } + out := new(CNINodeMigrationSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigrationStatus) DeepCopyInto(out *CNINodeMigrationStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigrationStatus. +func (in *CNINodeMigrationStatus) DeepCopy() *CNINodeMigrationStatus { + if in == nil { + return nil + } + out := new(CNINodeMigrationStatus) + in.DeepCopyInto(out) + return out +} diff --git a/internal/cni/cleanup.go b/internal/cni/cleanup.go new file mode 100644 index 00000000..4ec771b5 --- /dev/null +++ b/internal/cni/cleanup.go @@ -0,0 +1,337 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "context" + "fmt" + "strings" + "time" + + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" +) + +// RunCleanup executes the logic for the 'cni-switch cleanup' command. +// It performs a robust cleanup by deleting a known list of resources +// and waiting for them to be fully terminated. +func RunCleanup(timeout time.Duration) error { + startTime := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("šŸš€ Starting CNI switch cleanup (global timeout: %s)\n", timeout) + + // 1. Create Kubernetes client + safeClient, err := saferequest.NewSafeClient() + if err != nil { + return fmt.Errorf("creating safe client: %w", err) + } + + rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) + if err != nil { + return fmt.Errorf("creating runtime client: %w", err) + } + fmt.Printf("āœ… Kubernetes client created\n") + + // 2. Delete cluster-scoped resources first, with waiting + fmt.Println("\nDeleting cluster-scoped resources...") + webhookConfig := &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: webhookConfigName}, + } + if err = deleteAndWait(ctx, rtClient, webhookConfig); err != nil { + return err + } + + // 3. Stop active controllers first to prevent reconciliation loops + fmt.Printf(" Stopping active controllers in '%s'...\n", cniSwitchNamespace) + helperDs := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperDaemonSetName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, helperDs); err != nil { + return err + } + webhookDep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: webhookDeploymentName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookDep); err != nil { + return err + } + fmt.Println("āœ… Active controllers stopped") + + // 4. Delete all CNIMigration resources + fmt.Println("\nDeleting all CNIMigration resources...") + migrations := &v1alpha1.CNIMigrationList{} + if err = rtClient.List(ctx, migrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { + return fmt.Errorf("listing CNIMigrations: %w", err) + } + for _, m := range migrations.Items { + // Remove finalizers to ensure deletion even if controller is down + if err = removeFinalizers(ctx, rtClient, &m); err != nil { + fmt.Printf("āš ļø Warning: failed to remove finalizers from %s: %v\n", m.Name, err) + } + if err = deleteAndWait(ctx, rtClient, &m); err != nil { + return err + } + } + fmt.Println("āœ… All CNIMigration resources deleted") + + // 5. Delete all CNINodeMigration resources + fmt.Println("\nDeleting all CNINodeMigration resources...") + nodeMigrations := &v1alpha1.CNINodeMigrationList{} + if err = rtClient.List(ctx, nodeMigrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { + return fmt.Errorf("listing CNINodeMigrations: %w", err) + } + for _, nm := range nodeMigrations.Items { + // Remove finalizers to ensure deletion even if controller is down + if err = removeFinalizers(ctx, rtClient, &nm); err != nil { + fmt.Printf("āš ļø Warning: failed to remove finalizers from %s: %v\n", nm.Name, err) + } + if err = deleteAndWait(ctx, rtClient, &nm); err != nil { + return err + } + } + fmt.Println("āœ… All CNINodeMigration resources deleted") + + // 6. Remove annotations from all pods + if err = removePodAnnotations(ctx, rtClient); err != nil { + // Non-fatal, print a warning + fmt.Printf("āš ļø Warning: failed to remove all pod annotations: %v\n", err) + } + + // 7. Delete RBAC resources + fmt.Println("\nDeleting RBAC resources...") + clusterRoleHelper := &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperClusterRoleName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleHelper); err != nil { + return err + } + clusterRoleWebhook := &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{Name: webhookClusterRoleName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleWebhook); err != nil { + return err + } + clusterRoleBindingHelper := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperClusterRoleBindingName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleBindingHelper); err != nil { + return err + } + clusterRoleBindingWebhook := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: webhookClusterRoleBindingName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleBindingWebhook); err != nil { + return err + } + fmt.Println("āœ… Cluster-scoped RBAC resources deleted") + + // 8. Delete remaining namespaced resources + fmt.Printf("\nDeleting remaining namespaced resources in '%s'...\n", cniSwitchNamespace) + webhookService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: webhookServiceName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookService); err != nil { + return err + } + webhookSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: webhookSecretName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookSecret); err != nil { + return err + } + helperSA := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperServiceAccountName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, helperSA); err != nil { + return err + } + webhookSA := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: webhookServiceAccountName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookSA); err != nil { + return err + } + fmt.Println("āœ… Remaining namespaced resources deleted") + + // 9. Delete the namespace + fmt.Printf("\nDeleting namespace '%s'...\n", cniSwitchNamespace) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: cniSwitchNamespace}} + if err = deleteAndWait(ctx, rtClient, ns); err != nil { + return err + } + fmt.Println("āœ… Namespace successfully deleted") + + fmt.Printf("\nšŸŽ‰ CNI switch cleanup successfully completed (total time: %s)\n", + time.Since(startTime).Round(time.Second), + ) + return nil +} + +// deleteAndWait deletes a resource and waits for it to be fully terminated. +func deleteAndWait(ctx context.Context, rtClient client.Client, obj client.Object) error { + if err := deleteResource(ctx, rtClient, obj); err != nil { + // This handles the "already deleted" case, so we don't need to check again. + return err + } + return waitForResourceDeletion(ctx, rtClient, obj) +} + +// deleteResource just initiates deletion for a Kubernetes resource. +func deleteResource(ctx context.Context, rtClient client.Client, obj client.Object) error { + kind := getKind(obj) + name := obj.GetName() + ns := obj.GetNamespace() + + fmt.Printf("- Deleting %s '%s%s'... ", kind, name, func() string { + if ns != "" { + return fmt.Sprintf("' in namespace '%s", ns) + } + return "" + }()) + + err := rtClient.Delete(ctx, obj, client.PropagationPolicy(metav1.DeletePropagationBackground)) + if err != nil { + if errors.IsNotFound(err) { + fmt.Println("already deleted.") + return nil + } + return fmt.Errorf("failed to delete %s '%s': %w", kind, name, err) + } + + fmt.Println("deleted.") + return nil +} + +// waitForResourceDeletion polls until a resource is confirmed to be gone. +func waitForResourceDeletion(ctx context.Context, rtClient client.Client, obj client.Object) error { + key := client.ObjectKey{Name: obj.GetName(), Namespace: obj.GetNamespace()} + kind := getKind(obj) + + // Check if the resource is already gone before starting to wait. + if err := rtClient.Get(ctx, key, obj); err != nil { + if errors.IsNotFound(err) { + // It was already gone, no need to wait. + return nil + } + return fmt.Errorf("getting %s '%s': %w", kind, key.Name, err) + } + + fmt.Printf(" Waiting for %s '%s' to terminate... ", kind, key.Name) + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + fmt.Println("timeout.") + return fmt.Errorf("timed out waiting for %s '%s' to be deleted", kind, key.Name) + case <-ticker.C: + err := rtClient.Get(ctx, key, obj) + if err != nil { + if errors.IsNotFound(err) { + fmt.Println("terminated.") + return nil // Success! + } + if strings.Contains(err.Error(), "no matches for kind") { + fmt.Println("Kind not known, assuming terminated.") + return nil + } + fmt.Printf("error: %v\n", err) + return fmt.Errorf("getting %s '%s': %w", kind, key.Name, err) + } + + // If the resource is still there, check for finalizers and remove them + // to ensure it doesn't get stuck in Terminating state. + if len(obj.GetFinalizers()) > 0 { + _ = removeFinalizers(ctx, rtClient, obj) + } + } + } +} + +func removePodAnnotations(ctx context.Context, rtClient client.Client) error { + fmt.Println("\nRemoving CNI switch annotations from all pods...") + + podList := &corev1.PodList{} + if err := rtClient.List(ctx, podList); err != nil { + return fmt.Errorf("listing all pods: %w", err) + } + + podsPatched := 0 + for _, pod := range podList.Items { + if pod.Spec.HostNetwork { + continue + } + + if _, ok := pod.Annotations[EffectiveCNIAnnotation]; ok { + // Use a merge patch to remove just the one annotation + patch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{"%s":null}}}`, EffectiveCNIAnnotation)) + err := rtClient.Patch(ctx, &pod, client.RawPatch(client.Merge.Type(), patch)) + if err != nil { + if errors.IsNotFound(err) { + // No need to print warning, just continue + continue + } + fmt.Printf("\nāš ļø Warning: failed to patch pod %s/%s: %v", pod.Namespace, pod.Name, err) + continue + } + podsPatched++ + fmt.Printf("\r\033[K Patched %d pods...", podsPatched) + } + } + + if podsPatched > 0 { + fmt.Printf("\nāœ… Removed annotations from %d pods.\n", podsPatched) + } else { + fmt.Print("āœ… No pods with CNI switch annotations were found.\n") + } + + return nil +} + +// getKind extracts a user-friendly kind from a runtime object. +func getKind(obj client.Object) string { + kind := obj.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + t := fmt.Sprintf("%T", obj) + parts := strings.Split(t, ".") + if len(parts) > 0 { + return parts[len(parts)-1] + } + } + return kind +} + +// removeFinalizers patches the object to remove all finalizers. +func removeFinalizers(ctx context.Context, rtClient client.Client, obj client.Object) error { + if len(obj.GetFinalizers()) == 0 { + return nil + } + + patch := []byte(`{"metadata":{"finalizers":null}}`) + return rtClient.Patch(ctx, obj, client.RawPatch(client.Merge.Type(), patch)) +} diff --git a/internal/cni/common.go b/internal/cni/common.go new file mode 100644 index 00000000..a018f8d8 --- /dev/null +++ b/internal/cni/common.go @@ -0,0 +1,142 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "bufio" + "context" + "fmt" + "os" + "strings" + + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" +) + +const ( + // Image ConfigMap from Deckhouse + CMDataName = "d8-cli-data" + CMDataNameSpace = "d8-system" + cmDataHelperImage = "cni-switch-helper-image" + + // Namespace where the helper and webhook are deployed + cniSwitchNamespace = "cni-switch" + + // Service Account Names + switchHelperServiceAccountName = "cni-switch-helper-sa" + webhookServiceAccountName = "cni-switch-webhook-sa" + + // DaemonSet Names + switchHelperDaemonSetName = "cni-switch-helper" + + // Cluster Role and Binding Names + switchHelperClusterRoleName = "d8:cni-switch-helper" + webhookClusterRoleName = "d8:cni-switch-webhook" + switchHelperClusterRoleBindingName = "d8:cni-switch-helper" + webhookClusterRoleBindingName = "d8:cni-switch-webhook" + + // Webhook Resources + webhookDeploymentName = "cni-switch-webhook" + webhookServiceName = "cni-switch-webhook-service" + webhookSecretName = "cni-switch-webhook-tls" + webhookConfigName = "cni-switch-pod-annotator" + webhookConfigurationName = "annotator.cni-switch.deckhouse.io" + webhookPort = 42443 + + // Annotations + EffectiveCNIAnnotation = "effective-cni.network.deckhouse.io" +) + +var ( + CNIModuleConfigs = []string{"cni-cilium", "cni-flannel", "cni-simple-bridge"} + + moduleConfigGVK = schema.GroupVersionKind{ + Group: "deckhouse.io", + Version: "v1alpha1", + Kind: "ModuleConfig", + } +) + +// AskForConfirmation displays a warning and prompts the user for confirmation. +func AskForConfirmation(commandName string) (bool, error) { + reader := bufio.NewReader(os.Stdin) + + fmt.Println("--------------------------------------------------------------------------------") + fmt.Println("āš ļø IMPORTANT: PLEASE READ CAREFULLY") + fmt.Println("--------------------------------------------------------------------------------") + fmt.Println() + fmt.Printf("You are about to run the '%s' step of the CNI switch process. Please ensure that:\n\n", commandName) + fmt.Println("1. External cluster management systems (CI/CD, GitOps like ArgoCD, Flux)") + fmt.Println(" are temporarily disabled. They might interfere with the CNI switch process") + fmt.Println(" by reverting changes made by this tool.") + fmt.Println() + fmt.Println("2. You have sufficient administrative privileges for this cluster to perform") + fmt.Println(" the required actions (modifying ModuleConfigs, deleting pods, etc.).") + fmt.Println() + fmt.Println("3. The utility does not configure CNI modules in the cluster; it only enables/disables") + fmt.Println(" them via ModuleConfig during operation. The user must independently prepare the") + fmt.Println(" ModuleConfig configuration for the target CNI.") + fmt.Println() + fmt.Println("Once the process starts, no active intervention is required from you.") + fmt.Println() + fmt.Print("Do you want to continue? (y/n): ") + + for { + response, err := reader.ReadString('\n') + if err != nil { + return false, err + } + + response = strings.ToLower(strings.TrimSpace(response)) + + switch response { + case "y", "yes": + fmt.Println() + return true, nil + case "n", "no": + fmt.Println() + return false, nil + default: + fmt.Print("Invalid input. Please enter 'y/yes' or 'n/no'): ") + } + } +} + +// FindActiveMigration searches for an existing CNIMigration resource. +// It returns an error if more than one migration is found. +func FindActiveMigration(ctx context.Context, rtClient client.Client) (*v1alpha1.CNIMigration, error) { + migrationList := &v1alpha1.CNIMigrationList{} + if err := rtClient.List(ctx, migrationList); err != nil { + return nil, fmt.Errorf("listing CNIMigration objects: %w", err) + } + + if len(migrationList.Items) == 0 { + return nil, nil // No migration found + } + + if len(migrationList.Items) > 1 { + return nil, fmt.Errorf( + "found %d CNI migration objects, which is an inconsistent state. "+ + "Please run 'd8 cni-switch cleanup' to resolve this", + len(migrationList.Items), + ) + } + + return &migrationList.Items[0], nil +} diff --git a/internal/cni/prepare.go b/internal/cni/prepare.go new file mode 100644 index 00000000..cb72efe8 --- /dev/null +++ b/internal/cni/prepare.go @@ -0,0 +1,587 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "bytes" + "context" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "math/big" + "strings" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" +) + +// RunPrepare executes the logic for the 'cni-switch prepare' command. +func RunPrepare(targetCNI string, timeout time.Duration) error { + // 0. Ask for user confirmation + confirmed, err := AskForConfirmation("prepare") + if err != nil { + return fmt.Errorf("asking for confirmation: %w", err) + } + if !confirmed { + fmt.Println("Operation cancelled by user.") + return nil + } + + startTime := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("šŸš€ Starting CNI switch preparation for target '%s' (global timeout: %s)\n", + targetCNI, timeout) + + // 1. Create a Kubernetes client + safeClient, err := saferequest.NewSafeClient() + if err != nil { + return fmt.Errorf("creating safe client: %w", err) + } + + rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) + if err != nil { + return fmt.Errorf("creating runtime client: %w", err) + } + fmt.Printf("āœ… Kubernetes client created (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 2. Find an existing migration or create a new one + activeMigration, err := getOrCreateMigrationForPrepare(ctx, rtClient, targetCNI) + if err != nil { + return err + } + if activeMigration == nil { + // This means preparation is already complete, and the user has been notified. + return nil + } + fmt.Printf( + "āœ… Working with CNIMigration '%s' (total elapsed: %s)\n\n", + activeMigration.Name, + time.Since(startTime).Round(time.Millisecond), + ) + + // 3. Detect current CNI and update migration status + if activeMigration.Status.CurrentCNI == "" { + var currentCNI string + currentCNI, err = detectCurrentCNI(rtClient) + if err != nil { + return fmt.Errorf("detecting current CNI: %w", err) + } + fmt.Printf("Detected current CNI: '%s'\n", currentCNI) + + if currentCNI == targetCNI { + return fmt.Errorf("target CNI '%s' is the same as the current CNI. Nothing to do", targetCNI) + } + + activeMigration.Status.CurrentCNI = currentCNI + err = rtClient.Status().Update(ctx, activeMigration) + if err != nil { + return fmt.Errorf("updating migration status with current CNI: %w", err) + } + fmt.Printf( + "āœ… Added current CNI to migration status (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond), + ) + } + + // 4. Create the dedicated namespace + fmt.Printf("Creating dedicated namespace '%s'...\n", cniSwitchNamespace) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: cniSwitchNamespace}} + if err = rtClient.Create(ctx, ns); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating namespace %s: %w", cniSwitchNamespace, err) + } + fmt.Printf("āœ… Namespace created (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 5. Get the helper image name from the configmap + cm := &corev1.ConfigMap{} + if err = rtClient.Get(ctx, client.ObjectKey{Name: CMDataName, Namespace: CMDataNameSpace}, cm); err != nil { + return fmt.Errorf("getting %s configmap: %w", CMDataName, err) + } + imageName, ok := cm.Data[cmDataHelperImage] + if !ok || imageName == "" { + return fmt.Errorf("%s not found or empty in %s configmap", cmDataHelperImage, CMDataName) + } + + // 6. Apply RBAC + fmt.Println("Applying RBAC...") + // Helper RBAC + helperSA := getSwitchHelperServiceAccount(cniSwitchNamespace) + if err = rtClient.Create(ctx, helperSA); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating helper service account: %w", err) + } + fmt.Printf("- Helper's ServiceAccount '%s' created\n", helperSA.Name) + + helperRole := getSwitchHelperClusterRole() + if err = rtClient.Create(ctx, helperRole); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating cluster role: %w", err) + } + fmt.Printf("- Helper's ClusterRole '%s' created\n", helperRole.Name) + + helperBinding := getSwitchHelperClusterRoleBinding(cniSwitchNamespace) + if err = rtClient.Create(ctx, helperBinding); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating cluster role binding: %w", err) + } + fmt.Printf("- Helper's ClusterRoleBinding '%s' created\n", helperBinding.Name) + + // Webhook RBAC + webhookSA := getWebhookServiceAccount(cniSwitchNamespace) + if err = rtClient.Create(ctx, webhookSA); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook service account: %w", err) + } + fmt.Printf("- Webhook's ServiceAccount '%s' created\n", webhookSA.Name) + + webhookRole := getWebhookClusterRole() + if err = rtClient.Create(ctx, webhookRole); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook cluster role: %w", err) + } + fmt.Printf("- Webhook's ClusterRole '%s' created\n", webhookRole.Name) + + webhookBinding := getWebhookClusterRoleBinding(cniSwitchNamespace) + if err = rtClient.Create(ctx, webhookBinding); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook cluster role binding: %w", err) + } + fmt.Printf("- Webhook's ClusterRoleBinding '%s' created\n", webhookBinding.Name) + fmt.Printf("āœ… RBAC applied (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 7. Create and wait for the mutating webhook + fmt.Println("Deploying Mutating Webhook for annotating new pods...") + // Generate certificates + caCert, serverCert, serverKey, err := generateWebhookCertificates(cniSwitchNamespace) + if err != nil { + return fmt.Errorf("generating webhook certificates: %w", err) + } + fmt.Printf("- TLS certificate generated\n") + + // Create TLS secret + tlsSecret := getWebhookTLSSecret(cniSwitchNamespace, serverCert, serverKey) + if err = rtClient.Create(ctx, tlsSecret); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook tls secret: %w", err) + } + fmt.Printf("- Secret with TLS certificate '%s' created\n", tlsSecret.Name) + + // Create Deployment + webhookDeployment := getWebhookDeployment(cniSwitchNamespace, imageName, webhookServiceAccountName) + if err = rtClient.Create(ctx, webhookDeployment); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook deployment: %w", err) + } + + // Wait for Deployment to be ready + if err = waitForDeploymentReady(ctx, rtClient, webhookDeployment); err != nil { + return fmt.Errorf("waiting for webhook deployment ready: %w", err) + } + fmt.Printf("- Webhook Deployment '%s' created\n", webhookDeployment.Name) + + // Create Service + webhookService := getWebhookService(cniSwitchNamespace) + if err = rtClient.Create(ctx, webhookService); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook service: %w", err) + } + fmt.Printf("- Webhook Service '%s' created\n", webhookService.Name) + + // Create MutatingWebhookConfiguration + webhookConfig := getMutatingWebhookConfiguration(cniSwitchNamespace, caCert) + if err = rtClient.Create(ctx, webhookConfig); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating mutating webhook configuration: %w", err) + } + fmt.Printf("āœ… Mutating Webhook '%s' is active (total elapsed: %s)\n\n", + webhookConfig.Name, time.Since(startTime).Round(time.Millisecond)) + + // 8. Create and wait for the Helper daemonset + dsKey := client.ObjectKey{Name: switchHelperDaemonSetName, Namespace: cniSwitchNamespace} + ds := &appsv1.DaemonSet{} + if err = rtClient.Get(ctx, dsKey, ds); err != nil { + if !errors.IsNotFound(err) { + return fmt.Errorf("getting helper daemonset: %w", err) + } + fmt.Printf("Creating helper DaemonSet '%s'...\n", switchHelperDaemonSetName) + dsToCreate := getSwitchHelperDaemonSet(cniSwitchNamespace, imageName) + if err = rtClient.Create(ctx, dsToCreate); err != nil { + return fmt.Errorf("creating helper daemonset: %w", err) + } + ds = dsToCreate + } else { + fmt.Printf("Helper DaemonSet '%s' already exists.\n", switchHelperDaemonSetName) + } + + if err = waitForDaemonSetReady(ctx, rtClient, ds); err != nil { + return fmt.Errorf("waiting for daemonset ready: %w", err) + } + fmt.Printf("āœ… Helper DaemonSet is ready (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 9. Wait for all nodes to be prepared + fmt.Println("Waiting for all nodes to complete the preparation step...") + err = waitForNodesPrepared(ctx, rtClient) + if err != nil { + return fmt.Errorf("waiting for nodes to be prepared: %w", err) + } + fmt.Printf("āœ… All CNINodeMigrations are created and all nodes are prepared (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 10. Update overall status + activeMigration.Status.Conditions = append(activeMigration.Status.Conditions, metav1.Condition{ + Type: "PreparationSucceeded", + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: "AllNodesPrepared", + Message: "All nodes successfully completed the preparation step.", + }) + + err = rtClient.Status().Update(ctx, activeMigration) + if err != nil { + return fmt.Errorf("updating CNIMigration status to prepared: %w", err) + } + + fmt.Printf( + "šŸŽ‰ Cluster successfully prepared for CNI switch (total time: %s)\n", + time.Since(startTime).Round(time.Second), + ) + fmt.Println("\nYou can now run 'd8 cni-switch switch' to proceed") + + return nil +} + +// generateWebhookCertificates creates a self-signed CA and a server certificate for the webhook. +func generateWebhookCertificates(namespace string) ([]byte, []byte, []byte, error) { + caSerialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + caSerialNumber, err := rand.Int(rand.Reader, caSerialNumberLimit) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to generate CA serial number: %w", err) + } + + // CA configuration + ca := &x509.Certificate{ + SerialNumber: caSerialNumber, + Subject: pkix.Name{ + Organization: []string{"deckhouse.io"}, + }, + NotBefore: time.Now(), + NotAfter: time.Now().AddDate(1, 0, 0), + IsCA: true, + ExtKeyUsage: []x509.ExtKeyUsage{ + x509.ExtKeyUsageClientAuth, + x509.ExtKeyUsageServerAuth, + }, + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, + BasicConstraintsValid: true, + } + + caPrivKey, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + return nil, nil, nil, fmt.Errorf("generating CA private key: %w", err) + } + + caBytes, err := x509.CreateCertificate(rand.Reader, ca, ca, &caPrivKey.PublicKey, caPrivKey) + if err != nil { + return nil, nil, nil, fmt.Errorf("creating CA certificate: %w", err) + } + + caPEM := new(bytes.Buffer) + _ = pem.Encode(caPEM, &pem.Block{ + Type: "CERTIFICATE", + Bytes: caBytes, + }) + + serverSerialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serverSerialNumber, err := rand.Int(rand.Reader, serverSerialNumberLimit) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to generate server serial number: %w", err) + } + + // Server certificate configuration + commonName := fmt.Sprintf("%s.%s.svc", webhookServiceName, namespace) + cert := &x509.Certificate{ + SerialNumber: serverSerialNumber, + Subject: pkix.Name{ + CommonName: commonName, + Organization: []string{"deckhouse.io"}, + }, + DNSNames: []string{commonName}, + NotBefore: time.Now(), + NotAfter: time.Now().AddDate(1, 0, 0), + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + KeyUsage: x509.KeyUsageDigitalSignature, + } + + serverPrivKey, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + return nil, nil, nil, fmt.Errorf("generating server private key: %w", err) + } + + serverCertBytes, err := x509.CreateCertificate(rand.Reader, cert, ca, &serverPrivKey.PublicKey, caPrivKey) + if err != nil { + return nil, nil, nil, fmt.Errorf("creating server certificate: %w", err) + } + + serverCertPEM := new(bytes.Buffer) + _ = pem.Encode(serverCertPEM, &pem.Block{ + Type: "CERTIFICATE", + Bytes: serverCertBytes, + }) + + serverPrivKeyPEM := new(bytes.Buffer) + _ = pem.Encode(serverPrivKeyPEM, &pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(serverPrivKey), + }) + + return caPEM.Bytes(), serverCertPEM.Bytes(), serverPrivKeyPEM.Bytes(), nil +} + +func getOrCreateMigrationForPrepare( + ctx context.Context, rtClient client.Client, targetCNI string, +) (*v1alpha1.CNIMigration, error) { + activeMigration, err := FindActiveMigration(ctx, rtClient) + if err != nil { + return nil, fmt.Errorf("failed to find active migration: %w", err) + } + + if activeMigration != nil { + fmt.Printf("Found active CNIMigration '%s'\n", activeMigration.Name) + + // If an active migration is found, ensure its target CNI matches the requested target CNI. + if activeMigration.Spec.TargetCNI != targetCNI { + return nil, fmt.Errorf( + "an active CNI migration to '%s' is already in progress. "+ + "Cannot prepare for '%s'. To change the target CNI, "+ + "please run 'd8 cni-switch cleanup' first to reset the state", + activeMigration.Spec.TargetCNI, + targetCNI, + ) + } + + // Check if preparation is already done + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { + fmt.Println("šŸŽ‰ Cluster has already been prepared for CNI switch.") + fmt.Println("\nYou can now run 'd8 cni-switch switch' to proceed.") + return nil, nil // Signal to the caller that we can exit gracefully + } + } + + // Ensure the migration is in the 'Prepare' phase + if activeMigration.Spec.Phase != "Prepare" { + return nil, fmt.Errorf( + "an active migration is already in the '%s' phase. "+ + "Cannot run 'prepare' again. To proceed, run 'd8 cni-switch switch'. "+ + "To start over, run 'd8 cni-switch cleanup'", + activeMigration.Spec.Phase, + ) + } + + return activeMigration, nil + } + migrationName := fmt.Sprintf("cni-migration-%s", time.Now().Format("20060102-150405")) + fmt.Printf("No active migration found. Creating a new one...\n") + + newMigration := &v1alpha1.CNIMigration{ + ObjectMeta: metav1.ObjectMeta{ + Name: migrationName, + }, + Spec: v1alpha1.CNIMigrationSpec{ + TargetCNI: targetCNI, + Phase: "Prepare", + }, + } + + err = rtClient.Create(ctx, newMigration) + if err != nil { + if errors.IsAlreadyExists(err) { + fmt.Println("Migration object was created by another process. Getting it.") + err = rtClient.Get(ctx, client.ObjectKey{Name: migrationName}, newMigration) + if err != nil { + return nil, fmt.Errorf("getting existing CNIMigration object: %w", err) + } + return newMigration, nil + } + return nil, fmt.Errorf("creating new CNIMigration object: %w", err) + } + + fmt.Printf("Successfully created CNIMigration object '%s'\n", newMigration.Name) + return newMigration, nil +} + +func waitForDaemonSetReady(ctx context.Context, rtClient client.Client, ds *appsv1.DaemonSet) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + key := client.ObjectKey{Name: ds.Name, Namespace: ds.Namespace} + err := rtClient.Get(ctx, key, ds) + if err != nil { + fmt.Printf("\nāš ļø Warning: could not get DaemonSet status: %v\n", err) + continue + } + + // This is the exit condition for the loop. + if ds.Status.DesiredNumberScheduled == ds.Status.NumberReady && ds.Status.NumberUnavailable == 0 { + fmt.Printf( + "\r\033[K Waiting for DaemonSet... %d/%d pods ready\n", + ds.Status.NumberReady, + ds.Status.DesiredNumberScheduled, + ) + return nil + } + + // This is the progress update. + fmt.Printf( + "\r\033[K Waiting for DaemonSet... %d/%d pods ready", + ds.Status.NumberReady, + ds.Status.DesiredNumberScheduled, + ) + } + } +} + +func waitForDeploymentReady(ctx context.Context, rtClient client.Client, dep *appsv1.Deployment) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + key := client.ObjectKey{Name: dep.Name, Namespace: dep.Namespace} + err := rtClient.Get(ctx, key, dep) + if err != nil { + fmt.Printf("\nāš ļø Warning: could not get Deployment status: %v\n", err) + continue + } + + // This is the exit condition for the loop. + if dep.Spec.Replicas != nil && dep.Status.ReadyReplicas >= + *dep.Spec.Replicas && dep.Status.UnavailableReplicas == 0 { + fmt.Printf( + "\r\033[K Waiting for Deployment... %d/%d replicas ready\n", + dep.Status.ReadyReplicas, + *dep.Spec.Replicas, + ) + return nil + } + + // This is the progress update. + if dep.Spec.Replicas != nil { + fmt.Printf( + "\r\033[K Waiting for Deployment... %d/%d replicas ready", + dep.Status.ReadyReplicas, + *dep.Spec.Replicas, + ) + } + } + } +} + +func waitForNodesPrepared(ctx context.Context, rtClient client.Client) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + nodes := &corev1.NodeList{} + if err := rtClient.List(ctx, nodes); err != nil { + fmt.Printf("āš ļø Warning: could not list nodes: %v\n", err) + continue + } + totalNodes := len(nodes.Items) + + migrations := &v1alpha1.CNINodeMigrationList{} + if err := rtClient.List(ctx, migrations); err != nil { + fmt.Printf("āš ļø Warning: could not list node migrations: %v\n", err) + continue + } + + readyNodes := 0 + for _, migration := range migrations.Items { + for _, cond := range migration.Status.Conditions { + if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { + readyNodes++ + break + } + } + } + + fmt.Printf("\r\033[K Progress: %d/%d nodes prepared...", readyNodes, totalNodes) + + if readyNodes >= totalNodes && totalNodes > 0 { + fmt.Printf("\r\033[K Progress: %d/%d nodes prepared...\n", readyNodes, totalNodes) + return nil + } + } + } +} + +func detectCurrentCNI(rtClient client.Client) (string, error) { + var enabledCNIs []string + for _, cniModule := range CNIModuleConfigs { + mc := &unstructured.Unstructured{} + mc.SetGroupVersionKind(moduleConfigGVK) + + err := rtClient.Get(context.Background(), client.ObjectKey{Name: cniModule}, mc) + if err != nil { + if errors.IsNotFound(err) { + continue + } + return "", fmt.Errorf("getting module config %s: %w", cniModule, err) + } + + enabled, found, err := unstructured.NestedBool(mc.Object, "spec", "enabled") + if err != nil { + return "", fmt.Errorf("parsing 'spec.enabled' for module config %s: %w", cniModule, err) + } + + if found && enabled { + cniName := strings.TrimPrefix(cniModule, "cni-") + enabledCNIs = append(enabledCNIs, cniName) + } + } + + if len(enabledCNIs) == 0 { + return "", fmt.Errorf("no enabled CNI module found. Looked for: %s", strings.Join(CNIModuleConfigs, ", ")) + } + + if len(enabledCNIs) > 1 { + return "", fmt.Errorf( + "found multiple enabled CNI modules: %s. Please disable all but one", + strings.Join(enabledCNIs, ", "), + ) + } + + return enabledCNIs[0], nil +} diff --git a/internal/cni/resources.go b/internal/cni/resources.go new file mode 100644 index 00000000..fe47446c --- /dev/null +++ b/internal/cni/resources.go @@ -0,0 +1,545 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +// --- Helper Resources --- + +// getSwitchHelperDaemonSet returns a DaemonSet object. +func getSwitchHelperDaemonSet(namespace, imageName string) *appsv1.DaemonSet { + rootID := int64(0) + truePtr := true + terminationGracePeriodSeconds := int64(5) + mountPropagationBidirectional := corev1.MountPropagationBidirectional + hostPathDirectoryOrCreate := corev1.HostPathDirectoryOrCreate + hostPathDirectory := corev1.HostPathDirectory + + return &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperDaemonSetName, + Namespace: namespace, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": switchHelperDaemonSetName}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": switchHelperDaemonSetName}, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: switchHelperServiceAccountName, + Containers: []corev1.Container{ + { + Name: "helper", + Image: imageName, + Args: []string{ + "--health-probe-bind-address=:42281", + }, + Ports: []corev1.ContainerPort{ + { + Name: "healthz", + ContainerPort: 42281, + Protocol: corev1.ProtocolTCP, + }, + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 15, + PeriodSeconds: 20, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 5, + PeriodSeconds: 10, + }, + Env: []corev1.EnvVar{ + { + Name: "NODE_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "spec.nodeName", + }, + }, + }, + { + Name: "KUBERNETES_SERVICE_HOST", + Value: "127.0.0.1", + }, + { + Name: "KUBERNETES_SERVICE_PORT", + Value: "6445", + }, + }, + SecurityContext: &corev1.SecurityContext{ + Privileged: &truePtr, + RunAsUser: &rootID, + RunAsGroup: &rootID, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "host-proc", + MountPath: "/host/proc", + ReadOnly: true, + }, + { + Name: "host-sys", + MountPath: "/host/sys", + ReadOnly: true, + }, + { + Name: "cni-net-d", + MountPath: "/etc/cni/net.d", + }, + { + Name: "cni-bin", + MountPath: "/opt/cni/bin", + }, + { + Name: "host-run", + MountPath: "/run", + }, + { + Name: "host-bpf", + MountPath: "/sys/fs/bpf", + MountPropagation: &mountPropagationBidirectional, + }, + { + Name: "host-lib-modules", + MountPath: "/lib/modules", + ReadOnly: true, + }, + { + Name: "host-var-lib-cni", + MountPath: "/var/lib/cni", + }, + }, + }, + }, + Tolerations: []corev1.Toleration{ + { + Operator: corev1.TolerationOpExists, + }, + }, + PriorityClassName: "system-node-critical", + HostNetwork: true, + HostPID: true, + HostIPC: true, + TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, + Volumes: []corev1.Volume{ + { + Name: "host-proc", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/proc"}, + }, + }, + { + Name: "host-sys", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/sys"}, + }, + }, + { + Name: "cni-net-d", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/etc/cni/net.d"}, + }, + }, + { + Name: "cni-bin", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/opt/cni/bin"}, + }, + }, + { + Name: "host-run", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/run"}, + }, + }, + { + Name: "host-bpf", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/sys/fs/bpf", + Type: &hostPathDirectoryOrCreate, + }, + }, + }, + { + Name: "host-lib-modules", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/lib/modules", + Type: &hostPathDirectory, + }, + }, + }, + { + Name: "host-var-lib-cni", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/cni", + Type: &hostPathDirectoryOrCreate, + }, + }, + }, + }, + }, + }, + }, + } +} + +// --- Webhook Resources --- + +// getWebhookTLSSecret returns a Secret object containing the webhook's TLS certificates. +func getWebhookTLSSecret(namespace string, cert, key []byte) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookSecretName, + Namespace: namespace, + }, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + corev1.TLSCertKey: cert, + corev1.TLSPrivateKeyKey: key, + }, + } +} + +// getWebhookDeployment returns a Deployment object for the webhook server. +func getWebhookDeployment(namespace, imageName, serviceAccountName string) *appsv1.Deployment { + replicas := int32(2) + terminationGracePeriodSeconds := int64(5) + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookDeploymentName, + Namespace: namespace, + Labels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: serviceAccountName, + HostNetwork: true, + DNSPolicy: corev1.DNSClusterFirstWithHostNet, + TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, + Tolerations: []corev1.Toleration{ + { + Operator: corev1.TolerationOpExists, + }, + }, + Affinity: &corev1.Affinity{ + PodAntiAffinity: &corev1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: "webhook", + Image: imageName, + Args: []string{ + "--mode=webhook", + "--health-probe-bind-address=:42282", + }, + Ports: []corev1.ContainerPort{ + { + ContainerPort: webhookPort, + Name: "webhook", + }, + { + Name: "healthz", + ContainerPort: 42282, + Protocol: corev1.ProtocolTCP, + }, + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 15, + PeriodSeconds: 20, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 5, + PeriodSeconds: 10, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "tls-certs", + MountPath: "/etc/tls", + ReadOnly: true, + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "tls-certs", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: webhookSecretName, + }, + }, + }, + }, + }, + }, + }, + } +} + +// getWebhookService returns a Service object for the webhook. +func getWebhookService(namespace string) *corev1.Service { + return &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookServiceName, + Namespace: namespace, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + "app": webhookDeploymentName, + }, + Ports: []corev1.ServicePort{ + { + Protocol: corev1.ProtocolTCP, + Port: 443, + TargetPort: intstr.FromInt(webhookPort), + }, + }, + }, + } +} + +// getMutatingWebhookConfiguration returns a MutatingWebhookConfiguration object for annotating pods. +func getMutatingWebhookConfiguration(namespace string, caBundle []byte) *admissionregistrationv1.MutatingWebhookConfiguration { + path := "/mutate-pod" + failurePolicy := admissionregistrationv1.Ignore + sideEffects := admissionregistrationv1.SideEffectClassNone + + return &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookConfigName, + }, + Webhooks: []admissionregistrationv1.MutatingWebhook{ + { + Name: webhookConfigurationName, + AdmissionReviewVersions: []string{"v1"}, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Name: webhookServiceName, + Namespace: namespace, + Path: &path, + Port: &[]int32{443}[0], + }, + CABundle: caBundle, + }, + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Operations: []admissionregistrationv1.OperationType{admissionregistrationv1.Create}, + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{""}, + APIVersions: []string{"v1"}, + Resources: []string{"pods"}, + }, + }, + }, + NamespaceSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "kubernetes.io/metadata.name", + Operator: metav1.LabelSelectorOpNotIn, + Values: generateExcludedNamespaces(namespace), + }, + }, + }, + FailurePolicy: &failurePolicy, + SideEffects: &sideEffects, + }, + }, + } +} + +// generateExcludedNamespaces creates a list of namespaces to be excluded from webhook processing. +// This includes the webhook's own namespace and CNI module namespaces. +func generateExcludedNamespaces(currentNamespace string) []string { + excluded := []string{currentNamespace} // Exclude the webhook's own namespace (e.g., "cni-switch") + for _, module := range CNIModuleConfigs { + excluded = append(excluded, "d8-"+module) // Exclude "d8-cni-cilium", "d8-cni-flannel", etc. + } + return excluded +} + +// --- RBAC Resources --- + +func getSwitchHelperServiceAccount(namespace string) *corev1.ServiceAccount { + return &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperServiceAccountName, + Namespace: namespace, + }, + } +} + +func getWebhookServiceAccount(namespace string) *corev1.ServiceAccount { + return &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookServiceAccountName, + Namespace: namespace, + }, + } +} + +func getSwitchHelperClusterRole() *rbacv1.ClusterRole { + return &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperClusterRoleName, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cnimigrations"}, + Verbs: []string{"get", "list", "watch"}, + }, + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cnimigrations/status"}, + Verbs: []string{"get"}, + }, + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cninodemigrations"}, + Verbs: []string{"get", "list", "watch", "create", "update", "patch", "delete"}, + }, + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cninodemigrations/status"}, + Verbs: []string{"get", "update", "patch"}, + }, + { + APIGroups: []string{""}, + Resources: []string{"pods"}, + Verbs: []string{"get", "list", "watch", "patch", "update", "delete"}, + }, + }, + } +} + +func getWebhookClusterRole() *rbacv1.ClusterRole { + return &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookClusterRoleName, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cnimigrations"}, + Verbs: []string{"get", "list", "watch"}, + }, + }, + } +} + +func getSwitchHelperClusterRoleBinding(namespace string) *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperClusterRoleBindingName, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: switchHelperClusterRoleName, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: switchHelperServiceAccountName, + Namespace: namespace, + }, + }, + } +} + +func getWebhookClusterRoleBinding(namespace string) *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookClusterRoleBindingName, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: webhookClusterRoleName, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: webhookServiceAccountName, + Namespace: namespace, + }, + }, + } +} diff --git a/internal/cni/rollback.go b/internal/cni/rollback.go new file mode 100644 index 00000000..52999599 --- /dev/null +++ b/internal/cni/rollback.go @@ -0,0 +1,29 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "fmt" + "time" +) + +// RunRollback executes the logic for the 'cni-switch rollback' command. +func RunRollback(timeout time.Duration) error { + fmt.Println("Logic for rollback is not implemented yet.") + _ = timeout + return nil +} diff --git a/internal/cni/switch.go b/internal/cni/switch.go new file mode 100644 index 00000000..20cc89e5 --- /dev/null +++ b/internal/cni/switch.go @@ -0,0 +1,841 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "context" + "fmt" + "strings" + "time" + + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" +) + +// RunSwitch executes the logic for the 'cni-switch switch' command. +func RunSwitch(timeout time.Duration) error { + // 0. Ask for user confirmation + confirmed, err := AskForConfirmation("switch") + if err != nil { + return fmt.Errorf("asking for confirmation: %w", err) + } + if !confirmed { + fmt.Println("Operation cancelled by user.") + return nil + } + + startTime := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("šŸš€ Starting CNI switch (global timeout: %s)\n", timeout) + + // 1. Create a Kubernetes client + safeClient, err := saferequest.NewSafeClient() + if err != nil { + return fmt.Errorf("creating safe client: %w", err) + } + + rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) + if err != nil { + return fmt.Errorf("creating runtime client: %w", err) + } + fmt.Printf("āœ… Kubernetes client created (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 2. Find the active migration + activeMigration, err := FindActiveMigration(ctx, rtClient) + if err != nil { + return fmt.Errorf("failed to find active migration: %w", err) + } + + if activeMigration == nil { + return fmt.Errorf("no active CNI migration found. Please run 'd8 cni-switch prepare' first") + } + + // Check if the switch is already completed successfully. + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { + fmt.Printf("šŸŽ‰ CNI switch to '%s' is already completed successfully.\n", activeMigration.Spec.TargetCNI) + fmt.Println("\nYou can run 'd8 cni-switch cleanup' to remove auxiliary resources.") + return nil + } + } + + // 3. Check if the preparation step was completed successfully + isPrepared := false + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { + isPrepared = true + break + } + } + + if !isPrepared { + return fmt.Errorf("cluster is not ready for switching. " + + "Please ensure the 'prepare' command completed successfully") + } + + // Verify all resources exist + fmt.Println("Verifying all created resources...") + if err = verifyResourcesExist(ctx, rtClient); err != nil { + return fmt.Errorf("verifying resources: %w", err) + } + fmt.Println("- Verified. All necessary resources are present in the cluster") + + fmt.Printf( + "āœ… Working with prepared migration '%s' (total elapsed: %s)\n\n", + activeMigration.Name, + time.Since(startTime).Round(time.Millisecond), + ) + + // 4. Enable target CNI + currentCNI := activeMigration.Status.CurrentCNI + targetCNI := activeMigration.Spec.TargetCNI + + // Check if we already allowed the target CNI to start + targetCNIAllowedToStart := false + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "NodeCleanupSucceeded" && cond.Status == metav1.ConditionTrue { + targetCNIAllowedToStart = true + break + } + } + + fmt.Printf("Enabling target CNI module 'cni-%s'...\n", targetCNI) + if err = toggleModule(ctx, rtClient, "cni-"+strings.ToLower(targetCNI), true); err != nil { + return fmt.Errorf("enabling module '%s': %w", targetCNI, err) + } + + var dsName string + if !targetCNIAllowedToStart { + // Wait for target CNI pods to start initializing + targetModuleName := "cni-" + strings.ToLower(targetCNI) + dsName, err = getDaemonSetNameForCNI(targetModuleName) + if err != nil { + return fmt.Errorf("getting daemonset name for target CNI: %w", err) + } + if err = waitForModulePodsInitializing(ctx, rtClient, targetModuleName, dsName); err != nil { + return fmt.Errorf("waiting for target CNI pods to initialize: %w", err) + } + fmt.Printf("āœ… CNI module 'cni-%s' enabled and pods initialized (total elapsed: %s)\n\n", + targetCNI, + time.Since(startTime).Round(time.Millisecond)) + + // 5. Disable current CNI + fmt.Printf("Disabling current CNI module 'cni-%s'...\n", currentCNI) + if err = toggleModule(ctx, rtClient, "cni-"+strings.ToLower(currentCNI), false); err != nil { + return fmt.Errorf("disabling module '%s': %w", currentCNI, err) + } + if err = waitForModule(ctx, rtClient, "cni-"+strings.ToLower(currentCNI), false); err != nil { + return fmt.Errorf("waiting for module '%s' to be disabled: %w", currentCNI, err) + } + + var dsNameCurrent string + dsNameCurrent, err = getDaemonSetNameForCNI("cni-" + strings.ToLower(currentCNI)) + if err != nil { + return fmt.Errorf("getting daemonset name for current CNI: %w", err) + } + if err = waitForModulePodsTermination( + ctx, rtClient, "cni-"+strings.ToLower(currentCNI), dsNameCurrent, + ); err != nil { + return fmt.Errorf("waiting for current CNI pods to terminate: %w", err) + } + + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "OldCNIDisabled", + Status: metav1.ConditionTrue, + Reason: "ModuleDisabled", + Message: fmt.Sprintf("Module 'cni-%s' was successfully disabled.", currentCNI), + }); err != nil { + return fmt.Errorf("updating CNIMigration status: %w", err) + } + fmt.Printf("āœ… CNI module 'cni-%s' disabled (total elapsed: %s)\n\n", + currentCNI, + time.Since(startTime).Round(time.Millisecond)) + + // 6. Update phase to Migrate (Triggers cleanup on nodes) + fmt.Println("Updating CNIMigration phase to 'Migrate' to trigger node cleanup...") + if err = updateCNIMigrationPhase(ctx, rtClient, activeMigration.Name, "Migrate"); err != nil { + return fmt.Errorf("updating CNIMigration phase: %w", err) + } + fmt.Printf("āœ… CNIMigration phase updated (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 7. Wait for nodes to be cleaned up + fmt.Println("Waiting for nodes to be cleaned up by cni-switch-helper...") + if err = waitForNodeConditions(ctx, rtClient, activeMigration, "CleanupSucceeded"); err != nil { + return fmt.Errorf("waiting for node cleanup: %w", err) + } + fmt.Printf("āœ… All nodes cleaned up (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 8. This status update is CRITICAL. It unblocks the target CNI's init-container. + fmt.Println("Signaling 'NodeCleanupSucceeded' to proceed with pod restart...") + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "NodeCleanupSucceeded", + Status: metav1.ConditionTrue, + Reason: "AllNodesCleanedUp", + Message: "All nodes have been successfully cleaned up from old CNI artifacts.", + }); err != nil { + return fmt.Errorf("updating CNIMigration status: %w", err) + } + fmt.Printf("āœ… CNIMigration status updated (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + } else { + fmt.Printf("ā„¹ļø Skipping wait for init, disable old CNI, cleanup as NodeCleanupSucceeded is already True.\n\n") + } + + // 9. Wait for target CNI to be Ready + // Now that NodeCleanupSucceeded is True, the target CNI pods should unblock and become Ready + targetModuleName := "cni-" + strings.ToLower(targetCNI) + fmt.Printf("Waiting for pods of module '%s' to become Ready...\n", targetModuleName) + dsName, err = getDaemonSetNameForCNI(targetModuleName) + if err != nil { + return fmt.Errorf("getting daemonset name for target CNI: %w", err) + } + if err = waitForModulePodsReady(ctx, rtClient, targetModuleName, dsName); err != nil { + return fmt.Errorf("waiting for target CNI pods to be ready: %w", err) + } + fmt.Printf("āœ… CNI module 'cni-%s' is now Ready (total elapsed: %s)\n\n", + targetCNI, + time.Since(startTime).Round(time.Millisecond)) + + // 10. Delete Mutating Webhook + fmt.Println("Deleting Mutating Webhook...") + if err = deleteMutatingWebhook(ctx, rtClient); err != nil { + return fmt.Errorf("deleting mutating webhook: %w", err) + } + fmt.Printf("\nāœ… Mutating webhook deleted (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 11. Signal 'NewCNIEnabled' + fmt.Println("Signaling 'NewCNIEnabled' to proceed with pod restart...") + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "NewCNIEnabled", + Status: metav1.ConditionTrue, + Reason: "ModuleEnabled", + Message: fmt.Sprintf("Module 'cni-%s' was successfully enabled.", targetCNI), + }); err != nil { + return fmt.Errorf("updating CNIMigration status: %w", err) + } + fmt.Printf("āœ… CNIMigration status updated (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 12. Wait for pods to be restarted + fmt.Println("Waiting for pods to be restarted on all nodes...") + if err = waitForNodeConditions(ctx, rtClient, activeMigration, "PodsRestarted"); err != nil { + fmt.Printf("āš ļø Warning: Timed out waiting for pods to restart: %v\n", err) + fmt.Println("Please check the cluster status manually. The CNI switch is otherwise complete.") + } else { + fmt.Printf("āœ… All pods restarted (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + } + + // 13. Finalize migration + fmt.Println("Finalizing migration...") + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "Succeeded", + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: "MigrationComplete", + Message: "CNI migration completed successfully.", + }); err != nil { + fmt.Printf("\n āš ļø Warning: Failed to update status (Succeeded): %v. Proceeding...\n", err) + } + + fmt.Printf( + "šŸŽ‰ CNI switch to '%s' completed successfully! (total time: %s)\n", + targetCNI, + time.Since(startTime).Round(time.Second), + ) + fmt.Println("\nYou can now run 'd8 cni-switch cleanup' to remove auxiliary resources.") + + return nil +} + +func toggleModule(ctx context.Context, cl client.Client, moduleName string, toggle bool) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + mc := &unstructured.Unstructured{} + mc.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "deckhouse.io", + Version: "v1alpha1", + Kind: "ModuleConfig", + }) + + err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, mc) + if err != nil { + fmt.Printf("\r\033[K āš ļø Error getting module config '%s': %v. Retrying...", moduleName, err) + continue + } + + spec, found, err := unstructured.NestedMap(mc.Object, "spec") + if err != nil { + fmt.Printf("\r\033[K āš ļø Error getting spec from module config '%s': %v. Retrying...", moduleName, err) + continue + } + if !found { + spec = make(map[string]any) + } + + // Skip update if value is already correct + if currentToggle, ok := spec["enabled"].(bool); ok && currentToggle == toggle { + return nil + } + + spec["enabled"] = toggle + + if err := unstructured.SetNestedMap(mc.Object, spec, "spec"); err != nil { + fmt.Printf("\r\033[K āš ļø Error setting spec for module config '%s': %v. Retrying...", moduleName, err) + continue + } + + if err := cl.Update(ctx, mc); err != nil { + fmt.Printf("\r\033[K āš ļø Error updating module config '%s': %v. Retrying...", moduleName, err) + continue + } + return nil + } + } +} + +func waitForModule(ctx context.Context, cl client.Client, moduleName string, shouldBeReady bool) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + module := &unstructured.Unstructured{} + module.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "deckhouse.io", + Version: "v1alpha1", + Kind: "Module", + }) + + err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) + + if shouldBeReady { + if err != nil { + if errors.IsNotFound(err) { + fmt.Printf("\r\033[K Waiting for module '%s': not found yet...", moduleName) + } else { + fmt.Printf("\r\033[K āš ļø Error getting module '%s': %v. Retrying...", moduleName, err) + } + continue + } + + state, found, err := unstructured.NestedString(module.Object, "status", "phase") + if err != nil || !found { + fmt.Printf("\r\033[K Waiting for module '%s': status.phase field not found. Retrying...", + moduleName) + continue + } + + if state == "Ready" { + fmt.Printf("Module '%s' is Ready.", moduleName) + fmt.Println() + return nil + } + fmt.Printf("\r\033[K Waiting for module '%s' to be Ready, current state: %s", moduleName, state) + } else { + err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) + if err != nil { + if errors.IsNotFound(err) { + fmt.Printf("\r\033[K Module '%s' is not found, assuming disabled.", moduleName) + fmt.Println() + return nil + } + fmt.Printf("\r\033[K āš ļø Error getting module '%s': %v. Retrying...", moduleName, err) + continue + } + + // Check conditions to see if it's disabled + conditions, found, err := unstructured.NestedSlice(module.Object, "status", "conditions") + if err != nil || !found { + fmt.Printf("\r\033[K Waiting for module '%s' status conditions. Retrying...", moduleName) + continue + } + + isReadyFound := false + for _, c := range conditions { + condition, ok := c.(map[string]any) + if !ok { + continue + } + + condType, found, err := unstructured.NestedString(condition, "type") + if err != nil || !found { + continue + } + + if condType == "IsReady" { + isReadyFound = true + condStatus, _, _ := unstructured.NestedString(condition, "status") + if condStatus == "False" { + fmt.Printf("\r\033[K- Module '%s' is disabled (IsReady=False).\n", moduleName) + return nil + } + } + } + + if !isReadyFound { + fmt.Printf("\r\033[K Waiting for module '%s' to be disabled, 'IsReady' condition not found...", + moduleName) + } else { + fmt.Printf("\r\033[K Waiting for module '%s' to be disabled (IsReady=False)...", moduleName) + } + } + } + } +} + +func updateCNIMigrationStatus( + ctx context.Context, cl client.Client, migrationName string, newCondition metav1.Condition, +) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + migration := &v1alpha1.CNIMigration{} + err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration) + if err != nil { + fmt.Printf("\r\033[K āš ļø Error getting CNIMigration '%s': %v. Retrying...", migrationName, err) + continue + } + + patchedMigration := migration.DeepCopy() + newCondition.LastTransitionTime = metav1.Now() + + found := false + for i, cond := range patchedMigration.Status.Conditions { + if cond.Type == newCondition.Type { + // Check if update is needed + if cond.Status == newCondition.Status && cond.Reason == newCondition.Reason && + cond.Message == newCondition.Message { + return nil // Already up to date + } + patchedMigration.Status.Conditions[i] = newCondition + found = true + break + } + } + + if !found { + patchedMigration.Status.Conditions = append(patchedMigration.Status.Conditions, newCondition) + } + + err = cl.Status().Patch(ctx, patchedMigration, client.MergeFrom(migration)) + if err == nil { + return nil + } + + if !errors.IsConflict(err) { + fmt.Printf("\r\033[K āš ļø Error patching CNIMigration status: %v. Retrying...", err) + } + } + } +} + +func waitForNodeConditions( + ctx context.Context, cl client.Client, cniMigration *v1alpha1.CNIMigration, conditionType string, +) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + reportedFailures := make(map[string]struct{}) + + // Helper to update status stats + updateStats := func(total, succeeded, failed int) { + // Fetch fresh object to avoid conflicts + freshMigration := &v1alpha1.CNIMigration{} + if err := cl.Get(ctx, types.NamespacedName{Name: cniMigration.Name}, freshMigration); err != nil { + return // Ignore error, will retry next tick + } + + if freshMigration.Status.NodesTotal == total && + freshMigration.Status.NodesSucceeded == succeeded && + freshMigration.Status.NodesFailed == failed { + return + } + + patched := freshMigration.DeepCopy() + patched.Status.NodesTotal = total + patched.Status.NodesSucceeded = succeeded + patched.Status.NodesFailed = failed + + _ = cl.Status().Patch(ctx, patched, client.MergeFrom(freshMigration)) + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // Get current total nodes + nodeList := &corev1.NodeList{} + if err := cl.List(ctx, nodeList); err != nil { + fmt.Printf("\r\033[K āš ļø Error listing nodes: %v. Retrying...", err) + continue + } + totalNodes := len(nodeList.Items) + + // Get current node migrations + nodeMigrations := &v1alpha1.CNINodeMigrationList{} + if err := cl.List(ctx, nodeMigrations); err != nil { + fmt.Printf("\r\033[K āš ļø Error listing CNINodeMigration resources: %v. Retrying...", err) + continue + } + + succeededNodes := 0 + failedNodes := 0 + + for _, nm := range nodeMigrations.Items { + if !metav1.IsControlledBy(&nm, cniMigration) { + continue + } + + // Check specific condition success + isSucceeded := false + for _, cond := range nm.Status.Conditions { + if cond.Type == conditionType && cond.Status == metav1.ConditionTrue { + succeededNodes++ + isSucceeded = true + + // Check if it was previously failed + if _, wasFailed := reportedFailures[nm.Name]; wasFailed { + fmt.Printf("\r\033[K āœ… Node '%s' has recovered.\n", nm.Name) + delete(reportedFailures, nm.Name) + } + break + } + } + + if isSucceeded { + continue + } + + // Check failure + for _, cond := range nm.Status.Conditions { + if cond.Type == "Failed" && cond.Status == metav1.ConditionTrue { + failedNodes++ + if _, reported := reportedFailures[nm.Name]; !reported { + // Clear line, print error, then let the progress bar overwrite next + fmt.Printf("\r\033[K āŒ Node '%s' failed: %s\n", nm.Name, cond.Message) + reportedFailures[nm.Name] = struct{}{} + } + break + } + } + } + + // Update status in CNIMigration + go updateStats(totalNodes, succeededNodes, failedNodes) + + // Output progress + fmt.Printf("\r\033[K Progress: %d/%d nodes completed, %d failed. ", + succeededNodes, totalNodes, failedNodes) + + // 5. Check exit condition + if succeededNodes >= totalNodes { + fmt.Println() + return nil + } + } + } +} + +func deleteMutatingWebhook(ctx context.Context, cl client.Client) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + fmt.Printf(" Deleting mutating webhook '%s'...", webhookConfigName) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + webhook := &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookConfigName, + }, + } + if err := cl.Delete(ctx, webhook); err != nil { + if errors.IsNotFound(err) { + fmt.Println("\r\033[K Mutating webhook not found, assuming already deleted.") + return nil + } + fmt.Printf("\r\033[K āš ļø Error deleting mutating webhook: %v. Retrying...", err) + continue + } + return nil + } + } +} + +// getDaemonSetNameForCNI returns the name of the main DaemonSet for a given CNI module. +func getDaemonSetNameForCNI(cniModule string) (string, error) { + switch cniModule { + case "cni-cilium": + return "agent", nil + case "cni-flannel": + return "flannel", nil + case "cni-simple-bridge": + return "simple-bridge", nil + default: + return "", fmt.Errorf("unknown CNI module: %s", cniModule) + } +} + +// waitForModulePodsInitializing waits for all pods of a module's daemonset to be in the 'Initializing' state, +// specifically waiting in the init container. +func waitForModulePodsInitializing(ctx context.Context, cl client.Client, moduleName string, dsName string) error { + fmt.Printf(" Waiting for pods of module '%s' to enter 'Initializing' state...", moduleName) + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // Get the DaemonSet + ds := &appsv1.DaemonSet{} + err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) + if err != nil { + if errors.IsNotFound(err) { + fmt.Printf("\r\033[K Waiting for DaemonSet '%s' in namespace 'd8-%s': not found...", + dsName, moduleName) + continue + } + fmt.Printf("\r\033[K Error getting DaemonSet '%s' in namespace 'd8-%s': %v. Retrying...", + dsName, moduleName, err) + continue + } + + // Check if pods are scheduled + if ds.Status.DesiredNumberScheduled == 0 { + fmt.Printf("\r\033[K Waiting for DaemonSet '%s' to schedule pods...", dsName) + continue + } + + // List pods + podList := &corev1.PodList{} + opts := []client.ListOption{ + client.InNamespace("d8-" + moduleName), + client.MatchingLabels(ds.Spec.Selector.MatchLabels), + } + if err := cl.List(ctx, podList, opts...); err != nil { + fmt.Printf("\r\033[K Error listing pods for module '%s': %v. Retrying...", moduleName, err) + continue + } + + if len(podList.Items) == 0 { + fmt.Printf("\r\033[K Waiting for pods of DaemonSet '%s' to be created...", dsName) + continue + } + + initializingPods := 0 + for _, pod := range podList.Items { + // Check pod status + if pod.Status.Phase == corev1.PodPending || pod.Status.Phase == corev1.PodRunning { + for _, initStatus := range pod.Status.InitContainerStatuses { + if initStatus.Name == "cni-switch-init-checker" && + (initStatus.State.Waiting != nil || initStatus.State.Running != nil) { + // Pod is waiting in or running our init-container + initializingPods++ + break + } + } + } + } + + fmt.Printf("\r\033[K Progress: %d/%d pods are in 'Initializing' state.", + initializingPods, ds.Status.DesiredNumberScheduled) + + if int32(initializingPods) >= ds.Status.DesiredNumberScheduled { + fmt.Println("\n- All pods for target CNI are correctly waiting in the init-container.") + return nil + } + } + } +} + +// waitForModulePodsReady waits for all pods of a module's daemonset to be in the 'Ready' state. +func waitForModulePodsReady(ctx context.Context, cl client.Client, moduleName string, dsName string) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // Get the DaemonSet + ds := &appsv1.DaemonSet{} + err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) + if err != nil { + fmt.Printf("\r\033[K Error getting DaemonSet '%s': %v. Retrying...", dsName, err) + continue + } + + if ds.Status.DesiredNumberScheduled == 0 { + fmt.Printf("\r\033[K Waiting for DaemonSet '%s' to schedule pods...", dsName) + continue + } + + fmt.Printf("\r\033[K Progress: %d/%d pods are Ready.", + ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) + + if ds.Status.NumberReady >= ds.Status.DesiredNumberScheduled { + fmt.Println("\n- All pods for target CNI are Ready.") + return nil + } + } + } +} + +// waitForModulePodsTermination waits for all pods of a module's daemonset to be terminated. +func waitForModulePodsTermination(ctx context.Context, cl client.Client, moduleName string, dsName string) error { + fmt.Printf(" Waiting for pods of module '%s' to terminate...", moduleName) + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // List pods with label app= in namespace d8- + podList := &corev1.PodList{} + opts := []client.ListOption{ + client.InNamespace("d8-" + moduleName), + client.MatchingLabels(map[string]string{"app": dsName}), + } + if err := cl.List(ctx, podList, opts...); err != nil { + fmt.Printf("\r\033[K Error listing pods for module '%s': %v. Retrying...", moduleName, err) + continue + } + + if len(podList.Items) == 0 { + fmt.Println("\n- All pods for disabled CNI module are terminated.") + return nil + } + + fmt.Printf("\r\033[K Waiting for %d pods of module '%s' to terminate...", len(podList.Items), moduleName) + } + } +} + +// verifyResourcesExist checks if all expected resources are present in the cluster. +func verifyResourcesExist(ctx context.Context, rtClient client.Client) error { + resources := []struct { + kind string + name string + namespace string + obj client.Object + }{ + {"Namespace", cniSwitchNamespace, "", &corev1.Namespace{}}, + {"ServiceAccount", switchHelperServiceAccountName, cniSwitchNamespace, &corev1.ServiceAccount{}}, + {"ServiceAccount", webhookServiceAccountName, cniSwitchNamespace, &corev1.ServiceAccount{}}, + {"ClusterRole", switchHelperClusterRoleName, "", &rbacv1.ClusterRole{}}, + {"ClusterRole", webhookClusterRoleName, "", &rbacv1.ClusterRole{}}, + {"ClusterRoleBinding", switchHelperClusterRoleBindingName, "", &rbacv1.ClusterRoleBinding{}}, + {"ClusterRoleBinding", webhookClusterRoleBindingName, "", &rbacv1.ClusterRoleBinding{}}, + {"Secret", webhookSecretName, cniSwitchNamespace, &corev1.Secret{}}, + {"Service", webhookServiceName, cniSwitchNamespace, &corev1.Service{}}, + {"MutatingWebhookConfiguration", webhookConfigName, "", &admissionregistrationv1.MutatingWebhookConfiguration{}}, + {"DaemonSet", switchHelperDaemonSetName, cniSwitchNamespace, &appsv1.DaemonSet{}}, + {"Deployment", webhookDeploymentName, cniSwitchNamespace, &appsv1.Deployment{}}, + } + + for _, r := range resources { + key := client.ObjectKey{Name: r.name, Namespace: r.namespace} + if err := rtClient.Get(ctx, key, r.obj); err != nil { + return fmt.Errorf("resource %s '%s' not found: %w", r.kind, r.name, err) + } + } + return nil +} + +func updateCNIMigrationPhase(ctx context.Context, cl client.Client, migrationName string, phase string) error { + update := func() error { + migration := &v1alpha1.CNIMigration{} + if err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration); err != nil { + return fmt.Errorf("getting CNIMigration: %w", err) + } + + if migration.Spec.Phase == phase { + return nil + } + + patchedMigration := migration.DeepCopy() + patchedMigration.Spec.Phase = phase + if err := cl.Patch(ctx, patchedMigration, client.MergeFrom(migration)); err != nil { + return fmt.Errorf("patching CNIMigration: %w", err) + } + return nil + } + + err := update() + if err == nil { + return nil + } + fmt.Printf("\r\033[K āš ļø %v. Retrying...", err) + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + if err := update(); err != nil { + fmt.Printf("\r\033[K āš ļø %v. Retrying...", err) + continue + } + return nil + } + } +}