From 22d6385569341492476bbc1087d9fc401e97f3d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=A4stner?= Date: Wed, 27 May 2026 18:51:30 +0200 Subject: [PATCH 1/3] Implement NX-OS reprovisioning via NX-API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the defunct GNMI-based reprovisioning with NXAPI JSON-RPC calls. The new implementation issues two requests: 1. boot poap enable + copy running-config startup-config (batched with stop-on-error rollback) 2. reload (separate request that tolerates transport errors because the device goes down before responding) Add nxapi.IsTransportError to distinguish network-level errors (EOF, timeout, connection reset) from logical errors (RPCError, HTTPError), enabling callers of disruptive commands to tolerate expected connection drops. Remove the now unused BootPOAP GNMI type. Signed-off-by: Felix Kästner --- internal/provider/cisco/nxos/provider.go | 45 ++++--- .../provider/cisco/nxos/reprovision_test.go | 122 ++++++++++++++++++ internal/provider/cisco/nxos/system.go | 8 -- internal/transport/nxapi/nxapi.go | 17 +++ internal/transport/nxapi/nxapi_test.go | 81 ++++++++++++ 5 files changed, 247 insertions(+), 26 deletions(-) create mode 100644 internal/provider/cisco/nxos/reprovision_test.go diff --git a/internal/provider/cisco/nxos/provider.go b/internal/provider/cisco/nxos/provider.go index 2f92f4c35..5380dda77 100644 --- a/internal/provider/cisco/nxos/provider.go +++ b/internal/provider/cisco/nxos/provider.go @@ -70,13 +70,14 @@ type Provider struct { nxapi *nxapi.Client } +// timeout is the default timeout for all HTTP/gRPC requests made by the provider. +const timeout = 30 * time.Second + func NewProvider() provider.Provider { return &Provider{} } func (p *Provider) Connect(ctx context.Context, conn *deviceutil.Connection) (err error) { - // timeout is the default timeout for all HTTP/gRPC requests made by the provider. - const timeout = 30 * time.Second p.conn, err = grpcext.NewClient(conn, grpcext.WithDefaultTimeout(timeout)) if err != nil { return fmt.Errorf("failed to create grpc connection: %w", err) @@ -91,7 +92,7 @@ func (p *Provider) Connect(ctx context.Context, conn *deviceutil.Connection) (er } // NXAPI only uses the address for URI construction. c := *conn - c.Address = netip.MustParseAddrPort(conn.Address).String() + c.Address = netip.MustParseAddrPort(conn.Address).Addr().String() p.nxapi, err = nxapi.NewClient(&c, timeout) if err != nil { return fmt.Errorf("failed to create nxapi client: %w", err) @@ -141,23 +142,31 @@ func (p *Provider) FactoryReset(ctx context.Context, conn *deviceutil.Connection return FactoryReset(ctx, p.conn) } -func (p *Provider) Reprovision(ctx context.Context, conn *deviceutil.Connection) (reterr error) { - if err := p.Connect(ctx, conn); err != nil { - return err +func (p *Provider) Reprovision(ctx context.Context, conn *deviceutil.Connection) error { + c := *conn + c.Address = netip.MustParseAddrPort(conn.Address).Addr().String() + client, err := nxapi.NewClient(&c, timeout) + if err != nil { + return fmt.Errorf("failed to create nxapi client: %w", err) } - defer func() { - if err := p.Disconnect(ctx, conn); err != nil { - reterr = errors.Join(reterr, err) - } - }() - // This is currently defunct on NX-OS, as enabling POAP requires a `copy running-config startup-config` which we - // cannot issue via GNMI - // TODO add once NXAPI client is available - poap := BootPOAP("enable") - if err := p.client.Update(ctx, &poap); err != nil { - return err + + _, err = client.Do(ctx, nxapi.NewRequest( + "boot poap enable", + "copy running-config startup-config", + ).WithRollback(nxapi.Stop)) + if err != nil { + return fmt.Errorf("failed to prepare device for reprovisioning: %w", err) } - return Reboot(ctx, p.conn) + + // Reboot is issued as a separate request because it actually restarts + // the device. The connection will drop before a response is received, + // so transport errors are expected and tolerated. + _, err = client.Do(ctx, nxapi.NewRequest("reload")) + if err != nil && !nxapi.IsTransportError(err) { + return fmt.Errorf("failed to reboot device: %w", err) + } + + return nil } func (p *Provider) ListPorts(ctx context.Context) ([]provider.DevicePort, error) { diff --git a/internal/provider/cisco/nxos/reprovision_test.go b/internal/provider/cisco/nxos/reprovision_test.go new file mode 100644 index 000000000..6c9dba26e --- /dev/null +++ b/internal/provider/cisco/nxos/reprovision_test.go @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package nxos + +import ( + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "slices" + "testing" + + "github.com/ironcore-dev/network-operator/internal/deviceutil" +) + +func TestReprovision(t *testing.T) { + t.Run("success with connection drop on reload", func(t *testing.T) { + var requests [][]string + called := 0 + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var cmds []struct { + Params struct { + Cmd string `json:"cmd"` + } `json:"params"` + } + if err := json.NewDecoder(r.Body).Decode(&cmds); err != nil { + t.Fatalf("failed to decode request: %v", err) + } + + batch := make([]string, len(cmds)) + for i, c := range cmds { + batch[i] = c.Params.Cmd + } + requests = append(requests, batch) + called++ + + if called == 1 { + w.Header().Set("Content-Type", "application/json-rpc") + fmt.Fprint(w, `[ + {"jsonrpc":"2.0","result":null,"id":1}, + {"jsonrpc":"2.0","result":null,"id":2} + ]`) + return + } + + // Simulate device going down by closing connection abruptly. + hijacker, ok := w.(http.Hijacker) + if !ok { + t.Fatal("server does not support hijacking") + } + conn, _, err := hijacker.Hijack() + if err != nil { + t.Fatalf("hijack failed: %v", err) + } + conn.Close() + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err != nil { + t.Fatalf("Reprovision returned unexpected error: %v", err) + } + + if len(requests) != 2 { + t.Fatalf("expected 2 NXAPI requests, got %d", len(requests)) + } + if !slices.Equal(requests[0], []string{"boot poap enable", "copy running-config startup-config"}) { + t.Errorf("prep batch = %v", requests[0]) + } + if !slices.Equal(requests[1], []string{"reload"}) { + t.Errorf("reload request = %v", requests[1]) + } + }) + + t.Run("prep batch RPC error fails", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json-rpc") + w.WriteHeader(http.StatusBadRequest) + fmt.Fprint(w, `[{"jsonrpc":"2.0","error":{"code":-32602,"message":"Invalid command"},"id":1}]`) + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err == nil { + t.Fatal("expected error from prep batch, got nil") + } + }) + + t.Run("reload RPC error fails", func(t *testing.T) { + called := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called++ + w.Header().Set("Content-Type", "application/json-rpc") + if called == 1 { + fmt.Fprint(w, `[ + {"jsonrpc":"2.0","result":null,"id":1}, + {"jsonrpc":"2.0","result":null,"id":2} + ]`) + return + } + w.WriteHeader(http.StatusBadRequest) + fmt.Fprint(w, `[{"jsonrpc":"2.0","error":{"code":-32602,"message":"Permission denied"},"id":1}]`) + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err == nil { + t.Fatal("expected error from reload RPC failure, got nil") + } + }) +} diff --git a/internal/provider/cisco/nxos/system.go b/internal/provider/cisco/nxos/system.go index 009c22f1d..b1d8999a7 100644 --- a/internal/provider/cisco/nxos/system.go +++ b/internal/provider/cisco/nxos/system.go @@ -68,14 +68,6 @@ func (*FirmwareVersion) XPath() string { return "System/showversion-items/nxosVersion" } -var _ gnmiext.DataElement = (*BootPOAP)(nil) - -type BootPOAP string - -func (*BootPOAP) XPath() string { - return "/System/boot-items/poap" -} - type BootTime UnixTime func (*BootTime) XPath() string { diff --git a/internal/transport/nxapi/nxapi.go b/internal/transport/nxapi/nxapi.go index 5881ecbfd..427e27f9c 100644 --- a/internal/transport/nxapi/nxapi.go +++ b/internal/transport/nxapi/nxapi.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io" + "net" "net/http" "net/url" "time" @@ -257,3 +258,19 @@ type HTTPError struct { func (e *HTTPError) Error() string { return fmt.Sprintf("nxapi: non-2xx status code: %d - %s", e.Code, string(e.Body)) } + +// IsTransportError reports whether err is a network-level transport error +// (connection reset, timeout, EOF) as opposed to a logical error returned +// by the NX-API endpoint (RPCError, HTTPError). This is useful for callers +// that issue disruptive commands (e.g. reboot) where the device going down +// mid-request is expected. +func IsTransportError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + return true + } + var netErr net.Error + return errors.As(err, &netErr) +} diff --git a/internal/transport/nxapi/nxapi_test.go b/internal/transport/nxapi/nxapi_test.go index 0b6c73a32..53e17f25b 100644 --- a/internal/transport/nxapi/nxapi_test.go +++ b/internal/transport/nxapi/nxapi_test.go @@ -9,8 +9,10 @@ import ( "encoding/json" "errors" "fmt" + "io" "net/http" "net/http/httptest" + "net/url" "strings" "testing" @@ -295,3 +297,82 @@ func TestDo(t *testing.T) { }) } } + +func TestIsTransportError(t *testing.T) { + tests := []struct { + desc string + err error + want bool + }{ + { + desc: "nil is not a transport error", + err: nil, + want: false, + }, + { + desc: "RPCError is not a transport error", + err: &RPCError{Code: -32602, Message: "Invalid params"}, + want: false, + }, + { + desc: "RPCErrors is not a transport error", + err: RPCErrors{&RPCError{Code: -32602, Message: "Invalid params"}}, + want: false, + }, + { + desc: "HTTPError is not a transport error", + err: &HTTPError{Code: 401, Body: []byte("unauthorized")}, + want: false, + }, + { + desc: "generic error is not a transport error", + err: errors.New("some logic error"), + want: false, + }, + { + desc: "io.EOF is a transport error", + err: io.EOF, + want: true, + }, + { + desc: "io.ErrUnexpectedEOF is a transport error", + err: io.ErrUnexpectedEOF, + want: true, + }, + { + desc: "wrapped io.EOF is a transport error", + err: fmt.Errorf("request failed: %w", io.EOF), + want: true, + }, + { + desc: "net.Error is a transport error", + err: &netError{msg: "i/o timeout"}, + want: true, + }, + { + desc: "url.Error wrapping net.Error is a transport error", + err: &url.Error{Op: "Post", URL: "http://x/ins", Err: &netError{msg: "i/o timeout"}}, + want: true, + }, + { + desc: "wrapped net.Error is a transport error", + err: fmt.Errorf("read tcp: %w", &netError{msg: "connection reset by peer"}), + want: true, + }, + } + for _, test := range tests { + t.Run(test.desc, func(t *testing.T) { + got := IsTransportError(test.err) + if got != test.want { + t.Errorf("IsTransportError(%v) = %t, want %t", test.err, got, test.want) + } + }) + } +} + +// netError is a mock net.Error for testing. +type netError struct{ msg string } + +func (e *netError) Error() string { return e.msg } +func (e *netError) Timeout() bool { return false } +func (e *netError) Temporary() bool { return false } From f33c592d68ccb9e281186f1be4b17f9787269f61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=A4stner?= Date: Fri, 19 Jun 2026 14:13:12 +0200 Subject: [PATCH 2/3] Fix maintenance annotation removal on failed operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the maintenance annotation deletion to after the switch statement so it is only removed when the action succeeds. This allows the reconciler to retry failed maintenance operations on the next reconciliation instead of requiring the user to re-apply the annotation. Additionally, return a terminal error for unknown maintenance actions since retrying will never succeed. Signed-off-by: Felix Kästner --- internal/controller/core/device_controller.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/controller/core/device_controller.go b/internal/controller/core/device_controller.go index 9e70930a9..eb83ee044 100644 --- a/internal/controller/core/device_controller.go +++ b/internal/controller/core/device_controller.go @@ -425,8 +425,6 @@ func (r *DeviceReconciler) reconcileMaintenance(ctx context.Context, obj *v1alph if !ok { return nil } - delete(obj.Annotations, v1alpha1.DeviceMaintenanceAnnotation) - switch action { case v1alpha1.DeviceMaintenanceReboot: prov, ok := r.Provider().(provider.MaintenanceProvider) @@ -499,9 +497,12 @@ func (r *DeviceReconciler) reconcileMaintenance(ctx context.Context, obj *v1alph default: r.Recorder.Eventf(obj, nil, "Warning", "UnknownMaintenanceAction", "Maintenance", "Unknown maintenance action: %s", action) - return fmt.Errorf("unknown maintenance action: %s", action) + return reconcile.TerminalError(fmt.Errorf("unknown maintenance action: %s", action)) } + // Only remove the annotation after the operation succeeds so that + // failed actions are retried on the next reconciliation. + delete(obj.Annotations, v1alpha1.DeviceMaintenanceAnnotation) return nil } From 8547f3b0d58df983a84f414705dbb287bf1f3f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=A4stner?= Date: Tue, 23 Jun 2026 14:23:30 +0200 Subject: [PATCH 3/3] Add WithPort option to NX-API client for testability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a functional option to nxapi.NewClient that overrides the port in the connection address. This allows tests to point the client at an httptest server on a random port. Reprovision now reuses p.nxapi if already set (e.g. from Connect or injected by a test), falling back to creating a standalone client with port stripping for production. Signed-off-by: Felix Kästner --- internal/provider/cisco/nxos/provider.go | 16 ++++++---- .../provider/cisco/nxos/reprovision_test.go | 32 +++++++++++++------ internal/transport/nxapi/nxapi.go | 29 +++++++++++++++-- 3 files changed, 60 insertions(+), 17 deletions(-) diff --git a/internal/provider/cisco/nxos/provider.go b/internal/provider/cisco/nxos/provider.go index 5380dda77..7874b8451 100644 --- a/internal/provider/cisco/nxos/provider.go +++ b/internal/provider/cisco/nxos/provider.go @@ -143,14 +143,18 @@ func (p *Provider) FactoryReset(ctx context.Context, conn *deviceutil.Connection } func (p *Provider) Reprovision(ctx context.Context, conn *deviceutil.Connection) error { - c := *conn - c.Address = netip.MustParseAddrPort(conn.Address).Addr().String() - client, err := nxapi.NewClient(&c, timeout) - if err != nil { - return fmt.Errorf("failed to create nxapi client: %w", err) + client := p.nxapi + if client == nil { + c := *conn + c.Address = netip.MustParseAddrPort(conn.Address).Addr().String() + var err error + client, err = nxapi.NewClient(&c, timeout) + if err != nil { + return fmt.Errorf("failed to create nxapi client: %w", err) + } } - _, err = client.Do(ctx, nxapi.NewRequest( + _, err := client.Do(ctx, nxapi.NewRequest( "boot poap enable", "copy running-config startup-config", ).WithRollback(nxapi.Stop)) diff --git a/internal/provider/cisco/nxos/reprovision_test.go b/internal/provider/cisco/nxos/reprovision_test.go index 6c9dba26e..e2d88f9f7 100644 --- a/internal/provider/cisco/nxos/reprovision_test.go +++ b/internal/provider/cisco/nxos/reprovision_test.go @@ -6,12 +6,14 @@ package nxos import ( "encoding/json" "fmt" + "net" "net/http" "net/http/httptest" "slices" "testing" "github.com/ironcore-dev/network-operator/internal/deviceutil" + "github.com/ironcore-dev/network-operator/internal/transport/nxapi" ) func TestReprovision(t *testing.T) { @@ -58,11 +60,15 @@ func TestReprovision(t *testing.T) { })) defer srv.Close() - p := &Provider{} + _, port, _ := net.SplitHostPort(srv.Listener.Addr().String()) conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} - - err := p.Reprovision(t.Context(), conn) + client, err := nxapi.NewClient(conn, 0, nxapi.WithPort(port)) if err != nil { + t.Fatalf("failed to create nxapi client: %v", err) + } + + p := &Provider{nxapi: client} + if err := p.Reprovision(t.Context(), conn); err != nil { t.Fatalf("Reprovision returned unexpected error: %v", err) } @@ -85,11 +91,15 @@ func TestReprovision(t *testing.T) { })) defer srv.Close() - p := &Provider{} + _, port, _ := net.SplitHostPort(srv.Listener.Addr().String()) conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + client, err := nxapi.NewClient(conn, 0, nxapi.WithPort(port)) + if err != nil { + t.Fatalf("failed to create nxapi client: %v", err) + } - err := p.Reprovision(t.Context(), conn) - if err == nil { + p := &Provider{nxapi: client} + if err := p.Reprovision(t.Context(), conn); err == nil { t.Fatal("expected error from prep batch, got nil") } }) @@ -111,11 +121,15 @@ func TestReprovision(t *testing.T) { })) defer srv.Close() - p := &Provider{} + _, port, _ := net.SplitHostPort(srv.Listener.Addr().String()) conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + client, err := nxapi.NewClient(conn, 0, nxapi.WithPort(port)) + if err != nil { + t.Fatalf("failed to create nxapi client: %v", err) + } - err := p.Reprovision(t.Context(), conn) - if err == nil { + p := &Provider{nxapi: client} + if err := p.Reprovision(t.Context(), conn); err == nil { t.Fatal("expected error from reload RPC failure, got nil") } }) diff --git a/internal/transport/nxapi/nxapi.go b/internal/transport/nxapi/nxapi.go index 427e27f9c..2fe7f2f04 100644 --- a/internal/transport/nxapi/nxapi.go +++ b/internal/transport/nxapi/nxapi.go @@ -36,16 +36,41 @@ type Client struct { uri string } +// Option configures a [Client]. +type Option func(*options) + +type options struct { + port string +} + +// WithPort overrides the port in the connection address. +// This is useful when NX-API is reachable on a different +// port (e.g. 8443) than the default (80/443). +func WithPort(port string) Option { + return func(o *options) { o.port = port } +} + // NewClient creates a new [Client] for the given connection. // If the connection has a TLS configuration set, HTTPS is used; otherwise HTTP. // The underlying HTTP client uses timeout for all requests; a value of 0 // means no timeout. -func NewClient(c *deviceutil.Connection, timeout time.Duration) (*Client, error) { +func NewClient(c *deviceutil.Connection, timeout time.Duration, opts ...Option) (*Client, error) { + var o options + for _, opt := range opts { + opt(&o) + } + + addr := c.Address + if o.port != "" { + host, _, _ := net.SplitHostPort(addr) + addr = net.JoinHostPort(host, o.port) + } + proto := "http" if c.TLS != nil { proto = "https" } - uri, err := url.JoinPath(proto+"://"+c.Address, "ins") + uri, err := url.JoinPath(proto+"://"+addr, "ins") if err != nil { return nil, fmt.Errorf("nxapi: failed to join path: %w", err) }