From 94fd9a0bed3f51a8aa549626c9208db75903507f Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Thu, 17 Apr 2025 17:00:06 -0700 Subject: [PATCH 1/3] [tmpnet] Enable runtime-specific restart behavior Delegate responsibility for restart to the node runtime to allow the process runtime to continue to start/stop and the kube runtime to scale down/scale up. --- tests/fixture/tmpnet/network.go | 109 ++++++++++++------------ tests/fixture/tmpnet/node.go | 22 ++--- tests/fixture/tmpnet/process_runtime.go | 37 ++++++++ 3 files changed, 97 insertions(+), 71 deletions(-) diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index 2cfaad9eafde..f6df9e697625 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -191,6 +191,16 @@ func RestartNetwork(ctx context.Context, log logging.Logger, dir string) error { return network.Restart(ctx) } +// Restart the provided nodes. Blocks on the nodes accepting API requests but not their health. +func restartNodes(ctx context.Context, nodes ...*Node) error { + for _, node := range nodes { + if err := node.Restart(ctx); err != nil { + return fmt.Errorf("failed to restart node %s: %w", node.NodeID, err) + } + } + return nil +} + // Reads a network from the provided directory. func ReadNetwork(ctx context.Context, log logging.Logger, dir string) (*Network, error) { canonicalDir, err := toCanonicalDir(dir) @@ -441,26 +451,20 @@ func (n *Network) Bootstrap(ctx context.Context, log logging.Logger) error { bootstrapNode.Flags[config.SybilProtectionEnabledKey] = *existingSybilProtectionValue } + // Ensure the bootstrap node is restarted to pick up subnet and chain configuration + // + // TODO(marun) This restart might be unnecessary if: + // - sybil protection didn't change + // - the node is not a subnet validator log.Info("restarting bootstrap node", zap.Stringer("nodeID", bootstrapNode.NodeID), ) - - if len(n.Nodes) == 1 { - // Ensure the node is restarted to pick up subnet and chain configuration - return n.RestartNode(ctx, bootstrapNode) + if err := bootstrapNode.Restart(ctx); err != nil { + return err } - // TODO(marun) This last restart of the bootstrap node might be unnecessary if: - // - sybil protection didn't change - // - the node is not a subnet validator - - // Ensure the bootstrap node is restarted to pick up configuration changes. Avoid using - // RestartNode since the node won't be able to report healthy until other nodes are started. - if err := bootstrapNode.Stop(ctx); err != nil { - return fmt.Errorf("failed to stop node %s: %w", bootstrapNode.NodeID, err) - } - if err := n.StartNode(ctx, bootstrapNode); err != nil { - return fmt.Errorf("failed to start node %s: %w", bootstrapNode.NodeID, err) + if len(n.Nodes) == 1 { + return nil } log.Info("starting remaining nodes") @@ -486,31 +490,6 @@ func (n *Network) StartNode(ctx context.Context, node *Node) error { return nil } -// Restart a single node. -func (n *Network) RestartNode(ctx context.Context, node *Node) error { - runtimeConfig := node.getRuntimeConfig() - if runtimeConfig.Process != nil && runtimeConfig.Process.ReuseDynamicPorts { - // Attempt to save the API port currently being used so the - // restarted node can reuse it. This may result in the node - // failing to start if the operating system allocates the port - // to a different process between node stop and start. - if err := node.SaveAPIPort(); err != nil { - return err - } - } - - if err := node.Stop(ctx); err != nil { - return fmt.Errorf("failed to stop node %s: %w", node.NodeID, err) - } - if err := n.StartNode(ctx, node); err != nil { - return fmt.Errorf("failed to start node %s: %w", node.NodeID, err) - } - n.log.Info("waiting for node to report healthy", - zap.Stringer("nodeID", node.NodeID), - ) - return node.WaitForHealthy(ctx) -} - // Stops all nodes in the network. func (n *Network) Stop(ctx context.Context) error { // Ensure the node state is up-to-date @@ -540,11 +519,22 @@ func (n *Network) Stop(ctx context.Context) error { return nil } -// Restarts all nodes in the network. +// Restarts all non-ephemeral nodes in the network. func (n *Network) Restart(ctx context.Context) error { n.log.Info("restarting network") - for _, node := range n.Nodes { - if err := n.RestartNode(ctx, node); err != nil { + if err := restartNodes(ctx, n.Nodes...); err != nil { + return err + } + return WaitForHealthyNodes(ctx, n.log, n.Nodes...) +} + +// Waits for the provided nodes to become healthy. +func WaitForHealthyNodes(ctx context.Context, log logging.Logger, nodes ...*Node) error { + for _, node := range nodes { + log.Info("waiting for node to become healthy", + zap.Stringer("nodeID", node.NodeID), + ) + if err := node.WaitForHealthy(ctx); err != nil { return err } } @@ -669,15 +659,20 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI if restartRequired { log.Info("restarting node(s) to enable them to track the new subnet(s)") + runningNodes := make([]*Node, 0, len(reconfiguredNodes)) for _, node := range reconfiguredNodes { - if len(node.URI) == 0 { - // Only running nodes should be restarted - continue - } - if err := n.RestartNode(ctx, node); err != nil { - return err + if len(node.URI) > 0 { + runningNodes = append(runningNodes, node) } } + + if err := restartNodes(ctx, runningNodes...); err != nil { + return err + } + + if err := WaitForHealthyNodes(ctx, n.log, runningNodes...); err != nil { + return err + } } // Add validators for the subnet @@ -738,15 +733,21 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI log.Info("restarting node(s) to pick up chain configuration") // Restart nodes to allow configuration for the new chains to take effect + nodesToRestart := make([]*Node, 0, len(n.Nodes)) for _, node := range n.Nodes { - if !validatorsToRestart.Contains(node.NodeID) { - continue - } - if err := n.RestartNode(ctx, node); err != nil { - return err + if validatorsToRestart.Contains(node.NodeID) { + nodesToRestart = append(nodesToRestart, node) } } + if err := restartNodes(ctx, nodesToRestart...); err != nil { + return err + } + + if err := WaitForHealthyNodes(ctx, log, nodesToRestart...); err != nil { + return err + } + return nil } diff --git a/tests/fixture/tmpnet/node.go b/tests/fixture/tmpnet/node.go index bf6e035e08dc..c50a52c2805f 100644 --- a/tests/fixture/tmpnet/node.go +++ b/tests/fixture/tmpnet/node.go @@ -10,7 +10,6 @@ import ( "fmt" "io" "maps" - "net" "net/http" "net/netip" "os" @@ -58,6 +57,7 @@ type NodeRuntime interface { Start(ctx context.Context) error InitiateStop(ctx context.Context) error WaitForStopped(ctx context.Context) error + Restart(ctx context.Context) error IsHealthy(ctx context.Context) (bool, error) } @@ -165,6 +165,10 @@ func (n *Node) WaitForStopped(ctx context.Context) error { return n.getRuntime().WaitForStopped(ctx) } +func (n *Node) Restart(ctx context.Context) error { + return n.getRuntime().Restart(ctx) +} + func (n *Node) readState(ctx context.Context) error { return n.getRuntime().readState(ctx) } @@ -383,22 +387,6 @@ func (n *Node) composeFlags() (FlagsMap, error) { return flags, nil } -// Saves the currently allocated API port to the node's configuration -// for use across restarts. -func (n *Node) SaveAPIPort() error { - hostPort := strings.TrimPrefix(n.URI, "http://") - if len(hostPort) == 0 { - // Without an API URI there is nothing to save - return nil - } - _, port, err := net.SplitHostPort(hostPort) - if err != nil { - return err - } - n.Flags[config.HTTPPortKey] = port - return nil -} - // WaitForHealthy blocks until node health is true or an error (including context timeout) is observed. func (n *Node) WaitForHealthy(ctx context.Context) error { if _, ok := ctx.Deadline(); !ok { diff --git a/tests/fixture/tmpnet/process_runtime.go b/tests/fixture/tmpnet/process_runtime.go index 03e3ff687717..9088f0928210 100644 --- a/tests/fixture/tmpnet/process_runtime.go +++ b/tests/fixture/tmpnet/process_runtime.go @@ -12,6 +12,7 @@ import ( "io" "io/fs" "maps" + "net" "net/netip" "os" "os/exec" @@ -235,6 +236,26 @@ func (p *ProcessRuntime) WaitForStopped(ctx context.Context) error { } } +// Restarts the node +func (p *ProcessRuntime) Restart(ctx context.Context) error { + if p.getRuntimeConfig().ReuseDynamicPorts { + // Attempt to save the API port currently being used so the + // restarted node can reuse it. This may result in the node + // failing to start if the operating system allocates the port + // to a different process between node stop and start. + if err := p.saveAPIPort(); err != nil { + return err + } + } + if err := p.node.Stop(ctx); err != nil { + return fmt.Errorf("failed to stop node %s: %w", p.node.NodeID, err) + } + if err := p.Start(ctx); err != nil { + return fmt.Errorf("failed to start node %s: %w", p.node.NodeID, err) + } + return nil +} + func (p *ProcessRuntime) IsHealthy(ctx context.Context) (bool, error) { // Check that the node process is running as a precondition for // checking health. getProcess will also ensure that the node's @@ -404,6 +425,22 @@ func (p *ProcessRuntime) GetLocalStakingAddress(_ context.Context) (netip.AddrPo return p.node.StakingAddress, func() {}, nil } +// Saves the currently allocated API port to the node's configuration +// for use across restarts. +func (p *ProcessRuntime) saveAPIPort() error { + hostPort := strings.TrimPrefix(p.node.URI, "http://") + if len(hostPort) == 0 { + // Without an API URI there is nothing to save + return nil + } + _, port, err := net.SplitHostPort(hostPort) + if err != nil { + return err + } + p.node.Flags[config.HTTPPortKey] = port + return nil +} + // watchLogFileForFatal waits for the specified file path to exist and then checks each of // its lines for the string 'FATAL' until such a line is observed or the provided context // is canceled. If line containing 'FATAL' is encountered, it will be provided as an error From d5e8ebec118c6d53e8432236c04d267b2094881b Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 5 May 2025 16:12:05 +0000 Subject: [PATCH 2/3] fixup: Address review comment --- tests/fixture/tmpnet/network.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index f6df9e697625..1fcfa774b39c 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -192,7 +192,7 @@ func RestartNetwork(ctx context.Context, log logging.Logger, dir string) error { } // Restart the provided nodes. Blocks on the nodes accepting API requests but not their health. -func restartNodes(ctx context.Context, nodes ...*Node) error { +func restartNodes(ctx context.Context, nodes []*Node) error { for _, node := range nodes { if err := node.Restart(ctx); err != nil { return fmt.Errorf("failed to restart node %s: %w", node.NodeID, err) @@ -522,14 +522,14 @@ func (n *Network) Stop(ctx context.Context) error { // Restarts all non-ephemeral nodes in the network. func (n *Network) Restart(ctx context.Context) error { n.log.Info("restarting network") - if err := restartNodes(ctx, n.Nodes...); err != nil { + if err := restartNodes(ctx, n.Nodes); err != nil { return err } - return WaitForHealthyNodes(ctx, n.log, n.Nodes...) + return WaitForHealthyNodes(ctx, n.log, n.Nodes) } // Waits for the provided nodes to become healthy. -func WaitForHealthyNodes(ctx context.Context, log logging.Logger, nodes ...*Node) error { +func WaitForHealthyNodes(ctx context.Context, log logging.Logger, nodes []*Node) error { for _, node := range nodes { log.Info("waiting for node to become healthy", zap.Stringer("nodeID", node.NodeID), @@ -666,11 +666,11 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI } } - if err := restartNodes(ctx, runningNodes...); err != nil { + if err := restartNodes(ctx, runningNodes); err != nil { return err } - if err := WaitForHealthyNodes(ctx, n.log, runningNodes...); err != nil { + if err := WaitForHealthyNodes(ctx, n.log, runningNodes); err != nil { return err } } @@ -740,11 +740,11 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI } } - if err := restartNodes(ctx, nodesToRestart...); err != nil { + if err := restartNodes(ctx, nodesToRestart); err != nil { return err } - if err := WaitForHealthyNodes(ctx, log, nodesToRestart...); err != nil { + if err := WaitForHealthyNodes(ctx, log, nodesToRestart); err != nil { return err } From 02662a63c01f1e3a281869388abaa10c85fb2049 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 5 May 2025 16:19:12 +0000 Subject: [PATCH 3/3] fixup: Only restart running nodes This is intended to avoid the problem of ephemeral nodes not being running when a restart is attempted. --- tests/fixture/tmpnet/network.go | 13 ++++++++++--- tests/fixture/tmpnet/node.go | 4 ++++ tests/fixture/tmpnet/utils.go | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index 1fcfa774b39c..0538a2fe6da5 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -519,10 +519,17 @@ func (n *Network) Stop(ctx context.Context) error { return nil } -// Restarts all non-ephemeral nodes in the network. +// Restarts all running nodes in the network. func (n *Network) Restart(ctx context.Context) error { n.log.Info("restarting network") - if err := restartNodes(ctx, n.Nodes); err != nil { + nodes := make([]*Node, 0, len(n.Nodes)) + for _, node := range n.Nodes { + if !node.IsRunning() { + continue + } + nodes = append(nodes, node) + } + if err := restartNodes(ctx, nodes); err != nil { return err } return WaitForHealthyNodes(ctx, n.log, n.Nodes) @@ -661,7 +668,7 @@ func (n *Network) CreateSubnets(ctx context.Context, log logging.Logger, apiURI runningNodes := make([]*Node, 0, len(reconfiguredNodes)) for _, node := range reconfiguredNodes { - if len(node.URI) > 0 { + if node.IsRunning() { runningNodes = append(runningNodes, node) } } diff --git a/tests/fixture/tmpnet/node.go b/tests/fixture/tmpnet/node.go index c50a52c2805f..961ac80d7a7b 100644 --- a/tests/fixture/tmpnet/node.go +++ b/tests/fixture/tmpnet/node.go @@ -440,3 +440,7 @@ func (n *Node) getMonitoringLabels() map[string]string { } return labels } + +func (n *Node) IsRunning() bool { + return len(n.URI) > 0 +} diff --git a/tests/fixture/tmpnet/utils.go b/tests/fixture/tmpnet/utils.go index e02fa9f618c3..6687b01ad8fd 100644 --- a/tests/fixture/tmpnet/utils.go +++ b/tests/fixture/tmpnet/utils.go @@ -62,7 +62,7 @@ func GetNodeURIs(nodes []*Node) []NodeURI { } // Only append URIs that are not empty. A node may have an // empty URI if it is not currently running. - if len(node.URI) > 0 { + if node.IsRunning() { uris = append(uris, NodeURI{ NodeID: node.NodeID, URI: node.URI,