Skip to content

Commit ec0ffc4

Browse files
committed
x
1 parent fc04313 commit ec0ffc4

File tree

12 files changed

+138
-63
lines changed

12 files changed

+138
-63
lines changed

apps/evm/go.mod

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@ module github.com/evstack/ev-node/apps/evm
22

33
go 1.25.7
44

5-
// replace (
6-
// github.com/evstack/ev-node => ../../
7-
// github.com/evstack/ev-node/execution/evm => ../../execution/evm
8-
// )
5+
replace (
6+
github.com/evstack/ev-node => ../../
7+
github.com/evstack/ev-node/execution/evm => ../../execution/evm
8+
)
99

1010
require (
1111
github.com/ethereum/go-ethereum v1.17.2

apps/evm/go.sum

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -472,12 +472,8 @@ github.com/ethereum/go-bigmodexpfix v0.0.0-20250911101455-f9e208c548ab h1:rvv6MJ
472472
github.com/ethereum/go-bigmodexpfix v0.0.0-20250911101455-f9e208c548ab/go.mod h1:IuLm4IsPipXKF7CW5Lzf68PIbZ5yl7FFd74l/E0o9A8=
473473
github.com/ethereum/go-ethereum v1.17.2 h1:ag6geu0kn8Hv5FLKTpH+Hm2DHD+iuFtuqKxEuwUsDOI=
474474
github.com/ethereum/go-ethereum v1.17.2/go.mod h1:KHcRXfGOUfUmKg51IhQ0IowiqZ6PqZf08CMtk0g5K1o=
475-
github.com/evstack/ev-node v1.1.0-rc.1 h1:NtPuuDLqN2h4/edu5zxRlZAxmLkTG3ncXBO2PlCDvVs=
476-
github.com/evstack/ev-node v1.1.0-rc.1/go.mod h1:6rhWWzuyiqNn/erDmWCk1aLxUuQphyOGIRq56/smSyk=
477475
github.com/evstack/ev-node/core v1.0.0 h1:s0Tx0uWHme7SJn/ZNEtee4qNM8UO6PIxXnHhPbbKTz8=
478476
github.com/evstack/ev-node/core v1.0.0/go.mod h1:n2w/LhYQTPsi48m6lMj16YiIqsaQw6gxwjyJvR+B3sY=
479-
github.com/evstack/ev-node/execution/evm v1.0.0 h1:UTAdCrnPsLoGzSgsBx4Kv76jkXpMmHBIpNv3MxyzWPo=
480-
github.com/evstack/ev-node/execution/evm v1.0.0/go.mod h1:UrqkiepfTMiot6M8jnswgu3VU8SSucZpaMIHIl22/1A=
481477
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
482478
github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM=
483479
github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk=

docs/guides/raft_production.md

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Raft is configured via CLI flags or the `config.toml` file under the `[raft]` (o
3333
| `--evnode.raft.raft_addr` | `raft.raft_addr` | TCP address for Raft transport. | `0.0.0.0:5001` (Bind to private IP) |
3434
| `--evnode.raft.raft_dir` | `raft.raft_dir` | Directory for Raft data. | `/data/raft` (Must be persistent) |
3535
| `--evnode.raft.peers` | `raft.peers` | Comma-separated list of peer addresses in format `nodeID@host:port`. | `node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001` |
36-
| `--evnode.raft.bootstrap` | `raft.bootstrap` | Bootstrap the cluster. **Required** for initial setup. | `true` (See Limitations) |
36+
| `--evnode.raft.bootstrap` | `raft.bootstrap` | Compatibility flag. Startup mode is selected automatically from persisted raft configuration state. | optional |
3737

3838
### Timeout Tuning
3939

@@ -55,11 +55,15 @@ Ideally, a failover should complete within `2 * BlockTime` to minimize user impa
5555
5656
## Production Deployment Principles
5757

58-
### 1. Static Peering & Bootstrap
59-
Current implementation requires **Bootstrap Mode** (`--evnode.raft.bootstrap=true`) for all nodes participating in the cluster initialization.
60-
* **All nodes** should list the full set of peers in `--evnode.raft.peers`.
58+
### 1. Static Peering & Automatic Startup Mode
59+
Use static peering with automatic mode selection from local raft configuration:
60+
* If local raft configuration already exists in `--evnode.raft.raft_dir`, the node starts in rejoin mode.
61+
* If no local raft configuration exists yet, the node bootstraps from configured peers.
62+
* `--evnode.raft.bootstrap` is retained for compatibility but does not control mode selection.
63+
* **All configured cluster members** should list the full set of peers in `--evnode.raft.peers`.
6164
* The `peers` list format is strict: `NodeID@Host:Port`.
62-
* **Limitation**: Dynamic addition of peers (Run-time Membership Changes) via RPC/CLI is not currently exposed. The cluster membership is static based on the initial bootstrap configuration.
65+
* **Limitation**: Dynamic addition of peers (run-time membership changes) via RPC/CLI is not currently exposed.
66+
* **Not supported**: Joining an existing cluster as a brand-new node that was not part of the initial static membership.
6367

6468
### 2. Infrastructure Requirements
6569
* **Encrypted Network (CRITICAL)**: Raft traffic is **unencrypted** (plain TCP). You **MUST** run the cluster inside a private network, VPN, or encrypted mesh (e.g., WireGuard, Tailscale). **Never expose Raft ports to the public internet**; doing so allows attackers to hijack the cluster consensus.
@@ -86,13 +90,13 @@ Monitor the following metrics (propagated via Prometheus if enabled):
8690

8791
```bash
8892
./ev-node start \
89-
--node.aggregator \
90-
--raft.enable \
91-
--raft.node_id="node-1" \
92-
--raft.raft_addr="0.0.0.0:5001" \
93-
--raft.raft_dir="/var/lib/ev-node/raft" \
94-
--raft.bootstrap=true \
95-
--raft.peers="node-1@10.0.1.1:5001,node-2@10.0.1.2:5001,node-3@10.0.1.3:5001" \
96-
--p2p.listen_address="/ip4/0.0.0.0/tcp/26656" \
93+
--rollkit.node.aggregator=true \
94+
--evnode.raft.enable=true \
95+
--evnode.raft.node_id="node-1" \
96+
--evnode.raft.raft_addr="0.0.0.0:5001" \
97+
--evnode.raft.raft_dir="/var/lib/ev-node/raft" \
98+
--evnode.raft.bootstrap=true \
99+
--evnode.raft.peers="node-1@10.0.1.1:5001,node-2@10.0.1.2:5001,node-3@10.0.1.3:5001" \
100+
--rollkit.p2p.listen_address="/ip4/0.0.0.0/tcp/26656" \
97101
...other flags
98102
```

docs/learn/config.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1321,7 +1321,7 @@ _Constant:_ `FlagRaftDir`
13211321
### Raft Bootstrap
13221322

13231323
**Description:**
1324-
If true, bootstraps a new Raft cluster. Only set this on the very first node when initializing a new cluster.
1324+
Legacy compatibility flag. Startup mode is now auto-selected from persisted raft configuration state, so this flag is not used to choose bootstrap vs rejoin.
13251325

13261326
**YAML:**
13271327

@@ -1352,6 +1352,16 @@ raft:
13521352
_Default:_ `""` (empty)
13531353
_Constant:_ `FlagRaftPeers`
13541354

1355+
### Raft Startup Mode
1356+
1357+
Raft startup mode is selected automatically from local raft configuration state:
1358+
1359+
* If the node already has persisted raft configuration in `raft.raft_dir`, it starts in rejoin mode.
1360+
* If no raft configuration exists yet, it bootstraps a cluster from configured peers.
1361+
* `raft.bootstrap` is retained for compatibility but does not control mode selection.
1362+
1363+
`--evnode.raft.rejoin` has been removed.
1364+
13551365
### Raft Snap Count
13561366

13571367
**Description:**

node/failover.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,22 +183,29 @@ func setupFailoverState(
183183
}, nil
184184
}
185185

186+
// shouldStartSyncInPublisherMode avoids startup deadlock when a raft leader boots
187+
// with empty sync stores and no peer can serve height 1 yet.
186188
func (f *failoverState) shouldStartSyncInPublisherMode(ctx context.Context) bool {
187189
if !f.isAggregator || f.raftNode == nil || !f.raftNode.IsLeader() {
188190
return false
189191
}
190192

191-
height, err := f.store.Height(ctx)
193+
storeHeight, err := f.store.Height(ctx)
192194
if err != nil {
193-
f.logger.Warn().Err(err).Msg("cannot determine local height; keeping blocking sync startup")
195+
f.logger.Warn().Err(err).Msg("cannot determine store height; keeping blocking sync startup")
194196
return false
195197
}
196-
if height > 0 {
198+
headerHeight := f.headerSyncService.Store().Height()
199+
dataHeight := f.dataSyncService.Store().Height()
200+
if headerHeight > 0 || dataHeight > 0 {
197201
return false
198202
}
199203

200204
f.logger.Info().
201-
Msg("raft leader with empty store: starting sync services in publisher mode")
205+
Uint64("store_height", storeHeight).
206+
Uint64("header_height", headerHeight).
207+
Uint64("data_height", dataHeight).
208+
Msg("raft-enabled aggregator with empty sync stores: starting sync services in publisher mode")
202209
return true
203210
}
204211

pkg/config/config.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ type RaftConfig struct {
400400
NodeID string `mapstructure:"node_id" yaml:"node_id" comment:"Unique identifier for this node in the Raft cluster"`
401401
RaftAddr string `mapstructure:"raft_addr" yaml:"raft_addr" comment:"Address for Raft communication (host:port)"`
402402
RaftDir string `mapstructure:"raft_dir" yaml:"raft_dir" comment:"Directory for Raft logs and snapshots"`
403-
Bootstrap bool `mapstructure:"bootstrap" yaml:"bootstrap" comment:"Bootstrap a new Raft cluster (only for the first node)"`
403+
Bootstrap bool `mapstructure:"bootstrap" yaml:"bootstrap" comment:"Compatibility flag; startup mode is auto-selected from persisted raft configuration state"`
404404
Peers string `mapstructure:"peers" yaml:"peers" comment:"Comma-separated list of peer Raft addresses (nodeID@host:port)"`
405405
SnapCount uint64 `mapstructure:"snap_count" yaml:"snap_count" comment:"Number of log entries between snapshots"`
406406
SendTimeout time.Duration `mapstructure:"send_timeout" yaml:"send_timeout" comment:"Max duration to wait for a message to be sent to a peer"`
@@ -646,7 +646,7 @@ func AddFlags(cmd *cobra.Command) {
646646
cmd.Flags().String(FlagRaftNodeID, def.Raft.NodeID, "unique identifier for this node in the Raft cluster")
647647
cmd.Flags().String(FlagRaftAddr, def.Raft.RaftAddr, "address for Raft communication (host:port)")
648648
cmd.Flags().String(FlagRaftDir, def.Raft.RaftDir, "directory for Raft logs and snapshots")
649-
cmd.Flags().Bool(FlagRaftBootstrap, def.Raft.Bootstrap, "bootstrap a new Raft cluster (only for the first node)")
649+
cmd.Flags().Bool(FlagRaftBootstrap, def.Raft.Bootstrap, "compatibility flag; startup mode is auto-selected from persisted raft configuration state")
650650
cmd.Flags().String(FlagRaftPeers, def.Raft.Peers, "comma-separated list of peer Raft addresses (nodeID@host:port)")
651651
cmd.Flags().Uint64(FlagRaftSnapCount, def.Raft.SnapCount, "number of log entries between snapshots")
652652
cmd.Flags().Duration(FlagRaftSendTimeout, def.Raft.SendTimeout, "max duration to wait for a message to be sent to a peer")

pkg/config/config_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,18 @@ func TestAddFlags(t *testing.T) {
122122
assertFlagValue(t, flags, FlagRPCAddress, DefaultConfig().RPC.Address)
123123
assertFlagValue(t, flags, FlagRPCEnableDAVisualization, DefaultConfig().RPC.EnableDAVisualization)
124124

125+
// Raft flags
126+
assertFlagValue(t, flags, FlagRaftEnable, DefaultConfig().Raft.Enable)
127+
assertFlagValue(t, flags, FlagRaftNodeID, DefaultConfig().Raft.NodeID)
128+
assertFlagValue(t, flags, FlagRaftAddr, DefaultConfig().Raft.RaftAddr)
129+
assertFlagValue(t, flags, FlagRaftDir, DefaultConfig().Raft.RaftDir)
130+
assertFlagValue(t, flags, FlagRaftBootstrap, DefaultConfig().Raft.Bootstrap)
131+
assertFlagValue(t, flags, FlagRaftPeers, DefaultConfig().Raft.Peers)
132+
assertFlagValue(t, flags, FlagRaftSnapCount, DefaultConfig().Raft.SnapCount)
133+
assertFlagValue(t, flags, FlagRaftSendTimeout, DefaultConfig().Raft.SendTimeout)
134+
assertFlagValue(t, flags, FlagRaftHeartbeatTimeout, DefaultConfig().Raft.HeartbeatTimeout)
135+
assertFlagValue(t, flags, FlagRaftLeaderLeaseTimeout, DefaultConfig().Raft.LeaderLeaseTimeout)
136+
125137
// Pruning flags
126138
assertFlagValue(t, flags, FlagPruningMode, DefaultConfig().Pruning.Mode)
127139
assertFlagValue(t, flags, FlagPruningKeepRecent, DefaultConfig().Pruning.KeepRecent)

pkg/raft/election.go

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
6464
close(errCh)
6565
}()
6666

67+
var runnable Runnable
6768
startWorker := func(name string, workerFunc func(ctx context.Context) error) {
6869
workerCancel()
6970
wg.Wait() // Ensure previous worker is fully stopped
@@ -83,11 +84,26 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
8384
}
8485
}(workerCtx)
8586
}
87+
startFollower := func() error {
88+
var err error
89+
if runnable, err = d.followerFactory(); err != nil {
90+
return err
91+
}
92+
// avoids validating against stale raft state.
93+
if err = d.node.waitForMsgsLanded(d.node.Config().SendTimeout); err != nil {
94+
// this wait can legitimately time out
95+
d.logger.Debug().Err(err).Msg("timed out waiting for raft messages before follower verification; continuing")
96+
}
97+
if err = d.verifyState(ctx, runnable); err != nil {
98+
return err
99+
}
100+
startWorker("follower", runnable.Run)
101+
return nil
102+
}
86103
ticker := time.NewTicker(300 * time.Millisecond)
87104
defer ticker.Stop()
88105
d.running.Store(true)
89106
defer d.running.Store(false)
90-
var runnable Runnable
91107
for {
92108
select {
93109
case becameLeader := <-d.node.leaderCh():
@@ -144,15 +160,9 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
144160
} else if !becameLeader && !isCurrentlyLeader && !isStarted { // start as a follower
145161
d.logger.Info().Msg("starting follower operations")
146162
isStarted = true
147-
var err error
148-
if runnable, err = d.followerFactory(); err != nil {
163+
if err := startFollower(); err != nil {
149164
return err
150165
}
151-
152-
if err = d.verifyState(ctx, runnable); err != nil {
153-
return err
154-
}
155-
startWorker("follower", runnable.Run)
156166
}
157167
// LeaderCh fires only when leader changes not on initial election
158168
case <-ticker.C:
@@ -171,11 +181,9 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
171181

172182
d.logger.Info().Msg("starting follower operations")
173183
isStarted = true
174-
var err error
175-
if runnable, err = d.followerFactory(); err != nil {
184+
if err := startFollower(); err != nil {
176185
return err
177186
}
178-
startWorker("follower", runnable.Run)
179187
case err := <-errCh:
180188
return err
181189
case <-ctx.Done():
@@ -189,14 +197,32 @@ func (d *DynamicLeaderElection) verifyState(ctx context.Context, runnable Runnab
189197
// Verify sync state before starting follower operations
190198
raftState := d.node.GetState()
191199
if raftState == nil || raftState.Height == 0 {
192-
// Initial/empty raft state - skip recovery and let normal sync handle it.
193-
// This can happen during rolling restarts when the Raft FSM hasn't replayed logs yet.
194-
d.logger.Info().Msg("raft state at height 0, skipping recovery to allow normal sync")
195-
return nil
200+
waitTimeout := d.node.Config().SendTimeout
201+
deadline := time.NewTimer(waitTimeout)
202+
defer deadline.Stop()
203+
ticker := time.NewTicker(min(50*time.Millisecond, max(waitTimeout/4, time.Millisecond)))
204+
defer ticker.Stop()
205+
206+
for raftState == nil || raftState.Height == 0 {
207+
select {
208+
case <-ctx.Done():
209+
return ctx.Err()
210+
case <-deadline.C:
211+
d.logger.Info().Msg("raft state still at height 0 after wait; skipping recovery to allow normal sync")
212+
return nil
213+
case <-ticker.C:
214+
raftState = d.node.GetState()
215+
}
216+
}
196217
}
197218
diff, err := runnable.IsSynced(raftState)
198219
if err != nil {
199-
return err
220+
d.logger.Warn().Err(err).Uint64("raft_height", raftState.Height).Msg("sync check failed, attempting recovery from raft canonical state")
221+
if recErr := runnable.Recover(ctx, raftState); recErr != nil {
222+
return errors.Join(err, fmt.Errorf("recovery after sync-check failure: %w", recErr))
223+
}
224+
d.logger.Info().Msg("recovery successful after sync-check failure")
225+
return nil
200226
}
201227
if diff == 0 {
202228
return nil

pkg/raft/election_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
2525
m.EXPECT().leaderCh().Return((<-chan bool)(leaderCh))
2626
m.EXPECT().leaderID().Return("other")
2727
m.EXPECT().NodeID().Return("self")
28+
m.EXPECT().Config().Return(testCfg())
29+
m.EXPECT().waitForMsgsLanded(2 * time.Millisecond).Return(nil)
30+
m.EXPECT().GetState().Return(&RaftBlockState{})
2831

2932
started := make(chan struct{})
3033
follower := &testRunnable{startedCh: started}

pkg/raft/node.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,8 @@ func (n *Node) Start(_ context.Context) error {
109109
if n == nil {
110110
return nil
111111
}
112-
if !n.config.Bootstrap {
113-
// it is intended to fail fast here. at this stage only bootstrap mode is supported.
114-
return fmt.Errorf("raft cluster requires bootstrap mode")
115-
}
116-
117112
if future := n.raft.GetConfiguration(); future.Error() == nil && len(future.Configuration().Servers) > 0 {
118-
n.logger.Info().Msg("cluster already bootstrapped, skipping")
113+
n.logger.Info().Msg("raft node started with existing local state")
119114
return nil
120115
}
121116

0 commit comments

Comments
 (0)