Commit b2af175

Fix supervisor unstuck vaults

Added an AutoBalancer callback to find potentially stuck vaults.

1 parent: a89895a

4 files changed: 212 additions & 37 deletions

cadence/contracts/FlowYieldVaultsAutoBalancers.cdc (31 additions & 7 deletions)

@@ -29,6 +29,16 @@ access(all) contract FlowYieldVaultsAutoBalancers {
     /// The path prefix used for StoragePath & PublicPath derivations
     access(all) let pathPrefix: String
 
+    /// Storage path for the shared execution callback that reports to the registry (one per account)
+    access(self) let registryReportCallbackStoragePath: StoragePath
+
+    /// Callback resource invoked by each AutoBalancer after execution; calls Registry.reportExecution with its id
+    access(all) resource RegistryReportCallback: DeFiActions.AutoBalancerExecutionCallback {
+        access(all) fun onExecuted(balancerUUID: UInt64) {
+            FlowYieldVaultsSchedulerRegistry.reportExecution(yieldVaultID: balancerUUID)
+        }
+    }
+
     /* --- PUBLIC METHODS --- */
 
     /// Returns the path (StoragePath or PublicPath) at which an AutoBalancer is stored with the associated

@@ -55,7 +65,7 @@ access(all) contract FlowYieldVaultsAutoBalancers {
         if autoBalancer == nil {
             return false
         }
-
+
         let txnIDs = autoBalancer!.getScheduledTransactionIDs()
         for txnID in txnIDs {
             if autoBalancer!.borrowScheduledTransaction(id: txnID)?.status() == FlowTransactionScheduler.Status.Scheduled {

@@ -79,24 +89,24 @@ access(all) contract FlowYieldVaultsAutoBalancers {
         if autoBalancer == nil {
             return false
         }
-
+
         // Check if yield vault has recurring config (should be executing periodically)
         let config = autoBalancer!.getRecurringConfig()
         if config == nil {
             return false // Not configured for recurring, can't be "stuck"
         }
-
+
         // Check if there's an active schedule
         if self.hasActiveSchedule(id: id) {
             return false // Has active schedule, not stuck
         }
-
+
         // Check if yield vault is overdue
         let nextExpected = autoBalancer!.calculateNextExecutionTimestampAsConfigured()
         if nextExpected == nil {
             return true // Can't calculate next time, likely stuck
         }
-
+
         // If next expected time has passed and no active schedule, yield vault is stuck
         return nextExpected! < getCurrentBlock().timestamp
     }

@@ -136,6 +146,8 @@ access(all) contract FlowYieldVaultsAutoBalancers {
         assert(!publishedCap,
             message: "Published Capability collision found when publishing AutoBalancer for UniqueIdentifier.id \(uniqueID.id) at path \(publicPath)")
 
+        let reportCap = self.account.capabilities.storage.issue<&{DeFiActions.AutoBalancerExecutionCallback}>(self.registryReportCallbackStoragePath)
+
         // create & save AutoBalancer with optional recurring config
         let autoBalancer <- DeFiActions.createAutoBalancer(
             oracle: oracle,

@@ -147,6 +159,7 @@ access(all) contract FlowYieldVaultsAutoBalancers {
             recurringConfig: recurringConfig,
             uniqueID: uniqueID
         )
+        autoBalancer.setExecutionCallback(reportCap)
         self.account.storage.save(<-autoBalancer, to: storagePath)
         let autoBalancerRef = self._borrowAutoBalancer(uniqueID.id)
 

@@ -210,7 +223,7 @@ access(all) contract FlowYieldVaultsAutoBalancers {
         let publicPath = self.deriveAutoBalancerPath(id: id, storage: false) as! PublicPath
         // unpublish the public AutoBalancer Capability
         let _ = self.account.capabilities.unpublish(publicPath)
-
+
         // Collect controller IDs first (can't modify during iteration)
         var controllersToDelete: [UInt64] = []
         self.account.capabilities.storage.forEachController(forPath: storagePath, fun(_ controller: &StorageCapabilityController): Bool {

@@ -223,13 +236,24 @@ access(all) contract FlowYieldVaultsAutoBalancers {
                 controller.delete()
             }
         }
-
+
         // load & burn the AutoBalancer (this also handles any pending scheduled transactions via burnCallback)
         let autoBalancer <- self.account.storage.load<@DeFiActions.AutoBalancer>(from: storagePath)
         Burner.burn(<-autoBalancer)
     }
 
+    access(self) fun createRegistryReportCallbackImpl(): @RegistryReportCallback {
+        return <- create RegistryReportCallback()
+    }
+
     init() {
         self.pathPrefix = "FlowYieldVaultsAutoBalancer_"
+        self.registryReportCallbackStoragePath = StoragePath(identifier: "FlowYieldVaultsRegistryReportCallback")!
+
+        // Ensure shared execution callback exists (reports this account's executions to Registry)
+        if self.account.storage.type(at: self.registryReportCallbackStoragePath) == nil {
+            self.account.storage.save(<-self.createRegistryReportCallbackImpl(), to: self.registryReportCallbackStoragePath)
+        }
+
     }
 }
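
Note on the pattern above: the contract stores one RegistryReportCallback per account and issues a fresh unpublished capability to it for each AutoBalancer, rather than creating a callback resource per vault. A minimal self-contained Cadence sketch of the same pattern follows; the Callback interface, LogCallback resource, and storage path are illustrative stand-ins, not the DeFiActions API.

access(all) contract CallbackPatternSketch {

    /// Illustrative stand-in for DeFiActions.AutoBalancerExecutionCallback
    access(all) resource interface Callback {
        access(all) fun onExecuted(balancerUUID: UInt64)
    }

    /// One shared callback resource for the whole account
    access(all) resource LogCallback: Callback {
        access(all) fun onExecuted(balancerUUID: UInt64) {
            log("executed: ".concat(balancerUUID.toString()))
        }
    }

    init() {
        let path = /storage/CallbackPatternSketch
        self.account.storage.save(<-create LogCallback(), to: path)
        // Each consumer gets its own unpublished capability to the single resource
        let cap = self.account.capabilities.storage.issue<&{Callback}>(path)
        assert(cap.check(), message: "callback capability should resolve")
    }
}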

cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc (50 additions & 10 deletions)

@@ -58,6 +58,11 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
     /// Stored as a dictionary for O(1) add/remove; iteration gives the pending set
     access(self) var pendingQueue: {UInt64: Bool}
 
+    /// Order for stuck scanning: least recently reported (executed) first.
+    /// Vaults call reportExecution() on each run (remove id from array, append to end).
+    /// Supervisor scans only the first MAX_BATCH_SIZE entries for stuck detection.
+    access(self) var stuckScanOrder: [UInt64]
+
     /* --- ACCOUNT-LEVEL FUNCTIONS --- */
 
     /// Register a YieldVault and store its handler and schedule capabilities (idempotent)

@@ -73,9 +78,27 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
         self.yieldVaultRegistry[yieldVaultID] = true
         self.handlerCaps[yieldVaultID] = handlerCap
         self.scheduleCaps[yieldVaultID] = scheduleCap
+        self.stuckScanOrder.append(yieldVaultID)
         emit YieldVaultRegistered(yieldVaultID: yieldVaultID)
     }
 
+    /// Called by the account that holds this contract (e.g. from the wrapper) on every execution. Removes yieldVaultID from stuckScanOrder (if present)
+    /// and appends it to the end so the Supervisor only scans the first N (least recently executed) vaults for stuck detection.
+    access(account) fun reportExecution(yieldVaultID: UInt64) {
+        if !(self.yieldVaultRegistry[yieldVaultID] ?? false) {
+            return
+        }
+        var i = 0
+        while i < self.stuckScanOrder.length {
+            if self.stuckScanOrder[i] == yieldVaultID {
+                self.stuckScanOrder.remove(at: i)
+                break
+            }
+            i = i + 1
+        }
+        self.stuckScanOrder.append(yieldVaultID)
+    }
+
     /// Adds a yield vault to the pending queue for seeding by the Supervisor
     access(account) fun enqueuePending(yieldVaultID: UInt64) {
         if self.yieldVaultRegistry[yieldVaultID] == true {

@@ -92,12 +115,20 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
         }
     }
 
-    /// Unregister a YieldVault (idempotent) - removes from registry, capabilities, and pending queue
+    /// Unregister a YieldVault (idempotent) - removes from registry, capabilities, pending queue, and stuckScanOrder
     access(account) fun unregister(yieldVaultID: UInt64) {
         self.yieldVaultRegistry.remove(key: yieldVaultID)
         self.handlerCaps.remove(key: yieldVaultID)
         self.scheduleCaps.remove(key: yieldVaultID)
         let pending = self.pendingQueue.remove(key: yieldVaultID)
+        var i = 0
+        while i < self.stuckScanOrder.length {
+            if self.stuckScanOrder[i] == yieldVaultID {
+                self.stuckScanOrder.remove(at: i)
+                break
+            }
+            i = i + 1
+        }
         emit YieldVaultUnregistered(yieldVaultID: yieldVaultID, wasInPendingQueue: pending != nil)
     }
 

@@ -156,19 +187,19 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
     /// Get paginated pending yield vault IDs
     /// @param page: The page number (0-indexed)
     /// @param size: The page size (defaults to MAX_BATCH_SIZE if nil)
-    access(all) view fun getPendingYieldVaultIDsPaginated(page: Int, size: Int?): [UInt64] {
-        let pageSize = size ?? self.MAX_BATCH_SIZE
+    access(all) view fun getPendingYieldVaultIDsPaginated(page: Int, size: UInt?): [UInt64] {
+        let pageSize = size ?? UInt(self.MAX_BATCH_SIZE)
         let allPending = self.pendingQueue.keys
-        let startIndex = page * pageSize
-
+        let startIndex = page * Int(pageSize)
+
         if startIndex >= allPending.length {
             return []
         }
-
-        let endIndex = startIndex + pageSize > allPending.length
-            ? allPending.length
-            : startIndex + pageSize
-
+
+        let endIndex = startIndex + Int(pageSize) > allPending.length
+            ? allPending.length
+            : startIndex + Int(pageSize)
+
         return allPending.slice(from: startIndex, upTo: endIndex)
     }
 

@@ -177,6 +208,14 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
         return self.pendingQueue.length
     }
 
+    /// Returns the first n yield vault IDs from the stuck-scan order (least recently executed first).
+    /// Supervisor should only scan these for stuck detection instead of all registered vaults.
+    /// @param limit: Maximum number of IDs to return (caller typically passes MAX_BATCH_SIZE)
+    access(all) view fun getStuckScanCandidates(limit: UInt): [UInt64] {
+        let end = limit > UInt(self.stuckScanOrder.length) ? UInt(self.stuckScanOrder.length) : limit
+        return self.stuckScanOrder.slice(from: 0, upTo: Int(end))
+    }
+
     /// Get global Supervisor capability, if set
     /// NOTE: Access restricted - only used internally by the scheduler
     access(account)

@@ -193,6 +232,7 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
         self.handlerCaps = {}
         self.scheduleCaps = {}
         self.pendingQueue = {}
+        self.stuckScanOrder = []
     }
 }
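
The stuckScanOrder bookkeeping above is a move-to-back list: reportExecution removes the id with an O(n) linear scan and appends it, so the front of the array always holds the least recently executed vaults. A standalone script-style sketch of just that ordering behavior, using hypothetical values rather than Registry state:

access(all) fun main(): [UInt64] {
    var order: [UInt64] = [1, 2, 3, 4]

    // Vault 2 reports an execution: remove it, then append it to the end
    let id: UInt64 = 2
    var i = 0
    while i < order.length {
        if order[i] == id {
            order.remove(at: i)
            break
        }
        i = i + 1
    }
    order.append(id)

    // Result: [1, 3, 4, 2] - a supervisor scanning the first MAX_BATCH_SIZE
    // entries sees the longest-idle vaults first
    return order
}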

cadence/contracts/FlowYieldVaultsSchedulerV1.cdc (4 additions & 20 deletions)

@@ -186,24 +186,8 @@ access(all) contract FlowYieldVaultsSchedulerV1 {
 
         // STEP 1: State-based detection - scan for stuck yield vaults
         if scanForStuck {
-            // TODO: add pagination - this will inevitably fails and at minimum creates inconsistent execution
-            // effort between runs
-            let registeredYieldVaults = FlowYieldVaultsSchedulerRegistry.getRegisteredYieldVaultIDs()
-            var scanned = 0
-            for yieldVaultID in registeredYieldVaults {
-                if scanned >= FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE {
-                    break
-                }
-                scanned = scanned + 1
-
-                // Skip if already in pending queue
-                // TODO: This is extremely inefficient - accessing from mapping is preferrable to iterating over
-                // an array
-                if FlowYieldVaultsSchedulerRegistry.getPendingYieldVaultIDs().contains(yieldVaultID) {
-                    continue
-                }
-
-                // Check if yield vault is stuck (has recurring config, no active schedule, overdue)
+            let candidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates(limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE))
+            for yieldVaultID in candidates {
                 if FlowYieldVaultsAutoBalancers.isStuckYieldVault(id: yieldVaultID) {
                     FlowYieldVaultsSchedulerRegistry.enqueuePending(yieldVaultID: yieldVaultID)
                     emit StuckYieldVaultDetected(yieldVaultID: yieldVaultID)

@@ -213,7 +197,7 @@ access(all) contract FlowYieldVaultsSchedulerV1 {
 
         // STEP 2: Process pending yield vaults - recover them via Schedule capability
         let pendingYieldVaults = FlowYieldVaultsSchedulerRegistry.getPendingYieldVaultIDsPaginated(page: 0, size: nil)
-
+
         for yieldVaultID in pendingYieldVaults {
             // Get Schedule capability for this yield vault
             let scheduleCap = FlowYieldVaultsSchedulerRegistry.getScheduleCap(yieldVaultID: yieldVaultID)

@@ -457,7 +441,7 @@ access(all) contract FlowYieldVaultsSchedulerV1 {
 
         // Initialize paths
         self.SupervisorStoragePath = /storage/FlowYieldVaultsSupervisor
-
+
         // Configure Supervisor at deploy time
         self.ensureSupervisorConfigured()
     }
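
Because the candidate list is capped at MAX_BATCH_SIZE, each supervisor tick now does a bounded amount of scan work no matter how many vaults are registered. Assuming the public functions introduced in this commit, a read-only monitoring script could preview what the next tick will scan; a sketch (the string imports are assumed to resolve via flow.json):

import FlowYieldVaultsSchedulerRegistry from "FlowYieldVaultsSchedulerRegistry"
import FlowYieldVaultsAutoBalancers from "FlowYieldVaultsAutoBalancers"

access(all) fun main(): [UInt64] {
    // Same candidate window the supervisor will look at on its next run
    let candidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates(
        limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE)
    )
    var stuck: [UInt64] = []
    for id in candidates {
        if FlowYieldVaultsAutoBalancers.isStuckYieldVault(id: id) {
            stuck.append(id)
        }
    }
    return stuck
}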

cadence/tests/scheduled_supervisor_test.cdc (127 additions & 0 deletions)

@@ -913,3 +913,130 @@ fun testInsufficientFundsAndRecovery() {
     log("- All ".concat(activeScheduleCount.toString()).concat(" yield vaults have active schedules"))
     log("========================================")
 }
+
+/// Supervisor batch recovery: 200 stuck vaults, no capacity-probe loop.
+///
+/// Flow: create 200 yield vaults, run 2 scheduling rounds, drain FLOW so executions fail,
+/// wait for vaults to be marked stuck, refund FLOW, schedule the supervisor, then advance
+/// time for ceil(200/MAX_BATCH_SIZE)+10 supervisor ticks. Asserts all 200 vaults are
+/// recovered (YieldVaultRecovered events), none still stuck, and all have active schedules.
+/// The +10 extra ticks are a buffer so every vault is processed despite scheduler timing.
+access(all)
+fun testSupervisorHandlesManyStuckVaults() {
+    let n = 200
+    let maxBatchSize = FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE
+
+    if snapshot != getCurrentBlockHeight() {
+        Test.reset(to: snapshot)
+    }
+
+    // 1. Setup: user, FLOW, and grant
+    let user = Test.createAccount()
+    mintFlow(to: user, amount: 100000.0)
+    grantBeta(flowYieldVaultsAccount, user)
+    mintFlow(to: flowYieldVaultsAccount, amount: 10000.0)
+
+    // 2. Create n yield vaults in batch (Test.executeTransactions)
+    var i = 0
+    let tx = Test.Transaction(
+        code: Test.readFile("../transactions/flow-yield-vaults/create_yield_vault.cdc"),
+        authorizers: [user.address],
+        signers: [user],
+        arguments: [strategyIdentifier, flowTokenIdentifier, 5.0]
+    )
+    let txs: [Test.Transaction] = []
+    while i < n {
+        txs.append(tx)
+        i = i + 1
+    }
+    let results = Test.executeTransactions(txs)
+    for result in results {
+        Test.expect(result, Test.beSucceeded())
+    }
+    log("testSupervisorHandlesManyStuckVaults: created \(n.toString()) yield vaults")
+
+    let yieldVaultIDs = getYieldVaultIDs(address: user.address)!
+    Test.assert(yieldVaultIDs.length == n, message: "expected \(n.toString()) vaults, got \(yieldVaultIDs.length.toString())")
+
+    // 3. Two scheduling rounds so vaults run once
+    setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: flowTokenIdentifier, price: 1.5)
+    setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: yieldTokenIdentifier, price: 1.2)
+    Test.moveTime(by: 60.0 * 10.0 + 10.0)
+    Test.commitBlock()
+    Test.moveTime(by: 60.0 * 10.0 + 10.0)
+    Test.commitBlock()
+
+    // 4. Drain FLOW so subsequent executions fail and vaults become stuck
+    let balanceBeforeDrain = (executeScript(
+        "../scripts/flow-yield-vaults/get_flow_balance.cdc",
+        [flowYieldVaultsAccount.address]
+    ).returnValue! as! UFix64)
+    if balanceBeforeDrain > 0.01 {
+        let drainRes = _executeTransaction(
+            "../transactions/flow-yield-vaults/drain_flow.cdc",
+            [balanceBeforeDrain - 0.001],
+            flowYieldVaultsAccount
+        )
+        Test.expect(drainRes, Test.beSucceeded())
+    }
+    log("testSupervisorHandlesManyStuckVaults: drained FLOW, waiting for vaults to be marked stuck")
+
+    // 5. Wait rounds until vaults are marked stuck
+    var waitRound = 0
+    while waitRound < 6 {
+        Test.moveTime(by: 60.0 * 10.0 + 10.0)
+        Test.commitBlock()
+        waitRound = waitRound + 1
+    }
+
+    // 6. Refund FLOW and schedule supervisor
+    mintFlow(to: flowYieldVaultsAccount, amount: 500.0)
+    Test.commitBlock()
+    Test.moveTime(by: 1.0)
+    Test.commitBlock()
+
+    let interval = 60.0 * 10.0
+    let schedSupRes = _executeTransaction(
+        "../transactions/flow-yield-vaults/admin/schedule_supervisor.cdc",
+        [interval, UInt8(1), UInt64(5000), true],
+        flowYieldVaultsAccount
+    )
+    Test.expect(schedSupRes, Test.beSucceeded())
+
+    // 7. Advance time for supervisor ticks (ceil(n/MAX_BATCH_SIZE)+10); each tick processes a batch
+    let supervisorRunsNeeded = (UInt(n) + UInt(maxBatchSize) - 1) / UInt(maxBatchSize)
+    var run = 0 as UInt
+    while run < supervisorRunsNeeded + 10 {
+        Test.moveTime(by: 60.0 * 10.0 + 10.0)
+        Test.commitBlock()
+        run = run + 1
+    }
+    log("testSupervisorHandlesManyStuckVaults: ran \((supervisorRunsNeeded + 10).toString()) supervisor ticks")
+
+    let recoveredEvents = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>())
+    Test.assert(recoveredEvents.length >= n, message: "expected at least \(n.toString()) recovered, got \(recoveredEvents.length.toString())")
+    log("testSupervisorHandlesManyStuckVaults: recovered \(recoveredEvents.length.toString()) vaults")
+
+    // 8. Health check: none stuck, all have active schedules
+    var stillStuck = 0
+    var activeCount = 0
+    for yieldVaultID in yieldVaultIDs {
+        let isStuckRes = executeScript(
+            "../scripts/flow-yield-vaults/is_stuck_yield_vault.cdc",
+            [yieldVaultID]
+        )
+        if isStuckRes.returnValue != nil && (isStuckRes.returnValue! as! Bool) {
+            stillStuck = stillStuck + 1
+        }
+        let hasActiveRes = executeScript(
+            "../scripts/flow-yield-vaults/has_active_schedule.cdc",
+            [yieldVaultID]
+        )
+        if hasActiveRes.returnValue != nil && (hasActiveRes.returnValue! as! Bool) {
+            activeCount = activeCount + 1
+        }
+    }
+    Test.assert(stillStuck == 0, message: "expected 0 stuck, got \(stillStuck.toString())")
+    Test.assert(activeCount == n, message: "expected \(n.toString()) active, got \(activeCount.toString())")
+    log("testSupervisorHandlesManyStuckVaults: all \(n.toString()) vaults healthy, active schedules: \(activeCount.toString())")
+}
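
The tick budget in step 7 uses the standard integer-ceiling identity ceil(n/b) = (n + b - 1) / b. For example, if MAX_BATCH_SIZE were 50 (an illustrative value; the real constant is defined in FlowYieldVaultsSchedulerRegistry and not shown in this diff), supervisorRunsNeeded = (200 + 50 - 1) / 50 = 249 / 50 = 4, and the loop then runs 4 + 10 = 14 supervisor ticks.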
