diff --git a/cpp/src/branch_and_bound/CMakeLists.txt b/cpp/src/branch_and_bound/CMakeLists.txt index 5bb1017120..1e40c1bbf1 100644 --- a/cpp/src/branch_and_bound/CMakeLists.txt +++ b/cpp/src/branch_and_bound/CMakeLists.txt @@ -5,7 +5,6 @@ set(BRANCH_AND_BOUND_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mip_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pseudo_costs.cpp ${CMAKE_CURRENT_SOURCE_DIR}/diving_heuristics.cpp ) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index e69ff7b9a5..4418dd798f 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -6,6 +6,7 @@ /* clang-format on */ #include +#include #include #include @@ -24,6 +25,7 @@ #include #include +#include #include #include @@ -32,18 +34,13 @@ #include #include #include -#include #include #include -#include -#include #include #include -#include #include namespace cuopt::linear_programming::dual_simplex { - namespace { template @@ -258,7 +255,7 @@ branch_and_bound_t::branch_and_bound_t( incumbent_(1), root_relax_soln_(1, 1), root_crossover_soln_(1, 1), - pc_(1), + pc_(1, solver_settings), solver_status_(mip_status_t::UNSET) { exploration_stats_.start_time = start_time; @@ -299,11 +296,8 @@ branch_and_bound_t::branch_and_bound_t( template f_t branch_and_bound_t::get_lower_bound() { - f_t lower_bound = lower_bound_ceiling_.load(); - f_t heap_lower_bound = node_queue_.get_lower_bound(); - f_t worker_lower_bound = worker_pool_.get_lower_bound(); - lower_bound = std::min(heap_lower_bound, lower_bound); - lower_bound = std::min(worker_lower_bound, lower_bound); + f_t lower_bound = lower_bound_ceiling_.load(); + lower_bound = std::min(bfs_worker_pool_.get_lower_bound(), lower_bound); if (std::isfinite(lower_bound)) { return lower_bound; @@ -810,7 +804,7 @@ void branch_and_bound_t::add_feasible_solution(f_t leaf_objective, // Technische Universit¨at Berlin, Berlin, 
1999. Accessed: Aug. 08, 2025. // [Online]. Available: https://opus4.kobv.de/opus4-zib/frontdoor/index/index/docId/391 template -rounding_direction_t martin_criteria(f_t val, f_t root_val) +branch_direction_t martin_criteria(f_t val, f_t root_val) { const f_t down_val = std::floor(root_val); const f_t up_val = std::ceil(root_val); @@ -819,10 +813,10 @@ rounding_direction_t martin_criteria(f_t val, f_t root_val) constexpr f_t eps = 1e-6; if (down_dist < up_dist + eps) { - return rounding_direction_t::DOWN; + return branch_direction_t::DOWN; } else { - return rounding_direction_t::UP; + return branch_direction_t::UP; } } @@ -833,14 +827,14 @@ branch_variable_t branch_and_bound_t::variable_selection( branch_and_bound_worker_t* worker) { logger_t log; - log.log = false; - i_t branch_var = -1; - rounding_direction_t round_dir = rounding_direction_t::NONE; + log.log = false; + i_t branch_var = -1; + branch_direction_t round_dir = branch_direction_t::NONE; std::vector current_incumbent; std::vector& solution = worker->leaf_solution.x; switch (worker->search_strategy) { - case search_strategy_t::BEST_FIRST: + case BEST_FIRST: if (settings_.reliability_branching != 0) { branch_var = pc_.reliable_variable_selection(node_ptr, @@ -848,31 +842,30 @@ branch_variable_t branch_and_bound_t::variable_selection( worker, var_types_, exploration_stats_, - settings_, upper_bound_, - worker_pool_.num_idle_workers(), - log, + bfs_worker_pool_.num_idle_workers(), new_slacks_, original_lp_); } else { - branch_var = pc_.variable_selection(fractional, solution, log); + branch_var = pc_.variable_selection(fractional, solution); } round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]); return {branch_var, round_dir}; - case search_strategy_t::COEFFICIENT_DIVING: + case COEFFICIENT_DIVING: return coefficient_diving( original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log); - case search_strategy_t::LINE_SEARCH_DIVING: + case LINE_SEARCH_DIVING: return 
line_search_diving(fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::PSEUDOCOST_DIVING: + case PSEUDOCOST_DIVING: return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::GUIDED_DIVING: + case GUIDED_DIVING: + assert(incumbent_.has_incumbent); mutex_upper_.lock(); current_incumbent = incumbent_.x; mutex_upper_.unlock(); @@ -880,7 +873,7 @@ branch_variable_t branch_and_bound_t::variable_selection( default: log.debug("Unknown variable selection method: %d\n", worker->search_strategy); - return {-1, rounding_direction_t::NONE}; + return {-1, branch_direction_t::NONE}; } } @@ -907,7 +900,7 @@ struct tree_update_policy_t { const std::vector& x) = 0; virtual void on_node_completed(mip_node_t* node, node_status_t status, - rounding_direction_t dir) = 0; + branch_direction_t dir) = 0; virtual void on_numerical_issue(mip_node_t*) = 0; virtual void graphviz(search_tree_t&, mip_node_t*, const char*, f_t) = 0; virtual void on_optimal_callback(const std::vector&, f_t) = 0; @@ -952,9 +945,7 @@ struct nondeterministic_policy_t : tree_update_policy_t { const std::vector& x) override { if (worker->search_strategy == search_strategy_t::BEST_FIRST) { - logger_t pc_log; - pc_log.log = false; - node->objective_estimate = bnb.pc_.obj_estimate(fractional, x, node->lower_bound, pc_log); + node->objective_estimate = bnb.pc_.obj_estimate(fractional, x, node->lower_bound); } } @@ -986,7 +977,7 @@ struct nondeterministic_policy_t : tree_update_policy_t { } } - void on_node_completed(mip_node_t*, node_status_t, rounding_direction_t) override {} + void on_node_completed(mip_node_t*, node_status_t, branch_direction_t) override {} }; template @@ -1005,7 +996,7 @@ struct deterministic_policy_base_t : tree_update_policy_t { { if (node->branch_var < 0) return; f_t change = std::max(leaf_obj - node->lower_bound, f_t(0)); - f_t frac = node->branch_dir == rounding_direction_t::DOWN + f_t frac = node->branch_dir == 
branch_direction_t::DOWN ? node->fractional_val - std::floor(node->fractional_val) : std::ceil(node->fractional_val) - node->fractional_val; if (frac > 1e-10) { @@ -1049,13 +1040,15 @@ struct deterministic_bfs_policy_t const std::vector& fractional, const std::vector& x) override { + logger_t log; + log.log = false; node->objective_estimate = this->worker.pc_snapshot.obj_estimate(fractional, x, node->lower_bound); } void on_node_completed(mip_node_t* node, node_status_t status, - rounding_direction_t dir) override + branch_direction_t dir) override { switch (status) { case node_status_t::INFEASIBLE: this->worker.record_infeasible(node); break; @@ -1090,12 +1083,12 @@ struct deterministic_diving_policy_t : deterministic_policy_base_t> { using base = deterministic_policy_base_t>; - std::deque*>& stack; + circular_deque_t*>& stack; i_t max_backtrack_depth; deterministic_diving_policy_t(branch_and_bound_t& bnb, deterministic_diving_worker_t& worker, - std::deque*>& stack, + circular_deque_t*>& stack, i_t max_backtrack_depth) : base(bnb, worker), stack(stack), max_backtrack_depth(max_backtrack_depth) { @@ -1115,25 +1108,28 @@ struct deterministic_diving_policy_t const std::vector& fractional, const std::vector& x) override { + logger_t log; + log.log = false; + switch (this->worker.diving_type) { case search_strategy_t::PSEUDOCOST_DIVING: - return this->worker.variable_selection_from_snapshot(fractional, x); + return pseudocost_diving( + this->worker.pc_snapshot, fractional, x, *this->worker.root_solution, log); case search_strategy_t::LINE_SEARCH_DIVING: - if (this->worker.root_solution) { - logger_t log; - log.log = false; - return line_search_diving(fractional, x, *this->worker.root_solution, log); - } - return this->worker.variable_selection_from_snapshot(fractional, x); + return line_search_diving(fractional, x, *this->worker.root_solution, log); case search_strategy_t::GUIDED_DIVING: - return this->worker.guided_variable_selection(fractional, x); + if 
(this->worker.incumbent_snapshot.empty()) { + return pseudocost_diving( + this->worker.pc_snapshot, fractional, x, *this->worker.root_solution, log); + } else { + return guided_diving( + this->worker.pc_snapshot, fractional, x, this->worker.incumbent_snapshot, log); + } case search_strategy_t::COEFFICIENT_DIVING: { - logger_t log; - log.log = false; - return coefficient_diving(this->bnb.original_lp_, + return coefficient_diving(this->worker.leaf_problem, fractional, x, this->bnb.var_up_locks_, @@ -1141,7 +1137,7 @@ struct deterministic_diving_policy_t log); } - default: return this->worker.variable_selection_from_snapshot(fractional, x); + default: CUOPT_LOG_ERROR("Invalid diving method!"); return {-1, branch_direction_t::NONE}; } } @@ -1153,10 +1149,10 @@ struct deterministic_diving_policy_t void on_node_completed(mip_node_t* node, node_status_t status, - rounding_direction_t dir) override + branch_direction_t dir) override { if (status == node_status_t::HAS_CHILDREN) { - if (dir == rounding_direction_t::UP) { + if (dir == branch_direction_t::UP) { stack.push_front(node->get_down_child()); stack.push_front(node->get_up_child()); } else { @@ -1175,7 +1171,7 @@ struct deterministic_diving_policy_t template template -std::pair branch_and_bound_t::update_tree_impl( +std::pair branch_and_bound_t::update_tree_impl( mip_node_t* node_ptr, search_tree_t& search_tree, WorkerT* worker, @@ -1187,7 +1183,10 @@ std::pair branch_and_bound_t::upd lp_solution_t& leaf_solution = worker->leaf_solution; const f_t upper_bound = policy.upper_bound(); node_status_t status = node_status_t::PENDING; - rounding_direction_t round_dir = rounding_direction_t::NONE; + branch_direction_t round_dir = branch_direction_t::NONE; + + worker->recompute_basis = true; + worker->recompute_bounds = true; if (lp_status == dual::status_t::DUAL_UNBOUNDED) { node_ptr->lower_bound = inf; @@ -1243,11 +1242,13 @@ std::pair branch_and_bound_t::upd policy.select_branch_variable(node_ptr, leaf_fractional, 
leaf_solution.x); round_dir = dir; - assert(node_ptr->vstatus.size() == leaf_problem.num_cols); + assert(worker->leaf_vstatus.size() == leaf_problem.num_cols); assert(branch_var >= 0); - assert(dir != rounding_direction_t::NONE); + assert(dir != branch_direction_t::NONE); policy.update_objective_estimate(node_ptr, leaf_fractional, leaf_solution.x); + worker->recompute_basis = false; + worker->recompute_bounds = false; logger_t log; log.log = false; @@ -1255,7 +1256,7 @@ std::pair branch_and_bound_t::upd branch_var, leaf_solution.x[branch_var], num_frac, - node_ptr->vstatus, + worker->leaf_vstatus, leaf_problem, log); search_tree.update(node_ptr, node_status_t::HAS_CHILDREN); @@ -1284,7 +1285,7 @@ std::pair branch_and_bound_t::upd } template -std::pair branch_and_bound_t::update_tree( +std::pair branch_and_bound_t::update_tree( mip_node_t* node_ptr, search_tree_t& search_tree, branch_and_bound_worker_t* worker, @@ -1338,8 +1339,9 @@ dual::status_t branch_and_bound_t::solve_node_lp( } #endif - std::vector& leaf_vstatus = node_ptr->vstatus; - assert(leaf_vstatus.size() == worker->leaf_problem.num_cols); + worker->leaf_vstatus = + decompress_vstatus(node_ptr->packed_vstatus, worker->leaf_problem.num_cols); + assert(worker->leaf_vstatus.size() == worker->leaf_problem.num_cols); simplex_solver_settings_t lp_settings = settings_; lp_settings.concurrent_halt = &node_concurrent_halt_; @@ -1377,7 +1379,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( node_ptr->node_id, node_ptr->depth, node_ptr->branch_var, - node_ptr->branch_dir == rounding_direction_t::DOWN ? "DOWN" : "UP", + node_ptr->branch_dir == branch_direction_t::DOWN ? 
"DOWN" : "UP", node_ptr->fractional_val, node_ptr->branch_var_lower, node_ptr->branch_var_upper, @@ -1398,7 +1400,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( lp_start_time, worker->leaf_problem, lp_settings, - leaf_vstatus, + worker->leaf_vstatus, worker->basis_factors, worker->basic_list, worker->nonbasic_list, @@ -1415,7 +1417,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( worker->basis_factors, worker->basic_list, worker->nonbasic_list, - leaf_vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms); lp_status = convert_lp_status_to_dual_status(second_status); @@ -1431,11 +1433,19 @@ dual::status_t branch_and_bound_t::solve_node_lp( return lp_status; } + template -void branch_and_bound_t::plunge_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::plunge_with(bfs_worker_t* worker, + mip_node_t* start_node) { - std::deque*> stack; - stack.push_front(worker->start_node); + assert(worker != nullptr && worker->is_active.load()); + assert(start_node != nullptr); + + // Stack holds at most 2 entries: the preferred child + its sibling. + // The sibling is evicted to the queue before a new pair of children is added. 
+ circular_deque_t*> stack(4); + stack.push_front(start_node); + worker->recompute_basis = true; worker->recompute_bounds = true; @@ -1446,6 +1456,36 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t 0 && (solver_status_ == mip_status_t::UNSET && is_running_) && rel_gap > settings_.relative_mip_gap_tol && abs_gap > settings_.absolute_mip_gap_tol) { + if (worker->worker_id == 0) { repair_heuristic_solutions(); } + + // Launch a new diving task if any worker is idle + if (worker->total_active_diving_workers < worker->total_max_diving_workers && + worker->node_queue.diving_queue_size() > 0) { + launch_diving_worker(worker); + } + + // If any best-first worker become idle, + if (bfs_worker_pool_.num_idle_workers() > 0 && worker->node_queue.best_first_queue_size() > 0) { + worker->node_queue.lock(); + mip_node_t* node = worker->node_queue.pop_best_first(); + + // We need to temporarily save the lower bound in this worker so it is + // considered when calculating the global lower bound. + f_t node_lower_bound = node ? 
node->lower_bound : std::numeric_limits::infinity(); + worker->lower_bound = std::min(worker->lower_bound.load(), node_lower_bound); + + worker->node_queue.unlock(); + + if (node != nullptr) { + if (!launch_bfs_worker({node})) { + worker->node_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); + } + } + } + + assert(stack.size() <= 2); mip_node_t* node_ptr = stack.front(); stack.pop_front(); @@ -1455,7 +1495,7 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_tlower_bound = node_ptr->lower_bound; + worker->lower_bound = std::min(worker->node_queue.get_lower_bound(), node_ptr->lower_bound); if (node_ptr->lower_bound > upper_bound_.load()) { search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound); @@ -1466,13 +1506,31 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t settings_.time_limit) { + f_t now = toc(exploration_stats_.start_time); + + if (worker->worker_id == 0) { + f_t time_since_last_log = + exploration_stats_.last_log == 0 ? 
1.0 : toc(exploration_stats_.last_log); + i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; + + if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && + time_since_last_log >= 1) || + (time_since_last_log > 30) || now > settings_.time_limit) { + report(' ', upper_bound_, lower_bound, node_ptr->depth, node_ptr->integer_infeasible); + exploration_stats_.last_log = tic(); + exploration_stats_.nodes_since_last_log = 0; + } + } + + if (now > settings_.time_limit) { solver_status_ = mip_status_t::TIME_LIMIT; + stack.push_front(node_ptr); break; } if (exploration_stats_.nodes_explored >= settings_.node_limit) { solver_status_ = mip_status_t::NODE_LIMIT; + stack.push_front(node_ptr); break; } @@ -1480,11 +1538,17 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t::plunge_with(branch_and_bound_worker_t 0) { mip_node_t* node = stack.back(); stack.pop_back(); - node_queue_.push(node); + worker->node_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); } exploration_stats_.nodes_unexplored += 2; - if (round_dir == rounding_direction_t::UP) { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_down_child()); + if (round_dir == branch_direction_t::UP) { + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.lock(); + worker->node_queue.push(node_ptr->get_down_child()); + worker->node_queue.unlock(); } else { stack.push_front(node_ptr->get_down_child()); } stack.push_front(node_ptr->get_up_child()); } else { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_up_child()); + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.lock(); + worker->node_queue.push(node_ptr->get_up_child()); + worker->node_queue.unlock(); } else { stack.push_front(node_ptr->get_up_child()); } @@ -1536,31 +1606,115 @@ void 
branch_and_bound_t::plunge_with(branch_and_bound_worker_tnode_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); + } +} - if (stack.size() > 0 && - (rel_gap <= settings_.relative_mip_gap_tol || abs_gap <= settings_.absolute_mip_gap_tol)) { - // If the solver converged according to the gap rules, but we still have nodes to explore - // in the stack, then we should add all the pending nodes back to the heap so the lower - // bound of the solver is set to the correct value. - while (!stack.empty()) { - auto node = stack.front(); - stack.pop_front(); - node_queue_.push(node); - } +template +bfs_worker_t* branch_and_bound_t::launch_bfs_worker( + const std::vector*>& start_nodes) +{ + // Take an idle node from the pool + bfs_worker_t* idle_worker = bfs_worker_pool_.pop_idle_worker(); + if (!idle_worker) { return nullptr; } + + if (toc(exploration_stats_.start_time) > settings_.time_limit || + solver_status_ != mip_status_t::UNSET) { + bfs_worker_pool_.return_worker_to_pool(idle_worker); + return nullptr; } - if (settings_.num_threads > 1) { - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[BEST_FIRST]--; + idle_worker->init(start_nodes); + idle_worker->set_active(); + +#pragma omp task affinity(*idle_worker) priority(99) + best_first_search_with(idle_worker); + + return idle_worker; +} + +template +void branch_and_bound_t::best_first_search_with(bfs_worker_t* worker) +{ + f_t lower_bound = get_lower_bound(); + f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t steal_chance = settings_.bnb_steal_chance >= 0 ? 
settings_.bnb_steal_chance : 0.05; + + worker->calculate_num_diving_workers(bfs_worker_pool_.num_workers(), + diving_worker_pool_.num_workers(), + has_solver_space_incumbent(), + settings_.diving_settings); + + while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && + rel_gap > settings_.relative_mip_gap_tol && + worker->node_queue.best_first_queue_size() > 0) { + // If the guided diving was disabled previously due to the lack of an incumbent solution, + // re-enable as soon as a new incumbent is found. + if (diving_worker_pool_.num_workers() > 0 && settings_.diving_settings.guided_diving != 0 && + worker->max_diving_workers[GUIDED_DIVING] == 0) { + if (has_solver_space_incumbent()) { + worker->calculate_num_diving_workers(bfs_worker_pool_.num_workers(), + diving_worker_pool_.num_workers(), + has_solver_space_incumbent(), + settings_.diving_settings); + } + } + + worker->node_queue.lock(); + mip_node_t* start_node = worker->node_queue.pop_best_first(); + if (!start_node) { + worker->node_queue.unlock(); + continue; + } + + worker->lower_bound = start_node->lower_bound; + worker->node_queue.unlock(); + + if (upper_bound_.load() < start_node->lower_bound) { + // This node was put on the heap earlier but its lower bound is now greater than the + // current upper bound + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); + continue; + } + + plunge_with(worker, start_node); + + lower_bound = get_lower_bound(); + abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + + if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { + node_concurrent_halt_ = 1; + solver_status_ = mip_status_t::OPTIMAL; + break; + } + + // Steal a node with some probability or when it is empty. 
The victim is determined at random. + if (worker->node_queue.best_first_queue_size() == 0 || + worker->rng.next_double() < steal_chance) { + for (i_t i = 0; i < settings_.bnb_max_steal_attempts; ++i) { + i_t k = worker->rng.uniform(0, bfs_worker_pool_.num_workers()); + if (worker->steal_node_from(bfs_worker_pool_[k], settings_.bnb_nodes_per_steal)) { break; } + } + } } + + worker->set_inactive(); + bfs_worker_pool_.return_worker_to_pool(worker); } template -void branch_and_bound_t::dive_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::dive_with(diving_worker_t* worker) { raft::common::nvtx::range scope("BB::diving_thread"); logger_t log; @@ -1573,8 +1727,8 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t worker->recompute_basis = true; worker->recompute_bounds = true; - search_tree_t dive_tree(std::move(*worker->start_node)); - std::deque*> stack; + search_tree_t dive_tree(std::move(worker->start_node)); + circular_deque_t*> stack(2 * diving_backtrack_limit + 4); stack.push_front(&dive_tree.root); branch_and_bound_stats_t dive_stats; @@ -1609,11 +1763,9 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t if (lp_status == dual::status_t::TIME_LIMIT) { solver_status_ = mip_status_t::TIME_LIMIT; break; - } else if (lp_status == dual::status_t::CONCURRENT_LIMIT) { - break; - } else if (lp_status == dual::status_t::ITERATION_LIMIT) { - break; } + if (lp_status == dual::status_t::CONCURRENT_LIMIT) { break; } + if (lp_status == dual::status_t::ITERATION_LIMIT) { break; } ++dive_stats.nodes_explored; @@ -1623,7 +1775,7 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t worker->recompute_bounds = node_status != node_status_t::HAS_CHILDREN; if (node_status == node_status_t::HAS_CHILDREN) { - if (round_dir == rounding_direction_t::UP) { + if (round_dir == branch_direction_t::UP) { stack.push_front(node_ptr->get_down_child()); stack.push_front(node_ptr->get_up_child()); } else { @@ -1632,7 +1784,7 @@ void 
branch_and_bound_t::dive_with(branch_and_bound_worker_t } } - // Remove nodes that we no longer can backtrack to (i.e., from the current node, we can only + // Remove nodes that we can no longer backtrack to (i.e., from the current node, we can only // backtrack to a node that is has a depth of at most 5 levels lower than the current node). if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > diving_backtrack_limit) { stack.pop_back(); @@ -1644,219 +1796,80 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound, lower_bound); } - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[search_strategy]--; + worker->set_inactive(); + diving_worker_pool_.return_worker_to_pool(worker); } template -void branch_and_bound_t::run_scheduler() +bool branch_and_bound_t::launch_diving_worker(bfs_worker_t* bfs_worker) { - diving_heuristics_settings_t diving_settings = settings_.diving_settings; - const i_t num_workers = 2 * settings_.num_threads; - - if (!has_solver_space_incumbent()) { diving_settings.guided_diving = false; } - std::vector strategies = get_search_strategies(diving_settings); - std::array max_num_workers_per_type = - get_max_workers(num_workers, strategies); + // Get an idle worker. 
+ diving_worker_t* diving_worker = diving_worker_pool_.pop_idle_worker(); + if (diving_worker == nullptr) { return false; } - worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, settings_); - active_workers_per_strategy_.fill(0); + bfs_worker->node_queue.lock(); + mip_node_t* start_node = bfs_worker->node_queue.pop_diving(); -#ifdef CUOPT_LOG_DEBUG - for (auto strategy : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(strategy), - strategy, - max_num_workers_per_type[strategy]); + if (!start_node || upper_bound_.load() < start_node->lower_bound || + start_node->depth < settings_.diving_settings.min_node_depth) { + bfs_worker->node_queue.unlock(); + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } -#endif - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - i_t last_node_depth = 0; - i_t last_int_infeas = 0; - - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && - (active_workers_per_strategy_[0] > 0 || node_queue_.best_first_queue_size() > 0)) { - bool launched_any_task = false; - - repair_heuristic_solutions(); - - // If the guided diving was disabled previously due to the lack of an incumbent solution, - // re-enable as soon as a new incumbent is found. 
- if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) { - if (has_solver_space_incumbent()) { - diving_settings.guided_diving = settings_.diving_settings.guided_diving; - strategies = get_search_strategies(diving_settings); - max_num_workers_per_type = get_max_workers(num_workers, strategies); - -#ifdef CUOPT_LOG_DEBUG - for (auto type : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(type), - type, - max_num_workers_per_type[type]); - } -#endif - } - } - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t queue_size = node_queue_.best_first_queue_size(); - i_t depth = queue_size > 0 ? node_queue_.bfs_top()->depth : last_node_depth; - i_t int_infeas = queue_size > 0 ? node_queue_.bfs_top()->integer_infeasible : last_int_infeas; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } - - for (auto strategy : strategies) { - if (active_workers_per_strategy_[strategy] >= max_num_workers_per_type[strategy]) { - continue; - } - - // Get an idle worker. - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - if (worker == nullptr) { break; } - - if (strategy == BEST_FIRST) { - // If there any node left in the heap, we pop the top node and explore it. 
- std::optional*> start_node = node_queue_.pop_best_first(); - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } + diving_worker->init(start_node, original_lp_); + bfs_worker->node_queue.unlock(); - // Remove the worker from the idle list. - worker_pool_.pop_idle_worker(); - worker->init_best_first(start_node.value(), original_lp_); - last_node_depth = start_node.value()->depth; - last_int_infeas = start_node.value()->integer_infeasible; - active_workers_per_strategy_[strategy]++; - launched_any_task = true; + bool is_feasible = diving_worker->presolve_start_bounds(settings_); + if (!is_feasible) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } -#pragma omp task affinity(worker) - plunge_with(worker); + if (toc(exploration_stats_.start_time) > settings_.time_limit || + solver_status_ != mip_status_t::UNSET) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } - } else { - std::optional*> start_node = node_queue_.pop_diving(); + for (int i = 1; i < num_search_strategies; ++i) { + auto strategy = search_strategies[i]; - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound || - start_node.value()->depth < diving_settings.min_node_depth) { - continue; - } + if (bfs_worker->active_diving_workers[strategy] < bfs_worker->max_diving_workers[strategy]) { + diving_worker->search_strategy = strategy; + diving_worker->bfs_worker = bfs_worker; + diving_worker->set_active(); + bfs_worker->active_diving_workers[strategy]++; + bfs_worker->total_active_diving_workers++; - bool is_feasible = - 
worker->init_diving(start_node.value(), strategy, original_lp_, settings_); - if (!is_feasible) { continue; } + assert(bfs_worker->active_diving_workers[strategy].load() <= + bfs_worker->max_diving_workers[strategy]); + assert(bfs_worker->total_active_diving_workers.load() <= + bfs_worker->total_max_diving_workers); - // Remove the worker from the idle list. - worker_pool_.pop_idle_worker(); - active_workers_per_strategy_[strategy]++; - launched_any_task = true; +#pragma omp task affinity(*diving_worker) + dive_with(diving_worker); -#pragma omp task affinity(worker) - dive_with(worker); - } + return true; } - - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - node_concurrent_halt_ = 1; - solver_status_ = mip_status_t::OPTIMAL; - break; - } - - // If no new task was launched in this iteration, suspend temporarily the - // execution of the scheduler. As of 8/Jan/2026, GCC does not - // implement taskyield, but LLVM does. 
- if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } + + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } template void branch_and_bound_t::single_threaded_solve() { - worker_pool_.init(1, original_lp_, Arow_, var_types_, settings_); - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) { - repair_heuristic_solutions(); - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t depth = node_queue_.bfs_top()->depth; - i_t int_infeas = node_queue_.bfs_top()->integer_infeasible; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } - - // If there any node left in the heap, we pop the top node and explore it. 
- std::optional*> start_node = node_queue_.pop_best_first(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } - - worker->init_best_first(start_node.value(), original_lp_); - plunge_with(worker); - - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - solver_status_ = mip_status_t::OPTIMAL; - break; - } - } + bfs_worker_pool_.init(1, original_lp_, Arow_, var_types_, settings_); + bfs_worker_t* worker = bfs_worker_pool_.pop_idle_worker(); + + node_queue_t& node_queue = worker->node_queue; + node_queue.push(search_tree_.root.get_down_child()); + node_queue.push(search_tree_.root.get_up_child()); + worker->lower_bound = worker->node_queue.get_lower_bound(); + worker->set_active(); + best_first_search_with(worker); } template @@ -1894,7 +1907,6 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( while (!root_crossover_solution_set_.load(std::memory_order_acquire) && *get_root_concurrent_halt() == 0) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); - continue; } if (root_crossover_solution_set_.load(std::memory_order_acquire)) { @@ -2507,7 +2519,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_); pc_.resize(original_lp_.num_cols); - original_lp_.A.transpose(pc_.AT); + original_lp_.A.transpose(*pc_.AT); { raft::common::nvtx::range scope_sb("BB::strong_branching"); 
strong_branching(original_lp_, @@ -2578,7 +2590,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } // Choose variable to branch on - i_t branch_var = pc_.variable_selection(fractional, root_relax_soln_.x, log); + i_t branch_var = pc_.variable_selection(fractional, root_relax_soln_.x); search_tree_.root = std::move(mip_node_t(root_objective_, root_vstatus_)); search_tree_.num_nodes = 0; @@ -2590,8 +2602,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut root_vstatus_, original_lp_, log); - node_queue_.push(search_tree_.root.get_down_child()); - node_queue_.push(search_tree_.root.get_up_child()); settings_.log.printf("Exploring the B&B tree using %d threads\n\n", settings_.num_threads); node_concurrent_halt_ = 0; @@ -2600,7 +2610,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.nodes_unexplored = 2; exploration_stats_.nodes_since_last_log = 0; exploration_stats_.last_log = tic(); - min_node_queue_size_ = 2 * settings_.num_threads; + min_node_queue_size_ = 20; if (settings_.diving_settings.coefficient_diving != 0) { calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); @@ -2618,10 +2628,24 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (settings_.deterministic) { run_deterministic_coordinator(Arow_); } else if (settings_.num_threads > 1) { + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(settings_.num_threads / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; + bfs_worker_pool_.init(num_bfs_workers, original_lp_, Arow_, var_types_, settings_); + diving_worker_pool_.init( + num_diving_workers, original_lp_, Arow_, var_types_, settings_, num_bfs_workers); + #pragma omp parallel num_threads(settings_.num_threads) { #pragma omp master - run_scheduler(); + { + if (num_bfs_workers > 1) { + launch_bfs_worker({search_tree_.root.get_up_child()}); + launch_bfs_worker({search_tree_.root.get_down_child()}); + } else 
{ + launch_bfs_worker({search_tree_.root.get_up_child(), search_tree_.root.get_down_child()}); + } + } } } else { single_threaded_solve(); @@ -2635,31 +2659,33 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut lower_bound = deterministic_compute_lower_bound(); solver_status_ = deterministic_global_termination_status_; } else { - if (node_queue_.best_first_queue_size() > 0) { + lower_bound = std::numeric_limits::infinity(); + + for (int i = 0; i < bfs_worker_pool_.num_workers(); ++i) { + bfs_worker_t* worker = bfs_worker_pool_[i]; + // We need to clear the queue and use the info in the search tree for the lower bound - while (node_queue_.best_first_queue_size() > 0) { - std::optional*> start_node = node_queue_.pop_best_first(); + while (worker->node_queue.best_first_queue_size() > 0) { + mip_node_t* start_node = worker->node_queue.pop_best_first(); - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { + if (!start_node) { continue; } + if (upper_bound_.load() < start_node->lower_bound) { // This node was put on the heap earlier but its lower bound is now greater than the // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); } else { - node_queue_.push( - start_node.value()); // Needed to ensure we don't lose the correct lower bound + // Needed to ensure we don't lose the correct lower bound + worker->node_queue.push(start_node); + lower_bound = std::min(lower_bound, worker->node_queue.get_lower_bound()); break; } } - lower_bound = node_queue_.best_first_queue_size() > 0 ? 
node_queue_.get_lower_bound() - : search_tree_.root.lower_bound; - } else { - lower_bound = search_tree_.root.lower_bound; } + + if (!std::isfinite(lower_bound)) { lower_bound = search_tree_.root.lower_bound; } } + set_final_solution(solution, lower_bound); return solver_status_; } @@ -2788,17 +2814,9 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri deterministic_horizon_step_ = 0.50; // Compute worker counts using the same formula as reliability-branching scheduler - const i_t num_workers = 2 * settings_.num_threads; - std::vector search_strategies = - get_search_strategies(settings_.diving_settings); - std::array max_num_workers = - get_max_workers(num_workers, search_strategies); - - const int num_bfs_workers = max_num_workers[search_strategy_t::BEST_FIRST]; - int num_diving_workers = 0; - for (size_t i = 1; i < search_strategies.size(); ++i) { - num_diving_workers += max_num_workers[search_strategies[i]]; - } + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(num_workers / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; deterministic_mode_enabled_ = true; deterministic_current_horizon_ = deterministic_horizon_step_; @@ -3197,10 +3215,10 @@ node_status_t branch_and_bound_t::solve_node_deterministic( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + worker.leaf_vstatus = decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols); + i_t node_iter = 0; + f_t lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, @@ -3208,7 +3226,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, 
worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -3226,7 +3244,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); @@ -3322,11 +3340,12 @@ template void branch_and_bound_t::deterministic_broadcast_snapshots( PoolT& pool, const std::vector& incumbent_snapshot) { - deterministic_snapshot_t snap; - snap.upper_bound = upper_bound_.load(); - snap.total_lp_iters = exploration_stats_.total_lp_iters.load(); - snap.incumbent = incumbent_snapshot; - snap.pc_snapshot = pc_.create_snapshot(); + deterministic_snapshot_t snap{ + .upper_bound = upper_bound_, + .pc_snapshot = pc_, + .incumbent = incumbent_snapshot, + .total_lp_iters = exploration_stats_.total_lp_iters, + }; for (auto& worker : pool) { worker.set_snapshots(snap); @@ -3683,8 +3702,10 @@ void branch_and_bound_t::deterministic_assign_diving_nodes() continue; // this worker is full, try next one } - auto entry = diving_heap_.pop(); - if (entry.has_value()) { worker.enqueue_dive_node(entry.value().node, original_lp_); } + if (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + worker.enqueue_dive_node(entry.node, original_lp_); + } } diving_heap_.clear(); @@ -3734,11 +3755,6 @@ void branch_and_bound_t::deterministic_dive( { raft::common::nvtx::range scope("BB::deterministic_dive"); - // Create local search tree for the dive - search_tree_t dive_tree(std::move(entry.node)); - std::deque*> stack; - stack.push_front(&dive_tree.root); - worker.dive_lower = std::move(entry.resolved_lower); worker.dive_upper = std::move(entry.resolved_upper); @@ -3748,6 +3764,11 @@ void branch_and_bound_t::deterministic_dive( worker.lp_iters_this_dive = 0; worker.recompute_bounds_and_basis = true; + // Create local search tree for the dive + search_tree_t dive_tree(std::move(entry.node)); + 
circular_deque_t*> stack(2 * max_backtrack_depth + 4); + stack.push_front(&dive_tree.root); + while (!stack.empty() && deterministic_global_termination_status_ == mip_status_t::UNSET && nodes_this_dive < max_nodes_per_dive) { mip_node_t* node_ptr = stack.front(); @@ -3808,18 +3829,19 @@ void branch_and_bound_t::deterministic_dive( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + i_t node_iter = 0; + f_t lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; + worker.leaf_vstatus = + decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols); dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, worker.recompute_bounds_and_basis, lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -3836,7 +3858,7 @@ void branch_and_bound_t::deterministic_dive( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp index f2917ba930..254305f574 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.hpp +++ b/cpp/src/branch_and_bound/branch_and_bound.hpp @@ -8,12 +8,12 @@ #pragma once #include -#include #include -#include #include #include #include +#include +#include #include @@ -235,18 +235,14 @@ class branch_and_bound_t { // Pseudocosts pseudo_costs_t pc_; - // Heap storing the nodes waiting to be explored. - node_queue_t node_queue_; - // Search tree search_tree_t search_tree_; - // Count the number of workers per type that either are being executed or - // are waiting to be executed. 
- std::array, num_search_strategies> active_workers_per_strategy_; + // Worker pool dedicated to the best-first search + bfs_worker_pool_t bfs_worker_pool_; - // Worker pool - branch_and_bound_worker_pool_t worker_pool_; + // Worker pool dedicated to diving + diving_worker_pool_t diving_worker_pool_; // Global status of the solver. omp_atomic_t solver_status_; @@ -288,21 +284,26 @@ class branch_and_bound_t { // Repairs low-quality solutions from the heuristics, if it is applicable. void repair_heuristic_solutions(); + // Launch a new bfs worker initialized from the `start_node`. + bfs_worker_t* launch_bfs_worker(const std::vector*>&); + // Launch a new diving worker from a given bfs worker. The dive will start + // from the node at the top of the local heap. + bool launch_diving_worker(bfs_worker_t* bfs_worker); + + // Perform best-first search with a given bfs worker. + void best_first_search_with(bfs_worker_t* worker); + // We use best-first to pick the `start_node` and then perform a depth-first search // from this node (i.e., a plunge). It can only backtrack to a sibling node. // Unexplored nodes in the subtree are inserted back into the global heap. - void plunge_with(branch_and_bound_worker_t* worker); + void plunge_with(bfs_worker_t* worker, mip_node_t* start_node); // Perform a deep dive in the subtree determined by the `start_node` in order // to find integer feasible solutions. - void dive_with(branch_and_bound_worker_t* worker); - - // Run the scheduler whose will schedule and manage - // all the other workers. - void run_scheduler(); + void dive_with(diving_worker_t* worker); // Run the branch-and-bound algorithm in single threaded mode. - // This disable all diving heuristics. + // This disables all diving heuristics. void single_threaded_solve(); // Solve the LP relaxation of a leaf node @@ -318,7 +319,7 @@ class branch_and_bound_t { // Policy-based tree update shared between opportunistic and deterministic codepaths. 
template - std::pair update_tree_impl( + std::pair update_tree_impl( mip_node_t* node_ptr, search_tree_t& search_tree, WorkerT* worker, @@ -326,7 +327,7 @@ class branch_and_bound_t { Policy& policy); // Opportunistic tree update wrapper. - std::pair update_tree( + std::pair update_tree( mip_node_t* node_ptr, search_tree_t& search_tree, branch_and_bound_worker_t* worker, diff --git a/cpp/src/branch_and_bound/branch_and_bound_worker.hpp b/cpp/src/branch_and_bound/branch_and_bound_worker.hpp deleted file mode 100644 index 4de2b43cae..0000000000 --- a/cpp/src/branch_and_bound/branch_and_bound_worker.hpp +++ /dev/null @@ -1,281 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#pragma once - -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include - -namespace cuopt::linear_programming::dual_simplex { - -constexpr int num_search_strategies = 5; - -// Indicate the search and variable selection algorithms used by each thread -// in B&B (See [1]). -// -// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin, -// Berlin, 2007. doi: 10.14279/depositonce-1634. -enum search_strategy_t : int { - BEST_FIRST = 0, // Best-First + Plunging. - PSEUDOCOST_DIVING = 1, // Pseudocost diving (9.2.5) - LINE_SEARCH_DIVING = 2, // Line search diving (9.2.4) - GUIDED_DIVING = 3, // Guided diving (9.2.3). 
- COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) -}; - -template -struct branch_and_bound_stats_t { - f_t start_time = 0.0; - omp_atomic_t total_lp_solve_time = 0.0; - omp_atomic_t nodes_explored = 0; - omp_atomic_t nodes_unexplored = 0; - omp_atomic_t total_lp_iters = 0; - omp_atomic_t nodes_since_last_log = 0; - omp_atomic_t last_log = 0.0; -}; - -template -class branch_and_bound_worker_t { - public: - const i_t worker_id; - omp_atomic_t search_strategy; - omp_atomic_t is_active; - omp_atomic_t lower_bound; - - lp_problem_t leaf_problem; - lp_solution_t leaf_solution; - std::vector leaf_edge_norms; - - basis_update_mpf_t basis_factors; - std::vector basic_list; - std::vector nonbasic_list; - - bounds_strengthening_t node_presolver; - std::vector bounds_changed; - - std::vector start_lower; - std::vector start_upper; - mip_node_t* start_node; - - pcgenerator_t rng; - - bool recompute_basis = true; - bool recompute_bounds = true; - - branch_and_bound_worker_t(i_t worker_id, - const lp_problem_t& original_lp, - const csr_matrix_t& Arow, - const std::vector& var_type, - const simplex_solver_settings_t& settings) - : worker_id(worker_id), - search_strategy(BEST_FIRST), - is_active(false), - lower_bound(-std::numeric_limits::infinity()), - leaf_problem(original_lp), - leaf_solution(original_lp.num_rows, original_lp.num_cols), - basis_factors(original_lp.num_rows, settings.refactor_frequency), - basic_list(original_lp.num_rows), - nonbasic_list(), - node_presolver(leaf_problem, Arow, {}, var_type), - bounds_changed(original_lp.num_cols, false), - rng(settings.random_seed + pcgenerator_t::default_seed + worker_id, - pcgenerator_t::default_stream ^ worker_id) - { - } - - // Set the `start_node` for best-first search. 
- void init_best_first(mip_node_t* node, const lp_problem_t& original_lp) - { - start_node = node; - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = BEST_FIRST; - lower_bound = node->lower_bound; - is_active = true; - } - - // Initialize the worker for diving, setting the `start_node`, `start_lower` and - // `start_upper`. Returns `true` if the starting node is feasible via - // bounds propagation. - bool init_diving(mip_node_t* node, - search_strategy_t type, - const lp_problem_t& original_lp, - const simplex_solver_settings_t& settings) - { - internal_node = node->detach_copy(); - start_node = &internal_node; - - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = type; - lower_bound = node->lower_bound; - is_active = true; - - std::fill(bounds_changed.begin(), bounds_changed.end(), false); - node->get_variable_bounds(start_lower, start_upper, bounds_changed); - return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper); - } - - // Set the variables bounds for the LP relaxation of the current node. - bool set_lp_variable_bounds(mip_node_t* node_ptr, - const simplex_solver_settings_t& settings) - { - // Reset the bound_changed markers - std::fill(bounds_changed.begin(), bounds_changed.end(), false); - - // Set the correct bounds for the leaf problem - if (recompute_bounds) { - leaf_problem.lower = start_lower; - leaf_problem.upper = start_upper; - node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed); - - } else { - node_ptr->update_branched_variable_bounds( - leaf_problem.lower, leaf_problem.upper, bounds_changed); - } - - return node_presolver.bounds_strengthening( - settings, bounds_changed, leaf_problem.lower, leaf_problem.upper); - } - - private: - // For diving, we need to store the full node instead of - // of just a pointer, since it is not stored in the tree anymore. 
- // To keep the same interface across all worker types, - // this will be used as a temporary storage and - // will be pointed by `start_node`. - // For exploration, this will not be used. - mip_node_t internal_node; -}; - -template -class branch_and_bound_worker_pool_t { - public: - void init(i_t num_workers, - const lp_problem_t& original_lp, - const csr_matrix_t& Arow, - const std::vector& var_type, - const simplex_solver_settings_t& settings) - { - workers_.resize(num_workers); - num_idle_workers_ = num_workers; - for (i_t i = 0; i < num_workers; ++i) { - workers_[i] = std::make_unique>( - i, original_lp, Arow, var_type, settings); - idle_workers_.push_front(i); - } - - is_initialized = true; - } - - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. - branch_and_bound_worker_t* get_idle_worker() - { - std::lock_guard lock(mutex_); - if (idle_workers_.empty()) { - return nullptr; - } else { - i_t idx = idle_workers_.front(); - return workers_[idx].get(); - } - } - - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. 
- void pop_idle_worker() - { - std::lock_guard lock(mutex_); - if (!idle_workers_.empty()) { - idle_workers_.pop_front(); - num_idle_workers_--; - } - } - - void return_worker_to_pool(branch_and_bound_worker_t* worker) - { - worker->is_active = false; - std::lock_guard lock(mutex_); - idle_workers_.push_back(worker->worker_id); - num_idle_workers_++; - } - - f_t get_lower_bound() - { - f_t lower_bound = std::numeric_limits::infinity(); - - if (is_initialized) { - for (i_t i = 0; i < workers_.size(); ++i) { - if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) { - lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); - } - } - } - - return lower_bound; - } - - i_t num_idle_workers() { return num_idle_workers_; } - - private: - // Worker pool - std::vector>> workers_; - bool is_initialized = false; - - omp_mutex_t mutex_; - std::deque idle_workers_; - omp_atomic_t num_idle_workers_; -}; - -template -std::vector get_search_strategies( - diving_heuristics_settings_t settings) -{ - std::vector types; - types.reserve(num_search_strategies); - types.push_back(BEST_FIRST); - if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); } - if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); } - if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); } - if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); } - return types; -} - -template -std::array get_max_workers( - i_t num_workers, const std::vector& strategies) -{ - std::array max_num_workers; - max_num_workers.fill(0); - - i_t bfs_workers = std::max(strategies.size() == 1 ? 
num_workers : num_workers / 4, 1); - max_num_workers[BEST_FIRST] = bfs_workers; - - i_t diving_workers = (num_workers - bfs_workers); - i_t m = strategies.size() - 1; - - for (size_t i = 1, k = 0; i < strategies.size(); ++i) { - i_t start = (double)k * diving_workers / m; - i_t end = (double)(k + 1) * diving_workers / m; - max_num_workers[strategies[i]] = end - start; - ++k; - } - - return max_num_workers; -} - -} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/constants.hpp b/cpp/src/branch_and_bound/constants.hpp new file mode 100644 index 0000000000..4693e6789a --- /dev/null +++ b/cpp/src/branch_and_bound/constants.hpp @@ -0,0 +1,35 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once +#include + +namespace cuopt::linear_programming::dual_simplex { + +constexpr int num_search_strategies = 5; + +// Indicate the search and variable selection algorithms used by each thread +// in B&B (See [1]). +// +// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin, +// Berlin, 2007. doi: 10.14279/depositonce-1634. +enum search_strategy_t : int { + BEST_FIRST = 0, // Best-First + Plunging. + PSEUDOCOST_DIVING = 1, // Pseudocost diving (9.2.5) + LINE_SEARCH_DIVING = 2, // Line search diving (9.2.4) + GUIDED_DIVING = 3, // Guided diving (9.2.3). 
+ COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) +}; + +constexpr std::array search_strategies = { + BEST_FIRST, PSEUDOCOST_DIVING, LINE_SEARCH_DIVING, GUIDED_DIVING, COEFFICIENT_DIVING}; + +enum class branch_direction_t { NONE = -1, DOWN = 0, UP = 1 }; + +enum class branch_and_bound_mode_t { PARALLEL = 0, DETERMINISTIC = 1 }; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp index 7a074051c6..a63dad1bcc 100644 --- a/cpp/src/branch_and_bound/deterministic_workers.hpp +++ b/cpp/src/branch_and_bound/deterministic_workers.hpp @@ -8,9 +8,9 @@ #pragma once #include -#include #include #include +#include #include @@ -58,7 +58,7 @@ struct deterministic_snapshot_t { f_t upper_bound; pseudo_cost_snapshot_t pc_snapshot; std::vector incumbent; - i_t total_lp_iters; + int64_t total_lp_iters; }; template @@ -74,7 +74,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { // Diving-specific snapshots (ignored by BFS workers) std::vector incumbent_snapshot; - i_t total_lp_iters_snapshot{0}; + int64_t total_lp_iters_snapshot{0}; std::vector> integer_solutions; int next_solution_seq{0}; @@ -90,7 +90,9 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { const std::vector& var_types, const simplex_solver_settings_t& settings, const std::string& context_name) - : base_t(id, original_lp, Arow, var_types, settings), work_context(context_name) + : base_t(id, original_lp, Arow, var_types, settings), + work_context(context_name), + pc_snapshot(1, settings) { work_context.deterministic = true; } @@ -156,7 +158,7 @@ class deterministic_bfs_worker_t mip_node_t* enqueue_children_for_plunge(mip_node_t* down_child, mip_node_t* up_child, - rounding_direction_t preferred_direction) + branch_direction_t preferred_direction) { if (!plunge_stack.empty()) { backlog.push(plunge_stack.back()); @@ -169,7 +171,7 @@ class 
deterministic_bfs_worker_t up_child->creation_seq = next_creation_seq++; mip_node_t* first_child; - if (preferred_direction == rounding_direction_t::UP) { + if (preferred_direction == branch_direction_t::UP) { plunge_stack.push_front(down_child); plunge_stack.push_front(up_child); first_child = up_child; @@ -193,8 +195,8 @@ class deterministic_bfs_worker_t plunge_stack.pop_front(); return node; } - auto node_opt = backlog.pop(); - return node_opt.has_value() ? node_opt.value() : nullptr; + + return !backlog.empty() ? backlog.pop() : nullptr; } size_t queue_size() const @@ -342,22 +344,6 @@ class deterministic_diving_worker_t {objective, solution, depth, this->worker_id, this->next_solution_seq++}); ++this->total_integer_solutions; } - - branch_variable_t variable_selection_from_snapshot(const std::vector& fractional, - const std::vector& solution) const - { - assert(root_solution != nullptr); - return this->pc_snapshot.pseudocost_diving(fractional, solution, *root_solution); - } - - branch_variable_t guided_variable_selection(const std::vector& fractional, - const std::vector& solution) const - { - if (this->incumbent_snapshot.empty()) { - return variable_selection_from_snapshot(fractional, solution); - } - return this->pc_snapshot.guided_diving(fractional, solution, this->incumbent_snapshot); - } }; template diff --git a/cpp/src/branch_and_bound/diving_heuristics.cpp b/cpp/src/branch_and_bound/diving_heuristics.cpp index f9791280a6..4989e48c6a 100644 --- a/cpp/src/branch_and_bound/diving_heuristics.cpp +++ b/cpp/src/branch_and_bound/diving_heuristics.cpp @@ -7,8 +7,6 @@ #include -#include - namespace cuopt::linear_programming::dual_simplex { template @@ -17,26 +15,26 @@ branch_variable_t line_search_diving(const std::vector& fractional, const std::vector& root_solution, logger_t& log) { - constexpr f_t eps = 1e-6; - i_t branch_var = -1; - f_t min_score = std::numeric_limits::max(); - rounding_direction_t round_dir = rounding_direction_t::NONE; + constexpr f_t eps 
= 1e-6; + i_t branch_var = -1; + f_t min_score = std::numeric_limits::max(); + branch_direction_t round_dir = branch_direction_t::NONE; for (i_t j : fractional) { - f_t score = inf; - rounding_direction_t dir = rounding_direction_t::NONE; + f_t score = inf; + branch_direction_t dir = branch_direction_t::NONE; if (solution[j] < root_solution[j] - eps) { f_t f = solution[j] - std::floor(solution[j]); f_t d = root_solution[j] - solution[j]; score = f / d; - dir = rounding_direction_t::DOWN; + dir = branch_direction_t::DOWN; } else if (solution[j] > root_solution[j] + eps) { f_t f = std::ceil(solution[j]) - solution[j]; f_t d = solution[j] - root_solution[j]; score = f / d; - dir = rounding_direction_t::UP; + dir = branch_direction_t::UP; } if (min_score > score) { @@ -48,12 +46,12 @@ branch_variable_t line_search_diving(const std::vector& fractional, // If the current solution is equal to the root solution, arbitrarily // set the branch variable to the first fractional variable and round it down - if (round_dir == rounding_direction_t::NONE) { + if (round_dir == branch_direction_t::NONE) { branch_var = fractional[0]; - round_dir = rounding_direction_t::DOWN; + round_dir = branch_direction_t::DOWN; } - assert(round_dir != rounding_direction_t::NONE); + assert(round_dir != branch_direction_t::NONE); assert(branch_var >= 0); log.debug("Line search diving: selected var %d with val = %e, round dir = %d and score = %e\n", @@ -72,14 +70,63 @@ branch_variable_t pseudocost_diving(pseudo_costs_t& pc, const std::vector& root_solution, logger_t& log) { - return pseudocost_diving_from_arrays(pc.pseudo_cost_sum_down.data(), - pc.pseudo_cost_sum_up.data(), - pc.pseudo_cost_num_down.data(), - pc.pseudo_cost_num_up.data(), - (i_t)pc.pseudo_cost_sum_down.size(), - fractional, - solution, - root_solution); + const i_t num_fractional = fractional.size(); + if (num_fractional == 0) return {-1, branch_direction_t::NONE}; + + f_t avg_down = pc.compute_pseudocost_average_down(); + f_t avg_up 
= pc.compute_pseudocost_average_up(); + + i_t branch_var = fractional[0]; + f_t max_score = std::numeric_limits::lowest(); + branch_direction_t round_dir = branch_direction_t::DOWN; + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t f_down = solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t pc_down = pc.get_pseudocost_down(j, avg_down); + f_t pc_up = pc.get_pseudocost_up(j, avg_up); + f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down); + f_t score_up = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up); + + f_t score = 0; + branch_direction_t dir = branch_direction_t::DOWN; + + f_t root_val = root_solution[j]; + + if (solution[j] < root_val - f_t(0.4)) { + score = score_down; + dir = branch_direction_t::DOWN; + } else if (solution[j] > root_val + f_t(0.4)) { + score = score_up; + dir = branch_direction_t::UP; + } else if (f_down < f_t(0.3)) { + score = score_down; + dir = branch_direction_t::DOWN; + } else if (f_down > f_t(0.7)) { + score = score_up; + dir = branch_direction_t::UP; + } else if (pc_down < pc_up + eps) { + score = score_down; + dir = branch_direction_t::DOWN; + } else { + score = score_up; + dir = branch_direction_t::UP; + } + + if (score > max_score) { + max_score = score; + branch_var = j; + round_dir = dir; + } + } + + if (round_dir == branch_direction_t::NONE) { + branch_var = fractional[0]; + round_dir = branch_direction_t::DOWN; + } + + return {branch_var, round_dir}; } template @@ -89,14 +136,39 @@ branch_variable_t guided_diving(pseudo_costs_t& pc, const std::vector& incumbent, logger_t& log) { - return guided_diving_from_arrays(pc.pseudo_cost_sum_down.data(), - pc.pseudo_cost_sum_up.data(), - pc.pseudo_cost_num_down.data(), - pc.pseudo_cost_num_up.data(), - (i_t)pc.pseudo_cost_sum_down.size(), - fractional, - solution, - incumbent); + const i_t num_fractional = fractional.size(); + if (num_fractional == 0) return {-1, branch_direction_t::NONE}; + + f_t avg_down = 
pc.compute_pseudocost_average_down(); + f_t avg_up = pc.compute_pseudocost_average_up(); + + i_t branch_var = fractional[0]; + f_t max_score = std::numeric_limits::lowest(); + branch_direction_t round_dir = branch_direction_t::DOWN; + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t f_down = solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j])); + f_t up_dist = std::abs(std::ceil(solution[j]) - incumbent[j]); + branch_direction_t dir = + down_dist < up_dist + eps ? branch_direction_t::DOWN : branch_direction_t::UP; + + f_t pc_down = pc.get_pseudocost_down(j, avg_down); + f_t pc_up = pc.get_pseudocost_up(j, avg_up); + f_t score1 = dir == branch_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up; + f_t score2 = dir == branch_direction_t::DOWN ? pc_up * f_up : pc_down * f_down; + f_t score = (score1 + score2) / 6; + + if (score > max_score) { + max_score = score; + branch_var = j; + round_dir = dir; + } + } + + return {branch_var, round_dir}; } template @@ -130,10 +202,10 @@ branch_variable_t coefficient_diving(const lp_problem_t& lp_probl const std::vector& down_locks, logger_t& log) { - i_t branch_var = -1; - i_t min_locks = std::numeric_limits::max(); - rounding_direction_t round_dir = rounding_direction_t::NONE; - constexpr f_t eps = 1e-6; + i_t branch_var = -1; + i_t min_locks = std::numeric_limits::max(); + branch_direction_t round_dir = branch_direction_t::NONE; + constexpr f_t eps = 1e-6; for (i_t j : fractional) { f_t f_down = solution[j] - std::floor(solution[j]); @@ -151,18 +223,18 @@ branch_variable_t coefficient_diving(const lp_problem_t& lp_probl branch_var = j; if (up_lock < down_lock) { - round_dir = rounding_direction_t::UP; + round_dir = branch_direction_t::UP; } else if (up_lock > down_lock) { - round_dir = rounding_direction_t::DOWN; + round_dir = branch_direction_t::DOWN; } else if (f_down < f_up + eps) { - round_dir = 
rounding_direction_t::DOWN; + round_dir = branch_direction_t::DOWN; } else { - round_dir = rounding_direction_t::UP; + round_dir = branch_direction_t::UP; } } } - assert(round_dir != rounding_direction_t::NONE); + assert(round_dir != branch_direction_t::NONE); assert(branch_var >= 0); log.debug( diff --git a/cpp/src/branch_and_bound/mip_node.cpp b/cpp/src/branch_and_bound/mip_node.cpp deleted file mode 100644 index 7b0f644f4e..0000000000 --- a/cpp/src/branch_and_bound/mip_node.cpp +++ /dev/null @@ -1,18 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#include - -namespace cuopt::linear_programming::dual_simplex { - -bool inactive_status(node_status_t status) -{ - return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE || - status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL); -} - -} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/mip_node.hpp b/cpp/src/branch_and_bound/mip_node.hpp index a24f67c3bc..c2c2a328a4 100644 --- a/cpp/src/branch_and_bound/mip_node.hpp +++ b/cpp/src/branch_and_bound/mip_node.hpp @@ -7,6 +7,8 @@ #pragma once +#include + #include #include @@ -29,31 +31,15 @@ enum class node_status_t : int { NUMERICAL = 5 // Encountered numerical issue when solving the LP relaxation }; -enum class rounding_direction_t : int8_t { NONE = -1, DOWN = 0, UP = 1 }; - -bool inactive_status(node_status_t status); +inline bool inactive_status(node_status_t status) +{ + return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE || + status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL); +} template class mip_node_t { public: - ~mip_node_t() - { - // Iterative teardown to avoid stack overflow on deep trees. 
- // Detach all descendants breadth-first, then destroy them as leaves. - std::vector> nodes; - for (auto& c : children) { - if (c) { nodes.push_back(std::move(c)); } - } - // nodes.size() grows so that this loop only terminates when only leaves remain - for (size_t i = 0; i < nodes.size(); ++i) { - for (auto& c : nodes[i]->children) { - if (c) { nodes.push_back(std::move(c)); } - } - } - - // scope-exit ensure destruction of all detached leaves - } - mip_node_t(mip_node_t&&) = default; mip_node_t& operator=(mip_node_t&&) = default; @@ -64,12 +50,12 @@ class mip_node_t { parent(nullptr), node_id(0), branch_var(-1), - branch_dir(rounding_direction_t::NONE), + branch_dir(branch_direction_t::NONE), branch_var_lower(-std::numeric_limits::infinity()), branch_var_upper(std::numeric_limits::infinity()), fractional_val(std::numeric_limits::infinity()), objective_estimate(std::numeric_limits::infinity()), - vstatus(0) + packed_vstatus(0) { children[0] = nullptr; children[1] = nullptr; @@ -82,10 +68,10 @@ class mip_node_t { parent(nullptr), node_id(0), branch_var(-1), - branch_dir(rounding_direction_t::NONE), + branch_dir(branch_direction_t::NONE), integer_infeasible(-1), objective_estimate(std::numeric_limits::infinity()), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { children[0] = nullptr; children[1] = nullptr; @@ -95,7 +81,7 @@ class mip_node_t { mip_node_t* parent_node, i_t node_num, i_t branch_variable, - rounding_direction_t branch_direction, + branch_direction_t branch_direction, f_t branch_var_value, i_t integer_inf, const std::vector& basis) @@ -109,16 +95,32 @@ class mip_node_t { fractional_val(branch_var_value), integer_infeasible(integer_inf), objective_estimate(parent_node->objective_estimate), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { - branch_var_lower = branch_direction == rounding_direction_t::DOWN ? 
problem.lower[branch_var] - : std::ceil(branch_var_value); - branch_var_upper = branch_direction == rounding_direction_t::DOWN ? std::floor(branch_var_value) - : problem.upper[branch_var]; + branch_var_lower = branch_direction == branch_direction_t::DOWN ? problem.lower[branch_var] + : std::ceil(branch_var_value); + branch_var_upper = branch_direction == branch_direction_t::DOWN ? std::floor(branch_var_value) + : problem.upper[branch_var]; children[0] = nullptr; children[1] = nullptr; } + ~mip_node_t() + { + // Iterative teardown to avoid stack overflow on deep trees. + // Detach all descendants breadth-first, then destroy them as leaves. + std::vector> nodes; + for (auto& c : children) { + if (c) { nodes.push_back(std::move(c)); } + } + // nodes.size() grows so that this loop only terminates when only leaves remain + for (size_t i = 0; i < nodes.size(); ++i) { + for (auto& c : nodes[i]->children) { + if (c) { nodes.push_back(std::move(c)); } + } + } + } + void get_variable_bounds(std::vector& lower, std::vector& upper, std::vector& bounds_changed) const @@ -164,7 +166,7 @@ class mip_node_t { children[0] = std::move(down_child); children[1] = std::move(up_child); // When we add children we no longer need to store our basis - vstatus.clear(); + packed_vstatus = {}; } bool is_inactive() const @@ -252,7 +254,7 @@ class mip_node_t { // This method creates a copy of the current node // with its parent set to `nullptr` // This detaches the node from the tree. 
- mip_node_t detach_copy() const + mip_node_t detach_copy() const { mip_node_t copy; copy.lower_bound = lower_bound; @@ -260,7 +262,7 @@ class mip_node_t { copy.depth = depth; copy.node_id = node_id; copy.integer_infeasible = integer_infeasible; - copy.vstatus = vstatus; + copy.packed_vstatus = packed_vstatus; copy.branch_var = branch_var; copy.branch_dir = branch_dir; copy.branch_var_lower = branch_var_lower; @@ -282,7 +284,7 @@ class mip_node_t { i_t depth; i_t node_id; i_t branch_var; - rounding_direction_t branch_dir; + branch_direction_t branch_dir; f_t branch_var_lower; f_t branch_var_upper; f_t fractional_val; @@ -291,7 +293,11 @@ class mip_node_t { mip_node_t* parent; std::unique_ptr children[2]; - std::vector vstatus; + std::vector packed_vstatus; + + // Indicate if we can dive from this node or not. This is set to false when + // this node was already selected for diving once. + omp_atomic_t can_dive{true}; // Worker-local identification for deterministic ordering: // - origin_worker_id: which worker created this node @@ -312,7 +318,7 @@ class mip_node_t { const mip_node_t* node = this; while (node != nullptr && node->branch_var >= 0) { uint64_t step = static_cast(node->branch_var) << 1; - step |= (node->branch_dir == rounding_direction_t::UP) ? 1 : 0; + step |= (node->branch_dir == branch_direction_t::UP) ? 
1 : 0; path_steps.push_back(step); node = node->parent; } @@ -359,7 +365,7 @@ class search_tree_t { parent_node, ++id, branch_var, - rounding_direction_t::DOWN, + branch_direction_t::DOWN, fractional_val, integer_infeasible, parent_vstatus); @@ -367,14 +373,14 @@ class search_tree_t { parent_node, down_child.get(), branch_var, - rounding_direction_t::DOWN, + branch_direction_t::DOWN, std::floor(fractional_val)); auto up_child = std::make_unique>(original_lp, parent_node, ++id, branch_var, - rounding_direction_t::UP, + branch_direction_t::UP, fractional_val, integer_infeasible, parent_vstatus); @@ -383,7 +389,7 @@ class search_tree_t { parent_node, up_child.get(), branch_var, - rounding_direction_t::UP, + branch_direction_t::UP, std::ceil(fractional_val)); assert(parent_vstatus.size() == original_lp.num_cols); @@ -405,7 +411,7 @@ class search_tree_t { const mip_node_t* origin_ptr, const mip_node_t* dest_ptr, const i_t branch_var, - rounding_direction_t branch_dir, + branch_direction_t branch_dir, const f_t bound) { if (write_graphviz) { @@ -413,7 +419,7 @@ class search_tree_t { origin_ptr->node_id, dest_ptr->node_id, branch_var, - branch_dir == rounding_direction_t::DOWN ? "<=" : ">=", + branch_dir == branch_direction_t::DOWN ? 
"<=" : ">=", bound); } } diff --git a/cpp/src/branch_and_bound/node_queue.hpp b/cpp/src/branch_and_bound/node_queue.hpp index 09d030c96e..15c50c99b7 100644 --- a/cpp/src/branch_and_bound/node_queue.hpp +++ b/cpp/src/branch_and_bound/node_queue.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -29,12 +28,14 @@ class heap_t { { buffer.push_back(node); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } void push(T&& node) { buffer.push_back(std::move(node)); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } template @@ -42,39 +43,99 @@ class heap_t { { buffer.emplace_back(std::forward(args)...); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; + assert(num_entries_.load() == buffer.size()); } - std::optional pop() + T pop() { - if (buffer.empty()) return std::nullopt; - std::pop_heap(buffer.begin(), buffer.end(), comp); T node = std::move(buffer.back()); buffer.pop_back(); + --num_entries_; + assert(num_entries_.load() == buffer.size()); return node; } - size_t size() const { return buffer.size(); } + size_t size() const { return num_entries_; } T& top() { return buffer.front(); } - void clear() { buffer.clear(); } - bool empty() const { return buffer.empty(); } + + void clear() + { + buffer.clear(); + num_entries_ = 0; + } + + bool empty() const { return num_entries_ == 0; } // Read-only access to underlying buffer for iteration without modification const std::vector& data() const { return buffer; } private: std::vector buffer; + omp_atomic_t num_entries_{0}; Comp comp; }; -// A queue storing the nodes waiting to be explored/dived from. +// A queue storing the nodes waiting to be explored. Before calling pop or push in parallel, +// the mutex NEEDS to be acquired via the `lock()` method. It must the released afterwards with +// `unlock()`. 
template class node_queue_t { + public: + void push(mip_node_t* new_node) + { + assert(new_node != nullptr); + auto entry = std::make_shared(new_node); + best_first_heap_.push(entry); + if (new_node->can_dive) diving_heap_.push(entry); + lower_bound_ = best_first_heap_.top()->lower_bound; + } + + mip_node_t* pop_best_first() + { + if (best_first_heap_.empty()) { return nullptr; } + auto entry = best_first_heap_.pop(); + lower_bound_ = best_first_heap_.empty() ? std::numeric_limits::infinity() + : best_first_heap_.top()->lower_bound; + mip_node_t* node = std::exchange(entry->node, nullptr); + assert(node != nullptr); + return node; + } + + mip_node_t* pop_diving() + { + while (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + if (entry->node != nullptr) { + entry->node->can_dive = false; + return entry->node; + } + } + return nullptr; + } + + void lock() { mutex_.lock(); } + void unlock() { mutex_.unlock(); } + + mip_node_t* bfs_top() + { + return best_first_heap_.empty() ? nullptr : best_first_heap_.top()->node; + } + + i_t diving_queue_size() { return diving_heap_.size(); } + i_t best_first_queue_size() { return best_first_heap_.size(); } + + f_t get_lower_bound() + { + return best_first_heap_.empty() ? 
std::numeric_limits::infinity() : lower_bound_.load(); + } + private: struct heap_entry_t { mip_node_t* node = nullptr; - f_t lower_bound = -inf; - f_t score = inf; + f_t lower_bound = -std::numeric_limits::infinity(); + f_t score = std::numeric_limits::infinity(); heap_entry_t(mip_node_t* new_node) : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate) @@ -102,67 +163,11 @@ class node_queue_t { } }; - heap_t, lower_bound_comp> best_first_heap; - heap_t, score_comp> diving_heap; - omp_mutex_t mutex; - - public: - void push(mip_node_t* new_node) - { - std::lock_guard lock(mutex); - auto entry = std::make_shared(new_node); - best_first_heap.push(entry); - diving_heap.push(entry); - } - - std::optional*> pop_best_first() - { - std::lock_guard lock(mutex); - auto entry = best_first_heap.pop(); - - if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); } - - return std::nullopt; - } - - std::optional*> pop_diving() - { - std::lock_guard lock(mutex); - - while (!diving_heap.empty()) { - auto entry = diving_heap.pop(); - - if (entry.has_value()) { - if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; } - } - } + heap_t, lower_bound_comp> best_first_heap_; + heap_t, score_comp> diving_heap_; + omp_mutex_t mutex_; - return std::nullopt; - } - - i_t diving_queue_size() - { - std::lock_guard lock(mutex); - return diving_heap.size(); - } - - i_t best_first_queue_size() - { - std::lock_guard lock(mutex); - return best_first_heap.size(); - } - - f_t get_lower_bound() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound; - } - - mip_node_t* bfs_top() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? 
nullptr : best_first_heap.top()->node; - } + omp_atomic_t lower_bound_{std::numeric_limits::infinity()}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c38e98e27d..219f60db0d 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -24,7 +24,6 @@ #include namespace cuopt::linear_programming::dual_simplex { - namespace { static bool is_dual_simplex_done(dual::status_t status) @@ -218,8 +217,10 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, const std::vector& basic_list, const std::vector& nonbasic_list, const std::vector& fractional, + const csc_matrix_t& AT, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc) + std::vector& strong_branch_down, + std::vector& strong_branch_up) { i_t m = lp.num_rows; i_t n = lp.num_cols; @@ -246,7 +247,7 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, objective_change_estimate_t estimate = single_pivot_objective_change_estimate(lp, settings, - pc.AT, + AT, vstatus, j, basic_map[j], @@ -258,8 +259,8 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, workspace, delta_z, work_estimate); - pc.strong_branch_down[k] = estimate.down_obj_change; - pc.strong_branch_up[k] = estimate.up_obj_change; + strong_branch_down[k] = estimate.down_obj_change; + strong_branch_up[k] = estimate.up_obj_change; } } @@ -298,12 +299,14 @@ void strong_branch_helper(i_t start, f_t root_obj, f_t upper_bound, i_t iter_limit, - pseudo_costs_t& pc, + std::vector& strong_branch_down, + std::vector& strong_branch_up, std::vector& dual_simplex_obj_down, std::vector& dual_simplex_obj_up, std::vector& dual_simplex_status_down, std::vector& dual_simplex_status_up, - shared_strong_branching_context_view_t& sb_view) + shared_strong_branching_context_view_t& sb_view, + omp_atomic_t& num_strong_branches_completed) { raft::common::nvtx::range 
scope("BB::strong_branch_helper"); lp_problem_t child_problem = original_lp; @@ -380,7 +383,7 @@ void strong_branch_helper(i_t start, } if (branch == 0) { - pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); + strong_branch_down[k] = std::max(obj - root_obj, 0.0); dual_simplex_obj_down[k] = std::max(obj - root_obj, 0.0); dual_simplex_status_down[k] = status; if (verbose) { @@ -393,7 +396,7 @@ void strong_branch_helper(i_t start, toc(start_time)); } } else { - pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0); + strong_branch_up[k] = std::max(obj - root_obj, 0.0); dual_simplex_obj_up[k] = std::max(obj - root_obj, 0.0); dual_simplex_status_up[k] = status; if (verbose) { @@ -431,7 +434,7 @@ void strong_branch_helper(i_t start, } if (toc(start_time) > settings.time_limit) { break; } - const i_t completed = pc.num_strong_branches_completed++; + const i_t completed = num_strong_branches_completed++; if (thread_id == 0 && toc(last_log) > 10) { last_log = tic(); @@ -463,7 +466,7 @@ std::pair trial_branching(const lp_problem_t& ori f_t upper_bound, f_t start_time, i_t iter_limit, - omp_atomic_t& total_lp_iter) + i_t& iter) { lp_problem_t child_problem = original_lp; child_problem.lower[branch_var] = branch_var_lower; @@ -479,7 +482,7 @@ std::pair trial_branching(const lp_problem_t& ori objective_upper_bound(child_problem, upper_bound, child_settings.dual_tol); lp_solution_t solution(original_lp.num_rows, original_lp.num_cols); - i_t iter = 0; + iter = 0; std::vector child_vstatus = vstatus; std::vector child_edge_norms = edge_norms; std::vector child_basic_list = basic_list; @@ -502,7 +505,7 @@ std::pair trial_branching(const lp_problem_t& ori solution, iter, child_edge_norms); - total_lp_iter += iter; + settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. 
Obj %e\n", branch_var, child_problem.lower[branch_var], @@ -732,9 +735,9 @@ static void batch_pdlp_strong_branching_task( std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (warm_start_remaining_time <= 0.0) { return; } - assert(!pc.pdlp_warm_cache.populated && "PDLP warm cache should not be populated at this point"); + assert(!pc.pdlp_warm_cache->populated && "PDLP warm cache should not be populated at this point"); - if (!pc.pdlp_warm_cache.populated) { + if (!pc.pdlp_warm_cache->populated) { pdlp_solver_settings_t ws_settings; ws_settings.method = method_t::PDLP; ws_settings.presolver = presolver_t::None; @@ -756,14 +759,14 @@ static void batch_pdlp_strong_branching_task( ws_settings.inside_mip = true; if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - auto start_time = std::chrono::high_resolution_clock::now(); + auto lp_start_time = std::chrono::high_resolution_clock::now(); - auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); + auto ws_solution = solve_lp(&pc.pdlp_warm_cache->batch_pdlp_handle, mps_model, ws_settings); if (verbose) { auto end_time = std::chrono::high_resolution_clock::now(); auto duration = - std::chrono::duration_cast(end_time - start_time).count(); + std::chrono::duration_cast(end_time - lp_start_time).count(); settings.log.printf( "Original problem solved in %d milliseconds" " and iterations: %d\n", @@ -777,21 +780,21 @@ static void batch_pdlp_strong_branching_task( const auto& ws_dual = ws_solution.get_dual_solution(); // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm // start - cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); - cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); - cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; - cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; - 
cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; - cache.populated = true; + cache->initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache->initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache->step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache->primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache->pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + cache->populated = true; if (verbose) { settings.log.printf( "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", - cache.initial_primal.size(), - cache.initial_dual.size(), - cache.step_size, - cache.primal_weight, - cache.pdlp_iteration); + cache->initial_primal.size(), + cache->initial_dual.size(), + cache->step_size, + cache->primal_weight, + cache->pdlp_iteration); } } else { if (verbose) { @@ -817,22 +820,23 @@ static void batch_pdlp_strong_branching_task( if (batch_remaining_time <= 0.0) { return; } pdlp_settings.time_limit = batch_remaining_time; - if (pc.pdlp_warm_cache.populated) { + if (pc.pdlp_warm_cache->populated) { auto& cache = pc.pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + pdlp_settings.set_initial_primal_solution(cache->initial_primal.data(), + cache->initial_primal.size(), + cache->batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution(cache->initial_dual.data(), + cache->initial_dual.size(), + cache->batch_pdlp_handle.get_stream()); + 
pdlp_settings.set_initial_step_size(cache->step_size); + pdlp_settings.set_initial_primal_weight(cache->primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache->pdlp_iteration); } if (concurrent_halt.load() == 1) { return; } const auto solutions = batch_pdlp_solve( - &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + &pc.pdlp_warm_cache->batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Fail safe in case the batch PDLP failed and produced no solutions @@ -888,13 +892,13 @@ static void batch_pdlp_reliability_branching_task( const std::vector& candidate_vars, const simplex_solver_settings_t& settings, shared_strong_branching_context_view_t& sb_view, - batch_pdlp_warm_cache_t& pdlp_warm_cache, + batch_pdlp_warm_cache_t* pdlp_warm_cache, std::vector& pdlp_obj_down, std::vector& pdlp_obj_up) { - log.printf(rb_mode == 2 ? "RB batch PDLP only for %d candidates\n" - : "RB cooperative batch PDLP and DS for %d candidates\n", - num_candidates); + log.debug(rb_mode == 2 ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); f_t start_batch = tic(); @@ -935,15 +939,16 @@ static void batch_pdlp_reliability_branching_task( } pdlp_settings.time_limit = batch_remaining_time; - if (pdlp_warm_cache.populated) { - auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution( - cache.initial_primal.data(), cache.initial_primal.size(), batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + if (pdlp_warm_cache->populated) { + pdlp_settings.set_initial_primal_solution(pdlp_warm_cache->initial_primal.data(), + pdlp_warm_cache->initial_primal.size(), + batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution(pdlp_warm_cache->initial_dual.data(), + pdlp_warm_cache->initial_dual.size(), + batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(pdlp_warm_cache->step_size); + pdlp_settings.set_initial_primal_weight(pdlp_warm_cache->primal_weight); + pdlp_settings.set_initial_pdlp_iteration(pdlp_warm_cache->pdlp_iteration); } if (concurrent_halt.load() == 1) { return; } @@ -955,7 +960,7 @@ static void batch_pdlp_reliability_branching_task( if (solutions.get_additional_termination_informations().size() != static_cast(num_candidates) * 2) { - log.printf("RB batch PDLP failed and produced no solutions\n"); + log.debug("RB batch PDLP failed and produced no solutions\n"); return; } @@ -966,10 +971,10 @@ static void batch_pdlp_reliability_branching_task( } } - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", - batch_pdlp_time, - amount_done, - num_candidates * 2); + log.debug("RB batch PDLP completed in %.2fs. 
Solved %d/%d\n", + batch_pdlp_time, + amount_done, + num_candidates * 2); for (i_t k = 0; k < num_candidates; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { @@ -1002,9 +1007,9 @@ void strong_branching(const lp_problem_t& original_lp, constexpr bool verbose = false; pc.resize(original_lp.num_cols); - pc.strong_branch_down.assign(fractional.size(), 0); - pc.strong_branch_up.assign(fractional.size(), 0); - pc.num_strong_branches_completed = 0; + std::vector strong_branch_down(fractional.size(), std::numeric_limits::quiet_NaN()); + std::vector strong_branch_up(fractional.size(), std::numeric_limits::quiet_NaN()); + omp_atomic_t num_strong_branches_completed = 0; const f_t elapsed_time = toc(start_time); if (elapsed_time > settings.time_limit) { return; } @@ -1049,8 +1054,10 @@ void strong_branching(const lp_problem_t& original_lp, basic_list, nonbasic_list, fractional, + *pc.AT, basis_factors, - pc); + strong_branch_down, + strong_branch_up); } else { #pragma omp parallel num_threads(settings.num_threads) { @@ -1082,7 +1089,6 @@ void strong_branching(const lp_problem_t& original_lp, i_t start = std::floor(k * fractional.size() / n); i_t end = std::floor((k + 1) * fractional.size() / n); - constexpr bool verbose = false; if (verbose) { settings.log.printf("Thread id %d task id %d start %d end %d. 
size %d\n", omp_get_thread_num(), @@ -1105,12 +1111,14 @@ void strong_branching(const lp_problem_t& original_lp, root_obj, upper_bound, simplex_iteration_limit, - pc, + strong_branch_down, + strong_branch_up, dual_simplex_obj_down, dual_simplex_obj_up, dual_simplex_status_down, dual_simplex_status_up, - sb_view); + sb_view, + num_strong_branches_completed); } // DS done: signal PDLP to stop (time-limit or all work done) and wait if (effective_batch_pdlp == 1) { concurrent_halt.store(1); } @@ -1178,7 +1186,7 @@ void strong_branching(const lp_problem_t& original_lp, for (i_t k = 0; k < fractional.size(); k++) { for (i_t branch = 0; branch < 2; branch++) { const bool is_down = (branch == 0); - f_t& sb_dest = is_down ? pc.strong_branch_down[k] : pc.strong_branch_up[k]; + f_t& sb_dest = is_down ? strong_branch_down[k] : strong_branch_up[k]; f_t ds_obj = is_down ? dual_simplex_obj_down[k] : dual_simplex_obj_up[k]; dual::status_t ds_status = is_down ? dual_simplex_status_down[k] : dual_simplex_status_up[k]; @@ -1211,12 +1219,12 @@ void strong_branching(const lp_problem_t& original_lp, } } - pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root = + pc.pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; if (verbose) { settings.log.printf( "Batch PDLP for strong branching. 
Percent solved by batch PDLP at root: %f\n", - pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root); + pc.pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root); settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d solved by both\n", merged_from_ds, @@ -1226,22 +1234,57 @@ void strong_branching(const lp_problem_t& original_lp, } } - pc.update_pseudo_costs_from_strong_branching(fractional, root_solution.x); + pc.update_pseudo_costs_from_strong_branching( + fractional, strong_branch_down, strong_branch_up, root_solution.x); +} + +template +inline f_t pseudo_costs_t::compute_pseudocost_average_down() +{ + i_t num_initialized = 0; + f_t avg = 0.0; + + for (size_t j = 0; j < pseudo_cost_sum_down.size(); ++j) { + i_t num = pseudo_cost_num_down[j]; + f_t sum = pseudo_cost_sum_down[j]; + if (num > 0 && std::isfinite(sum)) { + ++num_initialized; + avg += sum / num; + } + } + + return (num_initialized > 0) ? avg / num_initialized : 1.0; +} + +template +inline f_t pseudo_costs_t::compute_pseudocost_average_up() +{ + i_t num_initialized = 0; + f_t avg = 0.0; + + for (size_t j = 0; j < pseudo_cost_sum_up.size(); ++j) { + i_t num = pseudo_cost_num_up[j]; + f_t sum = pseudo_cost_sum_up[j]; + if (num > 0 && std::isfinite(sum)) { + ++num_initialized; + avg += sum / num; + } + } + + return (num_initialized > 0) ? avg / num_initialized : 1.0; } template f_t pseudo_costs_t::calculate_pseudocost_score(i_t j, const std::vector& solution, - f_t pseudo_cost_up_avg, - f_t pseudo_cost_down_avg) const + f_t avg_down, + f_t avg_up) const { constexpr f_t eps = 1e-6; - i_t num_up = pseudo_cost_num_up[j]; - i_t num_down = pseudo_cost_num_down[j]; - f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; - f_t pc_down = num_down > 0 ? 
pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; f_t f_down = solution[j] - std::floor(solution[j]); f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t pc_down = get_pseudocost_down(j, avg_down); + f_t pc_up = get_pseudocost_up(j, avg_up); return std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); } @@ -1250,11 +1293,11 @@ void pseudo_costs_t::update_pseudo_costs(mip_node_t* node_pt f_t leaf_objective) { const f_t change_in_obj = std::max(leaf_objective - node_ptr->lower_bound, 0.0); - const f_t frac = node_ptr->branch_dir == rounding_direction_t::DOWN + const f_t frac = node_ptr->branch_dir == branch_direction_t::DOWN ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val) : std::ceil(node_ptr->fractional_val) - node_ptr->fractional_val; - if (node_ptr->branch_dir == rounding_direction_t::DOWN) { + if (node_ptr->branch_dir == branch_direction_t::DOWN) { pseudo_cost_sum_down[node_ptr->branch_var] += change_in_obj / frac; pseudo_cost_num_down[node_ptr->branch_var]++; } else { @@ -1263,43 +1306,19 @@ void pseudo_costs_t::update_pseudo_costs(mip_node_t* node_pt } } -template -void pseudo_costs_t::initialized(i_t& num_initialized_down, - i_t& num_initialized_up, - f_t& pseudo_cost_down_avg, - f_t& pseudo_cost_up_avg) const -{ - auto avgs = compute_pseudo_cost_averages(pseudo_cost_sum_down.data(), - pseudo_cost_sum_up.data(), - pseudo_cost_num_down.data(), - pseudo_cost_num_up.data(), - pseudo_cost_sum_down.size()); - pseudo_cost_down_avg = avgs.down_avg; - pseudo_cost_up_avg = avgs.up_avg; -} - template i_t pseudo_costs_t::variable_selection(const std::vector& fractional, - const std::vector& solution, - logger_t& log) + const std::vector& solution) { i_t branch_var = fractional[0]; f_t max_score = -1; - i_t num_initialized_down; - i_t num_initialized_up; - f_t pseudo_cost_down_avg; - f_t pseudo_cost_up_avg; + f_t avg_down = compute_pseudocost_average_down(); + f_t avg_up = compute_pseudocost_average_up(); - 
initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); - - log.printf("PC: num initialized down %d up %d avg down %e up %e\n", - num_initialized_down, - num_initialized_up, - pseudo_cost_down_avg, - pseudo_cost_up_avg); + settings.log.debug("PC: avg down %e up %e\n", avg_down, avg_up); for (i_t j : fractional) { - f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, solution, avg_down, avg_up); if (score > max_score) { max_score = score; @@ -1307,10 +1326,10 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio } } - log.debug("Pseudocost branching on %d. Value %e. Score %e.\n", - branch_var, - solution[branch_var], - max_score); + settings.log.debug("Pseudocost branching on %d. Value %e. Score %e.\n", + branch_var, + solution[branch_var], + max_score); return branch_var; } @@ -1322,19 +1341,17 @@ i_t pseudo_costs_t::reliable_variable_selection( branch_and_bound_worker_t* worker, const std::vector& var_types, const branch_and_bound_stats_t& bnb_stats, - const simplex_solver_settings_t& settings, f_t upper_bound, int max_num_tasks, - logger_t& log, const std::vector& new_slacks, const lp_problem_t& original_lp) { - constexpr f_t eps = 1e-6; - f_t start_time = bnb_stats.start_time; - i_t branch_var = fractional[0]; - f_t max_score = -1; - f_t pseudo_cost_down_avg = -1; - f_t pseudo_cost_up_avg = -1; + constexpr f_t eps = 1e-6; + f_t start_time = bnb_stats.start_time; + i_t branch_var = fractional[0]; + f_t max_score = -1; + f_t avg_down{0}; + f_t avg_up{0}; lp_solution_t& leaf_solution = worker->leaf_solution; const int64_t branch_and_bound_lp_iters = bnb_stats.total_lp_iters; @@ -1364,17 +1381,11 @@ i_t pseudo_costs_t::reliable_variable_selection( // If `reliable_threshold == 0`, then we set the uninitialized pseudocosts to the average. 
// Otherwise, the best ones are initialized via strong branching, while the other are ignored. // - // In the latter, we are not using the average pseudocost (which calculated in the `initialized` - // method). + // So we only need to initialize the average for the former. if (reliable_threshold == 0) { - i_t num_initialized_up; - i_t num_initialized_down; - initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); - log.printf("PC: num initialized down %d up %d avg down %e up %e\n", - num_initialized_down, - num_initialized_up, - pseudo_cost_down_avg, - pseudo_cost_up_avg); + avg_down = compute_pseudocost_average_down(); + avg_up = compute_pseudocost_average_up(); + settings.log.debug("PC: avg down %e up %e\n", avg_down, avg_up); } std::vector> unreliable_list; @@ -1386,8 +1397,7 @@ i_t pseudo_costs_t::reliable_variable_selection( unreliable_list.push_back(std::make_pair(-1, j)); continue; } - f_t score = - calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, leaf_solution.x, avg_down, avg_up); if (score > max_score) { max_score = score; @@ -1396,16 +1406,17 @@ i_t pseudo_costs_t::reliable_variable_selection( } if (unreliable_list.empty()) { - log.printf("pc branching on %d. Value %e. Score %e\n", - branch_var, - leaf_solution.x[branch_var], - max_score); + settings.log.debug("pc branching on %d. Value %e. 
Score %e\n", + branch_var, + leaf_solution.x[branch_var], + max_score); return branch_var; } // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only const i_t rb_mode = settings.mip_batch_pdlp_reliability_branching; + // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS @@ -1414,31 +1425,45 @@ i_t pseudo_costs_t::reliable_variable_selection( // using batch PDLP constexpr i_t min_num_candidates_for_pdlp = 5; constexpr f_t min_percent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; - // Batch PDLP is either forced or we use the heuristic to decide if it should be used - const bool use_pdlp = (rb_mode == 2) || (rb_mode != 0 && !settings.sub_mip && - !settings.deterministic && pdlp_warm_cache.populated && - unreliable_list.size() > min_num_candidates_for_pdlp && - pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root > - min_percent_solved_by_batch_pdlp_at_root_for_pdlp); - - if (rb_mode != 0 && !pdlp_warm_cache.populated) { - log.printf("PDLP warm start data not populated, using DS only\n"); + + // Check if batch PDLP was forced to be on + bool use_pdlp = rb_mode == 2; + + // Use the heuristic to decide if it should be used (in case it is set to automatic) + if (!use_pdlp && rb_mode != 0) { + // Check if it is a sub MIP or the determinism mode is on. 
+ use_pdlp = !settings.sub_mip; + use_pdlp &= !settings.deterministic; + + // Check if the warm cache was filled at the root + use_pdlp &= pdlp_warm_cache->populated; + + // Check if there are enough candidates for batch PDLP + use_pdlp &= unreliable_list.size() > min_num_candidates_for_pdlp; + + // Check if batch PDLP was effective for strong branching at the root node + use_pdlp &= pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root > + min_percent_solved_by_batch_pdlp_at_root_for_pdlp; + } + + if (rb_mode != 0 && !pdlp_warm_cache->populated) { + settings.log.debug("PDLP warm start data not populated, using DS only\n"); } else if (rb_mode != 0 && settings.sub_mip) { - log.printf("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n"); + settings.log.debug("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n"); } else if (rb_mode != 0 && settings.deterministic) { - log.printf( + settings.log.debug( "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n"); } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { - log.printf("Not enough candidates to use batch PDLP, using DS only\n"); - } else if (rb_mode != 0 && pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root < 5.0) { - log.printf("Percent solved by batch PDLP at root is too low, using DS only\n"); + settings.log.debug("Not enough candidates to use batch PDLP, using DS only\n"); + } else if (rb_mode != 0 && pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root < 5.0) { + settings.log.debug("Percent solved by batch PDLP at root is too low, using DS only\n"); } else if (use_pdlp) { - log.printf( + settings.log.debug( "Using batch PDLP because populated, unreliable list size is %d (> %d), and percent solved " "by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, - pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root, + 
pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root, min_percent_solved_by_batch_pdlp_at_root_for_pdlp); } @@ -1454,9 +1479,9 @@ i_t pseudo_costs_t::reliable_variable_selection( assert(num_candidates > 0); assert(num_tasks > 0); - log.printf( + settings.log.debug( "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n", - strong_branching_lp_iter.load(), + static_cast(strong_branching_lp_iter), branch_and_bound_lp_iters, unreliable_list.size(), num_tasks, @@ -1487,8 +1512,8 @@ i_t pseudo_costs_t::reliable_variable_selection( objective_change_estimate_t estimate = single_pivot_objective_change_estimate(worker->leaf_problem, settings, - AT, - node_ptr->vstatus, + *AT, + worker->leaf_vstatus, j, basic_map[j], leaf_solution, @@ -1503,8 +1528,7 @@ i_t pseudo_costs_t::reliable_variable_selection( score = std::max(estimate.up_obj_change, eps) * std::max(estimate.down_obj_change, eps); } else { // Use the previous score, even if it is unreliable - score = calculate_pseudocost_score( - j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); + score = calculate_pseudocost_score(j, leaf_solution.x, avg_down, avg_up); } } } else { @@ -1542,7 +1566,7 @@ i_t pseudo_costs_t::reliable_variable_selection( if (use_pdlp) { #pragma omp task default(shared) - batch_pdlp_reliability_branching_task(log, + batch_pdlp_reliability_branching_task(settings.log, rb_mode, num_candidates, start_time, @@ -1554,13 +1578,13 @@ i_t pseudo_costs_t::reliable_variable_selection( candidate_vars, settings, sb_view, - pdlp_warm_cache, + pdlp_warm_cache.get(), pdlp_obj_down, pdlp_obj_up); } if (toc(start_time) > settings.time_limit) { - log.printf("Time limit reached\n"); + settings.log.debug("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); #pragma omp taskwait @@ -1590,16 +1614,17 @@ i_t pseudo_costs_t::reliable_variable_selection( if (toc(start_time) > settings.time_limit) { continue; } if (rb_mode == 1 && sb_view.is_solved(i)) { - 
log.printf( + settings.log.debug( "DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); } else { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { // Do trial branching on the down branch + i_t iter = 0; const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, @@ -1610,7 +1635,8 @@ i_t pseudo_costs_t::reliable_variable_selection( upper_bound, start_time, iter_limit_per_trial, - strong_branching_lp_iter); + iter); + strong_branching_lp_iter += iter; dual_simplex_obj_down[i] = obj; dual_simplex_status_down[i] = status; @@ -1619,7 +1645,6 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t change_in_x = leaf_solution.x[j] - std::floor(leaf_solution.x[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; - // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); } } } else { @@ -1633,16 +1658,18 @@ i_t pseudo_costs_t::reliable_variable_selection( const i_t shared_idx = i + num_candidates; if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { - log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", - j, - shared_idx); + settings.log.debug( + "DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", + j, + shared_idx); } else { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { + i_t iter = 0; const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, @@ -1653,7 +1680,8 @@ i_t pseudo_costs_t::reliable_variable_selection( upper_bound, start_time, iter_limit_per_trial, - strong_branching_lp_iter); + iter); + strong_branching_lp_iter += iter; 
dual_simplex_obj_up[i] = obj; dual_simplex_status_up[i] = status; @@ -1662,7 +1690,6 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t change_in_x = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j]; pseudo_cost_sum_up[j] += change_in_obj / change_in_x; pseudo_cost_num_up[j]++; - // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); } } } else { @@ -1674,9 +1701,7 @@ i_t pseudo_costs_t::reliable_variable_selection( if (toc(start_time) > settings.time_limit) { continue; } - score = - calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); - + score = calculate_pseudocost_score(j, leaf_solution.x, avg_down, avg_up); score_mutex.lock(); if (score > max_score) { max_score = score; @@ -1690,24 +1715,6 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t dual_simplex_elapsed = toc(dual_simplex_start_time); - // TODO put back - // if (rb_mode != 2) { - // if (rb_mode == 1) { - // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped - // (PDLP) in %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2, - // dual_simplex_infeasible.load(), num_candidates * 2, - // dual_simplex_failed.load(), num_candidates * 2, - // dual_simplex_skipped.load(), dual_simplex_elapsed); - // } else { - // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in - // %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2, - // dual_simplex_infeasible.load(), num_candidates * 2, dual_simplex_failed.load(), - // num_candidates * 2, dual_simplex_elapsed); - // } - //} - if (use_pdlp) { #pragma omp taskwait @@ -1756,22 +1763,21 @@ i_t pseudo_costs_t::reliable_variable_selection( } } - f_t score = - calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, leaf_solution.x, 
avg_down, avg_up); if (score > max_score) { max_score = score; branch_var = j; } } - log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", - num_candidates, - pdlp_optimal, - num_candidates * 2, - pdlp_applied); + settings.log.debug("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", + num_candidates, + pdlp_optimal, + num_candidates * 2, + pdlp_applied); } - log.printf( + settings.log.debug( "pc branching on %d. Value %e. Score %e\n", branch_var, leaf_solution.x[branch_var], max_score); return branch_var; @@ -1780,37 +1786,30 @@ i_t pseudo_costs_t::reliable_variable_selection( template f_t pseudo_costs_t::obj_estimate(const std::vector& fractional, const std::vector& solution, - f_t lower_bound, - logger_t& log) + f_t lower_bound) { - const i_t num_fractional = fractional.size(); - f_t estimate = lower_bound; - - i_t num_initialized_down; - i_t num_initialized_up; - f_t pseudo_cost_down_avg; - f_t pseudo_cost_up_avg; - - initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); + f_t estimate = lower_bound; + f_t avg_down = compute_pseudocost_average_down(); + f_t avg_up = compute_pseudocost_average_up(); for (i_t j : fractional) { - constexpr f_t eps = 1e-6; - i_t num_up = pseudo_cost_num_up[j]; - i_t num_down = pseudo_cost_num_down[j]; - f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; - f_t pc_down = num_down > 0 ? 
pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t pc_down = get_pseudocost_down(j, avg_down); + f_t pc_up = get_pseudocost_up(j, avg_up); + f_t f_down = solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; estimate += std::min(pc_down * f_down, pc_up * f_up); } - log.printf("pseudocost estimate = %e\n", estimate); + settings.log.debug("pseudocost estimate = %e\n", estimate); return estimate; } template void pseudo_costs_t::update_pseudo_costs_from_strong_branching( - const std::vector& fractional, const std::vector& root_soln) + const std::vector& fractional, + const std::vector& strong_branch_down, + const std::vector& strong_branch_up, + const std::vector& root_soln) { for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; @@ -1835,6 +1834,7 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE template class pseudo_costs_t; +template class pseudo_cost_snapshot_t; template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 009bd8b81a..8139054a7b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -7,8 +7,9 @@ #pragma once -#include +#include #include +#include #include #include @@ -18,7 +19,6 @@ #include #include -#include #include #include @@ -27,354 +27,6 @@ namespace cuopt::linear_programming::dual_simplex { -template -struct branch_variable_t { - i_t variable; - rounding_direction_t direction; -}; - -template -struct pseudo_cost_update_t { - i_t variable; - rounding_direction_t direction; - f_t delta; - double work_timestamp; - int worker_id; - - bool operator<(const pseudo_cost_update_t& other) const - { - if (work_timestamp != 
other.work_timestamp) return work_timestamp < other.work_timestamp; - if (variable != other.variable) return variable < other.variable; - if (delta != other.delta) return delta < other.delta; - return worker_id < other.worker_id; - } -}; - -template -struct pseudo_cost_averages_t { - f_t down_avg; - f_t up_avg; -}; - -// used to get T from omp_atomic_t based on the fact that omp_atomic_t::operator++ returns T -template -using underlying_type = decltype(std::declval()++); - -// Necessary because omp_atomic_t may be passed instead of f_t -template -auto compute_pseudo_cost_averages(const MaybeWrappedF* pc_sum_down, - const MaybeWrappedF* pc_sum_up, - const MaybeWrappedI* pc_num_down, - const MaybeWrappedI* pc_num_up, - size_t n) -{ - using underlying_f_t = underlying_type; - using underlying_i_t = underlying_type; - - underlying_i_t num_initialized_down = 0; - underlying_i_t num_initialized_up = 0; - underlying_f_t pseudo_cost_down_avg = 0.0; - underlying_f_t pseudo_cost_up_avg = 0.0; - - for (size_t j = 0; j < n; ++j) { - if (pc_num_down[j] > 0) { - ++num_initialized_down; - if (std::isfinite(pc_sum_down[j])) { - pseudo_cost_down_avg += pc_sum_down[j] / pc_num_down[j]; - } - } - if (pc_num_up[j] > 0) { - ++num_initialized_up; - if (std::isfinite(pc_sum_up[j])) { pseudo_cost_up_avg += pc_sum_up[j] / pc_num_up[j]; } - } - } - - pseudo_cost_down_avg = - (num_initialized_down > 0) ? pseudo_cost_down_avg / num_initialized_down : 1.0; - pseudo_cost_up_avg = (num_initialized_up > 0) ? 
pseudo_cost_up_avg / num_initialized_up : 1.0; - - return pseudo_cost_averages_t{pseudo_cost_down_avg, pseudo_cost_up_avg}; -} - -// Variable selection using pseudo-cost product scoring -// Returns the best variable to branch on -template -i_t variable_selection_from_pseudo_costs(const f_t* pc_sum_down, - const f_t* pc_sum_up, - const i_t* pc_num_down, - const i_t* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution) -{ - const i_t num_fractional = fractional.size(); - if (num_fractional == 0) return -1; - - auto [pc_down_avg, pc_up_avg] = - compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - i_t branch_var = fractional[0]; - f_t max_score = std::numeric_limits::lowest(); - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : pc_down_avg; - f_t pc_up = pc_num_up[j] != 0 ? pc_sum_up[j] / pc_num_up[j] : pc_up_avg; - const f_t f_down = solution[j] - std::floor(solution[j]); - const f_t f_up = std::ceil(solution[j]) - solution[j]; - f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); - if (score > max_score) { - max_score = score; - branch_var = j; - } - } - - return branch_var; -} - -// Objective estimate using pseudo-costs (lock-free implementation) -// Returns lower_bound + estimated cost to reach integer feasibility -template -f_t obj_estimate_from_arrays(const f_t* pc_sum_down, - const f_t* pc_sum_up, - const i_t* pc_num_down, - const i_t* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution, - f_t lower_bound) -{ - auto [pc_down_avg, pc_up_avg] = - compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - f_t estimate = lower_bound; - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : pc_down_avg; - f_t pc_up = pc_num_up[j] != 0 ? 
pc_sum_up[j] / pc_num_up[j] : pc_up_avg; - const f_t f_down = solution[j] - std::floor(solution[j]); - const f_t f_up = std::ceil(solution[j]) - solution[j]; - estimate += std::min(std::max(pc_down * f_down, eps), std::max(pc_up * f_up, eps)); - } - - return estimate; -} - -template -branch_variable_t pseudocost_diving_from_arrays(const MaybeWrappedF* pc_sum_down, - const MaybeWrappedF* pc_sum_up, - const MaybeWrappedI* pc_num_down, - const MaybeWrappedI* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution, - const std::vector& root_solution) -{ - const i_t num_fractional = fractional.size(); - if (num_fractional == 0) return {-1, rounding_direction_t::NONE}; - - auto avgs = compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - i_t branch_var = fractional[0]; - f_t max_score = std::numeric_limits::lowest(); - rounding_direction_t round_dir = rounding_direction_t::DOWN; - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; - f_t pc_down = pc_num_down[j] != 0 ? (f_t)pc_sum_down[j] / (f_t)pc_num_down[j] : avgs.down_avg; - f_t pc_up = pc_num_up[j] != 0 ? (f_t)pc_sum_up[j] / (f_t)pc_num_up[j] : avgs.up_avg; - - f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down); - f_t score_up = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up); - - f_t score = 0; - rounding_direction_t dir = rounding_direction_t::DOWN; - - f_t root_val = (j < static_cast(root_solution.size())) ? 
root_solution[j] : solution[j]; - - if (solution[j] < root_val - f_t(0.4)) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else if (solution[j] > root_val + f_t(0.4)) { - score = score_up; - dir = rounding_direction_t::UP; - } else if (f_down < f_t(0.3)) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else if (f_down > f_t(0.7)) { - score = score_up; - dir = rounding_direction_t::UP; - } else if (pc_down < pc_up + eps) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else { - score = score_up; - dir = rounding_direction_t::UP; - } - - if (score > max_score) { - max_score = score; - branch_var = j; - round_dir = dir; - } - } - - if (round_dir == rounding_direction_t::NONE) { - branch_var = fractional[0]; - round_dir = rounding_direction_t::DOWN; - } - - return {branch_var, round_dir}; -} - -template -branch_variable_t guided_diving_from_arrays(const MaybeWrappedF* pc_sum_down, - const MaybeWrappedF* pc_sum_up, - const MaybeWrappedI* pc_num_down, - const MaybeWrappedI* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution, - const std::vector& incumbent) -{ - const i_t num_fractional = fractional.size(); - if (num_fractional == 0) return {-1, rounding_direction_t::NONE}; - - auto avgs = compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - i_t branch_var = fractional[0]; - f_t max_score = std::numeric_limits::lowest(); - rounding_direction_t round_dir = rounding_direction_t::DOWN; - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; - f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j])); - f_t up_dist = std::abs(std::ceil(solution[j]) - incumbent[j]); - rounding_direction_t dir = - down_dist < up_dist + eps ? rounding_direction_t::DOWN : rounding_direction_t::UP; - - f_t pc_down = pc_num_down[j] != 0 ? 
(f_t)pc_sum_down[j] / (f_t)pc_num_down[j] : avgs.down_avg; - f_t pc_up = pc_num_up[j] != 0 ? (f_t)pc_sum_up[j] / (f_t)pc_num_up[j] : avgs.up_avg; - - f_t score1 = dir == rounding_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up; - f_t score2 = dir == rounding_direction_t::DOWN ? pc_up * f_up : pc_down * f_down; - f_t score = (score1 + score2) / 6; - - if (score > max_score) { - max_score = score; - branch_var = j; - round_dir = dir; - } - } - - return {branch_var, round_dir}; -} - -template -class pseudo_cost_snapshot_t { - public: - pseudo_cost_snapshot_t() = default; - - pseudo_cost_snapshot_t(std::vector sum_down, - std::vector sum_up, - std::vector num_down, - std::vector num_up) - : sum_down_(std::move(sum_down)), - sum_up_(std::move(sum_up)), - num_down_(std::move(num_down)), - num_up_(std::move(num_up)) - { - } - - i_t variable_selection(const std::vector& fractional, const std::vector& solution) const - { - return variable_selection_from_pseudo_costs(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution); - } - - f_t obj_estimate(const std::vector& fractional, - const std::vector& solution, - f_t lower_bound) const - { - return obj_estimate_from_arrays(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution, - lower_bound); - } - - branch_variable_t pseudocost_diving(const std::vector& fractional, - const std::vector& solution, - const std::vector& root_solution) const - { - return pseudocost_diving_from_arrays(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution, - root_solution); - } - - branch_variable_t guided_diving(const std::vector& fractional, - const std::vector& solution, - const std::vector& incumbent) const - { - return guided_diving_from_arrays(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution, - 
incumbent); - } - - void queue_update( - i_t variable, rounding_direction_t direction, f_t delta, double clock, int worker_id) - { - updates_.push_back({variable, direction, delta, clock, worker_id}); - if (direction == rounding_direction_t::DOWN) { - sum_down_[variable] += delta; - num_down_[variable]++; - } else { - sum_up_[variable] += delta; - num_up_[variable]++; - } - } - - std::vector> take_updates() - { - std::vector> result; - result.swap(updates_); - return result; - } - - i_t n_vars() const { return (i_t)sum_down_.size(); } - - std::vector sum_down_; - std::vector sum_up_; - std::vector num_down_; - std::vector num_up_; - - private: - std::vector> updates_; -}; - template struct reliability_branching_settings_t { // Lower bound for the maximum number of LP iterations for a single trial branching @@ -413,6 +65,12 @@ struct reliability_branching_settings_t { bool rank_candidates_with_dual_pivot = true; }; +template +struct branch_variable_t { + i_t variable; + branch_direction_t direction; +}; + template struct batch_pdlp_warm_cache_t { const raft::handle_t batch_pdlp_handle{}; @@ -425,41 +83,63 @@ struct batch_pdlp_warm_cache_t { bool populated{false}; }; +template +struct pseudo_cost_update_t { + i_t variable; + branch_direction_t direction; + f_t delta; + double work_timestamp; + int worker_id; + + bool operator<(const pseudo_cost_update_t& other) const + { + if (work_timestamp != other.work_timestamp) return work_timestamp < other.work_timestamp; + if (variable != other.variable) return variable < other.variable; + if (delta != other.delta) return delta < other.delta; + return worker_id < other.worker_id; + } +}; + template class pseudo_costs_t { public: - explicit pseudo_costs_t(i_t num_variables) - : pseudo_cost_sum_down(num_variables), + explicit pseudo_costs_t(i_t num_variables, const simplex_solver_settings_t& settings) + : settings(settings), + pseudo_cost_sum_down(num_variables), pseudo_cost_sum_up(num_variables), 
pseudo_cost_num_down(num_variables), pseudo_cost_num_up(num_variables), pseudo_cost_mutex_up(num_variables), pseudo_cost_mutex_down(num_variables), - AT(1, 1, 1) + AT(std::make_shared>(1, 1, 1)), + pdlp_warm_cache(std::make_shared>()) { } - void update_pseudo_costs(mip_node_t* node_ptr, f_t leaf_objective); + pseudo_costs_t(const pseudo_costs_t& other) : pseudo_costs_t(1, other.settings) + { + *this = other; + } - pseudo_cost_snapshot_t create_snapshot() const + pseudo_costs_t& operator=(const pseudo_costs_t& other) { - const i_t n = (i_t)pseudo_cost_sum_down.size(); - std::vector sd(n), su(n); - std::vector nd(n), nu(n); - for (i_t j = 0; j < n; ++j) { - sd[j] = pseudo_cost_sum_down[j]; - su[j] = pseudo_cost_sum_up[j]; - nd[j] = pseudo_cost_num_down[j]; - nu[j] = pseudo_cost_num_up[j]; + if (this != &other) { + this->AT = other.AT; + this->pdlp_warm_cache = other.pdlp_warm_cache; + this->pseudo_cost_num_down = other.pseudo_cost_num_down; + this->pseudo_cost_num_up = other.pseudo_cost_num_up; + this->pseudo_cost_sum_down = other.pseudo_cost_sum_down; + this->pseudo_cost_sum_up = other.pseudo_cost_sum_up; } - return pseudo_cost_snapshot_t( - std::move(sd), std::move(su), std::move(nd), std::move(nu)); + return *this; } + void update_pseudo_costs(mip_node_t* node_ptr, f_t leaf_objective); + void merge_updates(const std::vector>& updates) { for (const auto& upd : updates) { - if (upd.direction == rounding_direction_t::DOWN) { + if (upd.direction == branch_direction_t::DOWN) { pseudo_cost_sum_down[upd.variable] += upd.delta; pseudo_cost_num_down[upd.variable]++; } else { @@ -479,33 +159,42 @@ class pseudo_costs_t { pseudo_cost_mutex_down.resize(num_variables); } - void initialized(i_t& num_initialized_down, - i_t& num_initialized_up, - f_t& pseudo_cost_down_avg, - f_t& pseudo_cost_up_avg) const; + f_t get_pseudocost_down(i_t j, f_t avg) const + { + i_t num = pseudo_cost_num_down[j]; + f_t sum = pseudo_cost_sum_down[j]; + return num > 0 ? 
sum / num : avg; + } + + f_t get_pseudocost_up(i_t j, f_t avg) const + { + i_t num = pseudo_cost_num_up[j]; + f_t sum = pseudo_cost_sum_up[j]; + return num > 0 ? sum / num : avg; + } + + f_t compute_pseudocost_average_down(); + f_t compute_pseudocost_average_up(); f_t obj_estimate(const std::vector& fractional, const std::vector& solution, - f_t lower_bound, - logger_t& log); + f_t lower_bound); - i_t variable_selection(const std::vector& fractional, - const std::vector& solution, - logger_t& log); + i_t variable_selection(const std::vector& fractional, const std::vector& solution); i_t reliable_variable_selection(const mip_node_t* node_ptr, const std::vector& fractional, branch_and_bound_worker_t* worker, const std::vector& var_types, const branch_and_bound_stats_t& bnb_stats, - const simplex_solver_settings_t& settings, f_t upper_bound, int max_num_tasks, - logger_t& log, const std::vector& new_slacks, const lp_problem_t& original_lp); void update_pseudo_costs_from_strong_branching(const std::vector& fractional, + const std::vector& strong_branch_down, + const std::vector& strong_branch_up, const std::vector& root_soln); uint32_t compute_state_hash() const @@ -514,31 +203,68 @@ class pseudo_costs_t { detail::compute_hash(pseudo_cost_num_down) ^ detail::compute_hash(pseudo_cost_num_up); } - uint32_t compute_strong_branch_hash() const - { - return detail::compute_hash(strong_branch_down) ^ detail::compute_hash(strong_branch_up); - } - f_t calculate_pseudocost_score(i_t j, const std::vector& solution, - f_t pseudo_cost_up_avg, - f_t pseudo_cost_down_avg) const; + f_t avg_down, + f_t avg_up) const; + + std::shared_ptr> AT; // Transpose of the constraint matrix A + std::shared_ptr> pdlp_warm_cache; reliability_branching_settings_t reliability_branching_settings; + simplex_solver_settings_t settings; - csc_matrix_t AT; // Transpose of the constraint matrix A + protected: std::vector> pseudo_cost_sum_up; std::vector> pseudo_cost_sum_down; std::vector> 
pseudo_cost_num_up; std::vector> pseudo_cost_num_down; - std::vector strong_branch_down; - std::vector strong_branch_up; std::vector pseudo_cost_mutex_up; std::vector pseudo_cost_mutex_down; - omp_atomic_t num_strong_branches_completed = 0; - omp_atomic_t strong_branching_lp_iter = 0; - batch_pdlp_warm_cache_t pdlp_warm_cache; + omp_atomic_t strong_branching_lp_iter = 0; +}; + +template +class pseudo_cost_snapshot_t : public pseudo_costs_t { + public: + using Base = pseudo_costs_t; + using Base::Base; + + pseudo_cost_snapshot_t(const pseudo_costs_t& other) : Base(1, other.settings) + { + Base::operator=(other); + } + + pseudo_cost_snapshot_t operator=(const pseudo_costs_t& other) + { + return Base::operator=(other); + } + + void queue_update( + i_t variable, branch_direction_t direction, f_t delta, double clock, int worker_id) + { + updates_.push_back({variable, direction, delta, clock, worker_id}); + if (direction == branch_direction_t::DOWN) { + this->pseudo_cost_sum_down[variable] += delta; + ++this->pseudo_cost_num_down[variable]; + } else { + this->pseudo_cost_sum_up[variable] += delta; + ++this->pseudo_cost_num_up[variable]; + } + } + + std::vector> take_updates() + { + std::vector> result; + result.swap(updates_); + return result; + } + + i_t n_vars() const { return this->pseudo_cost_sum_down.size(); } + + private: + std::vector> updates_; }; template diff --git a/cpp/src/branch_and_bound/worker.hpp b/cpp/src/branch_and_bound/worker.hpp new file mode 100644 index 0000000000..acb8ccca87 --- /dev/null +++ b/cpp/src/branch_and_bound/worker.hpp @@ -0,0 +1,293 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include + +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +struct branch_and_bound_stats_t { + f_t start_time = 0.0; + omp_atomic_t total_lp_solve_time = 0.0; + omp_atomic_t nodes_explored = 0; + omp_atomic_t nodes_unexplored = 0; + omp_atomic_t total_lp_iters = 0; + omp_atomic_t nodes_since_last_log = 0; + omp_atomic_t last_log = 0.0; +}; + +template +bool is_search_strategy_enabled(search_strategy_t strategy, + bool has_incumbent, + diving_heuristics_settings_t settings) +{ + switch (strategy) { + case BEST_FIRST: return true; + case PSEUDOCOST_DIVING: return settings.pseudocost_diving != 0; + case LINE_SEARCH_DIVING: return settings.line_search_diving != 0; + case GUIDED_DIVING: return settings.guided_diving != 0 && has_incumbent; + case COEFFICIENT_DIVING: return settings.coefficient_diving != 0; + default: return false; + } +} + +template +class branch_and_bound_worker_t { + public: + using float_type = f_t; + using int_type = i_t; + + i_t worker_id; + omp_atomic_t search_strategy; + omp_atomic_t is_active; + omp_atomic_t lower_bound; + + lp_problem_t leaf_problem; + lp_solution_t leaf_solution; + std::vector leaf_vstatus; + std::vector leaf_edge_norms; + + basis_update_mpf_t basis_factors; + std::vector basic_list; + std::vector nonbasic_list; + + bounds_strengthening_t node_presolver; + std::vector bounds_changed; + + std::vector start_lower; + std::vector start_upper; + + pcgenerator_t rng; + + bool recompute_basis = true; + bool recompute_bounds = true; + + branch_and_bound_worker_t(i_t worker_id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) + : worker_id(worker_id), + search_strategy(BEST_FIRST), + is_active(false), + lower_bound(-std::numeric_limits::infinity()), + 
leaf_problem(original_lp), + leaf_solution(original_lp.num_rows, original_lp.num_cols), + leaf_vstatus(original_lp.num_cols), + basis_factors(original_lp.num_rows, settings.refactor_frequency), + basic_list(original_lp.num_rows), + nonbasic_list(), + node_presolver(leaf_problem, Arow, {}, var_type), + bounds_changed(original_lp.num_cols, false), + rng(settings.random_seed + pcgenerator_t::default_seed + rng_offset + worker_id, + pcgenerator_t::default_stream ^ (worker_id + rng_offset)) + { + } + + // Set the variables bounds for the LP relaxation in the current node. + bool set_lp_variable_bounds(mip_node_t* node_ptr, + const simplex_solver_settings_t& settings) + { + // Reset the bound_changed markers + std::fill(bounds_changed.begin(), bounds_changed.end(), false); + + // Set the correct bounds for the leaf problem + if (recompute_bounds) { + leaf_problem.lower = start_lower; + leaf_problem.upper = start_upper; + node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed); + + } else { + node_ptr->update_branched_variable_bounds( + leaf_problem.lower, leaf_problem.upper, bounds_changed); + } + + return node_presolver.bounds_strengthening( + settings, bounds_changed, leaf_problem.lower, leaf_problem.upper); + } + + void set_active() { is_active = true; } +}; + +template +class bfs_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + bfs_worker_t(i_t worker_id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) + : Base(worker_id, original_lp, Arow, var_type, settings, rng_offset) + { + this->start_lower = original_lp.lower; + this->start_upper = original_lp.upper; + this->search_strategy = BEST_FIRST; + + max_diving_workers.fill(0); + active_diving_workers.fill(0); + total_active_diving_workers = 0; + } + + void init(const std::vector*>& nodes) + { + 
assert(!this->is_active.load()); + assert(node_queue.best_first_queue_size() == 0); + assert(nodes.size() > 0); + + for (auto* node : nodes) { + assert(node != nullptr); + node_queue.push(node); + } + + this->lower_bound = node_queue.get_lower_bound(); + } + + void set_inactive() { this->is_active = false; } + + // Steal nodes from another worker + bool steal_node_from(bfs_worker_t* other, i_t num_nodes) + { + bool steal = false; + assert(num_nodes > 0); + + if (!other->is_active || this == other || + other->node_queue.best_first_queue_size() < 2 * num_nodes) { + return steal; + } + + while (num_nodes > 0) { + other->node_queue.lock(); + mip_node_t* node = other->node_queue.best_first_queue_size() > num_nodes + ? other->node_queue.pop_best_first() + : nullptr; + other->node_queue.unlock(); + if (node == nullptr) { break; } + + this->node_queue.lock(); + this->node_queue.push(node); + this->node_queue.unlock(); + --num_nodes; + steal = true; + } + + return steal; + } + + // Calculate the number of diving workers that this worker can launch. Having a fixed number + // of workers allows the solver to be more deterministic. 
+ void calculate_num_diving_workers(i_t num_bfs_workers, + i_t total_diving_workers, + bool has_incumbent, + const diving_heuristics_settings_t& settings) + { + i_t num_active = 0; + for (i_t i = 1; i < num_search_strategies; ++i) { + num_active += is_search_strategy_enabled(search_strategies[i], has_incumbent, settings); + } + + total_max_diving_workers = 0; + max_diving_workers.fill(0); + if (num_active == 0) { return; } + + for (size_t i = 1, k = 0; i < num_search_strategies; ++i) { + if (is_search_strategy_enabled(search_strategies[i], has_incumbent, settings)) { + // Calculate the number of workers for a given diving heuristic + i_t start = std::floor((double)k * total_diving_workers / num_active); + i_t end = std::floor((double)(k + 1) * total_diving_workers / num_active); + i_t workers_per_type = end - start; + + // Calculate the number of diving workers allocated to this (best-first) worker + start = std::floor((double)this->worker_id * workers_per_type / num_bfs_workers); + end = std::floor((double)(this->worker_id + 1) * workers_per_type / num_bfs_workers); + max_diving_workers[i] = end - start; + total_max_diving_workers += max_diving_workers[i]; + ++k; + } + } + } + + // The worker-local node heap. + node_queue_t node_queue; + + // The number of diving workers of each type that this (best-first) worker can launch. + std::array max_diving_workers; + + // The number of active diving workers of each type associated with this (best-first) worker. 
+ std::array, num_search_strategies> active_diving_workers; + + // Keep track of the total number of active diving workers that are associated with this + // (best-first) worker + omp_atomic_t total_active_diving_workers{0}; + + // The maximum number of diving workers that are associated with this + // (best-first) worker + i_t total_max_diving_workers{0}; +}; + +template +class diving_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + using Base::Base; + + // After calling this routine, you need to set `is_active = true` when the worker is ready. + // Note that the starting node may be dropped if it becomes infeasible via bound propagation. + void init(const mip_node_t* node, const lp_problem_t& original_lp) + { + // Creates a copy of the node that is disconnected from the main tree, such that the + // diving does not modify the main tree. We need to store the variable bounds + // associated with this node, since we cannot retrieve them from the tree + start_node = node->detach_copy(); + this->start_lower = original_lp.lower; + this->start_upper = original_lp.upper; + this->lower_bound = node->lower_bound; + std::fill(this->bounds_changed.begin(), this->bounds_changed.end(), false); + node->get_variable_bounds(this->start_lower, this->start_upper, this->bounds_changed); + } + + // Apply bound strengthening to the starting variable bounds + bool presolve_start_bounds(const simplex_solver_settings_t& settings) + { + return this->node_presolver.bounds_strengthening( + settings, this->bounds_changed, this->start_lower, this->start_upper); + } + + // Set this node inactive + void set_inactive() + { + assert(this->is_active.load()); + assert(bfs_worker != nullptr); + assert(bfs_worker->active_diving_workers[this->search_strategy].load() > 0); + assert(bfs_worker->total_active_diving_workers.load() > 0); + + this->is_active = false; + --bfs_worker->active_diving_workers[this->search_strategy]; + 
--bfs_worker->total_active_diving_workers; + } + + mip_node_t start_node; + + // The best-first worker that is associated with this diving worker. Used for controlling the + // number of active diving workers. + bfs_worker_t* bfs_worker{nullptr}; +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/worker_pool.hpp b/cpp/src/branch_and_bound/worker_pool.hpp new file mode 100644 index 0000000000..9396f48c04 --- /dev/null +++ b/cpp/src/branch_and_bound/worker_pool.hpp @@ -0,0 +1,117 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +class worker_pool_t { + public: + using i_t = typename WorkerType::int_type; + using f_t = typename WorkerType::float_type; + + void init(i_t num_workers, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings, + const uint64_t rng_offset = 0) + { + assert(!is_initialized); + assert(num_workers > 0); + + workers_.resize(num_workers); + num_idle_workers_ = num_workers; + idle_workers_.clear_resize(num_workers); + for (i_t i = 0; i < num_workers; ++i) { + workers_[i] = + std::make_unique(i, original_lp, Arow, var_type, settings, rng_offset); + idle_workers_.push_back(i); + } + + is_initialized = true; + } + + WorkerType* pop_idle_worker() + { + std::lock_guard lock(mutex_); + if (idle_workers_.empty()) { + return nullptr; + } else { + i_t idx = idle_workers_.front(); + idle_workers_.pop_front(); + num_idle_workers_--; + assert(idle_workers_.size() == static_cast(num_idle_workers_.load())); + assert(idx >= 0 && static_cast(idx) < workers_.size()); + return workers_[idx].get(); + } + } + + void return_worker_to_pool(WorkerType* worker) + { + std::lock_guard 
lock(mutex_); + assert(worker != nullptr); + assert(workers_[worker->worker_id].get() == worker); + assert(!worker->is_active.load()); + assert(static_cast(num_idle_workers_.load()) == idle_workers_.size()); + assert(idle_workers_.size() <= workers_.size()); + + idle_workers_.push_back(worker->worker_id); + num_idle_workers_++; + } + + f_t get_lower_bound() + { + f_t lower_bound = std::numeric_limits::infinity(); + + if (is_initialized) { + for (i_t i = 0; i < workers_.size(); ++i) { + if (workers_[i]->is_active.load()) { + lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); + } + } + } + + return lower_bound; + } + + WorkerType* operator[](i_t id) + { + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); + } + WorkerType* operator[](i_t id) const + { + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); + } + + i_t num_idle_workers() const { return num_idle_workers_; } + i_t num_workers() const { return workers_.size(); } + + private: + std::vector> workers_; + bool is_initialized = false; + + omp_mutex_t mutex_; + circular_deque_t idle_workers_; + omp_atomic_t num_idle_workers_{0}; +}; + +template +using bfs_worker_pool_t = worker_pool_t>; + +template +using diving_worker_pool_t = worker_pool_t>; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp index d69bf8877e..dcb1615359 100644 --- a/cpp/src/dual_simplex/initial_basis.cpp +++ b/cpp/src/dual_simplex/initial_basis.cpp @@ -18,6 +18,90 @@ namespace cuopt::linear_programming::dual_simplex { +uint8_t encode(variable_status_t vstatus) +{ + assert(vstatus != variable_status_t::SUPERBASIC && + "packed_variable_status_t does not support superbasic variables"); + + uint8_t val = 0; + switch (vstatus) { + case variable_status_t::BASIC: val = 0b00; break; + case 
variable_status_t::NONBASIC_LOWER: val = 0b01; break; + case variable_status_t::NONBASIC_UPPER: val = 0b10; break; + case variable_status_t::NONBASIC_FIXED: val = 0b01; break; + default: val = 0b11; + } + + return val; +} + +variable_status_t decode(uint8_t val) +{ + val &= 0b11; + if (val == 0b00) return variable_status_t::BASIC; + if (val == 0b01) return variable_status_t::NONBASIC_LOWER; + if (val == 0b10) return variable_status_t::NONBASIC_UPPER; + return variable_status_t::NONBASIC_FREE; +} + +std::vector compress_vstatus(const std::vector& vstatus) +{ + size_t n = vstatus.size() / 4; + size_t has_tail = (vstatus.size() % 4 > 0); + + std::vector packed_vstatus; + packed_vstatus.resize(n + has_tail); + + for (size_t i = 0; i < n; ++i) { + size_t j = i * 4; + packed_vstatus[i] = 0; + packed_vstatus[i] |= encode(vstatus[j]); + packed_vstatus[i] |= encode(vstatus[j + 1]) << 2; + packed_vstatus[i] |= encode(vstatus[j + 2]) << 4; + packed_vstatus[i] |= encode(vstatus[j + 3]) << 6; + } + + if (has_tail) { + size_t j = n * 4; + packed_vstatus[n] = 0; + packed_vstatus[n] |= encode(vstatus[j]); + if (j + 1 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 1]) << 2; + if (j + 2 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 2]) << 4; + if (j + 3 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 3]) << 6; + } + + return packed_vstatus; +} + +std::vector decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size) +{ + size_t n = vstatus_size / 4; + size_t has_tail = (vstatus_size % 4 > 0); + + std::vector vstatus; + vstatus.resize(vstatus_size); + assert(packed_vstatus.size() == n + has_tail); + + for (size_t i = 0; i < n; ++i) { + size_t j = i * 4; + vstatus[j] = decode(packed_vstatus[i]); + vstatus[j + 1] = decode(packed_vstatus[i] >> 2); + vstatus[j + 2] = decode(packed_vstatus[i] >> 4); + vstatus[j + 3] = decode(packed_vstatus[i] >> 6); + } + + if (has_tail) { + size_t j = n * 4; + vstatus[j] = decode(packed_vstatus[n]); + if 
(j + 1 < vstatus.size()) vstatus[j + 1] = decode(packed_vstatus[n] >> 2); + if (j + 2 < vstatus.size()) vstatus[j + 2] = decode(packed_vstatus[n] >> 4); + if (j + 3 < vstatus.size()) vstatus[j + 3] = decode(packed_vstatus[n] >> 6); + } + + return vstatus; +} + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/initial_basis.hpp b/cpp/src/dual_simplex/initial_basis.hpp index 646785fbd2..22a172dbe0 100644 --- a/cpp/src/dual_simplex/initial_basis.hpp +++ b/cpp/src/dual_simplex/initial_basis.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -23,6 +23,10 @@ enum class variable_status_t : int8_t { SUPERBASIC = 4 }; +std::vector compress_vstatus(const std::vector& vstatus); +std::vector decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size); + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index cfc120e477..0f35a7d479 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -110,6 +110,9 @@ struct simplex_solver_settings_t { mip_batch_pdlp_reliability_branching(0), strong_branching_simplex_iteration_limit(-1), random_seed(0), + bnb_steal_chance(-1), + bnb_nodes_per_steal(10), + bnb_max_steal_attempts(3), reliability_branching(-1), inside_mip(0), sub_mip(0), @@ -198,13 +201,21 @@ struct simplex_solver_settings_t { // PDLP only // Set the maximum number of simplex iterations allowed per trial branch when applying // strong branching to the root node. 
- // -1 - Automatic (iteration limit = 200) - // 0, 1 - Estimate the objective change using a single pivot of dual simplex - // >1 - Set as the iteration limit in dual simplex + // -1 - automatic (iteration limit = 200) + // 0, 1 - estimate the objective change using a single pivot of dual simplex + // >1 - set as the iteration limit in dual simplex i_t strong_branching_simplex_iteration_limit; diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics + // In B&B, indicate the chance in which a worker can steal a node from another worker. + // -1 - automatic (0.05) + // 0 - disable + // >0 - set the stealing chance [0, 1] + f_t bnb_steal_chance; + i_t bnb_nodes_per_steal; + i_t bnb_max_steal_attempts; + // Settings for the reliability branching. // - -1: automatic // - 0: disable (use pseudocost branching instead) diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index ce6b602fba..f4d1735e18 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -356,6 +356,21 @@ solution_t mip_solver_t::run_solver() ? 
2 : context.settings.reduced_cost_strengthening; + char* steal_chance_str = std::getenv("CUOPT_BNB_STEAL_CHANCE"); + if (steal_chance_str != nullptr) { + branch_and_bound_settings.bnb_steal_chance = atof(steal_chance_str); + } + + char* max_steal_attempts_str = std::getenv("CUOPT_BNB_MAX_STEAL_ATTEMPTS"); + if (max_steal_attempts_str != nullptr) { + branch_and_bound_settings.bnb_max_steal_attempts = atoi(max_steal_attempts_str); + } + + char* nodes_per_steal_str = std::getenv("CUOPT_BNB_NODES_PER_STEAL"); + if (nodes_per_steal_str != nullptr) { + branch_and_bound_settings.bnb_nodes_per_steal = atoi(nodes_per_steal_str); + } + if (context.settings.num_cpu_threads < 0) { branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1); } else { diff --git a/cpp/src/utilities/circular_deque.hpp b/cpp/src/utilities/circular_deque.hpp new file mode 100644 index 0000000000..6e9d8f5b05 --- /dev/null +++ b/cpp/src/utilities/circular_deque.hpp @@ -0,0 +1,114 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +namespace cuopt { + +// A fixed-capacity double-ended queue backed by a circular buffer. +// All operations are O(1) with no dynamic allocation after construction. +// +// Preconditions (asserted in debug builds): +// - push_front / push_back : size() < capacity() +// - pop_front / pop_back : !empty() +// - front / back : !empty() +template +class circular_deque_t { + public: + circular_deque_t() : buffer_(1), capacity_(1), head_(0), tail_(0) {} + + // Allocates storage for exactly `capacity` elements up front. 
+ explicit circular_deque_t(size_t capacity) + : buffer_(capacity + 1), // +1 to distinguish full (next(tail)==head) from empty (head==tail) + capacity_(capacity + 1), + head_(0), + tail_(0) + { + assert(capacity > 0); + } + + bool empty() const { return head_ == tail_; } + bool full() const { return next(tail_) == head_; } + + size_t size() const { return (tail_ - head_ + capacity_) % capacity_; } + size_t capacity() const { return capacity_ - 1; } + + void clear_resize(size_t new_capacity) + { + assert(new_capacity > 0); + head_ = 0; + tail_ = 0; + capacity_ = new_capacity + 1; + buffer_.resize(capacity_); + } + + void push_back(T val) + { + assert(!full()); + buffer_[tail_] = std::move(val); + tail_ = next(tail_); + } + + void push_front(T val) + { + assert(!full()); + head_ = prev(head_); + buffer_[head_] = std::move(val); + } + + T pop_front() + { + assert(!empty()); + T val = std::move(buffer_[head_]); + head_ = next(head_); + return val; + } + + T pop_back() + { + assert(!empty()); + tail_ = prev(tail_); + return std::move(buffer_[tail_]); + } + + T& front() + { + assert(!empty()); + return buffer_[head_]; + } + const T& front() const + { + assert(!empty()); + return buffer_[head_]; + } + + T& back() + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + const T& back() const + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + + private: + size_t next(size_t idx) const { return (idx + 1) % capacity_; } + size_t prev(size_t idx) const { return (idx + capacity_ - 1) % capacity_; } + + std::vector buffer_; + size_t capacity_; + size_t head_; + size_t tail_; +}; + +} // namespace cuopt diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index f6e66472dd..97816f3933 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -20,11 +20,9 @@ namespace cuopt { class omp_mutex_t { public: omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } - - omp_mutex_t(const 
omp_mutex_t&) = delete; - omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); } + omp_mutex_t(const omp_mutex_t&) = delete; omp_mutex_t& operator=(const omp_mutex_t&) = delete; omp_mutex_t& operator=(omp_mutex_t&& other) @@ -54,6 +52,15 @@ class omp_mutex_t { std::unique_ptr mutex; }; +// Empty class with the same methods as `omp_mutex_t`. This is mainly used for cleanly disabling +// the `omp_mutex_t` via type alias (`lock` and `unlock` are replaced by NOOPs). +class fake_omp_mutex_t { + public: + static void lock() {} + static void unlock() {} + static bool try_lock() { return true; } +}; + // Wrapper for omp atomic operations. See // https://www.openmp.org/spec-html/5.1/openmpsu105.html. template @@ -117,20 +124,17 @@ class omp_atomic_t { T fetch_sub(T inc) { return fetch_add(-inc); } + // Get the underlying value without atomics + T& underlying() { return val; } + T underlying() const { return val; } + private: T val; -#ifndef __NVCC__ friend double fetch_min(omp_atomic_t& atomic_var, double other); friend double fetch_max(omp_atomic_t& atomic_var, double other); -#endif }; -// Atomic CAS are only supported in OpenMP v5.1 -// (gcc 12+ or clang 14+), however, nvcc (or the host compiler) cannot -// parse it correctly yet -#ifndef __NVCC__ - // Free non-template functions are necessary because of a clang 20 bug // when omp atomic compare is used within a templated context. // see https://github.com/llvm/llvm-project/issues/127466 @@ -155,7 +159,6 @@ inline double fetch_max(omp_atomic_t& atomic_var, double other) } return old; } -#endif #endif diff --git a/cpp/src/utilities/pcgenerator.hpp b/cpp/src/utilities/pcgenerator.hpp index 29a865f02f..e83e5f36ad 100644 --- a/cpp/src/utilities/pcgenerator.hpp +++ b/cpp/src/utilities/pcgenerator.hpp @@ -21,12 +21,11 @@ class pcgenerator_t { static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL; /** - * @brief ctor. 
Initializes the PCG - * @param rng_state is the generator state used for initializing the generator - * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences - * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a - * unique set of random numbers. This can be achieved by initializing the generator with same - * rng_state for all the threads and diststreamt values for subsequence. + * @brief Initializes the PCG generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. Use distinct values per + * thread to guarantee non-overlapping streams in parallel contexts. + * @param offset Number of outputs to skip ahead before the first draw. */ pcgenerator_t(const uint64_t seed = default_seed, const uint64_t subsequence = default_stream, @@ -35,7 +34,12 @@ class pcgenerator_t { set_seed(seed, subsequence, offset); } - // Set the seed, subsequence and offset of the PCG + /** + * @brief Re-seeds the generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. + * @param offset Number of outputs to skip ahead before the first draw. + */ void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0) { state = uint64_t(0); @@ -47,8 +51,12 @@ class pcgenerator_t { skipahead(offset); } - // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown - // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf + /** + * @brief Advances the generator state by @p offset steps in O(log offset) time. + * + * Uses the closed-form LCG jump described in "Random Number Generation with Arbitrary Strides" + * (F. B. Brown, https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf). 
+ */ void skipahead(uint64_t offset) { uint64_t G = 1; @@ -68,9 +76,7 @@ class pcgenerator_t { } /** - * @defgroup NextRand Generate the next random number - * @brief This code is derived from PCG basic code - * @{ + * @returns the next uniformly distributed 32-bit unsigned integer. */ uint32_t next_u32() { @@ -83,6 +89,9 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed 64-bit unsigned integer. + */ uint64_t next_u64() { uint64_t ret; @@ -93,6 +102,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 32-bit signed integer in [0, + * INT32_MAX]. + */ int32_t next_i32() { int32_t ret; @@ -102,6 +115,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 64-bit signed integer in [0, + * INT64_MAX]. + */ int64_t next_i64() { int64_t ret; @@ -111,10 +128,19 @@ class pcgenerator_t { return ret; } - float next_float() { return static_cast((next_u32() >> 8) * 0x1.0p-24); } + /** + * @returns a uniformly distributed float in [0, 1). + */ + float next_float() { return (next_u32() >> 8) * 0x1.0p-24; } - double next_double() { return static_cast((next_u64() >> 11) * 0x1.0p-53); } + /** + * @returns a uniformly distributed double in [0, 1). + */ + double next_double() { return (next_u64() >> 11) * 0x1.0p-53; } + /** + * @returns the next random value of type @p T. + */ template T next() { @@ -130,9 +156,11 @@ class pcgenerator_t { void next(float& ret) { ret = next_float(); } void next(double& ret) { ret = next_double(); } - /// Draws a sample from a uniform distribution. The samples are uniformly distributed over - /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward - /// some numbers in the range (scaling by floating-point). + /** + * @brief Draws a sample from a uniform distribution over `[low, high)`. + * + * May have a slight bias toward some values due to floating-point scaling. 
+ */ template T uniform(T low, T high) { @@ -141,7 +169,9 @@ class pcgenerator_t { return low + (val * range); } - // Shuffles the contents of a sequence using the Fisher–Yates algorithm. + /** + * @brief Shuffles @p seq in-place using the Fisher-Yates algorithm. + */ template void shuffle(std::vector& seq) {