diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index e69ff7b9a..d0af517e6 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -2507,7 +2507,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_);
 
   pc_.resize(original_lp_.num_cols);
-  original_lp_.A.transpose(pc_.AT);
+  pc_.Arow = Arow_;
   {
     raft::common::nvtx::range scope_sb("BB::strong_branching");
     strong_branching<i_t, f_t>(original_lp_,
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index c38e98e27..aa059b532 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -66,14 +66,14 @@ template <typename i_t, typename f_t>
 objective_change_estimate_t<f_t> single_pivot_objective_change_estimate(
   const lp_problem_t<i_t, f_t>& lp,
   const simplex_solver_settings_t<i_t, f_t>& settings,
-  const csc_matrix_t<i_t, f_t>& A_transpose,
+  const csr_matrix_t<i_t, f_t>& Arow,
   const std::vector<variable_status_t>& vstatus,
   i_t variable_j,
   i_t basic_j,
   const lp_solution_t<i_t, f_t>& lp_solution,
   const std::vector<i_t>& basic_list,
   const std::vector<i_t>& nonbasic_list,
-  const std::vector<i_t>& nonbasic_mark,
+  const std::vector<i_t>& nonbasic_end,
   basis_update_mpf_t<i_t, f_t>& basis_factors,
   std::vector<i_t>& workspace,
   std::vector<f_t>& delta_z,
@@ -82,7 +82,6 @@ objective_change_estimate_t<f_t> single_pivot_objective_change_estimate(
   // Compute the objective estimate for the down and up branches of variable j
   assert(variable_j >= 0);
   assert(basic_j >= 0);
-
   // Down branch
   i_t direction = -1;
   sparse_vector_t<i_t, f_t> e_k(lp.num_rows, 0);
@@ -104,11 +103,11 @@ objective_change_estimate_t<f_t> single_pivot_objective_change_estimate(
   std::vector<i_t> delta_z_indices;
   // delta_z starts out all zero
   if (use_transpose) {
-    compute_delta_z(A_transpose,
+    compute_delta_z(Arow,
                     delta_y,
                     variable_j,
                     direction,
-                    nonbasic_mark,
+                    nonbasic_end,
                     workspace,
                     delta_z_indices,
                     delta_z,
@@ -234,10 +233,8 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t<i_t, f_t>& lp,
     basic_map[basic_list[i]] = i;
   }
 
-  std::vector<i_t> nonbasic_mark(n, -1);
-  for (i_t i = 0; i < n - m; i++) {
-    nonbasic_mark[nonbasic_list[i]] = i;
-  }
+  std::vector<i_t> nonbasic_end(lp.num_rows);
+  compute_initial_nonbasic_end(basic_map, pc.Arow, nonbasic_end);
 
   for (i_t k = 0; k < fractional.size(); k++) {
     const i_t j = fractional[k];
@@ -246,14 +243,14 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t<i_t, f_t>& lp,
     objective_change_estimate_t<f_t> estimate =
       single_pivot_objective_change_estimate(lp,
                                              settings,
-                                             pc.AT,
+                                             pc.Arow,
                                              vstatus,
                                              j,
                                              basic_map[j],
                                              lp_solution,
                                              basic_list,
                                              nonbasic_list,
-                                             nonbasic_mark,
+                                             nonbasic_end,
                                              basis_factors,
                                              workspace,
                                              delta_z,
@@ -1476,10 +1473,13 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
         basic_map[worker->basic_list[i]] = i;
       }
 
-      std::vector<i_t> nonbasic_mark(n, -1);
-      for (i_t i = 0; i < n - m; i++) {
-        nonbasic_mark[worker->nonbasic_list[i]] = i;
-      }
+      // Each thread will have a different basis
+      // So we need to make a copy of Arow before we permute the columns
+      // so that nonbasic variables appear first
+      csr_matrix_t<i_t, f_t> local_Arow = Arow;
+
+      std::vector<i_t> nonbasic_end(m);
+      compute_initial_nonbasic_end(basic_map, local_Arow, nonbasic_end);
 
       for (auto& [score, j] : unreliable_list) {
         if (pseudo_cost_num_down[j] == 0 || pseudo_cost_num_up[j] == 0) {
@@ -1487,14 +1487,14 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
           objective_change_estimate_t<f_t> estimate =
             single_pivot_objective_change_estimate(worker->leaf_problem,
                                                    settings,
-                                                   AT,
+                                                   local_Arow,
                                                    node_ptr->vstatus,
                                                    j,
                                                    basic_map[j],
                                                    leaf_solution,
                                                    worker->basic_list,
                                                    worker->nonbasic_list,
-                                                   nonbasic_mark,
+                                                   nonbasic_end,
                                                    worker->basis_factors,
                                                    workspace,
                                                    delta_z,
diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp
index 009bd8b81..43d6448e6 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.hpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.hpp
@@ -435,7 +435,7 @@ class pseudo_costs_t {
       pseudo_cost_num_up(num_variables),
       pseudo_cost_mutex_up(num_variables),
       pseudo_cost_mutex_down(num_variables),
-      AT(1, 1, 1)
+      Arow(1, 1, 1)
   {
   }
 
@@ -526,7 +526,7 @@ class pseudo_costs_t {
 
   reliability_branching_settings_t<i_t, f_t> reliability_branching_settings;
 
-  csc_matrix_t<i_t, f_t> AT;  // Transpose of the constraint matrix A
+  csr_matrix_t<i_t, f_t> Arow;
   std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_up;
   std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_down;
   std::vector<omp_atomic_t<i_t>> pseudo_cost_num_up;
diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp
index c5fee4e10..caa2351ef 100644
--- a/cpp/src/dual_simplex/basis_solves.cpp
+++ b/cpp/src/dual_simplex/basis_solves.cpp
@@ -380,7 +380,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
         }
         work_estimate += 3 * Sdim;
 
-        Srank = right_looking_lu(S,
+        Srank = right_looking_lu2(S,
                                  settings,
                                  settings.threshold_partial_pivoting_tol,
                                  identity,
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 75e5ecae3..30285048a 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -130,79 +130,136 @@ void compute_reduced_cost_update(const lp_problem_t<i_t, f_t>& lp,
 }
 
 template <typename i_t, typename f_t>
-void compute_delta_z(const csc_matrix_t<i_t, f_t>& A_transpose,
+void compute_delta_z(const csr_matrix_t<i_t, f_t>& Arow,
                      const sparse_vector_t<i_t, f_t>& delta_y,
                      i_t leaving_index,
                      i_t direction,
-                     const std::vector<i_t>& nonbasic_mark,
+                     const std::vector<i_t>& nonbasic_end,
                      std::vector<i_t>& delta_z_mark,
                      std::vector<i_t>& delta_z_indices,
                      std::vector<f_t>& delta_z,
                      f_t& work_estimate)
 {
   // delta_zN = - N'*delta_y
-  const i_t nz_delta_y   = delta_y.i.size();
-  size_t nnz_processed   = 0;
-  size_t nonbasic_marked = 0;
+  const i_t nz_delta_y = delta_y.i.size();
+  size_t nnz_processed = 0;
   for (i_t k = 0; k < nz_delta_y; k++) {
     const i_t i         = delta_y.i[k];
     const f_t delta_y_i = delta_y.x[k];
     if (std::abs(delta_y_i) < 1e-12) { continue; }
-    const i_t row_start = A_transpose.col_start[i];
-    const i_t row_end   = A_transpose.col_start[i + 1];
+    const i_t row_start = Arow.row_start[i];
+    const i_t row_end   = nonbasic_end[i] + 1;
     nnz_processed += row_end - row_start;
     for (i_t p = row_start; p < row_end; ++p) {
-      const i_t j = A_transpose.i[p];
-      if (nonbasic_mark[j] >= 0) {
-        delta_z[j] -= delta_y_i * A_transpose.x[p];
-        nonbasic_marked++;
-        if (!delta_z_mark[j]) {
-          delta_z_mark[j] = 1;
-          delta_z_indices.push_back(j);
-        }
+      const i_t j = Arow.j[p];
+      delta_z[j] -= delta_y_i * Arow.x[p];
+      if (!delta_z_mark[j]) {
+        delta_z_mark[j] = 1;
+        delta_z_indices.push_back(j);
       }
     }
   }
   work_estimate += 4 * nz_delta_y;
-  work_estimate += 2 * nnz_processed;
-  work_estimate += 3 * nonbasic_marked;
+  work_estimate += 4 * nnz_processed;
   work_estimate += 2 * delta_z_indices.size();
 
   // delta_zB = sigma*ei
   delta_z[leaving_index] = direction;
+}
 
-#ifdef CHECK_CHANGE_IN_REDUCED_COST
-  const i_t m = A_transpose.n;
-  const i_t n = A_transpose.m;
-  std::vector<f_t> delta_y_dense(m);
-  delta_y.to_dense(delta_y_dense);
-  std::vector<f_t> delta_z_check(n);
-  std::vector<i_t> delta_z_mark_check(n, 0);
-  std::vector<i_t> delta_z_indices_check;
-  phase2::compute_reduced_cost_update(lp,
-                                      basic_list,
-                                      nonbasic_list,
-                                      delta_y_dense,
-                                      leaving_index,
-                                      direction,
-                                      delta_z_mark_check,
-                                      delta_z_indices_check,
-                                      delta_z_check,
-                                      work_estimate);
-  f_t error_check = 0.0;
-  for (i_t k = 0; k < n; ++k) {
-    const f_t diff = std::abs(delta_z[k] - delta_z_check[k]);
-    if (diff > 1e-6) {
-      printf("delta_z error %d transpose %e no transpose %e diff %e\n",
-             k,
-             delta_z[k],
-             delta_z_check[k],
-             diff);
+template <typename i_t, typename f_t>
+void compute_initial_nonbasic_end(const std::vector<i_t>& basic_mark,
+                                  csr_matrix_t<i_t, f_t>& Arow,
+                                  std::vector<i_t>& nonbasic_end)
+{
+  i_t m = Arow.m;
+  // Swap coefficients so that all coefficients for nonbasic variables in row i
+  // appear from Arow.row_start[i] to nonbasic_end[i]
+  for (i_t i = 0; i < m; i++) {
+    i_t lo = Arow.row_start[i];
+    i_t hi = Arow.row_start[i + 1] - 1;
+    while (lo <= hi) {
+      const i_t j = Arow.j[lo];
+      if (basic_mark[j] >= 0) {
+        // Swap coefficients
+        std::swap(Arow.j[lo], Arow.j[hi]);
+        std::swap(Arow.x[lo], Arow.x[hi]);
+        hi--;
+      } else {
+        lo++;
+      }
     }
-    error_check = std::max(error_check, diff);
+    nonbasic_end[i] = hi;
   }
-  if (error_check > 1e-6) { printf("delta_z error %e\n", error_check); }
-#endif
+}
+
+template <typename i_t, typename f_t>
+void update_Arow(i_t leaving,
+                 i_t entering,
+                 const csc_matrix_t<i_t, f_t>& A,
+                 std::vector<i_t>& row_mark,
+                 std::vector<i_t>& nonbasic_end,
+                 csr_matrix_t<i_t, f_t>& Arow,
+                 f_t& work_estimate)
+{
+  // Update the Arow matrix to reflect the new basis. So
+  // that the coefficients for all nonbasic variables in row i
+  // appear in Arow.row_start[i] to nonbasic_end[i].
+  // Swap the coefficients for the leaving and entering variables
+  // The leaving variable is now nonbasic, the entering variable is now basic
+  row_mark.clear();
+  const i_t col_start_enter = A.col_start[entering];
+  const i_t col_end_enter   = A.col_start[entering + 1];
+  for (i_t p = col_start_enter; p < col_end_enter; p++) {
+    const i_t i = A.i[p];
+    row_mark.push_back(i);
+  }
+  work_estimate += 2 * (col_end_enter - col_start_enter);
+
+  // Move the coefficients for the entering variable to the end of the nonbasics
+  // And decrement the nonbasic count for the row
+  for (i_t i : row_mark) {
+    const i_t row_start = Arow.row_start[i];
+    const i_t nb_end    = nonbasic_end[i];
+    for (i_t p = row_start; p <= nb_end; p++) {
+      const i_t j = Arow.j[p];
+      if (j == entering) {
+        std::swap(Arow.j[p], Arow.j[nb_end]);
+        std::swap(Arow.x[p], Arow.x[nb_end]);
+        nonbasic_end[i]--;
+        break;
+      }
+    }
+    work_estimate += nb_end - row_start;
+  }
+  work_estimate += 2 * row_mark.size();
+
+  row_mark.clear();
+  const i_t col_start_leave = A.col_start[leaving];
+  const i_t col_end_leave   = A.col_start[leaving + 1];
+  for (i_t p = col_start_leave; p < col_end_leave; p++) {
+    const i_t i = A.i[p];
+    row_mark.push_back(i);
+  }
+  work_estimate += 2 * (col_end_leave - col_start_leave);
+
+  // Move the coefficient for the leaving variable to the end of the nonbasics
+  // And increment the nonbasic count for the row
+  for (i_t i : row_mark) {
+    const i_t nb_end  = nonbasic_end[i];
+    const i_t row_end = Arow.row_start[i + 1];
+    for (i_t p = nb_end + 1; p < row_end; p++) {
+      const i_t j = Arow.j[p];
+      if (j == leaving) {
+        std::swap(Arow.j[p], Arow.j[nb_end + 1]);
+        std::swap(Arow.x[p], Arow.x[nb_end + 1]);
+        nonbasic_end[i]++;
+        break;
+      }
+    }
+    work_estimate += row_end - nb_end;
+  }
+  work_estimate += 2 * row_mark.size();
 }
 
 namespace phase2 {
@@ -807,7 +864,7 @@ void update_single_primal_infeasibility(const std::vector<f_t>& lower,
 }
 
 template <typename i_t, typename f_t>
-void update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
+bool update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                    const simplex_solver_settings_t<i_t, f_t>& settings,
                                    const std::vector<i_t>& basic_list,
                                    const std::vector<f_t>& x,
@@ -819,6 +876,7 @@ void update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                    f_t& primal_inf,
                                    f_t& work_estimate)
 {
+  bool became_feasible = false;
   const f_t primal_tol = settings.primal_tol;
   const i_t nz         = basic_change_list.size();
   for (i_t k = 0; k < nz; ++k) {
@@ -831,8 +889,10 @@ void update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
       const f_t old_val          = squared_infeasibilities[j];
       squared_infeasibilities[j] = 0.0;
       primal_inf                 = std::max(0.0, primal_inf - old_val);
+      if (old_val != 0.0) { became_feasible = true; }
       continue;
     }
+    const f_t old_val = squared_infeasibilities[j];
     update_single_primal_infeasibility(lp.lower,
                                        lp.upper,
                                        x,
@@ -841,8 +901,10 @@ void update_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                        infeasibility_indices,
                                        j,
                                        primal_inf);
+    if (old_val != 0.0 && squared_infeasibilities[j] == 0.0) { became_feasible = true; }
   }
   work_estimate += 8 * nz;
+  return became_feasible;
 }
 
 template <typename i_t, typename f_t>
@@ -850,33 +912,14 @@ void clean_up_infeasibilities(std::vector<f_t>& squared_infeasibilities,
                               std::vector<i_t>& infeasibility_indices,
                               f_t& work_estimate)
 {
-  bool needs_clean_up  = false;
-  const i_t initial_nz = infeasibility_indices.size();
-  for (i_t k = 0; k < initial_nz; ++k) {
-    const i_t j              = infeasibility_indices[k];
-    const f_t squared_infeas = squared_infeasibilities[j];
-    if (squared_infeas == 0.0) { needs_clean_up = true; }
-  }
-  work_estimate += 2 * initial_nz;
-
-  if (needs_clean_up) {
-    i_t num_cleans = 0;
-    work_estimate += 2 * infeasibility_indices.size();
-    for (size_t k = 0; k < infeasibility_indices.size(); ++k) {
-      const i_t j              = infeasibility_indices[k];
-      const f_t squared_infeas = squared_infeasibilities[j];
-      if (squared_infeas == 0.0) {
-        const i_t new_j          = infeasibility_indices.back();
-        infeasibility_indices[k] = new_j;
-        infeasibility_indices.pop_back();
-        if (squared_infeasibilities[new_j] == 0.0) {
-          k--;
-        }  // Decrement k so that we process the same index again
-        num_cleans++;
-      }
-    }
-    work_estimate += 4 * num_cleans;
+  i_t write   = 0;
+  const i_t n = infeasibility_indices.size();
+  for (i_t k = 0; k < n; ++k) {
+    const i_t j = infeasibility_indices[k];
+    if (squared_infeasibilities[j] != 0.0) { infeasibility_indices[write++] = j; }
   }
+  infeasibility_indices.resize(write);
+  work_estimate += 3 * n;
 }
 
 template <typename i_t, typename f_t>
@@ -1480,8 +1523,10 @@ i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t<i_t, f_t
                                         const basis_update_mpf_t<i_t, f_t>& ft,
                                         i_t basic_leaving_index,
                                         i_t entering_index,
+                                        i_t leaving_index,
                                         std::vector<f_t>& steepest_edge_norms)
 {
+#if 0
   sparse_vector_t<i_t, f_t> es_sparse(m, 1);
   es_sparse.i[0] = basic_leaving_index;
   es_sparse.x[0] = -1.0;
@@ -1489,6 +1534,16 @@ i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t<i_t, f_t
   ft.b_transpose_solve(es_sparse, delta_ys_sparse);
   steepest_edge_norms[entering_index] = delta_ys_sparse.norm2_squared();
 
+  if (std::abs(steepest_edge_norms[entering_index] - steepest_edge_norms[leaving_index]) > 1e-1) {
+  settings.log.printf("Steepest edge norm %e for entering %d. Leaving %d norm %e. Diff %e\n",
+                      steepest_edge_norms[entering_index],
+                      entering_index,
+                      leaving_index,
+                      steepest_edge_norms[leaving_index],
+                      std::abs(steepest_edge_norms[entering_index] - steepest_edge_norms[leaving_index]));
+  }
+#endif
+  steepest_edge_norms[entering_index] = steepest_edge_norms[leaving_index];
 #ifdef STEEPEST_EDGE_DEBUG
   settings.log.printf("Steepest edge norm %e for entering j %d at i %d\n",
                       steepest_edge_norms[entering_index],
@@ -2408,6 +2463,7 @@ class phase2_timers_t {
       se_norms_time(0),
       se_entering_time(0),
       lu_update_time(0),
+      lu_factorization_time(0),
       perturb_time(0),
       vector_time(0),
       objective_time(0),
@@ -2431,8 +2487,9 @@ class phase2_timers_t {
   {
     if (!record_time) { return; }
     const f_t total_time = bfrt_time + pricing_time + btran_time + ftran_time + flip_time +
-                           delta_z_time + lu_update_time + se_norms_time + se_entering_time +
-                           perturb_time + vector_time + objective_time + update_infeasibility_time;
+                           delta_z_time + lu_update_time + lu_factorization_time + se_norms_time +
+                           se_entering_time + perturb_time + vector_time + objective_time +
+                           update_infeasibility_time;
     // clang-format off
     settings.log.printf("BFRT time       %.2fs %4.1f%\n", bfrt_time, 100.0 * bfrt_time / total_time);
     settings.log.printf("Pricing time    %.2fs %4.1f%\n", pricing_time, 100.0 * pricing_time / total_time);
@@ -2441,6 +2498,7 @@ class phase2_timers_t {
     settings.log.printf("Flip time       %.2fs %4.1f%\n", flip_time, 100.0 * flip_time / total_time);
     settings.log.printf("Delta_z time    %.2fs %4.1f%\n", delta_z_time, 100.0 * delta_z_time / total_time);
     settings.log.printf("LU update time  %.2fs %4.1f%\n", lu_update_time, 100.0 * lu_update_time / total_time);
+    settings.log.printf("LU factor time  %.2fs %4.1f%\n", lu_factorization_time, 100.0 * lu_factorization_time / total_time);
     settings.log.printf("SE norms time   %.2fs %4.1f%\n", se_norms_time, 100.0 * se_norms_time / total_time);
     settings.log.printf("SE enter time   %.2fs %4.1f%\n", se_entering_time, 100.0 * se_entering_time / total_time);
     settings.log.printf("Perturb time    %.2fs %4.1f%\n", perturb_time, 100.0 * perturb_time / total_time);
@@ -2459,6 +2517,7 @@ class phase2_timers_t {
   f_t se_norms_time;
   f_t se_entering_time;
   f_t lu_update_time;
+  f_t lu_factorization_time;
   f_t perturb_time;
   f_t vector_time;
   f_t objective_time;
@@ -2560,6 +2619,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   phase2::bound_info(lp, settings);
   phase2_work_estimate += 2 * n;
 
+  f_t refactor_work = 0.0;
+  f_t solve_work    = 0.0;
+
   if (initialize_basis) {
     PHASE2_NVTX_RANGE("DualSimplex::init_basis");
     std::vector<i_t> superbasic_list;
@@ -2572,8 +2634,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     assert(superbasic_list.size() == 0);
     assert(nonbasic_list.size() == n - m);
 
-    i_t refactor_status = ft.refactor_basis(
+    f_t refactor_start_work = ft.work_estimate();
+    i_t refactor_status     = ft.refactor_basis(
       lp.A, settings, lp.lower, lp.upper, start_time, basic_list, nonbasic_list, vstatus);
+    refactor_work = ft.work_estimate() - refactor_start_work;
     if (refactor_status == CONCURRENT_HALT_RETURN) { return dual::status_t::CONCURRENT_LIMIT; }
     if (refactor_status == TIME_LIMIT_RETURN) { return dual::status_t::TIME_LIMIT; }
     if (refactor_status > 0) { return dual::status_t::NUMERICAL; }
@@ -2738,13 +2802,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
 #endif
 
+  csr_matrix_t<i_t, f_t> Arow(1, 1, 0);
+  lp.A.to_compressed_row(Arow);
+  phase2_work_estimate += 2 * lp.A.col_start[lp.A.n];
   if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
     return dual::status_t::CONCURRENT_LIMIT;
   }
 
-  csc_matrix_t<i_t, f_t> A_transpose(1, 1, 0);
-  lp.A.transpose(A_transpose);
-  phase2_work_estimate += 2 * lp.A.col_start[lp.A.n];
+  std::vector<i_t> nonbasic_end(m);
+  std::vector<i_t> row_mark;
+  row_mark.reserve(m);
+  compute_initial_nonbasic_end(basic_mark, Arow, nonbasic_end);
+  phase2_work_estimate += lp.A.col_start[lp.A.n];
 
   if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
     return dual::status_t::CONCURRENT_LIMIT;
@@ -2792,6 +2861,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                         0.0,
                         toc(start_time));
   }
+  i_t iterations_since_refactor = 0;
 
   while (iter < iter_limit) {
     PHASE2_NVTX_RANGE("DualSimplex::phase2_main_loop");
@@ -2914,11 +2984,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     timers.start_timer();
     delta_y_sparse.clear();
     UTsol_sparse.clear();
+    f_t btran_start_work = ft.work_estimate();
     {
       PHASE2_NVTX_RANGE("DualSimplex::btran");
       phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse);
     }
     timers.btran_time += timers.stop_timer();
+    solve_work += (ft.work_estimate() - btran_start_work);
+
     if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
       return dual::status_t::CONCURRENT_LIMIT;
     }
@@ -2953,11 +3026,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
       PHASE2_NVTX_RANGE("DualSimplex::delta_z");
       if (use_transpose) {
         sparse_delta_z++;
-        compute_delta_z(A_transpose,
+        compute_delta_z(Arow,
                         delta_y_sparse,
                         leaving_index,
                         direction,
-                        nonbasic_mark,
+                        nonbasic_end,
                         delta_z_mark,
                         delta_z_indices,
                         delta_z,
@@ -3295,6 +3368,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     utilde_sparse.clear();
     scaled_delta_xB_sparse.clear();
     rhs_sparse.from_csc_column(lp.A, entering_index);
+    f_t ftran_start_work = ft.work_estimate();
     {
       PHASE2_NVTX_RANGE("DualSimplex::ftran");
       if (phase2::compute_delta_x(lp,
@@ -3316,7 +3390,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
         return dual::status_t::NUMERICAL;
       }
     }
-
+    solve_work += (ft.work_estimate() - ftran_start_work);
     timers.ftran_time += timers.stop_timer();
     if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
       return dual::status_t::CONCURRENT_LIMIT;
@@ -3330,6 +3404,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #endif
 
     timers.start_timer();
+    f_t se_norms_start_work        = ft.work_estimate();
     const i_t steepest_edge_status = phase2::update_steepest_edge_norms(settings,
                                                                         basic_list,
                                                                         ft,
@@ -3351,6 +3426,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #endif
     assert(steepest_edge_status == 0);
     timers.se_norms_time += timers.stop_timer();
+    solve_work += (ft.work_estimate() - se_norms_start_work);
+
     if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
       return dual::status_t::CONCURRENT_LIMIT;
     }
@@ -3388,42 +3465,48 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #ifdef CHECK_BASIC_INFEASIBILITIES
     phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 2);
 #endif
-    phase2::update_primal_infeasibilities(lp,
-                                          settings,
-                                          basic_list,
-                                          x,
-                                          entering_index,
-                                          leaving_index,
-                                          delta_xB_0_sparse.i,
-                                          squared_infeasibilities,
-                                          infeasibility_indices,
-                                          primal_infeasibility_squared,
-                                          phase2_work_estimate);
+    bool needs_cleanup = phase2::update_primal_infeasibilities(lp,
+                                                               settings,
+                                                               basic_list,
+                                                               x,
+                                                               entering_index,
+                                                               leaving_index,
+                                                               delta_xB_0_sparse.i,
+                                                               squared_infeasibilities,
+                                                               infeasibility_indices,
+                                                               primal_infeasibility_squared,
+                                                               phase2_work_estimate);
     // Update primal infeasibilities due to changes in basic variables
     // from the leaving and entering variables
-    phase2::update_primal_infeasibilities(lp,
-                                          settings,
-                                          basic_list,
-                                          x,
-                                          entering_index,
-                                          leaving_index,
-                                          scaled_delta_xB_sparse.i,
-                                          squared_infeasibilities,
-                                          infeasibility_indices,
-                                          primal_infeasibility_squared,
-                                          phase2_work_estimate);
+    needs_cleanup |= phase2::update_primal_infeasibilities(lp,
+                                                           settings,
+                                                           basic_list,
+                                                           x,
+                                                           entering_index,
+                                                           leaving_index,
+                                                           scaled_delta_xB_sparse.i,
+                                                           squared_infeasibilities,
+                                                           infeasibility_indices,
+                                                           primal_infeasibility_squared,
+                                                           phase2_work_estimate);
     // Update the entering variable
-    phase2::update_single_primal_infeasibility(lp.lower,
-                                               lp.upper,
-                                               x,
-                                               settings.primal_tol,
-                                               squared_infeasibilities,
-                                               infeasibility_indices,
-                                               entering_index,
-                                               primal_infeasibility_squared);
-
-    phase2::clean_up_infeasibilities(
-      squared_infeasibilities, infeasibility_indices, phase2_work_estimate);
+    {
+      const f_t old_val = squared_infeasibilities[entering_index];
+      phase2::update_single_primal_infeasibility(lp.lower,
+                                                 lp.upper,
+                                                 x,
+                                                 settings.primal_tol,
+                                                 squared_infeasibilities,
+                                                 infeasibility_indices,
+                                                 entering_index,
+                                                 primal_infeasibility_squared);
+      needs_cleanup |= (old_val != 0.0 && squared_infeasibilities[entering_index] == 0.0);
+    }
+
+    if (needs_cleanup) {
+      phase2::clean_up_infeasibilities(
+        squared_infeasibilities, infeasibility_indices, phase2_work_estimate);
+    }
 
 #if CHECK_PRIMAL_INFEASIBILITIES
     phase2::check_primal_infeasibilities(
@@ -3455,6 +3538,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     basic_mark[leaving_index]              = -1;
     basic_mark[entering_index]             = basic_leaving_index;
 
+    update_Arow(
+      leaving_index, entering_index, lp.A, row_mark, nonbasic_end, Arow, phase2_work_estimate);
+
 #ifdef CHECK_BASIC_INFEASIBILITIES
     phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 5);
 #endif
@@ -3463,22 +3549,35 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     // Refactor or update the basis factorization
     {
       PHASE2_NVTX_RANGE("DualSimplex::basis_update");
-      bool should_refactor = ft.num_updates() > settings.refactor_frequency;
+      iterations_since_refactor++;
+      bool should_refactor =
+        (ft.num_updates() > 100 && solve_work > refactor_work) || (ft.num_updates() > 1000);
+      // settings.log.printf("Solve work %e refactor work %e iterations since refactor %d\n",
+      // solve_work, refactor_work, iterations_since_refactor);
       if (!should_refactor) {
         i_t recommend_refactor = ft.update(utilde_sparse, UTsol_sparse, basic_leaving_index);
 #ifdef CHECK_UPDATE
         phase2::check_update(lp, settings, ft, basic_list, basic_leaving_index);
 #endif
         should_refactor = recommend_refactor == 1;
+        timers.lu_update_time += timers.stop_timer();
+        timers.start_timer();
       }
 
 #ifdef CHECK_BASIC_INFEASIBILITIES
       phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 6);
 #endif
       if (should_refactor) {
+        settings.log.printf(
+          "Refactoring basis. Iteration %d solve work %e refactor work %e updates %d\n",
+          iter,
+          solve_work,
+          refactor_work,
+          ft.num_updates());
         PHASE2_NVTX_RANGE("DualSimplex::refactorization");
         num_refactors++;
-        bool should_recompute_x = true;  // Need for numerically difficult problems like cbs-cta
+        bool should_recompute_x = true;  // Needed for numerically difficult problems like cbs-cta
+        f_t refactor_start_work = ft.work_estimate();
         i_t refactor_status     = ft.refactor_basis(
           lp.A, settings, lp.lower, lp.upper, start_time, basic_list, nonbasic_list, vstatus);
         if (refactor_status == CONCURRENT_HALT_RETURN) { return dual::status_t::CONCURRENT_LIMIT; }
@@ -3511,9 +3610,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
           settings.log.printf("Successfully repaired basis. Iteration %d\n", iter);
         }
+        refactor_work = ft.work_estimate() - refactor_start_work;
 
         phase2::reset_basis_mark(
           basic_list, nonbasic_list, basic_mark, nonbasic_mark, phase2_work_estimate);
+        compute_initial_nonbasic_end(basic_mark, Arow, nonbasic_end);
         if (should_recompute_x) {
           std::vector<f_t> unperturbed_x(n);
           phase2_work_estimate += n;
@@ -3537,16 +3638,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                                          infeasibility_indices,
                                                          primal_infeasibility);
         phase2_work_estimate += 4 * m + 2 * n;
+        iterations_since_refactor = 0;
+        solve_work                = 0.0;
       }
 #ifdef CHECK_BASIC_INFEASIBILITIES
       phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7);
 #endif
     }
-    timers.lu_update_time += timers.stop_timer();
+    timers.lu_factorization_time += timers.stop_timer();
 
     timers.start_timer();
     phase2::compute_steepest_edge_norm_entering(
-      settings, m, ft, basic_leaving_index, entering_index, delta_y_steepest_edge);
+      settings, m, ft, basic_leaving_index, entering_index, leaving_index, delta_y_steepest_edge);
     timers.se_entering_time += timers.stop_timer();
 
 #ifdef STEEPEST_EDGE_DEBUG
@@ -3676,15 +3779,19 @@ template void compute_reduced_cost_update<int, double>(const lp_problem_t<int, d
                                                        std::vector<double>& delta_z,
                                                        double& work_estimate);
 
-template void compute_delta_z<int, double>(const csc_matrix_t<int, double>& A_transpose,
+template void compute_delta_z<int, double>(const csr_matrix_t<int, double>& Arow,
                                            const sparse_vector_t<int, double>& delta_y,
                                            int leaving_index,
                                            int direction,
-                                           const std::vector<int>& nonbasic_mark,
+                                           const std::vector<int>& nonbasic_end,
                                            std::vector<int>& delta_z_mark,
                                            std::vector<int>& delta_z_indices,
                                            std::vector<double>& delta_z,
                                            double& work_estimate);
+
+template void compute_initial_nonbasic_end<int, double>(const std::vector<int>& basic_mark,
+                                                        csr_matrix_t<int, double>& Arow,
+                                                        std::vector<int>& nonbasic_end);
 #endif
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/phase2.hpp b/cpp/src/dual_simplex/phase2.hpp
index 5db797449..266d57c19 100644
--- a/cpp/src/dual_simplex/phase2.hpp
+++ b/cpp/src/dual_simplex/phase2.hpp
@@ -94,14 +94,19 @@ void compute_reduced_cost_update(const lp_problem_t<i_t, f_t>& lp,
                                  f_t& work_estimate);
 
 template <typename i_t, typename f_t>
-void compute_delta_z(const csc_matrix_t<i_t, f_t>& A_transpose,
+void compute_delta_z(const csr_matrix_t<i_t, f_t>& Arow,
                      const sparse_vector_t<i_t, f_t>& delta_y,
                      i_t leaving_index,
                      i_t direction,
-                     const std::vector<i_t>& nonbasic_mark,
+                     const std::vector<i_t>& nonbasic_end,
                      std::vector<i_t>& delta_z_mark,
                      std::vector<i_t>& delta_z_indices,
                      std::vector<f_t>& delta_z,
                      f_t& work_estimate);
 
+template <typename i_t, typename f_t>
+void compute_initial_nonbasic_end(const std::vector<i_t>& basic_mark,
+                                  csr_matrix_t<i_t, f_t>& Arow,
+                                  std::vector<i_t>& nonbasic_end);
+
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index 5cb0185c8..3513ff7dc 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -33,6 +33,835 @@ struct element_t {
 };  // 24 bytes
 constexpr int kNone = -1;
 
+template <typename i_t, typename f_t>
+class nonzero_counts_t {
+ public:
+  nonzero_counts_t(const std::vector<i_t>& deg, i_t m)
+    : m_(m), work_estimate_(0), deg_(deg), counts_(m + 1), pos_(deg.size())
+  {
+    const i_t n = deg_.size();
+    for (i_t k = 0; k < n; ++k) {
+      assert(deg_[k] <= m && deg_[k] >= 0);
+      const i_t nz = deg_[k];
+      pos_[k]      = counts_[nz].size();
+      counts_[nz].push_back(k);
+    }
+    work_estimate_ += 4*n;
+  }
+
+  i_t get_count(i_t k) const
+  {
+    return deg_[k];
+  }
+
+  void update_count(i_t k, i_t new_nz)
+  {
+    const i_t old_nz = deg_[k];
+    update_count(k, old_nz, new_nz);
+  }
+
+  const std::vector<i_t>& get_elements_with_count(i_t nz) const
+  {
+    return counts_[nz];
+  }
+
+  // Remove k from its current bucket without re-inserting.
+  // Sets deg_[k] to -1 to mark it as removed.
+  void remove_from_count(i_t k)
+  {
+    const i_t old_nz = deg_[k];
+    const i_t p = pos_[k];
+    const i_t other = counts_[old_nz].back();
+    counts_[old_nz][p] = other;
+    pos_[other] = p;
+    counts_[old_nz].pop_back();
+    deg_[k] = -1;
+    work_estimate_ += 6;
+  }
+
+  f_t record_and_clear_work_estimate_() {
+    f_t tmp = work_estimate_;
+    work_estimate_ = 0;
+    return tmp;
+  }
+
+ private:
+
+  void update_count(i_t k, i_t old_nz, i_t new_nz)
+  {
+    const i_t p = pos_[k];
+    const i_t other = counts_[old_nz].back();
+    counts_[old_nz][p] = other;
+    pos_[other] = p;
+    counts_[old_nz].pop_back();
+    deg_[k] = new_nz;
+    pos_[k] = counts_[new_nz].size();
+    counts_[new_nz].push_back(k);
+    work_estimate_ += 11;
+  }
+
+  i_t m_;
+  f_t work_estimate_;
+  std::vector<i_t> deg_;
+  std::vector<std::vector<i_t>> counts_;
+  std::vector<i_t> pos_;
+};
+
+// Represents the sparse trailing matrix Atlide = A - l u^T of a sparse LU factorization
+// We need to be able to access the nonzeros in this matrix by both row and column.
+// Thus, we do not compress the storage.
+template <typename i_t, typename f_t>
+class trailing_matrix_t {
+ public:
+  trailing_matrix_t(const csc_matrix_t<i_t, f_t>& A,
+                    const std::vector<i_t>& column_list)
+    : m_(A.m),
+      n_(column_list.size()),
+      Bnz_(0),
+      work_estimate_(0),
+      col_start_(n_),
+      col_end_(n_),
+      col_max_(n_),
+      row_start_(m_),
+      row_end_(m_),
+      row_max_(m_),
+      max_in_column_(n_),
+      pivot_row_val_(n_, 0.0),
+      pivot_col_val_(m_, 0.0),
+      pivot_col_mark_(m_, 0),
+      row_mark_(m_, kNone),
+      col_mark_(n_, kNone),
+      col_counts_(compute_column_degree(A, column_list), m_),
+      row_counts_(compute_row_degree(A, column_list, Bnz_), n_),
+      unused_col_nz_(0),
+      unused_row_nz_(0),
+      col_hits_(0),
+      col_miss_(0),
+      row_hits_(0),
+      row_miss_(0),
+      col_realloc_hist_(std::max(m_, static_cast<i_t>(n_)) + 1, 0),
+      row_realloc_hist_(std::max(m_, static_cast<i_t>(n_)) + 1, 0)
+  {
+
+    work_estimate_ += 4*m_ + 2*n_ + col_realloc_hist_.size() + row_realloc_hist_.size();
+
+    // Allocate 2x initial size per column/row to reduce early relocations
+    i_t col_nz = 2 * Bnz_;
+    i_t row_nz = 2 * Bnz_;
+
+    c_i_.resize(col_nz);
+    c_x_.resize(col_nz);
+    r_j_.resize(row_nz);
+
+
+    i_t nz = 0;
+    for (i_t i = 0; i < m_; i++) {
+      row_start_[i] = nz;
+      row_end_[i] = nz;  // Temporary value used for initializing r_j_. Will be updated in loop
+      i_t row_space = 2 * row_counts_.get_count(i);
+      row_max_[i] = nz + row_space;
+      nz += row_space;
+    }
+    assert(nz == row_nz);
+    work_estimate_ += 4 * m_;
+
+    nz = 0;
+    for (size_t k = 0; k < column_list.size(); k++) {
+      const i_t j = column_list[k];
+      const i_t A_start = A.col_start[j];
+      const i_t A_end = A.col_start[j + 1];
+      const i_t len  = A_end - A_start;
+      i_t col_space = 2 * len;
+      col_max_[k] = nz + col_space;
+      col_start_[k] = nz;
+      col_end_[k] = nz + len;
+      for (i_t p = A_start; p < A_end; p++) {
+        const i_t row = A.i[p];
+        const f_t val = A.x[p];
+        c_i_[nz] = row;
+        c_x_[nz] = val;
+        nz++;
+        r_j_[row_end_[row]] = k;
+        row_end_[row]++;
+      }
+      nz += col_space - len; // Remaining capacity for this column
+    }
+    assert(nz == col_nz);
+    work_estimate_ += 7 * n_ + 7 * Bnz_;
+
+    for (i_t j = 0; j < n_; j++) {
+      f_t max_in_col = 0.0;
+      const i_t c_start = col_start_[j];
+      const i_t c_end = col_end_[j];
+      for (i_t p = c_start; p < c_end; p++) {
+        const f_t val = std::abs(c_x_[p]);
+        if (val > max_in_col) {
+          max_in_col = val;
+        }
+      }
+      max_in_column_[j] = max_in_col;
+    }
+    work_estimate_ += Bnz_ + 3*n_;
+  }
+
+  f_t record_and_clear_work_estimate_()
+  {
+    const f_t row_work_estimate = row_counts_.record_and_clear_work_estimate_();
+    const f_t col_work_estimate = col_counts_.record_and_clear_work_estimate_();
+    work_estimate_ += row_work_estimate + col_work_estimate;
+    f_t tmp        = work_estimate_;
+    work_estimate_ = 0;
+    return tmp;
+  }
+
+  i_t markowitz_search(f_t pivot_tol, f_t threshold_tol, i_t& pivot_i, i_t& pivot_j, f_t &pivot_val) {
+    f_t markowitz = static_cast<f_t>(m_) * static_cast<f_t>(n_); // Upper bound on markowitz criteria
+    i_t nz      = 1;
+    i_t nsearch            = 0;
+    constexpr bool verbose = false;
+    i_t nz_max             = std::min(m_, n_);
+    while (nz <= nz_max) {
+      i_t markowitz_lower_bound = (nz - 1) * (nz - 1);
+      // Search columns of length nz
+      i_t nsearch_start = nsearch;
+      for (const i_t j : col_counts_.get_elements_with_count(nz)) {
+        assert(col_counts_.get_count(j) == nz);
+        const f_t max_in_col = max_in_column_[j];
+        const i_t c_start = col_start_[j];
+        const i_t c_end = col_end_[j];
+        for (i_t p = c_start; p < c_end; p++) {
+          const i_t i = c_i_[p];
+          const f_t val = c_x_[p];
+          const i_t rdeg = row_counts_.get_count(i);
+          assert(rdeg >= 0);
+          const i_t Mij = (rdeg - 1) * (nz - 1);
+          if (Mij < markowitz && std::abs(val) >= threshold_tol * max_in_col &&
+              std::abs(val) >= pivot_tol) {
+            markowitz = Mij;
+            pivot_i   = i;
+            pivot_j   = j;
+            pivot_val = val;
+            if (markowitz <= markowitz_lower_bound) { break; }
+          }
+        }
+        work_estimate_ += 3 * (c_end - c_start);
+        nsearch++;
+        if (markowitz <= markowitz_lower_bound) { break; }
+      }
+     work_estimate_ += 4 * (nsearch - nsearch_start);
+      if (markowitz <= markowitz_lower_bound) { break; }
+
+      markowitz_lower_bound = (nz - 1) * nz;
+
+      // Search rows of length nz
+      assert(row_counts_.get_elements_with_count(nz).size() >= 0);
+      nsearch_start = nsearch;
+      for (const i_t i : row_counts_.get_elements_with_count(nz)) {
+        const i_t rdeg = row_counts_.get_count(i);
+        assert(rdeg == nz);
+        const i_t r_start = row_start_[i];
+        const i_t r_end = row_end_[i];
+        for (i_t p = r_start; p < r_end; p++) {
+          const i_t j = r_j_[p];
+          // Look up the value from the column copy of j
+          f_t val = 0;
+          const i_t c_start = col_start_[j];
+          const i_t c_end = col_end_[j];
+          for (i_t q = c_start; q < c_end; q++) {
+            if (c_i_[q] == i) { val = c_x_[q]; break; }
+          }
+          work_estimate_ += 2 * (c_end - c_start);
+          const f_t max_in_col = max_in_column_[j];
+          const i_t cdeg = col_counts_.get_count(j);
+          assert(cdeg >= 0);
+          const i_t Mij = (nz - 1) * (cdeg - 1);
+          if (Mij < markowitz && std::abs(val) >= threshold_tol * max_in_col &&
+              std::abs(val) >= pivot_tol) {
+            markowitz = Mij;
+            pivot_i   = i;
+            pivot_j   = j;
+            pivot_val = val;
+            if (markowitz <= markowitz_lower_bound) { break; }
+          }
+        }
+        work_estimate_ += 5 * (r_end - r_start);
+        nsearch++;
+        if (markowitz <= markowitz_lower_bound) { break; }
+      }
+      work_estimate_ += 4 * (nsearch - nsearch_start);
+      if (pivot_i != -1 && nz >= 2) { break; }
+      nz++;
+    }
+    if (nsearch > 10) {
+      if constexpr (verbose) { printf("nsearch %d\n", nsearch); }
+    }
+    return nsearch;
+  }
+
+
+  void update_for_pivot_removal(i_t pivot_i, i_t pivot_j)
+  {
+    // Iterate over the pivot row: decrement column degrees.
+    // Skip the pivot column itself — it is being eliminated, not just decremented.
+    const i_t r_start = row_start_[pivot_i];
+    const i_t r_end = row_end_[pivot_i];
+    for (i_t p = r_start; p < r_end; p++) {
+      const i_t j = r_j_[p];
+      const i_t cdeg = col_counts_.get_count(j);
+      if (j != pivot_j) {
+        col_counts_.update_count(j, cdeg - 1);
+      } else {
+        col_counts_.remove_from_count(j);
+      }
+    }
+    work_estimate_ += 2 * (r_end - r_start);
+
+    // Iterate over the pivot column: decrement row degrees.
+    // Skip the pivot row itself — it is being eliminated, not just decremented.
+    const i_t c_start = col_start_[pivot_j];
+    const i_t c_end = col_end_[pivot_j];
+    for (i_t p = c_start; p < c_end; p++) {
+      const i_t i = c_i_[p];
+      const i_t rdeg = row_counts_.get_count(i);
+      if (i != pivot_i) {
+        row_counts_.update_count(i, rdeg - 1);
+      } else {
+        row_counts_.remove_from_count(i);
+      }
+    }
+    work_estimate_ += 2 * (c_end - c_start);
+  }
+
+  void schur_complement(i_t pivot_i,
+                        i_t pivot_j,
+                        f_t drop_tol,
+                        f_t pivot_val)
+  {
+    // Step 1: Cache the pivot column into dense workspaces.
+    // pivot_col_val_[i] = l_i = a(i, pivot_j) / pivot_val  for each row i != pivot_i
+    // pivot_col_mark_[i] = 1 if row i is in the pivot column
+    // pivot_col_index_[] = sparse list of such row indices
+    i_t pivot_col_count = 0;
+    const i_t c_pivot_start = col_start_[pivot_j];
+    const i_t c_pivot_end = col_end_[pivot_j];
+    for (i_t p = c_pivot_start; p < c_pivot_end; p++) {
+      const i_t i = c_i_[p];
+      if (i == pivot_i) { continue; }
+      const f_t li = c_x_[p] / pivot_val;
+      pivot_col_val_[i] = li;
+      pivot_col_mark_[i] = 1;
+      pivot_col_index_.push_back(i);
+      pivot_col_count++;
+    }
+    work_estimate_ += 5 * (c_pivot_end - c_pivot_start);
+
+    // Step 2: For each column j in the pivot row, update existing entries and insert fill.
+    const i_t r_pivot_start = row_start_[pivot_i];
+    const i_t r_pivot_end = row_end_[pivot_i];
+    for (i_t p0 = r_pivot_start; p0 < r_pivot_end; p0++) {
+      const i_t j = r_j_[p0];
+      if (j == pivot_j) { continue; }
+      const f_t uj = pivot_row_val_[j];
+
+      // Step 2a: Scan column j, update existing entries, and count fill-in.
+      // For each entry (i, j) that also appears in the pivot column, update it.
+      // Simultaneously, unmark pivot_col_mark_[i] for matched entries, so that
+      // after the scan, the still-marked entries are the fill-ins.
+      i_t n_fillin = pivot_col_count;
+      i_t n_cancel = 0;
+      const i_t c_start = col_start_[j];
+      const i_t c_end = col_end_[j];
+      for (i_t q = c_start; q < c_end; q++) {
+        const i_t i = c_i_[q];
+        if (pivot_col_mark_[i]) {
+          pivot_col_mark_[i] = 0;
+          n_fillin--;
+          const f_t val = pivot_col_val_[i] * uj;
+
+          c_x_[q] -= val;
+          const f_t abs_updated = std::abs(c_x_[q]);
+          if (abs_updated > max_in_column_[j]) { max_in_column_[j] = abs_updated; }
+          if (abs_updated < drop_tol) {
+            c_x_[q] = 0;
+            n_cancel++;
+            // TODO: does max_in_column_ need to be updated in this case?
+          }
+        }
+      }
+      work_estimate_ += 2*(c_end - c_start) + 6*(pivot_col_count - n_fillin);
+
+
+      // Step 2b: Remove cancellations (entries that became zero).
+      if (n_cancel > 0) {
+        i_t new_end = col_start_[j];
+        for (i_t q = col_start_[j]; q < col_end_[j]; q++) {
+          if (c_x_[q] != 0) {
+            c_i_[new_end] = c_i_[q];
+            c_x_[new_end] = c_x_[q];
+            new_end++;
+          } else {
+            const i_t dead_row = c_i_[q];
+            // Remove this entry from the row copy as well
+            const i_t r_start = row_start_[dead_row];
+            for (i_t rp = r_start; rp < row_end_[dead_row]; rp++) {
+              if (r_j_[rp] == j) {
+                r_j_[rp] = r_j_[row_end_[dead_row] - 1];
+                row_end_[dead_row]--;
+                break;
+              }
+            }
+            work_estimate_ += 2*(row_end_[dead_row] - r_start) + 4;
+            // Update row degree
+            const i_t rdeg = row_counts_.get_count(dead_row);
+            row_counts_.update_count(dead_row, rdeg - 1);
+
+          }
+        }
+        work_estimate_ += 2*(new_end - col_start_[j]);
+        i_t old_count = col_end_[j] - col_start_[j];
+        col_end_[j] = new_end;
+        i_t new_count = new_end - col_start_[j];
+        // Update column degree for cancellations
+        if (new_count != old_count) {
+          col_counts_.update_count(j, new_count);
+        }
+      }
+
+
+      // Step 2c: Insert fill-in entries. We know exactly how many there are.
+      if (n_fillin > 0) {
+        // Ensure column j has enough space for all fill-ins at once.
+        // After this, col_start_[j] is stable — no further relocation needed.
+        ensure_col_space(j, n_fillin);
+
+        // Insert fill into column j and row copies.
+        for (i_t k = 0; k < pivot_col_count; k++) {
+          const i_t i = pivot_col_index_[k];
+          if (pivot_col_mark_[i]) {
+            const f_t val = pivot_col_val_[i] * uj;
+            const f_t abs_val = std::abs(val);
+            if (abs_val < drop_tol) {
+              // Skip this fill-in but still need to unmark
+              continue;
+            }
+
+            // Insert into column copy (space is guaranteed)
+            c_i_[col_end_[j]] = i;
+            c_x_[col_end_[j]] = -val;
+            col_end_[j]++;
+            if (abs_val > max_in_column_[j]) { max_in_column_[j] = abs_val; }
+
+            // Insert into row copy
+            ensure_row_space(i, 1);
+            r_j_[row_end_[i]] = j;
+            row_end_[i]++;
+
+            // Update row degree
+            const i_t rdeg = row_counts_.get_count(i);
+            row_counts_.update_count(i, rdeg + 1);
+            work_estimate_ += 10;
+          }
+        }
+      }
+
+      // Step 2d: Update column degree bucket once for this column.
+      {
+        i_t new_cdeg = col_end_[j] - col_start_[j];
+        if (new_cdeg != col_counts_.get_count(j)) {
+          col_counts_.update_count(j, new_cdeg);
+        }
+      }
+
+      // Step 2e: Reset all pivot column marks back to 1 for the next column.
+      // Some marks were cleared to 0 during the scan of column j (matched entries).
+      // We restore them by iterating the pivot column index list. So that we are
+      // prepared to process the next column.
+      for (i_t k = 0; k < pivot_col_count; k++) {
+        pivot_col_mark_[pivot_col_index_[k]] = 1;
+      }
+      work_estimate_ += 2*pivot_col_count;
+    }
+
+    // Step 3: Clear the pivot column workspaces.
+    for (i_t k = 0; k < pivot_col_count; k++) {
+      const i_t i = pivot_col_index_[k];
+      pivot_col_val_[i] = 0;
+      pivot_col_mark_[i] = 0;
+    }
+    work_estimate_ += 2*pivot_col_count;
+    pivot_col_index_.clear();
+  }
+
+  // Populate the dense pivot_row_val_ workspace by scanning column representation
+  // for each column j that appears in the pivot row.
+  // Must be called before extract_row() and schur_complement().
+  // Cleared by remove_pivot_row_and_column().
+  void cache_pivot_row(i_t pivot_i)
+  {
+    const i_t r_start = row_start_[pivot_i];
+    const i_t r_end = row_end_[pivot_i];
+    for (i_t p = r_start; p < r_end; p++) {
+      const i_t j = r_j_[p];
+      const i_t c_start = col_start_[j];
+      const i_t c_end = col_end_[j];
+      i_t q;
+      for (q = c_start; q < c_end; q++) {
+        if (c_i_[q] == pivot_i) {
+          pivot_row_val_[j] = c_x_[q];
+          break;
+        }
+      }
+      work_estimate_ += 2 *(q - c_start);
+    }
+    work_estimate_ += 3 * (r_end - r_start) + 2;
+  }
+
+  void remove_pivot_row_and_column(i_t pivot_i, i_t pivot_j)
+  {
+    // Iterate over the pivot row
+    const i_t r_pivot_start = row_start_[pivot_i];
+    const i_t r_pivot_end = row_end_[pivot_i];
+    for (i_t p = r_pivot_start; p < r_pivot_end; p++) {
+      const i_t j = r_j_[p];
+      // Clear the cached pivot row value for this column
+      pivot_row_val_[j] = 0;
+      // Remove pivot_i from each column j in the pivot row
+      f_t max_in_col = 0.0;
+
+      const i_t prev_col_end = col_end_[j];
+      for (i_t q = col_start_[j]; q < col_end_[j]; q++) {
+        const i_t i = c_i_[q];
+        if (i == pivot_i) {
+          // Swap with the last element in the column
+          i_t other_i = c_i_[col_end_[j] - 1];
+          f_t other_x = c_x_[col_end_[j] - 1];
+          c_i_[q] = other_i;
+          c_x_[q] = other_x;
+          // Update col_end_[j]
+          col_end_[j]--;
+          q--;
+          continue;
+        } else {
+          const f_t val = std::abs(c_x_[q]);
+          if (val > max_in_col) {
+            max_in_col = val;
+          }
+        }
+      }
+      work_estimate_ += 3*(prev_col_end - col_start_[j]) + 7;
+      max_in_column_[j] = max_in_col;
+    }
+    work_estimate_ += 4*(r_pivot_end - r_pivot_start);
+
+
+    // Iterate over the pivot column
+    const i_t c_start = col_start_[pivot_j];
+    const i_t c_end = col_end_[pivot_j];
+    for (i_t p = c_start; p < c_end; p++) {
+      const i_t i = c_i_[p];
+      // Remove pivot_j from each row i in the pivot column
+
+      i_t q;
+      for (q = row_start_[i]; q < row_end_[i]; q++) {
+        const i_t j = r_j_[q];
+        if (j == pivot_j) {
+          // Swap with the last element in the row
+          r_j_[q] = r_j_[row_end_[i] - 1];
+          // Update row_end_[i]
+          row_end_[i]--;
+          break;
+        }
+      }
+      work_estimate_ += 2*(q - row_start_[i]) + 4;
+    }
+    work_estimate_ += 4*(c_end - c_start);
+
+    // Mark pivot column and pivot row as empty so garbage collection skips them
+    col_end_[pivot_j] = col_start_[pivot_j];
+    row_end_[pivot_i] = row_start_[pivot_i];
+  }
+
+  void extract_row(i_t pivot_i, i_t pivot_j, csr_matrix_t<i_t, f_t>& Urow, i_t& Unz)
+  {
+    // U(k, :)
+    const i_t r_pivot_start = row_start_[pivot_i];
+    const i_t r_pivot_end = row_end_[pivot_i];
+    for (i_t p = r_pivot_start; p < r_pivot_end; p++) {
+      const i_t j = r_j_[p];
+      if (j != pivot_j) {
+        Urow.j.push_back(j);
+        Urow.x.push_back(pivot_row_val_[j]);
+        Unz++;
+      }
+    }
+    work_estimate_ += 3 * (r_pivot_end - r_pivot_start);
+  }
+
+  void extract_column(i_t pivot_i, i_t pivot_j, f_t pivot_val, csc_matrix_t<i_t, f_t>& L, i_t& Lnz)
+  {
+    // L(:, k)
+    const i_t c_pivot_start = col_start_[pivot_j];
+    const i_t c_pivot_end = col_end_[pivot_j];
+    for (i_t p = c_pivot_start; p < c_pivot_end; p++) {
+      const i_t i = c_i_[p];
+      if (i != pivot_i) {
+        L.i.push_back(i);
+        const f_t l_val = c_x_[p] / pivot_val;
+        L.x.push_back(l_val);
+        Lnz++;
+      }
+    }
+    work_estimate_ += 4 * (c_pivot_end - c_pivot_start);
+  }
+
+  void garbage_collect(f_t max_unused_fraction = 0.90)
+  {
+    if (unused_col_nz_ > max_unused_fraction * static_cast<f_t>(c_i_.size())) {
+      printf("Garbage collected column %e\n", unused_col_nz_ / static_cast<f_t>(c_i_.size()));
+      std::vector<i_t> new_c_i;
+      std::vector<f_t> new_c_x;
+      new_c_i.reserve(c_i_.size() - unused_col_nz_);
+      new_c_x.reserve(c_x_.size() - unused_col_nz_);
+      for (i_t j = 0; j < n_; j++) {
+        const i_t new_start = static_cast<i_t>(new_c_i.size());
+        const i_t c_start = col_start_[j];
+        const i_t c_end = col_end_[j];
+        const i_t col_size = c_end - c_start;
+        for (i_t p = c_start; p < c_end; p++) {
+          new_c_i.push_back(c_i_[p]);
+          new_c_x.push_back(c_x_[p]);
+        }
+        col_start_[j] = new_start;
+        col_end_[j] = static_cast<i_t>(new_c_i.size());
+        // Reserve space equal to current size (doubling strategy)
+        for (i_t s = 0; s < col_size; s++) {
+          new_c_i.push_back(kNone);
+          new_c_x.push_back(0.0);
+        }
+        work_estimate_ += 4*col_size;
+        col_max_[j] = static_cast<i_t>(new_c_i.size());
+      }
+      work_estimate_ += 6*n_;
+      c_i_ = std::move(new_c_i);
+      c_x_ = std::move(new_c_x);
+
+      unused_col_nz_ = 0;
+    }
+
+    if (unused_row_nz_ > max_unused_fraction * static_cast<f_t>(r_j_.size())) {
+      printf("Garbage collected row %e\n", unused_row_nz_ / static_cast<f_t>(r_j_.size()));
+      std::vector<i_t> new_r_j;
+      new_r_j.reserve(r_j_.size() - unused_row_nz_);
+      for (i_t i = 0; i < m_; i++) {
+        const i_t new_start = static_cast<i_t>(new_r_j.size());
+        const i_t r_start = row_start_[i];
+        const i_t r_end = row_end_[i];
+        const i_t row_size = r_end - r_start;
+        for (i_t p = r_start; p < r_end; p++) {
+          new_r_j.push_back(r_j_[p]);
+        }
+        row_start_[i] = new_start;
+        row_end_[i] = static_cast<i_t>(new_r_j.size());
+        // Reserve space equal to current size (doubling strategy)
+        for (i_t s = 0; s < row_size; s++) {
+          new_r_j.push_back(kNone);
+        }
+        row_max_[i] = static_cast<i_t>(new_r_j.size());
+        work_estimate_ += 2*row_size;
+      }
+      work_estimate_ += 6*m_;
+      r_j_ = std::move(new_r_j);
+      unused_row_nz_ = 0;
+    }
+  }
+
+  void print_stats()
+  {
+#if 0
+    printf("Column hits: %.1f%%, Column misses: %.1f%%, Row hits: %.1f%%, Row misses: %.1f%%\n",
+           100.0 * static_cast<f_t>(col_hits_) / static_cast<f_t>(col_hits_ + col_miss_),
+           100.0 * static_cast<f_t>(col_miss_) / static_cast<f_t>(col_hits_ + col_miss_),
+           100.0 * static_cast<f_t>(row_hits_) / static_cast<f_t>(row_hits_ + row_miss_),
+           100.0 * static_cast<f_t>(row_miss_) / static_cast<f_t>(row_hits_ + row_miss_));
+
+    printf("Column reallocation histogram (shortfall -> count):\n");
+    for (size_t k = 0; k < col_realloc_hist_.size(); k++) {
+      if (col_realloc_hist_[k] > 0) {
+        printf("  %4zu: %d\n", k, col_realloc_hist_[k]);
+      }
+    }
+
+    printf("Row reallocation histogram (shortfall -> count):\n");
+    for (size_t k = 0; k < row_realloc_hist_.size(); k++) {
+      if (row_realloc_hist_[k] > 0) {
+        printf("  %4zu: %d\n", k, row_realloc_hist_[k]);
+      }
+    }
+
+    f_t ci_mb = static_cast<f_t>(c_i_.size() * sizeof(i_t)) / (1024.0 * 1024.0);
+    f_t cx_mb = static_cast<f_t>(c_x_.size() * sizeof(f_t)) / (1024.0 * 1024.0);
+    f_t rj_mb = static_cast<f_t>(r_j_.size() * sizeof(i_t)) / (1024.0 * 1024.0);
+    printf("Memory: c_i_ = %.2f MB, c_x_ = %.2f MB, r_j_ = %.2f MB, total = %.2f MB\n",
+           ci_mb, cx_mb, rj_mb, ci_mb + cx_mb + rj_mb);
+#endif
+  }
+
+ private:
+
+  // Ensure column j has space for at least `needed` additional entries.
+  // If not, relocate the column to the end of c_i_/c_x_ with enough space.
+  // Returns true if the column was relocated (invalidating any cached positions).
+  bool ensure_col_space(i_t j, i_t needed)
+  {
+    if (col_end_[j] + needed <= col_max_[j]) {
+      col_hits_++;
+      return false;
+    }
+    col_miss_++;
+    i_t shortfall = needed - (col_max_[j] - col_end_[j]);
+    col_realloc_hist_[shortfall]++;
+    // Relocate column j to the end of c_i_/c_x_
+    const i_t c_start = col_start_[j];
+    const i_t c_end = col_end_[j];
+    i_t current_size = c_end - c_start;
+    unused_col_nz_ += current_size;
+    i_t new_start = c_i_.size();
+    for (i_t p = c_start; p < c_end; p++) {
+      c_i_.push_back(c_i_[p]);
+      c_x_.push_back(c_x_[p]);
+    }
+    work_estimate_ += 2*(c_end - c_start);
+    col_start_[j] = new_start;
+    col_end_[j] = c_i_.size();
+    // Reserve space using doubling strategy to reduce future relocations
+    i_t extra = std::max(current_size, needed);
+    for (i_t k = 0; k < extra; k++) {
+      c_i_.push_back(kNone);
+      c_x_.push_back(0.0);
+    }
+    work_estimate_ += 2*extra;
+    col_max_[j] = c_i_.size();
+    work_estimate_ += 10;
+    return true;
+  }
+
+  // Ensure row i has space for at least `needed` additional entries.
+  // If not, relocate the row to the end of r_j_ with enough space.
+  void ensure_row_space(i_t i, i_t needed)
+  {
+    if (row_end_[i] + needed <= row_max_[i]) {
+      row_hits_++;
+      return;
+    }
+    row_miss_++;
+    i_t shortfall = needed - (row_max_[i] - row_end_[i]);
+    row_realloc_hist_[shortfall]++;
+    // Relocate row i to the end of r_j_
+    const i_t r_start = row_start_[i];
+    const i_t r_end = row_end_[i];
+    i_t current_size = r_end - r_start;
+    unused_row_nz_ += current_size;
+    i_t new_start = r_j_.size();
+    for (i_t p = r_start; p < r_end; p++) {
+      r_j_.push_back(r_j_[p]);
+    }
+    work_estimate_ += (r_end - r_start);
+    row_start_[i] = new_start;
+    row_end_[i] = r_j_.size();
+    // Reserve space using doubling strategy to reduce future relocations
+    i_t extra = std::max(current_size, needed);
+    for (i_t k = 0; k < extra; k++) {
+      r_j_.push_back(kNone);
+    }
+    work_estimate_ += extra;
+    row_max_[i] = r_j_.size();
+    work_estimate_ += 9;
+  }
+
+  std::vector<i_t> compute_column_degree(const csc_matrix_t<i_t, f_t>& A, const std::vector<i_t>& column_list)
+  {
+    const i_t n = column_list.size();
+    std::vector<i_t> Cdegree(n);
+    for (i_t k = 0; k < n; k++) {
+      const i_t j         = column_list[k];
+      const i_t A_start = A.col_start[j];
+      const i_t A_end   = A.col_start[j + 1];
+      Cdegree[k]          = A_end - A_start;
+    }
+    work_estimate_ += 4 * n;
+    return Cdegree;
+  }
+
+  std::vector<i_t> compute_row_degree(const csc_matrix_t<i_t, f_t>& A, const std::vector<i_t>& column_list, i_t& Bnz)
+  {
+    std::vector<i_t> Rdegree(A.m, 0);
+    Bnz = 0;
+    const i_t n = column_list.size();
+    for (i_t k = 0; k < n; k++) {
+      const i_t j = column_list[k];
+      const i_t col_start = A.col_start[j];
+      const i_t col_end = A.col_start[j + 1];
+      for (i_t p = col_start; p < col_end; ++p) {
+        Rdegree[A.i[p]]++;
+        Bnz++;
+      }
+    }
+    work_estimate_ += 3 * n + 2 * Bnz;
+    return Rdegree;
+  }
+
+  i_t m_;
+  i_t n_;
+  i_t Bnz_;
+  f_t work_estimate_;
+
+  // The representation of the matrix by column
+  std::vector<i_t> col_start_;
+  std::vector<i_t> col_end_;
+  std::vector<i_t> col_max_;
+
+  std::vector<i_t> c_i_;   // row indices (indexed by col_start_[j] to col_end_[j])
+  std::vector<f_t> c_x_; // coefficients (indexed by col_start_[j] to col_end_[j])
+
+
+  // The representation of the matrix by row (index only, no values)
+  std::vector<i_t> row_start_;
+  std::vector<i_t> row_end_;
+  std::vector<i_t> row_max_;
+
+  std::vector<i_t> r_j_;   // column indices (indexed by row_start_[i] to row_end_[i])
+
+
+
+  std::vector<f_t> max_in_column_;  // max_in_column_[j] is absolute value of the maximum coefficient in column j
+
+  std::vector<f_t> pivot_row_val_;  // dense workspace of size n_; caches pivot row values
+
+  std::vector<f_t> pivot_col_val_;   // dense workspace of size m_; caches L multipliers for pivot column
+  std::vector<char> pivot_col_mark_; // dense workspace of size m_; 1 if row i is in the pivot column
+  std::vector<i_t> pivot_col_index_; // sparse list of row indices in the pivot column (excl. pivot_i)
+
+  std::vector<i_t> row_mark_;
+  std::vector<i_t> col_mark_;
+
+
+  nonzero_counts_t<i_t, f_t> col_counts_;
+  nonzero_counts_t<i_t, f_t> row_counts_;
+
+
+  i_t unused_col_nz_;
+  i_t unused_row_nz_;
+
+
+  i_t col_hits_;
+  i_t col_miss_;
+  i_t row_hits_;
+  i_t row_miss_;
+
+  std::vector<i_t> col_realloc_hist_;  // col_realloc_hist_[k] = number of column relocations with shortfall k
+  std::vector<i_t> row_realloc_hist_;  // row_realloc_hist_[k] = number of row relocations with shortfall k
+};
+
 template <typename i_t, typename f_t>
 i_t initialize_degree_data(const csc_matrix_t<i_t, f_t>& A,
                            const std::vector<i_t>& column_list,
@@ -641,6 +1470,206 @@ void remove_pivot_col(i_t pivot_i,
 }
 
 }  // namespace
+template <typename i_t, typename f_t>
+i_t right_looking_lu2(const csc_matrix_t<i_t, f_t>& A,
+                     const simplex_solver_settings_t<i_t, f_t>& settings,
+                     f_t tol,
+                     const std::vector<i_t>& column_list,
+                     f_t start_time,
+                     std::vector<i_t>& q,
+                     csc_matrix_t<i_t, f_t>& L,
+                     csc_matrix_t<i_t, f_t>& U,
+                     std::vector<i_t>& pinv,
+                     f_t& work_estimate)
+{
+  raft::common::nvtx::range scope("LU::right_looking_lu");
+  const i_t n = column_list.size();
+  const i_t m = A.m;
+
+  assert(A.m == n);
+  assert(L.n == n);
+  assert(L.m == n);
+  assert(U.n == n);
+  assert(U.m == n);
+  assert(q.size() == n);
+  assert(pinv.size() == n);
+
+
+  trailing_matrix_t<i_t, f_t> trailing_matrix(A, column_list);
+
+  csr_matrix_t<i_t, f_t> Urow(n, n, 0);  // We will store U by rows in Urow during the factorization
+                                         // and translate back to U at the end
+  Urow.n = Urow.m = n;
+  Urow.row_start.resize(n + 1, -1);
+  i_t Unz = 0;
+  work_estimate += 2 * n;
+
+  i_t Lnz = 0;
+  L.x.clear();
+  L.i.clear();
+
+  std::fill(q.begin(), q.end(), -1);
+  std::fill(pinv.begin(), pinv.end(), -1);
+  std::vector<i_t> qinv(n);
+  std::fill(qinv.begin(), qinv.end(), -1);
+  work_estimate += 4 * n;
+
+  work_estimate += trailing_matrix.record_and_clear_work_estimate_();
+
+  i_t pivots = 0;
+  for (i_t k = 0; k < n; ++k) {
+    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      return CONCURRENT_HALT_RETURN;
+    }
+    if (toc(start_time) > settings.time_limit) { return TIME_LIMIT_RETURN; }
+    // Find pivot that satisfies
+    // abs(pivot) >= abstol,
+    // abs(pivot) >= threshold_tol * max abs[pivot column]
+    i_t pivot_i             = -1;
+    i_t pivot_j             = -1;
+    f_t pivot_val           = std::numeric_limits<f_t>::quiet_NaN();
+    constexpr f_t pivot_tol = 1e-11;
+    const f_t drop_tol      = tol == 1.0 ? 0.0 : 1e-13;
+    const f_t threshold_tol = tol;
+
+    trailing_matrix.markowitz_search(pivot_tol, threshold_tol, pivot_i, pivot_j, pivot_val);
+
+    if (pivot_i == -1 || pivot_j == -1) { break; }
+    assert(pivot_i != -1 && pivot_j != -1);
+
+    // Pivot
+    pinv[pivot_i]       = k;  // pivot_i is the kth pivot row
+    q[k]                = pivot_j;
+    qinv[pivot_j]       = k;
+    assert(std::abs(pivot_val) >= pivot_tol);
+    pivots++;
+
+    // Cache pivot row values from column copies into dense workspace
+    trailing_matrix.cache_pivot_row(pivot_i);
+
+    // U <- [U; u^T]
+    Urow.row_start[k] = Unz;
+    // U(k, pivot_j) = pivot_val
+    Urow.j.push_back(pivot_j);
+    Urow.x.push_back(pivot_val);
+    Unz++;
+    // U(k, :)
+    trailing_matrix.extract_row(pivot_i, pivot_j, Urow, Unz);
+    work_estimate += 4 * (Unz - Urow.row_start[k]);
+
+    // L <- [L l]
+    L.col_start[k] = Lnz;
+    // L(pivot_i, k) = 1
+    L.i.push_back(pivot_i);
+    L.x.push_back(1.0);
+    Lnz++;
+
+    // L(:, k)
+    trailing_matrix.extract_column(pivot_i, pivot_j, pivot_val, L, Lnz);
+    work_estimate += 4 * (Lnz - L.col_start[k]);
+
+
+    trailing_matrix.update_for_pivot_removal(pivot_i, pivot_j);
+
+    // A22 <- A22 - l u^T
+    trailing_matrix.schur_complement(pivot_i, pivot_j, drop_tol, pivot_val);
+
+    trailing_matrix.remove_pivot_row_and_column(pivot_i, pivot_j);
+
+    trailing_matrix.garbage_collect();
+
+    work_estimate += trailing_matrix.record_and_clear_work_estimate_();
+
+
+#ifdef CHECK_MAX_IN_COLUMN
+    // Check that maximum in column is maintained
+
+#endif
+
+
+
+  }
+
+  trailing_matrix.print_stats();
+
+  // Check for rank deficiency
+  if (pivots < n) {
+    // Complete the permutation pinv
+    i_t start = pivots;
+    for (i_t i = 0; i < m; ++i) {
+      if (pinv[i] == -1) { pinv[i] = start++; }
+    }
+    work_estimate += m;
+
+    // Finalize the permutation q. Do this by first completing the inverse permutation qinv.
+    // Then invert qinv to get the final permutation q.
+    start = pivots;
+    for (i_t j = 0; j < n; ++j) {
+      if (qinv[j] == -1) { qinv[j] = start++; }
+    }
+    work_estimate += n;
+    inverse_permutation(qinv, q);
+    work_estimate += 2 * n;
+
+    return pivots;
+  }
+
+  // Finalize L and Urow
+  L.col_start[n]    = Lnz;
+  Urow.row_start[n] = Unz;
+
+  // Fix row inidices of L for final pinv
+  for (i_t p = 0; p < Lnz; ++p) {
+    L.i[p] = pinv[L.i[p]];
+  }
+  work_estimate += 3 * Lnz;
+
+#ifdef CHECK_LOWER_TRIANGULAR
+  for (i_t j = 0; j < n; ++j) {
+    const i_t col_start = L.col_start[j];
+    const i_t col_end   = L.col_start[j + 1];
+    for (i_t p = col_start; p < col_end; ++p) {
+      const i_t i = L.i[p];
+      if (i < j) { printf("Found L(%d, %d) not lower triangular!\n", i, j); }
+      assert(i >= j);
+    }
+  }
+#endif
+
+  csc_matrix_t<i_t, f_t> U_unpermuted(n, n, 1);
+  work_estimate += n;
+  Urow.to_compressed_col(
+    U_unpermuted);  // Convert Urow to U stored in compressed sparse column format
+  work_estimate += n + Unz;
+  std::vector<i_t> row_perm(n);
+  work_estimate += n;
+  inverse_permutation(pinv, row_perm);
+  work_estimate += 2 * n;
+
+  std::vector<i_t> identity(n);
+  for (i_t k = 0; k < n; k++) {
+    identity[k] = k;
+  }
+  work_estimate += 2 * n;
+
+  U_unpermuted.permute_rows_and_cols(identity, q, U);
+  work_estimate += 3 * U.n + 5 * Unz;
+
+#ifdef CHECK_UPPER_TRIANGULAR
+  for (i_t k = 0; k < n; ++k) {
+    const i_t j         = k;
+    const i_t col_start = U.col_start[j];
+    const i_t col_end   = U.col_start[j + 1];
+    for (i_t p = col_start; p < col_end; ++p) {
+      const i_t i = U.i[p];
+      if (i > j) { printf("Found U(%d, %d) not upper triangluar\n", i, j); }
+      assert(i <= j);
+    }
+  }
+#endif
+
+  return n;
+}
 
 template <typename i_t, typename f_t>
 i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
@@ -1298,6 +2327,17 @@ template int right_looking_lu<int, double>(const csc_matrix_t<int, double>& A,
                                            std::vector<int>& pinv,
                                            double& work_estimate);
 
+template int right_looking_lu2<int, double>(const csc_matrix_t<int, double>& A,
+                                            const simplex_solver_settings_t<int, double>& settings,
+                                            double tol,
+                                            const std::vector<int>& column_list,
+                                            double start_time,
+                                            std::vector<int>& q,
+                                            csc_matrix_t<int, double>& L,
+                                            csc_matrix_t<int, double>& U,
+                                            std::vector<int>& pinv,
+                                            double& work_estimate);
+
 template int right_looking_lu_row_permutation_only<int, double>(
   const csc_matrix_t<int, double>& A,
   const simplex_solver_settings_t<int, double>& settings,
diff --git a/cpp/src/dual_simplex/right_looking_lu.hpp b/cpp/src/dual_simplex/right_looking_lu.hpp
index 5f0bf570b..66f3747d6 100644
--- a/cpp/src/dual_simplex/right_looking_lu.hpp
+++ b/cpp/src/dual_simplex/right_looking_lu.hpp
@@ -26,6 +26,18 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
                      std::vector<i_t>& pinv,
                      f_t& work_estimate);
 
+template <typename i_t, typename f_t>
+i_t right_looking_lu2(const csc_matrix_t<i_t, f_t>& A,
+                      const simplex_solver_settings_t<i_t, f_t>& settings,
+                      f_t tol,
+                      const std::vector<i_t>& column_list,
+                      f_t start_time,
+                      std::vector<i_t>& q,
+                      csc_matrix_t<i_t, f_t>& L,
+                      csc_matrix_t<i_t, f_t>& U,
+                      std::vector<i_t>& pinv,
+                      f_t& work_estimate);
+
 template <typename i_t, typename f_t>
 i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
                                           const simplex_solver_settings_t<i_t, f_t>& settings,
diff --git a/cpp/src/pdlp/cpu_optimization_problem.cpp b/cpp/src/pdlp/cpu_optimization_problem.cpp
index 406b0b654..6197260f0 100644
--- a/cpp/src/pdlp/cpu_optimization_problem.cpp
+++ b/cpp/src/pdlp/cpu_optimization_problem.cpp
@@ -724,7 +724,11 @@ void cpu_optimization_problem_t<i_t, f_t>::write_to_mps(const std::string& mps_f
     for (size_t i = 0; i < var_types_char.size(); ++i) {
       var_types_char[i] = (variable_types_[i] == var_t::INTEGER) ? 'I' : 'C';
     }
-
+  } else if (get_n_variables() > 0) {
+    // Variable types not set (e.g. pure LP); default to all continuous
+    var_types_char.assign(get_n_variables(), 'C');
+  }
+  if (!var_types_char.empty()) {
     data_model_view.set_variable_types(var_types_char.data(), var_types_char.size());
   }
 
diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu
index 87ff9dab0..8ed630c00 100644
--- a/cpp/src/pdlp/optimization_problem.cu
+++ b/cpp/src/pdlp/optimization_problem.cu
@@ -798,11 +798,15 @@ void optimization_problem_t<i_t, f_t>::write_to_mps(const std::string& mps_file_
   std::vector<char> variable_types;
   if (get_n_variables() != 0) {
     auto enum_variable_types = cuopt::host_copy(get_variable_types(), stream);
-    variable_types.resize(enum_variable_types.size());
-
-    // Convert enum types to char types
-    for (size_t i = 0; i < variable_types.size(); ++i) {
-      variable_types[i] = (enum_variable_types[i] == var_t::INTEGER) ? 'I' : 'C';
+    if (enum_variable_types.empty()) {
+      // Variable types not set (e.g. pure LP); default to all continuous
+      variable_types.assign(get_n_variables(), 'C');
+    } else {
+      variable_types.resize(enum_variable_types.size());
+      // Convert enum types to char types
+      for (size_t i = 0; i < variable_types.size(); ++i) {
+        variable_types[i] = (enum_variable_types[i] == var_t::INTEGER) ? 'I' : 'C';
+      }
     }
 
     data_model_view.set_variable_types(variable_types.data(), variable_types.size());
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index f0345368a..d2c5e3074 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -1422,8 +1422,18 @@ optimization_problem_solution_t<i_t, f_t> solve_lp(
 
     // handle default presolve
     if (settings.presolver == presolver_t::Default) {
-      settings.presolver = presolver_t::PSLP;
-      CUOPT_LOG_INFO("Using PSLP presolver");
+      // Skip presolve for small problems where the fixed overhead (~20-30ms)
+      // exceeds the simplex solve time. Based on Netlib benchmarks, problems
+      // with fewer than 8000 nonzeros never benefit from PSLP presolve.
+      constexpr i_t presolve_nnz_threshold = 8000;
+      if (op_problem.get_nnz() >= presolve_nnz_threshold) {
+        settings.presolver = presolver_t::PSLP;
+        CUOPT_LOG_INFO("Using PSLP presolver");
+      } else {
+        settings.presolver = presolver_t::None;
+        CUOPT_LOG_INFO("Skipping presolve for small problem (nnz=%d < %d)",
+                       op_problem.get_nnz(), presolve_nnz_threshold);
+      }
     }
 
     [[maybe_unused]] double presolve_time = 0.0;