NVIDIA
diff --git a/‎benchmarks/linear_programming/cuopt/run_pdlp.cu‎
Lines changed: 14 additions & 6 deletions b/‎benchmarks/linear_programming/cuopt/run_pdlp.cu‎
Lines changed: 14 additions & 6 deletions
diff --git a/‎cpp/src/branch_and_bound/pseudo_costs.cpp‎
Lines changed: 3 additions & 2 deletions b/‎cpp/src/branch_and_bound/pseudo_costs.cpp‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎cpp/src/pdlp/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎cpp/src/pdlp/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu‎
Lines changed: 1 addition & 1 deletion b/‎cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/src/pdlp/pdhg.cu‎
Lines changed: 38 additions & 1 deletion b/‎cpp/src/pdlp/pdhg.cu‎
Lines changed: 38 additions & 1 deletion
diff --git a/‎cpp/src/pdlp/pdlp.cu‎
Lines changed: 81 additions & 24 deletions b/‎cpp/src/pdlp/pdlp.cu‎
Lines changed: 81 additions & 24 deletions
diff --git a/‎cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu‎
Lines changed: 1 addition & 1 deletion b/‎cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu‎
Lines changed: 1 addition & 1 deletion
@@ -70,15 +70,23 @@ static void parse_arguments(argparse::ArgumentParser& program)
       "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver "
       "modes.");
 
-  program.add_argument("--presolve")
-    .help("enable/disable presolve (default: true for MIP problems, false for LP problems)")
-    .default_value(0)
-    .scan<'i', int>()
-    .choices(0, 1);
+  program.add_argument("--presolver")
+    .help("Presolver to use. Possible values: None, Papilo, PSLP, Default")
+    .default_value("Default")
+    .choices("None", "Papilo", "PSLP", "Default");
 
   program.add_argument("--solution-path").help("Path where solution file will be generated");
 }
 
+static cuopt::linear_programming::presolver_t string_to_presolver(const std::string& presolver)
+{
+  if (presolver == "None") return cuopt::linear_programming::presolver_t::None;
+  if (presolver == "Papilo") return cuopt::linear_programming::presolver_t::Papilo;
+  if (presolver == "PSLP") return cuopt::linear_programming::presolver_t::PSLP;
+  if (presolver == "Default") return cuopt::linear_programming::presolver_t::Default;
+  return cuopt::linear_programming::presolver_t::Default;
+}
+
 static cuopt::linear_programming::pdlp_solver_mode_t string_to_pdlp_solver_mode(
   const std::string& mode)
 {
@@ -107,7 +115,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t<int, double> create_sol
     string_to_pdlp_solver_mode(program.get<std::string>("--pdlp-solver-mode"));
   settings.method = static_cast<cuopt::linear_programming::method_t>(program.get<int>("--method"));
   settings.crossover = program.get<int>("--crossover");
-  settings.presolve  = program.get<int>("--presolve");
+  settings.presolver = string_to_presolver(program.get<std::string>("--presolver"));
 
   return settings;
 }
 
@@ -344,8 +344,9 @@ void strong_branching(const user_problem_t<i_t, f_t>& original_problem,
     if (batch_remaining_time <= 0.0) { return; }
     pdlp_solver_settings_t<i_t, f_t> pdlp_settings;
     pdlp_settings.time_limit = batch_remaining_time;
-    const auto solutions     = batch_pdlp_solve(
-      original_problem.handle_ptr, mps_model, fractional, fraction_values, pdlp_settings);
+    const raft::handle_t batch_pdlp_handle;
+    const auto solutions =
+      batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings);
     f_t batch_pdlp_strong_branching_time = toc(start_batch);
 
     // Find max iteration on how many are done accross the batch
 
@@ -24,6 +24,7 @@ set(LP_CORE_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu
 )
 
 # C and Python adapter files
 
@@ -545,7 +545,7 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::scale_problem()
 #ifdef CUPDLP_DEBUG_MODE
   print("constraint_lower_bound", op_problem_scaled_.constraint_lower_bounds);
   print("constraint_upper_bound", op_problem_scaled_.constraint_upper_bounds);
-  std::vector<f_t2> variable_bounds = host_copy(op_problem_scaled_.variable_bounds);
+  std::vector<f_t2> variable_bounds = host_copy(op_problem_scaled_.variable_bounds, stream_view_);
   std::vector<f_t> lower_bounds;
   std::vector<f_t> upper_bounds;
   for (const auto& variable_bound : variable_bounds) {
 
@@ -567,6 +567,12 @@ struct primal_reflected_major_projection_bulk_op {
     const f_t obj_coef   = objective_coefficients[var_idx];
     const f_t aty_val    = current_AtY[idx];
 
+    cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection");
+    cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection");
+    cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0");
+    cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_major_projection");
+    cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_major_projection");
+
     const f_t next = primal_val - step_size * (obj_coef - aty_val);
 
     const f_t2 bounds = variable_bounds[var_idx];
@@ -576,6 +582,9 @@ struct primal_reflected_major_projection_bulk_op {
     potential_next_primal[idx] = next_clamped;
     dual_slack[idx]            = (next_clamped - next) / step_size;
     reflected_primal[idx]      = f_t(2.0) * next_clamped - primal_val;
+
+    cuopt_assert(!isnan(reflected_primal[idx]),
+                 "reflected_primal is NaN after primal_reflected_major_projection");
   }
 };
 
@@ -599,6 +608,12 @@ struct dual_reflected_major_projection_bulk_op {
     const f_t current_dual = dual_solution[idx];
     const f_t Ax           = dual_gradient[idx];
 
+    cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_major_projection");
+    cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_major_projection");
+    cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0");
+    cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection");
+    cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection");
+
     const f_t tmp = current_dual / step_size - Ax;
     const f_t tmp_proj =
       cuda::std::max<f_t>(-constraint_upper_bounds[constraint_idx],
@@ -607,6 +622,9 @@ struct dual_reflected_major_projection_bulk_op {
 
     potential_next_dual[idx] = next_dual;
     reflected_dual[idx]      = f_t(2.0) * next_dual - current_dual;
+
+    cuopt_assert(!isnan(reflected_dual[idx]),
+                 "reflected_dual is NaN after dual_reflected_major_projection");
   }
 };
 
@@ -631,12 +649,21 @@ struct primal_reflected_projection_bulk_op {
     const f_t obj_coef   = objective_coefficients[var_idx];
     const f_t aty_val    = current_AtY[idx];
 
+    cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection");
+    cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection");
+    cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection");
+    cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_projection");
+    cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0");
+
     f_t reflected = primal_val - step_size * (obj_coef - aty_val);
 
     const f_t2 bounds = variable_bounds[var_idx];
     reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds));
 
     reflected_primal[idx] = f_t(2.0) * reflected - primal_val;
+
+    cuopt_assert(!isnan(reflected_primal[idx]),
+                 "reflected_primal is NaN after primal_reflected_projection");
   }
 };
 
@@ -659,13 +686,23 @@ struct dual_reflected_projection_bulk_op {
 
     const f_t step_size    = dual_step_size[batch_idx];
     const f_t current_dual = dual_solution[idx];
-    const f_t tmp          = current_dual / step_size - dual_gradient[idx];
+
+    cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection");
+    cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection");
+    cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection");
+    cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection");
+    cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0");
+
+    const f_t tmp = current_dual / step_size - dual_gradient[idx];
     const f_t tmp_proj =
       cuda::std::max<f_t>(-constraint_upper_bounds[constraint_idx],
                           cuda::std::min<f_t>(tmp, -constraint_lower_bounds[constraint_idx]));
     const f_t next_dual = (tmp - tmp_proj) * step_size;
 
     reflected_dual[idx] = f_t(2.0) * next_dual - current_dual;
+
+    cuopt_assert(!isnan(reflected_dual[idx]),
+                 "reflected_dual is NaN after dual_reflected_projection");
   }
 };
 
 
@@ -35,7 +35,9 @@
 
 #include <thrust/count.h>
 #include <thrust/extrema.h>
+#include <thrust/logical.h>
 
+#include <cmath>
 #include <optional>
 #include <unordered_set>
 
@@ -1186,24 +1188,55 @@ static void compute_stats(const rmm::device_uvector<f_t>& vec,
                           f_t& avg)
 {
   auto abs_op      = [] __host__ __device__(f_t x) { return abs(x); };
-  auto min_nonzero = [] __host__ __device__(f_t x) {
-    return x == 0 ? std::numeric_limits<f_t>::max() : abs(x);
-  };
-
-  smallest = thrust::transform_reduce(rmm::exec_policy(vec.stream()),
-                                      vec.begin(),
-                                      vec.end(),
-                                      min_nonzero,
-                                      std::numeric_limits<f_t>::max(),
-                                      thrust::minimum<f_t>());
-
-  largest = thrust::transform_reduce(
-    rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::maximum<f_t>());
-
-  f_t sum = thrust::transform_reduce(
-    rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::plus<f_t>());
-
-  avg = sum / vec.size();
+  auto min_nonzero = [] __host__ __device__(f_t x)
+    -> f_t { return x == 0 ? std::numeric_limits<f_t>::max() : abs(x); };
+
+  cuopt_assert(vec.size() > 0, "Vector must not be empty");
+
+  auto stream = vec.stream();
+  size_t n    = vec.size();
+
+  rmm::device_scalar<f_t> d_smallest(stream);
+  rmm::device_scalar<f_t> d_largest(stream);
+  rmm::device_scalar<f_t> d_sum(stream);
+
+  auto min_nz_iter = thrust::make_transform_iterator(vec.cbegin(), min_nonzero);
+  auto abs_iter    = thrust::make_transform_iterator(vec.cbegin(), abs_op);
+
+  void* d_temp   = nullptr;
+  size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0;
+  RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(d_temp,
+                                          bytes_1,
+                                          min_nz_iter,
+                                          d_smallest.data(),
+                                          n,
+                                          cuda::minimum<>{},
+                                          std::numeric_limits<f_t>::max(),
+                                          stream));
+  RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(
+    d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream));
+  RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(
+    d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream));
+
+  size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3});
+  rmm::device_buffer temp_buf(max_bytes, stream);
+
+  RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(temp_buf.data(),
+                                          bytes_1,
+                                          min_nz_iter,
+                                          d_smallest.data(),
+                                          n,
+                                          cuda::minimum<>{},
+                                          std::numeric_limits<f_t>::max(),
+                                          stream));
+  RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(
+    temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream));
+  RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(
+    temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream));
+
+  smallest = d_smallest.value(stream);
+  largest  = d_largest.value(stream);
+  avg      = d_sum.value(stream) / vec.size();
 };
 
 template <typename f_t>
@@ -1406,11 +1439,25 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal,
                                  const f_t interaction,
                                  f_t* fixed_point_error)
 {
+  cuopt_assert(!isnan(norm_squared_delta_primal), "norm_squared_delta_primal must not be NaN");
+  cuopt_assert(!isnan(norm_squared_delta_dual), "norm_squared_delta_dual must not be NaN");
+  cuopt_assert(!isnan(primal_weight), "primal_weight must not be NaN");
+  cuopt_assert(!isnan(step_size), "step_size must not be NaN");
+  cuopt_assert(!isnan(interaction), "interaction must not be NaN");
+  cuopt_assert(norm_squared_delta_primal >= f_t(0.0), "norm_squared_delta_primal must be >= 0");
+  cuopt_assert(norm_squared_delta_dual >= f_t(0.0), "norm_squared_delta_dual must be >= 0");
+  cuopt_assert(primal_weight > f_t(0.0), "primal_weight must be > 0");
+  cuopt_assert(step_size > f_t(0.0), "step_size must be > 0");
+
   const f_t movement =
     norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight;
   const f_t computed_interaction = f_t(2.0) * interaction * step_size;
 
-  *fixed_point_error = cuda::std::sqrt(movement + computed_interaction);
+  cuopt_assert(movement + computed_interaction >= f_t(0.0),
+               "Movement + computed interaction must be >= 0");
+
+  // Clamp to 0 to avoid NaN
+  *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction));
 
 #ifdef CUPDLP_DEBUG_MODE
   printf("movement %lf\n", movement);
@@ -1790,6 +1837,7 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
   // Sync to make sure all previous cuSparse operations are finished before setting the
   // potential_next_dual_solution
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
   // Make potential_next_dual_solution point towards reflected dual solution to reuse the code
   RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
                                            (void*)pdhg_solver_.get_reflected_dual().data()));
@@ -1813,6 +1861,7 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
     RAFT_CUDA_TRY(cudaStreamSynchronize(
       stream_view_));  // To make sure all the data is written from device to host
     RAFT_CUDA_TRY(cudaPeekAtLastError());
+
 #ifdef CUPDLP_DEBUG_MODE
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
 #endif
@@ -1847,9 +1896,15 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
 #endif
 
   for (size_t i = 0; i < climber_strategies_.size(); ++i) {
+    cuopt_assert(!std::isnan(restart_strategy_.fixed_point_error_[i]),
+                 "fixed_point_error_ must not be NaN after compute_fixed_error");
+    cuopt_assert(restart_strategy_.fixed_point_error_[i] >= f_t(0.0),
+                 "fixed_point_error_ must be >= 0 after compute_fixed_error");
     if (has_restarted[i]) {
       restart_strategy_.initial_fixed_point_error_[i] = restart_strategy_.fixed_point_error_[i];
-      has_restarted[i]                                = false;
+      cuopt_assert(!std::isnan(restart_strategy_.initial_fixed_point_error_[i]),
+                   "initial_fixed_point_error_ must not be NaN after assignment");
+      has_restarted[i] = false;
     }
   }
 }
@@ -1869,6 +1924,7 @@ void pdlp_solver_t<i_t, f_t>::transpose_primal_dual_to_row(
   rmm::device_uvector<f_t> dual_slack_transposed(
     is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_);
 
+  RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_));
   CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(),
                            CUBLAS_OP_T,
                            CUBLAS_OP_N,
@@ -1945,6 +2001,7 @@ void pdlp_solver_t<i_t, f_t>::transpose_primal_dual_back_to_col(
   rmm::device_uvector<f_t> dual_slack_transposed(
     is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_);
 
+  RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_));
   CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(),
                            CUBLAS_OP_T,
                            CUBLAS_OP_N,
@@ -2632,7 +2689,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
     rmm::device_uvector<f_t> d_atq(n, stream_view_);
 
     std::mt19937 gen(1);
-    std::normal_distribution<double> dist(0.0, 1.0);
+    std::normal_distribution<f_t> dist(f_t(0.0), f_t(1.0));
 
     for (int i = 0; i < m; ++i)
       z[i] = dist(gen);
@@ -2684,7 +2741,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
                                            vecATQ,
                                            CUSPARSE_SPMV_CSR_ALG2,
                                            (f_t*)cusparse_view_.buffer_transpose.data(),
-                                           stream_view_));
+                                           stream_view_.value()));
 
       // z = A @ A_t_q
       RAFT_CUSPARSE_TRY(
@@ -2697,7 +2754,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
                                            vecZ,
                                            CUSPARSE_SPMV_CSR_ALG2,
                                            (f_t*)cusparse_view_.buffer_non_transpose.data(),
-                                           stream_view_));
+                                           stream_view_.value()));
       // sigma_max_sq = dot(q, z)
       RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
                                                       m,
@@ -2706,7 +2763,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
                                                       d_z.data(),
                                                       primal_stride,
                                                       sigma_max_sq.data(),
-                                                      stream_view_));
+                                                      stream_view_.value()));
 
       cub::DeviceTransform::Transform(
         cuda::std::make_tuple(d_q.data(), d_z.data()),
 
@@ -34,11 +34,11 @@
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/reverse_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/logical.h>
 #include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
 
 #include <cub/cub.cuh>
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,7 @@ set(LP_CORE_FILES`
`24`	`24`	`${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu`
`25`	`25`	`${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu`
`26`	`26`	`${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu`
	`27`	`+ ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu`
`27`	`28`	`)`
`28`	`29`
`29`	`30`	`# C and Python adapter files`