3535
3636#include < thrust/count.h>
3737#include < thrust/extrema.h>
38+ #include < thrust/logical.h>
3839
40+ #include < cmath>
3941#include < optional>
4042#include < unordered_set>
4143
@@ -1186,24 +1188,55 @@ static void compute_stats(const rmm::device_uvector<f_t>& vec,
11861188 f_t & avg)
11871189{
11881190 auto abs_op = [] __host__ __device__ (f_t x) { return abs (x); };
1189- auto min_nonzero = [] __host__ __device__ (f_t x) {
1190- return x == 0 ? std::numeric_limits<f_t >::max () : abs (x);
1191- };
1192-
1193- smallest = thrust::transform_reduce (rmm::exec_policy (vec.stream ()),
1194- vec.begin (),
1195- vec.end (),
1196- min_nonzero,
1197- std::numeric_limits<f_t >::max (),
1198- thrust::minimum<f_t >());
1199-
1200- largest = thrust::transform_reduce (
1201- rmm::exec_policy (vec.stream ()), vec.begin (), vec.end (), abs_op, 0 .0f , thrust::maximum<f_t >());
1202-
1203- f_t sum = thrust::transform_reduce (
1204- rmm::exec_policy (vec.stream ()), vec.begin (), vec.end (), abs_op, 0 .0f , thrust::plus<f_t >());
1205-
1206- avg = sum / vec.size ();
1191+ auto min_nonzero = [] __host__ __device__ (f_t x)
1192+ -> f_t { return x == 0 ? std::numeric_limits<f_t >::max () : abs (x); };
1193+
1194+ cuopt_assert (vec.size () > 0 , " Vector must not be empty" );
1195+
1196+ auto stream = vec.stream ();
1197+ size_t n = vec.size ();
1198+
1199+ rmm::device_scalar<f_t > d_smallest (stream);
1200+ rmm::device_scalar<f_t > d_largest (stream);
1201+ rmm::device_scalar<f_t > d_sum (stream);
1202+
1203+ auto min_nz_iter = thrust::make_transform_iterator (vec.cbegin (), min_nonzero);
1204+ auto abs_iter = thrust::make_transform_iterator (vec.cbegin (), abs_op);
1205+
1206+ void * d_temp = nullptr ;
1207+ size_t bytes_1 = 0 , bytes_2 = 0 , bytes_3 = 0 ;
1208+ RAFT_CUDA_TRY (cub::DeviceReduce::Reduce (d_temp,
1209+ bytes_1,
1210+ min_nz_iter,
1211+ d_smallest.data (),
1212+ n,
1213+ cuda::minimum<>{},
1214+ std::numeric_limits<f_t >::max (),
1215+ stream));
1216+ RAFT_CUDA_TRY (cub::DeviceReduce::Reduce (
1217+ d_temp, bytes_2, abs_iter, d_largest.data (), n, cuda::maximum<>{}, f_t (0 ), stream));
1218+ RAFT_CUDA_TRY (cub::DeviceReduce::Reduce (
1219+ d_temp, bytes_3, abs_iter, d_sum.data (), n, cuda::std::plus<>{}, f_t (0 ), stream));
1220+
1221+ size_t max_bytes = std::max ({bytes_1, bytes_2, bytes_3});
1222+ rmm::device_buffer temp_buf (max_bytes, stream);
1223+
1224+ RAFT_CUDA_TRY (cub::DeviceReduce::Reduce (temp_buf.data (),
1225+ bytes_1,
1226+ min_nz_iter,
1227+ d_smallest.data (),
1228+ n,
1229+ cuda::minimum<>{},
1230+ std::numeric_limits<f_t >::max (),
1231+ stream));
1232+ RAFT_CUDA_TRY (cub::DeviceReduce::Reduce (
1233+ temp_buf.data (), bytes_2, abs_iter, d_largest.data (), n, cuda::maximum<>{}, f_t (0 ), stream));
1234+ RAFT_CUDA_TRY (cub::DeviceReduce::Reduce (
1235+ temp_buf.data (), bytes_3, abs_iter, d_sum.data (), n, cuda::std::plus<>{}, f_t (0 ), stream));
1236+
1237+ smallest = d_smallest.value (stream);
1238+ largest = d_largest.value (stream);
1239+ avg = d_sum.value (stream) / vec.size ();
12071240};
12081241
12091242template <typename f_t >
@@ -1406,11 +1439,25 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal,
14061439 const f_t interaction,
14071440 f_t * fixed_point_error)
14081441{
1442+ cuopt_assert (!isnan (norm_squared_delta_primal), " norm_squared_delta_primal must not be NaN" );
1443+ cuopt_assert (!isnan (norm_squared_delta_dual), " norm_squared_delta_dual must not be NaN" );
1444+ cuopt_assert (!isnan (primal_weight), " primal_weight must not be NaN" );
1445+ cuopt_assert (!isnan (step_size), " step_size must not be NaN" );
1446+ cuopt_assert (!isnan (interaction), " interaction must not be NaN" );
1447+ cuopt_assert (norm_squared_delta_primal >= f_t (0.0 ), " norm_squared_delta_primal must be >= 0" );
1448+ cuopt_assert (norm_squared_delta_dual >= f_t (0.0 ), " norm_squared_delta_dual must be >= 0" );
1449+ cuopt_assert (primal_weight > f_t (0.0 ), " primal_weight must be > 0" );
1450+ cuopt_assert (step_size > f_t (0.0 ), " step_size must be > 0" );
1451+
14091452 const f_t movement =
14101453 norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight;
14111454 const f_t computed_interaction = f_t (2.0 ) * interaction * step_size;
14121455
1413- *fixed_point_error = cuda::std::sqrt (movement + computed_interaction);
1456+ cuopt_assert (movement + computed_interaction >= f_t (0.0 ),
1457+ " Movement + computed interaction must be >= 0" );
1458+
1459+ // Clamp to 0 to avoid NaN
1460+ *fixed_point_error = cuda::std::sqrt (cuda::std::max (f_t (0.0 ), movement + computed_interaction));
14141461
14151462#ifdef CUPDLP_DEBUG_MODE
14161463 printf (" movement %lf\n " , movement);
@@ -1790,6 +1837,7 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
17901837 // Sync to make sure all previous cuSparse operations are finished before setting the
17911838 // potential_next_dual_solution
17921839 RAFT_CUDA_TRY (cudaStreamSynchronize (stream_view_));
1840+
17931841 // Make potential_next_dual_solution point towards reflected dual solution to reuse the code
17941842 RAFT_CUSPARSE_TRY (cusparseDnVecSetValues (cusparse_view.potential_next_dual_solution ,
17951843 (void *)pdhg_solver_.get_reflected_dual ().data ()));
@@ -1813,6 +1861,7 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
18131861 RAFT_CUDA_TRY (cudaStreamSynchronize (
18141862 stream_view_)); // To make sure all the data is written from device to host
18151863 RAFT_CUDA_TRY (cudaPeekAtLastError ());
1864+
18161865#ifdef CUPDLP_DEBUG_MODE
18171866 RAFT_CUDA_TRY (cudaDeviceSynchronize ());
18181867#endif
@@ -1847,9 +1896,15 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
18471896#endif
18481897
18491898 for (size_t i = 0 ; i < climber_strategies_.size (); ++i) {
1899+ cuopt_assert (!std::isnan (restart_strategy_.fixed_point_error_ [i]),
1900+ " fixed_point_error_ must not be NaN after compute_fixed_error" );
1901+ cuopt_assert (restart_strategy_.fixed_point_error_ [i] >= f_t (0.0 ),
1902+ " fixed_point_error_ must be >= 0 after compute_fixed_error" );
18501903 if (has_restarted[i]) {
18511904 restart_strategy_.initial_fixed_point_error_ [i] = restart_strategy_.fixed_point_error_ [i];
1852- has_restarted[i] = false ;
1905+ cuopt_assert (!std::isnan (restart_strategy_.initial_fixed_point_error_ [i]),
1906+ " initial_fixed_point_error_ must not be NaN after assignment" );
1907+ has_restarted[i] = false ;
18531908 }
18541909 }
18551910}
@@ -1869,6 +1924,7 @@ void pdlp_solver_t<i_t, f_t>::transpose_primal_dual_to_row(
18691924 rmm::device_uvector<f_t > dual_slack_transposed (
18701925 is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size (), stream_view_);
18711926
1927+ RAFT_CUBLAS_TRY (cublasSetStream (handle_ptr_->get_cublas_handle (), stream_view_));
18721928 CUBLAS_CHECK (cublasDgeam (handle_ptr_->get_cublas_handle (),
18731929 CUBLAS_OP_T,
18741930 CUBLAS_OP_N,
@@ -1945,6 +2001,7 @@ void pdlp_solver_t<i_t, f_t>::transpose_primal_dual_back_to_col(
19452001 rmm::device_uvector<f_t > dual_slack_transposed (
19462002 is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size (), stream_view_);
19472003
2004+ RAFT_CUBLAS_TRY (cublasSetStream (handle_ptr_->get_cublas_handle (), stream_view_));
19482005 CUBLAS_CHECK (cublasDgeam (handle_ptr_->get_cublas_handle (),
19492006 CUBLAS_OP_T,
19502007 CUBLAS_OP_N,
@@ -2632,7 +2689,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
26322689 rmm::device_uvector<f_t > d_atq (n, stream_view_);
26332690
26342691 std::mt19937 gen (1 );
2635- std::normal_distribution<double > dist (0.0 , 1.0 );
2692+ std::normal_distribution<f_t > dist (f_t ( 0.0 ), f_t ( 1.0 ) );
26362693
26372694 for (int i = 0 ; i < m; ++i)
26382695 z[i] = dist (gen);
@@ -2684,7 +2741,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
26842741 vecATQ,
26852742 CUSPARSE_SPMV_CSR_ALG2,
26862743 (f_t *)cusparse_view_.buffer_transpose .data (),
2687- stream_view_));
2744+ stream_view_. value () ));
26882745
26892746 // z = A @ A_t_q
26902747 RAFT_CUSPARSE_TRY (
@@ -2697,7 +2754,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
26972754 vecZ,
26982755 CUSPARSE_SPMV_CSR_ALG2,
26992756 (f_t *)cusparse_view_.buffer_non_transpose .data (),
2700- stream_view_));
2757+ stream_view_. value () ));
27012758 // sigma_max_sq = dot(q, z)
27022759 RAFT_CUBLAS_TRY (raft::linalg::detail::cublasdot (handle_ptr_->get_cublas_handle (),
27032760 m,
@@ -2706,7 +2763,7 @@ void pdlp_solver_t<i_t, f_t>::compute_initial_step_size()
27062763 d_z.data (),
27072764 primal_stride,
27082765 sigma_max_sq.data (),
2709- stream_view_));
2766+ stream_view_. value () ));
27102767
27112768 cub::DeviceTransform::Transform (
27122769 cuda::std::make_tuple (d_q.data (), d_z.data ()),
0 commit comments