234 changes: 194 additions & 40 deletions cpp/src/barrier/barrier.cu
@@ -165,6 +165,8 @@ class iteration_data_t {
d_augmented_diagonal_indices_(0, lp.handle_ptr->get_stream()),
use_augmented(false),
has_factorization(false),
n_free_vars(0),
d_is_free_(0, lp.handle_ptr->get_stream()),
num_factorizations(0),
has_solve_info(false),
settings_(settings),
@@ -287,6 +289,11 @@ class iteration_data_t {
f_t estimated_nz_AAT = 0.0;
std::vector<i_t> dense_columns_unordered;

if (has_Q) {
// QPs always use the augmented system; skip dense column detection
use_augmented = true;
n_dense_columns = 0;
} else {
Comment on lines +292 to +296

⚠️ Potential issue | 🟠 Major

Don't force the augmented system for every QP.

This now disables the existing ADAT path even for diagonal-Q models with no native free variables. That grows the factorization from roughly m to m + n, skips dense-column handling, and can turn previously workable large QPs into memory/time regressions unrelated to the free-variable fix. Please gate this on cases that actually require the augmented formulation, e.g. non-diagonal Q or at least one native free column.

As per coding guidelines, "Verify correct problem size checks before expensive GPU/CPU operations; prevent resource exhaustion on oversized problems."

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cpp/src/barrier/barrier.cu` around lines 292-296, the change currently
forces use_augmented = true whenever has_Q is true, removing the ADAT path and
skipping dense-column handling; instead only enable the augmented system when it
is actually required (e.g., Q is non-diagonal or there is at least one native
free column). Update the conditional around has_Q to test the actual need for
augmentation (check the Q-diagonality flag/criterion used in this module and the
native free-column count variable) and only set use_augmented = true and
n_dense_columns = 0 in that case; otherwise preserve the ADAT path and run
dense-column detection as before (keep existing logic that handles diagonal-Q +
zero native-free-columns).
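
A minimal sketch of the gating asked for above; the helper name `needs_augmented_system` and the exact flag/counter spellings are illustrative, not taken from the file:

// Hedged sketch only: `q_is_diagonal` and `n_native_free_cols` stand in for
// whatever diagonality flag and native free-column counter this module uses.
static bool needs_augmented_system(bool has_Q, bool q_is_diagonal, int n_native_free_cols)
{
  // A non-diagonal Q cannot be folded into the ADAT normal equations.
  if (has_Q && !q_is_diagonal) { return true; }
  // Native free columns have no z/x barrier term, so the augmented KKT
  // system is the safe formulation for them.
  if (n_native_free_cols > 0) { return true; }
  // Diagonal-Q (or pure LP) with every column bounded keeps the cheaper ADAT path.
  return false;
}

// Possible call site, preserving dense-column detection otherwise:
// if (needs_augmented_system(has_Q, Q_diagonal, n_free_vars)) {
//   use_augmented   = true;
//   n_dense_columns = 0;
// } else {
//   ... existing dense-column detection / ADAT path ...
// }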

f_t start_column_density = tic();
// Ignore Q matrix for now
find_dense_columns(
Expand Down Expand Up @@ -320,6 +327,7 @@ class iteration_data_t {
n_dense_columns = 0;
use_augmented = !Q_diagonal;
}
} // end !has_Q

if (use_augmented) {
settings.log.printf("Linear system : augmented\n");
@@ -1538,6 +1546,8 @@ class iteration_data_t {

bool use_augmented;
i_t symbolic_status;
i_t n_free_vars{0};
rmm::device_uvector<i_t> d_is_free_; // 1 if variable is free (QP only), 0 otherwise

std::unique_ptr<sparse_cholesky_base_t<i_t, f_t>> chol;

@@ -1908,6 +1918,12 @@ int barrier_solver_t<i_t, f_t>::initial_point(iteration_data_t<i_t, f_t>& data)
}
}
}
// Free variables have z = 0 (no complementarity condition)
if (data.n_free_vars > 0) {
for (i_t j : presolve_info.free_variable_indices) {
data.z[j] = 0.0;
}
}
} else if (use_augmented) {
dense_vector_t<i_t, f_t> dual_rhs(lp.num_cols + lp.num_rows);
dual_rhs.set_scalar(0.0);
@@ -1970,8 +1986,25 @@ int barrier_solver_t<i_t, f_t>::initial_point(iteration_data_t<i_t, f_t>& data)
vector_norm2<i_t, f_t>(data.dual_residual));
#endif
// Make sure (w, x, v, z) > 0
// Save free variable x values before forcing positive (they can be negative)
std::vector<f_t> free_x_save;
if (data.n_free_vars > 0) {
free_x_save.resize(data.n_free_vars);
for (i_t k = 0; k < data.n_free_vars; ++k) {
free_x_save[k] = data.x[presolve_info.free_variable_indices[k]];
}
}
data.w.ensure_positive(epsilon_adjust);
data.x.ensure_positive(epsilon_adjust);

// For native free variables (QP): restore x values and set z = 0
if (data.n_free_vars > 0) {
for (i_t k = 0; k < data.n_free_vars; ++k) {
i_t j = presolve_info.free_variable_indices[k];
data.x[j] = free_x_save[k];
data.z[j] = 0.0;
}
}
#ifdef PRINT_INFO
settings.log.printf("min v %e min z %e\n", data.v.minimum(), data.z.minimum());
#endif
@@ -2166,6 +2199,29 @@ f_t barrier_solver_t<i_t, f_t>::gpu_max_step_to_boundary(iteration_data_t<i_t, f
const rmm::device_uvector<f_t>& x,
const rmm::device_uvector<f_t>& dx)
{
// For x-sized vectors with free variables, skip free vars in ratio test
const bool has_free = data.n_free_vars > 0 && static_cast<i_t>(x.size()) == lp.num_cols;

if (has_free) {
auto is_free_ptr = data.d_is_free_.data();
auto ratio_test_free = [is_free_ptr] HD(const thrust::tuple<f_t, f_t, i_t> t) {
const f_t dx_val = thrust::get<0>(t);
const f_t x_val = thrust::get<1>(t);
const i_t is_free = thrust::get<2>(t);
if (is_free) return f_t(1.0);
if (dx_val < f_t(0.0)) return -x_val / dx_val;
return f_t(1.0);
};

return data.transform_reduce_helper_.transform_reduce(
thrust::make_zip_iterator(dx.data(), x.data(), is_free_ptr),
thrust::minimum<f_t>(),
ratio_test_free,
f_t(1.0),
x.size(),
stream_view_);
}

return data.transform_reduce_helper_.transform_reduce(
thrust::make_zip_iterator(dx.data(), x.data()),
thrust::minimum<f_t>(),
@@ -2248,11 +2304,37 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
raft::common::nvtx::range fun_scope("Barrier: GPU diag, inv diag and sqrt inv diag formation");

// diag = z ./ x
cub::DeviceTransform::Transform(cuda::std::make_tuple(data.d_z_.data(), data.d_x_.data()),
data.d_diag_.data(),
data.d_diag_.size(),
cuda::std::divides<>{},
stream_view_.value());
// For native free variables (QP): use Q diagonal if available, otherwise a static regularizer
if (data.n_free_vars > 0) {
constexpr f_t free_var_reg = 1e-7;
if (data.Q.n > 0 && data.Q_diagonal) {
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_z_.data(), data.d_x_.data(),
data.d_is_free_.data(), data.d_Q_diag_.data()),
data.d_diag_.data(),
data.d_diag_.size(),
[free_var_reg] HD(f_t z_j, f_t x_j, i_t is_free, f_t q_jj) {
if (!is_free) return z_j / x_j;
return (q_jj > f_t(0)) ? f_t(0) : free_var_reg;
},
stream_view_.value());
} else {
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_z_.data(), data.d_x_.data(), data.d_is_free_.data()),
data.d_diag_.data(),
data.d_diag_.size(),
[free_var_reg] HD(f_t z_j, f_t x_j, i_t is_free) {
return is_free ? free_var_reg : (z_j / x_j);
},
stream_view_.value());
}
} else {
cub::DeviceTransform::Transform(cuda::std::make_tuple(data.d_z_.data(), data.d_x_.data()),
data.d_diag_.data(),
data.d_diag_.size(),
cuda::std::divides<>{},
stream_view_.value());
}
RAFT_CHECK_CUDA(stream_view_);
// diag = z ./ x + E * (v ./ w) * E'

@@ -2358,20 +2440,40 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
RAFT_CHECK_CUDA(stream_view_);
// tmp3 <- tmp3 .+ -(complementarity_xz_rhs ./ x) .+ dual_rhs
// tmp4 <- inv_diag .* tmp3
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_inv_diag.data(),
data.d_tmp3_.data(),
data.d_complementarity_xz_rhs_.data(),
data.d_x_.data(),
data.d_dual_rhs_.data()),
thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()),
lp.num_cols,
[] HD(f_t inv_diag, f_t tmp3, f_t complementarity_xz_rhs, f_t x, f_t dual_rhs)
-> thrust::tuple<f_t, f_t> {
const f_t tmp = tmp3 + -(complementarity_xz_rhs / x) + dual_rhs;
return {tmp, inv_diag * tmp};
},
stream_view_.value());
// For free variables, skip the complementarity_xz_rhs / x term
if (data.n_free_vars > 0) {
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_inv_diag.data(),
data.d_tmp3_.data(),
data.d_complementarity_xz_rhs_.data(),
data.d_x_.data(),
data.d_dual_rhs_.data(),
data.d_is_free_.data()),
thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()),
lp.num_cols,
[] HD(f_t inv_diag, f_t tmp3, f_t complementarity_xz_rhs, f_t x, f_t dual_rhs, i_t is_free)
-> thrust::tuple<f_t, f_t> {
const f_t xz_term = is_free ? f_t(0) : (complementarity_xz_rhs / x);
const f_t tmp = tmp3 - xz_term + dual_rhs;
return {tmp, inv_diag * tmp};
},
stream_view_.value());
} else {
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_inv_diag.data(),
data.d_tmp3_.data(),
data.d_complementarity_xz_rhs_.data(),
data.d_x_.data(),
data.d_dual_rhs_.data()),
thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()),
lp.num_cols,
[] HD(f_t inv_diag, f_t tmp3, f_t complementarity_xz_rhs, f_t x, f_t dual_rhs)
-> thrust::tuple<f_t, f_t> {
const f_t tmp = tmp3 + -(complementarity_xz_rhs / x) + dual_rhs;
return {tmp, inv_diag * tmp};
},
stream_view_.value());
}
RAFT_CHECK_CUDA(stream_view_);
raft::copy(data.d_r1_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_);
raft::copy(data.d_r1_prime_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_);
@@ -2656,17 +2758,33 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
raft::common::nvtx::range fun_scope("Barrier: dz formation GPU");

// dz = (complementarity_xz_rhs - z.* dx) ./ x;
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_complementarity_xz_rhs_.data(),
data.d_z_.data(),
data.d_dx_.data(),
data.d_x_.data()),
data.d_dz_.data(),
data.d_dz_.size(),
[] HD(f_t complementarity_xz_rhs, f_t z, f_t dx, f_t x) {
return (complementarity_xz_rhs - z * dx) / x;
},
stream_view_.value());
// For free variables, dz = 0
if (data.n_free_vars > 0) {
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_complementarity_xz_rhs_.data(),
data.d_z_.data(),
data.d_dx_.data(),
data.d_x_.data(),
data.d_is_free_.data()),
data.d_dz_.data(),
data.d_dz_.size(),
[] HD(f_t complementarity_xz_rhs, f_t z, f_t dx, f_t x, i_t is_free) {
return is_free ? f_t(0) : ((complementarity_xz_rhs - z * dx) / x);
},
stream_view_.value());
} else {
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_complementarity_xz_rhs_.data(),
data.d_z_.data(),
data.d_dx_.data(),
data.d_x_.data()),
data.d_dz_.data(),
data.d_dz_.size(),
[] HD(f_t complementarity_xz_rhs, f_t z, f_t dx, f_t x) {
return (complementarity_xz_rhs - z * dx) / x;
},
stream_view_.value());
}
RAFT_CHECK_CUDA(stream_view_);
raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_);
}
@@ -2956,9 +3074,9 @@ void barrier_solver_t<i_t, f_t>::compute_target_mu(
stream_view_);

complementarity_aff_sum = complementarity_xz_aff_sum + complementarity_wv_aff_sum;

mu_aff = (complementarity_aff_sum) /
(static_cast<f_t>(data.x.size()) + static_cast<f_t>(data.n_upper_bounds));
f_t mu_denom = static_cast<f_t>(data.x.size()) + static_cast<f_t>(data.n_upper_bounds);
mu_denom -= static_cast<f_t>(data.n_free_vars);
mu_aff = complementarity_aff_sum / mu_denom;
Comment on lines +3077 to +3079

⚠️ Potential issue | 🔴 Critical

Guard the zero-complementarity case before dividing by mu_denom.

With native free variables, an equality-constrained QP can legitimately have n_free_vars == x.size() and n_upper_bounds == 0. Then this denominator becomes 0, so mu_aff/mu turn into NaN and the predictor-corrector step blows up. Please centralize the pair-count calculation, handle denom == 0, and reuse that helper for the initial mu computed in solve() as well.

Possible fix
+ auto complementarity_pairs = [&]() -> f_t {
+   return static_cast<f_t>(data.x.size()) +
+          static_cast<f_t>(data.n_upper_bounds) -
+          static_cast<f_t>(data.n_free_vars);
+ };
+
- f_t mu_denom            = static_cast<f_t>(data.x.size()) + static_cast<f_t>(data.n_upper_bounds);
- mu_denom -= static_cast<f_t>(data.n_free_vars);
- mu_aff = complementarity_aff_sum / mu_denom;
+ const f_t mu_denom = complementarity_pairs();
+ if (mu_denom == f_t(0)) {
+   mu_aff = f_t(0);
+   sigma  = f_t(0);
+   new_mu = f_t(0);
+   return;
+ }
+ mu_aff = complementarity_aff_sum / mu_denom;

Apply the same helper/guard in compute_mu() and in the initial mu setup inside solve().

As per coding guidelines, "Check numerical stability: prevent overflow/underflow, precision loss, division by zero/near-zero, and use epsilon comparisons for floating-point equality checks."

Also applies to: 3303-3312

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cpp/src/barrier/barrier.cu` around lines 3077-3079, the division by
mu_denom when computing mu_aff can hit zero if all variables are free
(data.n_free_vars == data.x.size() and data.n_upper_bounds == 0); create a
helper (e.g., compute_mu_denom or centralize pair-count logic used by
compute_mu() and solve()) that computes denom = static_cast<f_t>(data.x.size())
+ static_cast<f_t>(data.n_upper_bounds) - static_cast<f_t>(data.n_free_vars) and
returns a guarded value or a boolean indicating zero; use that helper both where
mu_aff is set (mu_aff = complementarity_aff_sum / denom) and where the initial
mu is computed in solve(), and if denom is zero or below an epsilon, set
mu_aff/mu to 0 (or a safe fallback) instead of performing the division to avoid
NaN/INF.
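
One possible shape for the shared helper the comment asks for; placement as a free function and the name `complementarity_pair_count` are assumptions, while the fields it reads (`x.size()`, `n_upper_bounds`, `n_free_vars`) all exist on `iteration_data_t`:

// Sketch: count complementarity pairs once, clamped at zero so callers can
// skip the division (and set mu_aff / mu to 0) when no pairs remain.
template <typename i_t, typename f_t>
f_t complementarity_pair_count(const iteration_data_t<i_t, f_t>& data)
{
  const f_t pairs = static_cast<f_t>(data.x.size()) +
                    static_cast<f_t>(data.n_upper_bounds) -
                    static_cast<f_t>(data.n_free_vars);
  return pairs > f_t(0) ? pairs : f_t(0);
}

compute_target_mu(), compute_mu(), and the initial mu setup in solve() could then all divide only when this count is positive.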

sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0)));
new_mu = sigma * mu_aff;
}
@@ -2968,12 +3086,34 @@ void barrier_solver_t<i_t, f_t>::compute_cc_rhs(iteration_data_t<i_t, f_t>& data
{
raft::common::nvtx::range fun_scope("Barrier: compute_cc_rhs");

cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_dx_aff_.data(), data.d_dz_aff_.data()),
data.d_complementarity_xz_rhs_.data(),
data.d_complementarity_xz_rhs_.size(),
[new_mu] HD(f_t dx_aff, f_t dz_aff) { return -(dx_aff * dz_aff) + new_mu; },
stream_view_.value());
auto fill_linear_cc_rhs = [&](raft::device_span<f_t> out,
raft::device_span<const f_t> dx_aff,
raft::device_span<const f_t> dz_aff) {
if (out.empty()) return;
if (data.n_free_vars > 0) {
auto is_free_ptr = data.d_is_free_.data();
cub::DeviceTransform::Transform(
cuda::std::make_tuple(dx_aff.data(), dz_aff.data(), is_free_ptr),
out.data(),
out.size(),
[new_mu] HD(f_t dx_aff_val, f_t dz_aff_val, i_t is_free) {
return is_free ? f_t(0) : (-(dx_aff_val * dz_aff_val) + new_mu);
},
stream_view_.value());
} else {
cub::DeviceTransform::Transform(
cuda::std::make_tuple(dx_aff.data(), dz_aff.data()),
out.data(),
out.size(),
[new_mu] HD(f_t dx_aff_val, f_t dz_aff_val) {
return -(dx_aff_val * dz_aff_val) + new_mu;
},
stream_view_.value());
}
};
fill_linear_cc_rhs(cuopt::make_span(data.d_complementarity_xz_rhs_),
cuopt::make_span(data.d_dx_aff_),
cuopt::make_span(data.d_dz_aff_));
RAFT_CHECK_CUDA(stream_view_);
cub::DeviceTransform::Transform(
cuda::std::make_tuple(data.d_dw_aff_.data(), data.d_dv_aff_.data()),
@@ -3160,13 +3300,16 @@ void barrier_solver_t<i_t, f_t>::compute_mu(iteration_data_t<i_t, f_t>& data, f_
{
raft::common::nvtx::range fun_scope("Barrier: compute_mu");

f_t mu_denom = static_cast<f_t>(data.x.size()) + static_cast<f_t>(data.n_upper_bounds);
mu_denom -= static_cast<f_t>(data.n_free_vars); // free vars don't contribute to mu

mu = (data.sum_reduce_helper_.sum(data.d_complementarity_xz_residual_.begin(),
data.d_complementarity_xz_residual_.size(),
stream_view_) +
data.sum_reduce_helper_.sum(data.d_complementarity_wv_residual_.begin(),
data.d_complementarity_wv_residual_.size(),
stream_view_)) /
(static_cast<f_t>(data.x.size()) + static_cast<f_t>(data.n_upper_bounds));
mu_denom;
}

template <typename i_t, typename f_t>
@@ -3445,6 +3588,17 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
if (lp.Q.n > 0) { create_Q(lp, Q); }

iteration_data_t<i_t, f_t> data(lp, num_upper_bounds, Q, settings);
// Set up native free variable tracking for QPs
if (!presolve_info.free_variable_indices.empty()) {
data.n_free_vars = presolve_info.free_variable_indices.size();
std::vector<i_t> is_free_host(n, 0);
for (i_t j : presolve_info.free_variable_indices) {
is_free_host[j] = 1;
}
data.d_is_free_.resize(n, stream_view_);
raft::copy(data.d_is_free_.data(), is_free_host.data(), n, stream_view_);
settings.log.printf("Native free variables (QP) : %d\n", data.n_free_vars);
}
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
13 changes: 12 additions & 1 deletion cpp/src/dual_simplex/presolve.cpp
@@ -1139,7 +1139,18 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original,

problem.Q.check_matrix("Before free variable expansion");

if (settings.barrier_presolve && free_variables > 0) {
// For QPs using the augmented system, keep free variables as-is rather than
// splitting x = v - w. The barrier solver handles them natively with a
// static regularizer on the diagonal instead of z/x complementarity terms.
if (settings.barrier_presolve && free_variables > 0 && problem.Q.n > 0) {
presolve_info.free_variable_indices.clear();
for (i_t j = 0; j < problem.num_cols; j++) {
if (problem.lower[j] == -inf && problem.upper[j] == inf) {
presolve_info.free_variable_indices.push_back(j);
}
}
settings.log.printf("Keeping %d free variables for QP augmented system\n", free_variables);
} else if (settings.barrier_presolve && free_variables > 0) {
// We have a variable x_j: with -inf < x_j < inf
// we create new variables v and w with 0 <= v, w and x_j = v - w
// Constraints
3 changes: 3 additions & 0 deletions cpp/src/dual_simplex/presolve.hpp
@@ -153,6 +153,9 @@ struct presolve_info_t {

// Variables that were negated to handle -inf < x_j <= u_j
std::vector<i_t> negated_variables;

// Free variable indices for QP augmented system (not split, handled natively)
std::vector<i_t> free_variable_indices;
};

template <typename i_t, typename f_t>