diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index e69ff7b9a..d0af517e6 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2507,7 +2507,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_); pc_.resize(original_lp_.num_cols); - original_lp_.A.transpose(pc_.AT); + pc_.Arow = Arow_; { raft::common::nvtx::range scope_sb("BB::strong_branching"); strong_branching(original_lp_, diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c38e98e27..aa059b532 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -66,14 +66,14 @@ template objective_change_estimate_t single_pivot_objective_change_estimate( const lp_problem_t& lp, const simplex_solver_settings_t& settings, - const csc_matrix_t& A_transpose, + const csr_matrix_t& Arow, const std::vector& vstatus, i_t variable_j, i_t basic_j, const lp_solution_t& lp_solution, const std::vector& basic_list, const std::vector& nonbasic_list, - const std::vector& nonbasic_mark, + const std::vector& nonbasic_end, basis_update_mpf_t& basis_factors, std::vector& workspace, std::vector& delta_z, @@ -82,7 +82,6 @@ objective_change_estimate_t single_pivot_objective_change_estimate( // Compute the objective estimate for the down and up branches of variable j assert(variable_j >= 0); assert(basic_j >= 0); - // Down branch i_t direction = -1; sparse_vector_t e_k(lp.num_rows, 0); @@ -104,11 +103,11 @@ objective_change_estimate_t single_pivot_objective_change_estimate( std::vector delta_z_indices; // delta_z starts out all zero if (use_transpose) { - compute_delta_z(A_transpose, + compute_delta_z(Arow, delta_y, variable_j, direction, - nonbasic_mark, + nonbasic_end, workspace, delta_z_indices, delta_z, @@ -234,10 +233,8 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, basic_map[basic_list[i]] = i; } - std::vector nonbasic_mark(n, -1); - for (i_t i = 0; i < n - m; i++) { - nonbasic_mark[nonbasic_list[i]] = i; - } + std::vector nonbasic_end(lp.num_rows); + compute_initial_nonbasic_end(basic_map, pc.Arow, nonbasic_end); for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; @@ -246,14 +243,14 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, objective_change_estimate_t estimate = single_pivot_objective_change_estimate(lp, settings, - pc.AT, + pc.Arow, vstatus, j, basic_map[j], lp_solution, basic_list, nonbasic_list, - nonbasic_mark, + nonbasic_end, basis_factors, workspace, delta_z, @@ -1476,10 +1473,13 @@ i_t pseudo_costs_t::reliable_variable_selection( basic_map[worker->basic_list[i]] = i; } - std::vector nonbasic_mark(n, -1); - for (i_t i = 0; i < n - m; i++) { - nonbasic_mark[worker->nonbasic_list[i]] = i; - } + // Each thread will have a different basis + // So we need to make a copy of Arow before we permute the columns + // so that nonbasic variables appear first + csr_matrix_t local_Arow = Arow; + + std::vector nonbasic_end(m); + compute_initial_nonbasic_end(basic_map, local_Arow, nonbasic_end); for (auto& [score, j] : unreliable_list) { if (pseudo_cost_num_down[j] == 0 || pseudo_cost_num_up[j] == 0) { @@ -1487,14 +1487,14 @@ i_t pseudo_costs_t::reliable_variable_selection( objective_change_estimate_t estimate = single_pivot_objective_change_estimate(worker->leaf_problem, settings, - AT, + local_Arow, node_ptr->vstatus, j, basic_map[j], leaf_solution, worker->basic_list, worker->nonbasic_list, - nonbasic_mark, + nonbasic_end, worker->basis_factors, workspace, delta_z, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 009bd8b81..43d6448e6 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -435,7 +435,7 @@ class pseudo_costs_t { pseudo_cost_num_up(num_variables), pseudo_cost_mutex_up(num_variables), pseudo_cost_mutex_down(num_variables), - AT(1, 1, 1) + Arow(1, 1, 1) { } @@ -526,7 +526,7 @@ class pseudo_costs_t { reliability_branching_settings_t reliability_branching_settings; - csc_matrix_t AT; // Transpose of the constraint matrix A + csr_matrix_t Arow; std::vector> pseudo_cost_sum_up; std::vector> pseudo_cost_sum_down; std::vector> pseudo_cost_num_up; diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp index c5fee4e10..caa2351ef 100644 --- a/cpp/src/dual_simplex/basis_solves.cpp +++ b/cpp/src/dual_simplex/basis_solves.cpp @@ -380,7 +380,7 @@ i_t factorize_basis(const csc_matrix_t& A, } work_estimate += 3 * Sdim; - Srank = right_looking_lu(S, + Srank = right_looking_lu2(S, settings, settings.threshold_partial_pivoting_tol, identity, diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 75e5ecae3..30285048a 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -130,79 +130,136 @@ void compute_reduced_cost_update(const lp_problem_t& lp, } template -void compute_delta_z(const csc_matrix_t& A_transpose, +void compute_delta_z(const csr_matrix_t& Arow, const sparse_vector_t& delta_y, i_t leaving_index, i_t direction, - const std::vector& nonbasic_mark, + const std::vector& nonbasic_end, std::vector& delta_z_mark, std::vector& delta_z_indices, std::vector& delta_z, f_t& work_estimate) { // delta_zN = - N'*delta_y - const i_t nz_delta_y = delta_y.i.size(); - size_t nnz_processed = 0; - size_t nonbasic_marked = 0; + const i_t nz_delta_y = delta_y.i.size(); + size_t nnz_processed = 0; for (i_t k = 0; k < nz_delta_y; k++) { const i_t i = delta_y.i[k]; const f_t delta_y_i = delta_y.x[k]; if (std::abs(delta_y_i) < 1e-12) { continue; } - const i_t row_start = A_transpose.col_start[i]; - const i_t row_end = A_transpose.col_start[i + 1]; + const i_t row_start = Arow.row_start[i]; + const i_t row_end = nonbasic_end[i] + 1; nnz_processed += row_end - row_start; for (i_t p = row_start; p < row_end; ++p) { - const i_t j = A_transpose.i[p]; - if (nonbasic_mark[j] >= 0) { - delta_z[j] -= delta_y_i * A_transpose.x[p]; - nonbasic_marked++; - if (!delta_z_mark[j]) { - delta_z_mark[j] = 1; - delta_z_indices.push_back(j); - } + const i_t j = Arow.j[p]; + delta_z[j] -= delta_y_i * Arow.x[p]; + if (!delta_z_mark[j]) { + delta_z_mark[j] = 1; + delta_z_indices.push_back(j); } } } work_estimate += 4 * nz_delta_y; - work_estimate += 2 * nnz_processed; - work_estimate += 3 * nonbasic_marked; + work_estimate += 4 * nnz_processed; work_estimate += 2 * delta_z_indices.size(); // delta_zB = sigma*ei delta_z[leaving_index] = direction; +} -#ifdef CHECK_CHANGE_IN_REDUCED_COST - const i_t m = A_transpose.n; - const i_t n = A_transpose.m; - std::vector delta_y_dense(m); - delta_y.to_dense(delta_y_dense); - std::vector delta_z_check(n); - std::vector delta_z_mark_check(n, 0); - std::vector delta_z_indices_check; - phase2::compute_reduced_cost_update(lp, - basic_list, - nonbasic_list, - delta_y_dense, - leaving_index, - direction, - delta_z_mark_check, - delta_z_indices_check, - delta_z_check, - work_estimate); - f_t error_check = 0.0; - for (i_t k = 0; k < n; ++k) { - const f_t diff = std::abs(delta_z[k] - delta_z_check[k]); - if (diff > 1e-6) { - printf("delta_z error %d transpose %e no transpose %e diff %e\n", - k, - delta_z[k], - delta_z_check[k], - diff); +template +void compute_initial_nonbasic_end(const std::vector& basic_mark, + csr_matrix_t& Arow, + std::vector& nonbasic_end) +{ + i_t m = Arow.m; + // Swap coefficients so that all coefficients for nonbasic variables in row i + // appear from Arow.row_start[i] to nonbasic_end[i] + for (i_t i = 0; i < m; i++) { + i_t lo = Arow.row_start[i]; + i_t hi = Arow.row_start[i + 1] - 1; + while (lo <= hi) { + const i_t j = Arow.j[lo]; + if (basic_mark[j] >= 0) { + // Swap coefficients + std::swap(Arow.j[lo], Arow.j[hi]); + std::swap(Arow.x[lo], Arow.x[hi]); + hi--; + } else { + lo++; + } } - error_check = std::max(error_check, diff); + nonbasic_end[i] = hi; } - if (error_check > 1e-6) { printf("delta_z error %e\n", error_check); } -#endif +} + +template +void update_Arow(i_t leaving, + i_t entering, + const csc_matrix_t& A, + std::vector& row_mark, + std::vector& nonbasic_end, + csr_matrix_t& Arow, + f_t& work_estimate) +{ + // Update the Arow matrix to reflect the new basis. So + // that the coefficients for all nonbasic variables in row i + // appear in Arow.row_start[i] to nonbasic_end[i]. + // Swap the coefficients for the leaving and entering variables + // The leaving variable is now nonbasic, the entering variable is now basic + row_mark.clear(); + const i_t col_start_enter = A.col_start[entering]; + const i_t col_end_enter = A.col_start[entering + 1]; + for (i_t p = col_start_enter; p < col_end_enter; p++) { + const i_t i = A.i[p]; + row_mark.push_back(i); + } + work_estimate += 2 * (col_end_enter - col_start_enter); + + // Move the coefficients for the entering variable to the end of the nonbasics + // And decrement the nonbasic count for the row + for (i_t i : row_mark) { + const i_t row_start = Arow.row_start[i]; + const i_t nb_end = nonbasic_end[i]; + for (i_t p = row_start; p <= nb_end; p++) { + const i_t j = Arow.j[p]; + if (j == entering) { + std::swap(Arow.j[p], Arow.j[nb_end]); + std::swap(Arow.x[p], Arow.x[nb_end]); + nonbasic_end[i]--; + break; + } + } + work_estimate += nb_end - row_start; + } + work_estimate += 2 * row_mark.size(); + + row_mark.clear(); + const i_t col_start_leave = A.col_start[leaving]; + const i_t col_end_leave = A.col_start[leaving + 1]; + for (i_t p = col_start_leave; p < col_end_leave; p++) { + const i_t i = A.i[p]; + row_mark.push_back(i); + } + work_estimate += 2 * (col_end_leave - col_start_leave); + + // Move the coefficient for the leaving variable to the end of the nonbasics + // And increment the nonbasic count for the row + for (i_t i : row_mark) { + const i_t nb_end = nonbasic_end[i]; + const i_t row_end = Arow.row_start[i + 1]; + for (i_t p = nb_end + 1; p < row_end; p++) { + const i_t j = Arow.j[p]; + if (j == leaving) { + std::swap(Arow.j[p], Arow.j[nb_end + 1]); + std::swap(Arow.x[p], Arow.x[nb_end + 1]); + nonbasic_end[i]++; + break; + } + } + work_estimate += row_end - nb_end; + } + work_estimate += 2 * row_mark.size(); } namespace phase2 { @@ -807,7 +864,7 @@ void update_single_primal_infeasibility(const std::vector& lower, } template -void update_primal_infeasibilities(const lp_problem_t& lp, +bool update_primal_infeasibilities(const lp_problem_t& lp, const simplex_solver_settings_t& settings, const std::vector& basic_list, const std::vector& x, @@ -819,6 +876,7 @@ void update_primal_infeasibilities(const lp_problem_t& lp, f_t& primal_inf, f_t& work_estimate) { + bool became_feasible = false; const f_t primal_tol = settings.primal_tol; const i_t nz = basic_change_list.size(); for (i_t k = 0; k < nz; ++k) { @@ -831,8 +889,10 @@ void update_primal_infeasibilities(const lp_problem_t& lp, const f_t old_val = squared_infeasibilities[j]; squared_infeasibilities[j] = 0.0; primal_inf = std::max(0.0, primal_inf - old_val); + if (old_val != 0.0) { became_feasible = true; } continue; } + const f_t old_val = squared_infeasibilities[j]; update_single_primal_infeasibility(lp.lower, lp.upper, x, @@ -841,8 +901,10 @@ void update_primal_infeasibilities(const lp_problem_t& lp, infeasibility_indices, j, primal_inf); + if (old_val != 0.0 && squared_infeasibilities[j] == 0.0) { became_feasible = true; } } work_estimate += 8 * nz; + return became_feasible; } template @@ -850,33 +912,14 @@ void clean_up_infeasibilities(std::vector& squared_infeasibilities, std::vector& infeasibility_indices, f_t& work_estimate) { - bool needs_clean_up = false; - const i_t initial_nz = infeasibility_indices.size(); - for (i_t k = 0; k < initial_nz; ++k) { - const i_t j = infeasibility_indices[k]; - const f_t squared_infeas = squared_infeasibilities[j]; - if (squared_infeas == 0.0) { needs_clean_up = true; } - } - work_estimate += 2 * initial_nz; - - if (needs_clean_up) { - i_t num_cleans = 0; - work_estimate += 2 * infeasibility_indices.size(); - for (size_t k = 0; k < infeasibility_indices.size(); ++k) { - const i_t j = infeasibility_indices[k]; - const f_t squared_infeas = squared_infeasibilities[j]; - if (squared_infeas == 0.0) { - const i_t new_j = infeasibility_indices.back(); - infeasibility_indices[k] = new_j; - infeasibility_indices.pop_back(); - if (squared_infeasibilities[new_j] == 0.0) { - k--; - } // Decrement k so that we process the same index again - num_cleans++; - } - } - work_estimate += 4 * num_cleans; + i_t write = 0; + const i_t n = infeasibility_indices.size(); + for (i_t k = 0; k < n; ++k) { + const i_t j = infeasibility_indices[k]; + if (squared_infeasibilities[j] != 0.0) { infeasibility_indices[write++] = j; } } + infeasibility_indices.resize(write); + work_estimate += 3 * n; } template @@ -1480,8 +1523,10 @@ i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t& ft, i_t basic_leaving_index, i_t entering_index, + i_t leaving_index, std::vector& steepest_edge_norms) { +#if 0 sparse_vector_t es_sparse(m, 1); es_sparse.i[0] = basic_leaving_index; es_sparse.x[0] = -1.0; @@ -1489,6 +1534,16 @@ i_t compute_steepest_edge_norm_entering(const simplex_solver_settings_t 1e-1) { + settings.log.printf("Steepest edge norm %e for entering %d. Leaving %d norm %e. Diff %e\n", + steepest_edge_norms[entering_index], + entering_index, + leaving_index, + steepest_edge_norms[leaving_index], + std::abs(steepest_edge_norms[entering_index] - steepest_edge_norms[leaving_index])); + } +#endif + steepest_edge_norms[entering_index] = steepest_edge_norms[leaving_index]; #ifdef STEEPEST_EDGE_DEBUG settings.log.printf("Steepest edge norm %e for entering j %d at i %d\n", steepest_edge_norms[entering_index], @@ -2408,6 +2463,7 @@ class phase2_timers_t { se_norms_time(0), se_entering_time(0), lu_update_time(0), + lu_factorization_time(0), perturb_time(0), vector_time(0), objective_time(0), @@ -2431,8 +2487,9 @@ class phase2_timers_t { { if (!record_time) { return; } const f_t total_time = bfrt_time + pricing_time + btran_time + ftran_time + flip_time + - delta_z_time + lu_update_time + se_norms_time + se_entering_time + - perturb_time + vector_time + objective_time + update_infeasibility_time; + delta_z_time + lu_update_time + lu_factorization_time + se_norms_time + + se_entering_time + perturb_time + vector_time + objective_time + + update_infeasibility_time; // clang-format off settings.log.printf("BFRT time %.2fs %4.1f%\n", bfrt_time, 100.0 * bfrt_time / total_time); settings.log.printf("Pricing time %.2fs %4.1f%\n", pricing_time, 100.0 * pricing_time / total_time); @@ -2441,6 +2498,7 @@ class phase2_timers_t { settings.log.printf("Flip time %.2fs %4.1f%\n", flip_time, 100.0 * flip_time / total_time); settings.log.printf("Delta_z time %.2fs %4.1f%\n", delta_z_time, 100.0 * delta_z_time / total_time); settings.log.printf("LU update time %.2fs %4.1f%\n", lu_update_time, 100.0 * lu_update_time / total_time); + settings.log.printf("LU factor time %.2fs %4.1f%\n", lu_factorization_time, 100.0 * lu_factorization_time / total_time); settings.log.printf("SE norms time %.2fs %4.1f%\n", se_norms_time, 100.0 * se_norms_time / total_time); settings.log.printf("SE enter time %.2fs %4.1f%\n", se_entering_time, 100.0 * se_entering_time / total_time); settings.log.printf("Perturb time %.2fs %4.1f%\n", perturb_time, 100.0 * perturb_time / total_time); @@ -2459,6 +2517,7 @@ class phase2_timers_t { f_t se_norms_time; f_t se_entering_time; f_t lu_update_time; + f_t lu_factorization_time; f_t perturb_time; f_t vector_time; f_t objective_time; @@ -2560,6 +2619,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2::bound_info(lp, settings); phase2_work_estimate += 2 * n; + f_t refactor_work = 0.0; + f_t solve_work = 0.0; + if (initialize_basis) { PHASE2_NVTX_RANGE("DualSimplex::init_basis"); std::vector superbasic_list; @@ -2572,8 +2634,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, assert(superbasic_list.size() == 0); assert(nonbasic_list.size() == n - m); - i_t refactor_status = ft.refactor_basis( + f_t refactor_start_work = ft.work_estimate(); + i_t refactor_status = ft.refactor_basis( lp.A, settings, lp.lower, lp.upper, start_time, basic_list, nonbasic_list, vstatus); + refactor_work = ft.work_estimate() - refactor_start_work; if (refactor_status == CONCURRENT_HALT_RETURN) { return dual::status_t::CONCURRENT_LIMIT; } if (refactor_status == TIME_LIMIT_RETURN) { return dual::status_t::TIME_LIMIT; } if (refactor_status > 0) { return dual::status_t::NUMERICAL; } @@ -2738,13 +2802,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0); #endif + csr_matrix_t Arow(1, 1, 0); + lp.A.to_compressed_row(Arow); + phase2_work_estimate += 2 * lp.A.col_start[lp.A.n]; if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return dual::status_t::CONCURRENT_LIMIT; } - csc_matrix_t A_transpose(1, 1, 0); - lp.A.transpose(A_transpose); - phase2_work_estimate += 2 * lp.A.col_start[lp.A.n]; + std::vector nonbasic_end(m); + std::vector row_mark; + row_mark.reserve(m); + compute_initial_nonbasic_end(basic_mark, Arow, nonbasic_end); + phase2_work_estimate += lp.A.col_start[lp.A.n]; if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return dual::status_t::CONCURRENT_LIMIT; @@ -2792,6 +2861,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, 0.0, toc(start_time)); } + i_t iterations_since_refactor = 0; while (iter < iter_limit) { PHASE2_NVTX_RANGE("DualSimplex::phase2_main_loop"); @@ -2914,11 +2984,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, timers.start_timer(); delta_y_sparse.clear(); UTsol_sparse.clear(); + f_t btran_start_work = ft.work_estimate(); { PHASE2_NVTX_RANGE("DualSimplex::btran"); phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse); } timers.btran_time += timers.stop_timer(); + solve_work += (ft.work_estimate() - btran_start_work); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return dual::status_t::CONCURRENT_LIMIT; } @@ -2953,11 +3026,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, PHASE2_NVTX_RANGE("DualSimplex::delta_z"); if (use_transpose) { sparse_delta_z++; - compute_delta_z(A_transpose, + compute_delta_z(Arow, delta_y_sparse, leaving_index, direction, - nonbasic_mark, + nonbasic_end, delta_z_mark, delta_z_indices, delta_z, @@ -3295,6 +3368,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, utilde_sparse.clear(); scaled_delta_xB_sparse.clear(); rhs_sparse.from_csc_column(lp.A, entering_index); + f_t ftran_start_work = ft.work_estimate(); { PHASE2_NVTX_RANGE("DualSimplex::ftran"); if (phase2::compute_delta_x(lp, @@ -3316,7 +3390,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, return dual::status_t::NUMERICAL; } } - + solve_work += (ft.work_estimate() - ftran_start_work); timers.ftran_time += timers.stop_timer(); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return dual::status_t::CONCURRENT_LIMIT; @@ -3330,6 +3404,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #endif timers.start_timer(); + f_t se_norms_start_work = ft.work_estimate(); const i_t steepest_edge_status = phase2::update_steepest_edge_norms(settings, basic_list, ft, @@ -3351,6 +3426,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #endif assert(steepest_edge_status == 0); timers.se_norms_time += timers.stop_timer(); + solve_work += (ft.work_estimate() - se_norms_start_work); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return dual::status_t::CONCURRENT_LIMIT; } @@ -3388,42 +3465,48 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 2); #endif - phase2::update_primal_infeasibilities(lp, - settings, - basic_list, - x, - entering_index, - leaving_index, - delta_xB_0_sparse.i, - squared_infeasibilities, - infeasibility_indices, - primal_infeasibility_squared, - phase2_work_estimate); + bool needs_cleanup = phase2::update_primal_infeasibilities(lp, + settings, + basic_list, + x, + entering_index, + leaving_index, + delta_xB_0_sparse.i, + squared_infeasibilities, + infeasibility_indices, + primal_infeasibility_squared, + phase2_work_estimate); // Update primal infeasibilities due to changes in basic variables // from the leaving and entering variables - phase2::update_primal_infeasibilities(lp, - settings, - basic_list, - x, - entering_index, - leaving_index, - scaled_delta_xB_sparse.i, - squared_infeasibilities, - infeasibility_indices, - primal_infeasibility_squared, - phase2_work_estimate); + needs_cleanup |= phase2::update_primal_infeasibilities(lp, + settings, + basic_list, + x, + entering_index, + leaving_index, + scaled_delta_xB_sparse.i, + squared_infeasibilities, + infeasibility_indices, + primal_infeasibility_squared, + phase2_work_estimate); // Update the entering variable - phase2::update_single_primal_infeasibility(lp.lower, - lp.upper, - x, - settings.primal_tol, - squared_infeasibilities, - infeasibility_indices, - entering_index, - primal_infeasibility_squared); - - phase2::clean_up_infeasibilities( - squared_infeasibilities, infeasibility_indices, phase2_work_estimate); + { + const f_t old_val = squared_infeasibilities[entering_index]; + phase2::update_single_primal_infeasibility(lp.lower, + lp.upper, + x, + settings.primal_tol, + squared_infeasibilities, + infeasibility_indices, + entering_index, + primal_infeasibility_squared); + needs_cleanup |= (old_val != 0.0 && squared_infeasibilities[entering_index] == 0.0); + } + + if (needs_cleanup) { + phase2::clean_up_infeasibilities( + squared_infeasibilities, infeasibility_indices, phase2_work_estimate); + } #if CHECK_PRIMAL_INFEASIBILITIES phase2::check_primal_infeasibilities( @@ -3455,6 +3538,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, basic_mark[leaving_index] = -1; basic_mark[entering_index] = basic_leaving_index; + update_Arow( + leaving_index, entering_index, lp.A, row_mark, nonbasic_end, Arow, phase2_work_estimate); + #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 5); #endif @@ -3463,22 +3549,35 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, // Refactor or update the basis factorization { PHASE2_NVTX_RANGE("DualSimplex::basis_update"); - bool should_refactor = ft.num_updates() > settings.refactor_frequency; + iterations_since_refactor++; + bool should_refactor = + (ft.num_updates() > 100 && solve_work > refactor_work) || (ft.num_updates() > 1000); + // settings.log.printf("Solve work %e refactor work %e iterations since refactor %d\n", + // solve_work, refactor_work, iterations_since_refactor); if (!should_refactor) { i_t recommend_refactor = ft.update(utilde_sparse, UTsol_sparse, basic_leaving_index); #ifdef CHECK_UPDATE phase2::check_update(lp, settings, ft, basic_list, basic_leaving_index); #endif should_refactor = recommend_refactor == 1; + timers.lu_update_time += timers.stop_timer(); + timers.start_timer(); } #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 6); #endif if (should_refactor) { + settings.log.printf( + "Refactoring basis. Iteration %d solve work %e refactor work %e updates %d\n", + iter, + solve_work, + refactor_work, + ft.num_updates()); PHASE2_NVTX_RANGE("DualSimplex::refactorization"); num_refactors++; - bool should_recompute_x = true; // Need for numerically difficult problems like cbs-cta + bool should_recompute_x = true; // Needed for numerically difficult problems like cbs-cta + f_t refactor_start_work = ft.work_estimate(); i_t refactor_status = ft.refactor_basis( lp.A, settings, lp.lower, lp.upper, start_time, basic_list, nonbasic_list, vstatus); if (refactor_status == CONCURRENT_HALT_RETURN) { return dual::status_t::CONCURRENT_LIMIT; } @@ -3511,9 +3610,11 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, settings.log.printf("Successfully repaired basis. Iteration %d\n", iter); } + refactor_work = ft.work_estimate() - refactor_start_work; phase2::reset_basis_mark( basic_list, nonbasic_list, basic_mark, nonbasic_mark, phase2_work_estimate); + compute_initial_nonbasic_end(basic_mark, Arow, nonbasic_end); if (should_recompute_x) { std::vector unperturbed_x(n); phase2_work_estimate += n; @@ -3537,16 +3638,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, infeasibility_indices, primal_infeasibility); phase2_work_estimate += 4 * m + 2 * n; + iterations_since_refactor = 0; + solve_work = 0.0; } #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7); #endif } - timers.lu_update_time += timers.stop_timer(); + timers.lu_factorization_time += timers.stop_timer(); timers.start_timer(); phase2::compute_steepest_edge_norm_entering( - settings, m, ft, basic_leaving_index, entering_index, delta_y_steepest_edge); + settings, m, ft, basic_leaving_index, entering_index, leaving_index, delta_y_steepest_edge); timers.se_entering_time += timers.stop_timer(); #ifdef STEEPEST_EDGE_DEBUG @@ -3676,15 +3779,19 @@ template void compute_reduced_cost_update(const lp_problem_t& delta_z, double& work_estimate); -template void compute_delta_z(const csc_matrix_t& A_transpose, +template void compute_delta_z(const csr_matrix_t& Arow, const sparse_vector_t& delta_y, int leaving_index, int direction, - const std::vector& nonbasic_mark, + const std::vector& nonbasic_end, std::vector& delta_z_mark, std::vector& delta_z_indices, std::vector& delta_z, double& work_estimate); + +template void compute_initial_nonbasic_end(const std::vector& basic_mark, + csr_matrix_t& Arow, + std::vector& nonbasic_end); #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/phase2.hpp b/cpp/src/dual_simplex/phase2.hpp index 5db797449..266d57c19 100644 --- a/cpp/src/dual_simplex/phase2.hpp +++ b/cpp/src/dual_simplex/phase2.hpp @@ -94,14 +94,19 @@ void compute_reduced_cost_update(const lp_problem_t& lp, f_t& work_estimate); template -void compute_delta_z(const csc_matrix_t& A_transpose, +void compute_delta_z(const csr_matrix_t& Arow, const sparse_vector_t& delta_y, i_t leaving_index, i_t direction, - const std::vector& nonbasic_mark, + const std::vector& nonbasic_end, std::vector& delta_z_mark, std::vector& delta_z_indices, std::vector& delta_z, f_t& work_estimate); +template +void compute_initial_nonbasic_end(const std::vector& basic_mark, + csr_matrix_t& Arow, + std::vector& nonbasic_end); + } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp index 5cb0185c8..3513ff7dc 100644 --- a/cpp/src/dual_simplex/right_looking_lu.cpp +++ b/cpp/src/dual_simplex/right_looking_lu.cpp @@ -33,6 +33,835 @@ struct element_t { }; // 24 bytes constexpr int kNone = -1; +template +class nonzero_counts_t { + public: + nonzero_counts_t(const std::vector& deg, i_t m) + : m_(m), work_estimate_(0), deg_(deg), counts_(m + 1), pos_(deg.size()) + { + const i_t n = deg_.size(); + for (i_t k = 0; k < n; ++k) { + assert(deg_[k] <= m && deg_[k] >= 0); + const i_t nz = deg_[k]; + pos_[k] = counts_[nz].size(); + counts_[nz].push_back(k); + } + work_estimate_ += 4*n; + } + + i_t get_count(i_t k) const + { + return deg_[k]; + } + + void update_count(i_t k, i_t new_nz) + { + const i_t old_nz = deg_[k]; + update_count(k, old_nz, new_nz); + } + + const std::vector& get_elements_with_count(i_t nz) const + { + return counts_[nz]; + } + + // Remove k from its current bucket without re-inserting. + // Sets deg_[k] to -1 to mark it as removed. + void remove_from_count(i_t k) + { + const i_t old_nz = deg_[k]; + const i_t p = pos_[k]; + const i_t other = counts_[old_nz].back(); + counts_[old_nz][p] = other; + pos_[other] = p; + counts_[old_nz].pop_back(); + deg_[k] = -1; + work_estimate_ += 6; + } + + f_t record_and_clear_work_estimate_() { + f_t tmp = work_estimate_; + work_estimate_ = 0; + return tmp; + } + + private: + + void update_count(i_t k, i_t old_nz, i_t new_nz) + { + const i_t p = pos_[k]; + const i_t other = counts_[old_nz].back(); + counts_[old_nz][p] = other; + pos_[other] = p; + counts_[old_nz].pop_back(); + deg_[k] = new_nz; + pos_[k] = counts_[new_nz].size(); + counts_[new_nz].push_back(k); + work_estimate_ += 11; + } + + i_t m_; + f_t work_estimate_; + std::vector deg_; + std::vector> counts_; + std::vector pos_; +}; + +// Represents the sparse trailing matrix Atlide = A - l u^T of a sparse LU factorization +// We need to be able to access the nonzeros in this matrix by both row and column. +// Thus, we do not compress the storage. +template +class trailing_matrix_t { + public: + trailing_matrix_t(const csc_matrix_t& A, + const std::vector& column_list) + : m_(A.m), + n_(column_list.size()), + Bnz_(0), + work_estimate_(0), + col_start_(n_), + col_end_(n_), + col_max_(n_), + row_start_(m_), + row_end_(m_), + row_max_(m_), + max_in_column_(n_), + pivot_row_val_(n_, 0.0), + pivot_col_val_(m_, 0.0), + pivot_col_mark_(m_, 0), + row_mark_(m_, kNone), + col_mark_(n_, kNone), + col_counts_(compute_column_degree(A, column_list), m_), + row_counts_(compute_row_degree(A, column_list, Bnz_), n_), + unused_col_nz_(0), + unused_row_nz_(0), + col_hits_(0), + col_miss_(0), + row_hits_(0), + row_miss_(0), + col_realloc_hist_(std::max(m_, static_cast(n_)) + 1, 0), + row_realloc_hist_(std::max(m_, static_cast(n_)) + 1, 0) + { + + work_estimate_ += 4*m_ + 2*n_ + col_realloc_hist_.size() + row_realloc_hist_.size(); + + // Allocate 2x initial size per column/row to reduce early relocations + i_t col_nz = 2 * Bnz_; + i_t row_nz = 2 * Bnz_; + + c_i_.resize(col_nz); + c_x_.resize(col_nz); + r_j_.resize(row_nz); + + + i_t nz = 0; + for (i_t i = 0; i < m_; i++) { + row_start_[i] = nz; + row_end_[i] = nz; // Temporary value used for initializing r_j_. Will be updated in loop + i_t row_space = 2 * row_counts_.get_count(i); + row_max_[i] = nz + row_space; + nz += row_space; + } + assert(nz == row_nz); + work_estimate_ += 4 * m_; + + nz = 0; + for (size_t k = 0; k < column_list.size(); k++) { + const i_t j = column_list[k]; + const i_t A_start = A.col_start[j]; + const i_t A_end = A.col_start[j + 1]; + const i_t len = A_end - A_start; + i_t col_space = 2 * len; + col_max_[k] = nz + col_space; + col_start_[k] = nz; + col_end_[k] = nz + len; + for (i_t p = A_start; p < A_end; p++) { + const i_t row = A.i[p]; + const f_t val = A.x[p]; + c_i_[nz] = row; + c_x_[nz] = val; + nz++; + r_j_[row_end_[row]] = k; + row_end_[row]++; + } + nz += col_space - len; // Remaining capacity for this column + } + assert(nz == col_nz); + work_estimate_ += 7 * n_ + 7 * Bnz_; + + for (i_t j = 0; j < n_; j++) { + f_t max_in_col = 0.0; + const i_t c_start = col_start_[j]; + const i_t c_end = col_end_[j]; + for (i_t p = c_start; p < c_end; p++) { + const f_t val = std::abs(c_x_[p]); + if (val > max_in_col) { + max_in_col = val; + } + } + max_in_column_[j] = max_in_col; + } + work_estimate_ += Bnz_ + 3*n_; + } + + f_t record_and_clear_work_estimate_() + { + const f_t row_work_estimate = row_counts_.record_and_clear_work_estimate_(); + const f_t col_work_estimate = col_counts_.record_and_clear_work_estimate_(); + work_estimate_ += row_work_estimate + col_work_estimate; + f_t tmp = work_estimate_; + work_estimate_ = 0; + return tmp; + } + + i_t markowitz_search(f_t pivot_tol, f_t threshold_tol, i_t& pivot_i, i_t& pivot_j, f_t &pivot_val) { + f_t markowitz = static_cast(m_) * static_cast(n_); // Upper bound on markowitz criteria + i_t nz = 1; + i_t nsearch = 0; + constexpr bool verbose = false; + i_t nz_max = std::min(m_, n_); + while (nz <= nz_max) { + i_t markowitz_lower_bound = (nz - 1) * (nz - 1); + // Search columns of length nz + i_t nsearch_start = nsearch; + for (const i_t j : col_counts_.get_elements_with_count(nz)) { + assert(col_counts_.get_count(j) == nz); + const f_t max_in_col = max_in_column_[j]; + const i_t c_start = col_start_[j]; + const i_t c_end = col_end_[j]; + for (i_t p = c_start; p < c_end; p++) { + const i_t i = c_i_[p]; + const f_t val = c_x_[p]; + const i_t rdeg = row_counts_.get_count(i); + assert(rdeg >= 0); + const i_t Mij = (rdeg - 1) * (nz - 1); + if (Mij < markowitz && std::abs(val) >= threshold_tol * max_in_col && + std::abs(val) >= pivot_tol) { + markowitz = Mij; + pivot_i = i; + pivot_j = j; + pivot_val = val; + if (markowitz <= markowitz_lower_bound) { break; } + } + } + work_estimate_ += 3 * (c_end - c_start); + nsearch++; + if (markowitz <= markowitz_lower_bound) { break; } + } + work_estimate_ += 4 * (nsearch - nsearch_start); + if (markowitz <= markowitz_lower_bound) { break; } + + markowitz_lower_bound = (nz - 1) * nz; + + // Search rows of length nz + assert(row_counts_.get_elements_with_count(nz).size() >= 0); + nsearch_start = nsearch; + for (const i_t i : row_counts_.get_elements_with_count(nz)) { + const i_t rdeg = row_counts_.get_count(i); + assert(rdeg == nz); + const i_t r_start = row_start_[i]; + const i_t r_end = row_end_[i]; + for (i_t p = r_start; p < r_end; p++) { + const i_t j = r_j_[p]; + // Look up the value from the column copy of j + f_t val = 0; + const i_t c_start = col_start_[j]; + const i_t c_end = col_end_[j]; + for (i_t q = c_start; q < c_end; q++) { + if (c_i_[q] == i) { val = c_x_[q]; break; } + } + work_estimate_ += 2 * (c_end - c_start); + const f_t max_in_col = max_in_column_[j]; + const i_t cdeg = col_counts_.get_count(j); + assert(cdeg >= 0); + const i_t Mij = (nz - 1) * (cdeg - 1); + if (Mij < markowitz && std::abs(val) >= threshold_tol * max_in_col && + std::abs(val) >= pivot_tol) { + markowitz = Mij; + pivot_i = i; + pivot_j = j; + pivot_val = val; + if (markowitz <= markowitz_lower_bound) { break; } + } + } + work_estimate_ += 5 * (r_end - r_start); + nsearch++; + if (markowitz <= markowitz_lower_bound) { break; } + } + work_estimate_ += 4 * (nsearch - nsearch_start); + if (pivot_i != -1 && nz >= 2) { break; } + nz++; + } + if (nsearch > 10) { + if constexpr (verbose) { printf("nsearch %d\n", nsearch); } + } + return nsearch; + } + + + void update_for_pivot_removal(i_t pivot_i, i_t pivot_j) + { + // Iterate over the pivot row: decrement column degrees. + // Skip the pivot column itself — it is being eliminated, not just decremented. + const i_t r_start = row_start_[pivot_i]; + const i_t r_end = row_end_[pivot_i]; + for (i_t p = r_start; p < r_end; p++) { + const i_t j = r_j_[p]; + const i_t cdeg = col_counts_.get_count(j); + if (j != pivot_j) { + col_counts_.update_count(j, cdeg - 1); + } else { + col_counts_.remove_from_count(j); + } + } + work_estimate_ += 2 * (r_end - r_start); + + // Iterate over the pivot column: decrement row degrees. + // Skip the pivot row itself — it is being eliminated, not just decremented. + const i_t c_start = col_start_[pivot_j]; + const i_t c_end = col_end_[pivot_j]; + for (i_t p = c_start; p < c_end; p++) { + const i_t i = c_i_[p]; + const i_t rdeg = row_counts_.get_count(i); + if (i != pivot_i) { + row_counts_.update_count(i, rdeg - 1); + } else { + row_counts_.remove_from_count(i); + } + } + work_estimate_ += 2 * (c_end - c_start); + } + + void schur_complement(i_t pivot_i, + i_t pivot_j, + f_t drop_tol, + f_t pivot_val) + { + // Step 1: Cache the pivot column into dense workspaces. + // pivot_col_val_[i] = l_i = a(i, pivot_j) / pivot_val for each row i != pivot_i + // pivot_col_mark_[i] = 1 if row i is in the pivot column + // pivot_col_index_[] = sparse list of such row indices + i_t pivot_col_count = 0; + const i_t c_pivot_start = col_start_[pivot_j]; + const i_t c_pivot_end = col_end_[pivot_j]; + for (i_t p = c_pivot_start; p < c_pivot_end; p++) { + const i_t i = c_i_[p]; + if (i == pivot_i) { continue; } + const f_t li = c_x_[p] / pivot_val; + pivot_col_val_[i] = li; + pivot_col_mark_[i] = 1; + pivot_col_index_.push_back(i); + pivot_col_count++; + } + work_estimate_ += 5 * (c_pivot_end - c_pivot_start); + + // Step 2: For each column j in the pivot row, update existing entries and insert fill. + const i_t r_pivot_start = row_start_[pivot_i]; + const i_t r_pivot_end = row_end_[pivot_i]; + for (i_t p0 = r_pivot_start; p0 < r_pivot_end; p0++) { + const i_t j = r_j_[p0]; + if (j == pivot_j) { continue; } + const f_t uj = pivot_row_val_[j]; + + // Step 2a: Scan column j, update existing entries, and count fill-in. + // For each entry (i, j) that also appears in the pivot column, update it. + // Simultaneously, unmark pivot_col_mark_[i] for matched entries, so that + // after the scan, the still-marked entries are the fill-ins. + i_t n_fillin = pivot_col_count; + i_t n_cancel = 0; + const i_t c_start = col_start_[j]; + const i_t c_end = col_end_[j]; + for (i_t q = c_start; q < c_end; q++) { + const i_t i = c_i_[q]; + if (pivot_col_mark_[i]) { + pivot_col_mark_[i] = 0; + n_fillin--; + const f_t val = pivot_col_val_[i] * uj; + + c_x_[q] -= val; + const f_t abs_updated = std::abs(c_x_[q]); + if (abs_updated > max_in_column_[j]) { max_in_column_[j] = abs_updated; } + if (abs_updated < drop_tol) { + c_x_[q] = 0; + n_cancel++; + // TODO: does max_in_column_ need to be updated in this case? + } + } + } + work_estimate_ += 2*(c_end - c_start) + 6*(pivot_col_count - n_fillin); + + + // Step 2b: Remove cancellations (entries that became zero). + if (n_cancel > 0) { + i_t new_end = col_start_[j]; + for (i_t q = col_start_[j]; q < col_end_[j]; q++) { + if (c_x_[q] != 0) { + c_i_[new_end] = c_i_[q]; + c_x_[new_end] = c_x_[q]; + new_end++; + } else { + const i_t dead_row = c_i_[q]; + // Remove this entry from the row copy as well + const i_t r_start = row_start_[dead_row]; + for (i_t rp = r_start; rp < row_end_[dead_row]; rp++) { + if (r_j_[rp] == j) { + r_j_[rp] = r_j_[row_end_[dead_row] - 1]; + row_end_[dead_row]--; + break; + } + } + work_estimate_ += 2*(row_end_[dead_row] - r_start) + 4; + // Update row degree + const i_t rdeg = row_counts_.get_count(dead_row); + row_counts_.update_count(dead_row, rdeg - 1); + + } + } + work_estimate_ += 2*(new_end - col_start_[j]); + i_t old_count = col_end_[j] - col_start_[j]; + col_end_[j] = new_end; + i_t new_count = new_end - col_start_[j]; + // Update column degree for cancellations + if (new_count != old_count) { + col_counts_.update_count(j, new_count); + } + } + + + // Step 2c: Insert fill-in entries. We know exactly how many there are. + if (n_fillin > 0) { + // Ensure column j has enough space for all fill-ins at once. + // After this, col_start_[j] is stable — no further relocation needed. + ensure_col_space(j, n_fillin); + + // Insert fill into column j and row copies. + for (i_t k = 0; k < pivot_col_count; k++) { + const i_t i = pivot_col_index_[k]; + if (pivot_col_mark_[i]) { + const f_t val = pivot_col_val_[i] * uj; + const f_t abs_val = std::abs(val); + if (abs_val < drop_tol) { + // Skip this fill-in but still need to unmark + continue; + } + + // Insert into column copy (space is guaranteed) + c_i_[col_end_[j]] = i; + c_x_[col_end_[j]] = -val; + col_end_[j]++; + if (abs_val > max_in_column_[j]) { max_in_column_[j] = abs_val; } + + // Insert into row copy + ensure_row_space(i, 1); + r_j_[row_end_[i]] = j; + row_end_[i]++; + + // Update row degree + const i_t rdeg = row_counts_.get_count(i); + row_counts_.update_count(i, rdeg + 1); + work_estimate_ += 10; + } + } + } + + // Step 2d: Update column degree bucket once for this column. + { + i_t new_cdeg = col_end_[j] - col_start_[j]; + if (new_cdeg != col_counts_.get_count(j)) { + col_counts_.update_count(j, new_cdeg); + } + } + + // Step 2e: Reset all pivot column marks back to 1 for the next column. + // Some marks were cleared to 0 during the scan of column j (matched entries). + // We restore them by iterating the pivot column index list. So that we are + // prepared to process the next column. + for (i_t k = 0; k < pivot_col_count; k++) { + pivot_col_mark_[pivot_col_index_[k]] = 1; + } + work_estimate_ += 2*pivot_col_count; + } + + // Step 3: Clear the pivot column workspaces. + for (i_t k = 0; k < pivot_col_count; k++) { + const i_t i = pivot_col_index_[k]; + pivot_col_val_[i] = 0; + pivot_col_mark_[i] = 0; + } + work_estimate_ += 2*pivot_col_count; + pivot_col_index_.clear(); + } + + // Populate the dense pivot_row_val_ workspace by scanning column representation + // for each column j that appears in the pivot row. + // Must be called before extract_row() and schur_complement(). + // Cleared by remove_pivot_row_and_column(). + void cache_pivot_row(i_t pivot_i) + { + const i_t r_start = row_start_[pivot_i]; + const i_t r_end = row_end_[pivot_i]; + for (i_t p = r_start; p < r_end; p++) { + const i_t j = r_j_[p]; + const i_t c_start = col_start_[j]; + const i_t c_end = col_end_[j]; + i_t q; + for (q = c_start; q < c_end; q++) { + if (c_i_[q] == pivot_i) { + pivot_row_val_[j] = c_x_[q]; + break; + } + } + work_estimate_ += 2 *(q - c_start); + } + work_estimate_ += 3 * (r_end - r_start) + 2; + } + + void remove_pivot_row_and_column(i_t pivot_i, i_t pivot_j) + { + // Iterate over the pivot row + const i_t r_pivot_start = row_start_[pivot_i]; + const i_t r_pivot_end = row_end_[pivot_i]; + for (i_t p = r_pivot_start; p < r_pivot_end; p++) { + const i_t j = r_j_[p]; + // Clear the cached pivot row value for this column + pivot_row_val_[j] = 0; + // Remove pivot_i from each column j in the pivot row + f_t max_in_col = 0.0; + + const i_t prev_col_end = col_end_[j]; + for (i_t q = col_start_[j]; q < col_end_[j]; q++) { + const i_t i = c_i_[q]; + if (i == pivot_i) { + // Swap with the last element in the column + i_t other_i = c_i_[col_end_[j] - 1]; + f_t other_x = c_x_[col_end_[j] - 1]; + c_i_[q] = other_i; + c_x_[q] = other_x; + // Update col_end_[j] + col_end_[j]--; + q--; + continue; + } else { + const f_t val = std::abs(c_x_[q]); + if (val > max_in_col) { + max_in_col = val; + } + } + } + work_estimate_ += 3*(prev_col_end - col_start_[j]) + 7; + max_in_column_[j] = max_in_col; + } + work_estimate_ += 4*(r_pivot_end - r_pivot_start); + + + // Iterate over the pivot column + const i_t c_start = col_start_[pivot_j]; + const i_t c_end = col_end_[pivot_j]; + for (i_t p = c_start; p < c_end; p++) { + const i_t i = c_i_[p]; + // Remove pivot_j from each row i in the pivot column + + i_t q; + for (q = row_start_[i]; q < row_end_[i]; q++) { + const i_t j = r_j_[q]; + if (j == pivot_j) { + // Swap with the last element in the row + r_j_[q] = r_j_[row_end_[i] - 1]; + // Update row_end_[i] + row_end_[i]--; + break; + } + } + work_estimate_ += 2*(q - row_start_[i]) + 4; + } + work_estimate_ += 4*(c_end - c_start); + + // Mark pivot column and pivot row as empty so garbage collection skips them + col_end_[pivot_j] = col_start_[pivot_j]; + row_end_[pivot_i] = row_start_[pivot_i]; + } + + void extract_row(i_t pivot_i, i_t pivot_j, csr_matrix_t& Urow, i_t& Unz) + { + // U(k, :) + const i_t r_pivot_start = row_start_[pivot_i]; + const i_t r_pivot_end = row_end_[pivot_i]; + for (i_t p = r_pivot_start; p < r_pivot_end; p++) { + const i_t j = r_j_[p]; + if (j != pivot_j) { + Urow.j.push_back(j); + Urow.x.push_back(pivot_row_val_[j]); + Unz++; + } + } + work_estimate_ += 3 * (r_pivot_end - r_pivot_start); + } + + void extract_column(i_t pivot_i, i_t pivot_j, f_t pivot_val, csc_matrix_t& L, i_t& Lnz) + { + // L(:, k) + const i_t c_pivot_start = col_start_[pivot_j]; + const i_t c_pivot_end = col_end_[pivot_j]; + for (i_t p = c_pivot_start; p < c_pivot_end; p++) { + const i_t i = c_i_[p]; + if (i != pivot_i) { + L.i.push_back(i); + const f_t l_val = c_x_[p] / pivot_val; + L.x.push_back(l_val); + Lnz++; + } + } + work_estimate_ += 4 * (c_pivot_end - c_pivot_start); + } + + void garbage_collect(f_t max_unused_fraction = 0.90) + { + if (unused_col_nz_ > max_unused_fraction * static_cast(c_i_.size())) { + printf("Garbage collected column %e\n", unused_col_nz_ / static_cast(c_i_.size())); + std::vector new_c_i; + std::vector new_c_x; + new_c_i.reserve(c_i_.size() - unused_col_nz_); + new_c_x.reserve(c_x_.size() - unused_col_nz_); + for (i_t j = 0; j < n_; j++) { + const i_t new_start = static_cast(new_c_i.size()); + const i_t c_start = col_start_[j]; + const i_t c_end = col_end_[j]; + const i_t col_size = c_end - c_start; + for (i_t p = c_start; p < c_end; p++) { + new_c_i.push_back(c_i_[p]); + new_c_x.push_back(c_x_[p]); + } + col_start_[j] = new_start; + col_end_[j] = static_cast(new_c_i.size()); + // Reserve space equal to current size (doubling strategy) + for (i_t s = 0; s < col_size; s++) { + new_c_i.push_back(kNone); + new_c_x.push_back(0.0); + } + work_estimate_ += 4*col_size; + col_max_[j] = static_cast(new_c_i.size()); + } + work_estimate_ += 6*n_; + c_i_ = std::move(new_c_i); + c_x_ = std::move(new_c_x); + + unused_col_nz_ = 0; + } + + if (unused_row_nz_ > max_unused_fraction * static_cast(r_j_.size())) { + printf("Garbage collected row %e\n", unused_row_nz_ / static_cast(r_j_.size())); + std::vector new_r_j; + new_r_j.reserve(r_j_.size() - unused_row_nz_); + for (i_t i = 0; i < m_; i++) { + const i_t new_start = static_cast(new_r_j.size()); + const i_t r_start = row_start_[i]; + const i_t r_end = row_end_[i]; + const i_t row_size = r_end - r_start; + for (i_t p = r_start; p < r_end; p++) { + new_r_j.push_back(r_j_[p]); + } + row_start_[i] = new_start; + row_end_[i] = static_cast(new_r_j.size()); + // Reserve space equal to current size (doubling strategy) + for (i_t s = 0; s < row_size; s++) { + new_r_j.push_back(kNone); + } + row_max_[i] = static_cast(new_r_j.size()); + work_estimate_ += 2*row_size; + } + work_estimate_ += 6*m_; + r_j_ = std::move(new_r_j); + unused_row_nz_ = 0; + } + } + + void print_stats() + { +#if 0 + printf("Column hits: %.1f%%, Column misses: %.1f%%, Row hits: %.1f%%, Row misses: %.1f%%\n", + 100.0 * static_cast(col_hits_) / static_cast(col_hits_ + col_miss_), + 100.0 * static_cast(col_miss_) / static_cast(col_hits_ + col_miss_), + 100.0 * static_cast(row_hits_) / static_cast(row_hits_ + row_miss_), + 100.0 * static_cast(row_miss_) / static_cast(row_hits_ + row_miss_)); + + printf("Column reallocation histogram (shortfall -> count):\n"); + for (size_t k = 0; k < col_realloc_hist_.size(); k++) { + if (col_realloc_hist_[k] > 0) { + printf(" %4zu: %d\n", k, col_realloc_hist_[k]); + } + } + + printf("Row reallocation histogram (shortfall -> count):\n"); + for (size_t k = 0; k < row_realloc_hist_.size(); k++) { + if (row_realloc_hist_[k] > 0) { + printf(" %4zu: %d\n", k, row_realloc_hist_[k]); + } + } + + f_t ci_mb = static_cast(c_i_.size() * sizeof(i_t)) / (1024.0 * 1024.0); + f_t cx_mb = static_cast(c_x_.size() * sizeof(f_t)) / (1024.0 * 1024.0); + f_t rj_mb = static_cast(r_j_.size() * sizeof(i_t)) / (1024.0 * 1024.0); + printf("Memory: c_i_ = %.2f MB, c_x_ = %.2f MB, r_j_ = %.2f MB, total = %.2f MB\n", + ci_mb, cx_mb, rj_mb, ci_mb + cx_mb + rj_mb); +#endif + } + + private: + + // Ensure column j has space for at least `needed` additional entries. + // If not, relocate the column to the end of c_i_/c_x_ with enough space. + // Returns true if the column was relocated (invalidating any cached positions). + bool ensure_col_space(i_t j, i_t needed) + { + if (col_end_[j] + needed <= col_max_[j]) { + col_hits_++; + return false; + } + col_miss_++; + i_t shortfall = needed - (col_max_[j] - col_end_[j]); + col_realloc_hist_[shortfall]++; + // Relocate column j to the end of c_i_/c_x_ + const i_t c_start = col_start_[j]; + const i_t c_end = col_end_[j]; + i_t current_size = c_end - c_start; + unused_col_nz_ += current_size; + i_t new_start = c_i_.size(); + for (i_t p = c_start; p < c_end; p++) { + c_i_.push_back(c_i_[p]); + c_x_.push_back(c_x_[p]); + } + work_estimate_ += 2*(c_end - c_start); + col_start_[j] = new_start; + col_end_[j] = c_i_.size(); + // Reserve space using doubling strategy to reduce future relocations + i_t extra = std::max(current_size, needed); + for (i_t k = 0; k < extra; k++) { + c_i_.push_back(kNone); + c_x_.push_back(0.0); + } + work_estimate_ += 2*extra; + col_max_[j] = c_i_.size(); + work_estimate_ += 10; + return true; + } + + // Ensure row i has space for at least `needed` additional entries. + // If not, relocate the row to the end of r_j_ with enough space. + void ensure_row_space(i_t i, i_t needed) + { + if (row_end_[i] + needed <= row_max_[i]) { + row_hits_++; + return; + } + row_miss_++; + i_t shortfall = needed - (row_max_[i] - row_end_[i]); + row_realloc_hist_[shortfall]++; + // Relocate row i to the end of r_j_ + const i_t r_start = row_start_[i]; + const i_t r_end = row_end_[i]; + i_t current_size = r_end - r_start; + unused_row_nz_ += current_size; + i_t new_start = r_j_.size(); + for (i_t p = r_start; p < r_end; p++) { + r_j_.push_back(r_j_[p]); + } + work_estimate_ += (r_end - r_start); + row_start_[i] = new_start; + row_end_[i] = r_j_.size(); + // Reserve space using doubling strategy to reduce future relocations + i_t extra = std::max(current_size, needed); + for (i_t k = 0; k < extra; k++) { + r_j_.push_back(kNone); + } + work_estimate_ += extra; + row_max_[i] = r_j_.size(); + work_estimate_ += 9; + } + + std::vector compute_column_degree(const csc_matrix_t& A, const std::vector& column_list) + { + const i_t n = column_list.size(); + std::vector Cdegree(n); + for (i_t k = 0; k < n; k++) { + const i_t j = column_list[k]; + const i_t A_start = A.col_start[j]; + const i_t A_end = A.col_start[j + 1]; + Cdegree[k] = A_end - A_start; + } + work_estimate_ += 4 * n; + return Cdegree; + } + + std::vector compute_row_degree(const csc_matrix_t& A, const std::vector& column_list, i_t& Bnz) + { + std::vector Rdegree(A.m, 0); + Bnz = 0; + const i_t n = column_list.size(); + for (i_t k = 0; k < n; k++) { + const i_t j = column_list[k]; + const i_t col_start = A.col_start[j]; + const i_t col_end = A.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + Rdegree[A.i[p]]++; + Bnz++; + } + } + work_estimate_ += 3 * n + 2 * Bnz; + return Rdegree; + } + + i_t m_; + i_t n_; + i_t Bnz_; + f_t work_estimate_; + + // The representation of the matrix by column + std::vector col_start_; + std::vector col_end_; + std::vector col_max_; + + std::vector c_i_; // row indices (indexed by col_start_[j] to col_end_[j]) + std::vector c_x_; // coefficients (indexed by col_start_[j] to col_end_[j]) + + + // The representation of the matrix by row (index only, no values) + std::vector row_start_; + std::vector row_end_; + std::vector row_max_; + + std::vector r_j_; // column indices (indexed by row_start_[i] to row_end_[i]) + + + + std::vector max_in_column_; // max_in_column_[j] is absolute value of the maximum coefficient in column j + + std::vector pivot_row_val_; // dense workspace of size n_; caches pivot row values + + std::vector pivot_col_val_; // dense workspace of size m_; caches L multipliers for pivot column + std::vector pivot_col_mark_; // dense workspace of size m_; 1 if row i is in the pivot column + std::vector pivot_col_index_; // sparse list of row indices in the pivot column (excl. pivot_i) + + std::vector row_mark_; + std::vector col_mark_; + + + nonzero_counts_t col_counts_; + nonzero_counts_t row_counts_; + + + i_t unused_col_nz_; + i_t unused_row_nz_; + + + i_t col_hits_; + i_t col_miss_; + i_t row_hits_; + i_t row_miss_; + + std::vector col_realloc_hist_; // col_realloc_hist_[k] = number of column relocations with shortfall k + std::vector row_realloc_hist_; // row_realloc_hist_[k] = number of row relocations with shortfall k +}; + template i_t initialize_degree_data(const csc_matrix_t& A, const std::vector& column_list, @@ -641,6 +1470,206 @@ void remove_pivot_col(i_t pivot_i, } } // namespace +template +i_t right_looking_lu2(const csc_matrix_t& A, + const simplex_solver_settings_t& settings, + f_t tol, + const std::vector& column_list, + f_t start_time, + std::vector& q, + csc_matrix_t& L, + csc_matrix_t& U, + std::vector& pinv, + f_t& work_estimate) +{ + raft::common::nvtx::range scope("LU::right_looking_lu"); + const i_t n = column_list.size(); + const i_t m = A.m; + + assert(A.m == n); + assert(L.n == n); + assert(L.m == n); + assert(U.n == n); + assert(U.m == n); + assert(q.size() == n); + assert(pinv.size() == n); + + + trailing_matrix_t trailing_matrix(A, column_list); + + csr_matrix_t Urow(n, n, 0); // We will store U by rows in Urow during the factorization + // and translate back to U at the end + Urow.n = Urow.m = n; + Urow.row_start.resize(n + 1, -1); + i_t Unz = 0; + work_estimate += 2 * n; + + i_t Lnz = 0; + L.x.clear(); + L.i.clear(); + + std::fill(q.begin(), q.end(), -1); + std::fill(pinv.begin(), pinv.end(), -1); + std::vector qinv(n); + std::fill(qinv.begin(), qinv.end(), -1); + work_estimate += 4 * n; + + work_estimate += trailing_matrix.record_and_clear_work_estimate_(); + + i_t pivots = 0; + for (i_t k = 0; k < n; ++k) { + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return CONCURRENT_HALT_RETURN; + } + if (toc(start_time) > settings.time_limit) { return TIME_LIMIT_RETURN; } + // Find pivot that satisfies + // abs(pivot) >= abstol, + // abs(pivot) >= threshold_tol * max abs[pivot column] + i_t pivot_i = -1; + i_t pivot_j = -1; + f_t pivot_val = std::numeric_limits::quiet_NaN(); + constexpr f_t pivot_tol = 1e-11; + const f_t drop_tol = tol == 1.0 ? 0.0 : 1e-13; + const f_t threshold_tol = tol; + + trailing_matrix.markowitz_search(pivot_tol, threshold_tol, pivot_i, pivot_j, pivot_val); + + if (pivot_i == -1 || pivot_j == -1) { break; } + assert(pivot_i != -1 && pivot_j != -1); + + // Pivot + pinv[pivot_i] = k; // pivot_i is the kth pivot row + q[k] = pivot_j; + qinv[pivot_j] = k; + assert(std::abs(pivot_val) >= pivot_tol); + pivots++; + + // Cache pivot row values from column copies into dense workspace + trailing_matrix.cache_pivot_row(pivot_i); + + // U <- [U; u^T] + Urow.row_start[k] = Unz; + // U(k, pivot_j) = pivot_val + Urow.j.push_back(pivot_j); + Urow.x.push_back(pivot_val); + Unz++; + // U(k, :) + trailing_matrix.extract_row(pivot_i, pivot_j, Urow, Unz); + work_estimate += 4 * (Unz - Urow.row_start[k]); + + // L <- [L l] + L.col_start[k] = Lnz; + // L(pivot_i, k) = 1 + L.i.push_back(pivot_i); + L.x.push_back(1.0); + Lnz++; + + // L(:, k) + trailing_matrix.extract_column(pivot_i, pivot_j, pivot_val, L, Lnz); + work_estimate += 4 * (Lnz - L.col_start[k]); + + + trailing_matrix.update_for_pivot_removal(pivot_i, pivot_j); + + // A22 <- A22 - l u^T + trailing_matrix.schur_complement(pivot_i, pivot_j, drop_tol, pivot_val); + + trailing_matrix.remove_pivot_row_and_column(pivot_i, pivot_j); + + trailing_matrix.garbage_collect(); + + work_estimate += trailing_matrix.record_and_clear_work_estimate_(); + + +#ifdef CHECK_MAX_IN_COLUMN + // Check that maximum in column is maintained + +#endif + + + + } + + trailing_matrix.print_stats(); + + // Check for rank deficiency + if (pivots < n) { + // Complete the permutation pinv + i_t start = pivots; + for (i_t i = 0; i < m; ++i) { + if (pinv[i] == -1) { pinv[i] = start++; } + } + work_estimate += m; + + // Finalize the permutation q. Do this by first completing the inverse permutation qinv. + // Then invert qinv to get the final permutation q. + start = pivots; + for (i_t j = 0; j < n; ++j) { + if (qinv[j] == -1) { qinv[j] = start++; } + } + work_estimate += n; + inverse_permutation(qinv, q); + work_estimate += 2 * n; + + return pivots; + } + + // Finalize L and Urow + L.col_start[n] = Lnz; + Urow.row_start[n] = Unz; + + // Fix row inidices of L for final pinv + for (i_t p = 0; p < Lnz; ++p) { + L.i[p] = pinv[L.i[p]]; + } + work_estimate += 3 * Lnz; + +#ifdef CHECK_LOWER_TRIANGULAR + for (i_t j = 0; j < n; ++j) { + const i_t col_start = L.col_start[j]; + const i_t col_end = L.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = L.i[p]; + if (i < j) { printf("Found L(%d, %d) not lower triangular!\n", i, j); } + assert(i >= j); + } + } +#endif + + csc_matrix_t U_unpermuted(n, n, 1); + work_estimate += n; + Urow.to_compressed_col( + U_unpermuted); // Convert Urow to U stored in compressed sparse column format + work_estimate += n + Unz; + std::vector row_perm(n); + work_estimate += n; + inverse_permutation(pinv, row_perm); + work_estimate += 2 * n; + + std::vector identity(n); + for (i_t k = 0; k < n; k++) { + identity[k] = k; + } + work_estimate += 2 * n; + + U_unpermuted.permute_rows_and_cols(identity, q, U); + work_estimate += 3 * U.n + 5 * Unz; + +#ifdef CHECK_UPPER_TRIANGULAR + for (i_t k = 0; k < n; ++k) { + const i_t j = k; + const i_t col_start = U.col_start[j]; + const i_t col_end = U.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + const i_t i = U.i[p]; + if (i > j) { printf("Found U(%d, %d) not upper triangluar\n", i, j); } + assert(i <= j); + } + } +#endif + + return n; +} template i_t right_looking_lu(const csc_matrix_t& A, @@ -1298,6 +2327,17 @@ template int right_looking_lu(const csc_matrix_t& A, std::vector& pinv, double& work_estimate); +template int right_looking_lu2(const csc_matrix_t& A, + const simplex_solver_settings_t& settings, + double tol, + const std::vector& column_list, + double start_time, + std::vector& q, + csc_matrix_t& L, + csc_matrix_t& U, + std::vector& pinv, + double& work_estimate); + template int right_looking_lu_row_permutation_only( const csc_matrix_t& A, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/right_looking_lu.hpp b/cpp/src/dual_simplex/right_looking_lu.hpp index 5f0bf570b..66f3747d6 100644 --- a/cpp/src/dual_simplex/right_looking_lu.hpp +++ b/cpp/src/dual_simplex/right_looking_lu.hpp @@ -26,6 +26,18 @@ i_t right_looking_lu(const csc_matrix_t& A, std::vector& pinv, f_t& work_estimate); +template +i_t right_looking_lu2(const csc_matrix_t& A, + const simplex_solver_settings_t& settings, + f_t tol, + const std::vector& column_list, + f_t start_time, + std::vector& q, + csc_matrix_t& L, + csc_matrix_t& U, + std::vector& pinv, + f_t& work_estimate); + template i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, const simplex_solver_settings_t& settings, diff --git a/cpp/src/pdlp/cpu_optimization_problem.cpp b/cpp/src/pdlp/cpu_optimization_problem.cpp index 406b0b654..6197260f0 100644 --- a/cpp/src/pdlp/cpu_optimization_problem.cpp +++ b/cpp/src/pdlp/cpu_optimization_problem.cpp @@ -724,7 +724,11 @@ void cpu_optimization_problem_t::write_to_mps(const std::string& mps_f for (size_t i = 0; i < var_types_char.size(); ++i) { var_types_char[i] = (variable_types_[i] == var_t::INTEGER) ? 'I' : 'C'; } - + } else if (get_n_variables() > 0) { + // Variable types not set (e.g. pure LP); default to all continuous + var_types_char.assign(get_n_variables(), 'C'); + } + if (!var_types_char.empty()) { data_model_view.set_variable_types(var_types_char.data(), var_types_char.size()); } diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu index 87ff9dab0..8ed630c00 100644 --- a/cpp/src/pdlp/optimization_problem.cu +++ b/cpp/src/pdlp/optimization_problem.cu @@ -798,11 +798,15 @@ void optimization_problem_t::write_to_mps(const std::string& mps_file_ std::vector variable_types; if (get_n_variables() != 0) { auto enum_variable_types = cuopt::host_copy(get_variable_types(), stream); - variable_types.resize(enum_variable_types.size()); - - // Convert enum types to char types - for (size_t i = 0; i < variable_types.size(); ++i) { - variable_types[i] = (enum_variable_types[i] == var_t::INTEGER) ? 'I' : 'C'; + if (enum_variable_types.empty()) { + // Variable types not set (e.g. pure LP); default to all continuous + variable_types.assign(get_n_variables(), 'C'); + } else { + variable_types.resize(enum_variable_types.size()); + // Convert enum types to char types + for (size_t i = 0; i < variable_types.size(); ++i) { + variable_types[i] = (enum_variable_types[i] == var_t::INTEGER) ? 'I' : 'C'; + } } data_model_view.set_variable_types(variable_types.data(), variable_types.size()); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index f0345368a..d2c5e3074 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1422,8 +1422,18 @@ optimization_problem_solution_t solve_lp( // handle default presolve if (settings.presolver == presolver_t::Default) { - settings.presolver = presolver_t::PSLP; - CUOPT_LOG_INFO("Using PSLP presolver"); + // Skip presolve for small problems where the fixed overhead (~20-30ms) + // exceeds the simplex solve time. Based on Netlib benchmarks, problems + // with fewer than 8000 nonzeros never benefit from PSLP presolve. + constexpr i_t presolve_nnz_threshold = 8000; + if (op_problem.get_nnz() >= presolve_nnz_threshold) { + settings.presolver = presolver_t::PSLP; + CUOPT_LOG_INFO("Using PSLP presolver"); + } else { + settings.presolver = presolver_t::None; + CUOPT_LOG_INFO("Skipping presolve for small problem (nnz=%d < %d)", + op_problem.get_nnz(), presolve_nnz_threshold); + } } [[maybe_unused]] double presolve_time = 0.0;