Skip to content

Commit c9fc08d

Browse files
committed
[Compat] Revert some unnecessary changes and remove debug logs and redundant synchronizations
1 parent cf25ea2 commit c9fc08d

11 files changed

Lines changed: 69 additions & 101 deletions

File tree

build.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ rm -rf dist
33
rm -rf deep_ep_cpp.cpython-38-x86_64-linux-gnu.so
44
export TORCH_CUDA_ARCH_LIST="10.0"
55
export PADDLE_CUDA_ARCH_LIST="10.0"
6-
export CUDA_HOME="/path/to/cuda"
7-
NVSHMEM_DIR=/path/to/nvshmem python setup_deep_ep.py bdist_wheel
8-
NVSHMEM_DIR=/path/to/nvshmem python setup_hybrid_ep.py bdist_wheel
6+
python setup_deep_ep.py bdist_wheel
7+
python setup_hybrid_ep.py bdist_wheel
98
pip install dist/*.whl --force-reinstall

csrc/deep_ep.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1251,9 +1251,6 @@ void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int
12511251
check_boundary(clean_meta_0.first, clean_meta_0.second * sizeof(int));
12521252
check_boundary(clean_meta_1.first, clean_meta_1.second * sizeof(int));
12531253

1254-
// internode_ll::clean_low_latency_buffer(clean_meta_0.first, clean_meta_0.second,
1255-
// clean_meta_1.first, clean_meta_1.second,
1256-
// calc_ctx->stream());
12571254
internode_ll::clean_low_latency_buffer(clean_meta_0.first, clean_meta_0.second,
12581255
clean_meta_1.first, clean_meta_1.second,
12591256
at::cuda::getCurrentCUDAStream());
@@ -1307,7 +1304,6 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
13071304

13081305
// Wait previous tasks to be finished
13091306
// NOTES: the hook mode will always use the default stream
1310-
// auto compute_stream = calc_ctx->stream();
13111307
auto compute_stream = at::cuda::getCurrentCUDAStream();
13121308
auto launch_stream = return_recv_hook ? compute_stream : comm_stream;
13131309
EP_HOST_ASSERT(not (async and return_recv_hook));
@@ -1462,7 +1458,6 @@ Buffer::low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_id
14621458

14631459
// Wait previous tasks to be finished
14641460
// NOTES: the hook mode will always use the default stream
1465-
// auto compute_stream = calc_ctx->stream();
14661461
auto compute_stream = at::cuda::getCurrentCUDAStream();
14671462
auto launch_stream = return_recv_hook ? compute_stream : comm_stream;
14681463
EP_HOST_ASSERT(not (async and return_recv_hook));
@@ -1778,7 +1773,7 @@ Buffer::dispatch_pcie(const torch::Tensor& x, const std::optional<torch::Tensor>
17781773
} else {
17791774
stream_wait(compute_stream, comm_stream);
17801775
}
1781-
1776+
17821777
if (allocate_on_comm_stream)
17831778
deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
17841779

csrc/hybrid_ep/executor/executor.cu

Lines changed: 15 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -25,34 +25,13 @@ torch::Tensor Executor::allgather_routing_map(
2525
torch::Tensor global_routing_map;
2626
// At inter-node case, we will use NCCL allgather
2727
if(config.num_of_nodes > 1 || !enable_custom_allgather) {
28-
// Create a list of independent tensors for paddle.distributed.all_gather
29-
// paddle.distributed.all_gather requires a list of tensors as output
30-
std::vector<torch::Tensor> tensor_vec;
31-
tensor_vec.reserve(group_size);
32-
py::list tensor_list;
33-
for (int i = 0; i < group_size; i++) {
34-
auto tensor = torch::empty(
35-
{num_of_tokens_per_rank, num_of_expert},
36-
torch::TensorOptions().dtype(torch::kBool).device(torch::kCUDA)
37-
);
38-
tensor_vec.push_back(tensor);
39-
tensor_list.append(tensor);
40-
}
41-
42-
// Call paddle.distributed.all_gather (sync_op=True for synchronous operation)
43-
paddle_distributed.attr("all_gather")(tensor_list, local_routing_map, process_group, py::arg("sync_op") = true);
44-
45-
// Synchronize to ensure all_gather completes
46-
CUDA_CHECK(cudaDeviceSynchronize());
47-
48-
// Concatenate all gathered tensors into a single contiguous tensor
49-
global_routing_map = torch::cat(tensor_vec, 0);
50-
51-
// Synchronize again to ensure torch::cat completes
52-
CUDA_CHECK(cudaDeviceSynchronize());
28+
global_routing_map = torch::empty(
29+
{num_of_tokens_per_rank * group_size, num_of_expert},
30+
torch::TensorOptions().dtype(torch::kBool).device(torch::kCUDA)
31+
);
32+
paddle_distributed.attr("stream").attr("all_gather")(global_routing_map, local_routing_map, process_group, py::arg("sync_op") = true);
5333
} else { // At intra-node case, we will use custom allgather
5434
allgather_obj.launch(local_routing_map, /*NUM_OF_SMS=*/32, at::cuda::getCurrentCUDAStream());
55-
// allgather_obj.launch(local_routing_map, /*NUM_OF_SMS=*/32, calc_ctx->stream());
5635
global_routing_map = torch::from_blob(
5736
allgather_obj.get_output_buffer(),
5837
{num_of_tokens_per_rank * group_size, num_of_expert},
@@ -73,9 +52,6 @@ Executor::metadata_preprocess_core(
7352
bool non_blocking
7453
) {
7554
nvtxRangePushA("metadata_preprocess_core in hybrid-ep");
76-
// Note: Disabled SetAllocatorStreamForGPUContext because it can cause memory allocation issues
77-
// when Torch tensors are allocated on a different stream than expected.
78-
// SetAllocatorStreamForGPUContext(calc_ctx->stream(), calc_ctx);
7955
// padding for the routing map
8056
const int rdma_to_attn_map_size_per_node = (((num_of_tokens_per_rank - 1) / 16) + 1) * 16;
8157

@@ -91,13 +67,13 @@ Executor::metadata_preprocess_core(
9167
torch::empty({num_of_tokens_per_rank, config.num_of_nodes - 1},
9268
torch::dtype(torch::kBool).device(torch::kCUDA));
9369
torch::Tensor num_of_tokens_for_experts;
94-
// Always allocate on GPU to avoid illegal memory access from kernel
95-
// Note: pinned memory (host memory) cannot be directly accessed from GPU kernel
96-
num_of_tokens_for_experts =
97-
torch::empty({1}, torch::dtype(torch::kInt32).device(torch::kCUDA));
98-
// num_of_tokens_for_experts =
99-
// torch::empty({1}, torch::dtype(torch::kInt32).pinned_memory(true));
100-
printf("num_of_tokens_for_experts on cpu %d\n", num_of_tokens_for_experts.is_cpu());
70+
if (non_blocking) {
71+
num_of_tokens_for_experts =
72+
torch::empty({1}, torch::dtype(torch::kInt32).device(torch::kCUDA));
73+
} else {
74+
num_of_tokens_for_experts =
75+
torch::empty({1}, torch::dtype(torch::kInt32).pinned_memory(true));
76+
}
10177
auto local_expert_routing_map = torch::empty(
10278
{num_of_tokens_per_rank * config.num_of_ranks_per_node * config.num_of_nodes, config.num_of_experts_per_rank},
10379
torch::dtype(torch::kBool).device(torch::kCUDA));
@@ -110,9 +86,6 @@ Executor::metadata_preprocess_core(
11086
local_expert_routing_map.data_ptr<bool>(), static_cast<int>(node_rank),
11187
static_cast<int>(local_rank), num_of_tokens_per_rank, at::cuda::getCurrentCUDAStream());
11288

113-
// Synchronize to ensure the kernel completes before global_routing_map is released
114-
CUDA_CHECK(cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()));
115-
11689
nvtxRangePop(); // End of metadata_preprocess_core nvtx range
11790
return std::make_tuple(sparse_to_dense_map, rdma_to_attn_map, attn_to_rdma_map, num_of_tokens_for_experts, local_expert_routing_map);
11891
}
@@ -233,15 +206,15 @@ void Executor::dispatch_core(HybridEpConfigInstance config, DispatchBuffers& dis
233206
nvtxRangePop(); // End of dispatch_core nvtx range
234207
}
235208

236-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor> >
209+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor> >
237210
Executor::dispatch_postprocess(HybridEpConfigInstance config, DispatchBuffers& dispatch_buffers, DispatchArgs& args) {
238211
nvtxRangePushA("dispatch_postprocess in hybrid-ep");
239212

240213
// Create and return output tensors
241214
// The output tensor of the dispatch kernel.
242215
torch::Tensor dispatched_tokens;
243-
std::optional<torch::Tensor> dispatched_probs;
244-
std::optional<torch::Tensor> dispatched_scaling_factor;
216+
c10::optional<torch::Tensor> dispatched_probs;
217+
c10::optional<torch::Tensor> dispatched_scaling_factor;
245218

246219
if(args.enable_permute) {
247220
// Use permute kernel to avoid standalone D2D memory copy

csrc/hybrid_ep/executor/executor.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@ public:
2727
torch::Tensor sparse_to_dense_map;
2828
torch::Tensor rdma_to_attn_map;
2929
torch::Tensor attn_to_rdma_map;
30-
std::optional<torch::Tensor> num_dispatched_tokens_tensor; // Used in the permute
31-
std::optional<torch::Tensor> local_expert_routing_map; // Used in the permute
30+
c10::optional<torch::Tensor> num_dispatched_tokens_tensor; // Used in the permute
31+
c10::optional<torch::Tensor> local_expert_routing_map; // Used in the permute
3232

3333
int64_t num_dispatched_tokens = -1;
3434
// Used in the permute case, use up-bound to avoid synchronization to get the real num_dispatched_tokens from the pinned memory
3535
int64_t max_num_dispatched_tokens = -1;
36-
std::optional<torch::Tensor> row_id_map;
36+
c10::optional<torch::Tensor> row_id_map;
3737
int64_t num_permuted_tokens = -1;
3838
// Misc
3939
int pad_multiple; // Used in the padding case of permute
@@ -56,9 +56,9 @@ public:
5656
torch::Tensor sparse_to_dense_map;
5757
torch::Tensor rdma_to_attn_map;
5858
torch::Tensor attn_to_rdma_map;
59-
std::optional<torch::Tensor> num_dispatched_tokens_tensor;
59+
c10::optional<torch::Tensor> num_dispatched_tokens_tensor;
6060
// Output of Permute-preprocess
61-
std::optional<torch::Tensor> row_id_map; // Used in the unpermute
61+
c10::optional<torch::Tensor> row_id_map; // Used in the unpermute
6262
// Used in the sync-free Unpermute
6363
int64_t num_dispatched_tokens = -1;
6464
// Misc
@@ -90,7 +90,7 @@ public:
9090
template<typename DType>
9191
void dispatch_core(
9292
HybridEpConfigInstance config, DispatchBuffers& dispatch_buffers, DispatchArgs& args);
93-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor> >
93+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor> >
9494
dispatch_postprocess(
9595
HybridEpConfigInstance config, DispatchBuffers& dispatch_buffers, DispatchArgs& args);
9696

csrc/hybrid_ep/extension/permute.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33

44
#include "permute.cuh"
55

6-
template std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
6+
template std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
77
permute_launcher<uint16_t, float, float>(PermuteArgs args);
88

9-
template std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
9+
template std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
1010
permute_launcher<uint8_t, float, float>(PermuteArgs args);
1111

1212
template void unpermute_launcher<uint16_t, float>(UnpermuteArgs args);
@@ -403,7 +403,7 @@
403403
}
404404

405405
template <typename DType, typename ProbType = float, typename ScalarType = float>
406-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
406+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
407407
permute_launcher( PermuteArgs args) {
408408
DType * tokens_ptr = reinterpret_cast<DType*>(args.tokens_ptr);
409409
// Current only support 8-bits and 16-bits tokens

csrc/hybrid_ep/extension/permute.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <cooperative_groups.h>
77
#include <cuda_bf16.h>
88
#include <cuda_fp16.h>
9+
#include <c10/util/Optional.h>
910
// #include <torch/torch.h>
1011
#include <torch/python.h>
1112
#include <cub/cub.cuh>
@@ -40,7 +41,7 @@ struct PermuteArgs {
4041
struct UnpermuteArgs {
4142
// Input tensors
4243
torch::Tensor permuted_tokens;
43-
std::optional<torch::Tensor> permuted_probs;
44+
c10::optional<torch::Tensor> permuted_probs;
4445
torch::Tensor row_id_map;
4546

4647
// The address of the output
@@ -104,7 +105,7 @@ struct UnpermuteArgs {
104105
* num_of_local_experts], type: ProbType, now only support float
105106
*/
106107
template <typename DType, typename ProbType, typename ScalarType>
107-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
108+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
108109
permute_launcher(PermuteArgs args);
109110

110111
/**

csrc/hybrid_ep/hybrid_ep.cu

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -491,14 +491,14 @@ HybridEPBuffer::metadata_preprocessing(HybridEpConfigInstance config, torch::Ten
491491
return executor.metadata_preprocess_core(config, preprocessing_tmp, global_routing_map, num_of_tokens_per_rank, non_blocking);
492492
}
493493

494-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
494+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
495495
HybridEPBuffer::dispatch(HybridEpConfigInstance config,
496-
torch::Tensor hidden, std::optional<torch::Tensor> probs,
497-
std::optional<torch::Tensor> scaling_factor,
496+
torch::Tensor hidden, c10::optional<torch::Tensor> probs,
497+
c10::optional<torch::Tensor> scaling_factor,
498498
torch::Tensor sparse_to_dense_map,
499499
torch::Tensor rdma_to_attn_map, torch::Tensor attn_to_rdma_map,
500-
std::optional<torch::Tensor> num_dispatched_tokens_tensor,
501-
std::optional<int64_t> num_dispatched_tokens,
500+
c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
501+
c10::optional<int64_t> num_dispatched_tokens,
502502
int64_t num_of_tokens_per_rank,
503503
bool with_probs) {
504504
// Check the input tensors
@@ -548,7 +548,7 @@ HybridEPBuffer::dispatch(HybridEpConfigInstance config,
548548

549549
std::tuple<torch::Tensor, torch::Tensor>
550550
HybridEPBuffer::combine(HybridEpConfigInstance config,
551-
torch::Tensor hidden, std::optional<torch::Tensor> probs,
551+
torch::Tensor hidden, c10::optional<torch::Tensor> probs,
552552
torch::Tensor sparse_to_dense_map,
553553
torch::Tensor rdma_to_attn_map, torch::Tensor attn_to_rdma_map,
554554
int64_t num_of_tokens_per_rank,
@@ -598,18 +598,18 @@ HybridEPBuffer::combine(HybridEpConfigInstance config,
598598
return std::make_tuple(combined_tokens, combined_probs);
599599
}
600600

601-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
601+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
602602
HybridEPBuffer::dispatch_with_permute(HybridEpConfigInstance config,
603-
torch::Tensor hidden, std::optional<torch::Tensor> probs,
604-
std::optional<torch::Tensor> scaling_factor,
603+
torch::Tensor hidden, c10::optional<torch::Tensor> probs,
604+
c10::optional<torch::Tensor> scaling_factor,
605605
torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
606606
torch::Tensor attn_to_rdma_map,
607-
std::optional<torch::Tensor> num_dispatched_tokens_tensor,
608-
std::optional<torch::Tensor> local_expert_routing_map,
609-
std::optional<torch::Tensor> row_id_map,
610-
std::optional<int64_t> num_permuted_tokens,
607+
c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
608+
c10::optional<torch::Tensor> local_expert_routing_map,
609+
c10::optional<torch::Tensor> row_id_map,
610+
c10::optional<int64_t> num_permuted_tokens,
611611
int64_t num_of_tokens_per_rank,
612-
std::optional<int64_t> pad_multiple,
612+
c10::optional<int64_t> pad_multiple,
613613
bool non_blocking,
614614
bool with_probs)
615615
{
@@ -665,12 +665,12 @@ HybridEPBuffer::dispatch_with_permute(HybridEpConfigInstance config,
665665

666666
std::tuple<torch::Tensor, torch::Tensor>
667667
HybridEPBuffer::combine_with_unpermute(HybridEpConfigInstance config,
668-
torch::Tensor hidden, std::optional<torch::Tensor> probs,
668+
torch::Tensor hidden, c10::optional<torch::Tensor> probs,
669669
torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
670-
torch::Tensor attn_to_rdma_map, std::optional<torch::Tensor> num_dispatched_tokens_tensor,
671-
std::optional<torch::Tensor> row_id_map,
670+
torch::Tensor attn_to_rdma_map, c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
671+
c10::optional<torch::Tensor> row_id_map,
672672
int64_t num_of_tokens_per_rank,
673-
std::optional<int64_t> pad_multiple,
673+
c10::optional<int64_t> pad_multiple,
674674
bool with_probs)
675675
{
676676
// Check the input tensors

csrc/hybrid_ep/hybrid_ep.cuh

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,47 +32,47 @@ public:
3232
torch::Tensor>
3333
metadata_preprocessing(HybridEpConfigInstance config, torch::Tensor global_routing_map, int64_t num_of_tokens_per_rank, bool non_blocking);
3434

35-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
35+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
3636
dispatch(HybridEpConfigInstance config,
37-
torch::Tensor hidden, std::optional<torch::Tensor> probs,
38-
std::optional<torch::Tensor> scaling_factor,
37+
torch::Tensor hidden, c10::optional<torch::Tensor> probs,
38+
c10::optional<torch::Tensor> scaling_factor,
3939
torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
40-
torch::Tensor attn_to_rdma_map, std::optional<torch::Tensor> num_dispatched_tokens_tensor,
41-
std::optional<int64_t> num_dispatched_tokens,
40+
torch::Tensor attn_to_rdma_map, c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
41+
c10::optional<int64_t> num_dispatched_tokens,
4242
int64_t num_of_tokens_per_rank,
4343
bool with_probs);
4444

4545
std::tuple<torch::Tensor, torch::Tensor>
46-
combine(HybridEpConfigInstance config, torch::Tensor hidden, std::optional<torch::Tensor> probs,
46+
combine(HybridEpConfigInstance config, torch::Tensor hidden, c10::optional<torch::Tensor> probs,
4747
torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
4848
torch::Tensor attn_to_rdma_map, int64_t num_of_tokens_per_rank,
4949
bool with_probs);
5050

51-
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
51+
std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
5252
dispatch_with_permute(
5353
HybridEpConfigInstance config,
54-
torch::Tensor hidden, std::optional<torch::Tensor> probs,
55-
std::optional<torch::Tensor> scaling_factor,
54+
torch::Tensor hidden, c10::optional<torch::Tensor> probs,
55+
c10::optional<torch::Tensor> scaling_factor,
5656
torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
5757
torch::Tensor attn_to_rdma_map,
58-
std::optional<torch::Tensor> num_dispatched_tokens_tensor,
59-
std::optional<torch::Tensor> local_expert_routing_map,
60-
std::optional<torch::Tensor> row_id_map,
61-
std::optional<int64_t> num_permuted_tokens,
58+
c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
59+
c10::optional<torch::Tensor> local_expert_routing_map,
60+
c10::optional<torch::Tensor> row_id_map,
61+
c10::optional<int64_t> num_permuted_tokens,
6262
int64_t num_of_tokens_per_rank,
63-
std::optional<int64_t> pad_multiple,
63+
c10::optional<int64_t> pad_multiple,
6464
bool non_blocking,
6565
bool with_probs);
6666

6767
std::tuple<torch::Tensor, torch::Tensor>
6868
combine_with_unpermute(
6969
HybridEpConfigInstance config,
70-
torch::Tensor hidden, std::optional<torch::Tensor> probs,
70+
torch::Tensor hidden, c10::optional<torch::Tensor> probs,
7171
torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
72-
torch::Tensor attn_to_rdma_map, std::optional<torch::Tensor> num_dispatched_tokens_tensor,
73-
std::optional<torch::Tensor> row_id_map,
72+
torch::Tensor attn_to_rdma_map, c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
73+
c10::optional<torch::Tensor> row_id_map,
7474
int64_t num_of_tokens_per_rank,
75-
std::optional<int64_t> pad_multiple,
75+
c10::optional<int64_t> pad_multiple,
7676
bool with_probs);
7777

7878
private:

csrc/hybrid_ep/jit/compiler.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ NVCCCompiler::NVCCCompiler(std::string base_path, std::string comm_id):
3131
base_path(base_path), comm_id(comm_id) {
3232
jit_dir = get_jit_dir();
3333

34-
nvcc_path = get_env("CUDA_HOME") + "nvcc";
34+
nvcc_path = get_env("CUDA_HOME") + "/bin/nvcc";
3535

3636
// Init the flags to compiler
3737
std::string sm_arch_flags = convert_to_nvcc_arch_flags(SM_ARCH);

0 commit comments

Comments (0)