[Compat] Revert unnecessary changes; remove debug logging and redundant synchronization

SigureMo · SigureMo · commit c9fc08de9ae2 · 2026-04-02T20:56:58.000+08:00
diff --git a/build.sh b/build.sh
@@ -3,7 +3,6 @@ rm -rf dist
 rm -rf deep_ep_cpp.cpython-38-x86_64-linux-gnu.so
 export TORCH_CUDA_ARCH_LIST="10.0"
 export PADDLE_CUDA_ARCH_LIST="10.0"
-export CUDA_HOME="/path/to/cuda"
-NVSHMEM_DIR=/path/to/nvshmem python setup_deep_ep.py bdist_wheel
-NVSHMEM_DIR=/path/to/nvshmem python setup_hybrid_ep.py bdist_wheel
+python setup_deep_ep.py bdist_wheel
+python setup_hybrid_ep.py bdist_wheel
 pip install dist/*.whl --force-reinstall
diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
@@ -1251,9 +1251,6 @@ void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int
     check_boundary(clean_meta_0.first, clean_meta_0.second * sizeof(int));
     check_boundary(clean_meta_1.first, clean_meta_1.second * sizeof(int));
 
-    // internode_ll::clean_low_latency_buffer(clean_meta_0.first, clean_meta_0.second,
-    //                                        clean_meta_1.first, clean_meta_1.second,
-    //                                        calc_ctx->stream());
     internode_ll::clean_low_latency_buffer(clean_meta_0.first, clean_meta_0.second,
                                            clean_meta_1.first, clean_meta_1.second,
                                            at::cuda::getCurrentCUDAStream());
@@ -1307,7 +1304,6 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
 
     // Wait previous tasks to be finished
     // NOTES: the hook mode will always use the default stream
-    // auto compute_stream = calc_ctx->stream();
     auto compute_stream = at::cuda::getCurrentCUDAStream();
     auto launch_stream = return_recv_hook ? compute_stream : comm_stream;
     EP_HOST_ASSERT(not (async and return_recv_hook));
@@ -1462,7 +1458,6 @@ Buffer::low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_id
 
     // Wait previous tasks to be finished
     // NOTES: the hook mode will always use the default stream
-    // auto compute_stream = calc_ctx->stream();
     auto compute_stream = at::cuda::getCurrentCUDAStream();
     auto launch_stream = return_recv_hook ? compute_stream : comm_stream;
     EP_HOST_ASSERT(not (async and return_recv_hook));
@@ -1778,7 +1773,7 @@ Buffer::dispatch_pcie(const torch::Tensor& x, const std::optional<torch::Tensor>
     } else {
         stream_wait(compute_stream, comm_stream);
     }
-    
+
     if (allocate_on_comm_stream)
         deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
 
diff --git a/csrc/hybrid_ep/executor/executor.cu b/csrc/hybrid_ep/executor/executor.cu
@@ -25,34 +25,13 @@ torch::Tensor Executor::allgather_routing_map(
     torch::Tensor global_routing_map;
     // At inter-node case, we will use NCCL allgather
     if(config.num_of_nodes > 1 || !enable_custom_allgather) {
-        // Create a list of independent tensors for paddle.distributed.all_gather
-        // paddle.distributed.all_gather requires a list of tensors as output
-        std::vector<torch::Tensor> tensor_vec;
-        tensor_vec.reserve(group_size);
-        py::list tensor_list;
-        for (int i = 0; i < group_size; i++) {
-            auto tensor = torch::empty(
-                {num_of_tokens_per_rank, num_of_expert},
-                torch::TensorOptions().dtype(torch::kBool).device(torch::kCUDA)
-            );
-            tensor_vec.push_back(tensor);
-            tensor_list.append(tensor);
-        }
-
-        // Call paddle.distributed.all_gather (sync_op=True for synchronous operation)
-        paddle_distributed.attr("all_gather")(tensor_list, local_routing_map, process_group, py::arg("sync_op") = true);
-
-        // Synchronize to ensure all_gather completes
-        CUDA_CHECK(cudaDeviceSynchronize());
-
-        // Concatenate all gathered tensors into a single contiguous tensor
-        global_routing_map = torch::cat(tensor_vec, 0);
-
-        // Synchronize again to ensure torch::cat completes
-        CUDA_CHECK(cudaDeviceSynchronize());
+        global_routing_map = torch::empty(
+            {num_of_tokens_per_rank * group_size, num_of_expert},
+            torch::TensorOptions().dtype(torch::kBool).device(torch::kCUDA)
+        );
+        paddle_distributed.attr("stream").attr("all_gather")(global_routing_map, local_routing_map, process_group, py::arg("sync_op") = true);
     } else { // At intra-node case, we will use custom allgather
         allgather_obj.launch(local_routing_map, /*NUM_OF_SMS=*/32, at::cuda::getCurrentCUDAStream());
-        // allgather_obj.launch(local_routing_map, /*NUM_OF_SMS=*/32, calc_ctx->stream());
         global_routing_map = torch::from_blob(
             allgather_obj.get_output_buffer(), 
             {num_of_tokens_per_rank * group_size, num_of_expert},
@@ -73,9 +52,6 @@ Executor::metadata_preprocess_core(
     bool non_blocking
 ) {
   nvtxRangePushA("metadata_preprocess_core in hybrid-ep");
-  // Note: Disabled SetAllocatorStreamForGPUContext because it can cause memory allocation issues
-  // when Torch tensors are allocated on a different stream than expected.
-  // SetAllocatorStreamForGPUContext(calc_ctx->stream(), calc_ctx);
   // padding for the routing map
   const int rdma_to_attn_map_size_per_node = (((num_of_tokens_per_rank - 1) / 16) + 1) * 16;
 
@@ -91,13 +67,13 @@ Executor::metadata_preprocess_core(
       torch::empty({num_of_tokens_per_rank, config.num_of_nodes - 1},
                    torch::dtype(torch::kBool).device(torch::kCUDA));
   torch::Tensor num_of_tokens_for_experts;
-  // Always allocate on GPU to avoid illegal memory access from kernel
-  // Note: pinned memory (host memory) cannot be directly accessed from GPU kernel
-  num_of_tokens_for_experts =
-      torch::empty({1}, torch::dtype(torch::kInt32).device(torch::kCUDA));
-//   num_of_tokens_for_experts =
-//         torch::empty({1}, torch::dtype(torch::kInt32).pinned_memory(true));
-  printf("num_of_tokens_for_experts on cpu %d\n", num_of_tokens_for_experts.is_cpu());
+  if (non_blocking) {
+    num_of_tokens_for_experts =
+        torch::empty({1}, torch::dtype(torch::kInt32).device(torch::kCUDA));
+  } else {
+    num_of_tokens_for_experts =
+        torch::empty({1}, torch::dtype(torch::kInt32).pinned_memory(true));
+  }
   auto local_expert_routing_map = torch::empty(
       {num_of_tokens_per_rank * config.num_of_ranks_per_node * config.num_of_nodes, config.num_of_experts_per_rank},
       torch::dtype(torch::kBool).device(torch::kCUDA));
@@ -110,9 +86,6 @@ Executor::metadata_preprocess_core(
       local_expert_routing_map.data_ptr<bool>(), static_cast<int>(node_rank),
       static_cast<int>(local_rank), num_of_tokens_per_rank, at::cuda::getCurrentCUDAStream());
 
-  // Synchronize to ensure the kernel completes before global_routing_map is released
-  CUDA_CHECK(cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()));
-
   nvtxRangePop();  // End of metadata_preprocess_core nvtx range
   return std::make_tuple(sparse_to_dense_map, rdma_to_attn_map, attn_to_rdma_map, num_of_tokens_for_experts, local_expert_routing_map);
 }
@@ -233,15 +206,15 @@ void Executor::dispatch_core(HybridEpConfigInstance config, DispatchBuffers& dis
     nvtxRangePop();  // End of dispatch_core nvtx range
 }
 
-std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor> >
+std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor> >
 Executor::dispatch_postprocess(HybridEpConfigInstance config, DispatchBuffers& dispatch_buffers, DispatchArgs& args) {
     nvtxRangePushA("dispatch_postprocess in hybrid-ep");
 
     // Create and return output tensors
     // The output tensor of the dispatch kernel.
     torch::Tensor dispatched_tokens;
-    std::optional<torch::Tensor> dispatched_probs;
-    std::optional<torch::Tensor> dispatched_scaling_factor;
+    c10::optional<torch::Tensor> dispatched_probs;
+    c10::optional<torch::Tensor> dispatched_scaling_factor;
 
     if(args.enable_permute) {
         // Use permute kernel to avoid standalone D2D memory copy
diff --git a/csrc/hybrid_ep/executor/executor.cuh b/csrc/hybrid_ep/executor/executor.cuh
@@ -27,13 +27,13 @@ public:
         torch::Tensor sparse_to_dense_map;
         torch::Tensor rdma_to_attn_map;
         torch::Tensor attn_to_rdma_map;
-        std::optional<torch::Tensor> num_dispatched_tokens_tensor;  // Used in the permute
-        std::optional<torch::Tensor> local_expert_routing_map;      // Used in the permute
+        c10::optional<torch::Tensor> num_dispatched_tokens_tensor;  // Used in the permute
+        c10::optional<torch::Tensor> local_expert_routing_map;      // Used in the permute
 
         int64_t num_dispatched_tokens = -1;
         // Used in the permute case, use up-bound to avoid synchronization to get the real num_dispatched_tokens from the pinned memory
         int64_t max_num_dispatched_tokens = -1;
-        std::optional<torch::Tensor> row_id_map;
+        c10::optional<torch::Tensor> row_id_map;
         int64_t num_permuted_tokens = -1;
         // Misc
         int pad_multiple;  // Used in the padding case of permute
@@ -56,9 +56,9 @@ public:
         torch::Tensor sparse_to_dense_map;
         torch::Tensor rdma_to_attn_map;
         torch::Tensor attn_to_rdma_map;
-        std::optional<torch::Tensor> num_dispatched_tokens_tensor;
+        c10::optional<torch::Tensor> num_dispatched_tokens_tensor;
         // Output of Permute-preprocess
-        std::optional<torch::Tensor> row_id_map;  // Used in the unpermute
+        c10::optional<torch::Tensor> row_id_map;  // Used in the unpermute
         // Used in the sync-free Unpermute
         int64_t num_dispatched_tokens = -1;
         // Misc
@@ -90,7 +90,7 @@ public:
     template<typename DType> 
     void dispatch_core(
         HybridEpConfigInstance config, DispatchBuffers& dispatch_buffers, DispatchArgs& args);
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor> > 
+    std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor> > 
     dispatch_postprocess(
         HybridEpConfigInstance config, DispatchBuffers& dispatch_buffers, DispatchArgs& args); 
 
diff --git a/csrc/hybrid_ep/extension/permute.cu b/csrc/hybrid_ep/extension/permute.cu
@@ -3,10 +3,10 @@
 
 #include "permute.cuh"
 
- template std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
+ template std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
  permute_launcher<uint16_t, float, float>(PermuteArgs args);
  
- template std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
+ template std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
  permute_launcher<uint8_t, float, float>(PermuteArgs args);
  
  template void unpermute_launcher<uint16_t, float>(UnpermuteArgs args);
@@ -403,7 +403,7 @@
  }
  
  template <typename DType, typename ProbType = float, typename ScalarType = float>
- std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
+ std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
  permute_launcher( PermuteArgs args) {
    DType * tokens_ptr = reinterpret_cast<DType*>(args.tokens_ptr);
    // Current only support 8-bits and 16-bits tokens
diff --git a/csrc/hybrid_ep/extension/permute.cuh b/csrc/hybrid_ep/extension/permute.cuh
@@ -6,6 +6,7 @@
 #include <cooperative_groups.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
+#include <c10/util/Optional.h>
 // #include <torch/torch.h>
 #include <torch/python.h>
 #include <cub/cub.cuh>
@@ -40,7 +41,7 @@ struct PermuteArgs {
 struct UnpermuteArgs {
   // Input tensors
   torch::Tensor permuted_tokens;
-  std::optional<torch::Tensor> permuted_probs;
+  c10::optional<torch::Tensor> permuted_probs;
   torch::Tensor row_id_map;
 
   // The address of the output
@@ -104,7 +105,7 @@ struct UnpermuteArgs {
   * num_of_local_experts], type: ProbType, now only support float
   */
  template <typename DType, typename ProbType, typename ScalarType>
- std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
+ std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
  permute_launcher(PermuteArgs args);
  
  /**
diff --git a/csrc/hybrid_ep/hybrid_ep.cu b/csrc/hybrid_ep/hybrid_ep.cu
@@ -491,14 +491,14 @@ HybridEPBuffer::metadata_preprocessing(HybridEpConfigInstance config, torch::Ten
   return executor.metadata_preprocess_core(config, preprocessing_tmp, global_routing_map, num_of_tokens_per_rank, non_blocking);
 }
 
-std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
+std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
 HybridEPBuffer::dispatch(HybridEpConfigInstance config, 
-                 torch::Tensor hidden, std::optional<torch::Tensor> probs,
-                 std::optional<torch::Tensor> scaling_factor,
+                 torch::Tensor hidden, c10::optional<torch::Tensor> probs,
+                 c10::optional<torch::Tensor> scaling_factor,
                  torch::Tensor sparse_to_dense_map,
                  torch::Tensor rdma_to_attn_map, torch::Tensor attn_to_rdma_map,
-                 std::optional<torch::Tensor> num_dispatched_tokens_tensor,
-                 std::optional<int64_t> num_dispatched_tokens,
+                 c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
+                 c10::optional<int64_t> num_dispatched_tokens,
                  int64_t num_of_tokens_per_rank,
                  bool with_probs) {
   // Check the input tensors
@@ -548,7 +548,7 @@ HybridEPBuffer::dispatch(HybridEpConfigInstance config,
 
 std::tuple<torch::Tensor, torch::Tensor>
 HybridEPBuffer::combine(HybridEpConfigInstance config, 
-                torch::Tensor hidden, std::optional<torch::Tensor> probs,
+                torch::Tensor hidden, c10::optional<torch::Tensor> probs,
                 torch::Tensor sparse_to_dense_map,
                 torch::Tensor rdma_to_attn_map, torch::Tensor attn_to_rdma_map,
                 int64_t num_of_tokens_per_rank,
@@ -598,18 +598,18 @@ HybridEPBuffer::combine(HybridEpConfigInstance config,
   return std::make_tuple(combined_tokens, combined_probs);
 }
 
-std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
+std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
 HybridEPBuffer::dispatch_with_permute(HybridEpConfigInstance config, 
-          torch::Tensor hidden, std::optional<torch::Tensor> probs,
-          std::optional<torch::Tensor> scaling_factor,
+          torch::Tensor hidden, c10::optional<torch::Tensor> probs,
+          c10::optional<torch::Tensor> scaling_factor,
           torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
           torch::Tensor attn_to_rdma_map, 
-          std::optional<torch::Tensor> num_dispatched_tokens_tensor,
-          std::optional<torch::Tensor> local_expert_routing_map,
-          std::optional<torch::Tensor> row_id_map,
-          std::optional<int64_t> num_permuted_tokens,
+          c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
+          c10::optional<torch::Tensor> local_expert_routing_map,
+          c10::optional<torch::Tensor> row_id_map,
+          c10::optional<int64_t> num_permuted_tokens,
           int64_t num_of_tokens_per_rank,
-          std::optional<int64_t> pad_multiple,
+          c10::optional<int64_t> pad_multiple,
           bool non_blocking,
           bool with_probs)
 {
@@ -665,12 +665,12 @@ HybridEPBuffer::dispatch_with_permute(HybridEpConfigInstance config,
 
 std::tuple<torch::Tensor, torch::Tensor>
 HybridEPBuffer::combine_with_unpermute(HybridEpConfigInstance config, 
-        torch::Tensor hidden, std::optional<torch::Tensor> probs,
+        torch::Tensor hidden, c10::optional<torch::Tensor> probs,
         torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
-        torch::Tensor attn_to_rdma_map, std::optional<torch::Tensor> num_dispatched_tokens_tensor,
-        std::optional<torch::Tensor> row_id_map,
+        torch::Tensor attn_to_rdma_map, c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
+        c10::optional<torch::Tensor> row_id_map,
         int64_t num_of_tokens_per_rank,
-        std::optional<int64_t> pad_multiple,
+        c10::optional<int64_t> pad_multiple,
         bool with_probs)
 {
   // Check the input tensors
diff --git a/csrc/hybrid_ep/hybrid_ep.cuh b/csrc/hybrid_ep/hybrid_ep.cuh
@@ -32,47 +32,47 @@ public:
              torch::Tensor>
   metadata_preprocessing(HybridEpConfigInstance config, torch::Tensor global_routing_map, int64_t num_of_tokens_per_rank, bool non_blocking);
 
-  std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>>
+  std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>>
   dispatch(HybridEpConfigInstance config, 
-           torch::Tensor hidden, std::optional<torch::Tensor> probs,
-           std::optional<torch::Tensor> scaling_factor,
+           torch::Tensor hidden, c10::optional<torch::Tensor> probs,
+           c10::optional<torch::Tensor> scaling_factor,
            torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
-           torch::Tensor attn_to_rdma_map, std::optional<torch::Tensor> num_dispatched_tokens_tensor,
-           std::optional<int64_t> num_dispatched_tokens,
+           torch::Tensor attn_to_rdma_map, c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
+           c10::optional<int64_t> num_dispatched_tokens,
            int64_t num_of_tokens_per_rank,
            bool with_probs);
 
   std::tuple<torch::Tensor, torch::Tensor>
-  combine(HybridEpConfigInstance config, torch::Tensor hidden, std::optional<torch::Tensor> probs,
+  combine(HybridEpConfigInstance config, torch::Tensor hidden, c10::optional<torch::Tensor> probs,
           torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
           torch::Tensor attn_to_rdma_map, int64_t num_of_tokens_per_rank,
           bool with_probs);
   
-  std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
+  std::tuple<torch::Tensor, c10::optional<torch::Tensor>, c10::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor>
   dispatch_with_permute(
             HybridEpConfigInstance config, 
-            torch::Tensor hidden, std::optional<torch::Tensor> probs,
-            std::optional<torch::Tensor> scaling_factor,
+            torch::Tensor hidden, c10::optional<torch::Tensor> probs,
+            c10::optional<torch::Tensor> scaling_factor,
             torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
             torch::Tensor attn_to_rdma_map, 
-            std::optional<torch::Tensor> num_dispatched_tokens_tensor,
-            std::optional<torch::Tensor> local_expert_routing_map,
-            std::optional<torch::Tensor> row_id_map,
-            std::optional<int64_t> num_permuted_tokens,
+            c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
+            c10::optional<torch::Tensor> local_expert_routing_map,
+            c10::optional<torch::Tensor> row_id_map,
+            c10::optional<int64_t> num_permuted_tokens,
             int64_t num_of_tokens_per_rank,
-            std::optional<int64_t> pad_multiple,
+            c10::optional<int64_t> pad_multiple,
             bool non_blocking,
             bool with_probs);
 
   std::tuple<torch::Tensor, torch::Tensor>
   combine_with_unpermute(
           HybridEpConfigInstance config, 
-          torch::Tensor hidden, std::optional<torch::Tensor> probs,
+          torch::Tensor hidden, c10::optional<torch::Tensor> probs,
           torch::Tensor sparse_to_dense_map, torch::Tensor rdma_to_attn_map,
-          torch::Tensor attn_to_rdma_map, std::optional<torch::Tensor> num_dispatched_tokens_tensor,
-          std::optional<torch::Tensor> row_id_map,
+          torch::Tensor attn_to_rdma_map, c10::optional<torch::Tensor> num_dispatched_tokens_tensor,
+          c10::optional<torch::Tensor> row_id_map,
           int64_t num_of_tokens_per_rank,
-          std::optional<int64_t> pad_multiple,
+          c10::optional<int64_t> pad_multiple,
           bool with_probs);       
 
 private:
diff --git a/csrc/hybrid_ep/jit/compiler.cu b/csrc/hybrid_ep/jit/compiler.cu
@@ -31,7 +31,7 @@ NVCCCompiler::NVCCCompiler(std::string base_path, std::string comm_id):
     base_path(base_path), comm_id(comm_id) {
     jit_dir = get_jit_dir();
 
-    nvcc_path = get_env("CUDA_HOME") + "nvcc";
+    nvcc_path = get_env("CUDA_HOME") + "/bin/nvcc";
 
     // Init the flags to compiler
     std::string sm_arch_flags = convert_to_nvcc_arch_flags(SM_ARCH);
diff --git a/test.sh b/test.sh
diff --git a/tests/test_hybrid_ep.py b/tests/test_hybrid_ep.py