Skip to content

Commit ef35913

Browse files
committed
Add generate_identity_sequences helper for common pattern
This adds an optimized helper for the common generate_tuple pattern: generate_tuple([](auto i) { return Sequence<i.value>{}; }, N). The new generate_identity_sequences<N>() function creates Tuple<Sequence<0>, Sequence<1>, ..., Sequence<N-1>> without requiring a lambda instantiation at each call site. Updated 21 call sites across threadwise_tensor_slice_transfer, wrapper utilities, and layout files to use the new helper. Build time improvement: ~1.1% wall-clock (18.3s -> 18.1s).
1 parent 1b33b98 commit ef35913

13 files changed

+43
-42
lines changed

include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -866,8 +866,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
866866
},
867867
Number<nDim>{});
868868

869-
constexpr auto up_dim_idss =
870-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
869+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
871870

872871
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
873872
}
@@ -925,8 +924,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
925924
},
926925
Number<nDim>{});
927926

928-
constexpr auto up_dim_idss =
929-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
927+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
930928

931929
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
932930
}

include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -894,8 +894,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant
894894
},
895895
Number<nDim>{});
896896

897-
constexpr auto up_dim_idss =
898-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
897+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
899898

900899
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
901900
}
@@ -944,8 +943,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant
944943
},
945944
Number<nDim>{});
946945

947-
constexpr auto up_dim_idss =
948-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
946+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
949947

950948
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
951949
}
@@ -993,8 +991,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant
993991
},
994992
Number<nDim>{});
995993

996-
constexpr auto up_dim_idss =
997-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
994+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
998995

999996
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
1000997
}

include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -833,8 +833,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather
833833
},
834834
Number<nDim>{});
835835

836-
constexpr auto up_dim_idss =
837-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
836+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
838837

839838
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
840839
}
@@ -892,8 +891,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather
892891
},
893892
Number<nDim>{});
894893

895-
constexpr auto up_dim_idss =
896-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
894+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
897895

898896
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
899897
}

include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r2.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -692,8 +692,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
692692
},
693693
Number<nDim>{});
694694

695-
constexpr auto up_dim_idss =
696-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
695+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
697696

698697
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
699698
}
@@ -744,8 +743,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
744743
},
745744
Number<nDim>{});
746745

747-
constexpr auto up_dim_idss =
748-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
746+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
749747

750748
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
751749
}

include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -514,8 +514,7 @@ struct ThreadwiseTensorSliceTransfer_v7r2
514514
},
515515
Number<nDim>{});
516516

517-
constexpr auto up_dim_idss =
518-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
517+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
519518

520519
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
521520
}
@@ -563,8 +562,7 @@ struct ThreadwiseTensorSliceTransfer_v7r2
563562
},
564563
Number<nDim>{});
565564

566-
constexpr auto up_dim_idss =
567-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
565+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
568566

569567
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
570568
}

include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -656,8 +656,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3
656656
},
657657
Number<nDim>{});
658658

659-
constexpr auto up_dim_idss =
660-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
659+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
661660

662661
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
663662
}
@@ -706,8 +705,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3
706705
},
707706
Number<nDim>{});
708707

709-
constexpr auto up_dim_idss =
710-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
708+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
711709

712710
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
713711
}

include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -548,8 +548,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
548548
},
549549
Number<nDim>{});
550550

551-
constexpr auto up_dim_idss =
552-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
551+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
553552

554553
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
555554
}
@@ -598,8 +597,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
598597
},
599598
Number<nDim>{});
600599

601-
constexpr auto up_dim_idss =
602-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<nDim>{});
600+
constexpr auto up_dim_idss = generate_identity_sequences<nDim>();
603601

604602
return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss);
605603
}

include/ck/utility/tuple_helper.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,28 @@ __host__ __device__ constexpr auto generate_tie(F&& f, Number<N>)
3737
typename arithmetic_sequence_gen<0, N, 1>::type{});
3838
}
3939

40+
// Optimized helper replacing generate_tuple([](auto i) { return Sequence<i.value>{}; }, N):
41+
// creates Tuple<Sequence<0>, Sequence<1>, ..., Sequence<N-1>> without instantiating a lambda.
42+
namespace detail {
43+
// Implementation detail of generate_identity_sequences.
// Takes a Sequence<Is...> purely as a tag to deduce the index pack Is, then
// expands the pack into one single-element Sequence<I> per index and hands
// them, in pack order, to make_tuple.
template <index_t... Is>
44+
__host__ __device__ constexpr auto make_identity_sequences_impl(Sequence<Is...>)
45+
{
46+
// Pack expansion: Sequence<I0>{}, Sequence<I1>{}, ... -- one tuple element per index.
return make_tuple(Sequence<Is>{}...);
47+
}
48+
} // namespace detail
49+
50+
// Returns the tuple of single-index sequences for indices 0 .. N-1, with N
// supplied as an explicit template argument, e.g.
// generate_identity_sequences<nDim>().
// The index list comes from make_index_sequence<N> and is unpacked by
// detail::make_identity_sequences_impl, so no lambda is involved.
template <index_t N>
51+
__host__ __device__ constexpr auto generate_identity_sequences()
52+
{
53+
return detail::make_identity_sequences_impl(make_index_sequence<N>{});
54+
}
55+
56+
// Overload that takes the count as a Number<N> tag argument so N is deduced
// from the call, mirroring the generate_tuple(f, Number<N>{}) calling style.
// The argument carries no data that is used here; the call simply forwards to
// the explicit-template form generate_identity_sequences<N>().
template <index_t N>
57+
__host__ __device__ constexpr auto generate_identity_sequences(Number<N>)
58+
{
59+
return generate_identity_sequences<N>();
60+
}
61+
4062
// tx and ty are tuples of references; the return type will be a tuple of references (not rvalues)
4163
template <typename... X, typename... Y>
4264
__host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple<X&...>& tx,

include/ck/wrapper/layout.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,7 @@ struct Layout
242242
const auto lower_dims =
243243
generate_tuple([&](auto i) { return GenerateLowerDim<Number<i>>(shape); },
244244
Number<Tuple<ShapeDims...>::Size()>{});
245-
const auto upper_dims = generate_tuple([&](auto i) { return Sequence<i.value>{}; },
246-
Number<Tuple<ShapeDims...>::Size()>{});
245+
const auto upper_dims = generate_identity_sequences<Tuple<ShapeDims...>::Size()>();
247246

248247
return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims);
249248
}

include/ck/wrapper/operations/gemm.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,7 @@ make_blockwise_gemm_xdl_c_local_partition(CTensorType& c_local_tile_tensor)
259259
const auto partition_desc = BlockwiseGemmXdlops::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(
260260
layout(c_local_tile_tensor).GetUnrolledDescriptor());
261261

262-
const auto lower_upper_dims =
263-
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<8>{});
262+
const auto lower_upper_dims = generate_identity_sequences<8>();
264263

265264
auto sliced_desc = transform_tensor_descriptor(
266265
partition_desc,

0 commit comments

Comments
 (0)