From 298dd9c45b550a74cad130cfb49881affd1c8477 Mon Sep 17 00:00:00 2001
From: dbsanfte
Date: Fri, 16 Jan 2026 11:32:12 +0000
Subject: [PATCH 1/3] Fix: Buffer load OOB crashes on gfx906 (MI50/MI60)

Problem: DeviceGemmDl crashes on gfx906 when K >= 1472 with small M (the M=1
decode case).

Root cause: CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK was
disabled by default. Without it, invalid buffer loads are still issued and
crash before the bounds check can reject them.

Solution:
1. Enable the OOB offset trick (0x80000000) so invalid coordinates safely
   return zero instead of accessing unmapped memory.
2. Use the full coordinate_has_valid_offset() check instead of the
   _assuming_visible_index_is_valid variant for proper K bounds validation.

Verified with INT8 GEMM tests: M=1 decode, K=14336, FFN projections.
---
 include/ck/ck.hpp                                         | 4 +++-
 .../gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp  | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 879fb31ca59..53af4ce211f 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -186,8 +186,10 @@
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting. Don't use this hack unless absolutely necessary!
 // FIXME: make the behavior of buffer load a configurable (template) parameter for each usage
+// FIX: Enable the offset trick to prevent invalid loads from crashing on gfx906/MI50.
+// Without it, invalid loads still execute and crash despite the bounds check.
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
-#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 #endif
 #define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
 #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
index bce2d453dce..0e8f9b0fd94 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
@@ -177,8 +177,10 @@ struct ThreadwiseTensorSliceTransfer_v5r1
             using src_vector_t = typename decltype(src_vector)::type;
 
+            // FIX: Use the full bounds check, including the visible index, to prevent OOB
+            // access when the K0 coordinate exceeds the tensor bounds
             const bool is_src_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
+                coordinate_has_valid_offset(src_desc, src_coord_);
 
             // copy data from src_buf to src_vector
             src_vector.template AsType<src_vector_t>()(I0) =

From 41797af07828101fffb8b48836fa311fc8c7092c Mon Sep 17 00:00:00 2001
From: dbsanfte
Date: Fri, 16 Jan 2026 11:40:18 +0000
Subject: [PATCH 2/3] Fix: GridwiseGemmDlMultipleD element op for FloatAcc != FloatC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem: When FloatAcc differs from FloatC (e.g., INT8×INT8→INT32 accumulator
with FP32 output scaling), the CDE element op is invoked with the wrong
storage types. The element op contract is (E& e, const C& c, const D& d...),
where:
- E = FloatC (final output type, e.g., float)
- C = FloatAcc (accumulator type, e.g., int32_t)

The original code used generate_tie() to return the same c_thread_buf for
both E& and C&, which:
1. Violates the element op signature when the types differ
2. Causes compile errors with strictly-typed element ops
3. Results in undefined behavior during ThreadwiseTensorSliceTransfer

Solution: Introduce a separate e_thread_buf for the element op output, pass
(E& e) from e_thread_buf and (const C& c) from c_thread_buf, then transfer
e_thread_buf to global memory.

The bug has existed since the file was created in December 2022 (PR #517).
---
 .../gpu/grid/gridwise_gemm_dl_multiple_d.hpp | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
index c486b124237..da36a8f206f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
@@ -418,6 +418,12 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
         auto c_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAcc>(
             c_thread_desc_m10_m11_n10_n11.GetElementSpaceSize());
 
+        // FIX: Separate buffer for the element op output with the proper type (FloatC).
+        // This is needed when FloatAcc (e.g., int32) differs from FloatC (e.g., float):
+        // the element op e = static_cast<FloatC>(c) * d expects to write to FloatC, not FloatAcc
+        auto e_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatC>(
+            c_thread_desc_m10_m11_n10_n11.GetElementSpaceSize());
+
         // Initialize C
         c_thread_buf.Clear();
 
@@ -621,13 +627,22 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
                         Number{});
 
                     // get reference to dst data
+                    // FIX: Use e_thread_buf (FloatC) for output and c_thread_buf
+                    // (FloatAcc) for input. This fixes the type mismatch when FloatAcc
+                    // (int32) != FloatC (float)
                     constexpr index_t c_offset =
                         c_thread_desc_m0_m10_m11_n0_n10_n11.CalculateOffset(
                             make_tuple(0, m10, m11, 0, n10, i));
 
-                    auto dst_data_refs = generate_tie(
-                        // return type should be lvalue
-                        [&](auto) -> auto& { return c_thread_buf(Number<c_offset>{}); },
-                        Number<2>{});
+                    // Element op signature: (E& e, const C& c, const D& d)
+                    // - e (output): goes to e_thread_buf (FloatC type)
+                    // - c (input): comes from c_thread_buf (FloatAcc type)
+                    // Use tie() to create a tuple of references to the two different buffers
+                    auto dst_data_refs =
+                        tie(e_thread_buf(
+                                Number<c_offset>{}), // E& e (output to FloatC buffer)
+                            c_thread_buf(
+                                Number<c_offset>{}) // C& c (input from FloatAcc buffer)
+                        );
 
                     unpack2(cde_element_op, dst_data_refs, src_data_refs);
                 });
@@ -653,8 +668,10 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
             });
         });
 
+        // FIX: Transfer from e_thread_buf (FloatC) instead of c_thread_buf (FloatAcc),
+        // since the element op output is now stored in e_thread_buf with the proper type
         ThreadwiseTensorSliceTransfer_v1r3<
-            FloatAcc,
+            FloatC, // FIX: Source is now FloatC (e_thread_buf)
             FloatC,
             decltype(c_thread_desc_m0_m10_m11_n0_n10_n11),
             decltype(c_grid_desc_m0_m10_m11_n0_n10_n11),
@@ -680,7 +697,7 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
                 ck::tensor_operation::element_wise::PassThrough{}}
                 .Run(c_thread_desc_m0_m10_m11_n0_n10_n11,
                      make_tuple(I0, I0, I0, I0, I0, I0),
-                     c_thread_buf,
+                     e_thread_buf, // FIX: Use e_thread_buf instead of c_thread_buf
                      c_grid_desc_m0_m10_m11_n0_n10_n11,
                      c_grid_buf);
         }

From 5046c60e915ce1f9eef68ccacda26c450c331320 Mon Sep 17 00:00:00 2001
From: dbsanfte
Date: Sat, 31 Jan 2026 17:04:28 +0000
Subject: [PATCH 3/3] Add debug logging for gfx906 GEMM compatibility

- Add CK_GFX906_DEBUG macro for conditional debug output
- Log GEMM parameters (M, N, K, strides) for gfx906 devices
- Track which device GEMM variants are being invoked
- Helps diagnose launch bounds and occupancy issues on older GCN
---
 include/ck/ck.hpp                             |  4 ++--
 .../device/impl/device_gemm_multiple_d_dl.hpp | 35 ++++++++++---------
 .../gpu/grid/gridwise_gemm_dl_v1r3.hpp        | 27 ++++++++++++++
 .../threadwise_tensor_slice_transfer_v5r1.hpp |  3 +-
 4 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 53af4ce211f..2452ca499f3 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -109,7 +109,7 @@
 #define CK_USE_AMD_MFMA
 #endif
 
-#if(defined(__gfx90a__) || defined(__gfx94__))
+#if (defined(__gfx90a__) || defined(__gfx94__))
 #define CK_USE_AMD_MFMA_BF16_1K_OP
 #endif
 
@@ -135,7 +135,7 @@
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif
 
-#if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
+#if (defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
 #else
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
index 11d1a748193..5093e50dace 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -50,8 +50,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
     const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
     const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx906__) || defined(__gfx9__) || defined(__gfx101__) || defined(__gfx103__) || \
-    defined(__gfx11__) || defined(__gfx12__))
+#if (defined(__gfx906__) || defined(__gfx9__) || defined(__gfx101__) || defined(__gfx103__) || \
+     defined(__gfx11__) || defined(__gfx12__))
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
 
@@ -443,20 +443,23 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD
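
A short illustration of what patch 1 relies on. This is a CPU-side model in plain C++, not GCN hardware behaviour and not CK's actual buffer intrinsics; the function name, signature, and buffer-size parameter are assumptions made only for illustration. It shows how redirecting an invalid coordinate to an offset past the buffer's range turns the load into a zero-fill instead of a fault:

#include <cstddef>
#include <cstdint>

// Sketch of a range-checked load with the OOB offset trick enabled.
// An invalid coordinate is replaced by an offset beyond the buffer's range,
// and the (modelled) hardware range check then returns 0 instead of faulting.
template <typename T>
T buffer_load_model(const T* base, std::size_t buffer_elements, std::size_t offset, bool is_valid)
{
    // Mirrors the 0x80000000 constant used by the offset trick (illustrative value).
    constexpr std::size_t oob_offset = 0x80000000u;

    const std::size_t effective = is_valid ? offset : oob_offset;

    // Real buffer_load instructions perform this range check in hardware; here it is explicit.
    return effective < buffer_elements ? base[effective] : T(0);
}

The point of the patch is that without the trick, the invalid offset is used as-is, so the out-of-range access happens before any software-side check can help.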
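
For patch 2, here is a minimal standalone sketch of the CDE element-op contract it restores. This is plain C++, not CK code; the Scale functor, the buffer names, and the sizes are illustrative assumptions. It shows why the output reference must be FloatC-typed while the accumulator input stays FloatAcc-typed:

#include <cstdint>
#include <cstdio>

// Element op in the (E& e, const C& c, const D& d) style:
// E = final output type (float), C = accumulator type (int32_t), D = per-element scale.
struct Scale
{
    void operator()(float& e, const std::int32_t& c, const float& d) const
    {
        // Writing this result through an int32_t& (i.e. reusing the accumulator
        // buffer as the output) would truncate or fail to compile.
        e = static_cast<float>(c) * d;
    }
};

int main()
{
    constexpr int n = 4;
    std::int32_t c_thread_buf[n] = {100, -200, 300, -400};     // INT8xINT8 accumulators
    float d_buf[n]               = {0.5f, 0.5f, 0.25f, 0.25f}; // dequant scales
    float e_thread_buf[n]        = {};                         // separate FloatC-typed output buffer

    Scale op;
    for(int i = 0; i < n; ++i)
    {
        // Output goes to the float buffer, input comes from the int32 buffer:
        // the same separation the patch introduces with e_thread_buf / c_thread_buf.
        op(e_thread_buf[i], c_thread_buf[i], d_buf[i]);
    }

    for(int i = 0; i < n; ++i)
    {
        std::printf("e[%d] = %.2f\n", i, e_thread_buf[i]);
    }
    return 0;
}

Handing the same int32 storage to both references, as the old generate_tie() call did, breaks exactly this contract; routing E& through a separate FloatC buffer keeps the types consistent all the way to the global store.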