From 298dd9c45b550a74cad130cfb49881affd1c8477 Mon Sep 17 00:00:00 2001
From: dbsanfte
Date: Fri, 16 Jan 2026 11:32:12 +0000
Subject: [PATCH 1/3] Fix: Buffer load OOB crashes on gfx906 (MI50/MI60)

Problem: DeviceGemmDl crashes on gfx906 when K >= 1472 with small M (the M=1
decode case).

Root cause: CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK was
disabled by default. Without it, invalid buffer loads are still issued and
crash before the bounds check can reject them.

Solution:
1. Enable the OOB offset trick (0x80000000) so invalid coordinates safely
   return zero instead of accessing unmapped memory.
2. Use the full coordinate_has_valid_offset() check instead of the
   _assuming_visible_index_is_valid variant for proper K bounds validation.

Verified with INT8 GEMM tests: M=1 decode, K=14336, FFN projections.
---
 include/ck/ck.hpp                                         | 4 +++-
 .../gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp  | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 879fb31ca59..53af4ce211f 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -186,8 +186,10 @@
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting. Don't use this hack unless absolutely necessary!
 // FIXME: make the behavior of buffer load a configurable (template) parameter for each usage
+// FIX: Enable the offset trick to prevent invalid loads from crashing on gfx906/MI50.
+// Without it, invalid loads still execute and crash despite the bounds check.
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
-#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 #endif
 #define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
 #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
index bce2d453dce..0e8f9b0fd94 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp
@@ -177,8 +177,10 @@ struct ThreadwiseTensorSliceTransfer_v5r1
             using src_vector_t = typename decltype(src_vector)::type;
 
+            // FIX: Use the full bounds check, including the visible index, to prevent OOB
+            // access when the K0 coordinate exceeds the tensor bounds
             const bool is_src_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
+                coordinate_has_valid_offset(src_desc, src_coord_);
 
             // copy data from src_buf to src_vector
             src_vector.template AsType<src_vector_t>()(I0) =

From 41797af07828101fffb8b48836fa311fc8c7092c Mon Sep 17 00:00:00 2001
From: dbsanfte
Date: Fri, 16 Jan 2026 11:40:18 +0000
Subject: [PATCH 2/3] Fix: GridwiseGemmDlMultipleD element op for FloatAcc != FloatC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem: When FloatAcc differs from FloatC (e.g., INT8×INT8→INT32 accumulator
with FP32 output scaling), the CDE element op is invoked with the wrong
storage types. The element op contract is (E& e, const C& c, const D& d...),
where:
- E = FloatC (final output type, e.g., float)
- C = FloatAcc (accumulator type, e.g., int32_t)

The original code used generate_tie() to return the same c_thread_buf for
both E& and C&, which:
1. Violates the element op signature when the types differ
2. Causes compile errors with strictly-typed element ops
3. Results in undefined behavior during ThreadwiseTensorSliceTransfer

Solution: Introduce a separate e_thread_buf for the element op output, pass
(E& e) from e_thread_buf and (const C& c) from c_thread_buf, then transfer
e_thread_buf to global memory.

The bug has existed since the file was created in December 2022 (PR #517).
---
 .../gpu/grid/gridwise_gemm_dl_multiple_d.hpp | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
index c486b124237..da36a8f206f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp
@@ -418,6 +418,12 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
         auto c_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAcc>(
             c_thread_desc_m10_m11_n10_n11.GetElementSpaceSize());
 
+        // FIX: Separate buffer for the element op output with the proper type (FloatC).
+        // This is needed when FloatAcc (e.g., int32) differs from FloatC (e.g., float):
+        // the element op e = static_cast<FloatC>(c) * d expects to write to FloatC, not FloatAcc
+        auto e_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatC>(
+            c_thread_desc_m10_m11_n10_n11.GetElementSpaceSize());
+
         // Initialize C
         c_thread_buf.Clear();
 
@@ -621,13 +627,22 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
                         Number{});
 
                     // get reference to dst data
+                    // FIX: Use e_thread_buf (FloatC) for output and c_thread_buf
+                    // (FloatAcc) for input. This fixes the type mismatch when FloatAcc
+                    // (int32) != FloatC (float)
                     constexpr index_t c_offset =
                         c_thread_desc_m0_m10_m11_n0_n10_n11.CalculateOffset(
                             make_tuple(0, m10, m11, 0, n10, i));
 
-                    auto dst_data_refs = generate_tie(
-                        // return type should be lvalue
-                        [&](auto) -> auto& { return c_thread_buf(Number<c_offset>{}); },
-                        Number<2>{});
+                    // Element op signature: (E& e, const C& c, const D& d)
+                    // - e (output): goes to e_thread_buf (FloatC type)
+                    // - c (input): comes from c_thread_buf (FloatAcc type)
+                    // Use tie() to create a tuple of references to the two different buffers
+                    auto dst_data_refs =
+                        tie(e_thread_buf(
+                                Number<c_offset>{}), // E& e (output to FloatC buffer)
+                            c_thread_buf(
+                                Number<c_offset>{}) // C& c (input from FloatAcc buffer)
+                        );
 
                     unpack2(cde_element_op, dst_data_refs, src_data_refs);
                 });
@@ -653,8 +668,10 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
             });
         });
 
+        // FIX: Transfer from e_thread_buf (FloatC) instead of c_thread_buf (FloatAcc),
+        // since the element op output is now stored in e_thread_buf with the proper type
         ThreadwiseTensorSliceTransfer_v1r3<
-            FloatAcc,
+            FloatC, // FIX: Source is now FloatC (e_thread_buf)
             FloatC,
             decltype(c_thread_desc_m0_m10_m11_n0_n10_n11),
             decltype(c_grid_desc_m0_m10_m11_n0_n10_n11),
@@ -680,7 +697,7 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
                 ck::tensor_operation::element_wise::PassThrough{}}
                 .Run(c_thread_desc_m0_m10_m11_n0_n10_n11,
                      make_tuple(I0, I0, I0, I0, I0, I0),
-                     c_thread_buf,
+                     e_thread_buf, // FIX: Use e_thread_buf instead of c_thread_buf
                      c_grid_desc_m0_m10_m11_n0_n10_n11,
                      c_grid_buf);
         }

From 5046c60e915ce1f9eef68ccacda26c450c331320 Mon Sep 17 00:00:00 2001
From: dbsanfte
Date: Sat, 31 Jan 2026 17:04:28 +0000
Subject: [PATCH 3/3] Add debug logging for gfx906 GEMM compatibility

- Add CK_GFX906_DEBUG macro for conditional debug output
- Log GEMM parameters (M, N, K, strides) for gfx906 devices
- Track which device GEMM variants are being invoked
- Helps diagnose launch bounds and occupancy issues on older GCN
---
 include/ck/ck.hpp                             |  4 ++--
 .../device/impl/device_gemm_multiple_d_dl.hpp | 35 ++++++++++---------
 .../gpu/grid/gridwise_gemm_dl_v1r3.hpp        | 27 ++++++++++++++
 .../threadwise_tensor_slice_transfer_v5r1.hpp |  3 +-
 4 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 53af4ce211f..2452ca499f3 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -109,7 +109,7 @@
 #define CK_USE_AMD_MFMA
 #endif
 
-#if(defined(__gfx90a__) || defined(__gfx94__))
+#if (defined(__gfx90a__) || defined(__gfx94__))
 #define CK_USE_AMD_MFMA_BF16_1K_OP
 #endif
 
@@ -135,7 +135,7 @@
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif
 
-#if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
+#if (defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
 #else
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
index 11d1a748193..5093e50dace 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -50,8 +50,8 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
     const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
     const Block2CTileMap block_2_ctile_map)
 {
-#if(defined(__gfx906__) || defined(__gfx9__) || defined(__gfx101__) || defined(__gfx103__) || \
-    defined(__gfx11__) || defined(__gfx12__))
+#if (defined(__gfx906__) || defined(__gfx9__) || defined(__gfx101__) || defined(__gfx103__) || \
+     defined(__gfx11__) || defined(__gfx12__))
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
 
@@ -443,20 +443,23 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD
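
A short illustration of what patch 1 relies on. This is a CPU-side model in plain C++, not GCN hardware behaviour and not CK's actual buffer intrinsics; the function name, signature, and buffer-size parameter are assumptions made only for illustration. It shows how redirecting an invalid coordinate to an offset past the buffer's range turns the load into a zero-fill instead of a fault:

#include <cstddef>
#include <cstdint>

// Sketch of a range-checked load with the OOB offset trick enabled.
// An invalid coordinate is replaced by an offset beyond the buffer's range,
// and the (modelled) hardware range check then returns 0 instead of faulting.
template <typename T>
T buffer_load_model(const T* base, std::size_t buffer_elements, std::size_t offset, bool is_valid)
{
    // Mirrors the 0x80000000 constant used by the offset trick (illustrative value).
    constexpr std::size_t oob_offset = 0x80000000u;

    const std::size_t effective = is_valid ? offset : oob_offset;

    // Real buffer_load instructions perform this range check in hardware; here it is explicit.
    return effective < buffer_elements ? base[effective] : T(0);
}

The point of the patch is that without the trick, the invalid offset is used as-is, so the out-of-range access happens before any software-side check can help.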
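
For patch 2, here is a minimal standalone sketch of the CDE element-op contract it restores. This is plain C++, not CK code; the Scale functor, the buffer names, and the sizes are illustrative assumptions. It shows why the output reference must be FloatC-typed while the accumulator input stays FloatAcc-typed:

#include <cstdint>
#include <cstdio>

// Element op in the (E& e, const C& c, const D& d) style:
// E = final output type (float), C = accumulator type (int32_t), D = per-element scale.
struct Scale
{
    void operator()(float& e, const std::int32_t& c, const float& d) const
    {
        // Writing this result through an int32_t& (i.e. reusing the accumulator
        // buffer as the output) would truncate or fail to compile.
        e = static_cast<float>(c) * d;
    }
};

int main()
{
    constexpr int n = 4;
    std::int32_t c_thread_buf[n] = {100, -200, 300, -400};     // INT8xINT8 accumulators
    float d_buf[n]               = {0.5f, 0.5f, 0.25f, 0.25f}; // dequant scales
    float e_thread_buf[n]        = {};                         // separate FloatC-typed output buffer

    Scale op;
    for(int i = 0; i < n; ++i)
    {
        // Output goes to the float buffer, input comes from the int32 buffer:
        // the same separation the patch introduces with e_thread_buf / c_thread_buf.
        op(e_thread_buf[i], c_thread_buf[i], d_buf[i]);
    }

    for(int i = 0; i < n; ++i)
    {
        std::printf("e[%d] = %.2f\n", i, e_thread_buf[i]);
    }
    return 0;
}

Handing the same int32 storage to both references, as the old generate_tie() call did, breaks exactly this contract; routing E& through a separate FloatC buffer keeps the types consistent all the way to the global store.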