diff --git a/kernels/sgemm_tcore/kernel.cpp b/kernels/sgemm_tcore/kernel.cpp index 38830b86..427a3e90 100644 --- a/kernels/sgemm_tcore/kernel.cpp +++ b/kernels/sgemm_tcore/kernel.cpp @@ -95,6 +95,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4 static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant"); + MARK_BEG(); + constexpr uint32_t smem_a_offset = 0; constexpr uint32_t smem_a_dbuf_offset = 1 * quartile; constexpr uint32_t smem_b_offset = @@ -119,6 +121,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { threadblocks_per_cluster, threadblock_id_in_cluster, sharedmem_per_threadblock); + MARK_END(); + float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL); float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL); float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL); diff --git a/kernels/sgemm_tcore/sgemm_impl.hpp b/kernels/sgemm_tcore/sgemm_impl.hpp index 10829db2..39d082d9 100644 --- a/kernels/sgemm_tcore/sgemm_impl.hpp +++ b/kernels/sgemm_tcore/sgemm_impl.hpp @@ -19,7 +19,7 @@ using float_type = float16_t; // Generate kernel for the Hopper-style SMEM-decoupled tensor core. This uses // asynchronous HGMMA and HGMMA_WAIT instructions. -#define TENSOR_HOPPER 0 +#define TENSOR_HOPPER 1 // Constraints on parameters: // * Memory: