Set TENSOR_HOPPER to 1, add missing markers

This commit is contained in:
Hansung Kim
2025-01-30 00:34:13 -08:00
parent f23b2a3fcc
commit 8a635b5fcb
2 changed files with 5 additions and 1 deletion

View File

@@ -95,6 +95,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
+MARK_BEG();
constexpr uint32_t smem_a_offset = 0;
constexpr uint32_t smem_a_dbuf_offset = 1 * quartile;
constexpr uint32_t smem_b_offset =
@@ -119,6 +121,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
threadblocks_per_cluster, threadblock_id_in_cluster,
sharedmem_per_threadblock);
+MARK_END();
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);

View File

@@ -19,7 +19,7 @@ using float_type = float16_t;
// Generate kernel for the Hopper-style SMEM-decoupled tensor core. This uses
// asynchronous HGMMA and HGMMA_WAIT instructions.
-#define TENSOR_HOPPER 0
+#define TENSOR_HOPPER 1
// Constraints on parameters:
// * Memory: