13 Commits

Author SHA1 Message Date
Virgo-AE Eval
0884ba6fcb Merge branch 'ae' into ae-hopper 2025-02-07 14:52:27 -08:00
Richard Yan
fd2fe71ca1 Merge branch 'ae' into ae-hopper 2025-01-31 03:53:00 -08:00
Richard Yan
8d71815809 Merge branch 'ae' into ae-hopper 2025-01-30 23:40:48 -08:00
Richard Yan
63f476eb83 Merge branch 'ae' into ae-hopper 2025-01-30 15:34:58 -08:00
Hansung Kim
0711f5f7a3 Merge branch 'ae' into ae-hopper 2025-01-30 13:24:50 -08:00
Hansung Kim
97227577b5 Merge branch 'ae' into ae-hopper 2025-01-30 01:48:09 -08:00
Richard Yan
3cd6aacc17 Merge branch 'ae' into ae-hopper 2025-01-30 01:35:10 -08:00
Hansung Kim
e4f8f3481c Merge branch 'ae' into ae-hopper 2025-01-30 01:05:31 -08:00
Hansung Kim
c7f713c71e Merge branch 'ae' into ae-hopper 2025-01-30 00:49:23 -08:00
Hansung Kim
b06e345706 Merge branch 'ae' into ae-hopper 2025-01-30 00:35:10 -08:00
Hansung Kim
8a635b5fcb Set TENSOR_HOPPER to 1, add missing markers 2025-01-30 00:34:13 -08:00
Richard Yan
f23b2a3fcc Merge branch 'ae' into ae-hopper 2025-01-29 23:31:21 -08:00
Richard Yan
ac34a8f5f5 hopper changes 2025-01-29 22:22:34 -08:00
4 changed files with 7 additions and 3 deletions

View File

@@ -95,6 +95,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
 constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
 static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
+MARK_BEG();
 constexpr uint32_t smem_a_offset = 0;
 constexpr uint32_t smem_a_dbuf_offset = 1 * quartile;
 constexpr uint32_t smem_b_offset =
@@ -119,6 +121,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
 threadblocks_per_cluster, threadblock_id_in_cluster,
 sharedmem_per_threadblock);
+MARK_END();
 float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
 float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
 float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);

View File

@@ -19,7 +19,7 @@ using float_type = float16_t;
 // Generate kernel for the Hopper-style SMEM-decoupled tensor core. This uses
 // asynchronous HGMMA and HGMMA_WAIT instructions.
-#define TENSOR_HOPPER 0
+#define TENSOR_HOPPER 1
 // Constraints on parameters:
 // * Memory:

View File

@@ -84,7 +84,7 @@
 #endif
 #ifndef NUM_CORES
-#define NUM_CORES 8
+#define NUM_CORES 4
 #endif
 #ifndef NUM_WARPS

View File

@@ -18,7 +18,7 @@
 #include <stdio.h>
 #ifndef CORES_PER_CLUSTER
-#define CORES_PER_CLUSTER 8
+#define CORES_PER_CLUSTER 4
 #endif
 #ifdef __cplusplus