Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
326141b11f | ||
|
|
d893780594 | ||
|
|
9b7c22a7e9 | ||
|
|
b1ebabef26 | ||
|
|
9f524538a4 | ||
|
|
51ebe18ebb | ||
|
|
7b0a95034b | ||
|
|
c240069147 | ||
|
|
d86c33acf3 | ||
|
|
b49e8a293c | ||
|
|
19731b8e2f | ||
|
|
afc69507a3 | ||
|
|
6e279c905f |
@@ -95,8 +95,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
||||
constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
|
||||
static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
|
||||
|
||||
-MARK_BEG();
|
||||
|
||||
constexpr uint32_t smem_a_offset = 0;
|
||||
constexpr uint32_t smem_a_dbuf_offset = 1 * quartile;
|
||||
constexpr uint32_t smem_b_offset =
|
||||
@@ -121,8 +119,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
||||
threadblocks_per_cluster, threadblock_id_in_cluster,
|
||||
sharedmem_per_threadblock);
|
||||
|
||||
-MARK_END();
|
||||
|
||||
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
|
||||
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
|
||||
float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);
|
||||
|
||||
@@ -19,7 +19,7 @@ using float_type = float16_t;
|
||||
|
||||
// Generate kernel for the Hopper-style SMEM-decoupled tensor core. This uses
|
||||
// asynchronous HGMMA and HGMMA_WAIT instructions.
|
||||
-#define TENSOR_HOPPER 1
|
||||
+#define TENSOR_HOPPER 0
|
||||
|
||||
// Constraints on parameters:
|
||||
// * Memory:
|
||||
@@ -110,7 +110,7 @@ static_assert(WMITER * WNITER * TCM * TCN * NUM_WARPS * CORES_PER_CLUSTER ==
|
||||
// result matrix will be stored in a swizzled form in the global memory.
|
||||
#define WMMA_STORE_FAST 0
|
||||
|
||||
-#define GEMMINI_DMA 1
|
||||
+#define GEMMINI_DMA 0
|
||||
#define GEMMINI_DMA_FAST 1
|
||||
#if SMEM_SIZE == 0x4000
|
||||
#define SMEM_ADDR_Q0 ((float * const) 0xff000000)
|
||||
|
||||
@@ -84,7 +84,7 @@
|
||||
#endif
|
||||
|
||||
#ifndef NUM_CORES
|
||||
-#define NUM_CORES 4
|
||||
+#define NUM_CORES 8
|
||||
#endif
|
||||
|
||||
#ifndef NUM_WARPS
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#ifndef CORES_PER_CLUSTER
|
||||
-#define CORES_PER_CLUSTER 4
|
||||
+#define CORES_PER_CLUSTER 8
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Reference in New Issue
Block a user