sgemm_wg: Remove software-based barrier implementation

Intra-cluster barrier is now implemented in hardware, transparent to the ISA.
This commit is contained in:
Hansung Kim
2024-03-27 22:43:25 -07:00
parent 870846f20f
commit 09822764e7

View File

@@ -12,63 +12,9 @@
// NOTE(review): TM / TN look like per-thread tile dimensions for the GEMM
// kernel -- confirm against their uses further down the file.
#define TM 2
#define TN 2
// Base address of the memory-mapped barrier registers. threadblock_barrier()
// casts this to a `volatile uint32_t *` and indexes it in 32-bit words.
#define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL
// Gates the MMIO handshake in threadblock_barrier(): the handshake is compiled
// in only when this is non-zero.
// NOTE(review): presumably the number of cores sharing one cluster-level
// barrier -- confirm against the hardware configuration.
#define CORES_PER_CLUSTER 2
// Distance, in uint32_t registers, between the register groups of two
// consecutive barrier ids (group start index = BARRIER_STRIDE * barrier_id).
#define BARRIER_STRIDE 4
// Barrier for a threadblock: a vx_barrier()/vx_fence() pair, followed (when
// CORES_PER_CLUSTER != 0) by a cross-core handshake through memory-mapped
// registers and a second vx_barrier().
//
// Parameters:
//   tid_in_threadblock  index of the calling thread; only the thread with
//                       index 0 performs the MMIO handshake.
//   barrier_id          forwarded to vx_barrier(); also selects which group of
//                       MMIO registers is used (BARRIER_STRIDE * barrier_id).
//   count               forwarded unchanged to vx_barrier().
//                       NOTE(review): presumably the number of warps that must
//                       arrive -- confirm against the vx_barrier() contract.
//
// MMIO register group layout used below (indices in uint32_t words, relative
// to DEV_BARRIER_MMIO_BASE_ADDR):
//   [barrier_offset]                 all-synced register, only read here
//   [barrier_offset + 1 + core_id]   per-core flag; this function waits for 0,
//                                    then writes 1 (arrived) and 2 (passed)
void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) {
// Core-local synchronization first, then a fence, before any MMIO register is
// touched.
vx_barrier(barrier_id, count);
vx_fence();
// vx_printf("========== barrier! barrier_id=%u, count=%u\n", barrier_id, count);
#if CORES_PER_CLUSTER != 0
// this code doesn't work without the memory-mapped register implemented in
// hardware, hence the #if guard on CORES_PER_CLUSTER.
// A single thread per threadblock (index 0) carries out the handshake.
if (tid_in_threadblock == 0) {
// volatile: every read and write below must reach the device register.
volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR);
int core_id = vx_core_id();
// FIXME: hardcoded
const uint32_t barrier_stride = BARRIER_STRIDE;
// index of the first register belonging to this barrier_id
const uint32_t barrier_offset = barrier_stride * barrier_id;
// wait for the barrier to be initialized (this core's flag reads 0)
while (mmio[barrier_offset + 1 + core_id] != 0);
// signal internal-core synchronization done
mmio[barrier_offset + 1 + core_id] = 1;
// wait for other cores in the cluster to finish by waiting on the
// all-synced read-only mmio reg
while (mmio[barrier_offset] == 0);
// need to signal that this core passed the barrier; otherwise, if we
// reset this to 0 right away, the other core still waiting for the
// barrier might never see the all-sync mmio reg as 1.
mmio[barrier_offset + 1 + core_id] = 2;
// Disabled below: software reset of the per-core flags. With it commented
// out, nothing in this function writes 0 back to the flags.
// NOTE(review): presumably the hardware returns the flags to 0 before the
// next use of this barrier_id -- confirm, since the wait loop above spins
// until the flag reads 0.
// // if this core is the last one passing the barrier, reset all per-core
// // flags to 0 to get ready for the next barrier
// bool all_passed = true;
// for (int i = 0; i < CORES_PER_CLUSTER; i++) {
// // if (i == core_id) continue;
// // NOTE: this requires coherent access of store-to-load to the same
// // address
// if (mmio[barrier_offset + 1 + i] != 2) {
// all_passed = false;
// break;
// }
// }
// if (all_passed) {
// for (int i = 0; i < CORES_PER_CLUSTER; i++) {
// mmio[barrier_offset + 1 + i] = 0;
// }
// }
}
// Second core-local barrier, reached by thread 0 only after the handshake.
// NOTE(review): presumably this holds the remaining threads until thread 0
// has observed the all-synced register -- confirm.
vx_barrier(barrier_id, count);
#endif
}
void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,