tensor: Change B in-memory layout to column-major

This commit is contained in:
Hansung Kim
2024-08-12 15:20:55 -07:00
parent 07dd9e35a0
commit 95e3e96c6c
6 changed files with 42 additions and 24 deletions

View File

@@ -49,7 +49,10 @@ VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy
#VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
VX_CFLAGS += -v -O3 -std=c++17
VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections -mllvm -inline-threshold=8192
VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
# Comment out the line below when building regression/basic: it uses GCC,
# which does not understand these flags.
VX_CFLAGS += -mllvm -inline-threshold=8192
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH)
VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
@@ -104,23 +107,24 @@ kernel.bin: kernel.elf kernel.radiance.elf
OBJCOPY ?= $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
OBJCOPY_FLAGS ?= "LOAD,ALLOC,DATA,CONTENTS"
kernel.elf: $(VX_SRCS)
BINFILES := args.bin input.a.bin input.b.bin
kernel.elf: $(VX_SRCS) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o $@
$(OBJCOPY) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --update-section .operand.a=input.a.bin $@
$(OBJCOPY) --update-section .operand.b=input.b.bin $@
$(OBJCOPY) --update-section .args=args.bin $@
$(OBJCOPY) --update-section .operand.a=input.a.bin $@ || true
$(OBJCOPY) --update-section .operand.b=input.b.bin $@ || true
$(OBJCOPY) --update-section .args=args.bin $@ || true
kernel.radiance.elf: $(VX_SRCS)
kernel.radiance.elf: $(VX_SRCS) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o $@
$(OBJCOPY) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --update-section .operand.a=input.a.bin $@
$(OBJCOPY) --update-section .operand.b=input.b.bin $@
$(OBJCOPY) --update-section .args=args.bin $@
$(OBJCOPY) --update-section .operand.a=input.a.bin $@ || true
$(OBJCOPY) --update-section .operand.b=input.b.bin $@ || true
$(OBJCOPY) --update-section .args=args.bin $@ || true
ifneq ($(CONFIG),)
kernel$(CONFIGEXT).elf: kernel.elf

View File

@@ -572,7 +572,7 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
const uint32_t problem_size = (dim_m * dim_n) / (ELEM_PER_THREAD);
const uint32_t num_threadblocks = problem_size / threads_per_threadblock;
using float_type = float;
using float_type = float16_t;
// "static" shared memory allocation. This determines the threadblock
// occupancy of a single cluster.

View File

@@ -173,7 +173,8 @@ int main(int argc, char *argv[]) {
uint32_t dim_n = 64;
uint32_t dim_k = 64;
using float_type = float;
using float_type = half;
generate_source_matrix<float_type>(dim_m, dim_n, dim_k);
generate_reference_matmul<float_type>(dim_m, dim_n, dim_k);

View File

@@ -20,7 +20,7 @@
// BM <= BK*TM*TN
#define BM 64
#define BN 64
#define BK 64
#define BK 128
#define WM 16
#define WN 8
#define TCM 8