Implement WU architecture support
This commit is contained in:
701
kernel/include/VX_config.h
Normal file
701
kernel/include/VX_config.h
Normal file
@@ -0,0 +1,701 @@
|
||||
// auto-generated by gen_config.py. DO NOT EDIT
|
||||
// Generated at 2024-05-07 13:55:58.398687
|
||||
|
||||
// Translated from ./rtl/VX_config.vh:
|
||||
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef VX_CONFIG_VH
|
||||
#define VX_CONFIG_VH
|
||||
|
||||
#ifndef MIN
|
||||
// Smaller of x and y. Both arguments may be evaluated twice, so pass
// expressions without side effects.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
// Larger of x and y. Both arguments may be evaluated twice, so pass
// expressions without side effects.
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef CLAMP
|
||||
// Limit x to the range [lo, hi]: yields hi when x > hi, lo when x < lo,
// otherwise x. The upper bound is tested first. Arguments may be evaluated
// more than once.
#define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
|
||||
#endif
|
||||
|
||||
#ifndef UP
|
||||
// Yields x when it is non-zero and 1 otherwise, so a derived count never
// collapses to zero. x may be evaluated twice.
#define UP(x) (((x) != 0) ? (x) : 1)
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef EXT_M_DISABLE
|
||||
#define EXT_M_ENABLE
|
||||
#endif
|
||||
|
||||
#ifndef EXT_F_DISABLE
|
||||
#define EXT_F_ENABLE
|
||||
#endif
|
||||
|
||||
#ifndef XLEN_32
|
||||
#ifndef XLEN_64
|
||||
#define XLEN_32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_64
|
||||
#define XLEN 64
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_32
|
||||
#define XLEN 32
|
||||
#endif
|
||||
|
||||
#ifdef EXT_D_ENABLE
|
||||
#define FLEN_64
|
||||
#else
|
||||
#define FLEN_32
|
||||
#endif
|
||||
|
||||
#ifdef FLEN_64
|
||||
#define FLEN 64
|
||||
#endif
|
||||
|
||||
#ifdef FLEN_32
|
||||
#define FLEN 32
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_64
|
||||
#ifdef FLEN_32
|
||||
#define FPU_RV64F
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef NUM_CLUSTERS
|
||||
#define NUM_CLUSTERS 1
|
||||
#endif
|
||||
|
||||
#ifndef NUM_CORES
|
||||
#define NUM_CORES 1
|
||||
#endif
|
||||
|
||||
#ifndef NUM_WARPS
|
||||
#define NUM_WARPS 4
|
||||
#endif
|
||||
|
||||
#ifndef NUM_TENSOR_WARPS
|
||||
#define NUM_TENSOR_WARPS 2
|
||||
#endif
|
||||
|
||||
// Warps left over once NUM_TENSOR_WARPS are taken out of NUM_WARPS.
#define NUM_SCALAR_WARPS (NUM_WARPS - NUM_TENSOR_WARPS)
|
||||
|
||||
// True when wid falls in the low range [0, NUM_SCALAR_WARPS).
#define IS_SCALAR_WARP(wid) ((wid) < NUM_SCALAR_WARPS)
|
||||
// True when wid is at or above NUM_SCALAR_WARPS (the complement of IS_SCALAR_WARP).
#define IS_TENSOR_WARP(wid) ((wid) >= NUM_SCALAR_WARPS)
|
||||
|
||||
#ifndef TENSOR_NUM_GPRS
|
||||
#define TENSOR_NUM_GPRS 8
|
||||
#endif
|
||||
|
||||
#ifndef TENSOR_NUM_FPRS
|
||||
#define TENSOR_NUM_FPRS 8
|
||||
#endif
|
||||
|
||||
#ifndef NUM_THREADS
|
||||
#define NUM_THREADS 4
|
||||
#endif
|
||||
|
||||
#ifndef NUM_BARRIERS
|
||||
#define NUM_BARRIERS 8
|
||||
#endif
|
||||
|
||||
#ifndef SOCKET_SIZE
|
||||
#define SOCKET_SIZE MIN(4, NUM_CORES)
|
||||
#endif
|
||||
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
|
||||
|
||||
#ifdef L2_ENABLE
|
||||
#define L2_ENABLED 1
|
||||
#else
|
||||
#define L2_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef L3_ENABLE
|
||||
#define L3_ENABLED 1
|
||||
#else
|
||||
#define L3_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef L1_DISABLE
|
||||
#define ICACHE_DISABLE
|
||||
#define DCACHE_DISABLE
|
||||
#endif
|
||||
|
||||
#ifndef MEM_BLOCK_SIZE
|
||||
#define MEM_BLOCK_SIZE 64
|
||||
#endif
|
||||
|
||||
#ifndef MEM_ADDR_WIDTH
|
||||
#ifdef XLEN_64
|
||||
#define MEM_ADDR_WIDTH 48
|
||||
#else
|
||||
#define MEM_ADDR_WIDTH 32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef L1_LINE_SIZE
|
||||
#ifdef L1_DISABLE
|
||||
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 4 : MEM_BLOCK_SIZE)
|
||||
#else
|
||||
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 16 : MEM_BLOCK_SIZE)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef L2_ENABLE
|
||||
#define L2_LINE_SIZE MEM_BLOCK_SIZE
|
||||
#else
|
||||
#define L2_LINE_SIZE L1_LINE_SIZE
|
||||
#endif
|
||||
|
||||
#ifdef L3_ENABLE
|
||||
#define L3_LINE_SIZE MEM_BLOCK_SIZE
|
||||
#else
|
||||
#define L3_LINE_SIZE L2_LINE_SIZE
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_64
|
||||
|
||||
#ifndef STARTUP_ADDR
|
||||
#define STARTUP_ADDR 0x180000000
|
||||
#endif
|
||||
|
||||
#ifndef STACK_BASE_ADDR
|
||||
#define STACK_BASE_ADDR 0x1FF000000
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#ifndef STARTUP_ADDR
|
||||
#define STARTUP_ADDR 0x80000000
|
||||
#endif
|
||||
|
||||
#ifndef STACK_BASE_ADDR
|
||||
#define STACK_BASE_ADDR 0xFF000000
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef SMEM_BASE_ADDR
|
||||
#define SMEM_BASE_ADDR STACK_BASE_ADDR
|
||||
#endif
|
||||
|
||||
#ifndef SMEM_LOG_SIZE
|
||||
#define SMEM_LOG_SIZE 19
|
||||
#endif
|
||||
|
||||
#ifndef IO_BASE_ADDR
|
||||
#define IO_BASE_ADDR (SMEM_BASE_ADDR + (1 << SMEM_LOG_SIZE))
|
||||
#endif
|
||||
|
||||
#ifndef IO_COUT_ADDR
|
||||
#define IO_COUT_ADDR IO_BASE_ADDR
|
||||
#endif
|
||||
#define IO_COUT_SIZE MEM_BLOCK_SIZE
|
||||
|
||||
#ifndef IO_CSR_ADDR
|
||||
#define IO_CSR_ADDR (IO_COUT_ADDR + IO_COUT_SIZE)
|
||||
#endif
|
||||
#define IO_CSR_SIZE (4 * 64 * NUM_CORES * NUM_CLUSTERS)
|
||||
|
||||
#ifndef STACK_LOG2_SIZE
|
||||
#define STACK_LOG2_SIZE 13
|
||||
#endif
|
||||
#define STACK_SIZE (1 << STACK_LOG2_SIZE)
|
||||
|
||||
#define RESET_DELAY 8
|
||||
|
||||
#ifndef STALL_TIMEOUT
|
||||
// Stall timeout. The Verilog source multiplies 100000 by
// 1 ** (L2_ENABLED + L3_ENABLED); `**` is the Verilog power operator and is not
// valid C (it would parse as a multiply followed by a pointer dereference).
// 1 raised to any power is 1, so the factor is folded away here.
// NOTE(review): this header is generated; the translator presumably needs the
// same treatment so the fix survives regeneration.
#define STALL_TIMEOUT (100000)
|
||||
#endif
|
||||
|
||||
#ifndef SV_DPI
|
||||
#define DPI_DISABLE
|
||||
#endif
|
||||
|
||||
#ifndef FPU_FPNEW
|
||||
#ifndef FPU_DSP
|
||||
#ifndef FPU_DPI
|
||||
#ifndef SYNTHESIS
|
||||
#ifndef DPI_DISABLE
|
||||
#define FPU_DPI
|
||||
#else
|
||||
#define FPU_DSP
|
||||
#endif
|
||||
#else
|
||||
#define FPU_DSP
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef SYNTHESIS
|
||||
#ifndef DPI_DISABLE
|
||||
#define IMUL_DPI
|
||||
#define IDIV_DPI
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef DEBUG_LEVEL
|
||||
#define DEBUG_LEVEL 3
|
||||
#endif
|
||||
|
||||
// Pipeline Configuration /////////////////////////////////////////////////////
|
||||
|
||||
// Issue width
|
||||
#ifndef ISSUE_WIDTH
|
||||
#define ISSUE_WIDTH NUM_WARPS
|
||||
#endif
|
||||
|
||||
// Number of ALU units
|
||||
#ifndef NUM_ALU_LANES
|
||||
#define NUM_ALU_LANES NUM_THREADS
|
||||
#endif
|
||||
#ifndef NUM_ALU_BLOCKS
|
||||
#define NUM_ALU_BLOCKS 4
|
||||
#endif
|
||||
|
||||
// Number of FPU units
|
||||
#ifndef NUM_FPU_LANES
|
||||
#define NUM_FPU_LANES NUM_THREADS
|
||||
#endif
|
||||
#ifndef NUM_FPU_BLOCKS
|
||||
#define NUM_FPU_BLOCKS 2
|
||||
#endif
|
||||
|
||||
// Number of LSU units
|
||||
#ifndef NUM_LSU_LANES
|
||||
#define NUM_LSU_LANES NUM_THREADS
|
||||
#endif
|
||||
|
||||
// Number of SFU units
|
||||
#ifndef NUM_SFU_LANES
|
||||
#define NUM_SFU_LANES MIN(NUM_THREADS, 4)
|
||||
#endif
|
||||
|
||||
// Size of Instruction Buffer
|
||||
#ifndef IBUF_SIZE
|
||||
#define IBUF_SIZE (4 * ISSUE_WIDTH)
|
||||
#endif
|
||||
|
||||
// Size of LSU Request Queue
|
||||
#ifndef LSUQ_SIZE
|
||||
#define LSUQ_SIZE (4 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES))
|
||||
#endif
|
||||
|
||||
// LSU Duplicate Address Check
|
||||
#ifndef LSU_DUP_DISABLE
|
||||
#define LSU_DUP_ENABLE
|
||||
#endif
|
||||
#ifdef LSU_DUP_ENABLE
|
||||
#define LSU_DUP_ENABLED 1
|
||||
#else
|
||||
#define LSU_DUP_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef GBAR_ENABLE
|
||||
#define GBAR_ENABLED 1
|
||||
#else
|
||||
#define GBAR_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifndef LATENCY_IMUL
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_IMUL 4
|
||||
#endif
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_IMUL 3
|
||||
#endif
|
||||
#ifndef LATENCY_IMUL
|
||||
#define LATENCY_IMUL 4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Floating-Point Units ///////////////////////////////////////////////////////
|
||||
|
||||
// Size of FPU Request Queue
|
||||
#ifndef FPUQ_SIZE
|
||||
#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
|
||||
#endif
|
||||
|
||||
// FNCP Latency
|
||||
#ifndef LATENCY_FNCP
|
||||
#define LATENCY_FNCP 2
|
||||
#endif
|
||||
|
||||
// FMA Latency
|
||||
#ifndef LATENCY_FMA
|
||||
#ifdef FPU_DPI
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#ifdef FPU_FPNEW
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#ifdef FPU_DSP
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_FMA 16
|
||||
#endif
|
||||
#ifndef LATENCY_FMA
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// FDIV Latency
|
||||
#ifndef LATENCY_FDIV
|
||||
#ifdef FPU_DPI
|
||||
#define LATENCY_FDIV 15
|
||||
#endif
|
||||
#ifdef FPU_FPNEW
|
||||
#define LATENCY_FDIV 16
|
||||
#endif
|
||||
#ifdef FPU_DSP
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_FDIV 15
|
||||
#endif
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_FDIV 28
|
||||
#endif
|
||||
#ifndef LATENCY_FDIV
|
||||
#define LATENCY_FDIV 16
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// FSQRT Latency
|
||||
#ifndef LATENCY_FSQRT
|
||||
#ifdef FPU_DPI
|
||||
#define LATENCY_FSQRT 10
|
||||
#endif
|
||||
#ifdef FPU_FPNEW
|
||||
#define LATENCY_FSQRT 16
|
||||
#endif
|
||||
#ifdef FPU_DSP
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_FSQRT 10
|
||||
#endif
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_FSQRT 28
|
||||
#endif
|
||||
#ifndef LATENCY_FSQRT
|
||||
#define LATENCY_FSQRT 16
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// FCVT Latency
|
||||
#ifndef LATENCY_FCVT
|
||||
#define LATENCY_FCVT 5
|
||||
#endif
|
||||
|
||||
// Icache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Cache Enable
|
||||
#ifndef ICACHE_DISABLE
|
||||
#define ICACHE_ENABLE
|
||||
#endif
|
||||
#ifdef ICACHE_ENABLE
|
||||
#define ICACHE_ENABLED 1
|
||||
#else
|
||||
#define ICACHE_ENABLED 0
|
||||
#define NUM_ICACHES 0
|
||||
#endif
|
||||
|
||||
// Number of Cache Units
|
||||
#ifndef NUM_ICACHES
|
||||
#define NUM_ICACHES UP(SOCKET_SIZE / 4)
|
||||
#endif
|
||||
|
||||
// Cache Size
|
||||
#ifndef ICACHE_SIZE
|
||||
#define ICACHE_SIZE 16384
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef ICACHE_CRSQ_SIZE
|
||||
#define ICACHE_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef ICACHE_MSHR_SIZE
|
||||
#define ICACHE_MSHR_SIZE 16
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef ICACHE_MREQ_SIZE
|
||||
#define ICACHE_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef ICACHE_MRSQ_SIZE
|
||||
#define ICACHE_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef ICACHE_NUM_WAYS
|
||||
#define ICACHE_NUM_WAYS 1
|
||||
#endif
|
||||
|
||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Cache Enable
|
||||
#ifndef DCACHE_DISABLE
|
||||
#define DCACHE_ENABLE
|
||||
#endif
|
||||
#ifdef DCACHE_ENABLE
|
||||
#define DCACHE_ENABLED 1
|
||||
#else
|
||||
#define DCACHE_ENABLED 0
|
||||
#define NUM_DCACHES 0
|
||||
#define DCACHE_NUM_BANKS 1
|
||||
#endif
|
||||
|
||||
// Number of Cache Units
|
||||
#ifndef NUM_DCACHES
|
||||
#define NUM_DCACHES UP(SOCKET_SIZE / 4)
|
||||
#endif
|
||||
|
||||
// Cache Size
|
||||
#ifndef DCACHE_SIZE
|
||||
#define DCACHE_SIZE 16384
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef DCACHE_NUM_BANKS
|
||||
#define DCACHE_NUM_BANKS NUM_LSU_LANES
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef DCACHE_CRSQ_SIZE
|
||||
#define DCACHE_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef DCACHE_MSHR_SIZE
|
||||
#define DCACHE_MSHR_SIZE 8
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef DCACHE_MREQ_SIZE
|
||||
#define DCACHE_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef DCACHE_MRSQ_SIZE
|
||||
#define DCACHE_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef DCACHE_NUM_WAYS
|
||||
#define DCACHE_NUM_WAYS 1
|
||||
#endif
|
||||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
|
||||
#ifndef SM_DISABLE
|
||||
#define SM_ENABLE
|
||||
#endif
|
||||
|
||||
#ifdef SM_ENABLE
|
||||
#define SM_ENABLED 1
|
||||
#else
|
||||
#define SM_ENABLED 0
|
||||
#define SMEM_NUM_BANKS 1
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef SMEM_NUM_BANKS
|
||||
#define SMEM_NUM_BANKS (NUM_LSU_LANES)
|
||||
#endif
|
||||
|
||||
// L2cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
// Cache Size
|
||||
#ifndef L2_CACHE_SIZE
|
||||
#ifdef ALTERA_S10
|
||||
#define L2_CACHE_SIZE 2097152
|
||||
#else
|
||||
#define L2_CACHE_SIZE 1048576
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef L2_NUM_BANKS
|
||||
#define L2_NUM_BANKS MIN(4, NUM_SOCKETS)
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef L2_CRSQ_SIZE
|
||||
#define L2_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef L2_MSHR_SIZE
|
||||
#define L2_MSHR_SIZE 16
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef L2_MREQ_SIZE
|
||||
#define L2_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef L2_MRSQ_SIZE
|
||||
#define L2_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef L2_NUM_WAYS
|
||||
#define L2_NUM_WAYS 2
|
||||
#endif
|
||||
|
||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
// Cache Size
|
||||
#ifndef L3_CACHE_SIZE
|
||||
#ifdef ALTERA_S10
|
||||
#define L3_CACHE_SIZE 2097152
|
||||
#else
|
||||
#define L3_CACHE_SIZE 1048576
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef L3_NUM_BANKS
|
||||
#define L3_NUM_BANKS MIN(4, NUM_CLUSTERS)
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef L3_CRSQ_SIZE
|
||||
#define L3_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef L3_MSHR_SIZE
|
||||
#define L3_MSHR_SIZE 16
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef L3_MREQ_SIZE
|
||||
#define L3_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef L3_MRSQ_SIZE
|
||||
#define L3_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef L3_NUM_WAYS
|
||||
#define L3_NUM_WAYS 4
|
||||
#endif
|
||||
|
||||
// ISA Extensions /////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef EXT_A_ENABLE
|
||||
#define EXT_A_ENABLED 1
|
||||
#else
|
||||
#define EXT_A_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_C_ENABLE
|
||||
#define EXT_C_ENABLED 1
|
||||
#else
|
||||
#define EXT_C_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_D_ENABLE
|
||||
#define EXT_D_ENABLED 1
|
||||
#else
|
||||
#define EXT_D_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_F_ENABLE
|
||||
#define EXT_F_ENABLED 1
|
||||
#else
|
||||
#define EXT_F_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_M_ENABLE
|
||||
#define EXT_M_ENABLED 1
|
||||
#else
|
||||
#define EXT_M_ENABLED 0
|
||||
#endif
|
||||
|
||||
#define ISA_STD_A 0
|
||||
#define ISA_STD_C 2
|
||||
#define ISA_STD_D 3
|
||||
#define ISA_STD_E 4
|
||||
#define ISA_STD_F 5
|
||||
#define ISA_STD_H 7
|
||||
#define ISA_STD_I 8
|
||||
#define ISA_STD_N 13
|
||||
#define ISA_STD_Q 16
|
||||
#define ISA_STD_S 18
|
||||
#define ISA_STD_U 20
|
||||
|
||||
#define ISA_EXT_ICACHE 0
|
||||
#define ISA_EXT_DCACHE 1
|
||||
#define ISA_EXT_L2CACHE 2
|
||||
#define ISA_EXT_L3CACHE 3
|
||||
#define ISA_EXT_SMEM 4
|
||||
|
||||
// Bitmask of the enabled non-standard features: one bit per ISA_EXT_* index,
// set from the matching *_ENABLED flag (each flag is 0 or 1).
// The whole expansion is parenthesized so that the macro can be shifted,
// masked or compared without operator-precedence surprises.
#define MISA_EXT ( \
      (ICACHE_ENABLED << ISA_EXT_ICACHE)  \
    | (DCACHE_ENABLED << ISA_EXT_DCACHE)  \
    | (L2_ENABLED     << ISA_EXT_L2CACHE) \
    | (L3_ENABLED     << ISA_EXT_L3CACHE) \
    | (SM_ENABLED     << ISA_EXT_SMEM)    \
    )
|
||||
|
||||
// Standard-extension bitmask in misa layout: bit n corresponds to the n-th
// letter of the alphabet (A = bit 0 ... Z = bit 25). Configurable extensions
// take their bit from the matching EXT_*_ENABLED flag (0 or 1).
// The whole expansion is parenthesized so that the macro can be shifted,
// masked or compared without operator-precedence surprises.
#define MISA_STD ( \
      (EXT_A_ENABLED <<  0) /* A - Atomic Instructions extension */ \
    | (0 <<  1) /* B - Tentatively reserved for Bit operations extension */ \
    | (EXT_C_ENABLED <<  2) /* C - Compressed extension */ \
    | (EXT_D_ENABLED <<  3) /* D - Double precision floating-point extension */ \
    | (0 <<  4) /* E - RV32E base ISA */ \
    | (EXT_F_ENABLED <<  5) /* F - Single precision floating-point extension */ \
    | (0 <<  6) /* G - Additional standard extensions present */ \
    | (0 <<  7) /* H - Hypervisor mode implemented */ \
    | (1 <<  8) /* I - RV32I/64I/128I base ISA */ \
    | (0 <<  9) /* J - Reserved */ \
    | (0 << 10) /* K - Reserved */ \
    | (0 << 11) /* L - Tentatively reserved for Decimal Floating-Point extension */ \
    | (EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
    | (0 << 13) /* N - User level interrupts supported */ \
    | (0 << 14) /* O - Reserved */ \
    | (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
    | (0 << 16) /* Q - Quad-precision floating-point extension */ \
    | (0 << 17) /* R - Reserved */ \
    | (0 << 18) /* S - Supervisor mode implemented */ \
    | (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
    | (1 << 20) /* U - User mode implemented */ \
    | (0 << 21) /* V - Tentatively reserved for Vector extension */ \
    | (0 << 22) /* W - Reserved */ \
    | (1 << 23) /* X - Non-standard extensions present */ \
    | (0 << 24) /* Y - Reserved */ \
    | (0 << 25) /* Z - Reserved */ \
    )
|
||||
|
||||
// Device identification //////////////////////////////////////////////////////
|
||||
|
||||
#define VENDOR_ID 0
|
||||
#define ARCHITECTURE_ID 0
|
||||
#define IMPLEMENTATION_ID 0
|
||||
|
||||
#endif // VX_CONFIG_VH
|
||||
193
kernel/include/VX_types.h
Normal file
193
kernel/include/VX_types.h
Normal file
@@ -0,0 +1,193 @@
|
||||
// auto-generated by gen_config.py. DO NOT EDIT
|
||||
// Generated at 2024-06-15 00:25:12.935689
|
||||
|
||||
// Translated from ./rtl/VX_types.vh:
|
||||
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef VX_TYPES_VH
|
||||
#define VX_TYPES_VH
|
||||
|
||||
// Device configuration registers
|
||||
|
||||
#define VX_CSR_ADDR_BITS 12
|
||||
#define VX_DCR_ADDR_BITS 12
|
||||
|
||||
#define VX_DCR_BASE_STATE_BEGIN 0x001
|
||||
#define VX_DCR_BASE_STARTUP_ADDR0 0x001
|
||||
#define VX_DCR_BASE_STARTUP_ADDR1 0x002
|
||||
#define VX_DCR_BASE_MPM_CLASS 0x003
|
||||
#define VX_DCR_BASE_STATE_END 0x004
|
||||
|
||||
#define VX_DCR_BASE_STATE(addr) ((addr) - VX_DCR_BASE_STATE_BEGIN)
|
||||
#define VX_DCR_BASE_STATE_COUNT (VX_DCR_BASE_STATE_END-VX_DCR_BASE_STATE_BEGIN)
|
||||
|
||||
// Machine Performance-monitoring counters classes
|
||||
|
||||
#define VX_DCR_MPM_CLASS_NONE 0
|
||||
#define VX_DCR_MPM_CLASS_CORE 1
|
||||
#define VX_DCR_MPM_CLASS_MEM 2
|
||||
|
||||
// User Floating-Point CSRs
|
||||
|
||||
#define VX_CSR_FFLAGS 0x001
|
||||
#define VX_CSR_FRM 0x002
|
||||
#define VX_CSR_FCSR 0x003
|
||||
|
||||
#define VX_CSR_SATP 0x180
|
||||
|
||||
#define VX_CSR_PMPCFG0 0x3A0
|
||||
#define VX_CSR_PMPADDR0 0x3B0
|
||||
|
||||
#define VX_CSR_MSTATUS 0x300
|
||||
#define VX_CSR_MISA 0x301
|
||||
#define VX_CSR_MEDELEG 0x302
|
||||
#define VX_CSR_MIDELEG 0x303
|
||||
#define VX_CSR_MIE 0x304
|
||||
#define VX_CSR_MTVEC 0x305
|
||||
|
||||
#define VX_CSR_MEPC 0x341
|
||||
|
||||
#define VX_CSR_MNSTATUS 0x744
|
||||
|
||||
#define VX_CSR_MPM_BASE 0xB00
|
||||
#define VX_CSR_MPM_BASE_H 0xB80
|
||||
#define VX_CSR_MPM_USER 0xB03
|
||||
#define VX_CSR_MPM_USER_H 0xB83
|
||||
|
||||
// Machine Performance-monitoring core counters
|
||||
// PERF: Standard
|
||||
#define VX_CSR_MCYCLE 0xB00
|
||||
#define VX_CSR_MCYCLE_H 0xB80
|
||||
#define VX_CSR_MPM_RESERVED 0xB01
|
||||
#define VX_CSR_MPM_RESERVED_H 0xB81
|
||||
#define VX_CSR_MINSTRET 0xB02
|
||||
#define VX_CSR_MINSTRET_H 0xB82
|
||||
// PERF: pipeline
|
||||
#define VX_CSR_MPM_SCHED_ID 0xB03
|
||||
#define VX_CSR_MPM_SCHED_ID_H 0xB83
|
||||
#define VX_CSR_MPM_SCHED_ST 0xB04
|
||||
#define VX_CSR_MPM_SCHED_ST_H 0xB84
|
||||
#define VX_CSR_MPM_IBUF_ST 0xB05
|
||||
#define VX_CSR_MPM_IBUF_ST_H 0xB85
|
||||
#define VX_CSR_MPM_SCRB_ST 0xB06
|
||||
#define VX_CSR_MPM_SCRB_ST_H 0xB86
|
||||
#define VX_CSR_MPM_SCRB_ALU 0xB07
|
||||
#define VX_CSR_MPM_SCRB_ALU_H 0xB87
|
||||
#define VX_CSR_MPM_SCRB_FPU 0xB08
|
||||
#define VX_CSR_MPM_SCRB_FPU_H 0xB88
|
||||
#define VX_CSR_MPM_SCRB_LSU 0xB09
|
||||
#define VX_CSR_MPM_SCRB_LSU_H 0xB89
|
||||
#define VX_CSR_MPM_SCRB_SFU 0xB0A
|
||||
#define VX_CSR_MPM_SCRB_SFU_H 0xB8A
|
||||
// PERF: memory
|
||||
#define VX_CSR_MPM_IFETCHES 0xB0B
|
||||
#define VX_CSR_MPM_IFETCHES_H 0xB8B
|
||||
#define VX_CSR_MPM_LOADS 0xB0C
|
||||
#define VX_CSR_MPM_LOADS_H 0xB8C
|
||||
#define VX_CSR_MPM_STORES 0xB0D
|
||||
#define VX_CSR_MPM_STORES_H 0xB8D
|
||||
#define VX_CSR_MPM_IFETCH_LT 0xB0E
|
||||
#define VX_CSR_MPM_IFETCH_LT_H 0xB8E
|
||||
#define VX_CSR_MPM_LOAD_LT 0xB0F
|
||||
#define VX_CSR_MPM_LOAD_LT_H 0xB8F
|
||||
// SFU: scoreboard
|
||||
#define VX_CSR_MPM_SCRB_WCTL 0xB10
|
||||
#define VX_CSR_MPM_SCRB_WCTL_H 0xB90
|
||||
#define VX_CSR_MPM_SCRB_CSRS 0xB11
|
||||
#define VX_CSR_MPM_SCRB_CSRS_H 0xB91
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
#define VX_CSR_MPM_ICACHE_READS 0xB03 // total reads
|
||||
#define VX_CSR_MPM_ICACHE_READS_H 0xB83
|
||||
#define VX_CSR_MPM_ICACHE_MISS_R 0xB04 // read misses
|
||||
#define VX_CSR_MPM_ICACHE_MISS_R_H 0xB84
|
||||
#define VX_CSR_MPM_ICACHE_MSHR_ST 0xB05 // MSHR stalls
|
||||
#define VX_CSR_MPM_ICACHE_MSHR_ST_H 0xB85
|
||||
// PERF: dcache
|
||||
#define VX_CSR_MPM_DCACHE_READS 0xB06 // total reads
|
||||
#define VX_CSR_MPM_DCACHE_READS_H 0xB86
|
||||
#define VX_CSR_MPM_DCACHE_WRITES 0xB07 // total writes
|
||||
#define VX_CSR_MPM_DCACHE_WRITES_H 0xB87
|
||||
#define VX_CSR_MPM_DCACHE_MISS_R 0xB08 // read misses
|
||||
#define VX_CSR_MPM_DCACHE_MISS_R_H 0xB88
|
||||
#define VX_CSR_MPM_DCACHE_MISS_W 0xB09 // write misses
|
||||
#define VX_CSR_MPM_DCACHE_MISS_W_H 0xB89
|
||||
#define VX_CSR_MPM_DCACHE_BANK_ST 0xB0A // bank conflicts
|
||||
#define VX_CSR_MPM_DCACHE_BANK_ST_H 0xB8A
|
||||
#define VX_CSR_MPM_DCACHE_MSHR_ST 0xB0B // MSHR stalls
|
||||
#define VX_CSR_MPM_DCACHE_MSHR_ST_H 0xB8B
|
||||
// PERF: l2cache
|
||||
#define VX_CSR_MPM_L2CACHE_READS 0xB0C // total reads
|
||||
#define VX_CSR_MPM_L2CACHE_READS_H 0xB8C
|
||||
#define VX_CSR_MPM_L2CACHE_WRITES 0xB0D // total writes
|
||||
#define VX_CSR_MPM_L2CACHE_WRITES_H 0xB8D
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_R 0xB0E // read misses
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_R_H 0xB8E
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_W 0xB0F // write misses
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_W_H 0xB8F
|
||||
#define VX_CSR_MPM_L2CACHE_BANK_ST 0xB10 // bank conflicts
|
||||
#define VX_CSR_MPM_L2CACHE_BANK_ST_H 0xB90
|
||||
#define VX_CSR_MPM_L2CACHE_MSHR_ST 0xB11 // MSHR stalls
|
||||
#define VX_CSR_MPM_L2CACHE_MSHR_ST_H 0xB91
|
||||
// PERF: l3cache
|
||||
#define VX_CSR_MPM_L3CACHE_READS 0xB12 // total reads
|
||||
#define VX_CSR_MPM_L3CACHE_READS_H 0xB92
|
||||
#define VX_CSR_MPM_L3CACHE_WRITES 0xB13 // total writes
|
||||
#define VX_CSR_MPM_L3CACHE_WRITES_H 0xB93
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_R 0xB14 // read misses
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_R_H 0xB94
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_W 0xB15 // write misses
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_W_H 0xB95
|
||||
#define VX_CSR_MPM_L3CACHE_BANK_ST 0xB16 // bank conflicts
|
||||
#define VX_CSR_MPM_L3CACHE_BANK_ST_H 0xB96
|
||||
#define VX_CSR_MPM_L3CACHE_MSHR_ST 0xB17 // MSHR stalls
|
||||
#define VX_CSR_MPM_L3CACHE_MSHR_ST_H 0xB97
|
||||
// PERF: memory
|
||||
#define VX_CSR_MPM_MEM_READS 0xB18 // total reads
|
||||
#define VX_CSR_MPM_MEM_READS_H 0xB98
|
||||
#define VX_CSR_MPM_MEM_WRITES 0xB19 // total writes
|
||||
#define VX_CSR_MPM_MEM_WRITES_H 0xB99
|
||||
#define VX_CSR_MPM_MEM_LT 0xB1A // memory latency
|
||||
#define VX_CSR_MPM_MEM_LT_H 0xB9A
|
||||
// PERF: smem
|
||||
#define VX_CSR_MPM_SMEM_READS 0xB1B // memory reads
|
||||
#define VX_CSR_MPM_SMEM_READS_H 0xB9B
|
||||
#define VX_CSR_MPM_SMEM_WRITES 0xB1C // memory writes
|
||||
#define VX_CSR_MPM_SMEM_WRITES_H 0xB9C
|
||||
#define VX_CSR_MPM_SMEM_BANK_ST 0xB1D // bank conflicts
|
||||
#define VX_CSR_MPM_SMEM_BANK_ST_H 0xB9D
|
||||
|
||||
// Machine Information Registers
|
||||
|
||||
#define VX_CSR_MVENDORID 0xF11
|
||||
#define VX_CSR_MARCHID 0xF12
|
||||
#define VX_CSR_MIMPID 0xF13
|
||||
#define VX_CSR_MHARTID 0xF14
|
||||
|
||||
// GPGPU CSRs
|
||||
|
||||
#define VX_CSR_THREAD_ID 0xCC0
|
||||
#define VX_CSR_WARP_ID 0xCC1
|
||||
#define VX_CSR_CORE_ID 0xCC2
|
||||
#define VX_CSR_WARP_MASK 0xCC3
|
||||
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
|
||||
#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom
|
||||
|
||||
#define VX_CSR_NUM_THREADS 0xFC0
|
||||
#define VX_CSR_NUM_WARPS 0xFC1
|
||||
#define VX_CSR_NUM_CORES 0xFC2
|
||||
|
||||
#endif // VX_TYPES_VH
|
||||
@@ -136,6 +136,19 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
||||
asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
|
||||
}
|
||||
|
||||
// Spawn an explicit warp mask. The current warp bit is ignored by hardware.
|
||||
// Emits an R-type custom instruction on the RISCV_CUSTOM0 opcode
// (funct3 = 6, funct7 = 0, rd = x0) with rs1 = warp_mask and rs2 = func_ptr.
//   warp_mask: one bit per warp to start
//   func_ptr:  entry point handed to the spawned warps
inline void vx_wspawn_mask(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
    asm volatile (
        ".insn r %0, 6, 0, x0, %1, %2"
        :
        : "i"(RISCV_CUSTOM0), "r"(warp_mask), "r"(func_ptr)
    );
}
|
||||
|
||||
// Spawn func_ptr on scalar warps only: bits of warp_mask at or above
// NUM_SCALAR_WARPS are cleared before the request is passed to vx_wspawn_mask.
inline void vx_spawn_scalar(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
    const unsigned scalar_bits = (1u << NUM_SCALAR_WARPS) - 1u;
    vx_wspawn_mask(warp_mask & scalar_bits, func_ptr);
}
|
||||
|
||||
// Spawn func_ptr on tensor warps only: warp_mask is restricted to the
// NUM_TENSOR_WARPS bits that start at bit NUM_SCALAR_WARPS before the request
// is passed to vx_wspawn_mask.
inline void vx_spawn_tensor(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
    const unsigned tensor_bits = ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS;
    vx_wspawn_mask(warp_mask & tensor_bits, func_ptr);
}
|
||||
|
||||
// Split on a predicate
|
||||
inline unsigned vx_split(unsigned predicate) {
|
||||
unsigned ret;
|
||||
@@ -149,8 +162,36 @@ inline void vx_join(unsigned stack_ptr) {
|
||||
}
|
||||
|
||||
// Warp Barrier
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
||||
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
|
||||
unsigned scalar_warps = (num_warps > NUM_SCALAR_WARPS) ? NUM_SCALAR_WARPS : num_warps;
|
||||
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(scalar_warps));
|
||||
}
|
||||
|
||||
#define VX_BARRIER_DOMAIN_SHIFT 28
|
||||
#define VX_BARRIER_DOMAIN_ALL 0u
|
||||
#define VX_BARRIER_DOMAIN_SCALAR 1u
|
||||
#define VX_BARRIER_DOMAIN_TENSOR 2u
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_domain(unsigned barrier_id, unsigned num_warps, unsigned domain) {
|
||||
unsigned encoded_id = barrier_id | (domain << VX_BARRIER_DOMAIN_SHIFT);
|
||||
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(encoded_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_scalar(unsigned barrier_id, unsigned num_warps) {
|
||||
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_SCALAR);
|
||||
}
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_tensor(unsigned barrier_id, unsigned num_warps) {
|
||||
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_TENSOR);
|
||||
}
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_mask(unsigned barrier_id, unsigned warp_mask) {
|
||||
asm volatile (".insn r %0, 7, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(warp_mask));
|
||||
}
|
||||
|
||||
// Return current thread identifier
|
||||
@@ -202,6 +243,22 @@ inline int vx_num_warps() {
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline int vx_num_scalar_warps() {
|
||||
return NUM_SCALAR_WARPS;
|
||||
}
|
||||
|
||||
inline int vx_num_tensor_warps() {
|
||||
return NUM_TENSOR_WARPS;
|
||||
}
|
||||
|
||||
inline unsigned vx_scalar_warp_mask() {
|
||||
return (1u << NUM_SCALAR_WARPS) - 1u;
|
||||
}
|
||||
|
||||
inline unsigned vx_tensor_warp_mask() {
|
||||
return ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS;
|
||||
}
|
||||
|
||||
// Return the number of cores per cluster
|
||||
inline int vx_num_cores() {
|
||||
int ret;
|
||||
|
||||
@@ -17,6 +17,10 @@
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifndef CORES_PER_CLUSTER
|
||||
#define CORES_PER_CLUSTER 8
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -48,6 +52,7 @@ void vx_wspawn_wait();
|
||||
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
|
||||
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg);
|
||||
|
||||
void vx_serial(vx_serial_cb callback, void * arg);
|
||||
|
||||
@@ -74,18 +74,9 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
|
||||
int cid = vx_core_id();
|
||||
int tid = vx_thread_id();
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
int task_id = p_wspawn_args->offset + tid;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
|
||||
int NT = vx_num_threads();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int cid = vx_core_id();
|
||||
int wid = vx_warp_id();
|
||||
int tid = vx_thread_id();
|
||||
@@ -103,6 +94,60 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() {
|
||||
int NT = vx_num_threads();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int cid = vx_core_id();
|
||||
int wid = vx_warp_id();
|
||||
int tid = vx_thread_id();
|
||||
|
||||
const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
|
||||
// round-robin warp_id allocation across cores in cluster
|
||||
const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
|
||||
int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs);
|
||||
int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid);
|
||||
|
||||
vx_spawn_tasks_cb callback = p_wspawn_args->callback;
|
||||
void* arg = p_wspawn_args->arg;
|
||||
|
||||
// sequential iterations
|
||||
for (int wave_id = 0; wave_id < waves; ++wave_id) {
|
||||
int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER);
|
||||
callback(task_id, arg);
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
|
||||
int cid = vx_core_id();
|
||||
int tid = vx_thread_id();
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
int task_id = p_wspawn_args->offset + tid;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() {
|
||||
int NT = vx_num_threads();
|
||||
int cid = vx_core_id();
|
||||
int tid = vx_thread_id();
|
||||
int wid = vx_warp_id();
|
||||
|
||||
const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
|
||||
// round-robin warp_id allocation across cores in cluster
|
||||
const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
// FIXME: This assumes that all cores but the last one are working with full
|
||||
// warps, and only the last core has a partially-filled warp.
|
||||
int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid);
|
||||
|
||||
int task_id = offset;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
@@ -111,11 +156,21 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
|
||||
spawn_tasks_contiguous_all_stub();
|
||||
|
||||
// disable warp
|
||||
// deadlock here on warps 1, 2, 3
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
|
||||
// Warp entry point passed to vx_wspawn by vx_spawn_tasks_cluster: enables the
// warp's threads, runs the cluster stub, then turns the warp off again.
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() {
  vx_tmc(-1);                      // activate all threads
  spawn_tasks_cluster_all_stub();  // run this warp's share of the tasks
  vx_tmc_zero();                   // disable warp
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
@@ -126,10 +181,115 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
// This function runs in every core, but with only 1 warp and 1 thread enabled.
|
||||
// The logic in this function figures out how many warps/threads this particular
|
||||
// core has to enable to fulfill an entire grid of computation.
|
||||
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
|
||||
// device specs
|
||||
const int NC = vx_num_cores();
|
||||
const int NW = NUM_SCALAR_WARPS;
|
||||
const int NT = vx_num_threads();
|
||||
// NOTE: assumes divisible
|
||||
const int num_cluster = NC / CORES_PER_CLUSTER;
|
||||
|
||||
// current core id
|
||||
int core_id = vx_core_id();
|
||||
if (core_id >= NUM_CORES_MAX)
|
||||
return;
|
||||
const int cluster_id = core_id / CORES_PER_CLUSTER;
|
||||
const int core_id_in_cluster = core_id % CORES_PER_CLUSTER;
|
||||
|
||||
// try to fill up full clusters first
|
||||
const int num_threads_in_cluster = CORES_PER_CLUSTER * NW * NT;
|
||||
const int num_used_clusters =
|
||||
(num_tasks + (num_threads_in_cluster - 1)) / num_threads_in_cluster;
|
||||
if (cluster_id >= num_used_clusters) {
|
||||
return; // terminate extra clusters
|
||||
}
|
||||
// fill up the last cluster with remaining tasks
|
||||
const int num_full_clusters = num_tasks / num_threads_in_cluster;
|
||||
int num_tasks_this_cluster = num_threads_in_cluster;
|
||||
if (cluster_id >= num_full_clusters) {
|
||||
num_tasks_this_cluster = num_tasks % num_threads_in_cluster;
|
||||
}
|
||||
|
||||
// Distribute threads equally across as many cores as possible, even if they
|
||||
// don't fill up NW*NT in a single core. This makes sure the warps get evenly
|
||||
// distributed in a single cluster
|
||||
//
|
||||
// TODO: Try to contain in a single cluster if possible?
|
||||
const int num_active_cores = (num_tasks + (NT - 1)) / NT;
|
||||
if (core_id >= num_active_cores)
|
||||
return; // terminate extra cores
|
||||
|
||||
const int num_full_warps_this_cluster = num_tasks_this_cluster / NT;
|
||||
const int rem_threads_in_last_warp = num_tasks_this_cluster % NT;
|
||||
// const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT;
|
||||
|
||||
int num_warps_this_core = num_full_warps_this_cluster / CORES_PER_CLUSTER;
|
||||
const int num_warps_in_last_row = num_full_warps_this_cluster % CORES_PER_CLUSTER;
|
||||
if (core_id_in_cluster < num_warps_in_last_row) {
|
||||
num_warps_this_core++;
|
||||
}
|
||||
// if 0, last warp is full-threads enabled
|
||||
int rem_threads_in_last_warp_this_core = 0;
|
||||
if (rem_threads_in_last_warp != 0) {
|
||||
if (core_id_in_cluster == num_warps_in_last_row - 1) {
|
||||
rem_threads_in_last_warp_this_core = rem_threads_in_last_warp;
|
||||
}
|
||||
}
|
||||
|
||||
// sequential iterations
|
||||
const int num_full_waves = num_warps_this_core / NW;
|
||||
const int rem_full_warps_in_last_wave = num_warps_this_core % NW;
|
||||
|
||||
const int offset = cluster_id * num_tasks_this_cluster;
|
||||
wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
|
||||
rem_full_warps_in_last_wave};
|
||||
g_wspawn_args[core_id] = &wspawn_args;
|
||||
|
||||
if (num_warps_this_core > 0) {
|
||||
// execute callback on other warps
|
||||
const int nw = MIN(num_warps_this_core, NW);
|
||||
vx_wspawn(nw, spawn_tasks_cluster_all_cb);
|
||||
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// call stub routine
|
||||
spawn_tasks_cluster_all_stub();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
|
||||
// wait for spawn warps to terminate
|
||||
vx_wspawn_wait();
|
||||
}
|
||||
|
||||
// TODO: this is incomplete
|
||||
// TODO: Instead of launching an additional wave just to work on remaining
|
||||
// threads, handle this in the last wave amongst other full warps.
|
||||
if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) {
|
||||
// adjust offset
|
||||
// FIXME: use rem_threads_in_last_warp_this_core
|
||||
wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp);
|
||||
|
||||
// activate remaining threads
|
||||
const int tmask = (1 << rem_threads_in_last_warp) - 1;
|
||||
vx_tmc(tmask);
|
||||
|
||||
// call stub routine
|
||||
spawn_tasks_cluster_rem_stub();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
}
|
||||
|
||||
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
|
||||
// device specs
|
||||
int NC = vx_num_cores();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int NT = vx_num_threads();
|
||||
|
||||
// current core id
|
||||
@@ -179,7 +339,6 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
|
||||
vx_tmc_one();
|
||||
|
||||
// wait for spawn warps to terminate
|
||||
// deadlock here on warp 0!
|
||||
vx_wspawn_wait();
|
||||
}
|
||||
|
||||
@@ -202,7 +361,7 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
|
||||
// device specs
|
||||
int NC = vx_num_cores();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int NT = vx_num_threads();
|
||||
|
||||
// current core id
|
||||
@@ -356,7 +515,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
|
||||
|
||||
// device specs
|
||||
int NC = vx_num_cores();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int NT = vx_num_threads();
|
||||
|
||||
// current core id
|
||||
|
||||
@@ -22,9 +22,9 @@
|
||||
_start:
|
||||
|
||||
# initialize per-thread registers
|
||||
csrr t0, VX_CSR_NUM_WARPS # get num warps
|
||||
li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask
|
||||
la t1, init_regs_all
|
||||
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
|
||||
.insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1
|
||||
li t0, -1
|
||||
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
|
||||
jal init_regs
|
||||
@@ -35,9 +35,9 @@ _start:
|
||||
jal vx_wspawn_wait
|
||||
|
||||
# initialize TLS for all warps
|
||||
csrr t0, VX_CSR_NUM_WARPS # get num warps
|
||||
li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask
|
||||
la t1, init_tls_all
|
||||
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
|
||||
.insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1
|
||||
li t0, -1
|
||||
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
|
||||
call __init_tls
|
||||
@@ -102,6 +102,8 @@ init_regs:
|
||||
#endif
|
||||
csrr t0, VX_CSR_MHARTID
|
||||
sll t1, t0, STACK_LOG2_SIZE
|
||||
sll t2, t0, 4
|
||||
add t1, t1, t2
|
||||
sub sp, sp, t1
|
||||
|
||||
# set thread pointer register
|
||||
|
||||
Reference in New Issue
Block a user