Implement WU architecture support

2026-05-25 19:25:05 +08:00
parent 323ed7d7e9
commit 0ad87bde81
35 changed files with 3303 additions and 472 deletions
--- a/kernel/include/VX_config.h
+++ b/kernel/include/VX_config.h
@@ -0,0 +1,701 @@
+// auto-generated by gen_config.py. DO NOT EDIT
+// Generated at 2024-05-07 13:55:58.398687
+
+// Translated from ./rtl/VX_config.vh:
+
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VX_CONFIG_VH
+#define VX_CONFIG_VH
+
+#ifndef MIN
+#define MIN(x, y)   (((x) < (y)) ? (x) : (y))
+#endif
+
+#ifndef MAX
+#define MAX(x, y)   (((x) > (y)) ? (x) : (y))
+#endif
+
+#ifndef CLAMP
+#define CLAMP(x, lo, hi)   (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
+#endif
+
+#ifndef UP
+#define UP(x)   (((x) != 0) ? (x) : 1)
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef EXT_M_DISABLE
+#define EXT_M_ENABLE
+#endif
+
+#ifndef EXT_F_DISABLE
+#define EXT_F_ENABLE
+#endif
+
+#ifndef XLEN_32
+#ifndef XLEN_64
+#define XLEN_32
+#endif
+#endif
+
+#ifdef XLEN_64
+#define XLEN 64
+#endif
+
+#ifdef XLEN_32
+#define XLEN 32
+#endif
+
+#ifdef EXT_D_ENABLE
+#define FLEN_64
+#else
+#define FLEN_32
+#endif
+
+#ifdef FLEN_64
+#define FLEN 64
+#endif
+
+#ifdef FLEN_32
+#define FLEN 32
+#endif
+
+#ifdef XLEN_64
+#ifdef FLEN_32
+    #define FPU_RV64F
+#endif
+#endif
+
+#ifndef NUM_CLUSTERS
+#define NUM_CLUSTERS 1
+#endif
+
+#ifndef NUM_CORES
+#define NUM_CORES 1
+#endif
+
+#ifndef NUM_WARPS
+#define NUM_WARPS 4
+#endif
+
+#ifndef NUM_TENSOR_WARPS
+#define NUM_TENSOR_WARPS 2
+#endif
+
+#define NUM_SCALAR_WARPS (NUM_WARPS - NUM_TENSOR_WARPS) // warps left after the tensor warps; assumes NUM_TENSOR_WARPS <= NUM_WARPS (not checked here)
+
+#define IS_SCALAR_WARP(wid) ((wid) < NUM_SCALAR_WARPS) // true for warp ids in [0, NUM_SCALAR_WARPS)
+#define IS_TENSOR_WARP(wid) ((wid) >= NUM_SCALAR_WARPS) // true for warp ids at or above NUM_SCALAR_WARPS
+
+#ifndef TENSOR_NUM_GPRS
+#define TENSOR_NUM_GPRS 8
+#endif
+
+#ifndef TENSOR_NUM_FPRS
+#define TENSOR_NUM_FPRS 8
+#endif
+
+#ifndef NUM_THREADS
+#define NUM_THREADS 4
+#endif
+
+#ifndef NUM_BARRIERS
+#define NUM_BARRIERS 8
+#endif
+
+#ifndef SOCKET_SIZE
+#define SOCKET_SIZE MIN(4, NUM_CORES)
+#endif
+#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
+
+#ifdef L2_ENABLE
+    #define L2_ENABLED   1
+#else
+    #define L2_ENABLED   0
+#endif
+
+#ifdef L3_ENABLE
+    #define L3_ENABLED   1
+#else
+    #define L3_ENABLED   0
+#endif
+
+#ifdef L1_DISABLE
+    #define ICACHE_DISABLE
+    #define DCACHE_DISABLE
+#endif
+
+#ifndef MEM_BLOCK_SIZE
+#define MEM_BLOCK_SIZE 64
+#endif
+
+#ifndef MEM_ADDR_WIDTH
+#ifdef XLEN_64
+#define MEM_ADDR_WIDTH 48
+#else
+#define MEM_ADDR_WIDTH 32
+#endif
+#endif
+
+#ifndef L1_LINE_SIZE
+#ifdef L1_DISABLE
+#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 4 : MEM_BLOCK_SIZE)
+#else
+#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 16 : MEM_BLOCK_SIZE)
+#endif
+#endif
+
+#ifdef L2_ENABLE
+#define L2_LINE_SIZE MEM_BLOCK_SIZE
+#else
+#define L2_LINE_SIZE L1_LINE_SIZE
+#endif
+
+#ifdef L3_ENABLE
+#define L3_LINE_SIZE MEM_BLOCK_SIZE
+#else
+#define L3_LINE_SIZE L2_LINE_SIZE
+#endif
+
+#ifdef XLEN_64
+
+#ifndef STARTUP_ADDR
+#define STARTUP_ADDR 0x180000000
+#endif
+
+#ifndef STACK_BASE_ADDR
+#define STACK_BASE_ADDR 0x1FF000000
+#endif
+
+#else
+
+#ifndef STARTUP_ADDR
+#define STARTUP_ADDR 0x80000000
+#endif
+
+#ifndef STACK_BASE_ADDR
+#define STACK_BASE_ADDR 0xFF000000
+#endif
+
+#endif
+
+#ifndef SMEM_BASE_ADDR
+#define SMEM_BASE_ADDR STACK_BASE_ADDR
+#endif
+
+#ifndef SMEM_LOG_SIZE
+#define SMEM_LOG_SIZE   19
+#endif
+
+#ifndef IO_BASE_ADDR
+#define IO_BASE_ADDR (SMEM_BASE_ADDR + (1 << SMEM_LOG_SIZE))
+#endif
+
+#ifndef IO_COUT_ADDR
+#define IO_COUT_ADDR IO_BASE_ADDR
+#endif
+#define IO_COUT_SIZE MEM_BLOCK_SIZE
+
+#ifndef IO_CSR_ADDR
+#define IO_CSR_ADDR (IO_COUT_ADDR + IO_COUT_SIZE)
+#endif
+#define IO_CSR_SIZE (4 * 64 * NUM_CORES * NUM_CLUSTERS)
+
+#ifndef STACK_LOG2_SIZE
+#define STACK_LOG2_SIZE 13
+#endif
+#define STACK_SIZE (1 << STACK_LOG2_SIZE)
+
+#define RESET_DELAY 8
+
+#ifndef STALL_TIMEOUT
+#define STALL_TIMEOUT (100000) // RTL source is 100000 * (1 ** (L2_ENABLED + L3_ENABLED)); '**' is Verilog power (1 ** n == 1) and is not valid C
+#endif
+
+#ifndef SV_DPI
+#define DPI_DISABLE
+#endif
+
+#ifndef FPU_FPNEW
+#ifndef FPU_DSP
+#ifndef FPU_DPI
+#ifndef SYNTHESIS
+#ifndef DPI_DISABLE
+#define FPU_DPI
+#else
+#define FPU_DSP
+#endif
+#else
+#define FPU_DSP
+#endif
+#endif
+#endif
+#endif
+
+#ifndef SYNTHESIS
+#ifndef DPI_DISABLE
+#define IMUL_DPI
+#define IDIV_DPI
+#endif
+#endif
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 3
+#endif
+
+// Pipeline Configuration /////////////////////////////////////////////////////
+
+// Issue width
+#ifndef ISSUE_WIDTH
+#define ISSUE_WIDTH     NUM_WARPS
+#endif
+
+// Number of ALU units
+#ifndef NUM_ALU_LANES
+#define NUM_ALU_LANES   NUM_THREADS
+#endif
+#ifndef NUM_ALU_BLOCKS
+#define NUM_ALU_BLOCKS  4
+#endif
+
+// Number of FPU units
+#ifndef NUM_FPU_LANES
+#define NUM_FPU_LANES   NUM_THREADS
+#endif
+#ifndef NUM_FPU_BLOCKS
+#define NUM_FPU_BLOCKS  2
+#endif
+
+// Number of LSU units
+#ifndef NUM_LSU_LANES
+#define NUM_LSU_LANES   NUM_THREADS
+#endif
+
+// Number of SFU units
+#ifndef NUM_SFU_LANES
+#define NUM_SFU_LANES   MIN(NUM_THREADS, 4)
+#endif
+
+// Size of Instruction Buffer
+#ifndef IBUF_SIZE
+#define IBUF_SIZE   (4 * ISSUE_WIDTH)
+#endif
+
+// Size of LSU Request Queue
+#ifndef LSUQ_SIZE
+#define LSUQ_SIZE   (4 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES))
+#endif
+
+// LSU Duplicate Address Check
+#ifndef LSU_DUP_DISABLE
+#define LSU_DUP_ENABLE
+#endif
+#ifdef LSU_DUP_ENABLE
+#define LSU_DUP_ENABLED 1
+#else
+#define LSU_DUP_ENABLED 0
+#endif
+
+#ifdef GBAR_ENABLE
+#define GBAR_ENABLED 1
+#else
+#define GBAR_ENABLED 0
+#endif
+
+#ifndef LATENCY_IMUL
+#ifdef VIVADO
+#define LATENCY_IMUL 4
+#endif
+#ifdef QUARTUS
+#define LATENCY_IMUL 3
+#endif
+#ifndef LATENCY_IMUL
+#define LATENCY_IMUL 4
+#endif
+#endif
+
+// Floating-Point Units ///////////////////////////////////////////////////////
+
+// Size of FPU Request Queue
+#ifndef FPUQ_SIZE
+#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
+#endif
+
+// FNCP Latency
+#ifndef LATENCY_FNCP
+#define LATENCY_FNCP 2
+#endif
+
+// FMA Latency
+#ifndef LATENCY_FMA
+#ifdef FPU_DPI
+#define LATENCY_FMA 4    
+#endif
+#ifdef FPU_FPNEW
+#define LATENCY_FMA 4    
+#endif
+#ifdef FPU_DSP
+#ifdef QUARTUS
+#define LATENCY_FMA 4
+#endif
+#ifdef VIVADO
+#define LATENCY_FMA 16    
+#endif
+#ifndef LATENCY_FMA
+#define LATENCY_FMA 4    
+#endif
+#endif
+#endif
+
+// FDIV Latency
+#ifndef LATENCY_FDIV
+#ifdef FPU_DPI
+#define LATENCY_FDIV 15    
+#endif
+#ifdef FPU_FPNEW
+#define LATENCY_FDIV 16    
+#endif
+#ifdef FPU_DSP
+#ifdef QUARTUS
+#define LATENCY_FDIV 15
+#endif
+#ifdef VIVADO
+#define LATENCY_FDIV 28    
+#endif
+#ifndef LATENCY_FDIV
+#define LATENCY_FDIV 16
+#endif
+#endif
+#endif
+
+// FSQRT Latency
+#ifndef LATENCY_FSQRT
+#ifdef FPU_DPI
+#define LATENCY_FSQRT 10    
+#endif
+#ifdef FPU_FPNEW
+#define LATENCY_FSQRT 16    
+#endif
+#ifdef FPU_DSP
+#ifdef QUARTUS
+#define LATENCY_FSQRT 10
+#endif
+#ifdef VIVADO
+#define LATENCY_FSQRT 28    
+#endif
+#ifndef LATENCY_FSQRT
+#define LATENCY_FSQRT 16    
+#endif
+#endif
+#endif
+
+// FCVT Latency
+#ifndef LATENCY_FCVT
+#define LATENCY_FCVT 5
+#endif
+
+// Icache Configurable Knobs //////////////////////////////////////////////////
+
+// Cache Enable
+#ifndef ICACHE_DISABLE
+#define ICACHE_ENABLE
+#endif
+#ifdef ICACHE_ENABLE
+    #define ICACHE_ENABLED 1
+#else
+    #define ICACHE_ENABLED 0
+    #define NUM_ICACHES 0
+#endif
+
+// Number of Cache Units
+#ifndef NUM_ICACHES
+#define NUM_ICACHES UP(SOCKET_SIZE / 4)
+#endif
+
+// Cache Size
+#ifndef ICACHE_SIZE
+#define ICACHE_SIZE 16384
+#endif
+
+// Core Response Queue Size
+#ifndef ICACHE_CRSQ_SIZE
+#define ICACHE_CRSQ_SIZE 2
+#endif
+
+// Miss Handling Register Size
+#ifndef ICACHE_MSHR_SIZE
+#define ICACHE_MSHR_SIZE 16
+#endif
+
+// Memory Request Queue Size
+#ifndef ICACHE_MREQ_SIZE
+#define ICACHE_MREQ_SIZE 4
+#endif
+
+// Memory Response Queue Size
+#ifndef ICACHE_MRSQ_SIZE
+#define ICACHE_MRSQ_SIZE 0
+#endif
+
+// Number of Associative Ways
+#ifndef ICACHE_NUM_WAYS
+#define ICACHE_NUM_WAYS 1
+#endif
+
+// Dcache Configurable Knobs //////////////////////////////////////////////////
+
+// Cache Enable
+#ifndef DCACHE_DISABLE
+#define DCACHE_ENABLE
+#endif
+#ifdef DCACHE_ENABLE
+    #define DCACHE_ENABLED 1
+#else
+    #define DCACHE_ENABLED 0
+    #define NUM_DCACHES 0
+    #define DCACHE_NUM_BANKS 1
+#endif
+
+// Number of Cache Units
+#ifndef NUM_DCACHES
+#define NUM_DCACHES UP(SOCKET_SIZE / 4)
+#endif
+
+// Cache Size
+#ifndef DCACHE_SIZE
+#define DCACHE_SIZE 16384
+#endif
+
+// Number of Banks
+#ifndef DCACHE_NUM_BANKS
+#define DCACHE_NUM_BANKS NUM_LSU_LANES
+#endif
+
+// Core Response Queue Size
+#ifndef DCACHE_CRSQ_SIZE
+#define DCACHE_CRSQ_SIZE 2
+#endif
+
+// Miss Handling Register Size
+#ifndef DCACHE_MSHR_SIZE
+#define DCACHE_MSHR_SIZE 8
+#endif
+
+// Memory Request Queue Size
+#ifndef DCACHE_MREQ_SIZE
+#define DCACHE_MREQ_SIZE 4
+#endif
+
+// Memory Response Queue Size
+#ifndef DCACHE_MRSQ_SIZE
+#define DCACHE_MRSQ_SIZE 0
+#endif
+
+// Number of Associative Ways
+#ifndef DCACHE_NUM_WAYS
+#define DCACHE_NUM_WAYS 1
+#endif
+
+// SM Configurable Knobs //////////////////////////////////////////////////////
+
+#ifndef SM_DISABLE
+#define SM_ENABLE
+#endif
+
+#ifdef SM_ENABLE
+    #define SM_ENABLED   1
+#else
+    #define SM_ENABLED   0
+    #define SMEM_NUM_BANKS 1
+#endif
+
+// Number of Banks
+#ifndef SMEM_NUM_BANKS
+#define SMEM_NUM_BANKS (NUM_LSU_LANES)
+#endif
+
+// L2cache Configurable Knobs /////////////////////////////////////////////////
+
+// Cache Size
+#ifndef L2_CACHE_SIZE
+#ifdef ALTERA_S10
+#define L2_CACHE_SIZE 2097152
+#else
+#define L2_CACHE_SIZE 1048576
+#endif
+#endif
+
+// Number of Banks
+#ifndef L2_NUM_BANKS
+#define L2_NUM_BANKS MIN(4, NUM_SOCKETS)
+#endif
+
+// Core Response Queue Size
+#ifndef L2_CRSQ_SIZE
+#define L2_CRSQ_SIZE 2
+#endif
+
+// Miss Handling Register Size
+#ifndef L2_MSHR_SIZE
+#define L2_MSHR_SIZE 16
+#endif
+
+// Memory Request Queue Size
+#ifndef L2_MREQ_SIZE
+#define L2_MREQ_SIZE 4
+#endif
+
+// Memory Response Queue Size
+#ifndef L2_MRSQ_SIZE
+#define L2_MRSQ_SIZE 0
+#endif
+
+// Number of Associative Ways
+#ifndef L2_NUM_WAYS
+#define L2_NUM_WAYS 2
+#endif
+
+// L3cache Configurable Knobs /////////////////////////////////////////////////
+
+// Cache Size
+#ifndef L3_CACHE_SIZE
+#ifdef ALTERA_S10
+#define L3_CACHE_SIZE 2097152
+#else
+#define L3_CACHE_SIZE 1048576
+#endif
+#endif
+
+// Number of Banks
+#ifndef L3_NUM_BANKS
+#define L3_NUM_BANKS MIN(4, NUM_CLUSTERS)
+#endif
+
+// Core Response Queue Size
+#ifndef L3_CRSQ_SIZE
+#define L3_CRSQ_SIZE 2
+#endif
+
+// Miss Handling Register Size
+#ifndef L3_MSHR_SIZE
+#define L3_MSHR_SIZE 16
+#endif
+
+// Memory Request Queue Size
+#ifndef L3_MREQ_SIZE
+#define L3_MREQ_SIZE 4
+#endif
+
+// Memory Response Queue Size
+#ifndef L3_MRSQ_SIZE
+#define L3_MRSQ_SIZE 0
+#endif
+
+// Number of Associative Ways
+#ifndef L3_NUM_WAYS
+#define L3_NUM_WAYS 4
+#endif
+
+// ISA Extensions /////////////////////////////////////////////////////////////
+
+#ifdef EXT_A_ENABLE
+    #define EXT_A_ENABLED   1
+#else
+    #define EXT_A_ENABLED   0
+#endif
+
+#ifdef EXT_C_ENABLE
+    #define EXT_C_ENABLED   1
+#else
+    #define EXT_C_ENABLED   0
+#endif
+
+#ifdef EXT_D_ENABLE
+    #define EXT_D_ENABLED   1
+#else
+    #define EXT_D_ENABLED   0
+#endif
+
+#ifdef EXT_F_ENABLE
+    #define EXT_F_ENABLED   1
+#else
+    #define EXT_F_ENABLED   0
+#endif
+
+#ifdef EXT_M_ENABLE
+    #define EXT_M_ENABLED   1
+#else
+    #define EXT_M_ENABLED   0
+#endif
+
+#define ISA_STD_A           0
+#define ISA_STD_C           2
+#define ISA_STD_D           3
+#define ISA_STD_E           4
+#define ISA_STD_F           5
+#define ISA_STD_H           7
+#define ISA_STD_I           8
+#define ISA_STD_N           13
+#define ISA_STD_Q           16
+#define ISA_STD_S           18
+#define ISA_STD_U           20
+
+#define ISA_EXT_ICACHE      0
+#define ISA_EXT_DCACHE      1
+#define ISA_EXT_L2CACHE     2
+#define ISA_EXT_L3CACHE     3
+#define ISA_EXT_SMEM        4
+
+#define MISA_EXT ((ICACHE_ENABLED  << ISA_EXT_ICACHE) /* one bit per enabled memory feature, at its ISA_EXT_* position */ \
+                | (DCACHE_ENABLED  << ISA_EXT_DCACHE) \
+                | (L2_ENABLED      << ISA_EXT_L2CACHE) \
+                | (L3_ENABLED      << ISA_EXT_L3CACHE) \
+                | (SM_ENABLED      << ISA_EXT_SMEM)) // outer parentheses keep the OR-chain a single operand inside larger expressions
+
+#define MISA_STD ((EXT_A_ENABLED <<  0) /* A - Atomic Instructions extension */ \
+                | (0 <<  1) /* B - Tentatively reserved for Bit operations extension */ \
+                | (EXT_C_ENABLED <<  2) /* C - Compressed extension */ \
+                | (EXT_D_ENABLED <<  3) /* D - Double precision floating-point extension */ \
+                | (0 <<  4) /* E - RV32E base ISA */ \
+                | (EXT_F_ENABLED << 5) /* F - Single precision floating-point extension */ \
+                | (0 <<  6) /* G - Additional standard extensions present */ \
+                | (0 <<  7) /* H - Hypervisor mode implemented */ \
+                | (1 <<  8) /* I - RV32I/64I/128I base ISA */ \
+                | (0 <<  9) /* J - Reserved */ \
+                | (0 << 10) /* K - Reserved */ \
+                | (0 << 11) /* L - Tentatively reserved for Decimal Floating-Point extension */ \
+                | (EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
+                | (0 << 13) /* N - User level interrupts supported */ \
+                | (0 << 14) /* O - Reserved */ \
+                | (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
+                | (0 << 16) /* Q - Quad-precision floating-point extension */ \
+                | (0 << 17) /* R - Reserved */ \
+                | (0 << 18) /* S - Supervisor mode implemented */ \
+                | (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
+                | (1 << 20) /* U - User mode implemented */ \
+                | (0 << 21) /* V - Tentatively reserved for Vector extension */ \
+                | (0 << 22) /* W - Reserved */ \
+                | (1 << 23) /* X - Non-standard extensions present */ \
+                | (0 << 24) /* Y - Reserved */ \
+                | (0 << 25)) /* Z - Reserved; the closing parenthesis makes the whole OR-chain one operand */
+
+// Device identification //////////////////////////////////////////////////////
+
+#define VENDOR_ID           0
+#define ARCHITECTURE_ID     0
+#define IMPLEMENTATION_ID   0
+
+#endif // VX_CONFIG_VH
--- a/kernel/include/VX_types.h
+++ b/kernel/include/VX_types.h
@@ -0,0 +1,193 @@
+// auto-generated by gen_config.py. DO NOT EDIT
+// Generated at 2024-06-15 00:25:12.935689
+
+// Translated from ./rtl/VX_types.vh:
+
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VX_TYPES_VH
+#define VX_TYPES_VH
+
+// Device configuration registers
+
+#define VX_CSR_ADDR_BITS                12
+#define VX_DCR_ADDR_BITS                12
+
+#define VX_DCR_BASE_STATE_BEGIN         0x001
+#define VX_DCR_BASE_STARTUP_ADDR0       0x001
+#define VX_DCR_BASE_STARTUP_ADDR1       0x002
+#define VX_DCR_BASE_MPM_CLASS           0x003
+#define VX_DCR_BASE_STATE_END           0x004
+
+#define VX_DCR_BASE_STATE(addr)         ((addr) - VX_DCR_BASE_STATE_BEGIN)
+#define VX_DCR_BASE_STATE_COUNT         (VX_DCR_BASE_STATE_END-VX_DCR_BASE_STATE_BEGIN)
+
+// Machine Performance-monitoring counters classes
+
+#define VX_DCR_MPM_CLASS_NONE           0           
+#define VX_DCR_MPM_CLASS_CORE           1
+#define VX_DCR_MPM_CLASS_MEM            2
+
+// User Floating-Point CSRs
+
+#define VX_CSR_FFLAGS                   0x001
+#define VX_CSR_FRM                      0x002
+#define VX_CSR_FCSR                     0x003
+ 
+#define VX_CSR_SATP                     0x180
+
+#define VX_CSR_PMPCFG0                  0x3A0
+#define VX_CSR_PMPADDR0                 0x3B0
+
+#define VX_CSR_MSTATUS                  0x300
+#define VX_CSR_MISA                     0x301
+#define VX_CSR_MEDELEG                  0x302
+#define VX_CSR_MIDELEG                  0x303
+#define VX_CSR_MIE                      0x304
+#define VX_CSR_MTVEC                    0x305
+
+#define VX_CSR_MEPC                     0x341
+
+#define VX_CSR_MNSTATUS                 0x744
+
+#define VX_CSR_MPM_BASE                 0xB00
+#define VX_CSR_MPM_BASE_H               0xB80
+#define VX_CSR_MPM_USER                 0xB03
+#define VX_CSR_MPM_USER_H               0xB83
+
+// Machine Performance-monitoring core counters
+// PERF: Standard
+#define VX_CSR_MCYCLE                   0xB00
+#define VX_CSR_MCYCLE_H                 0xB80
+#define VX_CSR_MPM_RESERVED             0xB01
+#define VX_CSR_MPM_RESERVED_H           0xB81
+#define VX_CSR_MINSTRET                 0xB02
+#define VX_CSR_MINSTRET_H               0xB82
+// PERF: pipeline
+#define VX_CSR_MPM_SCHED_ID             0xB03
+#define VX_CSR_MPM_SCHED_ID_H           0xB83
+#define VX_CSR_MPM_SCHED_ST             0xB04
+#define VX_CSR_MPM_SCHED_ST_H           0xB84
+#define VX_CSR_MPM_IBUF_ST              0xB05
+#define VX_CSR_MPM_IBUF_ST_H            0xB85
+#define VX_CSR_MPM_SCRB_ST              0xB06
+#define VX_CSR_MPM_SCRB_ST_H            0xB86
+#define VX_CSR_MPM_SCRB_ALU             0xB07
+#define VX_CSR_MPM_SCRB_ALU_H           0xB87
+#define VX_CSR_MPM_SCRB_FPU             0xB08
+#define VX_CSR_MPM_SCRB_FPU_H           0xB88
+#define VX_CSR_MPM_SCRB_LSU             0xB09
+#define VX_CSR_MPM_SCRB_LSU_H           0xB89
+#define VX_CSR_MPM_SCRB_SFU             0xB0A
+#define VX_CSR_MPM_SCRB_SFU_H           0xB8A
+// PERF: memory
+#define VX_CSR_MPM_IFETCHES             0xB0B
+#define VX_CSR_MPM_IFETCHES_H           0xB8B
+#define VX_CSR_MPM_LOADS                0xB0C
+#define VX_CSR_MPM_LOADS_H              0xB8C
+#define VX_CSR_MPM_STORES               0xB0D
+#define VX_CSR_MPM_STORES_H             0xB8D
+#define VX_CSR_MPM_IFETCH_LT            0xB0E
+#define VX_CSR_MPM_IFETCH_LT_H          0xB8E
+#define VX_CSR_MPM_LOAD_LT              0xB0F 
+#define VX_CSR_MPM_LOAD_LT_H            0xB8F
+// SFU: scoreboard
+#define VX_CSR_MPM_SCRB_WCTL            0xB10
+#define VX_CSR_MPM_SCRB_WCTL_H          0xB90
+#define VX_CSR_MPM_SCRB_CSRS            0xB11
+#define VX_CSR_MPM_SCRB_CSRS_H          0xB91
+
+// Machine Performance-monitoring memory counters
+// PERF: icache
+#define VX_CSR_MPM_ICACHE_READS         0xB03     // total reads
+#define VX_CSR_MPM_ICACHE_READS_H       0xB83
+#define VX_CSR_MPM_ICACHE_MISS_R        0xB04     // read misses
+#define VX_CSR_MPM_ICACHE_MISS_R_H      0xB84
+#define VX_CSR_MPM_ICACHE_MSHR_ST       0xB05     // MSHR stalls
+#define VX_CSR_MPM_ICACHE_MSHR_ST_H     0xB85
+// PERF: dcache
+#define VX_CSR_MPM_DCACHE_READS         0xB06     // total reads
+#define VX_CSR_MPM_DCACHE_READS_H       0xB86
+#define VX_CSR_MPM_DCACHE_WRITES        0xB07     // total writes
+#define VX_CSR_MPM_DCACHE_WRITES_H      0xB87
+#define VX_CSR_MPM_DCACHE_MISS_R        0xB08     // read misses
+#define VX_CSR_MPM_DCACHE_MISS_R_H      0xB88
+#define VX_CSR_MPM_DCACHE_MISS_W        0xB09     // write misses
+#define VX_CSR_MPM_DCACHE_MISS_W_H      0xB89
+#define VX_CSR_MPM_DCACHE_BANK_ST       0xB0A     // bank conflicts
+#define VX_CSR_MPM_DCACHE_BANK_ST_H     0xB8A
+#define VX_CSR_MPM_DCACHE_MSHR_ST       0xB0B     // MSHR stalls
+#define VX_CSR_MPM_DCACHE_MSHR_ST_H     0xB8B
+// PERF: l2cache
+#define VX_CSR_MPM_L2CACHE_READS        0xB0C     // total reads
+#define VX_CSR_MPM_L2CACHE_READS_H      0xB8C
+#define VX_CSR_MPM_L2CACHE_WRITES       0xB0D     // total writes
+#define VX_CSR_MPM_L2CACHE_WRITES_H     0xB8D
+#define VX_CSR_MPM_L2CACHE_MISS_R       0xB0E     // read misses
+#define VX_CSR_MPM_L2CACHE_MISS_R_H     0xB8E
+#define VX_CSR_MPM_L2CACHE_MISS_W       0xB0F     // write misses
+#define VX_CSR_MPM_L2CACHE_MISS_W_H     0xB8F
+#define VX_CSR_MPM_L2CACHE_BANK_ST      0xB10     // bank conflicts
+#define VX_CSR_MPM_L2CACHE_BANK_ST_H    0xB90
+#define VX_CSR_MPM_L2CACHE_MSHR_ST      0xB11     // MSHR stalls
+#define VX_CSR_MPM_L2CACHE_MSHR_ST_H    0xB91
+// PERF: l3cache
+#define VX_CSR_MPM_L3CACHE_READS        0xB12     // total reads
+#define VX_CSR_MPM_L3CACHE_READS_H      0xB92
+#define VX_CSR_MPM_L3CACHE_WRITES       0xB13     // total writes
+#define VX_CSR_MPM_L3CACHE_WRITES_H     0xB93
+#define VX_CSR_MPM_L3CACHE_MISS_R       0xB14     // read misses
+#define VX_CSR_MPM_L3CACHE_MISS_R_H     0xB94
+#define VX_CSR_MPM_L3CACHE_MISS_W       0xB15     // write misses
+#define VX_CSR_MPM_L3CACHE_MISS_W_H     0xB95
+#define VX_CSR_MPM_L3CACHE_BANK_ST      0xB16     // bank conflicts
+#define VX_CSR_MPM_L3CACHE_BANK_ST_H    0xB96
+#define VX_CSR_MPM_L3CACHE_MSHR_ST      0xB17     // MSHR stalls
+#define VX_CSR_MPM_L3CACHE_MSHR_ST_H    0xB97
+// PERF: memory
+#define VX_CSR_MPM_MEM_READS            0xB18     // total reads
+#define VX_CSR_MPM_MEM_READS_H          0xB98
+#define VX_CSR_MPM_MEM_WRITES           0xB19     // total writes
+#define VX_CSR_MPM_MEM_WRITES_H         0xB99
+#define VX_CSR_MPM_MEM_LT               0xB1A     // memory latency
+#define VX_CSR_MPM_MEM_LT_H             0xB9A
+// PERF: smem
+#define VX_CSR_MPM_SMEM_READS           0xB1B     // memory reads
+#define VX_CSR_MPM_SMEM_READS_H         0xB9B
+#define VX_CSR_MPM_SMEM_WRITES          0xB1C     // memory writes
+#define VX_CSR_MPM_SMEM_WRITES_H        0xB9C
+#define VX_CSR_MPM_SMEM_BANK_ST         0xB1D     // bank conflicts
+#define VX_CSR_MPM_SMEM_BANK_ST_H       0xB9D
+
+// Machine Information Registers
+
+#define VX_CSR_MVENDORID                0xF11
+#define VX_CSR_MARCHID                  0xF12
+#define VX_CSR_MIMPID                   0xF13
+#define VX_CSR_MHARTID                  0xF14
+
+// GPGPU CSRs
+
+#define VX_CSR_THREAD_ID                0xCC0
+#define VX_CSR_WARP_ID                  0xCC1
+#define VX_CSR_CORE_ID                  0xCC2
+#define VX_CSR_WARP_MASK                0xCC3
+#define VX_CSR_THREAD_MASK              0xCC4     // warning! this value is also used in LLVM
+#define VX_CSR_GCID                     0xCC5     // legacy global core id alias used by Radiance bootrom
+
+#define VX_CSR_NUM_THREADS              0xFC0
+#define VX_CSR_NUM_WARPS                0xFC1
+#define VX_CSR_NUM_CORES                0xFC2
+
+#endif // VX_TYPES_VH
--- a/kernel/include/vx_intrinsics.h
+++ b/kernel/include/vx_intrinsics.h
@@ -136,6 +136,19 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
    asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
 }

+// Spawn an explicit warp mask. The current warp bit is ignored by hardware.
+inline void vx_wspawn_mask(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
+    asm volatile (".insn r %0, 6, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(warp_mask), "r"(func_ptr)); // R-type on RISCV_CUSTOM0: funct3=6, funct7=0, rd=x0, rs1=warp_mask, rs2=func_ptr
+}
+
+inline void vx_spawn_scalar(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
+    vx_wspawn_mask(warp_mask & ((1u << NUM_SCALAR_WARPS) - 1u), func_ptr); // clear every bit at or above NUM_SCALAR_WARPS, then spawn what remains
+}
+
+inline void vx_spawn_tensor(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
+    vx_wspawn_mask(warp_mask & (((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS), func_ptr); // keep only the NUM_TENSOR_WARPS bits starting at bit NUM_SCALAR_WARPS
+}
+
 // Split on a predicate
 inline unsigned vx_split(unsigned predicate) {
    unsigned ret;
@@ -149,8 +162,36 @@ inline void vx_join(unsigned stack_ptr) {
 }

 // Warp Barrier
+__attribute__((convergent))
 inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
-    asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
+    unsigned scalar_warps = (num_warps > NUM_SCALAR_WARPS) ? NUM_SCALAR_WARPS : num_warps; // clamp the requested count to NUM_SCALAR_WARPS
+    asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(scalar_warps)); // funct3=4: rs1=barrier id, rs2=clamped warp count
+}
+
+#define VX_BARRIER_DOMAIN_SHIFT 28 // bit position at which vx_barrier_domain() ORs the domain code into the barrier id
+#define VX_BARRIER_DOMAIN_ALL    0u // leaves the barrier id unchanged when encoded
+#define VX_BARRIER_DOMAIN_SCALAR 1u // domain code passed by vx_barrier_scalar()
+#define VX_BARRIER_DOMAIN_TENSOR 2u // domain code passed by vx_barrier_tensor()
+
+__attribute__((convergent))
+inline void vx_barrier_domain(unsigned barrier_id, unsigned num_warps, unsigned domain) {
+    unsigned encoded_id = barrier_id | (domain << VX_BARRIER_DOMAIN_SHIFT); // domain goes in bits VX_BARRIER_DOMAIN_SHIFT and up; barrier_id is not masked, so callers must keep it below that bit
+    asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(encoded_id), "r"(num_warps)); // same funct3=4 instruction as vx_barrier(), but num_warps is passed through unclamped
+}
+
+__attribute__((convergent))
+inline void vx_barrier_scalar(unsigned barrier_id, unsigned num_warps) {
+    vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_SCALAR); // barrier tagged with the scalar domain code
+}
+
+__attribute__((convergent))
+inline void vx_barrier_tensor(unsigned barrier_id, unsigned num_warps) {
+    vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_TENSOR); // barrier tagged with the tensor domain code
+}
+
+__attribute__((convergent))
+inline void vx_barrier_mask(unsigned barrier_id, unsigned warp_mask) {
+    asm volatile (".insn r %0, 7, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(warp_mask)); // funct3=7: rs1=barrier id, rs2=warp_mask (presumably one bit per participating warp; confirm against the RTL)
 }

 // Return current thread identifier
@@ -202,6 +243,22 @@ inline int vx_num_warps() {
    return ret;   
 }

+inline int vx_num_scalar_warps() {
+    return NUM_SCALAR_WARPS; // compile-time value of the NUM_SCALAR_WARPS macro
+}
+
+inline int vx_num_tensor_warps() {
+    return NUM_TENSOR_WARPS; // compile-time value of the NUM_TENSOR_WARPS macro
+}
+
+inline unsigned vx_scalar_warp_mask() {
+    return (1u << NUM_SCALAR_WARPS) - 1u; // bits [0, NUM_SCALAR_WARPS) set; same mask vx_spawn_scalar() applies
+}
+
+inline unsigned vx_tensor_warp_mask() {
+    return ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS; // NUM_TENSOR_WARPS bits set, starting at bit NUM_SCALAR_WARPS; same mask vx_spawn_tensor() applies
+}
+
 // Return the number of cores per cluster
 inline int vx_num_cores() {
    int ret;
--- a/kernel/include/vx_spawn.h
+++ b/kernel/include/vx_spawn.h
@@ -17,6 +17,10 @@
 #include <stdint.h>
 #include <stdio.h>

+#ifndef CORES_PER_CLUSTER
+#define CORES_PER_CLUSTER 8 // default cluster width used by the cluster task-mapping code in vx_spawn.c; override at build time
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -48,6 +52,7 @@ void vx_wspawn_wait();
 void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);

 void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
+void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
 void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg);

 void vx_serial(vx_serial_cb callback, void * arg);
--- a/kernel/src/vx_spawn.c
+++ b/kernel/src/vx_spawn.c
@@ -74,18 +74,9 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
  }
 }

-static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
-  int cid = vx_core_id();
-  int tid = vx_thread_id();
-  
-  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
-  int task_id = p_wspawn_args->offset + tid;
-  (p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
-}
-
 static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
  int NT  = vx_num_threads();
-  int NW  = vx_num_warps();
+  int NW  = NUM_SCALAR_WARPS;
  int cid = vx_core_id();
  int wid = vx_warp_id();
  int tid = vx_thread_id();
@@ -103,6 +94,60 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
  }
 }

+static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() {
+  int NT  = vx_num_threads();
+  int NW  = NUM_SCALAR_WARPS; // only scalar warps are counted in the per-wave stride below
+  int cid = vx_core_id();
+  int wid = vx_warp_id();
+  int tid = vx_thread_id();
+
+  const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
+  // round-robin warp_id allocation across cores in cluster
+  const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
+
+  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; // per-core argument record, indexed by core id
+
+  int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); // warps with wid below RWs run one extra wave
+  int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); // first task id handled by this thread
+
+  vx_spawn_tasks_cb callback = p_wspawn_args->callback;
+  void* arg = p_wspawn_args->arg;
+
+  // sequential iterations
+  for (int wave_id = 0; wave_id < waves; ++wave_id) {
+    int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER); // each wave advances by one full cluster of threads
+    callback(task_id, arg);
+  }
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
+  int cid = vx_core_id();
+  int tid = vx_thread_id();
+  
+  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; // per-core argument record, indexed by core id
+  int task_id = p_wspawn_args->offset + tid; // task id = base offset + thread id; the warp id is not used
+  (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); // exactly one callback invocation per calling thread
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() {
+  int NT  = vx_num_threads();
+  int cid = vx_core_id();
+  int tid = vx_thread_id();
+  int wid = vx_warp_id();
+
+  const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
+  // round-robin warp_id allocation across cores in cluster
+  const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
+
+  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; // per-core argument record, indexed by core id
+  // FIXME: This assumes that all cores but the last one are working with full
+  // warps, and only the last core has a partially-filled warp.
+  int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); // same thread-to-task mapping as spawn_tasks_cluster_all_stub()
+
+  int task_id = offset;
+  (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); // exactly one callback invocation per calling thread, no wave loop
+}
+
 static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
  // activate all threads
  vx_tmc(-1);
@@ -111,11 +156,21 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
  spawn_tasks_contiguous_all_stub();

  // disable warp
-  // deadlock here on warps 1, 2, 3
  vx_tmc_zero();
 }

-static void __attribute__ ((noinline)) spawn_tasks_all_cb() {  
+static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() {
+  // activate all threads
+  vx_tmc(-1);
+
+  // call stub routine (runs this warp's waves of the cluster task mapping)
+  spawn_tasks_cluster_all_stub();
+
+  // disable warp
+  vx_tmc_zero();
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
  // activate all threads
  vx_tmc(-1);

@@ -126,10 +181,115 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
  vx_tmc_zero();
 }

+// Runs `callback(task_id, arg)` for the tasks of a `num_tasks`-sized grid,
+// filling whole clusters of CORES_PER_CLUSTER cores before using the next one.
+// This function runs in every core, but with only 1 warp and 1 thread enabled;
+// the logic below figures out how many warps/threads this particular core has
+// to enable for its share of the grid.  `arg` is passed through unmodified.
+void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
+  // device specs
+  const int NC = vx_num_cores();
+  const int NW = NUM_SCALAR_WARPS;
+  const int NT = vx_num_threads();
+  // NOTE: assumes divisible
+  const int num_cluster = NC / CORES_PER_CLUSTER;
+
+  // current core id
+  int core_id = vx_core_id();
+  if (core_id >= NUM_CORES_MAX)
+    return;
+  const int cluster_id = core_id / CORES_PER_CLUSTER;
+  const int core_id_in_cluster = core_id % CORES_PER_CLUSTER;
+
+  // try to fill up full clusters first
+  const int num_threads_in_cluster = CORES_PER_CLUSTER * NW * NT;
+  const int num_used_clusters =
+      (num_tasks + (num_threads_in_cluster - 1)) / num_threads_in_cluster;
+  if (cluster_id >= num_used_clusters) {
+      return; // terminate extra clusters
+  }
+  // fill up the last cluster with remaining tasks
+  const int num_full_clusters = num_tasks / num_threads_in_cluster;
+  int num_tasks_this_cluster = num_threads_in_cluster;
+  if (cluster_id >= num_full_clusters) {
+      num_tasks_this_cluster = num_tasks % num_threads_in_cluster;
+  }
+
+  // Distribute threads equally across as many cores as possible, even if they
+  // don't fill up NW*NT in a single core.  This makes sure the warps get evenly
+  // distributed in a single cluster
+  //
+  // TODO: Try to contain in a single cluster if possible?
+  const int num_active_cores = (num_tasks + (NT - 1)) / NT;
+  if (core_id >= num_active_cores)
+    return; // terminate extra cores
+
+  const int num_full_warps_this_cluster = num_tasks_this_cluster / NT;
+  const int rem_threads_in_last_warp = num_tasks_this_cluster % NT;
+  int num_warps_this_core = num_full_warps_this_cluster / CORES_PER_CLUSTER;
+  const int num_warps_in_last_row = num_full_warps_this_cluster % CORES_PER_CLUSTER;
+  if (core_id_in_cluster < num_warps_in_last_row) {
+    num_warps_this_core++;
+  }
+  // if 0, last warp is full-threads enabled
+  int rem_threads_in_last_warp_this_core = 0;
+  if (rem_threads_in_last_warp != 0) {
+    if (core_id_in_cluster == num_warps_in_last_row - 1) {
+      rem_threads_in_last_warp_this_core = rem_threads_in_last_warp;
+    }
+  }
+
+  // sequential iterations
+  const int num_full_waves = num_warps_this_core / NW;
+  const int rem_full_warps_in_last_wave = num_warps_this_core % NW;
+  // clusters before this one are all full, so base on the full-cluster size
+  const int offset = cluster_id * num_threads_in_cluster;
+  wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
+                                     rem_full_warps_in_last_wave};
+  g_wspawn_args[core_id] = &wspawn_args;
+
+  if (num_warps_this_core > 0) {
+    // execute callback on other warps
+    const int nw = MIN(num_warps_this_core, NW);
+    vx_wspawn(nw, spawn_tasks_cluster_all_cb);
+
+    // activate all threads
+    vx_tmc(-1);
+
+    // call stub routine
+    spawn_tasks_cluster_all_stub();
+
+    // back to single-threaded
+    vx_tmc_one();
+
+    // wait for spawn warps to terminate
+    vx_wspawn_wait();
+  }
+
+  // TODO: this is incomplete
+  // TODO: Instead of launching an additional wave just to work on remaining
+  // threads, handle this in the last wave amongst other full warps.
+  if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) {
+    // adjust offset
+    // FIXME: use rem_threads_in_last_warp_this_core
+    wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp);
+
+    // activate remaining threads
+    const int tmask = (1 << rem_threads_in_last_warp) - 1;
+    vx_tmc(tmask);
+
+    // call stub routine
+    spawn_tasks_cluster_rem_stub();
+
+    // back to single-threaded
+    vx_tmc_one();
+  }
+}
+
 void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
 	// device specs
  int NC = vx_num_cores();
-  int NW = vx_num_warps();
+  int NW = NUM_SCALAR_WARPS;
  int NT = vx_num_threads();

  // current core id
@@ -179,7 +339,6 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
    vx_tmc_one();
    
    // wait for spawn warps to terminate
-    // deadlock here on warp 0!
    vx_wspawn_wait();
 	}  

@@ -202,7 +361,7 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
 void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
 	// device specs
  int NC = vx_num_cores();
-  int NW = vx_num_warps();
+  int NW = NUM_SCALAR_WARPS;
  int NT = vx_num_threads();

  // current core id
@@ -356,7 +515,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
  
  // device specs
  int NC = vx_num_cores();
-  int NW = vx_num_warps();
+  int NW = NUM_SCALAR_WARPS;
  int NT = vx_num_threads();

  // current core id
--- a/kernel/src/vx_start.S
+++ b/kernel/src/vx_start.S
@@ -22,9 +22,9 @@
 _start:  

  # initialize per-thread registers
-  csrr  t0, VX_CSR_NUM_WARPS  # get num warps
+  li    t0, ((1 << NUM_SCALAR_WARPS) - 1)  # scalar warp mask
  la    t1, init_regs_all
-  .insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1  # wspawn t0, t1
+  .insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1  # wspawn_mask t0, t1
  li    t0, -1
  .insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0  # tmc t0
  jal   init_regs
@@ -35,9 +35,9 @@ _start:
  jal vx_wspawn_wait

  # initialize TLS for all warps
-  csrr  t0, VX_CSR_NUM_WARPS  # get num warps
+  li    t0, ((1 << NUM_SCALAR_WARPS) - 1)  # scalar warp mask
  la    t1, init_tls_all
-  .insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1  # wspawn t0, t1
+  .insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1  # wspawn_mask t0, t1
  li    t0, -1
  .insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0  # tmc t0
  call  __init_tls
@@ -102,6 +102,8 @@ init_regs:
 #endif
  csrr  t0, VX_CSR_MHARTID
  sll   t1, t0, STACK_LOG2_SIZE
+  sll   t2, t0, 4
+  add   t1, t1, t2
  sub   sp, sp, t1

  # set thread pointer register