// Copyright © 2019-2023 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef __VX_INTRINSICS_H__ #define __VX_INTRINSICS_H__ #include #include #if defined(__clang__) #define __UNIFORM__ __attribute__((annotate("vortex.uniform"))) #else #define __UNIFORM__ #endif #ifdef __cplusplus extern "C" { #endif #ifdef __ASSEMBLY__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x #endif #define RISCV_CUSTOM0 0x0B #define RISCV_CUSTOM1 0x2B #define RISCV_CUSTOM2 0x5B #define RISCV_CUSTOM3 0x7B #define csr_read(csr) ({ \ unsigned __r; \ __asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \ __r; \ }) #define csr_write(csr, val) ({ \ unsigned __v = (unsigned)(val); \ if (__builtin_constant_p(val) && __v < 32) \ __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \ else \ __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \ }) #define csr_swap(csr, val) ({ \ unsigned __r; \ unsigned __v = (unsigned)(val); \ if (__builtin_constant_p(val) && __v < 32) \ __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ else \ __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ __r; \ }) #define csr_read_set(csr, val) ({ \ unsigned __r; \ unsigned __v = (unsigned)(val); \ if (__builtin_constant_p(val) && __v < 32) \ __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ else \ __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ __r; \ }) #define csr_set(csr, val) ({ \ unsigned __v = (unsigned)(val); \ if (__builtin_constant_p(val) && __v < 32) \ __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \ else \ __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \ }) #define csr_read_clear(csr, val) ({ \ unsigned __r; \ unsigned __v = (unsigned)(val); \ if (__builtin_constant_p(val) && __v < 32) \ __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ else \ __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ __r; \ }) #define csr_clear(csr, val) ({ \ unsigned __v = (unsigned)(val); \ if (__builtin_constant_p(val) && __v < 32) \ __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \ else \ __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \ }) // Conditional move inline unsigned vx_cmov(unsigned c, unsigned t, unsigned f) { unsigned ret; asm volatile (".insn r4 %1, 1, 0, %0, %2, %3, %4" : "=r"(ret) : "i"(RISCV_CUSTOM1), "r"(c), "r"(t), "r"(f)); return ret; } // Set thread mask inline void vx_tmc(unsigned thread_mask) { asm volatile (".insn r %0, 0, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(thread_mask)); } // disable all threads in the current warp inline void vx_tmc_zero() { asm volatile (".insn r %0, 0, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0)); } // switch execution to single thread zero inline void vx_tmc_one() { asm volatile ( "li a0, 1\n\t" // Load immediate value 1 into a0 (x10) register ".insn r %0, 0, 0, x0, a0, x0" :: "i"(RISCV_CUSTOM0) : "a0" // Indicate that a0 (x10) is clobbered ); } // Set thread predicate inline void vx_pred(unsigned condition, unsigned thread_mask) { asm volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask)); } typedef void (*vx_wspawn_pfn)(); // Spawn warps inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) { asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr)); } // Spawn an explicit warp mask. The current warp bit is ignored by hardware. inline void vx_wspawn_mask(unsigned warp_mask, vx_wspawn_pfn func_ptr) { asm volatile (".insn r %0, 6, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(warp_mask), "r"(func_ptr)); } inline void vx_spawn_scalar(unsigned warp_mask, vx_wspawn_pfn func_ptr) { vx_wspawn_mask(warp_mask & ((1u << NUM_SCALAR_WARPS) - 1u), func_ptr); } inline void vx_spawn_tensor(unsigned warp_mask, vx_wspawn_pfn func_ptr) { vx_wspawn_mask(warp_mask & (((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS), func_ptr); } // Split on a predicate inline unsigned vx_split(unsigned predicate) { unsigned ret; asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate)); return ret; } // Join inline void vx_join(unsigned stack_ptr) { asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr)); } // Warp Barrier __attribute__((convergent)) inline void vx_barrier(unsigned barried_id, unsigned num_warps) { unsigned scalar_warps = (num_warps > NUM_SCALAR_WARPS) ? NUM_SCALAR_WARPS : num_warps; asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(scalar_warps)); } #define VX_BARRIER_DOMAIN_SHIFT 28 #define VX_BARRIER_DOMAIN_ALL 0u #define VX_BARRIER_DOMAIN_SCALAR 1u #define VX_BARRIER_DOMAIN_TENSOR 2u __attribute__((convergent)) inline void vx_barrier_domain(unsigned barrier_id, unsigned num_warps, unsigned domain) { unsigned encoded_id = barrier_id | (domain << VX_BARRIER_DOMAIN_SHIFT); asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(encoded_id), "r"(num_warps)); } __attribute__((convergent)) inline void vx_barrier_scalar(unsigned barrier_id, unsigned num_warps) { vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_SCALAR); } __attribute__((convergent)) inline void vx_barrier_tensor(unsigned barrier_id, unsigned num_warps) { vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_TENSOR); } __attribute__((convergent)) inline void vx_barrier_mask(unsigned barrier_id, unsigned warp_mask) { asm volatile (".insn r %0, 7, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(warp_mask)); } // Return current thread identifier inline int vx_thread_id() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID)); return ret; } // Return current warp identifier inline int vx_warp_id() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID)); return ret; } // Return current core identifier inline int vx_core_id() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID)); return ret; } // Return current thread mask inline int vx_thread_mask() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_MASK)); return ret; } // Return number of active warps inline int vx_active_warps() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_MASK)); return ret; } // Return the number of threads per warp inline int vx_num_threads() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS)); return ret; } // Return the number of warps per core inline int vx_num_warps() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS)); return ret; } inline int vx_num_scalar_warps() { return NUM_SCALAR_WARPS; } inline int vx_num_tensor_warps() { return NUM_TENSOR_WARPS; } inline unsigned vx_scalar_warp_mask() { return (1u << NUM_SCALAR_WARPS) - 1u; } inline unsigned vx_tensor_warp_mask() { return ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS; } // Return the number of cores per cluster inline int vx_num_cores() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES)); return ret; } // Return the hart identifier (thread id accross the processor) inline int vx_hart_id() { int ret; asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID)); return ret; } inline void vx_fence() { asm volatile ("fence iorw, iorw"); } #ifdef __cplusplus } #endif #endif // __VX_INTRINSICS_H__