285 lines
8.9 KiB
C
285 lines
8.9 KiB
C
// Copyright © 2019-2023
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef __VX_INTRINSICS_H__
|
|
#define __VX_INTRINSICS_H__
|
|
|
|
#include <VX_config.h>
|
|
#include <VX_types.h>
|
|
|
|
// __UNIFORM__ attaches the "vortex.uniform" annotation to a declaration when
// compiling with clang; with any other compiler it expands to nothing.
#ifdef __clang__
#  define __UNIFORM__ __attribute__((annotate("vortex.uniform")))
#else
#  define __UNIFORM__
#endif
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
// __ASM_STR(x): pass x through untouched when the header is consumed by
// assembly sources (__ASSEMBLY__ defined); otherwise stringize it.
#if defined(__ASSEMBLY__)
#  define __ASM_STR(x) x
#else
#  define __ASM_STR(x) #x
#endif
|
|
|
|
// Major opcode values passed as the opcode operand of the `.insn` directives
// used by the intrinsics in this header.
#define RISCV_CUSTOM0   0x0B
#define RISCV_CUSTOM1   0x2B
#define RISCV_CUSTOM2   0x5B
#define RISCV_CUSTOM3   0x7B
|
|
|
|
// Read the CSR numbered `csr` and yield its value as an `unsigned`.
// `csr` is passed through an "i" (immediate) operand, so it must be a
// compile-time constant.
#define csr_read(csr) ({ \
  unsigned __csr_val; \
  __asm__ __volatile__ ("csrr %0, %1" : "=r" (__csr_val) : "i" (csr)); \
  __csr_val; \
})
|
|
|
|
// Write `val` to the CSR numbered `csr` (`csr` must be a compile-time constant).
//
// The value operand uses the "rK" constraint: the compiler selects the 5-bit
// immediate form when the value is a known constant in [0, 31] and a register
// otherwise. This replaces the former `__builtin_constant_p` branch, whose
// "i"(__v) operand named a local variable and therefore failed to compile
// whenever that branch was not folded away (e.g. unoptimized builds).
#define csr_write(csr, val) ({ \
  unsigned __v = (unsigned)(val); \
  __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "rK" (__v)); \
})
|
|
|
|
// Atomically write `val` to the CSR numbered `csr` and yield the value the CSR
// held before the write (`csr` must be a compile-time constant).
//
// The value operand uses the "rK" constraint: immediate form for a known
// constant in [0, 31], register otherwise. This replaces the former
// `__builtin_constant_p` branch, whose "i"(__v) operand named a local variable
// and failed to compile whenever that branch was not folded away.
#define csr_swap(csr, val) ({ \
  unsigned __r; \
  unsigned __v = (unsigned)(val); \
  __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "rK" (__v)); \
  __r; \
})
|
|
|
|
// Set the bits of `val` in the CSR numbered `csr` and yield the value the CSR
// held before the update (`csr` must be a compile-time constant).
//
// The value operand uses the "rK" constraint: immediate form for a known
// constant in [0, 31], register otherwise. This replaces the former
// `__builtin_constant_p` branch, whose "i"(__v) operand named a local variable
// and failed to compile whenever that branch was not folded away.
#define csr_read_set(csr, val) ({ \
  unsigned __r; \
  unsigned __v = (unsigned)(val); \
  __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "rK" (__v)); \
  __r; \
})
|
|
|
|
// Set the bits of `val` in the CSR numbered `csr`, discarding the previous
// value (`csr` must be a compile-time constant).
//
// The value operand uses the "rK" constraint: immediate form for a known
// constant in [0, 31], register otherwise. This replaces the former
// `__builtin_constant_p` branch, whose "i"(__v) operand named a local variable
// and failed to compile whenever that branch was not folded away.
#define csr_set(csr, val) ({ \
  unsigned __v = (unsigned)(val); \
  __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "rK" (__v)); \
})
|
|
|
|
// Clear the bits of `val` in the CSR numbered `csr` and yield the value the
// CSR held before the update (`csr` must be a compile-time constant).
//
// The value operand uses the "rK" constraint: immediate form for a known
// constant in [0, 31], register otherwise. This replaces the former
// `__builtin_constant_p` branch, whose "i"(__v) operand named a local variable
// and failed to compile whenever that branch was not folded away.
#define csr_read_clear(csr, val) ({ \
  unsigned __r; \
  unsigned __v = (unsigned)(val); \
  __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "rK" (__v)); \
  __r; \
})
|
|
|
|
// Clear the bits of `val` in the CSR numbered `csr`, discarding the previous
// value (`csr` must be a compile-time constant).
//
// The value operand uses the "rK" constraint: immediate form for a known
// constant in [0, 31], register otherwise. This replaces the former
// `__builtin_constant_p` branch, whose "i"(__v) operand named a local variable
// and failed to compile whenever that branch was not folded away.
#define csr_clear(csr, val) ({ \
  unsigned __v = (unsigned)(val); \
  __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "rK" (__v)); \
})
|
|
|
|
// Conditional move
|
|
inline unsigned vx_cmov(unsigned c, unsigned t, unsigned f) {
|
|
unsigned ret;
|
|
asm volatile (".insn r4 %1, 1, 0, %0, %2, %3, %4" : "=r"(ret) : "i"(RISCV_CUSTOM1), "r"(c), "r"(t), "r"(f));
|
|
return ret;
|
|
}
|
|
|
|
// Set thread mask
|
|
inline void vx_tmc(unsigned thread_mask) {
|
|
asm volatile (".insn r %0, 0, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(thread_mask));
|
|
}
|
|
|
|
// disable all threads in the current warp
|
|
inline void vx_tmc_zero() {
|
|
asm volatile (".insn r %0, 0, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
|
|
}
|
|
|
|
// switch execution to single thread zero
|
|
inline void vx_tmc_one() {
|
|
asm volatile (
|
|
"li a0, 1\n\t" // Load immediate value 1 into a0 (x10) register
|
|
".insn r %0, 0, 0, x0, a0, x0" :: "i"(RISCV_CUSTOM0)
|
|
: "a0" // Indicate that a0 (x10) is clobbered
|
|
);
|
|
}
|
|
|
|
// Set thread predicate
|
|
inline void vx_pred(unsigned condition, unsigned thread_mask) {
|
|
asm volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
|
|
}
|
|
|
|
// Entry-point type taken by the warp-spawn intrinsics below: a pointer to a
// function returning void.
typedef void (*vx_wspawn_pfn)();
|
|
|
|
// Spawn warps
|
|
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
|
asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
|
|
}
|
|
|
|
// Spawn an explicit warp mask. The current warp bit is ignored by hardware.
|
|
inline void vx_wspawn_mask(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
|
|
asm volatile (".insn r %0, 6, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(warp_mask), "r"(func_ptr));
|
|
}
|
|
|
|
// Spawn `func_ptr` on the warps selected by `warp_mask`, after restricting the
// mask to its low NUM_SCALAR_WARPS bits.
inline void vx_spawn_scalar(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
  const unsigned scalar_bits = (1u << NUM_SCALAR_WARPS) - 1u;
  const unsigned selected = warp_mask & scalar_bits;
  vx_wspawn_mask(selected, func_ptr);
}
|
|
|
|
// Spawn `func_ptr` on the warps selected by `warp_mask`, after restricting the
// mask to the NUM_TENSOR_WARPS bits that sit directly above the low
// NUM_SCALAR_WARPS bits.
inline void vx_spawn_tensor(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
  const unsigned tensor_bits = ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS;
  const unsigned selected = warp_mask & tensor_bits;
  vx_wspawn_mask(selected, func_ptr);
}
|
|
|
|
// Split on a predicate
|
|
inline unsigned vx_split(unsigned predicate) {
|
|
unsigned ret;
|
|
asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
|
|
return ret;
|
|
}
|
|
|
|
// Join
|
|
inline void vx_join(unsigned stack_ptr) {
|
|
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
|
|
}
|
|
|
|
// Warp Barrier
|
|
__attribute__((convergent))
|
|
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
|
unsigned scalar_warps = (num_warps > NUM_SCALAR_WARPS) ? NUM_SCALAR_WARPS : num_warps;
|
|
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(scalar_warps));
|
|
}
|
|
|
|
// Barrier-domain encoding: vx_barrier_domain() ORs (domain << SHIFT) into the
// barrier id it passes to the barrier instruction.
#define VX_BARRIER_DOMAIN_SHIFT   28
// Domain selectors accepted by vx_barrier_domain().
#define VX_BARRIER_DOMAIN_ALL     0u
#define VX_BARRIER_DOMAIN_SCALAR  1u
#define VX_BARRIER_DOMAIN_TENSOR  2u
|
|
|
|
__attribute__((convergent))
|
|
inline void vx_barrier_domain(unsigned barrier_id, unsigned num_warps, unsigned domain) {
|
|
unsigned encoded_id = barrier_id | (domain << VX_BARRIER_DOMAIN_SHIFT);
|
|
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(encoded_id), "r"(num_warps));
|
|
}
|
|
|
|
__attribute__((convergent))
|
|
inline void vx_barrier_scalar(unsigned barrier_id, unsigned num_warps) {
|
|
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_SCALAR);
|
|
}
|
|
|
|
__attribute__((convergent))
|
|
inline void vx_barrier_tensor(unsigned barrier_id, unsigned num_warps) {
|
|
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_TENSOR);
|
|
}
|
|
|
|
__attribute__((convergent))
|
|
inline void vx_barrier_mask(unsigned barrier_id, unsigned warp_mask) {
|
|
asm volatile (".insn r %0, 7, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(warp_mask));
|
|
}
|
|
|
|
// Return current thread identifier
|
|
inline int vx_thread_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
|
|
return ret;
|
|
}
|
|
|
|
// Return current warp identifier
|
|
inline int vx_warp_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
|
|
return ret;
|
|
}
|
|
|
|
// Return current core identifier
|
|
inline int vx_core_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
|
|
return ret;
|
|
}
|
|
|
|
// Return current thread mask
|
|
inline int vx_thread_mask() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_MASK));
|
|
return ret;
|
|
}
|
|
|
|
// Return number of active warps
|
|
inline int vx_active_warps() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_MASK));
|
|
return ret;
|
|
}
|
|
|
|
// Return the number of threads per warp
|
|
inline int vx_num_threads() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
|
|
return ret;
|
|
}
|
|
|
|
// Return the number of warps per core
|
|
inline int vx_num_warps() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
|
|
return ret;
|
|
}
|
|
|
|
inline int vx_num_scalar_warps() {
|
|
return NUM_SCALAR_WARPS;
|
|
}
|
|
|
|
inline int vx_num_tensor_warps() {
|
|
return NUM_TENSOR_WARPS;
|
|
}
|
|
|
|
inline unsigned vx_scalar_warp_mask() {
|
|
return (1u << NUM_SCALAR_WARPS) - 1u;
|
|
}
|
|
|
|
inline unsigned vx_tensor_warp_mask() {
|
|
return ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS;
|
|
}
|
|
|
|
// Return the number of cores per cluster
|
|
inline int vx_num_cores() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
|
|
return ret;
|
|
}
|
|
|
|
// Return the hart identifier (thread id accross the processor)
|
|
inline int vx_hart_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
|
|
return ret;
|
|
}
|
|
|
|
// Issue a full `fence iorw, iorw`, ordering prior device I/O and memory
// reads/writes before subsequent ones.
inline void vx_fence(void) {
  // The "memory" clobber makes the fence a compiler barrier as well, so the
  // compiler cannot move memory accesses across it. __asm__/__volatile__ keep
  // this header usable in strict ISO modes.
  __asm__ __volatile__ ("fence iorw, iorw" ::: "memory");
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif // __VX_INTRINSICS_H__
|