+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
228 lines
6.9 KiB
C
228 lines
6.9 KiB
C
// Copyright © 2019-2023
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef __VX_INTRINSICS_H__
|
|
#define __VX_INTRINSICS_H__
|
|
|
|
#include <VX_config.h>
|
|
#include <VX_types.h>
|
|
|
|
#if defined(__clang__)
|
|
#define __UNIFORM__ __attribute__((annotate("vortex.uniform")))
|
|
#else
|
|
#define __UNIFORM__
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
#ifdef __ASSEMBLY__
|
|
#define __ASM_STR(x) x
|
|
#else
|
|
#define __ASM_STR(x) #x
|
|
#endif
|
|
|
|
#define RISCV_CUSTOM0 0x0B
|
|
#define RISCV_CUSTOM1 0x2B
|
|
#define RISCV_CUSTOM2 0x5B
|
|
#define RISCV_CUSTOM3 0x7B
|
|
|
|
#define csr_read(csr) ({ \
|
|
unsigned __r; \
|
|
__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
|
|
__r; \
|
|
})
|
|
|
|
#define csr_write(csr, val) ({ \
|
|
unsigned __v = (unsigned)(val); \
|
|
if (__builtin_constant_p(val) && __v < 32) \
|
|
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
|
|
else \
|
|
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
|
|
})
|
|
|
|
#define csr_swap(csr, val) ({ \
|
|
unsigned __r; \
|
|
unsigned __v = (unsigned)(val); \
|
|
if (__builtin_constant_p(val) && __v < 32) \
|
|
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
|
else \
|
|
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
|
__r; \
|
|
})
|
|
|
|
#define csr_read_set(csr, val) ({ \
|
|
unsigned __r; \
|
|
unsigned __v = (unsigned)(val); \
|
|
if (__builtin_constant_p(val) && __v < 32) \
|
|
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
|
else \
|
|
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
|
__r; \
|
|
})
|
|
|
|
#define csr_set(csr, val) ({ \
|
|
unsigned __v = (unsigned)(val); \
|
|
if (__builtin_constant_p(val) && __v < 32) \
|
|
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
|
|
else \
|
|
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
|
|
})
|
|
|
|
#define csr_read_clear(csr, val) ({ \
|
|
unsigned __r; \
|
|
unsigned __v = (unsigned)(val); \
|
|
if (__builtin_constant_p(val) && __v < 32) \
|
|
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
|
else \
|
|
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
|
__r; \
|
|
})
|
|
|
|
#define csr_clear(csr, val) ({ \
|
|
unsigned __v = (unsigned)(val); \
|
|
if (__builtin_constant_p(val) && __v < 32) \
|
|
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
|
|
else \
|
|
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
|
|
})
|
|
|
|
// Conditional move
|
|
inline unsigned vx_cmov(unsigned c, unsigned t, unsigned f) {
|
|
unsigned ret;
|
|
asm volatile (".insn r4 %1, 1, 0, %0, %2, %3, %4" : "=r"(ret) : "i"(RISCV_CUSTOM1), "r"(c), "r"(t), "r"(f));
|
|
return ret;
|
|
}
|
|
|
|
// Set thread mask
|
|
inline void vx_tmc(unsigned thread_mask) {
|
|
asm volatile (".insn r %0, 0, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(thread_mask));
|
|
}
|
|
|
|
// disable all threads in the current warp
|
|
inline void vx_tmc_zero() {
|
|
asm volatile (".insn r %0, 0, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
|
|
}
|
|
|
|
// switch execution to single thread zero
|
|
inline void vx_tmc_one() {
|
|
asm volatile (
|
|
"li a0, 1\n\t" // Load immediate value 1 into a0 (x10) register
|
|
".insn r %0, 0, 0, x0, a0, x0" :: "i"(RISCV_CUSTOM0)
|
|
: "a0" // Indicate that a0 (x10) is clobbered
|
|
);
|
|
}
|
|
|
|
// Set thread predicate
|
|
inline void vx_pred(unsigned condition, unsigned thread_mask) {
|
|
asm volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
|
|
}
|
|
|
|
typedef void (*vx_wspawn_pfn)();
|
|
|
|
// Spawn warps
|
|
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
|
asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
|
|
}
|
|
|
|
// Split on a predicate
|
|
inline unsigned vx_split(unsigned predicate) {
|
|
unsigned ret;
|
|
asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
|
|
return ret;
|
|
}
|
|
|
|
// Join
|
|
inline void vx_join(unsigned stack_ptr) {
|
|
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
|
|
}
|
|
|
|
// Warp Barrier
|
|
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
|
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
|
|
}
|
|
|
|
// Return current thread identifier
|
|
inline int vx_thread_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
|
|
return ret;
|
|
}
|
|
|
|
// Return current warp identifier
|
|
inline int vx_warp_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
|
|
return ret;
|
|
}
|
|
|
|
// Return current core identifier
|
|
inline int vx_core_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
|
|
return ret;
|
|
}
|
|
|
|
// Return current thread mask
|
|
inline int vx_thread_mask() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_MASK));
|
|
return ret;
|
|
}
|
|
|
|
// Return number of active warps
|
|
inline int vx_active_warps() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_MASK));
|
|
return ret;
|
|
}
|
|
|
|
// Return the number of threads per warp
|
|
inline int vx_num_threads() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
|
|
return ret;
|
|
}
|
|
|
|
// Return the number of warps per core
|
|
inline int vx_num_warps() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
|
|
return ret;
|
|
}
|
|
|
|
// Return the number of cores per cluster
|
|
inline int vx_num_cores() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
|
|
return ret;
|
|
}
|
|
|
|
// Return the hart identifier (thread id accross the processor)
|
|
inline int vx_hart_id() {
|
|
int ret;
|
|
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
|
|
return ret;
|
|
}
|
|
|
|
inline void vx_fence() {
|
|
asm volatile ("fence iorw, iorw");
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif // __VX_INTRINSICS_H__
|