Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
This commit is contained in:
111
runtime/include/vortex.h
Normal file
111
runtime/include/vortex.h
Normal file
@@ -0,0 +1,111 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __VX_VORTEX_H__
|
||||
#define __VX_VORTEX_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Untyped (void*) device handle; it is the first argument of every vx_* function declared in this header.
typedef void* vx_device_h;
|
||||
|
||||
// device caps ids
// Consecutive identifiers 0x0..0x8; presumably the `caps_id` selector passed to
// vx_dev_caps() -- confirm against the runtime implementation.
#define VX_CAPS_VERSION             0x0
#define VX_CAPS_NUM_THREADS         0x1
#define VX_CAPS_NUM_WARPS           0x2
#define VX_CAPS_NUM_CORES           0x3
#define VX_CAPS_CACHE_LINE_SIZE     0x4
#define VX_CAPS_GLOBAL_MEM_SIZE     0x5
#define VX_CAPS_LOCAL_MEM_SIZE      0x6
#define VX_CAPS_KERNEL_BASE_ADDR    0x7
#define VX_CAPS_ISA_FLAGS           0x8
|
||||
|
||||
// device isa flags
// Standard-extension bits: the bit index equals the extension letter's offset
// from 'A' (A = 0, C = 2, ... U = 20). This matches the RISC-V misa layout --
// presumably intentional, confirm.
#define VX_ISA_STD_A        (1ull << 0)
#define VX_ISA_STD_C        (1ull << 2)
#define VX_ISA_STD_D        (1ull << 3)
#define VX_ISA_STD_E        (1ull << 4)
#define VX_ISA_STD_F        (1ull << 5)
#define VX_ISA_STD_H        (1ull << 7)
#define VX_ISA_STD_I        (1ull << 8)
#define VX_ISA_STD_N        (1ull << 13)
#define VX_ISA_STD_Q        (1ull << 16)
#define VX_ISA_STD_S        (1ull << 18)
#define VX_ISA_STD_U        (1ull << 20)
// Decodes the 2-bit field at bits [31:30] of `flags` into 1 << (field + 4),
// i.e. 16, 32, 64 or 128 (presumably the base ISA width in bits -- confirm).
// `flags` is parenthesized so a compound argument such as `a | b` is shifted
// as a whole rather than only its right-hand operand.
#define VX_ISA_BASE(flags)  (1 << ((((flags) >> 30) & 0x3) + 4))
// Extension flags located in the upper 32 bits (bits 32..34).
#define VX_ISA_EXT_TEX      (1ull << 32)
#define VX_ISA_EXT_RASTER   (1ull << 33)
#define VX_ISA_EXT_ROP      (1ull << 34)
|
||||
|
||||
// device memory types
// Presumably the values accepted by the `type` parameter of vx_mem_alloc() /
// vx_mem_info() -- confirm against the runtime implementation.
#define VX_MEM_TYPE_GLOBAL  0
#define VX_MEM_TYPE_LOCAL   1

// ready wait timeout
// 24 hours written out as hours * minutes * seconds * 1000 (86,400,000).
#define VX_MAX_TIMEOUT      (24 * 60 * 60 * 1000) // 24 Hr
|
||||
|
||||
// open the device and connect to it
|
||||
// hdevice: pointer to a caller-provided handle slot; presumably written with the opened handle -- TODO confirm.
// Returns an int status; the success/failure encoding is not visible in this header.
int vx_dev_open(vx_device_h* hdevice);
|
||||
|
||||
// Close the device when all the operations are done
|
||||
// hdevice: the device handle to close (passed by value).
// Returns an int status; the encoding is not visible in this header.
int vx_dev_close(vx_device_h hdevice);
|
||||
|
||||
// return device configurations
|
||||
// caps_id: selects which configuration to query (presumably one of the VX_CAPS_* ids -- confirm).
// value:   pointer to a uint64_t that presumably receives the queried value.
// Returns an int status; the encoding is not visible in this header.
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
|
||||
|
||||
// allocate device memory and return address
|
||||
// size:     requested allocation size (presumably in bytes -- confirm).
// type:     presumably one of VX_MEM_TYPE_GLOBAL / VX_MEM_TYPE_LOCAL -- confirm.
// dev_addr: pointer to a uint64_t that presumably receives the allocated device address.
// Returns an int status; the encoding is not visible in this header.
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr);
|
||||
|
||||
// release device memory
|
||||
// dev_addr: 64-bit device address to release (presumably one returned via vx_mem_alloc -- confirm).
// Returns an int status; the encoding is not visible in this header.
int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
|
||||
|
||||
// get device memory info
|
||||
// type:     presumably one of VX_MEM_TYPE_GLOBAL / VX_MEM_TYPE_LOCAL -- confirm.
// mem_free / mem_used: uint64_t out-pointers; units and NULL handling are not visible in this header.
// Returns an int status; the encoding is not visible in this header.
int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used);
|
||||
|
||||
// Copy bytes from host to device memory
|
||||
// dev_addr: destination device address; host_ptr: read-only host source buffer; size: number of bytes to copy.
// Returns an int status; the encoding is not visible in this header.
int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
|
||||
|
||||
// Copy bytes from device memory to host
|
||||
// host_ptr: writable host destination buffer; dev_addr: source device address; size: number of bytes to copy.
// Note the argument order is (host, device), the reverse of vx_copy_to_dev.
// Returns an int status; the encoding is not visible in this header.
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
|
||||
|
||||
// Start device execution
|
||||
// Whether this call blocks until execution completes is not visible here; see vx_ready_wait for waiting.
// Returns an int status; the encoding is not visible in this header.
int vx_start(vx_device_h hdevice);
|
||||
|
||||
// Wait for device ready with milliseconds timeout
|
||||
// timeout: maximum wait in milliseconds; VX_MAX_TIMEOUT (24 hours) is the constant this header provides for it.
// Returns an int status; how a timeout is reported is not visible in this header.
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
|
||||
|
||||
// write device configuration registers
|
||||
// addr:  32-bit address of the configuration register; value: 64-bit value to write.
// Returns an int status; the encoding is not visible in this header.
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value);
|
||||
|
||||
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
|
||||
|
||||
// upload kernel bytes to device
|
||||
// content: read-only host buffer holding the kernel image; size: its length (presumably in bytes -- confirm).
// Returns an int status; the encoding is not visible in this header.
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size);
|
||||
|
||||
// upload kernel file to device
|
||||
// filename: NUL-terminated path of the kernel file to upload (expected file format is not visible here).
// Returns an int status; the encoding is not visible in this header.
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename);
|
||||
|
||||
// performance counters
|
||||
// stream: stdio FILE* that presumably receives the performance-counter dump -- confirm.
// Returns an int status; the encoding is not visible in this header.
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
|
||||
// counter: counter selector; core_id: core to query; value: uint64_t out-pointer for the reading.
// Valid ranges for counter/core_id are not visible in this header.
// Returns an int status; the encoding is not visible in this header.
int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __VX_VORTEX_H__
|
||||
@@ -1,214 +0,0 @@
|
||||
#ifndef VX_INTRINSICS_H
|
||||
#define VX_INTRINSICS_H
|
||||
|
||||
#include <VX_config.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// __ASM_STR(x): expands to x unchanged when __ASSEMBLY__ is defined,
// otherwise to the string literal produced by stringizing x (#x).
#if defined(__ASSEMBLY__)
#  define __ASM_STR(x) x
#else
#  define __ASM_STR(x) #x
#endif
|
||||
|
||||
// csr_read(csr): executes `csrr` on CSR number `csr` (an immediate, "i" constraint)
// and evaluates to the value read, held in an `unsigned`.
#define csr_read(csr) ({                                \
    unsigned __r;                                       \
    __asm__ __volatile__ ("csrr %0, %1"                 \
                          : "=r" (__r)                  \
                          : "i" (csr));                 \
    __r;                                                \
})
|
||||
|
||||
// csr_write(csr, val): executes `csrw` on CSR number `csr` with `val` cast to unsigned.
// When `val` is a compile-time constant below 32 it is passed with an immediate ("i")
// constraint, otherwise in a register ("r").
#define csr_write(csr, val) ({                                  \
    unsigned __v = (unsigned)(val);                             \
    if (__builtin_constant_p(val) && __v < 32) {                \
        __asm__ __volatile__ ("csrw %0, %1"                     \
                              :: "i" (csr), "i" (__v));         \
    } else {                                                    \
        __asm__ __volatile__ ("csrw %0, %1"                     \
                              :: "i" (csr), "r" (__v));         \
    }                                                           \
})
|
||||
|
||||
// csr_swap(csr, val): executes `csrrw` on CSR number `csr` with `val` cast to unsigned
// and evaluates to the value the instruction returns in its destination register.
// A compile-time constant below 32 is passed as an immediate ("i"), anything else in a register ("r").
#define csr_swap(csr, val) ({                                   \
    unsigned __r;                                               \
    unsigned __v = (unsigned)(val);                             \
    if (__builtin_constant_p(val) && __v < 32) {                \
        __asm__ __volatile__ ("csrrw %0, %1, %2"                \
                              : "=r" (__r)                      \
                              : "i" (csr), "i" (__v));          \
    } else {                                                    \
        __asm__ __volatile__ ("csrrw %0, %1, %2"                \
                              : "=r" (__r)                      \
                              : "i" (csr), "r" (__v));          \
    }                                                           \
    __r;                                                        \
})
|
||||
|
||||
// csr_read_set(csr, val): executes `csrrs` on CSR number `csr` with `val` cast to unsigned
// and evaluates to the value the instruction returns in its destination register.
// A compile-time constant below 32 is passed as an immediate ("i"), anything else in a register ("r").
#define csr_read_set(csr, val) ({                               \
    unsigned __r;                                               \
    unsigned __v = (unsigned)(val);                             \
    if (__builtin_constant_p(val) && __v < 32) {                \
        __asm__ __volatile__ ("csrrs %0, %1, %2"                \
                              : "=r" (__r)                      \
                              : "i" (csr), "i" (__v));          \
    } else {                                                    \
        __asm__ __volatile__ ("csrrs %0, %1, %2"                \
                              : "=r" (__r)                      \
                              : "i" (csr), "r" (__v));          \
    }                                                           \
    __r;                                                        \
})
|
||||
|
||||
// csr_set(csr, val): executes `csrs` on CSR number `csr` with `val` cast to unsigned.
// A compile-time constant below 32 is passed as an immediate ("i"), anything else in a register ("r").
#define csr_set(csr, val) ({                                    \
    unsigned __v = (unsigned)(val);                             \
    if (__builtin_constant_p(val) && __v < 32) {                \
        __asm__ __volatile__ ("csrs %0, %1"                     \
                              :: "i" (csr), "i" (__v));         \
    } else {                                                    \
        __asm__ __volatile__ ("csrs %0, %1"                     \
                              :: "i" (csr), "r" (__v));         \
    }                                                           \
})
|
||||
|
||||
// csr_read_clear(csr, val): executes `csrrc` on CSR number `csr` with `val` cast to unsigned
// and evaluates to the value the instruction returns in its destination register.
// A compile-time constant below 32 is passed as an immediate ("i"), anything else in a register ("r").
#define csr_read_clear(csr, val) ({                             \
    unsigned __r;                                               \
    unsigned __v = (unsigned)(val);                             \
    if (__builtin_constant_p(val) && __v < 32) {                \
        __asm__ __volatile__ ("csrrc %0, %1, %2"                \
                              : "=r" (__r)                      \
                              : "i" (csr), "i" (__v));          \
    } else {                                                    \
        __asm__ __volatile__ ("csrrc %0, %1, %2"                \
                              : "=r" (__r)                      \
                              : "i" (csr), "r" (__v));          \
    }                                                           \
    __r;                                                        \
})
|
||||
|
||||
// csr_clear(csr, val): executes `csrc` on CSR number `csr` with `val` cast to unsigned.
// A compile-time constant below 32 is passed as an immediate ("i"), anything else in a register ("r").
#define csr_clear(csr, val) ({                                  \
    unsigned __v = (unsigned)(val);                             \
    if (__builtin_constant_p(val) && __v < 32) {                \
        __asm__ __volatile__ ("csrc %0, %1"                     \
                              :: "i" (csr), "i" (__v));         \
    } else {                                                    \
        __asm__ __volatile__ ("csrc %0, %1"                     \
                              :: "i" (csr), "r" (__v));         \
    }                                                           \
})
|
||||
|
||||
// Texture load
|
||||
// vx_tex(unit, u, v, lod): emits a custom R4-format instruction (`.insn r4`, opcode 0x5b,
// funct3 0) with `unit` as an immediate field and u, v, lod in registers; evaluates to
// the `unsigned` destination-register value.
#define vx_tex(unit, u, v, lod) ({                                      \
    unsigned __r;                                                       \
    __asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4"        \
                          : "=r"(__r)                                   \
                          : "i"(unit), "r"(u), "r"(v), "r"(lod));       \
    __r;                                                                \
})
|
||||
|
||||
// Conditional move
|
||||
// vx_cmov(c, t, f): emits a custom R4-format instruction (`.insn r4`, opcode 0x5b,
// funct3 1, funct2 0) with c, t, f in registers; evaluates to the `unsigned`
// destination-register value.
// The output operand is written as "=r"(__r): the closing parenthesis was missing,
// which made every expansion of this macro a syntax error.
#define vx_cmov(c, t, f) ({                                             \
    unsigned __r;                                                       \
    __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3"         \
                          : "=r"(__r)                                   \
                          : "r"(c), "r"(t), "r"(f));                    \
    __r;                                                                \
})
|
||||
|
||||
// Set thread mask
|
||||
inline void vx_tmc(unsigned thread_mask) {
|
||||
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
|
||||
}
|
||||
|
||||
// Set thread predicate
|
||||
inline void vx_pred(unsigned condition) {
|
||||
asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
|
||||
}
|
||||
|
||||
// Pointer to a void-returning function declared with an empty parameter list;
// this is the type of the `func_ptr` argument of vx_wspawn().
typedef void (*vx_wspawn_pfn)();
|
||||
|
||||
// Spawn warps
|
||||
// Emits the custom S-format instruction (opcode 0x6b, funct3 1) with
// `num_warps` as the base register (%0) and `func_ptr` as the second register (%1).
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
    asm volatile (
        ".insn s 0x6b, 1, %1, 0(%0)"
        : /* no outputs */
        : "r"(num_warps), "r"(func_ptr)
    );
}
|
||||
|
||||
// Split on a predicate
|
||||
inline void vx_split(int predicate) {
|
||||
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
|
||||
}
|
||||
|
||||
// Join
|
||||
inline void vx_join() {
|
||||
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
|
||||
}
|
||||
|
||||
// Warp Barrier
|
||||
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
||||
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
// Prefetch
|
||||
inline void vx_prefetch(unsigned addr) {
|
||||
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
inline int vx_thread_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return active core's local thread id
|
||||
inline int vx_thread_lid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processor global thread id
|
||||
inline int vx_thread_gid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return active core's local warp id
|
||||
inline int vx_warp_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processor's global warp id
|
||||
inline int vx_warp_gid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processor core id
|
||||
inline int vx_core_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return current thread mask
|
||||
inline int vx_thread_mask() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_TMASK));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of threads in a warp
|
||||
inline int vx_num_threads() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of warps in a core
|
||||
inline int vx_num_warps() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of cores in the processor
|
||||
inline int vx_num_cores() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline void vx_fence() {
|
||||
asm volatile ("fence iorw, iorw");
|
||||
}
|
||||
|
||||
// __if(b): expands to a vx_split(b) call followed by an `if (b)` header, so the
// statement or block written after the macro becomes the body of that `if`.
// `b` is expanded twice; it expands to two statements, so it must not be used
// as the unbraced body of another control statement.
#define __if(b) vx_split(b); if (b)
|
||||
|
||||
// __else: plain alias for the `else` keyword, used between __if and __endif.
#define __else else
|
||||
|
||||
// __endif: expands to a vx_join() call, including its trailing semicolon.
#define __endif vx_join();
|
||||
|
||||
// __DIVERGENT__: attaches the annotate("divergent") attribute to a declaration
// (how the toolchain consumes the annotation is not visible here).
#define __DIVERGENT__ __attribute__((annotate("divergent")))
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,21 +0,0 @@
|
||||
#ifndef VX_PRINT_H
|
||||
#define VX_PRINT_H
|
||||
|
||||
#include <stdarg.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// format: NUL-terminated format string; va: argument list (presumably printf-style semantics -- confirm).
// Returns an int; its meaning is not visible in this header.
int vx_vprintf(const char* format, va_list va);
|
||||
// Variadic counterpart of vx_vprintf: format string followed by its arguments
// (presumably printf-style semantics -- confirm). Return meaning is not visible here.
int vx_printf(const char * format, ...);
|
||||
|
||||
// c: character to output, passed as an int (output destination is not visible in this header).
void vx_putchar(int c);
|
||||
// value: integer to output; base: numeric base to use (supported bases are not visible in this header).
void vx_putint(int value, int base);
|
||||
// value: float to output; precision: presumably the number of fractional digits -- confirm.
void vx_putfloat(float value, int precision);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,43 +0,0 @@
|
||||
#ifndef VX_API_H
|
||||
#define VX_API_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Context record handed to vx_spawn_kernel() and forwarded to the kernel callback.
// Field order and types define the layout and must not change.
typedef struct {
    uint32_t  num_groups[3];            // three entries (presumably x, y, z group counts -- confirm)
    uint32_t  global_offset[3];         // three entries (presumably x, y, z offsets -- confirm)
    uint32_t  local_size[3];            // three entries (presumably x, y, z local sizes -- confirm)
    char     *printf_buffer;            // printf buffer pointer
    uint32_t *printf_buffer_position;   // pointer to a 32-bit position value for that buffer
    uint32_t  printf_buffer_capacity;   // capacity of the printf buffer (units not visible here)
    uint32_t  work_dim;                 // presumably the number of dimensions in use -- confirm
} context_t;
|
||||
|
||||
typedef void (*vx_spawn_kernel_cb) (
|
||||
const void * /* arg */,
|
||||
const context_t * /* context */,
|
||||
uint32_t /* group_x */,
|
||||
uint32_t /* group_y */,
|
||||
uint32_t /* group_z */
|
||||
);
|
||||
|
||||
// Task callback type used by vx_spawn_tasks(): receives an int task id and the user argument pointer.
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
|
||||
|
||||
// Callback type used by vx_serial(): receives only the user argument pointer.
typedef void (*vx_serial_cb)(void *arg);
|
||||
|
||||
// ctx: context record (non-const pointer); callback: kernel entry of type vx_spawn_kernel_cb;
// arg: user pointer, presumably forwarded to the callback's `arg` -- confirm.
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
|
||||
|
||||
// num_tasks: number of tasks; callback: per-task entry of type vx_spawn_tasks_cb;
// arg: user pointer, presumably forwarded to each callback invocation -- confirm.
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
|
||||
// callback: entry of type vx_serial_cb; arg: user pointer, presumably forwarded to it -- confirm.
// The serialization scope (threads, warps or cores) is not visible in this header.
void vx_serial(vx_serial_cb callback, void * arg);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user