More doc comments
This commit is contained in:
@@ -29,6 +29,10 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(hansung): This is code running on the CPU, but CPU is still the one
|
||||
// that keeps track of allocation of the GPU memory. GPU kernel simply runs
|
||||
// assuming that CPU has done the right thing and returned a safe and valid
|
||||
// chunk of memory.
|
||||
int allocate(uint64_t size, uint64_t* addr) {
|
||||
if (size == 0 || addr == nullptr)
|
||||
return -1;
|
||||
@@ -403,4 +407,4 @@ private:
|
||||
page_t* pages_;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
} // namespace vortex
|
||||
|
||||
2
hw/rtl/cache/VX_cache.sv
vendored
2
hw/rtl/cache/VX_cache.sv
vendored
@@ -250,6 +250,8 @@ module VX_cache #(
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c;
|
||||
wire mem_rsp_ready_c;
|
||||
|
||||
// NOTE(hansung): non-cacheable addresses. Although is this applied for
|
||||
// all address range?
|
||||
if (NC_ENABLE) begin
|
||||
VX_nc_bypass #(
|
||||
.NUM_PORTS (NUM_PORTS),
|
||||
|
||||
1
hw/rtl/cache/VX_cache_define.vh
vendored
1
hw/rtl/cache/VX_cache_define.vh
vendored
@@ -55,6 +55,7 @@
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// NOTE(hansung): what does CORE_TAG_ID_BITS == 0 mean?
|
||||
`define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)
|
||||
|
||||
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
|
||||
|
||||
@@ -1326,6 +1326,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
||||
} else {
|
||||
tmask_.reset();
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
// NOTE(hansung): `ts` is the left-most lane currently enabled.
|
||||
// Doing this only respects the operand of that lane, even though
|
||||
// every lane might have different operand for the tmask.
|
||||
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -156,6 +156,8 @@ int main (int argc, char **argv) {
|
||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
// Set kernel arguments
|
||||
// NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific
|
||||
// operation
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
|
||||
@@ -190,6 +192,8 @@ int main (int argc, char **argv) {
|
||||
size_t local_work_size[1] = {1};
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
// NOTE(hansung): clFinish blocks until all kernels in the command queue are
|
||||
// finished. This seems to be what actually kicks off kernel execution.
|
||||
CL_CHECK(clFinish(commandQueue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
|
||||
Reference in New Issue
Block a user