More doc comments
This commit is contained in:
@@ -29,6 +29,10 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE(hansung): This is code running on the CPU, but CPU is still the one
|
||||||
|
// that keeps track of allocation of the GPU memory. GPU kernel simply runs
|
||||||
|
// assuming that CPU has done the right thing and returned a safe and valid
|
||||||
|
// chunk of memory.
|
||||||
int allocate(uint64_t size, uint64_t* addr) {
|
int allocate(uint64_t size, uint64_t* addr) {
|
||||||
if (size == 0 || addr == nullptr)
|
if (size == 0 || addr == nullptr)
|
||||||
return -1;
|
return -1;
|
||||||
@@ -403,4 +407,4 @@ private:
|
|||||||
page_t* pages_;
|
page_t* pages_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace vortex
|
} // namespace vortex
|
||||||
|
|||||||
2
hw/rtl/cache/VX_cache.sv
vendored
2
hw/rtl/cache/VX_cache.sv
vendored
@@ -250,6 +250,8 @@ module VX_cache #(
|
|||||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c;
|
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c;
|
||||||
wire mem_rsp_ready_c;
|
wire mem_rsp_ready_c;
|
||||||
|
|
||||||
|
// NOTE(hansung): non-cacheable addresses. Although is this applied for
|
||||||
|
// all address range?
|
||||||
if (NC_ENABLE) begin
|
if (NC_ENABLE) begin
|
||||||
VX_nc_bypass #(
|
VX_nc_bypass #(
|
||||||
.NUM_PORTS (NUM_PORTS),
|
.NUM_PORTS (NUM_PORTS),
|
||||||
|
|||||||
1
hw/rtl/cache/VX_cache_define.vh
vendored
1
hw/rtl/cache/VX_cache_define.vh
vendored
@@ -55,6 +55,7 @@
|
|||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// NOTE(hansung): what does CORE_TAG_ID_BITS == 0 mean?
|
||||||
`define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)
|
`define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)
|
||||||
|
|
||||||
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
|
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
|
||||||
|
|||||||
@@ -1326,6 +1326,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||||||
} else {
|
} else {
|
||||||
tmask_.reset();
|
tmask_.reset();
|
||||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||||
|
// NOTE(hansung): `ts` is the left-most lane currently enabled.
|
||||||
|
// Doing this only respects the operand of that lane, even though
|
||||||
|
// every lane might have different operand for the tmask.
|
||||||
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
|
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -156,6 +156,8 @@ int main (int argc, char **argv) {
|
|||||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||||
|
|
||||||
// Set kernel arguments
|
// Set kernel arguments
|
||||||
|
// NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific
|
||||||
|
// operation
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
|
||||||
@@ -190,6 +192,8 @@ int main (int argc, char **argv) {
|
|||||||
size_t local_work_size[1] = {1};
|
size_t local_work_size[1] = {1};
|
||||||
auto time_start = std::chrono::high_resolution_clock::now();
|
auto time_start = std::chrono::high_resolution_clock::now();
|
||||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||||
|
// NOTE(hansung): clFinish blocks until all kernels in the command queue are
|
||||||
|
// finished. This seems to be what actually kicks off kernel execution.
|
||||||
CL_CHECK(clFinish(commandQueue));
|
CL_CHECK(clFinish(commandQueue));
|
||||||
auto time_end = std::chrono::high_resolution_clock::now();
|
auto time_end = std::chrono::high_resolution_clock::now();
|
||||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||||
|
|||||||
Reference in New Issue
Block a user