From c90fe5658893c40a7d0ded4c17d36583221d5ff9 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 10 Sep 2023 14:45:23 -0700 Subject: [PATCH] More doc comments --- driver/common/vx_malloc.h | 6 +++++- hw/rtl/cache/VX_cache.sv | 2 ++ hw/rtl/cache/VX_cache_define.vh | 1 + sim/simx/execute.cpp | 3 +++ tests/opencl/vecadd/main.cc | 4 ++++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/driver/common/vx_malloc.h b/driver/common/vx_malloc.h index 650a2f80..31e41a7d 100644 --- a/driver/common/vx_malloc.h +++ b/driver/common/vx_malloc.h @@ -29,6 +29,10 @@ public: } } + // NOTE(hansung): This is code running on the CPU, but CPU is still the one + // that keeps track of allocation of the GPU memory. GPU kernel simply runs + // assuming that CPU has done the right thing and returned a safe and valid + // chunk of memory. int allocate(uint64_t size, uint64_t* addr) { if (size == 0 || addr == nullptr) return -1; @@ -403,4 +407,4 @@ private: page_t* pages_; }; -} // namespace vortex \ No newline at end of file +} // namespace vortex diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 1b7d7abf..d50e0d6e 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -250,6 +250,8 @@ module VX_cache #( wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c; wire mem_rsp_ready_c; + // NOTE(hansung): non-cacheable addresses. Although is this applied for + // all address range? if (NC_ENABLE) begin VX_nc_bypass #( .NUM_PORTS (NUM_PORTS), diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 647ea0be..c737dc83 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -55,6 +55,7 @@ /////////////////////////////////////////////////////////////////////////////// +// NOTE(hansung): what does CORE_TAG_ID_BITS == 0 mean? `define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS) `define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)} diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index 776daa4d..0786a00f 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1326,6 +1326,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } else { tmask_.reset(); for (uint32_t t = 0; t < num_threads; ++t) { + // NOTE(hansung): `ts` is the left-most lane currently enabled. + // Doing this only respects the operand of that lane, even though + // every lane might have different operand for the tmask. tmask_.set(t, rsdata.at(ts)[0].i & (1 << t)); } } diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc index 28774ed4..1bf8774d 100644 --- a/tests/opencl/vecadd/main.cc +++ b/tests/opencl/vecadd/main.cc @@ -156,6 +156,8 @@ int main (int argc, char **argv) { kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); // Set kernel arguments + // NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific + // operation CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj)); @@ -190,6 +192,8 @@ int main (int argc, char **argv) { size_t local_work_size[1] = {1}; auto time_start = std::chrono::high_resolution_clock::now(); CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + // NOTE(hansung): clFinish blocks until all kernels in the command queue are + // finished. This seems to be what actually kicks off kernel execution. CL_CHECK(clFinish(commandQueue)); auto time_end = std::chrono::high_resolution_clock::now(); double elapsed = std::chrono::duration_cast(time_end - time_start).count();