More doc comments

2023-09-10 14:45:23 -07:00
parent 9efdd2ebb7
commit c90fe56588
5 changed files with 15 additions and 1 deletions
--- a/driver/common/vx_malloc.h
+++ b/driver/common/vx_malloc.h
@@ -29,6 +29,10 @@ public:
        }
    }

+    // NOTE(hansung): This is code running on the CPU, but CPU is still the one
+    // that keeps track of allocation of the GPU memory.  GPU kernel simply runs
+    // assuming that CPU has done the right thing and returned a safe and valid
+    // chunk of memory.
    int allocate(uint64_t size, uint64_t* addr) {
        if (size == 0 || addr == nullptr)
            return -1;
@@ -403,4 +407,4 @@ private:
    page_t* pages_;    
 };

-} // namespace vortex
+} // namespace vortex
--- a/hw/rtl/cache/VX_cache.sv
+++ b/hw/rtl/cache/VX_cache.sv
@@ -250,6 +250,8 @@ module VX_cache #(
    wire [MEM_TAG_IN_WIDTH-1:0]     mem_rsp_tag_c;
    wire                            mem_rsp_ready_c;

+    // NOTE(hansung): non-cacheable addresses. Although is this applied for
+    // all address range?
    if (NC_ENABLE) begin
        VX_nc_bypass #( 
            .NUM_PORTS         (NUM_PORTS),
--- a/hw/rtl/cache/VX_cache_define.vh
+++ b/hw/rtl/cache/VX_cache_define.vh
@@ -55,6 +55,7 @@

 ///////////////////////////////////////////////////////////////////////////////

+// NOTE(hansung): what does CORE_TAG_ID_BITS == 0 mean?
 `define CORE_RSP_TAGS           ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)

 `define LINE_TO_MEM_ADDR(x, i)  {x, `BANK_SELECT_BITS'(i)}
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -1326,6 +1326,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
      } else {
        tmask_.reset();
        for (uint32_t t = 0; t < num_threads; ++t) {
+          // NOTE(hansung): `ts` is the left-most lane currently enabled.
+          // Doing this only respects the operand of that lane, even though
+          // every lane might have different operand for the tmask.
          tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
        }
      }
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@@ -156,6 +156,8 @@ int main (int argc, char **argv) {
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

  // Set kernel arguments
+  // NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific
+  // operation
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));	
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
@@ -190,6 +192,8 @@ int main (int argc, char **argv) {
  size_t local_work_size[1] = {1};
  auto time_start = std::chrono::high_resolution_clock::now();
  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  // NOTE(hansung): clFinish blocks until all kernels in the command queue are
+  // finished.  This seems to be what actually kicks off kernel execution.
  CL_CHECK(clFinish(commandQueue));
  auto time_end = std::chrono::high_resolution_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();