From c90fe5658893c40a7d0ded4c17d36583221d5ff9 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sun, 10 Sep 2023 14:45:23 -0700
Subject: [PATCH] More doc comments

---
 driver/common/vx_malloc.h       | 6 +++++-
 hw/rtl/cache/VX_cache.sv        | 2 ++
 hw/rtl/cache/VX_cache_define.vh | 1 +
 sim/simx/execute.cpp            | 3 +++
 tests/opencl/vecadd/main.cc     | 4 ++++
 5 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/driver/common/vx_malloc.h b/driver/common/vx_malloc.h
index 650a2f80..31e41a7d 100644
--- a/driver/common/vx_malloc.h
+++ b/driver/common/vx_malloc.h
@@ -29,6 +29,10 @@ public:
         }
     }
 
+    // NOTE(hansung): This is code running on the CPU, but CPU is still the one
+    // that keeps track of allocation of the GPU memory.  GPU kernel simply runs
+    // assuming that CPU has done the right thing and returned a safe and valid
+    // chunk of memory.
     int allocate(uint64_t size, uint64_t* addr) {
         if (size == 0 || addr == nullptr)
             return -1;
@@ -403,4 +407,4 @@ private:
     page_t* pages_;    
 };
 
-} // namespace vortex
\ No newline at end of file
+} // namespace vortex
diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv
index 1b7d7abf..d50e0d6e 100644
--- a/hw/rtl/cache/VX_cache.sv
+++ b/hw/rtl/cache/VX_cache.sv
@@ -250,6 +250,8 @@ module VX_cache #(
     wire [MEM_TAG_IN_WIDTH-1:0]     mem_rsp_tag_c;
     wire                            mem_rsp_ready_c;
 
+    // NOTE(hansung): non-cacheable addresses. Although is this applied for
+    // all address range?
     if (NC_ENABLE) begin
         VX_nc_bypass #( 
             .NUM_PORTS         (NUM_PORTS),
diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh
index 647ea0be..c737dc83 100644
--- a/hw/rtl/cache/VX_cache_define.vh
+++ b/hw/rtl/cache/VX_cache_define.vh
@@ -55,6 +55,7 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+// NOTE(hansung): what does CORE_TAG_ID_BITS == 0 mean?
 `define CORE_RSP_TAGS           ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)
 
 `define LINE_TO_MEM_ADDR(x, i)  {x, `BANK_SELECT_BITS'(i)}
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index 776daa4d..0786a00f 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -1326,6 +1326,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
       } else {
         tmask_.reset();
         for (uint32_t t = 0; t < num_threads; ++t) {
+          // NOTE(hansung): `ts` is the left-most lane currently enabled.
+          // Doing this only respects the operand of that lane, even though
+          // every lane might have different operand for the tmask.
           tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
         }
       }
diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc
index 28774ed4..1bf8774d 100644
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@@ -156,6 +156,8 @@ int main (int argc, char **argv) {
   kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
 
   // Set kernel arguments
+  // NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific
+  // operation
   CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
   CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));	
   CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
@@ -190,6 +192,8 @@ int main (int argc, char **argv) {
   size_t local_work_size[1] = {1};
   auto time_start = std::chrono::high_resolution_clock::now();
   CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  // NOTE(hansung): clFinish blocks until all kernels in the command queue are
+  // finished.  This seems to be what actually kicks off kernel execution.
   CL_CHECK(clFinish(commandQueue));
   auto time_end = std::chrono::high_resolution_clock::now();
   double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();