diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C
index 83fd6f5..303a057 100644
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -317,6 +317,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
 
   auto stage_download_var_list =
       [&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
+        std::vector<double *> batch_host_ptrs;
+        std::vector<MyList<var> *> batch_vars;
         while (var_list)
         {
           double *host_ptr = cg->fgfs[var_list->data->sgfn];
@@ -325,18 +327,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
             var_list = var_list->next;
             continue;
           }
-          if (bssn_cuda_download_buffer(cg->shape, host_ptr))
-          {
-            cerr << "GPU stage download failure: lev=" << lev
-                 << " var=" << var_list->data->name
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-            break;
-          }
+          batch_host_ptrs.push_back(host_ptr);
+          batch_vars.push_back(var_list);
           var_list = var_list->next;
         }
+        if (!batch_host_ptrs.empty() &&
+            bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
+                                           static_cast<int>(batch_host_ptrs.size())))
+        {
+          for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
+          {
+            if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
+            {
+              cerr << "GPU stage download failure: lev=" << lev
+                   << " var=" << batch_vars[i]->data->name
+                   << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+              ERROR = 1;
+              break;
+            }
+          }
+        }
       };
 
   auto stage_download_patch_list =
diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu
index 931d3a0..5fef7ec 100644
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -1103,6 +1103,77 @@ int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
 	return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
 }
 
+int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers)
+{
+	if (!ex || !host_ptrs || num_buffers <= 0)
+		return 1;
+
+	static thread_local cudaStream_t stream = nullptr;
+	static thread_local cudaEvent_t ready = nullptr;
+	if (!stream)
+	{
+		cudaError_t err = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+		if (err != cudaSuccess)
+		{
+			cerr << "cudaStreamCreateWithFlags failed: " << cudaGetErrorString(err) << endl;
+			return 1;
+		}
+	}
+	if (!ready)
+	{
+		cudaError_t err = cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
+		if (err != cudaSuccess)
+		{
+			cerr << "cudaEventCreateWithFlags failed: " << cudaGetErrorString(err) << endl;
+			return 1;
+		}
+	}
+	cudaError_t err = cudaEventRecord(ready, 0);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaEventRecord download readiness failed: " << cudaGetErrorString(err) << endl;
+		return 1;
+	}
+	err = cudaStreamWaitEvent(stream, ready, 0);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaStreamWaitEvent download readiness failed: " << cudaGetErrorString(err) << endl;
+		return 1;
+	}
+
+	int n = 1;
+	for (int i = 0; i < 3; ++i)
+		n *= ex[i];
+	const size_t bytes = static_cast<size_t>(n) * sizeof(double);
+
+	for (int i = 0; i < num_buffers; ++i)
+	{
+		double *host_ptr = host_ptrs[i];
+		if (!host_ptr)
+			return 1;
+		const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
+		if (!device_ptr)
+			return 1;
+		bssn_gpu_prepare_host_buffer(host_ptr, n);
+		err = cudaMemcpyAsync(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost, stream);
+		if (err != cudaSuccess)
+		{
+			cerr << "cudaMemcpyAsync(D2H) buffered batch download failed: "
+			     << cudaGetErrorString(err) << endl;
+			return 1;
+		}
+	}
+
+	err = cudaStreamSynchronize(stream);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaStreamSynchronize buffered batch download failed: "
+		     << cudaGetErrorString(err) << endl;
+		return 1;
+	}
+	return 0;
+}
+
 int bssn_gpu_stage_upload_region(const double *host_ptr,
                                  const int *full_shape,
                                  const double *full_llb,
diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h
index 5a3337d..2248589 100644
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -99,6 +99,7 @@ int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr,
                                            const double *full_uub,
                                            const int *region_shape,
                                            const double *region_llb);
+int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers);
 
 /** Init GPU side data in GPUMeta. */
 // void init_fluid_meta_gpu(GPUMeta *gpu_meta);