Tune GPU RHS launch geometry

This commit is contained in:
2026-04-12 18:59:59 +08:00
parent db2d6978b2
commit ce88c18265

View File

@@ -64,6 +64,29 @@ int read_forced_device_from_env()
return -1;
}
int read_positive_env_value(const char *key)
{
const char *value = getenv(key);
if (!value || !*value)
return -1;
const int parsed = atoi(value);
return parsed > 0 ? parsed : -1;
}
int clamp_launch_block_dim(int requested, int max_threads_per_block)
{
if (requested <= 0)
return 0;
if (max_threads_per_block > 0 && requested > max_threads_per_block)
requested = max_threads_per_block;
requested = (requested / 32) * 32;
if (requested <= 0)
requested = max_threads_per_block >= 32 ? 32 : max_threads_per_block;
return requested > 0 ? requested : BLOCK_DIM;
}
int select_cuda_device_for_process(int mpi_rank)
{
static int cached_device = -2;
@@ -141,6 +164,19 @@ struct GpuRhsCache
int mapped_buffer_count = 0;
};
struct GpuRhsLaunchConfig
{
int device = -1;
int sm_count = 0;
int max_threads_per_block = 1024;
int grid_dim = GRID_DIM;
int block_dim = BLOCK_DIM;
int step_size = GRID_DIM * BLOCK_DIM;
int env_grid_dim = -1;
int env_block_dim = -1;
bool env_loaded = false;
};
struct ExternalBufferRegistry
{
static const int max_mapped_buffers = 4096;
@@ -175,6 +211,12 @@ GpuRhsCache &gpu_rhs_cache()
return cache;
}
GpuRhsLaunchConfig &gpu_rhs_launch_config()
{
static GpuRhsLaunchConfig config;
return config;
}
ExternalBufferRegistry &external_buffer_registry()
{
static thread_local ExternalBufferRegistry registry;
@@ -198,6 +240,16 @@ void reset_meta(Meta *meta)
memset(meta, 0, sizeof(Meta));
}
int gpu_rhs_grid_dim()
{
return gpu_rhs_launch_config().grid_dim;
}
int gpu_rhs_block_dim()
{
return gpu_rhs_launch_config().block_dim;
}
void reset_buffer_map(GpuRhsCache &cache)
{
cache.mapped_buffer_count = 0;
@@ -652,6 +704,74 @@ void ensure_gpu_rhs_invariant_symbols()
initialized = true;
}
bool ensure_gpu_rhs_launch_symbols(int device, int matrix_size)
{
GpuRhsLaunchConfig &config = gpu_rhs_launch_config();
if (!config.env_loaded)
{
config.env_grid_dim = read_positive_env_value("AMSS_GPU_GRID_DIM");
config.env_block_dim = read_positive_env_value("AMSS_GPU_BLOCK_DIM");
config.env_loaded = true;
}
if (config.device != device || config.sm_count <= 0)
{
cudaDeviceProp prop;
cudaError_t err = cudaGetDeviceProperties(&prop, device);
if (err != cudaSuccess)
{
cerr << "cudaGetDeviceProperties(" << device << ") failed: "
<< cudaGetErrorString(err) << endl;
return false;
}
config.device = device;
config.sm_count = prop.multiProcessorCount;
config.max_threads_per_block = prop.maxThreadsPerBlock;
}
int block_dim = clamp_launch_block_dim(config.env_block_dim > 0 ? config.env_block_dim : 256,
config.max_threads_per_block);
if (block_dim <= 0)
block_dim = BLOCK_DIM;
int grid_dim = 1;
if (config.env_grid_dim > 0)
{
grid_dim = config.env_grid_dim;
}
else
{
int needed_blocks = (matrix_size + block_dim - 1) / block_dim;
int grid_cap = config.sm_count > 0 ? config.sm_count * 4 : GRID_DIM;
if (grid_cap < 64)
grid_cap = 64;
if (grid_cap > 512)
grid_cap = 512;
grid_dim = needed_blocks < grid_cap ? needed_blocks : grid_cap;
}
if (grid_dim <= 0)
grid_dim = 1;
const int step_size = grid_dim * block_dim;
if (config.step_size != step_size)
{
cudaError_t err = cudaMemcpyToSymbol(STEP_SIZE, &step_size, sizeof(int));
if (err != cudaSuccess)
{
cerr << "cudaMemcpyToSymbol(STEP_SIZE) failed: "
<< cudaGetErrorString(err) << endl;
return false;
}
config.step_size = step_size;
}
config.grid_dim = grid_dim;
config.block_dim = block_dim;
return true;
}
bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
{
register_gpu_rhs_cleanup();
@@ -1318,7 +1438,7 @@ __global__ void enforce_ga(double * trA){
}
inline void sub_enforce_ga(double *trA, int matrix_size){
enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
enforce_ga<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(trA);
cudaMemset(trA,0,matrix_size * sizeof(double));
//cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
@@ -1508,10 +1628,10 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
#endif //ifdef Cell
#endif //ifdef Vertex
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
sub_symmetry_bd_partF<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc);
sub_symmetry_bd_partI<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc,SoA[0]);
sub_symmetry_bd_partJ<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc,SoA[1]);
sub_symmetry_bd_partK<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc,SoA[2]);
}
@@ -1610,7 +1730,7 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
sub_fdderivs_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
}
__global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
@@ -1675,7 +1795,7 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
sub_fderivs_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,fx,fy,fz);
}
__global__ void computeRicci_part1(double * dst)
@@ -1693,7 +1813,7 @@ __global__ void computeRicci_part1(double * dst)
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
computeRicci_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(dst);
}/*Exception*/
@@ -1750,7 +1870,7 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
{
sub_symmetry_bd(3,f,fh,SoA);
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
sub_kodis_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,f_rhs);
}
__global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
@@ -1841,7 +1961,7 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
sub_symmetry_bd(3,f,fh,SoA);
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
sub_lopsided_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
}
__global__ void compute_rhs_bssn_part1()
@@ -3246,6 +3366,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
GpuRhsCache &cache = gpu_rhs_cache();
if (!prepare_gpu_rhs_cache(cache, device, ex))
return 1;
if (!ensure_gpu_rhs_launch_symbols(device, matrix_size))
return 1;
Meta * meta = &cache.meta;
const int effective_co = (calledby == CALLED_BY_STEP) ? 1 : co;
@@ -3922,7 +4044,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//4.0------enforce_ga---------
sub_enforce_ga(Mh_ chin1, matrix_size);
//4.1-----compute rhs---------
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -3937,7 +4059,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa);
sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part2<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -3946,7 +4068,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas);
sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part3<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -3955,19 +4077,19 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part4<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part5<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part6<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
compute_rhs_bssn_part6_gauge<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part6_gauge<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
#endif
sub_lopsided(Mh_ gxx,Mh_ fh2,Mh_ gxx_rhs,Mh_ betax,Mh_ betay,Mh_ betaz,sss);
@@ -4034,7 +4156,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
}
if(effective_co == 0){
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part7<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -4042,7 +4164,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss);
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
compute_rhs_bssn_part8<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
}
#if (ABV == 1)