Tune GPU RHS launch geometry
This commit is contained in:
@@ -64,6 +64,29 @@ int read_forced_device_from_env()
|
||||
return -1;
|
||||
}
|
||||
|
||||
int read_positive_env_value(const char *key)
|
||||
{
|
||||
const char *value = getenv(key);
|
||||
if (!value || !*value)
|
||||
return -1;
|
||||
|
||||
const int parsed = atoi(value);
|
||||
return parsed > 0 ? parsed : -1;
|
||||
}
|
||||
|
||||
int clamp_launch_block_dim(int requested, int max_threads_per_block)
|
||||
{
|
||||
if (requested <= 0)
|
||||
return 0;
|
||||
if (max_threads_per_block > 0 && requested > max_threads_per_block)
|
||||
requested = max_threads_per_block;
|
||||
|
||||
requested = (requested / 32) * 32;
|
||||
if (requested <= 0)
|
||||
requested = max_threads_per_block >= 32 ? 32 : max_threads_per_block;
|
||||
return requested > 0 ? requested : BLOCK_DIM;
|
||||
}
|
||||
|
||||
int select_cuda_device_for_process(int mpi_rank)
|
||||
{
|
||||
static int cached_device = -2;
|
||||
@@ -141,6 +164,19 @@ struct GpuRhsCache
|
||||
int mapped_buffer_count = 0;
|
||||
};
|
||||
|
||||
struct GpuRhsLaunchConfig
|
||||
{
|
||||
int device = -1;
|
||||
int sm_count = 0;
|
||||
int max_threads_per_block = 1024;
|
||||
int grid_dim = GRID_DIM;
|
||||
int block_dim = BLOCK_DIM;
|
||||
int step_size = GRID_DIM * BLOCK_DIM;
|
||||
int env_grid_dim = -1;
|
||||
int env_block_dim = -1;
|
||||
bool env_loaded = false;
|
||||
};
|
||||
|
||||
struct ExternalBufferRegistry
|
||||
{
|
||||
static const int max_mapped_buffers = 4096;
|
||||
@@ -175,6 +211,12 @@ GpuRhsCache &gpu_rhs_cache()
|
||||
return cache;
|
||||
}
|
||||
|
||||
GpuRhsLaunchConfig &gpu_rhs_launch_config()
|
||||
{
|
||||
static GpuRhsLaunchConfig config;
|
||||
return config;
|
||||
}
|
||||
|
||||
ExternalBufferRegistry &external_buffer_registry()
|
||||
{
|
||||
static thread_local ExternalBufferRegistry registry;
|
||||
@@ -198,6 +240,16 @@ void reset_meta(Meta *meta)
|
||||
memset(meta, 0, sizeof(Meta));
|
||||
}
|
||||
|
||||
int gpu_rhs_grid_dim()
|
||||
{
|
||||
return gpu_rhs_launch_config().grid_dim;
|
||||
}
|
||||
|
||||
int gpu_rhs_block_dim()
|
||||
{
|
||||
return gpu_rhs_launch_config().block_dim;
|
||||
}
|
||||
|
||||
void reset_buffer_map(GpuRhsCache &cache)
|
||||
{
|
||||
cache.mapped_buffer_count = 0;
|
||||
@@ -652,6 +704,74 @@ void ensure_gpu_rhs_invariant_symbols()
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
bool ensure_gpu_rhs_launch_symbols(int device, int matrix_size)
|
||||
{
|
||||
GpuRhsLaunchConfig &config = gpu_rhs_launch_config();
|
||||
if (!config.env_loaded)
|
||||
{
|
||||
config.env_grid_dim = read_positive_env_value("AMSS_GPU_GRID_DIM");
|
||||
config.env_block_dim = read_positive_env_value("AMSS_GPU_BLOCK_DIM");
|
||||
config.env_loaded = true;
|
||||
}
|
||||
|
||||
if (config.device != device || config.sm_count <= 0)
|
||||
{
|
||||
cudaDeviceProp prop;
|
||||
cudaError_t err = cudaGetDeviceProperties(&prop, device);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
cerr << "cudaGetDeviceProperties(" << device << ") failed: "
|
||||
<< cudaGetErrorString(err) << endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
config.device = device;
|
||||
config.sm_count = prop.multiProcessorCount;
|
||||
config.max_threads_per_block = prop.maxThreadsPerBlock;
|
||||
}
|
||||
|
||||
int block_dim = clamp_launch_block_dim(config.env_block_dim > 0 ? config.env_block_dim : 256,
|
||||
config.max_threads_per_block);
|
||||
if (block_dim <= 0)
|
||||
block_dim = BLOCK_DIM;
|
||||
|
||||
int grid_dim = 1;
|
||||
if (config.env_grid_dim > 0)
|
||||
{
|
||||
grid_dim = config.env_grid_dim;
|
||||
}
|
||||
else
|
||||
{
|
||||
int needed_blocks = (matrix_size + block_dim - 1) / block_dim;
|
||||
int grid_cap = config.sm_count > 0 ? config.sm_count * 4 : GRID_DIM;
|
||||
if (grid_cap < 64)
|
||||
grid_cap = 64;
|
||||
if (grid_cap > 512)
|
||||
grid_cap = 512;
|
||||
grid_dim = needed_blocks < grid_cap ? needed_blocks : grid_cap;
|
||||
}
|
||||
|
||||
if (grid_dim <= 0)
|
||||
grid_dim = 1;
|
||||
|
||||
const int step_size = grid_dim * block_dim;
|
||||
if (config.step_size != step_size)
|
||||
{
|
||||
cudaError_t err = cudaMemcpyToSymbol(STEP_SIZE, &step_size, sizeof(int));
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
cerr << "cudaMemcpyToSymbol(STEP_SIZE) failed: "
|
||||
<< cudaGetErrorString(err) << endl;
|
||||
return false;
|
||||
}
|
||||
config.step_size = step_size;
|
||||
}
|
||||
|
||||
config.grid_dim = grid_dim;
|
||||
config.block_dim = block_dim;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
|
||||
{
|
||||
register_gpu_rhs_cleanup();
|
||||
@@ -1318,7 +1438,7 @@ __global__ void enforce_ga(double * trA){
|
||||
}
|
||||
|
||||
inline void sub_enforce_ga(double *trA, int matrix_size){
|
||||
enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
|
||||
enforce_ga<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(trA);
|
||||
cudaMemset(trA,0,matrix_size * sizeof(double));
|
||||
|
||||
//cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
|
||||
@@ -1508,10 +1628,10 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
|
||||
#endif //ifdef Cell
|
||||
#endif //ifdef Vertex
|
||||
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
|
||||
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
|
||||
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
|
||||
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
|
||||
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
|
||||
sub_symmetry_bd_partF<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc);
|
||||
sub_symmetry_bd_partI<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc,SoA[0]);
|
||||
sub_symmetry_bd_partJ<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc,SoA[1]);
|
||||
sub_symmetry_bd_partK<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(ord,func,funcc,SoA[2]);
|
||||
}
|
||||
|
||||
|
||||
@@ -1610,7 +1730,7 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
|
||||
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
|
||||
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
|
||||
sub_fdderivs_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
|
||||
}
|
||||
|
||||
__global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
|
||||
@@ -1675,7 +1795,7 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
|
||||
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
|
||||
|
||||
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
|
||||
sub_fderivs_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,fx,fy,fz);
|
||||
}
|
||||
|
||||
__global__ void computeRicci_part1(double * dst)
|
||||
@@ -1693,7 +1813,7 @@ __global__ void computeRicci_part1(double * dst)
|
||||
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
|
||||
{
|
||||
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
|
||||
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
|
||||
computeRicci_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(dst);
|
||||
|
||||
}/*Exception*/
|
||||
|
||||
@@ -1750,7 +1870,7 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
|
||||
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
|
||||
{
|
||||
sub_symmetry_bd(3,f,fh,SoA);
|
||||
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
|
||||
sub_kodis_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,f_rhs);
|
||||
}
|
||||
|
||||
__global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
|
||||
@@ -1841,7 +1961,7 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
|
||||
|
||||
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
|
||||
sub_symmetry_bd(3,f,fh,SoA);
|
||||
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
|
||||
sub_lopsided_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
|
||||
}
|
||||
|
||||
__global__ void compute_rhs_bssn_part1()
|
||||
@@ -3246,6 +3366,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
GpuRhsCache &cache = gpu_rhs_cache();
|
||||
if (!prepare_gpu_rhs_cache(cache, device, ex))
|
||||
return 1;
|
||||
if (!ensure_gpu_rhs_launch_symbols(device, matrix_size))
|
||||
return 1;
|
||||
Meta * meta = &cache.meta;
|
||||
const int effective_co = (calledby == CALLED_BY_STEP) ? 1 : co;
|
||||
|
||||
@@ -3922,7 +4044,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
//4.0------enforce_ga---------
|
||||
sub_enforce_ga(Mh_ chin1, matrix_size);
|
||||
//4.1-----compute rhs---------
|
||||
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part1<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
|
||||
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
|
||||
sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
|
||||
@@ -3937,7 +4059,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa);
|
||||
sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
|
||||
|
||||
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part2<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
|
||||
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
|
||||
sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
|
||||
@@ -3946,7 +4068,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas);
|
||||
sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
|
||||
|
||||
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part3<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
|
||||
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
|
||||
computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
|
||||
@@ -3955,19 +4077,19 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
|
||||
computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
|
||||
|
||||
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part4<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
|
||||
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
|
||||
|
||||
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part5<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
|
||||
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
|
||||
|
||||
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part6<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
|
||||
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
|
||||
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
|
||||
compute_rhs_bssn_part6_gauge<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part6_gauge<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
#endif
|
||||
|
||||
sub_lopsided(Mh_ gxx,Mh_ fh2,Mh_ gxx_rhs,Mh_ betax,Mh_ betay,Mh_ betaz,sss);
|
||||
@@ -4034,7 +4156,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
}
|
||||
|
||||
if(effective_co == 0){
|
||||
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part7<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
|
||||
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
|
||||
sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
|
||||
@@ -4042,7 +4164,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss);
|
||||
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
|
||||
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
|
||||
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
compute_rhs_bssn_part8<<<gpu_rhs_grid_dim(), gpu_rhs_block_dim()>>>();
|
||||
}
|
||||
|
||||
#if (ABV == 1)
|
||||
|
||||
Reference in New Issue
Block a user