Add EM GPU fast paths and defaults
This commit is contained in:
@@ -18,7 +18,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
#include "bssn_rhs_cuda.h"
|
#include "bssn_rhs_cuda.h"
|
||||||
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT
|
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_EM_CUDA_STATE_COUNT
|
||||||
#endif
|
#endif
|
||||||
#if USE_CUDA_Z4C
|
#if USE_CUDA_Z4C
|
||||||
#include "z4c_rhs_cuda.h"
|
#include "z4c_rhs_cuda.h"
|
||||||
@@ -181,8 +181,7 @@ bool cuda_build_bssn_host_views(Block *block,
|
|||||||
double **views)
|
double **views)
|
||||||
{
|
{
|
||||||
if (!block || !vars || !views ||
|
if (!block || !vars || !views ||
|
||||||
(state_count != BSSN_CUDA_STATE_COUNT &&
|
state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
|
||||||
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
|
|
||||||
return false;
|
return false;
|
||||||
MyList<var> *v = vars;
|
MyList<var> *v = vars;
|
||||||
for (int i = 0; i < state_count; ++i)
|
for (int i = 0; i < state_count; ++i)
|
||||||
@@ -200,8 +199,7 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
|
|||||||
double *soa_flat)
|
double *soa_flat)
|
||||||
{
|
{
|
||||||
if (!vars || !soa_flat ||
|
if (!vars || !soa_flat ||
|
||||||
(state_count != BSSN_CUDA_STATE_COUNT &&
|
state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
|
||||||
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
|
|
||||||
return false;
|
return false;
|
||||||
MyList<var> *v = vars;
|
MyList<var> *v = vars;
|
||||||
for (int i = 0; i < state_count; ++i)
|
for (int i = 0; i < state_count; ++i)
|
||||||
@@ -322,7 +320,7 @@ bool cuda_state_count_direct_supported(int state_count)
|
|||||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
return state_count == Z4C_CUDA_STATE_COUNT;
|
return state_count == Z4C_CUDA_STATE_COUNT;
|
||||||
#elif USE_CUDA_BSSN
|
#elif USE_CUDA_BSSN
|
||||||
return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT;
|
return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT;
|
||||||
#else
|
#else
|
||||||
(void)state_count;
|
(void)state_count;
|
||||||
return false;
|
return false;
|
||||||
@@ -550,7 +548,8 @@ bool cuda_uncached_device_buffers_enabled(int state_count)
|
|||||||
}
|
}
|
||||||
if (!enabled)
|
if (!enabled)
|
||||||
return false;
|
return false;
|
||||||
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)
|
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT &&
|
||||||
|
state_count != BSSN_EM_CUDA_STATE_COUNT)
|
||||||
return false;
|
return false;
|
||||||
return cuda_aware_mpi_enabled();
|
return cuda_aware_mpi_enabled();
|
||||||
#else
|
#else
|
||||||
@@ -6136,6 +6135,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||||
int Symmetry, SyncCache &cache)
|
int Symmetry, SyncCache &cache)
|
||||||
{
|
{
|
||||||
|
const double t_transfer = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
int myrank;
|
int myrank;
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
|
MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
@@ -6324,6 +6324,13 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
else
|
else
|
||||||
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
}
|
}
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
{
|
||||||
|
SyncProfileStats &stats = sync_profile_stats();
|
||||||
|
stats.finish_calls++;
|
||||||
|
stats.finish_sec += MPI_Wtime() - t_transfer;
|
||||||
|
sync_profile_maybe_log();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache)
|
void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache)
|
||||||
{
|
{
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -438,7 +438,7 @@ int count_bssn_cuda_state_list(MyList<var> *vars)
|
|||||||
{
|
{
|
||||||
++count;
|
++count;
|
||||||
vars = vars->next;
|
vars = vars->next;
|
||||||
if (count > BSSN_ESCALAR_CUDA_STATE_COUNT)
|
if (count > BSSN_EM_CUDA_STATE_COUNT)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
return count;
|
return count;
|
||||||
@@ -449,8 +449,7 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
|
|||||||
double **host_views)
|
double **host_views)
|
||||||
{
|
{
|
||||||
if (!cg || !host_views ||
|
if (!cg || !host_views ||
|
||||||
(state_count != BSSN_CUDA_STATE_COUNT &&
|
state_count <= 0 || state_count > BSSN_EM_CUDA_STATE_COUNT)
|
||||||
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
|
|
||||||
return false;
|
return false;
|
||||||
int idx = 0;
|
int idx = 0;
|
||||||
while (vars && idx < state_count)
|
while (vars && idx < state_count)
|
||||||
@@ -742,7 +741,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
|||||||
Block *cg = BP->data;
|
Block *cg = BP->data;
|
||||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||||
{
|
{
|
||||||
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
|
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
|
||||||
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
||||||
{
|
{
|
||||||
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
|
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
|
||||||
@@ -750,7 +749,9 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
|||||||
}
|
}
|
||||||
const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
|
const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
|
||||||
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
|
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
|
||||||
: bssn_cuda_download_resident_state(cg, cg->shape, state_out);
|
: ((state_count == BSSN_CUDA_STATE_COUNT)
|
||||||
|
? bssn_cuda_download_resident_state(cg, cg->shape, state_out)
|
||||||
|
: bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count));
|
||||||
if (rc)
|
if (rc)
|
||||||
{
|
{
|
||||||
cout << "CUDA resident state download failed" << endl;
|
cout << "CUDA resident state download failed" << endl;
|
||||||
@@ -779,7 +780,7 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
|
|||||||
Block *cg = BP->data;
|
Block *cg = BP->data;
|
||||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||||
{
|
{
|
||||||
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
|
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
|
||||||
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
||||||
{
|
{
|
||||||
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
|
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,8 @@ extern "C" {
|
|||||||
enum {
|
enum {
|
||||||
BSSN_CUDA_STATE_COUNT = 24,
|
BSSN_CUDA_STATE_COUNT = 24,
|
||||||
BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
|
BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
|
||||||
|
BSSN_EM_CUDA_STATE_COUNT = 32,
|
||||||
|
BSSN_EM_CUDA_SOURCE_COUNT = 4,
|
||||||
BSSN_CUDA_MATTER_COUNT = 10
|
BSSN_CUDA_MATTER_COUNT = 10
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -82,6 +84,28 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double
|
|||||||
int &Lev,
|
int &Lev,
|
||||||
double &eps);
|
double &eps);
|
||||||
|
|
||||||
|
int bssn_em_cuda_rk4_substep(void *block_tag,
|
||||||
|
int *ex, double *X, double *Y, double *Z,
|
||||||
|
double **state_host_in,
|
||||||
|
double **state_host_out,
|
||||||
|
double **source_host,
|
||||||
|
const double *propspeed,
|
||||||
|
const double *soa_flat,
|
||||||
|
const double *bbox,
|
||||||
|
double &dT,
|
||||||
|
double &T,
|
||||||
|
int &RK4,
|
||||||
|
int &apply_bam_bc,
|
||||||
|
int &Symmetry,
|
||||||
|
int &Lev,
|
||||||
|
double &eps,
|
||||||
|
int &co,
|
||||||
|
int &keep_resident_state,
|
||||||
|
int &apply_enforce_ga,
|
||||||
|
double &chitiny);
|
||||||
|
|
||||||
|
int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
|
||||||
|
|
||||||
int bssn_cuda_copy_state_region_to_host(void *block_tag,
|
int bssn_cuda_copy_state_region_to_host(void *block_tag,
|
||||||
int state_index,
|
int state_index,
|
||||||
double *host_state,
|
double *host_state,
|
||||||
|
|||||||
@@ -150,8 +150,11 @@ def _gpu_runtime_env():
|
|||||||
"AMSS_CUDA_KEEP_ALL_LEVELS": "1",
|
"AMSS_CUDA_KEEP_ALL_LEVELS": "1",
|
||||||
"AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP": "1",
|
"AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP": "1",
|
||||||
"AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS": "1",
|
"AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS": "1",
|
||||||
|
"AMSS_CUDA_EM_CACHE_SOURCES": "1",
|
||||||
|
"AMSS_CUDA_EM_ZERO_FASTPATH": "1",
|
||||||
|
"AMSS_EM_ZERO_ANALYSIS_FASTPATH": "1",
|
||||||
"AMSS_CUDA_AMR_HOST_STAGED": "1",
|
"AMSS_CUDA_AMR_HOST_STAGED": "1",
|
||||||
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
|
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "0",
|
||||||
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
|
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
|
||||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
|
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
|
||||||
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
|
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
|
||||||
@@ -287,6 +290,9 @@ def run_ABE():
|
|||||||
print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
|
print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
|
||||||
print(f" AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP', '')}")
|
print(f" AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP', '')}")
|
||||||
print(f" AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS', '')}")
|
print(f" AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS', '')}")
|
||||||
|
print(f" AMSS_CUDA_EM_CACHE_SOURCES={mpi_env.get('AMSS_CUDA_EM_CACHE_SOURCES', '')}")
|
||||||
|
print(f" AMSS_CUDA_EM_ZERO_FASTPATH={mpi_env.get('AMSS_CUDA_EM_ZERO_FASTPATH', '')}")
|
||||||
|
print(f" AMSS_EM_ZERO_ANALYSIS_FASTPATH={mpi_env.get('AMSS_EM_ZERO_ANALYSIS_FASTPATH', '')}")
|
||||||
print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
|
print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
|
||||||
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
|
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
|
||||||
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
|
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
|
||||||
|
|||||||
Reference in New Issue
Block a user