diff --git a/common.mk b/common.mk
index 4aa0a903..89ebbea3 100644
--- a/common.mk
+++ b/common.mk
@@ -27,9 +27,14 @@ EXTRA_SIM_REQS ?=
 
 #----------------------------------------------------------------------------
 HELP_SIMULATION_VARIABLES += \
-" EXTRA_SIM_FLAGS = additional runtime simulation flags (passed within +permissive)"
+" EXTRA_SIM_FLAGS = additional runtime simulation flags (passed within +permissive)" \
+" NUMACTL = set to '1' to wrap simulator in the appropriate numactl command"
 
 EXTRA_SIM_FLAGS ?=
+NUMACTL ?= 0
+
+# simulator command prefix: empty when NUMACTL=0, otherwise whatever scripts/numa_prefix prints (recursive '=', so the script is re-run on every expansion)
+NUMA_PREFIX = $(if $(filter $(NUMACTL),0),,$(shell $(base_dir)/scripts/numa_prefix))
 
 #----------------------------------------------------------------------------
 HELP_COMMANDS += \
@@ -165,15 +170,15 @@ verilog: $(sim_vsrcs)
 
 # run normal binary with hardware-logged insn dissassembly
 run-binary: $(output_dir) $(sim)
-	(set -o pipefail && $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) $(PERMISSIVE_OFF) $(BINARY) </dev/null 2> >(spike-dasm > $(sim_out_name).out) | tee $(sim_out_name).log)
+	(set -o pipefail && $(NUMA_PREFIX) $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) $(PERMISSIVE_OFF) $(BINARY) </dev/null 2> >(spike-dasm > $(sim_out_name).out) | tee $(sim_out_name).log)
 
 # run simulator as fast as possible (no insn disassembly)
 run-binary-fast: $(output_dir) $(sim)
-	(set -o pipefail && $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(PERMISSIVE_OFF) $(BINARY) </dev/null | tee $(sim_out_name).log)
+	(set -o pipefail && $(NUMA_PREFIX) $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(PERMISSIVE_OFF) $(BINARY) </dev/null | tee $(sim_out_name).log)
 
 # run simulator with as much debug info as possible
 run-binary-debug: $(output_dir) $(sim_debug)
-	(set -o pipefail && $(sim_debug) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) $(WAVEFORM_FLAG) $(PERMISSIVE_OFF) $(BINARY) </dev/null 2> >(spike-dasm > $(sim_out_name).out) | tee $(sim_out_name).log)
+	(set -o pipefail && $(NUMA_PREFIX) $(sim_debug) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) $(WAVEFORM_FLAG) $(PERMISSIVE_OFF) $(BINARY) </dev/null 2> >(spike-dasm > $(sim_out_name).out) | tee $(sim_out_name).log)
 
 run-fast: run-asm-tests-fast run-bmark-tests-fast
 
@@ -209,10 +214,10 @@ $(output_dir)/%: $(RISCV)/riscv64-unknown-elf/share/riscv-tests/isa/% $(output_d
 	ln -sf $< $@
 
 $(output_dir)/%.run: $(output_dir)/% $(sim)
-	(set -o pipefail && $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(PERMISSIVE_OFF) $< </dev/null | tee $<.log) && touch $@
+	(set -o pipefail && $(NUMA_PREFIX) $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(PERMISSIVE_OFF) $< </dev/null | tee $<.log) && touch $@
 
 $(output_dir)/%.out: $(output_dir)/% $(sim)
-	(set -o pipefail && $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) $(PERMISSIVE_OFF) $< </dev/null 2> >(spike-dasm > $@) | tee $<.log)
+	(set -o pipefail && $(NUMA_PREFIX) $(sim) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) $(PERMISSIVE_OFF) $< </dev/null 2> >(spike-dasm > $@) | tee $<.log)
 
 #########################################################################################
 # include build/project specific makefrags made from the generator
# NOTE(review): the documentation hunk below is incomplete in this copy of the patch - its header announces 18 new lines but only the three added sentences survive; recover the missing lines from the original commit before applying.
diff --git a/docs/Simulation/Software-RTL-Simulation.rst b/docs/Simulation/Software-RTL-Simulation.rst
index af568ef4..6d7fb2b8 100644
--- a/docs/Simulation/Software-RTL-Simulation.rst
+++ b/docs/Simulation/Software-RTL-Simulation.rst
@@ -181,3 +181,18 @@ An open-source vcd-capable waveform viewer is `GTKWave
+The ``VERILATOR_THREADS=<num>`` option enables the compiled Verilator simulator to use ``<num>`` parallel threads.
+On a multi-socket machine, you will want to make sure all threads are on the same socket by using ``numactl``.
+You can also just use the ``numa_prefix`` wrapper, which is a simple wrapper around ``numactl`` that runs your verilated simulator like this: ``$(numa_prefix) ./simulator-<name> <simulator-args>``.
diff --git a/generators/utilities/src/main/resources/csrc/emulator.cc b/generators/utilities/src/main/resources/csrc/emulator.cc
index 27a8aa4a..dc8827a9 100644
--- a/generators/utilities/src/main/resources/csrc/emulator.cc
+++ b/generators/utilities/src/main/resources/csrc/emulator.cc
@@ -1,11 +1,15 @@
 // See LICENSE.SiFive for license details.
 // See LICENSE.Berkeley for license details.
 
-#include "verilated.h"
 #if VM_TRACE
 #include <memory>
+#if CY_FST_TRACE
+#include "verilated_fst_c.h"
+#else
+#include "verilated.h"
 #include "verilated_vcd_c.h"
-#endif
+#endif // CY_FST_TRACE
+#endif // VM_TRACE
 #include <fesvr/dtm.h>
 #include <fesvr/tsi.h>
 #include "remote_bitbang.h"
@@ -16,6 +20,8 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <getopt.h>
+// needed for s_vpi_vlog_info, which is needed for multithreading
+#include <vpi_user.h>
 
 // For option parsing, which is split across this file, Verilog, and
 // FESVR's HTIF, a few external files must be pulled in. The list of
@@ -35,6 +41,7 @@
 extern tsi_t* tsi;
 extern dtm_t* dtm;
 extern remote_bitbang_t * jtag;
+extern int dramsim;
 
 static uint64_t trace_count = 0;
 bool verbose = false;
@@ -50,6 +57,18 @@ double sc_time_stamp()
   return trace_count;
 }
 
+// need to pull htif_argc/htif_argv out here so the thread that calls tick()
+// for the HTIF device can initialize properly with the cmdline args. this
+// was pulled out here for multithreading to work
+static int htif_argc;
+static char **htif_argv = NULL;
+extern "C" int vpi_get_vlog_info(s_vpi_vlog_info *vlog_info_s)
+{
+  vlog_info_s->argc = htif_argc;
+  vlog_info_s->argv = htif_argv;
+  return 1;
+}
+
 static void usage(const char * program_name)
 {
   printf("Usage: %s [EMULATOR OPTION]... [VERILOG PLUSARG]... [HOST OPTION]... BINARY [TARGET OPTION]...\n",
@@ -113,34 +132,34 @@ int main(int argc, char** argv)
   // Port numbers are 16 bit unsigned integers.
   uint16_t rbb_port = 0;
 #if VM_TRACE
+  const char* vcdfile_name = NULL;
   FILE * vcdfile = NULL;
   uint64_t start = 0;
 #endif
   int verilog_plusargs_legal = 1;
 
-  opterr = 1;
+  dramsim = 0;
 
   while (1) {
     static struct option long_options[] = {
-      {"cycle-count", no_argument, 0, 'c' },
-      {"help", no_argument, 0, 'h' },
-      {"max-cycles", required_argument, 0, 'm' },
-      {"seed", required_argument, 0, 's' },
-      {"rbb-port", required_argument, 0, 'r' },
-      {"verbose", no_argument, 0, 'V' },
-      {"permissive", no_argument, 0, 'p' },
-      {"permissive-off", no_argument, 0, 'o' },
+      {"cycle-count", no_argument, 0, 'c' },
+      {"help", no_argument, 0, 'h' },
+      {"max-cycles", required_argument, 0, 'm' },
+      {"seed", required_argument, 0, 's' },
+      {"rbb-port", required_argument, 0, 'r' },
+      {"verbose", no_argument, 0, 'V' },
+      {"dramsim", no_argument, 0, 'D' },
 #if VM_TRACE
-      {"vcd", required_argument, 0, 'v' },
-      {"dump-start", required_argument, 0, 'x' },
+      {"vcd", required_argument, 0, 'v' },
+      {"dump-start", required_argument, 0, 'x' },
 #endif
       HTIF_LONG_OPTIONS
     };
     int option_index = 0;
 #if VM_TRACE
-    int c = getopt_long(argc, argv, "-chm:s:r:v:Vx:po", long_options, &option_index);
+    int c = getopt_long(argc, argv, "-chm:s:r:v:Vx:D", long_options, &option_index);
 #else
-    int c = getopt_long(argc, argv, "-chm:s:r:Vpo", long_options, &option_index);
+    int c = getopt_long(argc, argv, "-chm:s:r:VD", long_options, &option_index);
 #endif
     if (c == -1) break;
  retry:
@@ -153,10 +172,10 @@ int main(int argc, char** argv)
       case 's': random_seed = atoi(optarg); break;
       case 'r': rbb_port = atoi(optarg); break;
       case 'V': verbose = true; break;
-      case 'p': opterr = 0; break;
-      case 'o': opterr = 1; break;
+      case 'D': dramsim = 1; break;
 #if VM_TRACE
       case 'v': {
+        vcdfile_name = optarg;
         vcdfile = strcmp(optarg, "-") == 0 ? stdout : fopen(optarg, "w");
         if (!vcdfile) {
           std::cerr << "Unable to open " << optarg << " for VCD write\n";
@@ -188,10 +207,8 @@ int main(int argc, char** argv)
 #endif
         else if (arg.substr(0, 12) == "+cycle-count")
           c = 'c';
-        else if (arg == "+permissive")
-          c = 'p';
-        else if (arg == "+permissive-off")
-          c = 'o';
+        else if (arg == "+dramsim")
+          c = 'D';
         // If we don't find a legacy '+' EMULATOR argument, it still could be
         // a VERILOG_PLUSARG and not an error.
         else if (verilog_plusargs_legal) {
@@ -223,13 +240,9 @@ int main(int argc, char** argv)
             }
             htif_option++;
           }
-          if(opterr) {
-            std::cerr << argv[0] << ": invalid plus-arg (Verilog or HTIF) \""
-                      << arg << "\"\n";
-            c = '?';
-          } else {
-            c = 'p';
-          }
+          std::cerr << argv[0] << ": invalid plus-arg (Verilog or HTIF) \""
+                    << arg << "\"\n";
+          c = '?';
         }
         goto retry;
       }
@@ -251,6 +264,14 @@ done_processing:
     usage(argv[0]);
     return 1;
   }
+  htif_argc = 1 + argc - optind;  // program name + every argument getopt left unparsed
+  htif_argv = (char **) malloc((htif_argc) * sizeof (char *));
+  if (!htif_argv) {  // htif_argv[0] is written on the next line, so fail cleanly instead
+    fprintf(stderr, "%s: out of memory allocating HTIF argv\n", argv[0]);
+    return 1;
+  }
+  htif_argv[0] = argv[0];
+  for (int i = 1; optind < argc;) htif_argv[i++] = argv[optind++];
 
   if (verbose)
     fprintf(stderr, "using random seed %u\n", random_seed);
@@ -264,17 +285,17 @@ done_processing:
 
 #if VM_TRACE
   Verilated::traceEverOn(true); // Verilator must compute traced signals
+#if CY_FST_TRACE
+  std::unique_ptr<VerilatedFstC> tfp(new VerilatedFstC);
+#else
   std::unique_ptr<VerilatedVcdFILE> vcdfd(new VerilatedVcdFILE(vcdfile));
   std::unique_ptr<VerilatedVcdC> tfp(new VerilatedVcdC(vcdfd.get()));
-  if (vcdfile) {
+#endif // CY_FST_TRACE
+  if (vcdfile_name) {
     tile->trace(tfp.get(), 99); // Trace 99 levels of hierarchy
-    tfp->open("");
+    tfp->open(vcdfile_name);
   }
-#endif
-
-  jtag = new remote_bitbang_t(rbb_port);
-  dtm = new dtm_t(argc, argv);
-  tsi = new tsi_t(argc, argv);
+#endif // VM_TRACE
 
   signal(SIGTERM, handle_sigterm);
 
@@ -304,8 +325,7 @@ done_processing:
   tile->reset = 0;
   done_reset = true;
 
-  while (!dtm->done() && !jtag->done() && !tsi->done() &&
-         !tile->io_success && trace_count < max_cycles) {
+  do {
     tile->clock = 0;
     tile->eval();
 #if VM_TRACE
@@ -322,6 +342,13 @@ done_processing:
 #endif
     trace_count++;
   }
+  // for verilator multithreading. need to do 1 loop before checking if
+  // tsi exists, since tsi is created by verilated thread on the first
+  // serial_tick.
+  while ((!dtm || !dtm->done()) &&
+         (!jtag || !jtag->done()) &&
+         (!tsi || !tsi->done()) &&
+         !tile->io_success && trace_count < max_cycles);
 
 #if VM_TRACE
   if (tfp)
@@ -330,17 +357,17 @@ done_processing:
     fclose(vcdfile);
 #endif
 
-  if (dtm->exit_code())
+  if (dtm && dtm->exit_code())
   {
     fprintf(stderr, "*** FAILED *** via dtm (code = %d, seed %d) after %ld cycles\n", dtm->exit_code(), random_seed, trace_count);
     ret = dtm->exit_code();
   }
-  else if (tsi->exit_code())
+  else if (tsi && tsi->exit_code())
   {
     fprintf(stderr, "*** FAILED *** (code = %d, seed %d) after %ld cycles\n", tsi->exit_code(), random_seed, trace_count);
     ret = tsi->exit_code();
   }
-  else if (jtag->exit_code())
+  else if (jtag && jtag->exit_code())
   {
     fprintf(stderr, "*** FAILED *** via jtag (code = %d, seed %d) after %ld cycles\n", jtag->exit_code(), random_seed, trace_count);
     ret = jtag->exit_code();
@@ -359,5 +386,6 @@ done_processing:
   if (tsi) delete tsi;
   if (jtag) delete jtag;
   if (tile) delete tile;
+  if (htif_argv) free(htif_argv);
   return ret;
 }
diff --git a/scripts/numa_prefix b/scripts/numa_prefix
new file mode 100755
index 00000000..af2d2eb1
--- /dev/null
+++ b/scripts/numa_prefix
@@ -0,0 +1,67 @@
+#!/usr/bin/env perl
+
+#============================================================================
+# - really simple script, which just prints out the numactl cmd to
+#   prefix before your actual command. it determines this based on free
+#   memory size attached to every node.
+# - when you run this on a machine without `numactl`, the output is empty,
+#   so `$(numa_prefix) <cmd>` turns in to ` <cmd>`.
+# - when the machine has `numactl` installed, regardless of the socket-count
+#   on the machine, the resulting command is:
+#     `numactl -m <node> -C <cpus> -- <cmd>`
+# - example output from `numactl -H` on a 2 socket machine:
+#     available: 2 nodes (0-1)
+#     node 0 cpus: 0 2 4 6 8 10 12 14 16 18 20 22
+#     node 0 size: 131026 MB
+#     node 0 free: 7934 MB
+#     node 1 cpus: 1 3 5 7 9 11 13 15 17 19 21 23
+#     node 1 size: 65536 MB
+#     node 1 free: 429 MB
+#     node distances:
+#     node 0 1
+#     0: 10 20
+#     1: 20 10
+#============================================================================
+
+use strict;
+use warnings;
+
+my $path = `which numactl`;
+if(length($path) > 0) {
+  my ($head_line, @rest) = map {chomp; $_} `numactl -H`;
+
+  if($head_line =~ /available: (\d+) nodes/) {
+    my $node_count = $1;
+    my $best_node_id = undef;    # id of the node with the most free memory seen so far
+    my $best_cpus = undef;       # that node's cpu list, comma separated
+    my $best_free_size = undef;  # that node's free memory, as printed by numactl
+
+    # loop through available nodes, selecting the node with the most free mem
+    foreach my $num (1..$node_count) {
+      my $cpus_line = shift(@rest);
+      my $mem_size_line = shift(@rest);
+      my $mem_free_line = shift(@rest);
+
+      if($cpus_line =~ /node (\d+) cpus: (\d(?:.*\d)?)$/) {  # also accepts a node with a single one-digit cpu id
+        my ($node_id, $cpus) = ($1, $2);
+        $cpus =~ s/\s+/,/g;
+
+        if($mem_free_line =~ /node $node_id free: (\d+) \S+$/) {
+          my $free_size = $1;
+          if(!defined($best_free_size) || ($free_size > $best_free_size)) {
+            $best_node_id = $node_id;
+            $best_cpus = $cpus;
+            $best_free_size = $free_size;
+          }
+        } else {
+          die("malformed mem-free line: $mem_free_line\n");
+        }
+      } else {
+        die("malformed cpus line: $cpus_line\n");
+      }
+    }
+    print("numactl -m $best_node_id -C $best_cpus --");
+  } else {
+    die("malformed head line: $head_line\n");
+  }
+}
diff --git a/sims/verilator/Makefile b/sims/verilator/Makefile
index 295729b7..5ce75683 100644
--- a/sims/verilator/Makefile
+++ b/sims/verilator/Makefile
@@ -70,6 +70,9 @@ RUNTIME_PROFILING_VFLAGS := $(if $(filter $(VERILATOR_PROFILE),all),\
 	$(if $(filter $(VERILATOR_PROFILE),threads),\
 	--prof-threads,))
 
+VERILATOR_THREADS ?= 1
+RUNTIME_THREADS := --threads $(VERILATOR_THREADS) --threads-dpi all
+
 VERILATOR_FST_MODE ?= 0
 TRACING_OPTS := $(if $(filter $(VERILATOR_FST_MODE),0),\
 	--trace,--trace-fst --trace-threads 1)
@@ -122,6 +125,7 @@ PREPROC_DEFINES := \
 
 VERILATOR_NONCC_OPTS = \
 	$(RUNTIME_PROFILING_VFLAGS) \
+	$(RUNTIME_THREADS) \
 	$(VERILATOR_OPT_FLAGS) \
 	$(PLATFORM_OPTS) \
 	-Wno-fatal \
@@ -157,7 +161,6 @@ VERILATOR_CXXFLAGS = \
 
 VERILATOR_LDFLAGS = \
 	$(LDFLAGS) \
-	$(RUNTIME_PROFILING_CFLAGS) \
 	-L$(RISCV)/lib \
 	-Wl,-rpath,$(RISCV)/lib \
 	-L$(sim_dir) \
@@ -219,7 +222,7 @@ $(sim_debug): $(model_mk_debug) $(dramsim_lib)
 $(output_dir)/%.vpd: $(output_dir)/% $(sim_debug)
 	rm -f $@.vcd && mkfifo $@.vcd
 	vcd2vpd $@.vcd $@ > /dev/null &
-	(set -o pipefail && $(sim_debug) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) -v$@.vcd $(PERMISSIVE_OFF) $< </dev/null 2> >(spike-dasm > $<.out) | tee $<.log)
+	(set -o pipefail && $(NUMA_PREFIX) $(sim_debug) $(PERMISSIVE_ON) $(SIM_FLAGS) $(EXTRA_SIM_FLAGS) $(SEED_FLAG) $(VERBOSE_FLAGS) -v$@.vcd $(PERMISSIVE_OFF) $< </dev/null 2> >(spike-dasm > $<.out) | tee $<.log)
 
 #########################################################################################
 # general cleanup rules