Select Git revision
run_core_bench.sh 2.24 KiB
#!/bin/sh
# Slurm batch script that runs the Maestro core_bench benchmark.
# The "#SBATCH" lines below are read by sbatch as submission options; they must
# stay ahead of the first executable command in the script.
#
# Resource request: 3 nodes and 3 tasks in total.
#SBATCH -N 3
#SBATCH -n 3
# Wall-clock limit of 10 minutes.
#SBATCH --time=00:10:00
#SBATCH --job-name=mstro_core_bench
# stdout and stderr go to separate files; %j is replaced by the job ID.
#SBATCH --output=job.core_bench.%j.out
#SBATCH --error=job.core_bench.%j.err
# Do not use extra hardware threads (in-core multithreading).
#SBATCH --hint=nomultithread
# Do not share the allocated nodes with other jobs.
#SBATCH --exclusive
# At most one task per node; with 3 tasks on 3 nodes this gives one task per node.
#SBATCH --ntasks-per-node 1
# core_bench is an MPI program in which one rank acts as the pool manager and the other ranks are producers and consumers.
# It tests the bandwidth of moving data (core data objects, i.e. CDOs) between producers and consumers across nodes,
# hence --ntasks-per-node 1 is needed to make sure producers and consumers are on different nodes.
# It can also run on a single node, where it will use the local pool manager and test data movement between producer
# and consumer threads.
# ---- Application (OpenMP) threads -------------------------------------------
# Number of producer and consumer threads.
OMP_NUM_THREADS=4
# Pinning of the application threads: a single place holding CPUs 0-3, with
# threads bound close to the parent thread.
OMP_PLACES='{0,1,2,3}'
OMP_PROC_BIND=close
# Debug aid for checking the resulting OpenMP pinning.
CRAY_OMP_CHECK_AFFINITY=TRUE
export OMP_NUM_THREADS OMP_PLACES OMP_PROC_BIND CRAY_OMP_CHECK_AFFINITY

# ---- Maestro threads ---------------------------------------------------------
# Pinning of Maestro's own threads.
MSTRO_BIND_PM_PC=8
MSTRO_BIND_TRANSPORT_THREAD=7
MSTRO_BIND_CQ_HANDLER=4
MSTRO_BIND_OP_THREAD=5-6
export MSTRO_BIND_PM_PC MSTRO_BIND_TRANSPORT_THREAD
export MSTRO_BIND_CQ_HANDLER MSTRO_BIND_OP_THREAD

# Number of completion queue handler threads, and (presumably) of operation
# threads.
MSTRO_OFI_CQ_NUM_THREADS=1
MSTRO_OPERATIONS_NUM_THREADS=1
MSTRO_LOG_LEVEL=0
export MSTRO_OFI_CQ_NUM_THREADS MSTRO_OPERATIONS_NUM_THREADS MSTRO_LOG_LEVEL

# ---- Transport ---------------------------------------------------------------
# FI provider, e.g. sockets, gni, verbs, cxi.
FI_PROVIDER=cxi
# Maestro transport method; options are RDMA, GFS, MIO.
MSTRO_TRANSPORT_DEFAULT=RDMA
MPICH_MAX_THREAD_SAFETY=multiple
export FI_PROVIDER MSTRO_TRANSPORT_DEFAULT MPICH_MAX_THREAD_SAFETY

# ---- Consumer mode -----------------------------------------------------------
# core_bench supports multiple consumer modes:
#   MSTRO_CONSUMER_SINK_ALL  one consumer sinks in all data from all producers
#   MSTRO_CONSUMER_ONE2ONE   #consumers == #producers; each consumer is assigned
#                            CDOs from one producer
#   MSTRO_CONSUMER_ONE2TEN   one consumer sinks data from 10 producers;
#                            #producers = 10 * #consumers
#   MSTRO_CONSUMER_ALL2ALL   each consumer sinks data from all producers
MSTRO_CONSUMER_MODE=MSTRO_CONSUMER_SINK_ALL
export MSTRO_CONSUMER_MODE
# total number of ranks = number of producer ranks + number of consumer ranks + 1 (pool manager rank)
# number of producers = total number of ranks - #consumers - 1 (pool manager)
#
# core_bench command-line arguments, in the order the program expects them:
#   <#attributes> <attribute size> <#CDOs/thread> <#consumers> <CDO size in bytes>
num_attributes=0
attribute_size=0
cdos_per_thread=20
num_consumers=1
cdo_size_bytes=671088640 # 640 MiB

# launch_core_bench: start the benchmark as a job step, one task per node.
# Globals read: num_attributes, attribute_size, cdos_per_thread, num_consumers,
#               cdo_size_bytes, SLURM_JOB_NUM_NODES
# Returns:      the exit status of srun
#
# The node count follows the actual allocation (SLURM_JOB_NUM_NODES, set by
# Slurm inside a batch job), so overriding -N/-n at submit time keeps the job
# step consistent with the allocation. It falls back to 3, the value requested
# in the #SBATCH header, when the variable is unset or empty.
# -c 128 requests 128 CPUs per task; --cpu-bind=v reports the CPU binding.
launch_core_bench() {
  srun --exclusive -c 128 --cpu-bind=v --ntasks-per-node 1 \
    -N "${SLURM_JOB_NUM_NODES:-3}" \
    ./core_bench \
    "$num_attributes" "$attribute_size" "$cdos_per_thread" \
    "$num_consumers" "$cdo_size_bytes"
}

# Last command of the script, so srun's exit status becomes the job's status.
launch_core_bench