blob: 30b37a36b0f73b9462f40b68a92937c42fca55df [file] [log] [blame]
#!/bin/bash
# **************************************************************************
# Function: Wrapper that helps launching Intel MPI jobs within SLURM
# using MICs in native mode.
# mpiexec.hydra needs passwordless ssh access to all involved nodes
# Version: 0.4
#---------------------------------------------------------------------------
# 11.10.2013 Created by Chrysovalantis Paschoulas, Juelich Supercomputing Centre - Forschungszentrum Juelich
# Initial Script by (C) Olli-Pekka Lehto - CSC IT Center for Science Ltd.
# **************************************************************************
# Usage message.
# Help text printed for -h and whenever the command line is rejected.
# The string is double-quoted, so $(basename "$0") is expanded once, at
# assignment time, to the name this script was invoked as; the \" sequences
# produce literal double quotes in the printed text.
USAGE="
USAGE
$(basename "$0") [ [-h] | [-v] [-x <host num tasks> -c <host binary>] [-z <mic num tasks> -m <mic binary>] ]
OPTIONS
-h Print this message.
-c Binary that will run on host nodes. If it is not set then only the MICs will be used.
-m Binary that will run inside the MICs.
-x Number of tasks (MPI ranks) for the host nodes. Default value is 1.
-z Number of tasks (MPI ranks) for the MICs. Default value is 1.
-v Show more info for this script.
--tv Run using TotalView (equivalent to export MPIEXEC_PREFIX=\"totalview -args\").
--tvcli Run using TotalView cli (equivalent to export MPIEXEC_PREFIX=\"totalviewcli -args\")
MORE INFO
The user MUST export the following environment variables:
MIC_NUM_PER_HOST Number of MICs on each host that will be used by mpiexec. Available options: 0, 1, 2. Default 2.
OMP_NUM_THREADS OpenMP threads number per task on hosts. This MUST be exported when OpenMP is used!
MIC_OMP_NUM_THREADS OpenMP threads number per task on MICs. If not defined then is set same as OMP_NUM_THREADS.
Also the user MAY pass additional flags to mpiexec exporting the following env vars:
MPIEXEC_PREFIX Wrap the execution of mpiexec with another tool (e.g. totalview).
MPIEXEC_FLAGS_HOST Flags that will be passed to the hosts.
MPIEXEC_FLAGS_MIC Flags that will be passed to the MICs.
-- Examples:
export MPIEXEC_PREFIX=\"totalview -args\"
export MPIEXEC_PREFIX=\"totalviewcli -args\"
export MPIEXEC_FLAGS_HOST=\"-env VAR VALUE\"
export MPIEXEC_FLAGS_MIC=\"-envlist VAR1,VAR2\"
EXAMPLES
Batch Script1 - Only hosts:
---
#!/bin/bash
#SBATCH -J TestJobMICNativeHybrid
#SBATCH -N 4
#SBATCH -p q_mics
#SBATCH -o TestJob-%j.out
#SBATCH -e TestJob-%j.err
#SBATCH --time=30
module purge
module load impi intel/13.1.3
export MIC_NUM_PER_HOST=0
export OMP_NUM_THREADS=32
mpirun-mic -x 1 -c ./impi_native_hybrid
---
Batch Script2 - Only mics:
---
#!/bin/bash
#SBATCH -J TestJobMICNativeHybrid
#SBATCH -N 4
#SBATCH -p q_mics
#SBATCH -o TestJob-%j.out
#SBATCH -e TestJob-%j.err
#SBATCH --time=30
module purge
module load impi intel/13.1.3
export MIC_NUM_PER_HOST=2
export MIC_OMP_NUM_THREADS=240
mpirun-mic -z 1 -m ./impi_native_hybrid.mic
---
Batch Script3 - Hosts and MICs:
---
#!/bin/bash
#SBATCH -J TestJobMICNativeHybrid
#SBATCH -N 2
#SBATCH -p q_mics
#SBATCH -o TestJob-%j.out
#SBATCH -e TestJob-%j.err
#SBATCH --time=30
module purge
module load impi intel/13.1.3
export MIC_NUM_PER_HOST=2
export OMP_NUM_THREADS=2
export MIC_OMP_NUM_THREADS=4
mpirun-mic -v -x 16 -c ./impi_native_hybrid -z 60 -m ./impi_native_hybrid.mic
---
";
# Refuse to run without any command line argument: print the usage text on
# stderr and stop with a failure status.
(( $# >= 1 )) || {
  echo "$USAGE" >&2
  exit 1
}
# get script arguments
#
# Parse the command line options of this wrapper.
#
# Short options are handled directly by getopts. The long options --tv and
# --tvcli are reached through the "-:" entry of the option string: getopts
# then reports "-" as the option and the rest of the word as its argument.
#
# Globals written: HOST_BINARY, MIC_BINARY, HOST_PPN, MIC_PPN,
#                  MPIRUN_MIC_VERBOSE, MPIEXEC_PREFIX
# Globals read:    USAGE
# Arguments:       the command line words to parse (normally "$@")
# Exits:           0 after printing the usage on stdout for -h;
#                  1 after printing the usage on stderr for an unknown short
#                  or long option, or for a missing option argument.
parse_options() {
  local OPTION
  local OPTIND=1 # make every call start scanning at the first word

  while getopts "vhc:m:x:z:-:" OPTION; do
    case "$OPTION" in
      c)
        HOST_BINARY=$OPTARG
        ;;
      h)
        echo "$USAGE"
        exit 0
        ;;
      m)
        MIC_BINARY=$OPTARG
        ;;
      v)
        MPIRUN_MIC_VERBOSE=1
        ;;
      x)
        HOST_PPN=$OPTARG
        ;;
      z)
        MIC_PPN=$OPTARG
        ;;
      -)
        case "$OPTARG" in
          tv)
            MPIEXEC_PREFIX="totalview -args"
            ;;
          tvcli)
            MPIEXEC_PREFIX="totalviewcli -args"
            ;;
          *)
            # Catch-all for unknown long options. A "\?" pattern here would
            # only match a literal "?" and let e.g. --foo pass unnoticed.
            echo "$USAGE" >&2
            exit 1
            ;;
        esac
        ;;
      \?)
        # getopts reports unknown short options and missing arguments as "?".
        echo "$USAGE" >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
### prepare the environment
# Outside of SLURM the job runs on the local system only (which still has to
# be a compute node): act as task 0 on the node reported by hostname.
: "${SLURM_PROCID:=0}"
: "${SLURM_NODELIST:=$(hostname)}"

# Fall back to the documented defaults for everything the user left unset
# or empty.
: "${MIC_PPN:=1}"
: "${HOST_PPN:=1}"
: "${MIC_NUM_PER_HOST:=2}"

# A non-empty OMP_NUM_THREADS marks the job as hybrid MPI+OpenMP. In that case
# the MIC thread count defaults to the host thread count.
if [[ -n "$OMP_NUM_THREADS" && -z "$MIC_OMP_NUM_THREADS" ]] ; then
  MIC_OMP_NUM_THREADS=$OMP_NUM_THREADS
fi
# At least one binary (for the hosts or for the MICs) has to be given;
# otherwise print the usage text on stderr and fail.
if [[ -z "${HOST_BINARY}${MIC_BINARY}" ]] ; then
  echo "$USAGE" >&2
  exit 1
fi
# create the command line
#MPI_EXEC=mpirun
MPI_EXEC=mpiexec.hydra
EXEC_ARGS=""

# Build the lists of host nodes and MIC nodes of this job.
#
# Globals read:    SLURM_NODELIST, MIC_NUM_PER_HOST
# Globals written: SLIST_HOSTS_WITH_MICS  hostlist expressions taken from the
#                                         sinfo lines that mention "mic"
#                  LLIST_HOSTS_WITH_MICS  the same nodes, one name per word
#                  HOST_NODELIST          every node of the job
#                  MIC_NODELIST           <node>-mic0 (and <node>-mic1 when
#                                         MIC_NUM_PER_HOST is 2) for each job
#                                         node found in LLIST_HOSTS_WITH_MICS
# All lists are space separated and start with a single space.
build_node_lists() {
  local hostlist host

  # create the list of the nodes that are configured to have MICs
  SLIST_HOSTS_WITH_MICS=$(sinfo -h -o "%N %G" | awk '/mic/ { print $1 }')

  # Expand every hostlist expression on its own and always quoted:
  #  - an expression such as node[1-4] must not be glob-expanded by the shell;
  #  - scontrol is never called without a hostlist when sinfo reported no
  #    MIC node at all.
  LLIST_HOSTS_WITH_MICS=""
  while IFS= read -r hostlist ; do
    [[ -n "$hostlist" ]] || continue
    while read -r host ; do
      if [[ -n "$host" ]] ; then
        LLIST_HOSTS_WITH_MICS="${LLIST_HOSTS_WITH_MICS} ${host}"
      fi
    # </dev/null keeps scontrol away from the outer loop's input.
    done < <(scontrol show hostname "$hostlist" </dev/null)
  done <<< "$SLIST_HOSTS_WITH_MICS"

  # create the lists of HOSTS AND MICS!
  HOST_NODELIST=""
  MIC_NODELIST=""
  while read -r host ; do
    [[ -n "$host" ]] || continue
    # Compare whole node names: a substring search would treat "node1" as a
    # MIC node as soon as "node10" is one.
    if [[ " ${LLIST_HOSTS_WITH_MICS} " == *" ${host} "* ]] ; then
      if [ "$MIC_NUM_PER_HOST" -eq 1 ] ; then
        MIC_NODELIST="${MIC_NODELIST} ${host}-mic0"
      elif [ "$MIC_NUM_PER_HOST" -eq 2 ] ; then
        MIC_NODELIST="${MIC_NODELIST} ${host}-mic0 ${host}-mic1"
      fi
    fi
    HOST_NODELIST="${HOST_NODELIST} ${host}"
  # The hostlist is passed quoted, and omitted entirely when it is empty.
  done < <(scontrol show hostname ${SLURM_NODELIST:+"$SLURM_NODELIST"})
}

build_node_lists
# create the arguments
#
# Append one mpiexec argument set (" : <flags> -n <ranks> -host <node>
# <binary>") per node to EXEC_ARGS and assemble the final command line.
#
# Globals read:    HOST_BINARY, HOST_NODELIST, HOST_PPN, MPIEXEC_FLAGS_HOST,
#                  MIC_BINARY, MIC_NODELIST, MIC_PPN, MPIEXEC_FLAGS_MIC,
#                  OMP_NUM_THREADS, MIC_OMP_NUM_THREADS,
#                  MIC_LD_LIBRARY_PATH, LD_LIBRARY_PATH,
#                  MPI_EXEC, MPIEXEC_PREFIX
# Globals written: EXEC_ARGS (appended to), RUNCMD
build_run_command() {
  local node
  local host_omp_env=""
  local mic_omp_env=""

  # A thread count is only forwarded when one is set (hybrid MPI+OpenMP job).
  if [[ -n "$OMP_NUM_THREADS" ]] ; then
    host_omp_env="-env OMP_NUM_THREADS $OMP_NUM_THREADS "
  fi
  if [[ -n "$MIC_OMP_NUM_THREADS" ]] ; then
    mic_omp_env="-env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS "
  fi

  # args for hosts here
  # run job on hosts if host binary is not null
  if [[ -n "$HOST_BINARY" ]] ; then
    # HOST_NODELIST is a space separated list and is split on purpose.
    for node in $HOST_NODELIST ; do
      EXEC_ARGS="${EXEC_ARGS} : ${host_omp_env}${MPIEXEC_FLAGS_HOST} -n $HOST_PPN -host $node $HOST_BINARY"
    done
  fi

  # args for mics here
  # run job on mics if mic binary is not null and MIC_NUM_PER_HOST is 1 or 2
  # (MIC_NODELIST is empty for any other MIC_NUM_PER_HOST value). Without the
  # MIC_BINARY check a host-only run would get MIC sections with no executable.
  if [[ -n "$MIC_BINARY" ]] ; then
    for node in $MIC_NODELIST ; do
      # Every MIC section also gets LD_LIBRARY_PATH, with MIC_LD_LIBRARY_PATH
      # placed in front of the current value.
      EXEC_ARGS="${EXEC_ARGS} : ${mic_omp_env}-env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $node $MIC_BINARY"
    done
  fi

  RUNCMD="$MPI_EXEC $EXEC_ARGS"
  # Optionally wrap mpiexec with another tool (e.g. totalview).
  if [[ -n "$MPIEXEC_PREFIX" ]] ; then
    RUNCMD="$MPIEXEC_PREFIX $RUNCMD"
  fi
}

build_run_command
# Additional environment for the launch (values depend on the local system).
#export LD_LIBRARY_PATH="$MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
I_MPI_MIC=1
I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1
export I_MPI_MIC I_MPI_DAPL_PROVIDER_LIST
# These two must not be passed on to the command started below.
unset I_MPI_DEVICE I_MPI_PMI_LIBRARY
# start the job
# Only the process whose SLURM_PROCID equals 0 runs the command. When
# MPIRUN_MIC_VERBOSE is set, a summary of the settings and the full command
# line is printed first.
if [ $SLURM_PROCID -eq 0 ] ; then
  if [ -n "$MPIRUN_MIC_VERBOSE" ] ; then
    printf '%s\n' \
      "" \
      "########################################################################" \
      "MPI Tasks per host: $HOST_PPN" \
      "Threads per host MPI task: $OMP_NUM_THREADS" \
      "Binary for the hosts: $HOST_BINARY" \
      "MPI Tasks per MIC: $MIC_PPN" \
      "Threads per MIC MPI task: $MIC_OMP_NUM_THREADS" \
      "Binary for the mics: $MIC_BINARY" \
      "MIC_NUM_PER_HOST: $MIC_NUM_PER_HOST" \
      "" \
      "MPIEXEC_PREFIX: $MPIEXEC_PREFIX" \
      "MPIEXEC_FLAGS_HOST: $MPIEXEC_FLAGS_HOST" \
      "MPIEXEC_FLAGS_MIC: $MPIEXEC_FLAGS_MIC" \
      "" \
      "Run command: " \
      "$RUNCMD" \
      "########################################################################" \
      ""
  fi
  # Unquoted on purpose: RUNCMD is one flat string that has to be split into
  # the command and its arguments.
  $RUNCMD
fi