# blob: ae4aedbd6e4d1e563f3a5ea8e3ccbbf3cba08b73 [file] [log] [blame] [edit]
#!/bin/bash
#
# Generic release agent for SLURM cgroup usage
#
# Manage cgroup hierarchy like :
#
# /cgroup/subsystem/uid_%/job_%/step_%/task_%
#
# Automatically sync uid_% cgroups to be coherent
# with the remaining job children when one of them is removed
# by a call to this release agent.
# The synchronisation is made in a flock on the root cgroup
# to ensure coherency of the cgroups contents.
#
# This script is installed once per subsystem as release_<subsystem>
# (e.g. release_cpuset); derive the subsystem name from the part of
# the program name after the last underscore.
progname=$(basename "$0")
subsystem=${progname##*_}

# Print the mount point of this subsystem's cgroup hierarchy on stdout.
# Uses lssubsys (libcgroup) to query the real mount point when it is
# installed; otherwise falls back to the conventional /cgroup/<subsystem>
# location assumed by the rest of this script.
# Globals:  subsystem (read)
# Outputs:  mount directory path on stdout
get_mount_dir()
{
	# Split declaration from assignment so the command's status
	# is not masked by 'local' (SC2155).
	local lssubsys
	lssubsys=$(type -p lssubsys)
	if [[ -n ${lssubsys} ]]; then
		"$lssubsys" -m "$subsystem" | awk '{print $2}'
	else
		echo "/cgroup/$subsystem"
	fi
}
# Resolve the subsystem mount point once; every cgroup argument below is
# interpreted relative to this directory.
mountdir=$(get_mount_dir)

# At least one argument is required: the kernel passes the released
# cgroup path, and the internal re-invocation passes "sync <cgroup>".
# Reuse the already computed progname instead of re-running basename,
# and send the usage error to stderr.
if [[ $# -eq 0 ]]
then
	echo "Usage: ${progname} [sync] cgroup" >&2
	exit 1
fi
# build orphan cg path
# The cgroup to remove is given relative to the mount point: it is $1
# for the direct kernel invocation and $2 for the "sync" subcall.
if [[ $# -eq 1 ]]
then
rmcg=${mountdir}$1
else
rmcg=${mountdir}$2
fi
# Strip the trailing "/uid_*" component to obtain the slurm root cgroup
# (e.g. .../slurm from .../slurm/uid_100/job_12/step_0).
slurmcg=${rmcg%/uid_*}
if [[ ${slurmcg} == ${rmcg} ]]
then
# no "/uid_" component was stripped, so this is not a slurm job
# pattern — perhaps the slurm root cgroup itself. Just remove the
# dir while holding an exclusive lock on the mount point (the same
# lock used by PAM/SLURM when manipulating the hierarchy) and exit
# with rmdir's status.
flock -x ${mountdir} -c "rmdir ${rmcg}"
exit $?
fi
# Cgroup that collects processes left behind when their user cgroup
# loses all of its cpus (see the sync branch below).
orphancg=${slurmcg}/orphan
# make sure orphan cgroup is existing
# NOTE(review): mkdir/cat failures here are silently ignored — presumably
# acceptable for a best-effort release agent, but worth confirming.
if [[ ! -d ${orphancg} ]]
then
mkdir ${orphancg}
case ${subsystem} in
cpuset)
# A freshly created cpuset cgroup has empty cpus/mems and cannot
# host any task; inherit both sets from the subsystem root so
# orphaned tasks can be moved into it.
cat ${mountdir}/cpuset.cpus > ${orphancg}/cpuset.cpus
cat ${mountdir}/cpuset.mems > ${orphancg}/cpuset.mems
;;
*)
;;
esac
fi
# kernel call: the kernel's release_agent invokes us with a single
# argument (the released cgroup, relative to the mount point). We then
# re-invoke ourselves as "$0 sync <cgroup>" under an exclusive flock on
# the mount dir so the removal + resync below runs atomically.
if [[ $# -eq 1 ]]
then
rmcg=${mountdir}$@
# try to extract the uid cgroup from the input one
# ( extract /uid_% from /uid_%/job_*... )
uidcg=${rmcg%/job_*}
if [[ ${uidcg} == ${rmcg} ]]
then
# no "/job_" component: not a slurm job pattern, perhaps the uid
# cgroup itself — just remove the dir under the lock and exit
flock -x ${mountdir} -c "rmdir ${rmcg}"
exit $?
fi
if [[ -d ${mountdir} ]]
then
flock -x ${mountdir} -c "$0 sync $@"
fi
# Propagates the flock/subcall status; if the mount dir vanished,
# this exits with the failed [[ -d ]] test's status instead.
exit $?
# sync subcall (called using flock by the kernel hook to be sure
# that no one is manipulating the hierarchy, e.g. PAM, SLURM, ...)
elif [[ $# -eq 2 ]] && [[ $1 == "sync" ]]
then
shift
rmcg=${mountdir}$@
uidcg=${rmcg%/job_*}
# remove this cgroup
if [[ -d ${rmcg} ]]
then
case ${subsystem} in
memory)
# help to correctly remove lazy cleaning memcg
# but still not perfect
sleep 1
;;
*)
;;
esac
# NOTE(review): rmdir failure is ignored; the sync below still runs.
rmdir ${rmcg}
fi
if [[ ${uidcg} == ${rmcg} ]]
then
## not a slurm job pattern: exit now, do not sync
exit 0
fi
# sync the user cgroup based on targeted subsystem
# and the remaining jobs still present under it
if [[ -d ${uidcg} ]]
then
case ${subsystem} in
cpuset)
# Concatenate the cpus of every remaining job cgroup of this user
# (one range expression per job, e.g. "0-3" "5,7").
cpus=$(cat ${uidcg}/job_*/cpuset.cpus 2>/dev/null)
if [[ -n ${cpus} ]]
then
# Merge the range expressions by abusing scontrol's hostlist
# expansion: "0-3,5" expands to one id per line, which is then
# re-joined with commas. NOTE(review): assumes cpuset range
# syntax is always a valid scontrol hostlist — confirm.
cpus=$(scontrol show hostnames $(echo ${cpus} | tr ' ' ','))
cpus=$(echo ${cpus} | tr ' ' ',')
echo ${cpus} > ${uidcg}/cpuset.cpus
else
# No jobs remain: first move the remaining processes to
# a cgroup reserved for orphaned processes (a cpuset cgroup
# with empty cpus cannot contain tasks)
for t in $(cat ${uidcg}/tasks)
do
echo $t > ${orphancg}/tasks
done
# then remove the remaining cpus from the cgroup
echo "" > ${uidcg}/cpuset.cpus
fi
;;
*)
;;
esac
fi
# error: any other argument combination is a usage error
else
echo "Usage: $(basename $0) [sync] cgroup"
exit 1
fi
exit 0