% blob: 5b03a84b80cdee45f776842dd09736a27a7bd153 [file] [log] [blame] [edit]
%
% $Id$
%
% slides documenting job startup design
% not too detailed, but some basic info...
\documentclass[article,letter,landscape]{seminar}
\usepackage{pstcol}
\usepackage{slidesec}
\usepackage{semcolor}
\input{seminar.bug}
\input{seminar.bg2}
\usepackage{graphics}
\usepackage{fancybox}
\usepackage{fancyhdr}
\usepackage{epsfig}
\usepackage{epsf}
\pagestyle{empty}
\twoup[1]
% Typewriter-face shorthands for the SLURM daemons and user commands.
% Each expands to the program name set in the upright, medium-weight
% typewriter font; the inner braces keep the font switch local.
% In running text follow a use with `\ ' (e.g. "\srun\ will ...") so that
% the word space after the control word is not swallowed.
\newcommand{\slurmd}{{\normalfont\ttfamily slurmd}}       % compute-node daemon
\newcommand{\slurmctld}{{\normalfont\ttfamily slurmctld}} % controller daemon
\newcommand{\srun}{{\normalfont\ttfamily srun}}           % job launch command
\newcommand{\sq}{{\normalfont\ttfamily squeue}}           % queue listing command
% Label for `entry' items: the term in typewriter type followed by a colon,
% kept unbreakable by the \mbox and pushed to the left of the label box by
% the trailing \hfil.
\newcommand{\entrylabel}[1]{\mbox{\normalfont\ttfamily #1:}\hfil}
% entry: a description-style list built on the generic `list' environment.
%   - labels are formatted by \entrylabel
%   - the label column is 50pt wide
%   - item text is indented by the label width plus \labelsep
\newenvironment{entry}{%
  \begin{list}{}{%
    \renewcommand{\makelabel}{\entrylabel}%
    \setlength{\labelwidth}{50pt}%
    \setlength{\leftmargin}{\labelwidth}%
    \addtolength{\leftmargin}{\labelsep}%
  }%
}{%
  \end{list}%
}
% Scratch length: natural width of the label currently being typeset.
\newlength{\Mylen}
% Label for `Lentry' items: the term in typewriter type followed by a colon.
% A term no wider than \labelwidth is set in place.  A wider term is put in
% a bottom-aligned, \labelwidth-wide \parbox; the zero-width \makebox lets
% the term overhang to the right, and the trailing \\ ends the term's line
% inside the box so that the term sits above the first line of item text.
% The width test uses the TeX primitive \ifdim instead of
% \ifthenelse/\lengthtest: those two commands come from the `ifthen'
% package, which this file's preamble never loads.
\newcommand{\Lentrylabel}[1]{%
  \settowidth{\Mylen}{{\normalfont\ttfamily #1:}}%
  \ifdim\Mylen>\labelwidth
    \parbox[b]{\labelwidth}{\makebox[0pt][l]{{\normalfont\ttfamily #1:}}\\}%
  \else
    {\normalfont\ttfamily #1:}%
  \fi
  \hfil\relax}
% Lentry: same layout as `entry', but labels are formatted by \Lentrylabel.
% The redefinition of \entrylabel is local to the environment's group.
\newenvironment{Lentry}
{\renewcommand{\entrylabel}{\Lentrylabel}%
\begin{entry}}
{\end{entry}}
% Headers and footers personalization using the `fancyhdr' package.
% These settings define the `fancy' page style; the document body selects it
% for slides with \slidepagestyle{fancy}.
\fancyhf{} % Clear all fields
% Thickness of the rules drawn under the header and above the footer.
\renewcommand{\headrulewidth}{0.2mm}
\renewcommand{\footrulewidth}{0.2mm}
% Centered header: the presentation title.
\fancyhead[C]{\Large\textbf{SLURM Design - Job Initiation}}
% Left and right footers: a scaled logo with a tiny caption on the line
% below it.  penguin.eps and llnl.ps are external graphics files that must
% be found by LaTeX when the slides are built.
\fancyfoot[L]{\includegraphics[scale=0.075]{penguin.eps}\\\tiny LINUX}
\fancyfoot[R]{\includegraphics[scale=0.2]{llnl.ps}\\\tiny LLNL}
% Centered footer: the current slide number.
\fancyfoot[C]{\tiny Page \theslide}
% Create room for headers and footers
\renewcommand{\slidetopmargin}{2cm}
\renewcommand{\slidebottommargin}{3cm}
% Center horizontally the headers and footers (see seminar.bug)
\renewcommand{\headwidth}{\textwidth}
% To adjust the frame length to the header and footer ones
\autoslidemarginstrue
% Hook to tell dvips to print in landscape mode.
% \def (not \newcommand) is used so the hook is set whether or not it
% already has a definition.
\def\printlandscape{\special{landscape}}
% Slide frame style.  Possibilities: shadow, oval, double, none, plain
\slideframe{none}
% Named RGB colors (components in the range 0..1).
% NOTE(review): none of these names is referenced by the slides in this file.
\definecolor{Blue}{rgb}{0.,0.,1.}
\definecolor{Gold}{rgb}{1.,0.84,0.}
\definecolor{Pink}{rgb}{1.,0.75,0.8}
\begin{document}
\slidepagestyle{fancy}
% The presentation begins here.
\begin{slide}
\slideheading{Design Constraints}
\mbox{}\\
\begin{center}
Overall Design Goals\\
\begin{itemize}
\item Scalable
\item Fault Tolerant
\item Simplified
\item Administrator Friendly
\item Secure
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Design Constraints}
\mbox{}\\
\begin{center}
Job Initiation Design Goals\\
\begin{itemize}
\item Run jobs in several different ``modes'':
\begin{itemize}
\item interactive
\item batch
\item allocate only (then run under allocation)
\end{itemize}
\item Support multiple runs under a single allocation
\item Ability to attach and detach from running jobs
\begin{itemize}
\item for job control and fault tolerance
\end{itemize}
\item Specification of partial or full nodelists
\item Support for prolog/epilog
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Security}
\begin{center}
\begin{itemize}
\item Link Encryption
\begin{itemize}
\item mongo encrypts all communication with a cluster-wide
shared key.
\item access to shared key will be group-limited.
\end{itemize}
\item Job Credential
\begin{itemize}
\item \slurmctld\ will have private keys for every \slurmd\
and will hash Job Credential against this private key.
\item \slurmd 's will verify \srun\ connections using these
job credentials.
\end{itemize}
\item \slurmd\ and \slurmctld\ will utilize PAM for configurable authentication
over the encrypted link.
\begin{itemize}
\item \slurmd\ will fall through to PAM if credential is null.
\end{itemize}
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{\slurmd\ Initialization}
\begin{center}
\begin{itemize}
\item On startup, \slurmd\ will generate a private key and
attempt to contact slurm controller to join the cluster.
\item If slurm controller is not up at time of \slurmd\ initialization,
\slurmd\ will sleep and listen on a well-defined mongo-port for status
requests from controller, or other entity.
\item On first contact with controller, \slurmd\ will exchange secret key
by setting flag {\tt REFRESH\_KEY} in the corresponding message.
\item After private key has been exchanged, \slurmd\ will wait for
further work
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{ Interactive Job Initiation }
\begin{center}
\begin{itemize}
\item For an interactive job, \srun\ will build an allocate request
to send to the controller, based upon user options.
\item the {\tt MSG\_ALLOCATE\_REQ} message will contain the
following information:
\begin{itemize}
\item immediate or block (flag)
\item partition
\item user info (username and/or uid)
\item nprocs, nnodes, ncpus\_per\_task
\item task distribution flag (block$|$cyclic)
\end{itemize}
optional:
\begin{itemize}
\item requested time limit
\item constraint and requested feature list
\end{itemize}
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Interactive Job Initiation, cont'd ...}
\begin{center}
\begin{itemize}
\item \srun\ initializes mongo with shared mongo-key and
sends allocate request to controller.
\item controller replies with an allocate reply message:
\item message type MSG\_ALLOCATE\_REPLY contains:
\begin{itemize}
\item return and error code
\item node list
\item credential list (1 per node)
\item cpus per node
\end{itemize}
\item if return code indicates an error, \srun\ will print
text representation of error code on stderr and exit
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Interactive Job Initiation, cont'd ...}
\begin{center}
\begin{itemize}
\item If stdout/err is to be copied to user's terminal, \srun\ initializes
two mongo-ports for every task and begins listening on these ports
\item \srun\ similarly initializes mongo-ports for stdin.
\item \srun\ formulates the list of nodes, credentials, and task
assignments into a MSG\_JOB\_RUN\_REQ message.
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Interactive Job Initiation: job\_run\_request}
\begin{center}
\begin{itemize}
\item MSG\_JOB\_RUN\_REQ contents:
\begin{itemize}
\item job credential
\item jobid
\item command line
\item user info (username and/or uid)
\item ntasks (number of tasks to initiate on this node)
\item stdin, stdout, stderr locations (mongo-ports or files)
(per task)
\end{itemize}
Optional contents (prefixed by id):
\begin{itemize}
\item cwd
\item environment
\item stop time
\item signal handling
\end{itemize}
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Interactive Job Initiation: cont'd ... }
\begin{center}
\begin{itemize}
\item One job run request message is built per node - \srun\
sends these messages to all nodes using a {\tt mongo\_sendto:}\\
\begin{verbatim} md = mongo_sendto <host>:<mongo_port>, &addr
\end{verbatim}
\item \srun\ then waits for reply messages from all \slurmd 's
with {\tt mongo\_recvfrom} or {\tt mongo\_select}
\item After a timeout, \srun\ sends status update to \slurmctld\
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Interactive Job Initiation: \slurmd}
\begin{center}
\begin{itemize}
\item \slurmd\ receives a message from \srun\ and spawns a
request thread ({\tt req\_thr})
\item {\tt req\_thr} reads message header, sees that the message is
of type MSG\_JOB\_RUN\_REQ and executes the {\tt job\_thr()}
function.
\item {\tt job\_thr} unpacks message body, then:
\begin{itemize}
\item if credential is non-null, decrypt credential to authorize job
\item null credential: fall through to PAM
\item if job and user are authorized, instantiate a task thread for
each task requested and send MSG\_JOB\_RUN\_REPLY.
\end{itemize}
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Interactive Job Initiation: \slurmd }
\begin{center}
\begin{itemize}
\item {\tt task\_thr} examines std\{in,err,out\} locations for
this task and opens appropriately
\begin{itemize}
\item {\tt /dev/null}
\item host:mongo port
\item filename
\end{itemize}
\item {\tt task\_thr} dups stdout/err/in fds as appropriate.
\item Each {\tt task\_thr} then does fork(), setuid(), chdir()
(if req'd), setup environment, and exec() of the user's executable.
\item {\tt task\_thr} will then wait() for its child.
\item {\tt job\_thr} continues to listen for job control information
over original connection (signals, cancel, etc.)
\item {\tt job\_thr} on node 0 of job will monitor connection to
\srun\ and will initiate a new \srun\ on local node if
necessary.
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Interactive Job Initiation, cont'd ...}
\begin{center}
\begin{itemize}
\item \srun\ remains attached to user's terminal, copying stderr
and stdout (optionally prepending task no) and forwarding signals
\item \srun\ transparently forwards signals generated on the
user's terminal to remote tasks (MSG\_SIGNAL)
\item \srun\ also receives status information on original channel:
\begin{itemize}
\item MSG\_TASK\_EXITED
\end{itemize}
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Batch Job Initiation}
\begin{center}
\begin{itemize}
\item a batch submission consists of a script that will be invoked
on only one node in the allocation
\item \srun\ sends MSG\_BATCH\_SUBMIT\_REQ to \slurmctld , which contains
\begin{itemize}
\item user info
\item nnodes, nprocs, cpus\_per\_task
\item script path, environment, cwd
\end{itemize}
Optional parameters:
\begin{itemize}
\item requested timelimit (default set by config)
\item filenames for stdout/err (default jobid.o,e)
\item stdin location (default {\tt /dev/null})
\item signal handling
\end{itemize}
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Batch Job Initiation, cont'd ...}
\begin{center}
\begin{itemize}
\item \slurmctld\ sends MSG\_JOB\_RUN\_REQ to first node in job
when it has allocated nodes
\item Job run request contents:
\begin{itemize}
\item credential, jobid, userinfo
\item stdin/out/err locations (always files)
\item ntasks $=1$
\item command line $=$ ``{\tt /path/to/script args}''
\item stop time
\end{itemize}
\item \slurmd\ handles job run request as previously explained
\item \slurmctld\ will keep connection open for this job, waiting
for task exit message, at which point job is complete.
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Batch Job Initiation, cont'd ...}
\begin{center}
\begin{itemize}
\item Presence of SLURM\_JOBID env var indicates to \srun\ that it
is running under an allocation (has access to resources).
\item SLURM\_HOSTLIST will also be set to the list of nodes allocated
\item \srun\ must query \slurmctld , however, for two reasons:
\begin{itemize}
\item \srun\ doesn't know number of cpus for each node
\item safer for \srun\ to obtain switch resources from controller
\end{itemize}
\item Therefore, \srun\ will first query \slurmctld\ for details of
its allocation
\item \srun\ will then process command line options relative to this
allocation. If a subset of nodes is specified, \srun\ will need
to requery the controller for switch resources. (We may be
able to get away without doing this 2\textsuperscript{nd} query)
\end{itemize}
\end{center}
\end{slide}
\begin{slide}
\slideheading{Allocate Mode}
\begin{center}
\begin{itemize}
\item Allocates a set of resources to the
user then spawns a shell with access to those resources.
(SLURM\_JOBID and SLURM\_HOSTLIST are set)
\item initiating \srun\ will need to {\tt wait()} for exit
of the shell to notify \slurmctld\ that job is complete.
\item subsequent invocations of \srun\ behave as in the batch run
case
\end{itemize}
\end{center}
\end{slide}
%\begin{slide}
% \slideheading{Attaching to a Job}
% \begin{center}
% \begin{itemize}
% \item User specifies job name or jobid on command line
% \item \srun\ connects to \slurmctld\ and sends a MSG\_JOB\_ATTACH\_REQ:
% \begin{itemize}
% \item job id
% \item job name
% \end{itemize}
% \item MSG\_JOB\_ATTACH\_REPLY contains similar information as
% MSG\_ALLOCATE\_REPLY
% \item \srun\ now has credentials to contact \slurmd 's in job
% \end{itemize}
% \end{center}
%\end{slide}
%\begin{slide}
% \slideheading{}
% \begin{center}
% \begin{itemize}
% \end{itemize}
% \end{center}
%\end{slide}
\end{document}