| % |
| % $Id$ |
| % |
| |
| % slides documenting job startup design |
| % not too detailed, but some basic info... |
| |
| \documentclass[article,letter,landscape]{seminar} |
| |
| |
| \usepackage{pstcol} |
| \usepackage{slidesec} |
| \usepackage{semcolor} |
| |
| \input{seminar.bug} |
| \input{seminar.bg2} |
| |
| \usepackage{graphics} |
| \usepackage{fancybox} |
| \usepackage{fancyhdr} |
| \usepackage{epsfig} |
| \usepackage{epsf} |
| |
| % Use the empty page style for the document's pages; the slides get their |
| % own style from \slidepagestyle after \begin{document}. |
| \pagestyle{empty} |
| % seminar's two-up layout. NOTE(review): the optional argument [1] is |
| % presumably the magnification step -- confirm against the seminar guide. |
| \twoup[1] |
| |
| % Shorthands that typeset the SLURM program names in typewriter type. |
| % \texttt replaces the obsolete {\tt ...} font switch; the starred |
| % \newcommand* is used because these short macros never contain \par. |
| \newcommand*{\slurmd}{\texttt{slurmd}} |
| \newcommand*{\slurmctld}{\texttt{slurmctld}} |
| \newcommand*{\srun}{\texttt{srun}} |
| \newcommand*{\sq}{\texttt{squeue}} |
| |
| % Label formatter for the `entry' list: the term is set in typewriter |
| % type with a trailing colon, kept on one line by \mbox; the \hfil |
| % pushes the label to the left edge of the label box. |
| \newcommand{\entrylabel}[1]{\mbox{\texttt{#1:}}\hfil} |
| % Description-style list whose labels are formatted by \entrylabel. |
| % Labels get a 50pt box, and the item text is indented by |
| % \labelwidth + \labelsep so that it starts right after the label. |
| \newenvironment{entry}% |
|   {\begin{list}{}{% |
|      \renewcommand{\makelabel}{\entrylabel}% |
|      \setlength{\labelwidth}{50pt}% |
|      \setlength{\leftmargin}{\labelwidth}% |
|      \addtolength{\leftmargin}{\labelsep}% |
|    }}% |
|   {\end{list}} |
| |
| % Scratch length used by \Lentrylabel to measure the width of a label. |
| \newlength{\Mylen} |
| % Label formatter for the `Lentry' list. The label (term plus colon, in |
| % typewriter type) is measured first: |
| %   - wider than \labelwidth: it is set in a zero-width box on a line of |
| %     its own, so the item text starts on the following line; |
| %   - otherwise: it is set inline, as in the plain `entry' list. |
| % The comparison uses the primitive \ifdim instead of \ifthenelse and |
| % \lengthtest, so this macro does not need the ifthen package (which the |
| % preamble of this file does not load). |
| \newcommand{\Lentrylabel}[1]{% |
|   \settowidth{\Mylen}{\texttt{#1:}}% |
|   \ifdim\Mylen>\labelwidth |
|     \parbox[b]{\labelwidth}{\makebox[0pt][l]{\texttt{#1:}}\\}% |
|   \else |
|     \texttt{#1:}% |
|   \fi |
|   \hfil\relax} |
| % Variant of `entry' that tolerates labels longer than \labelwidth: it |
| % swaps in \Lentrylabel as the label formatter and then defers to `entry'. |
| \newenvironment{Lentry}% |
|   {\renewcommand{\entrylabel}{\Lentrylabel}% |
|    \begin{entry}}% |
|   {\end{entry}} |
| |
| |
| |
| % Headers and footers personalization using the `fancyhdr' package |
| \fancyhf{} % Clear all header and footer fields before setting our own |
| % Rules of 0.2mm under the header and above the footer. |
| \renewcommand{\headrulewidth}{0.2mm} |
| \renewcommand{\footrulewidth}{0.2mm} |
| % Centered header: the presentation title, shown on every slide. |
| \fancyhead[C]{\Large\textbf{SLURM Design - Job Initiation}} |
| % Left and right footers: a scaled logo with a tiny caption below it. |
| \fancyfoot[L]{\includegraphics[scale=0.075]{penguin.eps}\\\tiny LINUX} |
| \fancyfoot[R]{\includegraphics[scale=0.2]{llnl.ps}\\\tiny LLNL} |
| % Centered footer: the current slide number. |
| \fancyfoot[C]{\tiny Page \theslide} |
| |
| |
| % Make room above and below the slide body for the header and footer |
| \renewcommand{\slidetopmargin}{2cm} |
| \renewcommand{\slidebottommargin}{3cm} |
| |
| % Center the headers and footers horizontally (see seminar.bug) |
| \renewcommand{\headwidth}{\textwidth} |
| |
| % Adjust the frame length to match the header and footer |
| \autoslidemarginstrue |
| |
| % Hook to tell dvips to print in landscape mode |
| \def\printlandscape{\special{landscape}} |
| |
| % Slide frame style. Possibilities: shadow, oval, double, none, plain |
| \slideframe{none} |
| |
| % Named RGB colors. None of them is referenced in the slides of this |
| % file; NOTE(review): confirm nothing else relies on them before removing. |
| \definecolor{Blue}{rgb}{0.,0.,1.} |
| \definecolor{Gold}{rgb}{1.,0.84,0.} |
| \definecolor{Pink}{rgb}{1.,0.75,0.8} |
| \begin{document} |
| |
| \slidepagestyle{fancy} |
| |
| % The presentation begins here. |
| |
| \begin{slide} |
| \slideheading{Design Constraints} |
| \mbox{}\\ |
| \begin{center} |
| Overall Design Goals\\ |
| \begin{itemize} |
| \item Scalable |
| \item Fault Tolerant |
| \item Simplified |
| \item Administrator Friendly |
| \item Secure |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Design Constraints} |
| \mbox{}\\ |
| \begin{center} |
| Job Initiation Design Goals\\ |
| \begin{itemize} |
| \item Run jobs in several different ``modes'': |
| \begin{itemize} |
| \item interactive |
| \item batch |
| \item allocate only (then run under allocation) |
| \end{itemize} |
| \item Support multiple runs under a single allocation |
| \item Ability to attach and detach from running jobs |
| \begin{itemize} |
| \item for job control and fault tolerance |
| \end{itemize} |
| \item Specification of partial or full nodelists |
| \item Support for prolog/epilog |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Security} |
| \begin{center} |
| \begin{itemize} |
| \item Link Encryption |
| \begin{itemize} |
| \item mongo encrypts all communication with a cluster-wide |
| shared key. |
| \item access to shared key will be group-limited. |
| \end{itemize} |
| \item Job Credential |
| \begin{itemize} |
| \item \slurmctld\ will have private keys for every \slurmd\ |
| and will hash Job Credential against this private key. |
| \item \slurmd 's will verify \srun\ connections using these |
| job credentials. |
| \end{itemize} |
| \item \slurmd\ and \slurmctld\ will utilize PAM for configurable authentication |
| over the encrypted link. |
| \begin{itemize} |
| \item \slurmd\ will fall through to PAM if credential is null. |
| \end{itemize} |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| |
| \begin{slide} |
| \slideheading{\slurmd\ Initialization} |
| \begin{center} |
| \begin{itemize} |
| \item On startup, \slurmd\ will generate a private key and |
| attempt to contact slurm controller to join the cluster. |
| \item If slurm controller is not up at time of \slurmd\ initialization, |
| \slurmd\ will sleep and listen on a well-defined mongo-port for status |
| requests from controller, or other entity. |
| \item On first contact with controller, \slurmd\ will exchange secret key |
| by setting flag {\tt REFRESH\_KEY} in the corresponding message. |
| \item After private key has been exchanged, \slurmd\ will wait for |
| further work |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{ Interactive Job Initiation } |
| \begin{center} |
| \begin{itemize} |
| \item For an interactive job, \srun\ will build an allocate request |
| to send to the controller, based upon user options. |
| \item the {\tt MSG\_ALLOCATE\_REQ} message will contain the |
| following information: |
| \begin{itemize} |
| \item immediate or block (flag) |
| \item partition |
| \item user info (username and/or uid) |
| \item nprocs, nnodes, ncpus\_per\_task |
| \item task distribution flag (block$|$cyclic) |
| \end{itemize} |
| optional: |
| \begin{itemize} |
| \item requested time limit |
| \item constraint and requested feature list |
| \end{itemize} |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Interactive Job Initiation, cont'd ...} |
| \begin{center} |
| \begin{itemize} |
| \item \srun\ initializes mongo with shared mongo-key and |
| sends allocate request to controller. |
| \item controller replies with an allocate reply message: |
| \item message type MSG\_ALLOCATE\_REPLY contains: |
| \begin{itemize} |
| \item return and error code |
| \item node list |
| \item credential list (1 per node) |
| \item cpus per node |
| \end{itemize} |
| \item if return code indicates an error, \srun\ will print |
| text representation of error code on stderr and exit |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Interactive Job Initiation, cont'd ...} |
| \begin{center} |
| \begin{itemize} |
| \item If stdout/err is to be copied to user's terminal, \srun\ initializes |
| two mongo-ports for every task and begins listening on these ports |
| \item \srun\ similarly initializes mongo-ports for stdin. |
| \item \srun\ formulates the list of nodes, credentials, and task |
| assignments into a MSG\_JOB\_RUN\_REQ message. |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Interactive Job Initiation: job\_run\_request} |
| \begin{center} |
| \begin{itemize} |
| \item MSG\_JOB\_RUN\_REQ contents: |
| \begin{itemize} |
| \item job credential |
| \item jobid |
| \item command line |
| \item user info (username and/or uid) |
| \item ntasks (number of tasks to initiate on this node) |
| \item stdin, stdout, stderr locations (mongo-ports or files) |
| (per task) |
| \end{itemize} |
| Optional contents (prefixed by id): |
| \begin{itemize} |
| \item cwd |
| \item environment |
| \item stop time |
| \item signal handling |
| \end{itemize} |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| |
| \begin{slide} |
| \slideheading{Interactive Job Initiation: cont'd ... } |
| \begin{center} |
| \begin{itemize} |
| \item One job run request message is built per node - \srun\ |
| sends these messages to all nodes using a {\tt mongo\_sendto:}\\ |
| \begin{verbatim} md = mongo_sendto <host>:<mongo_port>, &addr |
| \end{verbatim} |
| \item \srun\ then waits for reply messages from all \slurmd 's |
| with {\tt mongo\_recvfrom} or {\tt mongo\_select} |
| \item After a timeout, \srun\ sends status update to \slurmctld\ |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Interactive Job Initiation: \slurmd} |
| \begin{center} |
| \begin{itemize} |
| \item \slurmd\ receives a message from \srun\ and spawns a |
| request thread ({\tt req\_thr}) |
| \item {\tt req\_thr} reads message header, sees that the message is |
| of type MSG\_JOB\_RUN\_REQ and executes the {\tt job\_thr()} |
| function. |
| \item {\tt job\_thr} unpacks message body, then: |
| \begin{itemize} |
| \item if credential is non-null, decrypt credential to authorize job |
| \item null credential: fall through to PAM |
| \item if job and user are authorized, instantiate a task thread for |
| each task requested and send MSG\_JOB\_RUN\_REPLY. |
| \end{itemize} |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Interactive Job Initiation: \slurmd } |
| \begin{center} |
| \begin{itemize} |
| \item {\tt task\_thr} examines std\{in,err,out\} locations for |
| this task and opens appropriately |
| \begin{itemize} |
| \item {\tt /dev/null} |
| \item host:mongo port |
| \item filename |
| \end{itemize} |
| \item {\tt task\_thr} dups stdout/err/in fds as appropriate. |
| \item Each {\tt task\_thr} then does fork(), setuid(), chdir() |
| (if req'd), setup environment, and exec() of the user's executable. |
| \item {\tt task\_thr} will then wait() for its child. |
| \item {\tt job\_thr} continues to listen for job control information |
| over original connection (signals, cancel, etc.) |
| \item {\tt job\_thr} on node 0 of job will monitor connection to |
| \srun\ and will initiate a new \srun\ on local node if |
| necessary. |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Interactive Job Initiation, cont'd ...} |
| \begin{center} |
| \begin{itemize} |
| \item \srun\ remains attached to user's terminal, copying stderr |
| and stdout (optionally prepending task no) and forwarding signals |
| \item \srun\ transparently forwards signals generated on the |
| user's terminal to remote tasks (MSG\_SIGNAL) |
| \item \srun\ also receives status information on original channel: |
| \begin{itemize} |
| \item MSG\_TASK\_EXITED |
| \end{itemize} |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Batch Job Initiation} |
| \begin{center} |
| \begin{itemize} |
| \item a batch submission consists of a script that will be invoked |
| on only one node in the allocation |
| \item \srun\ sends MSG\_BATCH\_SUBMIT\_REQ to \slurmctld , which contains |
| \begin{itemize} |
| \item user info |
| \item nnodes, nprocs, cpus\_per\_task |
| \item script path, environment, cwd |
| \end{itemize} |
| Optional parameters: |
| \begin{itemize} |
| \item requested timelimit (default set by config) |
| \item filenames for stdout/err (default jobid.o,e) |
| \item stdin location (default {\tt /dev/null}) |
| \item signal handling |
| \end{itemize} |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Batch Job Initiation, cont'd ...} |
| \begin{center} |
| \begin{itemize} |
| \item \slurmctld\ sends MSG\_JOB\_RUN\_REQ to first node in job |
| when it has allocated nodes |
| \item Job run request contents: |
| \begin{itemize} |
| \item credential, jobid, userinfo |
| \item stdin/out/err locations (always files) |
| \item ntasks $=1$ |
| \item command line $=$ ``{\tt /path/to/script args}'' |
| \item stop time |
| \end{itemize} |
| \item \slurmd\ handles job run request as previously explained |
| \item \slurmctld\ will keep connection open for this job, waiting |
| for task exit message, at which point job is complete. |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Batch Job Initiation, cont'd ...} |
| \begin{center} |
| \begin{itemize} |
| \item Presence of SLURM\_JOBID env var indicates to \srun\ that it |
| is running under an allocation (has access to resources). |
| \item SLURM\_HOSTLIST will also be set to the list of nodes allocated |
| \item \srun\ must query \slurmctld , however, for two reasons: |
| \begin{itemize} |
| \item \srun\ doesn't know number of cpus for each node |
| \item safer for \srun\ to obtain switch resources from controller |
| \end{itemize} |
| \item Therefore, \srun\ will first query \slurmctld\ for details of |
| its allocation |
| \item \srun\ will then process command line options relative to this |
| allocation. If a subset of nodes is specified, \srun\ will need |
| to requery the controller for switch resources. (We may be |
| able to get away without doing this 2\textsuperscript{nd} query) |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| \begin{slide} |
| \slideheading{Allocate Mode} |
| \begin{center} |
| \begin{itemize} |
| \item Allocates a set of resources to the |
| user then spawns a shell with access to those resources. |
| (SLURM\_JOBID and SLURM\_HOSTLIST are set) |
| \item initiating \srun\ will need to {\tt wait()} for exit |
| of the shell to notify \slurmctld\ that job is complete. |
| \item subsequent invocations of \srun\ behave as in the batch run |
| case |
| \end{itemize} |
| \end{center} |
| \end{slide} |
| |
| %\begin{slide} |
| % \slideheading{Attaching to a Job} |
| % \begin{center} |
| % \begin{itemize} |
| % \item User specifies job name or jobid on command line |
| % \item \srun\ connects to \slurmctld\ and sends a MSG\_JOB\_ATTACH\_REQ: |
| % \begin{itemize} |
| % \item job id |
| % \item job name |
| % \end{itemize} |
| % \item MSG\_JOB\_ATTACH\_REPLY contains similar information as |
| % MSG\_ALLOCATE\_REPLY |
| % \item \srun\ now has credentials to contact \slurmd 's in job |
| % \end{itemize} |
| % \end{center} |
| %\end{slide} |
| |
| %\begin{slide} |
| % \slideheading{} |
| % \begin{center} |
| % \begin{itemize} |
| % \end{itemize} |
| % \end{center} |
| %\end{slide} |
| |
| \end{document} |
| |
| |