# (Web-page residue, not part of the package source: "Nothing")
#' @title ClusterFunctions for Slurm Systems
#'
#' @description
#' Cluster functions for Slurm (\url{https://slurm.schedmd.com/}).
#'
#' Job files are created based on the brew template \code{template.file}. This
#' file is processed with brew and then submitted to the queue using the
#' \code{sbatch} command. Jobs are killed using the \code{scancel} command and
#' the list of running jobs is retrieved using \code{squeue}. The user must
#' have the appropriate privileges to submit, delete and list jobs on the
#' cluster (this is usually the case).
#'
#' The template file can access all resources passed to \code{\link{submitJobs}}
#' as well as all variables stored in the \code{\link{JobCollection}}.
#' It is the template file's job to choose a queue for the job and handle the desired resource
#' allocations.
#'
#' Note that you might have to specify the cluster name here if you do not want to use the default,
#' otherwise the commands for listing and killing jobs will not work.
#'
#' @template template
#' @param array.jobs [\code{logical(1)}]\cr
#' If array jobs are disabled on the computing site, set to \code{FALSE}.
#' @template nodename
#' @inheritParams makeClusterFunctions
#' @return [\code{\link{ClusterFunctions}}].
#' @family ClusterFunctions
#' @export
makeClusterFunctionsSlurm = function(template = "slurm", array.jobs = TRUE, nodename = "localhost", scheduler.latency = 1, fs.latency = 65) { # nocov start
  assertFlag(array.jobs)
  assertString(nodename)

  # Keep the resolved path in its own variable: if the lookup yields NA, the
  # error message must still show what the caller passed in, not "NA".
  template.file = findTemplateFile(template)
  if (testScalarNA(template.file))
    stopf("Argument 'template' (=\"%s\") must point to a readable template file", template)
  template = cfReadBrewTemplate(template.file, "##")

  # Argument quoting helper (locally shadows base::quote): arguments are passed
  # through untouched on the local host and shell-quoted otherwise.
  # NOTE(review): presumably this keeps "$USER" unexpanded until it reaches the
  # remote shell -- confirm against runOSCommand().
  quote = if (isLocalHost(nodename)) identity else shQuote

  # Collect the distinct "cluster" entries of all resource sets stored in the
  # registry. Returns a single comma-separated string, or character(0) if no
  # resource set names a cluster.
  getClusters = function(reg) {
    clusters = filterNull(lapply(reg$resources$resources, "[[", "cluster"))
    if (length(clusters))
      return(stri_flatten(unique(as.character(clusters)), ","))
    return(character(0L))
  }

  # Brew the job file from the template and hand it to sbatch.
  # Returns the object created by makeSubmitJobResult(): status 0 on success,
  # the index of the matched message for a known temporary error, or whatever
  # cfHandleUnknownSubmitError() produces for anything else.
  submitJob = function(reg, jc) {
    assertRegistry(reg, writeable = TRUE)
    assertClass(jc, "JobCollection")

    if (jc$array.jobs) {
      # Reject unsupported array jobs *before* calling sbatch. Checking after
      # the submission (as was done previously) would leave the job in the
      # queue while the caller only sees an error and never learns its id.
      if (!array.jobs)
        stop("Array jobs not supported by cluster function")
      # One log file per array task; "%a" is left in the log file name for the
      # job file (presumably substituted with the array task index by Slurm).
      logs = sprintf("%s_%i", fs::path_file(jc$log.file), seq_row(jc$jobs))
      jc$log.file = stri_join(jc$log.file, "_%a")
    }

    outfile = cfBrewTemplate(reg, template, jc)
    res = runOSCommand("sbatch", shQuote(outfile), nodename = nodename)
    output = stri_flatten(stri_trim_both(res$output), "\n")

    if (res$exit.code > 0L) {
      # Messages treated as temporary errors; the position of the matched
      # message is reported as the status code.
      temp.errors = c(
        "Batch job submission failed: Job violates accounting policy (job submit limit, user's size and/or time limits)",
        "Socket timed out on send/recv operation",
        "Submission rate too high, suggest using job arrays"
      )
      i = wf(stri_detect_fixed(output, temp.errors))
      if (length(i) == 1L)
        return(makeSubmitJobResult(status = i, batch.id = NA_character_, msg = temp.errors[i]))
      return(cfHandleUnknownSubmitError("sbatch", res$exit.code, res$output))
    }

    # Extract the job id from "Submitted batch job <id>[ on cluster <name>]".
    # The phrase is searched anywhere in the (possibly multi-line) output so
    # that extra lines printed before or after it do not corrupt the id. If
    # the phrase is not found, fall back to the 4th space-separated token.
    hit = regmatches(output, regexpr("Submitted batch job [0-9]+", output))
    id = if (length(hit) == 1L) {
      sub("^Submitted batch job ", "", hit)
    } else {
      stri_split_fixed(output[1L], " ")[[1L]][4L]
    }

    if (jc$array.jobs) {
      # Array tasks are addressed as "<jobid>_<task index>".
      makeSubmitJobResult(status = 0L, batch.id = sprintf("%s_%i", id, seq_row(jc$jobs)), log.file = logs)
    } else {
      makeSubmitJobResult(status = 0L, batch.id = id)
    }
  }

  # Run squeue with the given filter arguments and return its output lines
  # (job ids only, because of "--noheader --format=%i").
  listJobs = function(reg, args) {
    assertRegistry(reg, writeable = FALSE)
    args = c(args, "--noheader", "--format=%i")
    if (array.jobs)
      args = c(args, "-r") # list each array task on its own line
    clusters = getClusters(reg)
    if (length(clusters))
      args = c(args, sprintf("--clusters=%s", clusters))
    res = runOSCommand("squeue", args, nodename = nodename)
    if (res$exit.code > 0L)
      OSError("Listing of jobs failed", res)
    # When clusters are given, the first output line is dropped.
    # NOTE(review): presumably this is the "CLUSTER: <name>" banner; with
    # several clusters squeue may print one banner per cluster, of which only
    # the first is removed here -- confirm.
    if (length(clusters)) tail(res$output, -1L) else res$output
  }

  # Full List of Slurm job state codes:
  # https://slurm.schedmd.com/squeue.html
  # BF,CA,CD,CF,CG,DL,F,NF,OOM,PD,PR,R,RD,RF,RH,RS,RV,SI,SE,SO,ST,S,TO
  # Querying by RD (RESV_DEL_HOLD) status throwing error on slurm v20.11.4

  # Ids of the current user's jobs in one of the states treated as "queued".
  listJobsQueued = function(reg) {
    args = c(quote("--user=$USER"), "--states=PD,CF,RF,RH,RQ,SE")
    listJobs(reg, args)
  }

  # Ids of the current user's jobs in one of the states treated as "running".
  listJobsRunning = function(reg) {
    args = c(quote("--user=$USER"), "--states=R,S,CG,RS,SI,SO,ST")
    listJobs(reg, args)
  }

  # Slurm job state codes that will result in an expired status:
  # BF,CA,CD,DL,F,NF,OOM,PR,RV,TO,RD

  # Cancel a single job via scancel. sprintf() yields character(0) when no
  # cluster is configured, so the "--clusters" argument is then omitted.
  killJob = function(reg, batch.id) {
    assertRegistry(reg, writeable = TRUE)
    assertString(batch.id)
    cfKillJob(reg, "scancel", c(sprintf("--clusters=%s", getClusters(reg)), batch.id), nodename = nodename)
  }

  makeClusterFunctions(name = "Slurm", submitJob = submitJob, killJob = killJob, listJobsRunning = listJobsRunning,
    listJobsQueued = listJobsQueued, array.var = "SLURM_ARRAY_TASK_ID", store.job.collection = TRUE,
    store.job.files = !isLocalHost(nodename), scheduler.latency = scheduler.latency, fs.latency = fs.latency)
} # nocov end
# (Web-page residue, not part of the package source:)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.