R/sge_accounting.R


#' Read a tab-delimited SGE accounting file (without parsing it)
#'
#' @param file (character or a file connection) The SGE \file{accounting} file to read.
#'
#' @param offset (numeric) The file offset position (in bytes) from where to
#' start reading.  Alternatively, an `sge_accounting_index_by_week` object
#' (with columns `file_offset` and `nbr_of_jobs`), in which case each indexed
#' chunk of job entries is read and combined.
#'
#' @param n_max (numeric) The maximum number of rows to read.
#'
#' @param skip (integer) Number of lines to skip before parsing file content.
#'
#' @param \ldots (optional) Additional arguments passed to [readr::read_delim()].
#'
#' @section Location of the SGE accounting file:
#' The SGE \file{accounting} file is typically located in a subfolder of
#' the folder \file{$SGE_ROOT/$SGE_CELL/}.  On Wynton HPC, the pathname
#' is given by `sge_accounting_file()`.
#'
#' @section File offset positions for each job entry:
#' If you know the file offset (in bytes) for the first job entry you wish to read,
#' then specify it via argument `offset`.  This speeds up the reading, because it
#' avoids having to parse jobs from the beginning.  To find the file offsets for
#' job entries, see [make_file_index()].
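#'
#' A minimal sketch (the per-week index object, with columns `file_offset` and
#' `nbr_of_jobs`, is an assumption about what [make_file_index()] produces):
#' ```r
#' pathname <- sge_accounting_file()
#' index <- make_file_index(pathname)   ## hypothetical per-week index
#' last <- index[nrow(index), ]         ## most recent indexed week
#' jobs <- read_raw_sge_accounting(pathname, offset = last$file_offset,
#'                                 n_max = last$nbr_of_jobs)
#' ```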
#'
#' @example incl/raw_sge_accounting.R
#'
#' @importFrom parallelly isConnectionValid
#' @importFrom readr read_delim
#' @rdname read_sge_accounting
#' @export
read_raw_sge_accounting <- function(file, offset = 0, n_max = Inf, skip = if (is.character(file) && offset == 0) 4L else 0L, ...) {
  if (inherits(offset, "sge_accounting_index_by_week")) {
    week_index <- offset
    jobs <- list()
    for (kk in seq_len(nrow(week_index))) {
       jobs[[kk]] <- Recall(file = file, offset = week_index$file_offset[kk], n_max = week_index$nbr_of_jobs[kk], skip = 0L, ...)
    }
    jobs <- do.call(rbind, jobs)
    return(jobs)
  }

  stopifnot(is.numeric(offset), length(offset) == 1L, !is.na(offset), offset >= 0)
  stopifnot(is.numeric(n_max), length(n_max) == 1L, !is.na(n_max), n_max >= 0)
  stopifnot(is.numeric(skip), length(skip) == 1L, !is.na(skip), skip >= 0)

  header <- if (skip > 0L) readLines(file, n = skip) else character(0L)

  ## Assert that all temporary connections are closed at the end
  cons <- getAllConnections()
  on.exit(stopifnot(identical(getAllConnections(), cons)))
  
  if (inherits(file, "connection")) {
    con <- file
    stopifnot(isConnectionValid(con))
    if (offset > 0) seek(con, where = offset, origin = "start", rw = "read")
  } else {
    con <- open_file_at(file, offset = offset)
    stopifnot(isConnectionValid(con))
    on.exit(if (!is.null(con)) close(con), add = TRUE, after = FALSE)
  }
  
  col_types <- sge_accounting_col_types()
  col_names <- names(col_types$cols)
  x <- read_delim(file = con, delim = ":", col_names = col_names, col_types = col_types, skip = skip, n_max = n_max, ...)
  attr(x, "header") <- header
  class(x) <- c("raw_sge_accounting", class(x))
  x
}

#' @param x (raw_sge_accounting) A `tibble` data frame of class
#' `raw_sge_accounting`.
#'
#' @param header (character vector) Zero or more header lines to be written
#' at the top of the file.
#'
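#' A minimal round-trip sketch (pathnames are illustrative):
#' ```r
#' jobs <- read_raw_sge_accounting("accounting", n_max = 1000)
#' write_raw_sge_accounting(jobs, file = "accounting-head")
#' ```
#'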
#' @importFrom readr write_delim
#' @rdname read_sge_accounting
#' @export
write_raw_sge_accounting <- function(x, file, header = attr(x, "header"), ...) {
  stopifnot(inherits(x, "raw_sge_accounting"))

  ## AD HOC: Drop trailing zeros in doubles
  ## FIXME: Some double fields should not undergo this, at least
  ## if we look at the input data /HB 2021-05-04
  modes <- vapply(x, FUN = storage.mode, FUN.VALUE = NA_character_)
  dbl <- which(modes == "double")
  x[dbl] <- lapply(x[dbl], FUN = function(x) {
    gsub("[.]0*$", "", sprintf("%f", x))
  })

  has_header <- (length(header) > 0L)
  if (has_header) writeLines(header, con = file)
  
  write_delim(x, file = file, delim = ":", col_names = FALSE, ..., append = has_header)
}



#' @importFrom readr cols col_character col_double col_integer
sge_accounting_col_types <- function() {
  ## Source: http://manpages.ubuntu.com/manpages/bionic/man5/sge_accounting.5.html
  col_types <- cols(
    qname           = col_character(), ## Name of the cluster queue in which the job has run
    hostname        = col_character(), ## Name of the execution host
    group           = col_character(), ## The effective group id of the job owner when executing the job
    owner           = col_character(), ## Owner of the Grid Engine job
    job_name        = col_character(), ## Job name
    job_number      = col_integer(),   ## Job identifier
    account         = col_character(), ## An account string as specified by the qsub(1) or qalter(1) -A option
    priority        = col_integer(),   ## Priority value assigned to the job
    submission_time = col_double(),    ## [epoch time] Submission time
    start_time      = col_double(),    ## [epoch time] Start time
    end_time        = col_double(),    ## [epoch time] End time 
    failed          = col_integer(),   ## Indicates the problem which occurred in case a job failed (at the system level, as opposed to the job script or binary having non-zero exit status)
    exit_status     = col_integer(),   ## Exit status of the job script (or Grid Engine-specific status in case of certain error conditions)
 
    ru_wallclock    = col_double(),    ## [time interval] Difference between `end_time` and `start_time`, except that if the job fails, it is zero.
    
    ## struct rusage {
    ##    struct timeval ru_utime; /* user CPU time used */
    ##    struct timeval ru_stime; /* system CPU time used */
    ##    long   ru_maxrss;        /* maximum resident set size */
    ##    long   ru_ixrss;         /* integral shared memory size */
    ##    long   ru_idrss;         /* integral unshared data size */
    ##    long   ru_isrss;         /* integral unshared stack size */
    ##    long   ru_minflt;        /* page reclaims (soft page faults) */
    ##    long   ru_majflt;        /* page faults (hard page faults) */
    ##    long   ru_nswap;         /* swaps */
    ##    long   ru_inblock;       /* block input operations */
    ##    long   ru_oublock;       /* block output operations */
    ##    long   ru_msgsnd;        /* IPC messages sent */
    ##    long   ru_msgrcv;        /* IPC messages received */
    ##    long   ru_nsignals;      /* signals received */
    ##    long   ru_nvcsw;         /* voluntary context switches */
    ##    long   ru_nivcsw;        /* involuntary context switches */
    ## };
    ## Source: http://manpages.ubuntu.com/manpages/trusty/man2/getrusage.2.html
    ru_utime         = col_double(),   ## [time interval] Total amount of time spent executing in user mode (seconds)
    ru_stime         = col_double(),   ## [time interval] Total amount of time spent executing in kernel mode (seconds)
    ru_maxrss        = col_double(),   ## [kB] maximum resident set size
    ru_ixrss         = col_double(),   ## [kB] integral shared memory size (UNUSED)
    ru_ismrss        = col_double(),   ## [kB] integral ??? size
    ru_idrss         = col_double(),   ## [kB] integral unshared data size (UNUSED)
    ru_isrss         = col_double(),   ## [kB] integral unshared stack size (UNUSED)
    ru_minflt        = col_double(),   ## [count] page reclaims (soft page faults)
    ru_majflt        = col_double(),   ## [count] page faults (hard page faults)
    ru_nswap         = col_double(),   ## [] swaps (UNUSED)
    ru_inblock       = col_double(),   ## [count] block input operations
    ru_oublock       = col_double(),   ## [count] block output operations
    ru_msgsnd        = col_double(),   ## [] IPC messages sent (UNUSED)
    ru_msgrcv        = col_double(),   ## [] IPC messages received (UNUSED)
    ru_nsignals      = col_double(),   ## [] signals received (UNUSED)
    ru_nvcsw         = col_double(),   ## [count] voluntary context switches (number of times a context switch resulted due to a process voluntarily giving up the processor before its time slice was completed, usually to await availability of a resource)
    ru_nivcsw        = col_double(),   ## [count] involuntary context switches (number of times a context switch resulted due to a higher priority process becoming runnable or because the current process exceeded its time slice)
         
    project          = col_character(),
    department       = col_character(),  
         
    granted_pe       = col_character(),   ## The parallel environment which was selected for the job
    slots            = col_integer(),     ## The number of slots which were dispatched to the job by the scheduler
    task_number      = col_integer(),     ##        
         
    cpu              = col_double(),      ## [time interval] The CPU time usage in seconds
    mem              = col_double(),      ## [GB*s] The integral memory usage in Gbytes seconds
    io               = col_double(),      ## [GB] The  amount of data transferred in input/output operations in GB (if available, otherwise 0)
         
    category         = col_character(),
         
    iow              = col_double(),      ## [time interval] The input/output wait time in seconds (if available, otherwise 0)
    pe_taskid        = col_character(),   ## If this identifier is not equal to NONE, the task was part of parallel job, and was passed to Grid Engine via the qrsh -inherit interface
         
    maxvmem          = col_double(),      ## [bytes] The maximum vmem size in bytes
    arid             = col_integer(),     ## Advance reservation identifier
    ar_sub_time      = col_double()       ## [epoch time] Advance reservation submission time if the job uses the resources of an advance reservation; otherwise "0"
  )
}





#' Coerce to an 'sge_accounting' Object
#'
#' @param x The object to be coerced.
#' 
#' @param \ldots (optional) Additional arguments passed to the S3 methods.
#'
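#' A minimal sketch:
#' ```r
#' raw <- read_raw_sge_accounting(sge_accounting_file(), n_max = 100)
#' jobs <- as_sge_accounting(raw)
#' ```
#'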
#' @rdname read_sge_accounting
#' @export
as_sge_accounting <- function(x, ...) UseMethod("as_sge_accounting")

#' @rdname read_sge_accounting
#' @export
as_sge_accounting.raw_sge_accounting <- function(x, ...) {
  origin <- as.POSIXct("1970-01-01 00:00.00 UTC", tz = "GMT")

  ## Setting missing values
  x$ru_wallclock[x$failed != 0] <- NA_real_
  x$arid[x$arid == 0] <- NA_real_
  x$ar_sub_time[is.na(x$arid)] <- NA_real_
  for (name in c("granted_pe", "pe_taskid")) {
    value <- x[[name]]
    value[value == "NONE"] <- NA_character_
    x[[name]] <- value
  }
  
  ## epoch times
  for (name in c("submission_time", "start_time", "end_time", "ar_sub_time")) {
    value <- x[[name]]
    value[value == 0] <- NA_real_
    x[[name]] <- as.POSIXct(value, origin = origin)
  }

  ## time interval
  for (name in c("ru_wallclock", "ru_utime", "ru_stime", "iow", "cpu")) {
    value <- as.difftime(x[[name]], units = "secs")
    x[[name]] <- value
  }

  ## Convert kB to bytes (B)
  for (name in c("ru_maxrss", "ru_ixrss", "ru_ismrss", "ru_idrss", "ru_isrss")) {
    x[[name]] <- x[[name]] * 1000
  }

  ## Convert GB to bytes (B) (or GB*s to B*s)
  for (name in c("mem", "io")) {
    x[[name]] <- x[[name]] * 1000^3
  }
  
  attr(x, "spec") <- NULL
  class(x) <- c("sge_accounting", setdiff(class(x), "raw_sge_accounting"))
  x
}

#' @param format Either `"pretty"` or `"raw"`.
#'
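#' A minimal sketch (assuming `jobs` is an `sge_accounting` object):
#' ```r
#' print(jobs, format = "pretty")  ## human-readable durations and byte sizes
#' print(jobs, format = "raw")     ## underlying numeric values
#' ```
#'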
#' @importFrom prettyunits pretty_bytes pretty_dt pretty_sec
#' @rdname read_sge_accounting
#' @export
print.sge_accounting <- function(x, format = c("pretty", "raw"), ...) {
  format <- match.arg(format)
  if (format == "pretty") {
    ## Time intervals (in seconds)
    for (name in c("ru_wallclock", "ru_utime", "ru_stime", "cpu")) {  # "iow"
      if (!name %in% names(x)) next
      ## WORKAROUND: pretty_ms() does not support NA:s
      value <- x[[name]]
      ok <- which(!is.na(value))
      x[[name]][ok] <- pretty_dt(value[ok])
    }
  
    ## Memory / Transfer (in bytes)
    for (name in c("ru_maxrss", "ru_ixrss", "ru_ismrss", "ru_idrss", "ru_isrss", "io")) {
      if (!name %in% names(x)) next
      x[[name]] <- pretty_bytes(x[[name]])
    }

    ## Integral memory usage (in bytes * seconds)
    for (name in c("mem")) {
      if (!name %in% names(x)) next
      x[[name]] <- pretty_bytes(x[[name]])
    }
  }

  NextMethod()
}


#' Locate the SGE Accounting File on the Current System
#'
#' @param filename (character string) The name of the accounting file.
#'
#' @param path (character string) The path to the accounting file.
#'
#' @return (character string) The pathname to the SGE accounting file.
#' If not found, an error is thrown.
#'
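#' A minimal sketch (requires `SGE_ROOT` and `SGE_CELL` to be set, as they are
#' on SGE cluster nodes):
#' ```r
#' pathname <- sge_accounting_file()
#' file.size(pathname)
#' ```
#'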
#' @importFrom utils file_test
#' @rdname read_sge_accounting
#' @export
sge_accounting_file <- function(filename = "accounting", path = do.call(file.path, args = as.list(c(Sys.getenv(c("SGE_ROOT", "SGE_CELL")), "common")))) {
  stopifnot(file_test("-d", path))
  pathname <- file.path(path, filename)
  stopifnot(file_test("-f", pathname))
  pathname
}


#' Read an SGE accounting file
#'
#' @param file (character) The SGE \file{accounting} file to read.
#'
#' @param \ldots Additional arguments passed to [read_raw_sge_accounting()].
#'
#' @return A `tibble` data frame with columns:
#'
#'  * `qname` (character) - name of the cluster queue in which the job has run
#'
#'  * `hostname` (character) - name of the execution host
#'
#'  * `group` (character) - the effective group id of the job owner when
#'    executing the job
#'
#'  * `owner` (character) - owner of the Grid Engine job
#'
#'  * `job_name` (character) - job name
#'
#'  * `job_number` (integer) - job identifier
#'
#'  * `account` (character) - an account string as specified by the `qsub` or `qalter`
#'
#'  * `priority` (integer) - priority value assigned to the job
#'
#'  * `submission_time` (dttm) - submission time
#'
#'  * `start_time` (dttm) - start time
#'
#'  * `end_time` (dttm) - end time
#'
#'  * `failed` (integer) - indicates the problem which occurred in case a job
#'    failed (at the system level, as opposed to the job script or binary having
#'    non-zero exit status).  Indicates the problem which occurred in case a job
#'    could not be started on the execution host (e.g. because the owner of the 
#'    job did not have a valid account on that machine). If Sun Grid Engine
#'    tries to start a job multiple times, this may lead to multiple entries in
#'    the accounting file corresponding to the same job ID
#'
#'  * `exit_status` (integer) - exit status of the job script (or Grid
#'    Engine-specific status in case of certain error conditions).
#'    The exit status is determined by following the normal shell conventions.
#'    If the command terminates normally the value of the command is its exit
#'    status. However, in the case that the command exits abnormally, a value
#'    of 0200 (octal), 128 (decimal) is added to the value of the command to
#'    make up the exit status.
#'    For example: If a job dies through signal 9 (`SIGKILL`) - probably issued
#'    by Grid Engine through `qdel`, or because the job exceeded time or memory
#'    hard limits - then the exit status is 128 + 9 = 137.
#'
#'  * `ru_wallclock` (drtn) - Difference between 'end_time' and 'start_time'
#'    (time interval), except that if the job fails, it is zero.
#'
#'  * `ru_utime` (drtn) - user CPU time (in seconds) used, i.e. total amount of
#'    time spent executing in user mode
#'
#'  * `ru_stime` (drtn) - system CPU time (in seconds) used, i.e. total amount
#'    of time spent executing in kernel mode
#'
#'  * `ru_maxrss` (numeric) - maximum resident set size (in bytes)
#'
#'  * `ru_ixrss` (numeric) - integral shared memory size (in bytes) \[UNUSED\]
#'
#'  * `ru_ismrss` (numeric) - ???
#'
#'  * `ru_idrss` (numeric) - integral unshared data size (in bytes) \[UNUSED\]
#'
#'  * `ru_isrss` (numeric) - integral unshared stack size (in bytes) \[UNUSED\]
#'
#'  * `ru_minflt` (numeric) - page reclaims (soft page faults)
#'
#'  * `ru_majflt` (numeric) - page faults (hard page faults)
#'
#'  * `ru_nswap` (numeric) - number of swaps \[UNUSED\]
#'
#'  * `ru_inblock` (numeric) - number of block input operations
#'
#'  * `ru_oublock` (numeric) - number of block output operations
#'
#'  * `ru_msgsnd` (numeric) - number of IPC messages sent \[UNUSED\]
#'
#'  * `ru_msgrcv` (numeric) - number of IPC messages received \[UNUSED\]
#'
#'  * `ru_nsignals` (numeric) - number of signals received
#'
#'  * `ru_nvcsw` (numeric) - number of voluntary context switches (number of
#'    times a context switch resulted due to a process voluntarily giving up
#'    the processor before its time slice was completed, usually to await
#'    availability of a resource)
#'
#'  * `ru_nivcsw` (numeric) - number of involuntary context switches (number of
#'    times a context switch resulted due to a higher priority process becoming
#'    runnable or because the current process exceeded its time slice)
#'
#'  * `project` (character) -
#'
#'  * `department` (character) -
#'
#'  * `granted_pe` (character) - the parallel environment which was selected
#'    for the job
#'
#'  * `slots` (integer) - the number of slots which were dispatched to the job
#'    by the scheduler
#'
#'  * `task_number` (integer) -
#'
#'  * `cpu` (drtn) - The CPU time usage (in seconds)
#'
#'  * `mem` (numeric) - the integral memory usage (in byte seconds)
#'
#'  * `io` (numeric) - the amount of data transferred in input/output
#'    operations (in bytes) if available, otherwise `0`
#'
#'  * `category` (character) -
#'
#'  * `iow` (drtn) - the input/output wait time (in seconds) if available,
#'    otherwise 0
#'
#'  * `pe_taskid` (character) - if this identifier is not equal to `NONE`, the
#'    task was part of a parallel job, and was passed to Grid Engine via the
#'    `qrsh -inherit` interface
#'
#'  * `maxvmem` (numeric) - the maximum vmem size (in bytes)
#'
#'  * `arid` (numeric) - advance reservation identifier
#'
#'  * `ar_sub_time` (dttm) - advance reservation submission time, if the job
#'    uses the resources of an advance reservation; otherwise `NA`
#'
#'
#' @section Failed code:
#' 
#' |Code | Description                                    | OK | Explanation                                             |
#' | --: | ---------------------------------------------- | -- | ------------------------------------------------------- |
#' |   0 | no failure                                     | Y  | ran and exited normally                                 |
#' |   1 | assumedly before job                           | N  | failed early in execd                                   |
#' |   3 | before writing config                          | N  | failed before execd set up local spool                  |
#' |   4 | before writing PID                             | N  | shepherd failed to record its pid - filesystem problem? |
#' |   6 | setting processor set                          | N  | failed setting up processor set (obsolete)              |
#' |   7 | before prolog                                  | N  | failed before prolog                                    |
#' |   8 | in prolog                                      | N  | failed in prolog                                        |
#' |   9 | before pestart                                 | N  | failed before starting PE                               |
#' |  10 | in pestart                                     | N  | failed in PE starter                                    |
#' |  11 | before job                                     | N  | failed in shepherd before starting job                  |
#' |  12 | before pestop                                  | Y  | ran, but failed before calling PE stop procedure        |
#' |  13 | in pestop                                      | Y  | ran, but PE stop procedure failed                       |
#' |  14 | before epilog                                  | Y  | ran, but failed before calling epilog                   |
#' |  15 | in epilog                                      | Y  | ran, but failed in epilog                               |
#' |  16 | releasing processor set                        | Y  | ran, but processor set could not be released (obsolete) |
#' |  17 | through signal                                 | Y  | job killed by signal (possibly qdel)                    |
#' |  18 | shepherd returned error                        | N  | shepherd died somehow                                   |
#' |  19 | before writing exit_status                     | N  | shepherd didn't write reports correctly - probably program or machine crash |
#' |  20 | found unexpected error file                    | ?  | shepherd encountered a problem                          |
#' |  21 | in recognizing job                             | N  | qmaster asked about an unknown job (not in accounting?) |
#' |  24 | migrating (checkpointing jobs)                 | Y  | ran, will be migrated                                   |
#' |  25 | rescheduling                                   | Y  | ran, will be rescheduled                                |
#' |  26 | opening output file                            | N  | failed opening stderr/stdout file                       |
#' |  27 | searching requested shell                      | N  | failed finding specified shell                          |
#' |  28 | changing to working directory                  | N  | failed changing to start directory                      |
#' |  29 | AFS setup                                      | N  | failed setting up AFS security                          |
#' |  30 | application error returned                     | Y  | ran and exited 100 - maybe re-scheduled                 |
#' |  36 | checking configured daemons                    | N  | failed because of configured remote startup daemon      |
#' |  37 | qmaster enforced h_rt, h_cpu, or h_vmem limit  | Y  | ran, but killed due to exceeding run time limit         |
#' |  38 | adding supplementary group                     | N  | failed adding supplementary gid to job                  |
#' | 100 | assumedly after job                            | Y  | ran, but killed by a signal (perhaps due to exceeding resources), task died, shepherd died (e.g. node crash), etc. |
#' 
#' 
#' The following failed codes are specific to MS Windows:
#' 
#' |Code | Description                                    | OK | Explanation                                             |
#' | --: | ---------------------------------------------- | -- | ------------------------------------------------------- |
#' |  31 | accessing sgepasswd file                       | N  | failed because sgepasswd not readable*                  |
#' |  32 | entry is missing in password file              | N  | failed because user not in sgepasswd*                   |
#' |  33 | wrong password                                 | N  | failed because of wrong password against sgepasswd*     |
#' |  34 | communicating with Grid Engine Helper Service  | N  | failed because of failure of helper service*            |
#' |  35 | before job in Grid Engine Helper Service       | N  | failed because of failure running helper service*       |
#' 
#' 
#' Source: `man sge_status`.
#'
#' In addition to the above, I, the package author, have tried to gather
#' additional information about the `failed` codes below, based on real-world
#' observations.
#' 
#' * `21`: When this happens, both `qname` and `hostname` are `UNKNOWN`, 
#'         `qsub_time` is 0 ("Wed Dec 31 16:00:00 1969"),
#'         `start_time` and `end_time` are 0 ("-/-"),
#'         all run-time data, including `ru_wallclock`, are all `0`.
#'         It appears to happen to old jobs (per `job_number`) from times
#'         before one or more major downtimes.  Because of this, I believe
#'         these are from jobs with `Eqw` state that SGE eventually tries
#'         to flush out.
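#'
#' A hedged sketch for annotating jobs with these failure descriptions (the
#' column names follow the `sge_failed_codes()` table defined at the end of
#' this file):
#' ```r
#' jobs <- read_sge_accounting(n_max = 1000)
#' codes <- sge_failed_codes()
#' jobs2 <- merge(jobs, codes, by.x = "failed", by.y = "Code", all.x = TRUE)
#' table(jobs2$Description, useNA = "ifany")
#' ```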
#' 
#' @section Common exit codes:
#'
#' |Code     | Description                                                 |
#' | --:     | ----------------------------------------------------------- |
#' |   0     | Success                                                     |
#' |   1     | Catchall for general errors                                 |
#' |   2     | Misuse of shell builtins (according to Bash documentation)  |
#' | 126     | Command invoked cannot execute, e.g. `/dev/null`            |
#' | 127     | "command not found"                                         |
#' | 128     | Invalid argument to exit, e.g. `exit 3.14`                  |
#' | 128 + n | Fatal error signal n                                        |
#' | 134     | 128 +   6 = 128 + `SIGABRT` - Abort signal from abort       |
#' | 135     | 128 +   7 = 128 + `SIGBUS`  - Bus error (bad memory access) |
#' | 136     | 128 +   8 = 128 + `SIGFPE`  - Floating-point exception      |
#' | 137     | 128 +   9 = 128 + `SIGKILL`                                 |
#' | 138     | 128 +  10 = 128 + `SIGUSR1`                                 |
#' | 140     | 128 +  12 = 128 + `SIGUSR2`                                 |
#' | 255     | 128 + 127 = Exit status out of range, e.g. `exit -1`        |
#'
#' Comment: `exit` only takes integers in \[0,255\]
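#'
#' A minimal sketch for recovering the signal number from exit statuses above
#' 128 (assuming `jobs` is an `sge_accounting` object):
#' ```r
#' signo <- ifelse(jobs$exit_status > 128, jobs$exit_status - 128L, NA_integer_)
#' table(signo, useNA = "ifany")  ## e.g. 9 corresponds to SIGKILL
#' ```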
#'
#'
#' @section Benchmarking:
#' The \file{accounting} file on Wynton HPC took ~2 minutes to read when it
#' was 4.8 GB in size, and ~6-8 minutes when it was 12 GB in size.
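#'
#' To get a feel for the data without parsing the full file, one can read only
#' the first entries (a sketch; `n_max` is forwarded to
#' `read_raw_sge_accounting()`):
#' ```r
#' jobs <- read_sge_accounting(n_max = 10e3)
#' ```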
#'
#'
#' @example incl/sge_accounting.R
#'
#' @references
#' * `man accounting`
#' * `man sge_status`
#'
#' @export
read_sge_accounting <- function(file = sge_accounting_file(), ...) {
  ## Assert that all temporary connections are closed at the end
  cons <- getAllConnections()
  on.exit(stopifnot(identical(getAllConnections(), cons)))
  
  data <- read_raw_sge_accounting(file = file, ...)
  data <- as_sge_accounting(data)
  data
}


#' Parse SGE Accounting 'category' Field
#'
#' @param x An `sge_accounting` object.
#' 
#' @param \ldots (optional) Not used.
#'
#' @return A `tibble` data frame with columns corresponding to the requested
#' properties.
#'
#' @rdname read_sge_accounting
#' @export
parse_category <- function(x, ...) {
  UseMethod("parse_category")
}

#' @param properties (character vector) The properties to extract.
#'
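#' A minimal usage sketch (assuming `jobs` is an `sge_accounting` object):
#' ```r
#' reqs <- parse_category(jobs, properties = c("h_rt", "mem_free"))
#' summary(as.numeric(reqs$h_rt))  ## requested runtimes (seconds)
#' summary(reqs$mem_free / 1e9)    ## requested memory (GB)
#' ```
#'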
#' @importFrom tibble as_tibble
#' @rdname read_sge_accounting
#' @export
parse_category.sge_accounting <- function(x, properties = c("h_rt", "s_rt", "mem_free"), ...) {
  properties <- match.arg(properties, several.ok = TRUE)

  category <- x$category
  n <- length(category)

  res <- list()
  for (field in properties) {
    if (field %in% c("h_rt", "s_rt")) {
      value <- rep(NA_integer_, times = n)
      pattern <- sprintf(".*%s=([[:digit:]]+).*", field)
      keep <- grep(pattern, category)
      if (length(keep) > 0L) {
        value[keep] <- as.integer(gsub(pattern, "\\1", category[keep]))
      }
      value <- .difftime(value, units = "secs")
    } else if (field == "mem_free") {
      value <- rep(NA_real_, times = n)
      pattern <- sprintf(".*%s=([[:digit:]]+[MG]).*", field)
      keep <- grep(pattern, category)
      if (length(keep) > 0L) {
        tmp <- gsub(pattern, "\\1", category[keep])
        ## Expand unit suffixes, e.g. "4G" -> "4000M" -> "4000000K" -> "4000000000"
        tmp <- gsub("G$", "000M", tmp)
        tmp <- gsub("M$", "000K", tmp)
        tmp <- gsub("K$", "000", tmp)
        value[keep] <- tmp
      }
      value <- as.numeric(value)
    }
    res[[field]] <- value
  }

  as_tibble(res)
}


#' Table of SGE failed codes with descriptions
#'
#' @return A tibble
#'
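#' A minimal sketch:
#' ```r
#' codes <- sge_failed_codes()
#' subset(codes, !OK)  ## codes indicating the job never ran properly
#' ```
#'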
#' @importFrom readr read_delim
#' @rdname read_sge_accounting
#' @export
sge_failed_codes <- local({
  codes <- NULL  ## memoized across calls
  function() {
    if (is.null(codes)) {
      ## Parse the table once and cache it in the enclosing environment
      codes <<- read_delim(delim = ";", col_names = TRUE, col_types = "iclc", trim_ws = TRUE, file = 
        "Code; Description                                  ; OK; Explanation                                            
            0; no failure                                   ; TRUE  ; ran and exited normally                              
            1; assumedly before job                         ; FALSE ; failed early in execd                                  
            3; before writing config                        ; FALSE ; failed before execd set up local spool                 
            4; before writing PID                           ; FALSE ; shepherd failed to record its pid - filesystem problem?
            6; setting processor set                        ; FALSE ; failed setting up processor set (obsolete)             
            7; before prolog                                ; FALSE ; failed before prolog                                   
            8; in prolog                                    ; FALSE ; failed in prolog                                       
            9; before pestart                               ; FALSE ; failed before starting PE                              
           10; in pestart                                   ; FALSE ; failed in PE starter                                   
           11; before job                                   ; FALSE ; failed in shepherd before starting job                 
           12; before pestop                                ; TRUE  ; ran, but failed before calling PE stop procedure       
           13; in pestop                                    ; TRUE  ; ran, but PE stop procedure failed                      
           14; before epilog                                ; TRUE  ; ran, but failed before calling epilog                  
           15; in epilog                                    ; TRUE  ; ran, but failed in epilog                              
           16; releasing processor set                      ; TRUE  ; ran, but processor set could not be released (obsolete)
           17; through signal                               ; TRUE  ; job killed by signal (possibly qdel)                   
           18; shepherd returned error                      ; FALSE ; shepherd died somehow                                  
           19; before writing exit_status                   ; FALSE ; shepherd didn't write reports correctly - probably program or machine crash
           20; found unexpected error file                  ; NA    ; shepherd encountered a problem                         
           21; in recognizing job                           ; FALSE ; qmaster asked about an unknown job (not in accounting?)
           24; migrating (checkpointing jobs)               ; TRUE  ; ran, will be migrated                                  
           25; rescheduling                                 ; TRUE  ; ran, will be rescheduled                               
           26; opening output file                          ; FALSE ; failed opening stderr/stdout file                      
           27; searching requested shell                    ; FALSE ; failed finding specified shell                         
           28; changing to working directory                ; FALSE ; failed changing to start directory                     
           29; AFS setup                                    ; FALSE ; failed setting up AFS security                         
           30; application error returned                   ; TRUE  ; ran and exited 100 - maybe re-scheduled                
           36; checking configured daemons                  ; FALSE ; failed because of configured remote startup daemon     
           38; adding supplementary group                   ; FALSE ; failed adding supplementary gid to job                 
           37; qmaster enforced h_rt, h_cpu, or h_vmem limit; TRUE  ; ran, but killed due to exceeding run time limit        
          100; assumedly after job                          ; TRUE  ; ran, but killed by a signal (perhaps due to exceeding resources), task died, shepherd died (e.g. node crash), etc.
      ")
    } # if (is.null(codes))
    codes
  }
})