#' Wrapper for running an AFNI command safely within R
#'
#' @param args AFNI command string to be run
#' @param afnidir Location of AFNI installation. If NULL, the function will search the environment for AFNIDIR or afni on PATH
#' @param stdout File target for redirecting stdout. If NULL, stdout will not be written to file.
#' @param stderr File target for redirecting stderr. If NULL, stderr will not be written to file.
#' @param echo Whether to print AFNI command to the screen. Default: TRUE
#' @param omp_num_threads number of OpenMP threads to use for this AFNI command (if supported). Default: 1
#' @param ... Arguments passed through to the `system` command.
#'
#' @return The exit status of the executed AFNI command. 0 for success, non-zero for failure
#'
#' @details
#'
#' This function ensures that AFNI commands are run in an environment where AFNI is set up correctly.
#'
#' @author Michael Hallquist
#' @export
#'
#' @examples
#'
#' \dontrun{
#' run_afni_command("3dcopy test_data copy_data")
#' }
run_afni_command <- function(args, afnidir=NULL, stdout=NULL, stderr=NULL, echo = TRUE, omp_num_threads=1L, ...) {
checkmate::assert_string(args, null.ok = FALSE)
checkmate::assert_string(afnidir, null.ok = TRUE)
if (!is.null(afnidir)) checkmate::assert_directory_exists(afnidir)
checkmate::assert_string(stdout, null.ok = TRUE)
checkmate::assert_string(stderr, null.ok = TRUE)
checkmate::assert_logical(echo, len = 1L)
checkmate::assert_integerish(omp_num_threads, lower = 1, len = 1)
if (is.null(afnidir)) {
# look for AFNIDIR in system environment if not passed in
afnidir <- Sys.getenv("AFNIDIR")
if (isFALSE(nzchar(afnidir))) {
# look for location of afni binary on PATH
afni_loc <- system("command -v afni", intern = TRUE)
exit_code <- attr(afni_loc, "status")
if (!is.null(exit_code) && exit_code == 1) {
warning("Could not find afni using AFNIDIR or system PATH. Defaulting to ", paste0(normalizePath("~/"), "/abin"))
afnidir <- paste0(normalizePath("~/"), "/abin")
} else {
afnidir <- dirname(afni_loc)
}
}
}
checkmate::assert_file_exists(file.path(afnidir, "afni")) # make sure afni actually exists in this folder
cur_threads <- Sys.getenv("OMP_NUM_THREADS")
if (omp_num_threads > 1L) {
if (isTRUE(echo)) cat("Setting OMP_NUM_THREADS to:", omp_num_threads, "\n")
Sys.setenv(OMP_NUM_THREADS = omp_num_threads) # setup OMP threads
}
Sys.setenv(AFNIDIR=afnidir) #export to R environment
afnisetup <- paste0("AFNIDIR=", afnidir, "; PATH=${AFNIDIR}:${PATH}; DYLD_FALLBACK_LIBRARY_PATH=${AFNIDIR}; ${AFNIDIR}/")
afnicmd <- paste0(afnisetup, args)
if (!is.null(stdout)) afnicmd <- paste(afnicmd, ">", stdout)
if (!is.null(stderr)) afnicmd <- paste(afnicmd, "2>", stderr)
if (isTRUE(echo)) cat("AFNI command:", afnicmd, "\n")
retcode <- system(afnicmd, ...)
if (omp_num_threads > 1L && cur_threads != "") {
Sys.setenv(OMP_NUM_THREADS = cur_threads) # reset threads
}
return(retcode)
}
#' Wrapper for running an FSL command safely within R
#'
#' @param args FSL command string to be run
#' @param fsldir Location of FSL installation. If NULL, the function will search the environment for FSLDIR or FSL commands in the PATH
#' @param stdout File target for redirecting stdout. If NULL, stdout will not be redirected to a file.
#' @param stderr File target for redirecting stderr. If NULL, stderr will not be redirected to a file.
#' @param echo Whether to print FSL command to the screen. Default: TRUE
#' @param ... Arguments passed through to the `system` command.
#'
#' @return The exit status of the executed FSL command. 0 for success, non-zero for failure
#'
#' @details
#'
#' This function ensures that FSL commands are run in an environment where FSL is set up correctly.
#'
#' @author Michael Hallquist
#' @export
#'
#' @examples
#' \dontrun{
#' run_fsl_command("fslmaths test_data copy_data")
#' }
run_fsl_command <- function(args, fsldir = NULL, stdout = NULL, stderr = NULL, echo = TRUE, ...) {
checkmate::assert_string(args, null.ok = FALSE)
checkmate::assert_string(fsldir, null.ok = TRUE)
if (!is.null(fsldir)) checkmate::assert_directory_exists(fsldir)
checkmate::assert_string(stdout, null.ok = TRUE)
checkmate::assert_string(stderr, null.ok = TRUE)
checkmate::assert_logical(echo, len = 1L)
if (is.null(fsldir)) {
# look for FSLDIR in system environment if not passed in
fsldir <- Sys.getenv("FSLDIR")
if (isFALSE(nzchar(fsldir))) {
# check for FSLDIR in .bashrc or .profile
bashrc_fsldir <- ""
if (file.exists("~/.profile")) {
bashrc_fsldir <- system("source ~/.profile && echo $FSLDIR", intern = TRUE)
}
if (nzchar(bashrc_fsldir) && file.exists("~/.bashrc")) {
bashrc_fsldir <- system("source ~/.bashrc && echo $FSLDIR", intern = TRUE)
}
# Fallback: look for location of fsl feat on PATH
if (nzchar(bashrc_fsldir)) {
feat_loc <- system("command -v feat", intern = TRUE)
exit_code <- attr(feat_loc, "status")
if (!is.null(exit_code) && exit_code == 1) {
warning("Could not find FSL using FSLDIR or system PATH. Defaulting to Defaulting to /usr/local/fsl.")
fsldir <- "/usr/local/fsl"
} else {
fsldir <- dirname(dirname(feat_loc))
}
}
}
}
checkmate::assert_file_exists(file.path(fsldir, "bin", "feat")) # make sure FSL actually exists in this folder
Sys.setenv(FSLDIR=fsldir) #export to R environment
fslsetup <- paste0("FSLDIR=", fsldir, "; PATH=${FSLDIR}/bin:${PATH}; . ${FSLDIR}/etc/fslconf/fsl.sh; ${FSLDIR}/bin/")
fslcmd <- paste0(fslsetup, args)
if (!is.null(stdout)) fslcmd <- paste(fslcmd, ">", stdout)
if (!is.null(stderr)) fslcmd <- paste(fslcmd, "2>", stderr)
if (isTRUE(echo)) cat("FSL command: ", fslcmd, "\n")
retcode <- system(fslcmd, ...)
return(retcode)
}
# Internal function used to get the commands that should be included in batch submission scripts
# to setup the compute environment.
# @param gpa a glm_pipeline_arguments object
# @param what which programs to pull from the compute environment. Defaults to "all" (everything).
#   Other options are "global" (only global setup), "fsl", "afni", and "r"; multiple values may be supplied.
get_compute_environment <- function(gpa, what="all") {
# always include global
compute_string <- gpa$parallel$compute_environment$global
if (any(what %in% c("all", "fsl"))) {
compute_string <- c(compute_string, gpa$parallel$compute_environment$fsl)
}
if (any(what %in% c("all", "afni"))) {
compute_string <- c(compute_string, gpa$parallel$compute_environment$afni)
}
if (any(what %in% c("all", "r"))) {
compute_string <- c(compute_string, gpa$parallel$compute_environment$r)
}
return(compute_string)
}
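# Example (illustrative): with gpa$parallel$compute_environment$global = "module use /proj/modules"
# and gpa$parallel$compute_environment$fsl = "module load fsl/6.0.4",
# get_compute_environment(gpa, what = "fsl") would return
# c("module use /proj/modules", "module load fsl/6.0.4")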
# write a small test script for the programs used in the compute environment
# currently capable of testing FSL, R, and AFNI
test_compute_environment <- function(gpa, what="all", stop_on_fail=TRUE) {
#"set -x",
prog_str <- c(
"#!/bin/bash", "checks_passed=1", "",
"echo 'Checking the compute environment settings'",
"echo '-----------------------------------------'",
get_compute_environment(gpa, what = what)
)
if (any(what %in% c("all", "afni"))) {
prog_str <- c(
prog_str,
"afni_loc=$(command -v afni)
[ $? -eq 0 ] && afni_exists=1 || afni_exists=0
if [ $afni_exists -eq 0 ]; then
echo 'Cannot find afni!'
checks_passed=0
else
echo \"afni found: ${afni_loc}\"
fi"
)
}
if (any(what %in% c("all", "fsl"))) {
prog_str <- c(
prog_str,
"feat_loc=$(command -v feat)
[ $? -eq 0 ] && feat_exists=1 || feat_exists=0
if [ $feat_exists -eq 0 ]; then
echo 'Cannot find FSL feat!'
checks_passed=0
else
echo \"feat found: ${feat_loc}\"
fi
if [ -n \"$FSLDIR\" ]; then
echo \"FSLDIR set: $FSLDIR\"
else
echo 'FSLDIR is not set. This is necessary for FSL setup.'
checks_passed=0
fi
"
)
}
if (any(what %in% c("all", "r"))) {
prog_str <- c(
prog_str,
"r_loc=$(command -v R)
[ $? -eq 0 ] && r_exists=1 || r_exists=0
if [ $r_exists -eq 0 ]; then
echo 'Cannot find R!'
checks_passed=0
else
echo \"R found: ${r_loc}\"
fi
"
)
}
prog_str <- c(
prog_str,
"
if [ $checks_passed -eq 0 ]; then
echo 'One or more checks failed.'
exit 1
else
echo 'All checks passed. Please still verify that the program locations match your expectations!'
fi
"
)
check_script <- tempfile(pattern="chk")
writeLines(prog_str, check_script)
res <- suppressWarnings(system(paste("bash", check_script), intern = TRUE))
cat(res, sep = "\n")
exit_code <- attr(res, "status")
if (!is.null(exit_code) && exit_code == 1) {
if (stop_on_fail) {
stop("One or more problems were detected in the compute environment. Setup cannot continue.")
} else {
warning("One or more problems were detected in the compute environment. The pipeline may fail if these are not resolved.")
}
}
}
#' internal function for getting a file extension that may include a compressed ending.
#' This is adapted from neurobase::file_imgext, but extended to other file types.
#' @param file character vector of file names to parse
#' @param withdot if TRUE, include the leading dot in the returned extension. Default: TRUE
#' @keywords internal
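#' @examples
#' \dontrun{
#' file_ext("sub-01_task-rest_bold.nii.gz") # ".nii.gz"
#' file_ext("design.fsf") # NA (unrecognized extension)
#' }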
file_ext <- function(file, withdot = TRUE) {
file <- tolower(file)
matches <- grepl("^.*\\.(csv|dat|hdr|img|brik|head|nii|txt|tsv|yaml|json|1d)(\\.gz|\\.bz2|\\.zip|\\.xz)*$", file)
ext <- rep(NA_character_, length.out = length(file)) # return NA for inputs that can't be parsed
ext[matches] <- sub("^(.*)\\.(csv|dat|hdr|img|brik|head|nii|txt|tsv|yaml|json|1d)(\\.gz|\\.bz2|\\.zip|\\.xz)*$", "\\2\\3", file[matches])
if (isTRUE(withdot)) ext[matches] <- paste0(".", ext[matches])
return(ext)
}
#' internal function for returning a file (and path) without its extension.
#' At present, it returns NA if no recognized extension is present.
#' @param file character vector of file names to parse
#' @keywords internal
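#' @examples
#' \dontrun{
#' file_sans_ext("/data/sub-01_task-rest_bold.nii.gz") # "/data/sub-01_task-rest_bold"
#' }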
file_sans_ext <- function(file) {
matches <- grepl("^.*\\.(csv|dat|hdr|img|brik|head|nii|txt|tsv|yaml|json|1d)(\\.gz|\\.bz2|\\.zip|\\.xz)*$", file, ignore.case = TRUE)
fout <- rep(NA_character_, length.out = length(file)) # return NA for inputs that can't be parsed
fout[matches] <- sub("^(.*)\\.(csv|dat|hdr|img|brik|head|nii|txt|tsv|yaml|json|1d)(\\.gz|\\.bz2|\\.zip|\\.xz)*$", "\\1", file[matches], ignore.case = TRUE)
return(fout)
}
#' helper function for generating motion regressors from raw 6-parameter motion coregistration
#'
#' @param motion_params_file file containing 6 motion parameters
#' @param col.names names of columns in \code{motion_params_file}, in order from left to right
#' @param drop_volumes number of volumes to drop from beginning of motion params
#' @param last_volume final volume to include from motion params (if truncated at end). If \code{NULL},
#' then the end of the time series is not truncated.
#' @param rot_units units of the rotation parameters. Default: "rad" (radians); "deg" (degrees) is also supported
#' @param tra_units units of the translation parameters. Only "mm" (millimeters) is supported at present
#' @param demean if TRUE, demean all motion regressor columns. Default: FALSE
#' @param na.strings strings to treat as NA when reading \code{motion_params_file}
#' @param lg a Logger object used for logging messages
#'
#' @importFrom data.table fread setnames
#' @importFrom checkmate assert_file_exists assert_subset assert_integerish
#' @keywords internal
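#' @examples
#' \dontrun{
#' # hypothetical usage, assuming a 6-column FSL-style motion.par file
#' lg <- lgr::get_logger("motion")
#' mot <- generate_motion_regressors("motion.par", drop_volumes = 2L, lg = lg)
#' # columns include the raw parameters, derivatives (d*), quadratics (q*),
#' # and framewise_displacement
#' }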
generate_motion_regressors <- function(motion_params_file = "motion.par",
col.names = c("rx", "ry", "rz", "tx", "ty", "tz"),
demean = FALSE, drop_volumes = 0L, last_volume=NULL,
rot_units="rad", tra_units="mm",
na.strings="NA", lg=NULL) {
checkmate::assert_class(lg, "Logger")
checkmate::assert_file_exists(motion_params_file)
if (is.null(col.names)) {
# explicit defaults in case of null
lg$debug("In generate_motion_regressors, defaulting column order rx, ry, rz, tx, ty, tz")
col.names <- c("rx", "ry", "rz", "tx", "ty", "tz")
}
checkmate::assert_subset(col.names, c("rx", "ry", "rz", "tx", "ty", "tz"))
checkmate::assert_logical(demean, len = 1L)
checkmate::assert_integerish(drop_volumes, lower = 0)
checkmate::assert_subset(rot_units, c("rad", "deg"))
checkmate::assert_subset(tra_units, c("mm"))
checkmate::assert_character(na.strings)
# read raw motion parameters file
mot <- data.table::fread(motion_params_file, na.strings = na.strings)
if (ncol(mot) != 6L) {
lg$error("In generate_motion_regressors, motion file %s has %d columns instead of 6!", motion_params_file, ncol(mot))
return(invisible(NULL))
} else {
data.table::setnames(mot, col.names)
}
if (is.null(last_volume)) { last_volume <- nrow(mot) }
checkmate::assert_integerish(last_volume, upper=nrow(mot))
# subset motion regressors to match drops and truncations in fmri data
mot <- mot[(1 + drop_volumes):last_volume, ]
mot_deriv <- mot[, lapply(.SD, function(x) { c(0, diff(x)) })]
data.table::setnames(mot_deriv, paste0("d", names(mot))) #add delta to names
mot <- cbind(mot, mot_deriv)
mot_quad <- mot[, lapply(.SD, function(x) { x^2 })]
data.table::setnames(mot_quad, paste0("q", names(mot))) # add quadratic prefix to names
mot <- cbind(mot, mot_quad)
# https://wiki.cam.ac.uk/bmuwiki/FMRI
if (rot_units=="rad") {
framewise_displacement <- apply(mot_deriv[, c("drx", "dry", "drz")], 1, function(x) 50 * sum(abs(x))) +
apply(mot_deriv[, c("dtx", "dty", "dtz")], 1, function(x) sum(abs(x)))
} else if (rot_units=="deg") {
framewise_displacement <- apply(mot_deriv[, c("drx", "dry", "drz")], 1, function(x) 50 * (pi / 180) * sum(abs(x))) +
apply(mot_deriv[, c("dtx", "dty", "dtz")], 1, function(x) sum(abs(x)))
} else {
stop("Unrecognized rot_units: ", rot_units)
}
mot <- cbind(mot, framewise_displacement = framewise_displacement)
# Demean all columns, if requested
# Note that this is not used internally at present to avoid surprising results when testing motion thresholds (e.g., framewise_displacement)
if (isTRUE(demean)) {
mot <- mot[, lapply(.SD, function(x) { x - mean(x, na.rm=TRUE) }) ]
}
## just PCA motion on the current run
## mregressors <- pca_motion(run_nifti[r], runlengths[r], motion_parfile="motion.par", numpcs=3, drop_volumes=drop_volumes)$motion_pcs_concat
return(as.data.frame(mot))
}
#' compute spike regressors from a data.frame or matrix of motion parameters and a set of
#' expressions that are evaluated against this parameter matrix.
#'
#' @param mot a data.frame or matrix with volumes on rows and named motion parameters on columns
#' @param spike_volume a character vector of expressions to evaluate against \code{mot}. An expression may
#'   optionally be prefixed by a set of volume offsets and a semicolon (e.g., "-1:1; framewise_displacement > 0.9")
#'   to also flag the volumes surrounding each spike. Resulting columns will be prefixed with the names of each
#'   expression. If the expressions are unnamed, prefixes will be expr1_, expr2_, etc.
#' @param lg a Logger object used for logging errors
#'
#' @return a matrix of spike regressors (volumes on rows, spike regressors on columns)
#' @importFrom dplyr lead lag
#' @keywords internal
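#' @examples
#' \dontrun{
#' # hypothetical usage: flag any volume with FD > 0.9, as well as the volumes
#' # immediately before and after it (-1:1 offsets)
#' spikes <- compute_spike_regressors(
#'   mot, spike_volume = c(fd = "-1:1; framewise_displacement > 0.9"),
#'   lg = lgr::get_logger("spikes")
#' )
#' }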
compute_spike_regressors <- function(mot = NULL, spike_volume = NULL, lg=NULL) {
if (is.null(mot)) return(NULL)
if (is.null(spike_volume)) return(NULL)
checkmate::assert_character(spike_volume, null.ok = TRUE)
checkmate::assert_class(lg, "Logger")
spikes <- do.call(cbind, lapply(seq_along(spike_volume), function(ii) {
has_bounds <- grepl(";", spike_volume[ii], fixed = TRUE)
if (isTRUE(has_bounds)) {
esplit <- strsplit(spike_volume[ii], "\\s*;\\s*", perl = TRUE)[[1L]]
stopifnot(length(esplit) == 2L)
spike_bounds <- as.integer(eval(parse(text = paste0("c(", esplit[1], ")"))))
expr <- esplit[2L]
} else {
spike_bounds <- 0L # only volume where expr is TRUE
expr <- spike_volume[ii]
}
spike_vec <- tryCatch(with(mot, eval(parse(text = expr))),
error = function(e) {
lg$error("Problem evaluating spike regressor expression: %s", expr)
return(NULL)
}
)
if (is.null(spike_vec)) return(NULL) # expression could not be evaluated
which_spike <- which(spike_vec)
if (length(which_spike) == 0L) return(NULL) # don't attempt to generate spikes if no TRUE values
spike_df <- do.call(cbind, lapply(which_spike, function(xx) {
vec <- rep(0, nrow(mot))
vec[xx] <- 1
return(vec)
}))
colnames(spike_df) <- paste0("spike_", seq_len(ncol(spike_df)))
if (!is.null(spike_df) && !identical(spike_bounds, 0L)) {
# apply shifts
shifts <- spike_bounds[spike_bounds != 0L]
res <- do.call(cbind, lapply(shifts, function(ss) {
shift_mat <- apply(spike_df, 2, function(col) {
if (ss < 0) {
dplyr::lead(col, abs(ss), default = 0)
} else {
dplyr::lag(col, ss, default = 0)
}
})
colnames(shift_mat) <- paste0("spike_", ifelse(ss < 0, "m", "p"), abs(ss), "_", 1:ncol(shift_mat))
return(shift_mat)
}))
spike_df <- cbind(spike_df, res)
}
if (is.null(names(spike_volume)[ii]) || names(spike_volume)[ii] == "") {
colnames(spike_df) <- paste0("expr", ii, "_", colnames(spike_df))
} else {
colnames(spike_df) <- paste0(names(spike_volume)[ii], "_", colnames(spike_df))
}
return(spike_df)
}))
# we don't want to generate any duplicate columns (linear dependency problems)
# due to overlapping expressions or ranges of evaluation
if (!is.null(spikes)) spikes <- spikes[, !duplicated(spikes, MARGIN = 2), drop = FALSE]
return(spikes)
}
#' small helper function for compressing motion parameters using PCA
#'
#' @param motion_df volumes x motion parameters data frame for PCA compression
#' @param num_pcs number of principal components to extract
#' @param zscore whether to standardize motion parameters prior to PCA (this is a good idea)
#' @param verbose whether to print the variance explained by the PCs and the multiple correlations
#'   among motion parameters
#'
#' @return A matrix of PCA-compressed motion regressors
#' @importFrom psych smc
#' @keywords internal
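#' @examples
#' \dontrun{
#' # hypothetical usage: reduce a volumes x parameters data.frame to 3 components
#' pcs <- pca_motion(mot, num_pcs = 3L, zscore = TRUE)
#' attr(pcs, "variance.explained") # cumulative variance explained by the retained PCs
#' }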
pca_motion <- function(motion_df, num_pcs=3L, zscore=TRUE, verbose=FALSE) {
checkmate::assert_data_frame(motion_df)
checkmate::assert_integerish(num_pcs, lower=1, upper=50)
checkmate::assert_logical(zscore)
checkmate::assert_logical(verbose)
#compute the PCA decomposition of motion parameters and their derivatives
pc <- prcomp(motion_df, retx=TRUE, scale.=zscore)
cumvar <- cumsum(pc$sdev^2/sum(pc$sdev^2))
if (isTRUE(verbose)) message("first", num_pcs, "motion principal components account for: ", round(cumvar[num_pcs], 3))
mregressors <- pc$x[, 1:num_pcs] #cf Churchill et al. 2012 PLoS ONE
attr(mregressors, "variance.explained") <- cumvar[num_pcs]
if (isTRUE(verbose)) {
cat("Multiple correlation of motion parameters:\n\n")
print(round(sqrt(psych::smc(motion_df)), 2))
}
return(as.data.frame(mregressors))
}
generateRunMask <- function(mr_files, outdir=getwd(), outfile="runmask") {
if (file.exists(file.path(outdir, paste0(outfile, ".nii.gz")))) { return(invisible(NULL)) }
## generate mask of mr_files where temporal min is > 0 for all runs
for (f in seq_along(mr_files)) {
run_fsl_command(paste0("fslmaths ", mr_files[f], " -Tmin -bin ", outdir, "/tmin", f))
}
## sum mins together over runs and threshold at number of runs
run_fsl_command(paste0("fslmaths ", paste(paste0(outdir, "/tmin", seq_along(mr_files)), collapse=" -add "), " ", outdir, "/tminsum"))
run_fsl_command(paste0("fslmaths ", outdir, "/tminsum -thr ", length(mr_files), " -bin ", outdir, "/", outfile))
run_fsl_command(paste0("imrm ", outdir, "/tmin*")) # cleanup
}
#compute the mean of each cluster in roimask
#within a 4d array: x, y, z, subbrik
#
#roimask is an x,y,z array with integer mask values
#ni4d is a concatenated set of cope values, either across subjects
# or, in the case of beta series, within a subject (over trials)
get_cluster_means <- function(roimask, ni4d) {
# ni4d should be a 4d array in which the fourth dimension is either subject (single beta) or a beta series (single subject)
maskvals <- sort(unique(as.vector(roimask)))
maskvals <- maskvals[!maskvals == 0]
roimats <- sapply(maskvals, function(v) {
mi <- which(roimask==v, arr.ind=TRUE)
nsubbriks <- dim(ni4d)[4]
nvox <- nrow(mi)
mi4d <- cbind(pracma::repmat(mi, nsubbriks, 1), rep(1:nsubbriks, each=nvox))
mat <- matrix(ni4d[mi4d], nrow=nvox, ncol=nsubbriks) #need to manually reshape into matrix from vector
#for each subject, compute huber m-estimator of location/center Winsorizing at 2SD across voxels (similar to voxel mean)
#clusavg <- apply(mat, 2, function(x) { MASS::huber(x, k=2)$mu })
clusavg <- apply(mat, 2, mean)
return(clusavg)
})
return(roimats)
}
# internal function to extract mean beta series
# NOT USED AT THE MOMENT
get_beta_series <- function(inputs, roimask, n_bs=50) {
#inputs <- inputs[1:5] #speed up testing
beta_res <- foreach(i=iter(seq_along(inputs)), .packages=c("reshape2", "oro.nifti", "dplyr", "abind"), .export="get_cluster_means") %dopar% {
#beta_res <- lapply(1:length(inputs), function(i) {
run_dirs <- list.files(path=inputs[i], pattern="FEAT_LVL1_run\\d+\\.feat", recursive=FALSE, full.names=TRUE)
run_betas <- list()
for (r in seq_along(run_dirs)) {
runnum <- as.numeric(sub(".*/FEAT_LVL1_run(\\d+)\\.feat", "\\1", run_dirs[r], perl=TRUE))
copes <- file.path(run_dirs[r], "stats", paste0("cope", 1:n_bs, ".nii.gz"))
if (!all(file.exists(copes))) {
cat("File listing in dir:", run_dirs[r], list.files(run_dirs[r], recursive=FALSE), sep="\n", append=TRUE)
cat("Cannot find all ", length(copes), " beta series copes in: ",
run_dirs[r], "\n",
file = "beta_series_errors.txt", append = TRUE
)
next #don't error for now, just omit
}
#in testing, it is faster to use readNIfTI to read each cope file individually (7.5s) than to use fslmerge + readNIfTI on the 4d (13s)
#concat_file <- tempfile()
#system.time(run_fsl_command(paste("fslmerge -t", concat_file, paste(copes, collapse=" "))))
#system.time(cout <- readNIfTI(concat_file, reorient=FALSE)@.Data)
cout <- do.call(abind, list(along=4, lapply(copes, function(x) { readNIfTI(x, reorient=FALSE)@.Data })))
beta_series_cluster_means <- get_cluster_means(roimask, cout)
beta_melt <- reshape2::melt(beta_series_cluster_means, value.name="bs_value", varnames=c("trial", "cluster_number")) %>%
mutate(feat_input_id=i, run=runnum)
run_betas[[r]] <- beta_melt
}
return(do.call(rbind, run_betas))
}
return(do.call(rbind, beta_res))
}
#' Internal helper function for printing a summary of a contrast matrix
#' @param cmat a contrast matrix where row names are the contrast names and column
#' names are the column names of the corresponding design matrix.
#' @keywords internal
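#' @examples
#' \dontrun{
#' cmat <- rbind(A_gt_B = c(1, -1, 0), B = c(0, 1, 0))
#' colnames(cmat) <- c("evA", "evB", "evC")
#' summarize_contrasts(cmat)
#' }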
summarize_contrasts <- function(cmat) {
sapply(seq_len(nrow(cmat)), function(x) {
con_name <- rownames(cmat)[x]
cvec <- cmat[x, ]
nzcols <- which(cvec != 0)
cols <- colnames(cmat)[nzcols]
cat("Contrast: ", con_name, "\n")
for (ii in seq_along(cols)) {
cat(cols[ii], "=", cvec[nzcols[ii]], "\n")
}
cat("-----\n")
})
}
#' helper function to rename columns of input data.frame to internal nomenclature
#' based on the variable mapping (vm) vector
#'
#' @param df a data.frame containing columns to be renamed to internal standards
#' @param vm a named vector of columns in \code{df} that identify internal constructs
#' such as id, session, and run.
#'
#' @return a character vector of column names for \code{df}, modified to use internal names
#' @keywords internal
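#' @examples
#' \dontrun{
#' # hypothetical mapping of internal construct names to user column names
#' df <- data.frame(subid = "s1", wave = 1, run = 2)
#' names_to_internal(df, vm = c(id = "subid", session = "wave", run_number = "run"))
#' # c("id", "session", "run_number")
#' }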
names_to_internal <- function(df, vm) {
# look for naming collisions
nm <- names(df)
nv <- names(vm)
conflict <- intersect(nm[nm %in% names(vm)], nv[nv != vm])
if (length(conflict) > 0L) {
new_names <- paste0("..", conflict, "..")
cat(
"To avoid naming conflicts, renaming these columns:", paste(conflict, collapse = ", "),
"to:", paste0(new_names, collapse = ", "), "\n"
)
nm[which(nm %in% conflict)] <- new_names
}
# convert variable names to internal constructs via the variable mapping
vm_present <- vm[which(vm %in% nm)]
if (length(vm_present) > 0L) {
vm_pos <- sapply(vm_present, function(x) which(nm == x)[1])
nm[vm_pos] <- names(vm_present)
}
return(nm)
}
#' small helper function to pull absolute paths to a given column in run_data or trial_data
#'
#' @param mr_df a data.frame from a gpa object, following standard variable mapping nomenclature
#' @param col a character string denoting the column in \code{mr_df} to be used for looking up
#' absolute paths
#'
#' @details Note that if a given value of the requested column is an absolute path, it will
#'   not be combined with $mr_dir to generate the combined path. Thus, the \code{col} in
#'   \code{mr_df} can contain a mixture of relative and absolute paths. The relative paths will
#'   be combined with $mr_dir (or the working directory, if $mr_dir is missing) to form absolute paths.
#' @return Absolute paths to all files in the specified \code{col}
#' @importFrom R.utils getAbsolutePath
#' @keywords internal
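#' @examples
#' \dontrun{
#' # hypothetical run_data: relative paths are resolved against mr_dir,
#' # absolute paths are returned as-is
#' mr_df <- data.frame(
#'   mr_dir = c("/data/sub-01", NA),
#'   run_nifti = c("func/run1.nii.gz", "/abs/path/run2.nii.gz")
#' )
#' get_mr_abspath(mr_df, "run_nifti")
#' }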
get_mr_abspath <- function(mr_df, col="run_nifti") {
checkmate::assert_data_frame(mr_df)
checkmate::assert_string(col)
checkmate::assert_subset(col, names(mr_df))
sapply(seq_len(nrow(mr_df)), function(ii) {
if (!"mr_dir" %in% names(mr_df)) {
# this should be rare (or ideally, not happen at all)
# but if there is no mr_dir in the mr_df, we can only use the column itself
wd <- NULL # defaults to getwd()
} else {
this_dir <- mr_df$mr_dir[ii]
if (is.na(this_dir)) {
wd <- getwd() # defaults to getwd()
} else {
wd <- this_dir
}
}
R.utils::getAbsolutePath(mr_df[[col]][ii], workDirectory = wd, expandTilde = TRUE)
})
}
#' small helper function to populate subject exclusions in $run_data and $subject_data
#'
#' @param gpa a \code{glm_pipeline_arguments} object that already has the \code{$exclude_run}
#' column populated in \code{$l1_model_setup}.
#' @return a modified copy of \code{gpa} where $exclude_subject has been added to $run_data
#' and $subject_data. This function also adds $n_good_runs to $subject_data which is useful
#' if we want to enforce a lower bound on the number of runs used to define subject exclusion.
#' @keywords internal
#' @importFrom dplyr filter count left_join
#' @importFrom checkmate assert_class assert_data_table
#' @importFrom lgr get_logger
#' @author Michael Hallquist
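#' @examples
#' \dontrun{
#' # hypothetical exclusion rule evaluated against each row of $subject_data
#' gpa$confound_settings$exclude_subject <- "n_good_runs < 2"
#' gpa <- calculate_subject_exclusions(gpa)
#' }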
calculate_subject_exclusions <- function(gpa) {
lg <- lgr::get_logger("glm_pipeline/setup_glm_pipeline")
lg$set_threshold(gpa$lgr_threshold)
checkmate::assert_class(gpa, "glm_pipeline_arguments")
checkmate::assert_data_frame(gpa$run_data)
checkmate::assert_subset(c("id", "session", "run_number", "exclude_run"), names(gpa$run_data)) # verify that exclude_run is populated
if ("exclude_subject" %in% names(gpa$subject_data) && "exclude_subject" %in% names(gpa$run_data)) {
lg$debug("exclude_subject already calculated and populated in $subject_data and $run_data")
return(gpa)
}
# if no subject exclusion string is provided, then keep all subjects
if (is.null(gpa$confound_settings$exclude_subject)) {
gpa$run_data$exclude_subject <- FALSE
gpa$subject_data$exclude_subject <- FALSE
return(gpa)
}
n_good_runs_df <- gpa$run_data %>%
dplyr::group_by(id, session) %>%
dplyr::summarize(n_good_runs = sum(exclude_run == FALSE)) %>% # sum of non-excluded runs
dplyr::ungroup() %>%
dplyr::select(id, session, n_good_runs)
gpa$subject_data <- gpa$subject_data %>%
dplyr::left_join(n_good_runs_df, by = c("id", "session"))
gpa$run_data <- gpa$run_data %>%
dplyr::left_join(n_good_runs_df, by = c("id", "session"))
# evaluate subject exclusion in the context of the broader $subject_data data.frame
gpa$subject_data$exclude_subject <- sapply(seq_len(nrow(gpa$subject_data)), function(ss) {
s_info <- gpa$subject_data[ss, , drop = FALSE]
tryCatch(with(s_info, eval(parse(text = gpa$confound_settings$exclude_subject))),
error = function(e) {
lg$error(
"Problem evaluating subject exclusion for subject: %s, session: %s, expr: %s",
s_info$id, s_info$session,
gpa$confound_settings$exclude_subject
)
lg$error("Defaulting to retaining this subject.")
return(FALSE)
}
)
})
# propagate subject exclusion down to $run_data
gpa$run_data <- gpa$run_data %>%
dplyr::left_join(
gpa$subject_data %>% dplyr::select(id, session, exclude_subject),
by = c("id", "session")
)
return(gpa)
}
#' Internal function to add last_onset and last_offset columns to gpa$run_data based on events in l1_models
#' @param gpa a \code{glm_pipeline_arguments} object containing valid $run_data and $l1_models objects
#' @param lg a Logger object for logging results
#' @details The last_onset and last_offset columns are calculated for each run based on the timing of all
#' events in the $l1_models$events list. These are then used to facilitate run truncation if the user
#' requests truncation after a final onset or offset.
#' @return a modified copy of gpa with $run_data populated with last_offset and last_onset columns (times in seconds)
#' @keywords internal
#' @importFrom dplyr summarize group_by left_join
populate_last_events <- function(gpa, lg) {
# get all events as a long data.frame
m_events <- data.table::rbindlist(lapply(gpa$l1_models$events, function(this_event) this_event$data))
# default to 0 ISI if column is missing (shouldn't happen under normal circumstances)
if (!"isi" %in% names(m_events)) m_events[, isi := 0]
# convert NA values for ISI to zero with message
na_isi <- is.na(m_events$isi)
if (all(na_isi)) {
lg$debug("In populate_last_events, all ISIs are NA. Converting to 0")
m_events[, isi := 0]
} else if (any(na_isi)) {
lg$debug("Changing NA ISIs to 0 for the following cases: ")
lg$debug("%s", capture.output(print(m_events[na_isi, c("id", "session", "run_number", "event", "trial")], nrows = 1e5)))
m_events$isi[na_isi] <- 0
}
last_events <- m_events %>%
group_by(id, session, run_number) %>%
dplyr::summarize(
last_event_idx = which.max(onset),
last_event = event[last_event_idx],
last_onset = max(onset, na.rm = TRUE),
last_offset = max(onset + duration, na.rm = TRUE),
last_isi = isi[last_event_idx],
.groups="drop")
gpa$run_data <- gpa$run_data %>%
dplyr::left_join(last_events, by=c("id", "session", "run_number"))
return(gpa)
}
# slurm_job_array <- function(job_name = "slurm_array") {
# }
#' helper function to convert the $sched_args field to a vector
#' of directives that can be included dynamically in the header of
#' PBS or SBATCH scripts.
#'
#' @param gpa a \code{glm_pipeline_arguments} object containing the
#' $parallel$sched_args field
#'
#' @return a character vector where the $sched_args elements are converted to
#' corresponding # SBATCH or # PBS directives for inclusion in scripts
#' @keywords internal
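#' @examples
#' \dontrun{
#' # hypothetical scheduler arguments with gpa$scheduler == "slurm"
#' gpa$parallel$sched_args <- c("--mem=8g", "--time=2:00:00")
#' sched_args_to_header(gpa) # c("#SBATCH --mem=8g", "#SBATCH --time=2:00:00")
#' }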
sched_args_to_header <- function(gpa) {
checkmate::assert_class(gpa, "glm_pipeline_arguments")
#if there are no scheduler arguments, return nothing
if (is.null(gpa$parallel$sched_args)) return(NULL)
is_slurm <- gpa$scheduler == "slurm"
directives <- sapply(gpa$parallel$sched_args, function(x) {
if (isTRUE(is_slurm)) paste("#SBATCH", x) else paste("#PBS", x)
}, USE.NAMES = FALSE)
return(directives)
}
#' helper function to generate a contrast matrix from an lm() object
#' and a set of user-specified contrasts using emmeans
#'
#' @param mobj a model object created by build_l<X>_models
#' @param lmfit an optional lm() object used for emmeans calculations. If provided, this object
#' will be used instead of mobj$lmfit (the parent lm on the overall dataset).
#'
#' @return a modified copy of the model object \code{mobj} with $contrast_list and $contrasts
#' fully populated
#' @keywords internal
#' @importFrom emmeans emmeans emtrends
get_contrasts_from_spec <- function(mobj, lmfit=NULL) {
checkmate::assert_multi_class(mobj, c("l1_model_spec", "l1_wi_spec", "hi_model_spec")) # verify that we have an object of known structure
if (is.null(lmfit)) {
lmfit <- mobj$lmfit # use parent lmfit
} else {
# override existing contrasts to force lmfit-specific respecification
mobj$contrast_list <- list()
mobj$contrasts <- NULL
}
spec <- mobj$contrast_spec
contrast_list <- mobj$contrast_list
checkmate::assert_class(lmfit, "lm", null.ok=TRUE)
checkmate::assert_list(spec)
checkmate::assert_list(contrast_list)
# prefix for contrast names
prefix <- ifelse(inherits(mobj, "l1_wi_spec"), paste0(mobj$signal_name, "."), "")
if (is.null(lmfit)) {
c_colnames <- spec$regressors
} else {
c_colnames <- names(coef(lmfit)) # prefer model-specific regressors (if they vary by subject)
}
# for within-subject factors, always add the signal name as a prefix (cf. get_regressors_from_signal)
c_colnames <- paste0(prefix, c_colnames)
### add diagonal contrasts
c_diagonal <- contrast_list$diagonal
if (isTRUE(spec$diagonal) && is.null(c_diagonal)) {
diag_mat <- diag(length(c_colnames))
rownames(diag_mat) <- paste0("EV_", prefix, c_colnames) # simple contrast naming for each individual regressor
colnames(diag_mat) <- c_colnames # always have columns named by regressor
c_diagonal <- diag_mat
}
### add condition means, if requested
c_cond_means <- contrast_list$cond_means
if (length(spec$cond_means) > 0L && is.null(c_cond_means)) {
for (vv in spec$cond_means) {
ee <- emmeans(lmfit, as.formula(paste("~", vv)), weights = spec$weights)
edata <- summary(ee)
econ <- ee@linfct
econ[abs(econ) < 1e-8] <- 0 # round tiny contrasts due to floating point
enames <- unname(apply(edata[strsplit(vv, ":")[[1]]], 1, paste, collapse=".")) # combine multi-factor contrasts, if relevant
# if any emmeans are not estimable, then this is likely due to aliasing. For now, drop
which_na <- is.na(edata$emmean)
if (any(which_na)) {
econ <- econ[!which_na, , drop = FALSE]
enames <- enames[!which_na]
}
# add contrast names to matrix
rownames(econ) <- paste0(prefix, paste(vv, enames, sep = "."))
# add contrasts to matrix
c_cond_means <- rbind(c_cond_means, econ)
}
colnames(c_cond_means) <- c_colnames
}
### add cell means for all factors, if requested
c_cell_means <- contrast_list$cell_means
if (isTRUE(spec$cell_means) && is.null(c_cell_means)) {
# get model-predicted means for each factor
ee <- emmeans(lmfit, as.formula(paste("~", paste(spec$cat_vars, collapse = "*"))), weights = spec$weights)
# pp <- pairs(ee)
edata <- summary(ee)
econ <- ee@linfct
econ[abs(econ) < 1e-8] <- 0 # round tiny contrasts due to floating point
enames <- apply(edata[, spec$cat_vars, drop = FALSE], 1, function(x) { paste(names(x), x, sep=".", collapse = "_") })
# if any emmeans are not estimable, then this is likely due to aliasing. For now, drop
which_na <- is.na(edata$emmean)
if (any(which_na)) {
econ <- econ[!which_na, , drop = FALSE]
enames <- enames[!which_na]
}
rownames(econ) <- paste0(prefix, enames)
c_cell_means <- rbind(c_cell_means, econ)
colnames(c_cell_means) <- c_colnames
}
### add overall response mean
c_overall <- contrast_list$overall
if (isTRUE(spec$overall_response) && is.null(c_overall)) {
ee <- emmeans(lmfit, ~1, weights = spec$weights)
econ <- ee@linfct
econ[abs(econ) < 1e-8] <- 0 # round tiny contrasts due to floating point
rownames(econ) <- paste0(prefix, "overall")
colnames(econ) <- c_colnames
c_overall <- econ
}
### add pairwise differences, if requested
c_pairwise_diffs <- contrast_list$pairwise_diffs
if (length(spec$pairwise_diffs) > 0L && is.null(c_pairwise_diffs)) {
for (vv in spec$pairwise_diffs) {
ee <- emmeans(lmfit, as.formula(paste("~", vv)), weights = spec$weights)
pp <- pairs(ee)
edata <- summary(pp)
econ <- pp@linfct
econ[abs(econ) < 1e-8] <- 0 # round tiny contrasts due to floating point
enames <- paste(vv, make.names(sub("\\s+-\\s+", "_M_", edata$contrast, perl = TRUE)), sep=".") # names of contrasts
# if any emmeans are not estimable, then this is likely due to aliasing. For now, drop
which_na <- is.na(edata$estimate)
if (any(which_na)) {
econ <- econ[!which_na, , drop = FALSE]
enames <- enames[!which_na]
}
# add contrast names to matrix
rownames(econ) <- paste0(prefix, enames)
# add pairwise differences for this factor to the pairwise matrix
c_pairwise_diffs <- rbind(c_pairwise_diffs, econ)
}
colnames(c_pairwise_diffs) <- c_colnames
}
### add per-cell model-predicted simple slopes
c_simple_slopes <- contrast_list$simple_slopes
if (length(spec$simple_slopes) > 0L && is.null(c_simple_slopes)) {
for (vv in seq_along(spec$simple_slopes)) {
trend_var <- names(spec$simple_slopes)[vv]
for (comb in spec$simple_slopes[[vv]]) {
ee <- emtrends(lmfit,
specs = as.formula(paste("~", paste(comb, collapse = "*"))),
var = trend_var, weights = spec$weights
)
edata <- summary(ee)
econ <- ee@linfct
econ[abs(econ) < 1e-8] <- 0 # round tiny contrasts due to floating point
enames <- paste(trend_var, apply(edata[, comb, drop = FALSE], 1, paste, collapse = "."), sep = ".")
# if any emmeans are not estimable, then this is likely due to aliasing. For now, drop
which_na <- is.na(edata[[paste(trend_var, "trend", sep=".")]])
if (any(which_na)) {
econ <- econ[!which_na, , drop = FALSE]
enames <- enames[!which_na]
}
rownames(econ) <- paste0(prefix, enames)
c_simple_slopes <- rbind(c_simple_slopes, econ)
}
}
colnames(c_simple_slopes) <- c_colnames
}
c_custom <- contrast_list$custom
# walk through all within-subject factor contrasts and return as a single matrix
c_within <- get_wi_contrast_matrix(mobj, c_colnames)
#combine each element
cmat_full <- rbind(
c_diagonal,
c_cond_means,
c_cell_means,
c_overall,
c_pairwise_diffs,
c_simple_slopes,
c_within,
c_custom
)
if (is.null(cmat_full)) {
# no contrasts specified! Not good, but for now, just create an empty matrix and let upstream handle bothering the user
cmat_full <- matrix(numeric(0), nrow=0, ncol=length(c_colnames), dimnames=list(NULL, c_colnames))
}
# handle contrasts from the full matrix that should be deleted
# these are processed dynamically here so that the same set can be dropped if a contrast is respecified for a data subset or subject
if (!is.null(spec$delete)) {
which_del <- match(spec$delete, rownames(cmat_full))
if (!any(is.na(which_del))) {
cmat <- cmat_full[-1*which_del, , drop=FALSE]
} else {
warning("Some contrasts to be dropped were not found in the matrix: ", paste(spec$delete[is.na(which_del)], collapse = ", "))
which_del <- na.omit(which_del)
cmat <- if (length(which_del) > 0L) cmat_full[-1 * na.omit(which_del), , drop = FALSE] else cmat_full
}
} else {
cmat <- cmat_full
}
# check for contrasts that are duplicated and ask user what to do about it. This should only happen once because we then update the $delete field
dupe_list <- get_dupe_rows(cmat)
drop_rows <- c()
if (length(dupe_list) > 0L) {
message("Duplicate contrasts found in your matrix. This can occur for many benign reasons, but we need you to say what you want
to call these contrasts in the output so that it is clear to you.\n")
for (dd in dupe_list) {
message("These contrasts are the same: ", paste(rownames(cmat)[dd], collapse=", "))
whichkeep <- menu(rownames(cmat)[dd], title = "Which name should we use?")
drop_rows <- c(drop_rows, dd[-whichkeep])
}
# add these back into model specification so that the same decisions are made if the spec is used to repopulate a new matrix
mobj$contrast_spec$delete <- unique(c(mobj$contrast_spec$delete, rownames(cmat)[drop_rows]))
}
# duplicated does not give user any control over which one to retain (in terms of contrast name) -- now superseded by duplicate resolution above
# dupes <- duplicated(cmat, MARGIN = 1)
dupes <- rep(FALSE, nrow(cmat))
dupes[drop_rows] <- TRUE
cmat <- cmat[!dupes, , drop=FALSE] # drop duplicated contrasts
# parentheses in contrast names generate problems in running through FEAT (chokes on submission)
# so far, this comes up only with intercept terms -- may need to do a more general substitution later
if (!is.null(cmat)) { rownames(cmat) <- gsub("(Intercept)", "Intercept", rownames(cmat), fixed = TRUE) }
# populate any updates to the contrast_list object based on new calculations
contrast_list$diagonal <- c_diagonal
contrast_list$cond_means <- c_cond_means
contrast_list$cell_means <- c_cell_means
contrast_list$overall <- c_overall
contrast_list$pairwise_diffs <- c_pairwise_diffs
contrast_list$simple_slopes <- c_simple_slopes
contrast_list$custom <- c_custom
# populate contrasts back into model object (spec should not be updated)
mobj$contrast_list <- contrast_list
mobj$contrasts <- cmat
return(mobj)
}
# helper function to identify duplicate contrasts
get_dupe_rows <- function(mat, enforce_rownames=FALSE) {
if (!inherits(mat, "matrix")) return(list()) # return empty list if we are not passed a contrast matrix
# rows to check -- start with all, then winnow
rtc <- seq_len(nrow(mat))
# if enforce_rownames is FALSE, drop rownames from local copy so that identical does not fail
if (isFALSE(enforce_rownames)) rownames(mat) <- NULL
dupe_list <- list()
while (length(rtc) > 1L) {
last_rtc <- length(rtc)
to_compare <- mat[rtc[last_rtc],,drop=FALSE] # always compare preceding rows to last remaining row
rdupes <- sapply(1:(length(rtc) - 1), function(i) identical(mat[rtc[i],,drop=FALSE], to_compare))
to_drop <- last_rtc # always drop last eligible row after checking for dupes of it
if (any(rdupes)) {
# rdupes will always be 1 less than length of rtc. Add FALSE as last element so that rtc subset never gets scalar rtc[TRUE], which would grab all elements
dupe_list <- c(dupe_list, list(c(rtc[c(rdupes, FALSE)], rtc[last_rtc])))
to_drop <- c(to_drop, which(rdupes))
}
rtc <- rtc[-to_drop] # drop rows that have been processed
}
return(dupe_list)
}
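# Example (illustrative): rows 1 and 3 below are identical, so the returned
# list contains a single element c(1, 3)
#   m <- rbind(c(1, -1, 0), c(0, 1, 0), c(1, -1, 0))
#   get_dupe_rows(m)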
get_wi_contrast_matrix <- function(mobj, c_colnames) {
# this function is only intended to crawl over within-subject contrasts of an l1 model
if (!checkmate::test_class(mobj, "l1_model_spec") || is.null(mobj$wi_models)) {
return(invisible(NULL)) # quietly return NULL if we don't have an l1 model
}
# get contrast matrix for each wi_model
clist <- lapply(mobj$wi_models, "[[", "contrasts")
cnames <- do.call(c, lapply(clist, rownames))
cmat <- matrix(0, nrow = length(cnames), ncol = length(c_colnames), dimnames = list(cnames, c_colnames))
# fill in relevant rows and columns in combined matrix for each wi model contrast matrix
for (cc in clist) {
cmat[rownames(cc), colnames(cc)] <- cc
}
return(cmat)
}
# get_run_lmfits <- function() {
# }
#' Helper function to run the requested GLM model for each subject+session separately
#'
#' @param mobj an \code{l1_model_spec} or \code{hi_model_spec} object containing the GLM model to run
#' @param data The run-level data frame containing data for all ids and sessions. This will be split into individual chunks
#'
#' @return a modified copy of \code{mobj} where the $by_subject field has been added
#'
#' @details The function adds the $by_subject field, which contains the design matrices and contrasts
#' for each subject and session in \code{data} based on the available data for that session. For example, if
#' a subject is missing a few runs (or these are dropped from analysis), then some contrasts may change or drop out of the model.
#'
#' The $by_subject field is a keyed data.table object containing list elements for the cope_list (mapping cope numbers to contrast names),
#' the contrasts, and the design matrix for each session.
#'
#' @keywords internal
#' @importFrom checkmate assert_data_frame assert_multi_class assert_subset
#' @importFrom data.table data.table
respecify_l2_models_by_subject <- function(mobj, data) {
checkmate::assert_multi_class(mobj, c("l1_model_spec", "hi_model_spec")) # verify that we have an object of known structure
checkmate::assert_data_frame(data)
checkmate::assert_subset(c("id", "session"), names(data))
# create a nested data.table object for fitting each id + session separately
data <- data.table(data, key = c("id", "session"))
data$dummy <- seq_len(nrow(data))
dsplit <- data[, .(dt = list(.SD)), by = c("id", "session")]
# use model formula from parent object
model_formula <- terms(mobj$lmfit)
cope_list <- list()
contrast_list <- list()
model_matrix_list <- list()
n_l2_copes <- rep(NA_integer_, nrow(dsplit))
for (vv in seq_len(nrow(dsplit))) {
lmfit <- lm(model_formula, data = dsplit[[vv, "dt"]])
mm <- get_contrasts_from_spec(mobj, lmfit)
cope_df <- data.frame(
id = dsplit$id[vv], session = dsplit$session[vv],
l2_cope_number = seq_len(nrow(mm$contrasts)), l2_cope_name = rownames(mm$contrasts)
)
cope_list[[vv]] <- cope_df
contrast_list[[vv]] <- mm$contrasts
model_matrix_list[[vv]] <- model.matrix(lmfit)
n_l2_copes[vv] <- ncol(mm$contrasts)
}
dsplit[, cope_list := cope_list]
dsplit[, contrasts := contrast_list]
dsplit[, model_matrix := model_matrix_list]
dsplit[, n_l2_copes := n_l2_copes]
dsplit[, dt := NULL] # no longer need original split data
mobj$by_subject <- dsplit
return(mobj)
}
#' Helper function to re-calculate the l3 design matrix for a model based on available data
#'
#' @param mobj a \code{hi_model_spec} object containing the L3 GLM model to run
#' @param new_data a data.frame containing the updated data against which the model should be refit
#'
#' @return a modified copy of \code{mobj} refit against \code{new_data}
#'
#' @details This function is currently an unimplemented stub; see \code{respecify_l3_model} for the
#'   working approach to refitting an l3 model against new data.
#'
#' @keywords internal
mobj_refit_lm <- function(mobj, new_data) {
}
#' internal helper function to setup a linear model for a given l1, l2, or l3 model
#'
#' @param mobj a model object to be populated or modified
#' @param model_formula a character string specifying the formula of the model to be fit
#' @param data a data.frame containing all columns used in model fitting
#' @param id_cols a character vector of column names in \code{data} that identify the observations
#' and can be used for merging the model against related datasets
#' @param lg a Logger object used for logging messages
#' @return a model object containing the fitted model
#' @keywords internal
mobj_fit_lm <- function(mobj=NULL, model_formula=NULL, data, id_cols=NULL, lg=NULL) {
if (is.null(lg)) { lg <- lgr::get_logger() }
# verify that we have an object of known structure
checkmate::assert_multi_class(mobj, c("l1_model_spec", "hi_model_spec"), null.ok=TRUE)
if (is.null(mobj)) {
mobj <- list()
class(mobj) <- c("list", "hi_model_spec")
} else {
# if mobj is passed in, but other elements are NULL, we are refitting a model to a new/updated dataset
if (is.null(model_formula)) {
model_formula <- mobj$model_formula
}
if (is.null(id_cols)) {
id_cols <- names(mobj$metadata)
}
}
checkmate::assert_string(model_formula)
checkmate::assert_data_frame(data)
checkmate::assert_subset(id_cols, names(data))
# verify that there is a dummy DV for lm fitting
if (!"dummy" %in% names(data)) {
data$dummy <- seq_len(nrow(data))
}
model_formula <- update.formula(as.formula(model_formula), "dummy ~ .") # add LHS
# use model formula from parent object
# model_formula <- terms(mobj$lmfit)
model_vars <- all.vars(model_formula)
checkmate::assert_subset(model_vars, names(data)) # verify that all predictors are present
# look for missingness on any predictor variable
# https://stackoverflow.com/questions/64287986/create-variable-that-captures-if-there-are-missing-fields-in-4-string-variables
data <- data %>%
dplyr::select(!!model_vars, !!id_cols) %>% # just keep model-relevant variables
dplyr::mutate(any_miss = rowSums(is.na(dplyr::select(., any_of(model_vars)))) > 0)
#dplyr::mutate(across(all_of(model_vars), ~ +(is.na(.))))
miss_data <- data %>%
dplyr::filter(any_miss == TRUE) %>%
dplyr::select(-any_miss)
# retain non-missing data
data <- data %>%
dplyr::filter(any_miss == FALSE) %>%
dplyr::select(-any_miss)
if (nrow(miss_data) > 0L) {
lg$warn("Model data contain missing values for one or more covariates.")
lg$warn("These observations will be dropped from model outputs!")
lg$warn("%s", capture.output(print(miss_data %>% dplyr::select(-dummy))))
}
# Add metadata, for merging model matrix against identifying columns
# This occurs before mean centering in case one of the id_cols is also in the model (e.g., run_number)
mobj$metadata <- data %>% dplyr::select(!!id_cols)
# apply just-in-time mean centering for requested variables
if (!is.null(mobj$covariate_transform)) {
for (cc in seq_along(mobj$covariate_transform)) {
cname <- names(mobj$covariate_transform)[cc]
ctype <- mobj$covariate_transform[cc]
if (ctype == "mean") {
data[[cname]] <- data[[cname]] - mean(data[[cname]], na.rm=TRUE)
} else if (ctype == "zscore") {
data[[cname]] <- as.vector(scale(data[[cname]])) # handles NAs automatically
} else if (ctype == "min") {
data[[cname]] <- data[[cname]] - min(data[[cname]], na.rm=TRUE)
} else if (ctype == "max") {
data[[cname]] <- data[[cname]] - max(data[[cname]], na.rm=TRUE)
}
}
}
# apply just-in-time factor reference releveling
if (!is.null(mobj$reference_level)) {
for (cc in seq_along(mobj$reference_level)) {
cname <- names(mobj$reference_level)[cc]
if (!is.factor(data[[cname]])) data[[cname]] <- factor(data[[cname]])
ref <- mobj$reference_level[cc]
if (!ref %in% levels(data[[cname]])) {
default_lev <- names(which.max(table(data[[cname]])))
lg$warn("The requested reference level: %s is not available. Defaulting to %s", ref, default_lev)
ref <- default_lev
}
data[[cname]] <- relevel(data[[cname]], ref=ref)
}
}
# keep track of data used for fitting model for refitting in case of missing data
mobj$model_data <- data %>% dplyr::select(!!model_vars)
# fit model and populate model information
mobj$lmfit <- lm(model_formula, data)
mobj$model_formula <- as.character(update.formula(as.formula(model_formula), "NULL ~ .")) # remove LHS and convert to string
mobj$model_matrix <- model.matrix(mobj$lmfit)
mobj$regressors <- colnames(mobj$model_matrix) # actual regressors after expanding categorical variables
# handle coefficient aliasing
al <- alias(mobj$lmfit)
if (!is.null(al$Complete)) {
cat("Problems with aliased (redundant) terms in model.\n\n")
bad_terms <- rownames(al$Complete)
cat(paste(bad_terms, collapse = ", "), "\n\n")
# find unaliased (good) terms in the model
good_terms <- colnames(mobj$model_matrix)[!(colnames(mobj$model_matrix) %in% bad_terms)]
good_terms <- good_terms[!good_terms == "(Intercept)"]
if (length(good_terms) == 0L) {
warning(
"No unaliased (good) terms in this model, suggesting that all covariates are constant or dependent. ",
"Reverting to intercept-only model."
)
good_terms <- "1"
}
# build new model formula with only good terms
newf <- as.formula(paste("dummy ~", paste(good_terms, collapse = " + ")))
# also generate a model data.frame that expands dummy codes for terms, retaining only good variables
mobj$model_matrix_noalias <- mobj$model_matrix[, grep(":", good_terms, fixed = TRUE, value = TRUE, invert = TRUE), drop = FALSE]
modeldf <- as.data.frame(mobj$model_matrix_noalias)
modeldf$dummy <- data$dummy # copy across dummy DV for fitting
mobj$lmfit_noalias <- lm(newf, modeldf)
mobj$aliased_terms <- bad_terms
# N.B. emmeans needs to calculate contrasts on the original design to see the factor structure
# So, we also need to drop out columns from the emmeans linfct
}
return(mobj)
}
#OLD version
# respecify_l3_model <- function(mobj, data) {
# checkmate::assert_multi_class(mobj, c("l1_model_spec", "hi_model_spec")) # verify that we have an object of known structure
# checkmate::assert_data_frame(data)
# checkmate::assert_subset(c("id", "session"), names(data))
# # use model formula from parent object
# model_formula <- terms(mobj$lmfit)
# mobj$lmfit <- lm(model_formula, data = data)
# mobj$regressors <- colnames(mobj$model_matrix)
# mobj$model_matrix <- model.matrix(mobj$lmfit)
# mobj$metadata <- data %>% dplyr::select(id, session)
# mobj$model_data <- data
# mobj <- get_contrasts_from_spec(mobj, mobj$lmfit)
# return(mobj)
# }
# new version, but depends on mobj being specified under new approach
respecify_l3_model <- function(mobj, new_data) {
checkmate::assert_class(mobj, "hi_model_spec") # verify that we have an object of known structure
checkmate::assert_data_frame(new_data)
checkmate::assert_subset(c("id", "session"), names(new_data))
# use id and session in new_data to subset the full data from initial model specification
full_data <- cbind(mobj$metadata, mobj$model_data)
new_data <- new_data %>% dplyr::select(id, session) #just metadata of interest
new_data <- new_data %>%
dplyr::left_join(full_data, by=c("id", "session"))
mobj <- mobj_fit_lm(mobj=mobj, data=new_data)
mobj <- get_contrasts_from_spec(mobj, mobj$lmfit)
return(mobj)
}
#' Helper function to obtain the number of volumes in a 4D nifti file
#'
#' @details This function prefers to use fslval instead of an internal R library
#' because both oro.nifti and RNifti are rather slow to obtain a single value from
#' the NIfTI header
#'
#' @param nifti a 4D nifti file
#' @return the number of volumes in \code{nifti}
#' @keywords internal
lookup_run_volumes <- function(nifti) {
if (!file.exists(nifti)) return(NA_integer_)
# fslval is much faster than any internal R command. Use it, if possible
# TODO: Make this more robust, more like run_fsl_command with path expectations
has_fslval <- system2("which", "fslval", stdout = FALSE, stderr = FALSE)
has_fslval <- ifelse(has_fslval == 0L, TRUE, FALSE)
if (isTRUE(has_fslval)) {
nvol <- as.integer(system2("fslval", args = c(nifti, "dim4"), stdout = TRUE))
} else {
nvol <- oro.nifti::readNIfTI(nifti, read_data = FALSE)@dim_[5L]
}
# system.time(run_volumes <- oro.nifti::readNIfTI(run_nifti[nn], read_data = FALSE)@dim_[5L])
# system.time(run_volumes <- RNifti::readNifti(run_nifti[nn], internal = TRUE))
return(nvol)
}
#' Helper function to obtain the number of voxels in a 4D file for populating FEAT FSF
#'
#' @details This function prefers to use fslval instead of an internal R library
#' because both oro.nifti and RNifti are rather slow to obtain a single value from
#' the NIfTI header
#'
#' @param nifti a 4D nifti file
#' @return the number of voxels in \code{nifti} as calculated by x * y * z * t
#' @keywords internal
lookup_nvoxels <- function(nifti) {
if (!file.exists(nifti)) return(NA_integer_)
# fslval is much faster than any internal R command. Use it, if possible
# TODO: Make this more robust, more like run_fsl_command with path expectations
# TODO: should probably have a single global lookup for fslval in gpa (finalize step)
has_fslval <- system2("which", "fslval", stdout = FALSE, stderr = FALSE)
has_fslval <- ifelse(has_fslval == 0L, TRUE, FALSE)
if (isTRUE(has_fslval)) {
ndim <- as.integer(system2("fslval", args = c(nifti, "dim0"), stdout = TRUE))
nvoxels <- prod(sapply(1:ndim, function(xx) {
as.integer(system2("fslval", args = c(nifti, paste0("dim", xx)), stdout = TRUE))
}))
} else {
nihead <- oro.nifti::readNIfTI(nifti, read_data = FALSE)
ndim <- nihead@dim_[1L]
nvoxels <- prod(nihead@dim_[2:(ndim + 1)])
}
return(nvoxels)
}
#' Helper function to obtain the dimensions of a 4D file
#'
#' @details This function prefers to use fslval instead of an internal R library
#' because both oro.nifti and RNifti are rather slow to obtain a single value from
#' the NIfTI header
#'
#' @param nifti a 4D nifti file
#' @return a named vector of dimensions for the input (dim_x, dim_y, dim_z, dim_t)
#' @keywords internal
lookup_dim <- function(nifti) {
ret_vec <- c(dim_x = NA_integer_, dim_y = NA_integer_, dim_z = NA_integer_, dim_t = NA_integer_)
if (!file.exists(nifti)) return(ret_vec)
# fslval is much faster than any internal R command. Use it, if possible
# TODO: Make this more robust, more like run_fsl_command with path expectations
# TODO: should probably have a single global lookup for fslval in gpa (finalize step)
has_fslval <- system2("which", "fslval", stdout = FALSE, stderr = FALSE)
has_fslval <- ifelse(has_fslval == 0L, TRUE, FALSE)
if (isTRUE(has_fslval)) {
ndim <- min(4, as.integer(system2("fslval", args = c(nifti, "dim0"), stdout = TRUE)))
dim_vec <- sapply(1:ndim, function(xx) {
as.integer(system2("fslval", args = c(nifti, paste0("dim", xx)), stdout = TRUE))
})
} else {
nihead <- oro.nifti::readNIfTI(nifti, read_data = FALSE)
ndim <- min(4, nihead@dim_[1L])
dim_vec <- nihead@dim_[2:(ndim + 1)]
}
ret_vec[seq_along(dim_vec)] <- dim_vec # populate available dims
return(ret_vec)
}
#' small helper function to parse duration syntax of days-hours:minutes:seconds
#' into lubridate duration object
#'
#' @param str string containing a duration that may include a days specification
#' @importFrom lubridate hms
#' @importFrom checkmate assert_string
#' @keywords internal
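#' @examples
#' \dontrun{
#' dhms("1-02:30:00") # 1 day, 2 hours, 30 minutes
#' dhms("45") # bare numbers are interpreted as minutes (slurm convention)
#' }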
dhms <- function(str) {
checkmate::assert_string(str)
str <- validate_dhms(str) # force checks
if (grepl("^\\d+-", str, perl = TRUE)) {
split_hyphen <- strsplit(str, "-", fixed = TRUE)[[1]]
days <- as.numeric(split_hyphen[1])
period <- lubridate::hms(split_hyphen[2:length(split_hyphen)])
period@day <- days
} else {
period <- lubridate::hms(str)
}
return(period)
}
#' helper function to convert a period object back into a d-h:m:s string
#'
#' @keywords internal
#' @importFrom checkmate assert_class
#' @importFrom lubridate seconds_to_period period_to_seconds hour minute second
as_dhms <- function(per) {
checkmate::assert_class(per, "Period")
# if any element of the period exceeds intelligent tolerances (e.g., minutes > 60),
# re-create the period so that it enforces reasonable tolerances.
per <- seconds_to_period(period_to_seconds(per))
dhms_str <- paste0(
sprintf("%02d", hour(per)), ":",
sprintf("%02d", minute(per)), ":",
sprintf("%02d", second(per))
)
if (per@day > 0) {
dhms_str <- paste0(per@day, "-", dhms_str)
}
return(dhms_str)
}
#' helper function to validate format of walltime inputs for HPC submission
#'
#' @param str string containing a duration that may include a days specification
#' @importFrom checkmate assert_string
#' @details this always converts to an hms format, and if days are present, it
#' converts to dhms. Supported date formats match slurm sbatch:
#' https://slurm.schedmd.com/sbatch.html
#' @keywords internal
validate_dhms <- function(str) {
checkmate::assert_string(str)
if (grepl("^\\d+:\\d+?$", str, perl=T)) { # m:s input
return(paste0("00:", str)) # add 0 hours prefix
} else if (grepl("^(\\d+-)?\\d+:\\d+:\\d+?$", str, perl=T)) { # h:m:s or d-h:m:s input
return(str)
} else if (grepl("^(\\d+-)?\\d+:\\d+?$", str, perl=T)) { # d-h:m input
return(paste0(str, ":00")) # add 0 seconds
} else if (grepl("^\\d+-\\d+$", str, perl=T)) { # days-hours input
return(paste0(str, ":00:00")) # add 0 minutes, 0 seconds
} else if (grepl("^\\d+$", str, perl=T)) {
return(paste0("00:", str, ":00")) # minutes only -> hours:minutes:seconds
} else {
stop("Invalid duration string: ", str)
}
}
#' convert a number of hours to a days, hours, minutes, seconds format
#'
#' @param hours a non-negative number of hours to convert
#' @param frac if TRUE, print seconds with millisecond precision. Default: FALSE
#' @return a duration string in [d-]hh:mm:ss format
#' @importFrom lubridate day hour minute second seconds_to_period dhours
#' @keywords internal
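#'
#' @examples
#' # minimal sketch
#' \dontrun{
#' hours_to_dhms(1.5) # "01:30:00"
#' hours_to_dhms(50) # "2-02:00:00"
#' }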
hours_to_dhms <- function(hours, frac=FALSE) {
checkmate::assert_number(hours, lower = 0)
dur <- lubridate::dhours(hours)
period <- seconds_to_period(dur)
if (isTRUE(frac)) {
str <- sprintf("%02d:%02d:%.03f", hour(period), minute(period), second(period))
} else {
str <- sprintf("%02d:%02d:%02d", hour(period), minute(period), round(second(period)))
}
if (day(period) > 0) {
str <- paste0(sprintf("%d-", day(period)), str)
}
return(str)
}
#' helper function to refresh model status at all levels and save the gpa object from the batch pipeline back to its cache
#'
#' @param gpa a glm_pipeline_arguments object
#' @return a refreshed version of the gpa object
#' @importFrom lgr get_logger
#' @export
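#'
#' @examples
#' # illustrative sketch, assuming gpa is an existing glm_pipeline_arguments object
#' \dontrun{
#' gpa <- cleanup_glm_pipeline(gpa)
#' }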
cleanup_glm_pipeline <- function(gpa) {
lg <- lgr::get_logger("glm_pipeline/cleanup_glm_pipeline")
lg$set_threshold(gpa$lgr_threshold)
gpa <- refresh_feat_status(gpa, level = 1L, lg = lg)
gpa <- refresh_feat_status(gpa, level = 2L, lg = lg)
gpa <- refresh_feat_status(gpa, level = 3L, lg = lg)
tryCatch(saveRDS(gpa, file = gpa$output_locations$object_cache), error = function(e) {
lg$error("Could not save gpa object to file: %s", gpa$output_locations$object_cache)
return(NULL)
})
# replace copied L1 niftis with symlinks to the corresponding file
if (isTRUE(gpa$glm_settings$fsl$replace_l1_nifti_symlink)) {
feat_files <- file.path(
gpa$l1_model_setup$fsl$feat_dir,
"filtered_func_data.nii.gz"
)
feat_exists <- file.exists(feat_files)
input_files <- gpa$l1_model_setup$fsl$run_nifti[feat_exists]
feat_files <- feat_files[feat_exists] # subset after computing mask to keep inputs and feat files aligned
for (ff in seq_along(feat_files)) {
unlink(feat_files[ff])
file.symlink(input_files[ff], feat_files[ff])
}
}
return(gpa)
}
#' helper function to copy any missing fields in target
#' @param target a named list to be populated by defaults if fields are missing
#' @param defaults a named list containing default values
#' @return the target object with any missing fields populated from defaults
#' @keywords internal
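#' @examples
#' # minimal sketch
#' \dontrun{
#' populate_defaults(list(a = 1), defaults = list(a = 0, b = 2)) # list(a = 1, b = 2)
#' }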
populate_defaults <- function(target = NULL, defaults) {
if (is.null(target)) target <- list()
if (is.data.frame(target) && nrow(target) == 1L && is.data.frame(defaults) && nrow(defaults) == 1L) {
return_df <- TRUE
target <- as.list(target)
defaults <- as.list(defaults)
} else {
return_df <- FALSE
}
miss_fields <- setdiff(names(defaults), names(target))
if (length(miss_fields) > 0L) {
for (mm in miss_fields) {
target[[mm]] <- defaults[[mm]]
}
}
if (isTRUE(return_df)) target <- as.data.frame(target)
return(target)
}
# little helper function to create named list from objects
named_list <- function(...) {
vnames <- as.character(match.call())[-1]
return(setNames(list(...), vnames))
}
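# example (illustrative): x <- 1; y <- "a"; named_list(x, y) returns list(x = 1, y = "a")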
# little helper function to create a named vector from objects
named_vector <- function(...) {
vnames <- as.character(match.call())[-1]
vec <- list(...)
nulls <- sapply(vec, is.null) # NULL will drop from vector
vec[nulls] <- NULL
vnames <- vnames[!nulls]
# for now, all remaining inputs must be length-1 atomic values (NULL elements are dropped above)
sapply(vec, checkmate::assert_atomic, len = 1L)
classes <- sapply(vec, class)
if (length(unique(classes)) > 1L) {
stop("All inputs must be of the same data type.")
}
return(setNames(do.call(c, vec), vnames))
}
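# example (illustrative): x <- 1; y <- 2; named_vector(x, y) returns c(x = 1, y = 2)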
# Helper function to create a named data.frame from a set of objects.
# This helps with the problem of having several vectors in the workspace
# that you want to combine with data.frame. Kept as a commented-out sketch
# (plain comments, so the block does not attach to the function below):
#
# named_data_frame <- function(...) {
#   vnames <- as.character(match.call())[-1]
#   return(setNames(data.frame(...), vnames))
# }
#' helper function to verify that GLM model setup at a given level is complete
#'
#' @param gpa a \code{glm_pipeline_arguments} object
#' @param level the level of GLM analysis to check (1, 2, or 3)
#' @param lg an optional Logger object for messages. If NULL, the root logger is used
#' @return NULL (invisibly); stops with an error if setup is missing or all FEAT runs are incomplete
#' @keywords internal
enforce_glms_complete <- function(gpa, level=1L, lg=NULL) {
checkmate::assert_class(gpa, "glm_pipeline_arguments")
checkmate::assert_integerish(level, lower=1L, upper=3L)
checkmate::assert_class(lg, "Logger")
obj_name <- glue("l{level}_model_setup")
expect_class <- glue("l{level}_setup")
obj <- gpa[[obj_name]]
if (is.null(obj) || !inherits(obj, expect_class)) {
msg <- sprintf("No l%d_model_setup found in the glm pipeline object.", level)
lg$error(msg)
stop(msg)
} else {
if ("fsl" %in% gpa$glm_software) {
nmiss <- sum(obj$fsl$feat_complete == FALSE)
n_feat_runs <- nrow(obj$fsl)
if (nmiss == n_feat_runs) {
msg <- sprintf("All feat runs in %s$fsl are incomplete.", obj_name)
lg$error(msg)
stop(msg)
} else if (nmiss > 0) {
lg$warn(
"There are %d missing FEAT outputs in %s$fsl. Using complete %d outputs.",
nmiss, obj_name, n_feat_runs - nmiss
)
}
}
}
return(invisible(NULL))
}
#' helper function to ask user to choose models at a given level for further processing
#' @param gpa a \code{glm_pipeline_arguments} with models specified at a given level
#' @param model_names a user-specified string of models to process/use
#' @param level the level of GLM analysis to be specified (1, 2, or 3)
#' @param lg an optional Logger object for messages. If NULL, the root logger is used
#' @return a character vector of user-specified model names at this level
#' @keywords internal
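#'
#' @examples
#' # illustrative sketch, assuming gpa contains level-1 models named "m1" and "m2"
#' \dontrun{
#' choose_glm_models(gpa, "all", level = 1) # returns c("m1", "m2")
#' choose_glm_models(gpa, "prompt", level = 1) # interactive selection
#' }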
choose_glm_models <- function(gpa, model_names, level, lg=NULL) {
checkmate::assert_integerish(level, lower = 1L, upper = 3L) # assert_integerish uses lower/upper, not min/max
all_m_names <- names(gpa[[paste0("l", level, "_models")]]$models)
checkmate::assert_subset(model_names, c("prompt", "all", "none", all_m_names))
if (is.null(lg)) lg <- lgr::get_logger("")
checkmate::assert_class(lg, "Logger")
if (is.null(model_names)) {
chosen_models <- NULL # happens when user de-selects all models at one level
} else if (model_names[1L] == "all") {
chosen_models <- all_m_names
} else if (model_names[1L] == "none") {
chosen_models <- NULL
} else if (model_names[1L] == "prompt") {
cat("\n")
chosen_models <- select.list(all_m_names,
multiple = TRUE,
title = paste("Choose all level", level, "models to include: ")
)
if (identical(chosen_models, character(0))) {
lg$info(paste("No level", level, "models were selected."))
chosen_models <- NULL
}
} else {
chosen_models <- model_names # user-specified set
}
return(chosen_models)
}
#' helper function to validate numbers in an input string or vector
#' @param inp A string, character vector, or numeric vector
#' @param lower The lowest valid value for each number
#' @param upper The highest valid value for each number
#' @param as_string If TRUE, return numbers as a space-separated string. If FALSE,
#' return the validated numeric vector itself.
#' @keywords internal
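#'
#' @examples
#' # minimal sketch
#' \dontrun{
#' check_nums("1, 2; 3") # "1 2 3"
#' check_nums(c("4", "5"), as_string = FALSE) # c(4, 5)
#' }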
check_nums <- function(inp, lower = 0, upper = 1e10, as_string = TRUE) {
inp_name <- deparse(substitute(inp)) # get name of object passed in
if (checkmate::test_string(inp)) {
inp <- suppressWarnings(as.numeric(strsplit(inp, "[\\s,;]+", perl=TRUE)[[1]]))
} else if (checkmate::test_character(inp)) {
inp <- suppressWarnings(as.numeric(inp))
}
if (checkmate::test_numeric(inp, lower = lower, upper = upper, any.missing = FALSE)) {
if (isTRUE(as_string)) {
return(paste(inp, collapse = " "))
} else {
return(inp) # honor as_string = FALSE by returning the validated numeric vector
}
} else {
stop("Problem with ", inp_name, " specification: ", paste(inp, collapse = " "))
}
}
#' this helper function replaces matching rows of a current data.frame with rows in a new data.frame
#'
#' @param current The current data.frame that may contain rows matching the new data.frame
#' @param new A data.frame containing new data that should be added to the current data.frame
#' @param id_cols A character vector of column names that must exist in both the current
#' and new data.frames. These are used to determine which rows match in the two datasets.
#' @param sort If TRUE, sort the returned data.frame by the id columns. Default: TRUE
#' @details The goal here is to keep records in the current data.frame that don't overlap with
#' the new data.frame and replace overlapping records with those in the new data.frame. This is helpful
#' when you want to update a master data.frame with new records that may overlap current records or may
#' be truly new. If there is no overlap in the datasets, this function basically just binds them
#' together using dplyr::bind_rows.
#' @importFrom dplyr bind_rows across
#' @importFrom tidyselect all_of
#' @keywords internal
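#'
#' @examples
#' # minimal sketch
#' \dontrun{
#' cur <- data.frame(id = 1:3, val = c("a", "b", "c"))
#' new <- data.frame(id = 2:4, val = c("B", "C", "D"))
#' update_df(cur, new, id_cols = "id") # rows 2:3 replaced, row 4 appended
#' }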
update_df <- function(current = NULL, new = NULL, id_cols = NULL, sort = TRUE) {
checkmate::assert_character(id_cols, any.missing = FALSE)
checkmate::assert_data_frame(new)
has_current <- !(is.null(current) || (is.data.frame(current) && nrow(current) == 0L))
if (has_current) {
checkmate::assert_data_frame(current)
checkmate::assert_subset(id_cols, names(current)) # enforce all id columns in current
} else {
# no current data.frame (nothing to append to)
return(new)
}
checkmate::assert_subset(id_cols, names(new)) # enforce all id columns in new
# an anti_join approach would also work here, but this string-based matching is simple and efficient
# create unique strings for the combination of identifying columns in the current and new data
current_id <- do.call(paste, c(current[id_cols], sep = "-"))
new_id <- do.call(paste, c(new[id_cols], sep="-"))
to_replace <- current_id %in% new_id
if (any(to_replace)) {
ret <- current[!to_replace, , drop = FALSE] %>% dplyr::bind_rows(new)
} else {
ret <- current %>% dplyr::bind_rows(new) # no overlap, so just rbind
}
if (isTRUE(sort)) {
ret <- ret %>% dplyr::arrange(across(all_of(id_cols)))
}
return(ret)
}
# helper function for printing current selections in case of NULL
c_string <- function(vec, null_val="none") {
if (is.null(vec)) {
null_val
} else {
paste(vec, collapse = ", ")
}
}
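# example (illustrative): c_string(NULL) returns "none"; c_string(c("m1", "m2")) returns "m1, m2"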
# fast permutation calculator to avoid gtools dependence
# @details This is way faster than permutations in gtools
# and is adapted from: https://stackoverflow.com/questions/11095992/generating-all-distinct-permutations-of-a-list-in-r
# @param n the number of elements to permute
# @param keys an optional character vector of keys whose permutations should be calculated
# @return a matrix with one row per permutation and n columns containing indices (or keys, if provided)
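# example (illustrative): compute_permutations(3) returns a 6 x 3 matrix of index permutations;
#   compute_permutations(keys = c("a", "b")) returns rbind(c("a", "b"), c("b", "a"))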
compute_permutations <- function(n, keys = NULL) {
if (!is.null(keys)) {
if (missing(n) || is.null(n)) {
# support key input alone
n <- length(keys)
}
checkmate::assert_character(keys, len = n)
}
if (n == 1) {
A <- matrix(1)
} else {
sp <- Recall(n - 1)
p <- nrow(sp)
A <- matrix(nrow = n * p, ncol = n)
for (i in 1:n) {
A[(i - 1) * p + 1:p, ] <- cbind(i, sp + (sp >= i))
}
}
# convert to lookup of keys based on indices in A
if (!is.null(keys)) {
A <- apply(A, 2, function(x) keys[x])
}
return(A)
}
#' internal function to print bidirectional set differences in a list containing vectors to be compared
#' @param l a named list containing vectors to be compared
#' @details All pairwise combinations of vectors in the list will be compared. The setdiff() operation is
#' run twice for each pair, reflecting the two directions of comparison: setdiff(s1, s2) and setdiff(s2, s1).
#' @return NULL (invisibly)
#' @keywords internal
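#'
#' @examples
#' # minimal sketch
#' \dontrun{
#' setdiff_list_combn(list(s1 = c("a", "b"), s2 = c("b", "c")))
#' # Values in s1 that are not in s2: a
#' # Values in s2 that are not in s1: c
#' }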
setdiff_list_combn <- function(l) {
combs <- combn(length(l), 2)
for (i in seq_len(ncol(combs))) {
v1 <- l[[combs[1, i]]]
v2 <- l[[combs[2, i]]]
n1 <- names(l)[combs[1, i]]
n2 <- names(l)[combs[2, i]]
s1 <- setdiff(v1, v2)
s2 <- setdiff(v2, v1)
if (length(s1) > 0L) {
cat(glue("Values in {n1} that are not in {n2}: {paste(s1, collapse=', ')}\n", .trim = F))
}
if (length(s2) > 0L) {
cat(glue("Values in {n2} that are not in {n1}: {paste(s2, collapse=', ')}\n", .trim = F))
}
}
return(invisible(NULL))
}
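# Driver to submit preprocessFunctional jobs for each functional run in run_df via the HPC scheduler.
# Assumes run_df contains (at least) id, task_name, run_number, run_nifti, sbref_nifti, se_pos, and
# se_neg columns, and that anatomical outputs from preprocess_all_mprage already exist.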
preprocess_all_functional <- function(run_df, output_directory = NULL, cleanup_failed = TRUE) {
if (is.null(output_directory)) stop("Must specify the root directory for outputs")
run_df <- run_df %>%
dplyr::mutate(
expect_func_dir = glue::glue_data(., "{output_directory}/sub-{id}/func/{task_name}_run-{sprintf('%02d', run_number)}"),
expect_func_file = glue::glue_data(., "{expect_func_dir}/{basename(run_nifti)}"),
expect_func_json = sub("\\.nii\\.gz$", ".json", run_nifti),
expect_complete = glue::glue_data(., "{expect_func_dir}/.preprocessfunctional_complete"),
expect_mprage_bet = glue::glue_data(., "{output_directory}/sub-{id}/anat/sub-{id}_T1w_bet.nii.gz"),
expect_mprage_warpcoef = glue::glue_data(., "{output_directory}/sub-{id}/anat/sub-{id}_T1w_warpcoef_withgdc.nii.gz"),
output_dir_exists = dir.exists(expect_func_dir),
func_file_exists = file.exists(expect_func_file),
is_complete = file.exists(expect_complete),
mprage_bet_exists = file.exists(expect_mprage_bet),
mprage_warpcoef_exists = file.exists(expect_mprage_warpcoef)
)
if (any(!run_df$mprage_bet_exists)) {
warning("Cannot process some functional data because mprage_bet.nii.gz is not present (run preprocessMprage first?)")
cat(run_df$expect_mprage_bet[!run_df$mprage_bet_exists], sep = "\n")
run_df <- run_df %>% dplyr::filter(mprage_bet_exists == TRUE)
}
if (any(!run_df$mprage_warpcoef_exists)) {
warning("Cannot process some functional data because mprage_warpcoef_withgdc.nii.gz is not present (run preprocessMprage first?)")
cat(run_df$expect_mprage_warpcoef[!run_df$mprage_warpcoef_exists], sep = "\n")
run_df <- run_df %>% dplyr::filter(mprage_warpcoef_exists == TRUE)
}
# cleanup failed runs
if (isTRUE(cleanup_failed)) {
to_dump <- with(run_df, output_dir_exists == TRUE & is_complete == FALSE)
if (any(to_dump)) {
message(glue("Deleting failed directories: {paste(run_df$expect_func_dir[to_dump], collapse=', ')}"))
unlink(run_df$expect_func_dir[to_dump], recursive = TRUE)
run_df$output_dir_exists[to_dump] <- FALSE
run_df$func_file_exists[to_dump] <- FALSE
}
}
# need to setup output directories before symlinks and calls
miss_dirs <- with(run_df, output_dir_exists == FALSE)
if (any(miss_dirs)) {
sapply(run_df$expect_func_dir[miss_dirs], dir.create, recursive = TRUE)
run_df$output_dir_exists[miss_dirs] <- TRUE
}
# make symlinks
to_link <- with(run_df, func_file_exists == FALSE)
if (any(to_link)) {
file.symlink(run_df$run_nifti[to_link], run_df$expect_func_file[to_link]) # accepts vector inputs
run_df$func_file_exists[to_link] <- TRUE
}
# read and copy through slice times from json
for (ii in seq_len(nrow(run_df))) {
jfile <- run_df$expect_func_json[ii]
if (file.exists(jfile)) {
ff <- jsonlite::read_json(jfile)
stimes <- as.numeric(ff$SliceTiming)
writeLines(paste(as.character(stimes), collapse = ","), con = file.path(run_df$expect_func_dir[ii], ".stimes"))
}
}
to_run <- run_df %>% dplyr::filter(is_complete == FALSE)
# don't pass through sbref if it does not exist
sbref_string <- sapply(to_run$sbref_nifti, function(x) if (file.exists(x)) paste("-func_refimg", x) else "")
calls <- paste(
"cd", to_run$expect_func_dir, "&&",
"preprocessFunctional",
"-4d", basename(to_run$expect_func_file), sbref_string, "-se_phasepos", to_run$se_pos, "-se_phaseneg", to_run$se_neg,
"-mprage_bet", to_run$expect_mprage_bet, "-warpcoef", to_run$expect_mprage_warpcoef,
"-epi_pedir y- -epi_echospacing .00053 -epi_te 30 -tr .635",
"-hp_filter 120s -rescaling_method 100_voxelmean -template_brain MNI_2.3mm",
"-func_struc_dof bbr -warp_interpolation spline -constrain_to_template y",
"-4d_slice_motion -custom_slice_times .stimes",
"-ica_aroma -motion_censor fd=0.9",
"-nuisance_file nuisance_regressors.txt -nuisance_compute csf,dcsf,wm,dwm -smoothing_kernel 6 -cleanup",
">preprocessFunctional_stdout 2>preprocessFunctional_stderr"
)
pre <- c(
"module use /proj/mnhallqlab/sw/modules",
"module load afni/23.0.07",
"module load fsl/6.0.6",
"module load r/4.2.1",
"module load c3d/1.1.0",
"module load freesurfer/6.0.0",
"module load ants/2.3.1",
"module load imagemagick/7.1.1-11",
"source /proj/mnhallqlab/lab_resources/lab_python3/bin/activate"
)
scripts_out <- tempdir()
message("Writing job scripts to: ", scripts_out)
cluster_submit_shell_jobs(calls,
commands_per_cpu = 1L, cpus_per_job = 8L, memgb_per_command = 24, time_per_job = "50:00:00",
pre = pre, debug = FALSE, job_out_dir = scripts_out
)
}
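# Driver to submit preprocessMprage jobs for each unique T1w image in run_df via the HPC scheduler.
# Assumes run_df contains id and t1w columns; repeated T1w images are dropped before submission.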
preprocess_all_mprage <- function(run_df, output_directory = NULL, cleanup_failed = TRUE) {
if (is.null(output_directory)) stop("Must specify the root directory for outputs")
run_df <- run_df %>%
dplyr::distinct(t1w, .keep_all = TRUE) %>% # drop repeated t1 images
dplyr::mutate(
expect_mprage_dir = glue::glue_data(., "{output_directory}/sub-{id}/anat"),
expect_mprage_file = glue::glue_data(., "{output_directory}/sub-{id}/anat/sub-{id}_T1w.nii.gz"),
expect_complete = glue::glue_data(., "{expect_mprage_dir}/.preprocessmprage_complete"),
output_dir_exists = dir.exists(expect_mprage_dir),
mprage_file_exists = file.exists(expect_mprage_file),
is_complete = file.exists(expect_complete)
)
# cleanup failed runs
if (isTRUE(cleanup_failed)) {
to_dump <- with(run_df, output_dir_exists == TRUE & is_complete == FALSE)
if (any(to_dump)) {
message(glue("Deleting failed directories: {paste(run_df$expect_mprage_dir[to_dump], collapse=', ')}"))
unlink(run_df$expect_mprage_dir[to_dump], recursive = TRUE)
run_df$output_dir_exists[to_dump] <- FALSE
run_df$mprage_file_exists[to_dump] <- FALSE
}
}
# need to setup output directories before symlinks and calls
miss_dirs <- with(run_df, output_dir_exists == FALSE)
if (any(miss_dirs)) {
sapply(run_df$expect_mprage_dir[miss_dirs], dir.create, recursive = TRUE)
run_df$output_dir_exists[miss_dirs] <- TRUE
}
# make symlinks
to_link <- with(run_df, mprage_file_exists == FALSE)
if (any(to_link)) {
file.symlink(run_df$t1w[to_link], run_df$expect_mprage_file[to_link]) # accepts vector inputs
run_df$mprage_file_exists[to_link] <- TRUE
}
to_run <- run_df %>% dplyr::filter(is_complete == FALSE)
calls <- paste(
"cd", to_run$expect_mprage_dir, "&&",
"preprocessMprage -template_brain MNI_2mm -grad_unwarp prisma.coeff.grad -weak_bias -cleanup",
"-n", to_run$expect_mprage_file,
">preprocessMprage_stdout 2>preprocessMprage_stderr"
)
pre <- c(
"module use /proj/mnhallqlab/sw/modules",
"module load afni/23.0.07",
"module load fsl/6.0.6",
"module load r/4.2.1",
"module load c3d/1.1.0",
"module load freesurfer/6.0.0",
"module load ants/2.3.1",
"module load imagemagick/7.1.1-11",
"source /proj/mnhallqlab/lab_resources/lab_python3/bin/activate"
)
scripts_out <- tempdir()
message("Writing job scripts to: ", scripts_out)
cluster_submit_shell_jobs(calls, time_per_job = "2:00:00",
commands_per_cpu = 1L, cpus_per_job = 8L, memgb_per_command = 12,
pre = pre, debug = FALSE, job_out_dir = scripts_out
)
}
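# example usage (illustrative; the output path is hypothetical), assuming run_df has been assembled elsewhere:
# preprocess_all_mprage(run_df, output_directory = "/proj/studydata/preproc")
# preprocess_all_functional(run_df, output_directory = "/proj/studydata/preproc")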