fmri.pipeline: Analysis of fMRI Data on HPC clusters

Documented in get_l1_confounds

#' helper function to generate confounds txt file for inclusion as additional regressors
#'
#' @details This function checks for whether an l1 confounds file already exists in the expected location.
#'   If it does not, the l1 confounds file will be generated according to the $confound_settings$l1_confound_regressors
#'   variable. If that is NULL, no confounds will be created.
#'
#' Confounds are generated by looking for motion parameters based on $confound_settings$motion_params_file and
#'   $confound_settings$confound_input_file. These are integrated, then columns that are requested in
#'   $confound_settings$l1_confound_regressors are written to a text file in the analysis subfolder for each subject
#'
#' Likewise, any run-level exclusions are tested according to $confound_settings$exclude_run. This should be a
#'   quoted expression that can tested against the confounds data.frame such that a single TRUE/FALSE is returned
#'   indicating whether a run is excluded (TRUE) or retained (FALSE) for higher-level analyses.
#'
#' @return a list containing: confounds=<location of l1 confounds file>, exclude_run=<TRUE/FALSE denoting run exclusion,
#'   exclude_data=<location of file containing columns used to calculate run exclusion>
#' @param run_df a single-row data.frame containing information about the run whose confounds should be calculated
#' @param id The subject id
#' @param session The session number
#' @param run_number The run number
#' @param gpa a \code{glm_pipeline_arguments} object containing pipeline specification
#' @param demean whether to demean confounds after calculation
#' @keywords internal
#' @importFrom dplyr n
#' @importFrom checkmate assert_data_frame test_null assert_integerish assert_class
#' @importFrom data.table setnames fread
get_l1_confounds <- function(run_df = NULL, id = NULL, session = NULL, run_number = NULL, gpa, demean=TRUE) {
  # For now, we have a fork in the inputs here. If a user specifies id, session, and run_number separately, then use those
  # If a single-row data.frame is passed in, use that, and update its fields. The latter is our default in finalize_pipeline_configuration
  if (!is.null(run_df)) {
    checkmate::assert_data_frame(run_df)
    return_run_df <- TRUE
    id <- run_df$id
    session <- run_df$session
    run_number <- run_df$run_number
  } else {
    return_run_df <- FALSE
    # lookup run information
    run_df <- gpa$run_data %>%
      dplyr::filter(id == !!id & session == !!session & run_number == !!run_number)
  }

  # populate drop_volumes in run_df if not present
  if (!"drop_volumes" %in% names(run_df)) run_df$drop_volumes <- gpa$drop_volumes

  if (checkmate::test_null(id)) stop("get_l1_confounds requires a specific id for lookup")
  checkmate::assert_integerish(session, null.ok=TRUE)
  if (is.null(session)) session <- 1
  checkmate::assert_integerish(run_number, lower=1, null.ok = FALSE)
  checkmate::assert_class(gpa, "glm_pipeline_arguments")

  lg <- lgr::get_logger("glm_pipeline/l1_setup")
  lg$set_threshold(gpa$lgr_threshold)

  # these columns should be populated to run_df for every case
  calculation_columns <- c(
    "run_nifti", "run_volumes", "orig_volumes", "first_volume",
    "last_volume", "truncate_volumes", "l1_confound_file", "exclude_run"
  )

  # use defaults if missing in run_df (ensure same return for every input)
  calculation_defaults <- data.frame(
    orig_volumes = NA_integer_, first_volume = NA_integer_, last_volume = NA_integer_,
    truncate_volumes = NA_integer_, l1_confound_file = NA_character_, exclude_run = FALSE, run_volumes = NA_integer_
  )

  run_df <- populate_defaults(run_df, calculation_defaults)

  # structure to return if this lookup fails (predictable elements so that caller can handle them)
  if (isTRUE(return_run_df)) {
    empty_set <- run_df # unmodified copy of run_df
  } else {
    empty_set <- list(
      l1_confound_file = NA_character_, l1_confounds_df = data.frame(),
      exclude_run = FALSE, exclude_data = data.frame(), truncation_data = data.frame()
    )
  }

  if (nrow(run_df) > 1L) {
    print(run_df)
    lg$error("Multiple matches for a single run in get_l1_confounds.")
    return(empty_set)
  } else if (nrow(run_df) == 0L) {
    lg$error("Unable to locate a record in gpa$run_data for id %s, session %s, run_number %s.", id, session, run_number)
    return(empty_set)
  }

  # Park confound files in an analysis-level subfolder, not a particular model, since they are re-used across models.
  # Note: normalizePath will fail to evaluate properly if directory does not exist
  analysis_outdir <- get_output_directory(id=id, session=session, gpa=gpa, create_if_missing = TRUE, what="sub")

  # Determine whether we should be returning information about l1 confound regressors
  # and whether this information has already been calculated
  generate_l1_confounds <- FALSE
  if (is.null(gpa$confound_settings$l1_confound_regressors) && is.null(gpa$confound_settings$spike_volumes)) {
    # no confounds requested
    expected_l1_confound_file <- NA_character_
  } else {
    expected_l1_confound_file <- file.path(analysis_outdir, paste0("run", run_number, "_l1_confounds.txt"))
    if (!file.exists(expected_l1_confound_file)) {
      lg$debug("Expected confounds file does not exist: %s.", expected_l1_confound_file)
      generate_l1_confounds <- TRUE
    }
  }

  # default empty data.frames for exclusion and truncation (in case of NULL exclude or truncate criteria)
  exclude_data <- data.frame()
  truncation_data <- data.frame()

  l1_cached_df <- read_df_sqlite(gpa = gpa, id = id, session = session, run_number = run_number, table = "l1_run_calculations")

  # determine whether we should be returning information about run exclusions
  # and whether this information has already been calculated
  generate_run_exclusion <- FALSE
  if (is.null(gpa$confound_settings$exclude_run)) {
    # no basis for exclusion (all runs okay)
    exclude_run <- FALSE
  } else {
    exclude_data <- read_df_sqlite(gpa = gpa, id = id, session = session, run_number = run_number, table = "l1_exclusion_data")
    if (!is.null(l1_cached_df)) {
      exclude_run <- as.logical(l1_cached_df$exclude_run)
    } else {
      lg$debug("No record of run exclusion exists in database: %s.", gpa$output_locations$sqlite_db)
      exclude_run <- NA #default to missing
      generate_run_exclusion <- TRUE
    }
  }

   # Determine whether we should be returning information about run truncation and whether this information has already been calculated.
   # Note that truncation has to be calculated alongside timing events since truncate expressions often involve timing
   generate_run_truncation <- FALSE
   if (!is.null(gpa$confound_settings$truncate_run)) {
     truncation_data <- read_df_sqlite(gpa = gpa, id = id, session = session, run_number = run_number, table = "l1_truncation_data")
     if (is.null(truncation_data)) {
       lg$debug("No record of run truncation data exists in database: %s.", gpa$output_locations$sqlite_db)
       generate_run_truncation <- TRUE
     }
   }

  # If run exclusion, l1 confounds, and truncation data exist, just use the precalculated information
  if (!(generate_l1_confounds || generate_run_exclusion || generate_run_truncation)) {
    if (!is.na(expected_l1_confound_file)) lg$debug("Returning extant file: %s in get_l1_confounds", expected_l1_confound_file)

    # populate cached values for run if not recalculating anything
    run_df$exclude_run <- exclude_run
    run_df$l1_confound_file <- expected_l1_confound_file

    if (is.null(l1_cached_df)) {
      msg <- "Not re-generating confound information, but l1_cached_df is NULL. Cannot proceed."
      lg$error(msg)
      stop(msg)
    } else {
      # SQLite cannot support logical/boolean types, need to convert 0/1 outcome
      l1_cached_df$exclude_run <- as.logical(l1_cached_df$exclude_run)
    }

    # add/replace any cached calculations.
    # TODO: the logic of exclude_run and l1_confound above seem redundant. Likewise, the 'is.null(l1_cached_df)' 
    # above seems like it could be removed/factored out.
    run_df[, calculation_columns] <- l1_cached_df[, calculation_columns]

    if (isTRUE(return_run_df)) {
      return(run_df)
    } else {
      return(list(
        l1_confound_file = expected_l1_confound_file,
        l1_confounds_df = data.table::fread(expected_l1_confound_file, data.table = FALSE),
        exclude_run = exclude_run, exclude_data = exclude_data, truncation_data = truncation_data
      ))
    }
  }

  # read external confounds file (contains all volumes)
  confound_df <- read_df_sqlite(gpa = gpa, id = id, session = session, run_number = run_number, table = "l1_confound_inputs")
  if (is.null(confound_df) && isTRUE(run_df$confound_input_file_present[1])) {
    lg$debug("Generating l1 confounds for id: %s, session: %s, run_number: %s", id, session, run_number)
    cfile <- get_mr_abspath(run_df, "confound_input_file")[1]
    lg$debug("Reading confound file: %s", cfile)
    confound_df <- tryCatch(data.table::fread(cfile, data.table=FALSE, na.strings=gpa$confound_settings$na_strings),
    error = function(e) {
      lg$error("Failed to read confound file: %s with error %s", cfile, as.character(e))
      return(NULL)
    })

    if (!is.null(gpa$confound_settings$confound_input_colnames) && !is.null(confound_df)) {
      if (length(gpa$confound_settings$confound_input_colnames) != ncol(confound_df)) {
        lg$warn(
          "Mismatch in number of columns in confound file: %s relative to $confound_settings$confound_input_colnames",
          run_df$confound_input_file[1]
        )
      } else {
        # only set names in confound_df if the number of columns matches
        data.table::setnames(confound_df, gpa$confound_settings$confound_input_colnames)
      }
    }

    # cache original confounds to db
    insert_df_sqlite(gpa,
      id = id, session = session, run_number = run_number,
      data = confound_df, table = "l1_confound_inputs", immediate=TRUE
    )
  }

  #read motion parameters file (all volumes)
  motion_df <- read_df_sqlite(gpa = gpa, id = id, session = session, run_number = run_number, table = "l1_motion_parameters")
  if (is.null(motion_df) && isTRUE(run_df$motion_params_present[1])) {
    lg$debug("Generating motion params for id: %s, session: %s, run_number: %s", id, session, run_number)
    mfile <- get_mr_abspath(run_df, "motion_params_file")[1]
    lg$debug("Reading motion file: %s", mfile)

    # Note: Avoid demeaning columns within generate_motion_regressors so that calculated regressors do not
    # change scale prior to testing the run exclusion criteria below. Demeaning can be applied safely thereafter.
    # Note: do not specify drop_volumes or last_volume at this phase -- we want original (full length) motion params
    motion_df <- tryCatch({
      generate_motion_regressors(
        mfile,
        col.names = gpa$confound_settings$motion_params_colnames,
        demean=FALSE, na.strings=gpa$confound_settings$na_strings, lg=lg
      )}, error = function(e) {
      lg$error("Failed to read motion file: %s with error %s", run_df$motion_params_file[1], as.character(e))
      return(NULL)
    })

    # cache motion parameters to db
    insert_df_sqlite(gpa,
      id = id, session = session, run_number = run_number, data = motion_df,
      table = "l1_motion_parameters", immediate=TRUE
    )
  }

  # combine motion and confound files (all volumes)
  if (is.null(motion_df) && is.null(confound_df)) {
    lg$info(
      "Neither confounds nor motion parameters are available for id: %s, session: %s, run_number: %d",
      id, session, run_number
    )
    return(empty_set)
  } else if (is.null(motion_df)) {
    all_confounds <- confound_df
  } else if (is.null(confound_df)) {
    all_confounds <- motion_df
  } else {
    if (nrow(motion_df) != nrow(confound_df)) {
      lg$error("Number of rows in motion_df is: %d and in confound_df is %d. Cannot combine", nrow(motion_df), nrow(confound_df))
      return(empty_set)
    } else {
      # prefer confound_df as authoritative in cases where motion_df has overlapping columns
      overlap_names <- intersect(names(motion_df), names(confound_df))
      if (length(overlap_names) > 0L) {
        lg$info(
          "Motion parameters have overlapping columns with confounds file: %s. Preferring confounds to motion params",
          run_df$confound_input_file[1L]
        )
        lg$info("Overlap: %s", overlap_names)
        data.table::setnames(motion_df, old = overlap_names, new = paste0(overlap_names, ".mot"))
      }

      all_confounds <- dplyr::bind_cols(confound_df, motion_df)
    }
  }

  # add volume and time columns to confounds in case these are helpful in exclusion or truncation expressions
  all_confounds <- all_confounds %>%
    mutate(volume = 1:n(), time = (volume - 1) * run_df$tr[1L])

  # handle NA values in confounds -> convert to 0
  has_na <- sapply(all_confounds, anyNA)
  if (any(has_na)) {
    lg$warn("NA values found in confounds file: %s", cfile)
    lg$warn("These will be converted to 0s before evaluating exclude_run and writing l1_confound_file: %s", expected_l1_confound_file)
    all_confounds[, has_na] <- as.data.frame(apply(all_confounds[, has_na], 2, function(x) {
      x[is.na(x)] <- 0
      return(x)
    }))
  }

  # retain columns used in calculating run truncation
  if (isTRUE(generate_run_truncation) && !is.null(gpa$confound_settings$truncate_run)) {
    truncation_data <- all_confounds[, intersect(gpa$confound_settings$run_truncation_columns, names(all_confounds)), drop = FALSE]

    insert_df_sqlite(gpa,
      id = id, session = session, run_number = run_number, data = truncation_data,
      table = "l1_truncation_data", immediate = TRUE
    )
  }

  # At this point, truncation_data should be populated. Perform trucation on run_nifti if needed and update run_df with final volume settings
  run_df <- truncate_runs(run_df, gpa, analysis_outdir, truncation_data, lg)

  # truncate confounds to match
  all_confounds <- all_confounds[run_df$first_volume:run_df$last_volume, , drop = FALSE]
  motion_df <- motion_df[run_df$first_volume:run_df$last_volume, , drop = FALSE]

  # calculate whether to retain or exclude this run
  if (isTRUE(generate_run_exclusion) && !is.null(gpa$confound_settings$exclude_run)) {
    if (!all(gpa$confound_settings$run_exclusion_columns %in% names(all_confounds))) {
      lg$warn("Missing exclusion columns for subject: %s, session: %s", run_df$id[1L], run_df$session[1L])
      lg$warn("Column: %s", setdiff(gpa$confound_settings$run_exclusion_columns, names(all_confounds)))
      lg$warn("We will exclude this run from analysis until this is resolved!")
      exclude_run <- TRUE
    } else {
      # evaluate run exclusion expression
      exclude_run <- tryCatch(with(all_confounds, eval(parse(text = gpa$confound_settings$exclude_run))),
        error = function(e) {
          lg$error(
            "Problem evaluating run exclusion for subject: %s, session: %s, run_number: %s, expr: %s",
            id, session, run_number,
            gpa$confound_settings$exclude_run
          )
          lg$error("Defaulting to exclusion of run.")
          return(TRUE)
        }
      )

      if (length(exclude_run) > 1L) {
        lg$error("Run exclusion expression %s returned %d results!", gpa$confound_settings$exclude_run, length(exclude_run))
        lg$error("Modify the expression so that it returns a single TRUE/FALSE.")
        lg$error("Defaulting to exclusion of run.")
        exclude_run <- TRUE
      }
    }

    # write the exclusion basis to the database
    exclude_data <- all_confounds[, intersect(gpa$confound_settings$run_exclusion_columns, names(all_confounds)), drop = FALSE]
    insert_df_sqlite(gpa,
      id = id, session = session, run_number = run_number, data = exclude_data,
      table = "l1_exclusion_data", immediate=TRUE
    )
  }

  # add run exclusion column to data.frame
  run_df$exclude_run <- exclude_run

  # check for missing confound columns
  if (!all(gpa$confound_settings$l1_confound_regressors %in% names(all_confounds))) {
    lg$warn("Missing confound columns for subject: %s, session: %s", run_df$id[1L], run_df$session[1L])
    lg$warn("Column: %s", setdiff(gpa$confound_settings$l1_confound_regressors, names(all_confounds)))
  }

  # trim l1 confounds to only those requested
  all_confounds <- all_confounds[, intersect(gpa$confound_settings$l1_confound_regressors, names(all_confounds))]
  num_cols <- sapply(all_confounds, class) %in% c("integer", "numeric")
  if (any(!num_cols)) {
    lg$warn("Confound file contains non-numeric columns. Not sure what will happen!")
    lg$warn("Column: %s", names(all_confounds)[!num_cols])
  }

  # demean confound regressors, if requested (usually a good idea)
  if (isTRUE(demean)) {
    lg$debug("Demeaning columns of l1 confounds matrix: %s", expected_l1_confound_file)

    all_confounds[, num_cols] <- as.data.frame(apply(all_confounds[, num_cols], 2, function(x) {
      x - mean(x, na.rm = TRUE)
    }))
  }

  # incorporate spike regressors if requested (not used in conventional AROMA)
  spikes <- compute_spike_regressors(motion_df, gpa$confound_settings$spike_volumes, lg = lg)
  if (!is.null(spikes)) all_confounds <- cbind(all_confounds, spikes)

  lg$debug("Writing l1 confounds to file: %s", expected_l1_confound_file)
  write.table(all_confounds, file = expected_l1_confound_file, row.names = FALSE, col.names = FALSE)
  run_df$l1_confound_file <- expected_l1_confound_file

  # cache final confounds to db
  insert_df_sqlite(gpa,
    id = id, session = session, run_number = run_number,
    data = all_confounds, table = "l1_confounds", immediate=TRUE
  )

  # cache run_df calculations to db
  insert_df_sqlite(gpa,
    id = id, session = session, run_number = run_number,
    data = run_df[, calculation_columns, drop = FALSE], table = "l1_run_calculations", immediate = TRUE
  )

  if (isTRUE(return_run_df)) {
    return(run_df)
  } else {
    return(list(
      l1_confound_file = expected_l1_confound_file,
      l1_confounds_df = all_confounds,
      exclude_run = exclude_run, exclude_data = exclude_data, truncation_data = truncation_data
    ))
  }

}