R/setup_l3_models.R

Defines functions get_feat_l3_inputs get_fsl_l3_model_df get_l3_cope_df get_l2_cope_df get_l1_cope_df setup_l3_models

Documented in setup_l3_models

#' This function generates the inputs for level 3 analyses, where multi-subject data are analyzed
#'   in group analyses.
#'
#' @param gpa a \code{glm_pipeline_arguments} object containing analysis speecification
#' @param l3_model_names a subset of L3 models to be setup by this function. If not specified,
#'   all models in gpa$l2_models will be included
#' @param l2_model_names a subset of L2 models to be setup by this function. If not specified,
#'   all models in gpa$l2_models will be included
#' @param l1_model_names a subset of L1 models to be passed to L2 by this function. If not
#'   specified, all models in gpa$l1_models will be included
#'
#' @details
#'   This function will setup FSL level 2 (subject) .fsf files for all combinations of
#'   \code{l2_model_names} and \code{l1_model_names}.
#'
#' FSL 2-level versus 3-level setup
#'
#' 2-level setup (one run per subject)
#'   - Pass L1 .feat folders as input to L3 .fsf setup
#'   - In this approach, the copes in the .fsf pertain to the L1 cope numbers
#'   - Requires one .fsf per L3 model
#'
#' 3-level setup (multiple runs per subject, combined at L2)
#'   - Pass individual cope*.feat folders within subject .gfeat folders
#'   - The folder cope numbers pertain to L1 copes
#'   - The cope*.nii.gz in the cope*.feat subfolders pertain to the L2 contrasts
#'   - Requires one .fsf per L1 cope x L3 model combination
#'   - Example: FSL_L2.gfeat/cope3.feat/stats/cope1.nii.gz
#'      ==> cope3 is the third contrast in the L1 feat model
#'      ==> cope1 is the first contrast in the L2 feat model
#'
#' @author Michael Hallquist
#' @importFrom checkmate assert_class assert_character assert_data_frame
#' @importFrom data.table is.data.table
#' @importFrom lgr get_logger
#' @export
setup_l3_models <- function(gpa, l3_model_names = NULL, l2_model_names = NULL, l1_model_names = NULL) {
  checkmate::assert_class(gpa, "glm_pipeline_arguments")
  checkmate::assert_character(l3_model_names, null.ok = TRUE)
  checkmate::assert_character(l2_model_names, null.ok = TRUE)
  checkmate::assert_character(l1_model_names, null.ok = TRUE)
  checkmate::assert_data_frame(gpa$run_data)
  checkmate::assert_data_frame(gpa$subject_data)
  checkmate::assert_class(gpa$l3_models, "hi_model_set")
  checkmate::assert_class(gpa$l1_models, "l1_model_set")

  # full 3-level analysis (runs, subject, sample)
  if (isTRUE(gpa$multi_run)) {
    checkmate::assert_class(gpa$l2_models, "hi_model_set")

    # if no l2 model subset is requested, output all models
    if (is.null(l2_model_names)) l2_model_names <- names(gpa$l2_models$models)
  }

  # if no l3 model subset is requested, output all models
  if (is.null(l3_model_names)) l3_model_names <- names(gpa$l3_models$models)

  # if no l1 model subset is requested, output all models
  if (is.null(l1_model_names)) l1_model_names <- names(gpa$l1_models$models)

  lg <- lgr::get_logger("glm_pipeline/l3_setup")
  lg$set_threshold(gpa$lgr_threshold)

  if (isTRUE(gpa$log_txt) && !"setup_l3_log_txt" %in% names(lg$appenders)) {
    lg$add_appender(lgr::AppenderFile$new(gpa$output_locations$setup_l3_log_txt), name = "setup_l3_log_txt")
  }

  if (isTRUE(gpa$log_json) && !"setup_l3_log_json" %in% names(lg$appenders)) {
    lg$add_appender(lgr::AppenderJson$new(gpa$output_locations$setup_l3_log_json), name = "setup_l3_log_json")
  }

  lg$debug("In setup_l3_models, setting up the following L3 models:")
  lg$debug("L3 model: %s", l3_model_names)
  if (isTRUE(gpa$multi_run)) {
    lg$debug("In setup_l3_models, passing the following L2 models to L3:")
    lg$debug("L2 model: %s", l2_model_names)
  }
  lg$debug("In setup_l3_models, passing the following L1 models to L3:")
  lg$debug("L1 model: %s", l1_model_names)

  # handle refresh of feat status for lower-level models
  # For multi-run data, L2 inputs must be complete for entry into L3
  # For single-run data, L1 inputs must be complete for entry into L3
  if (isTRUE(gpa$multi_run)) {
    # refresh l2 model status in $l2_model_setup
    gpa <- refresh_feat_status(gpa, level = 2L, lg = lg)
  } else {
    # refresh l1 model status in $l1_model_setup
    gpa <- refresh_feat_status(gpa, level = 1L, lg = lg)
  }

  excluded_runs <- gpa$run_data %>%
    dplyr::select(id, session, run_number, exclude_run, exclude_subject) %>%
    dplyr::filter(exclude_run == TRUE | exclude_subject == TRUE)

  if (nrow(excluded_runs) > 1L) {
    lg$info("In setup_l3_models, the following runs will be excluded from L3 modeling: ")
    lg$info(
      "  subject: %s, session: %s, run_number: %s",
      excluded_runs$id, excluded_runs$session, excluded_runs$run_number
    )
  }

  # make sure l1 models have already been generated
  enforce_glms_complete(gpa, level = 1L, lg)

  if (isTRUE(gpa$multi_run)) {
    lg$info("In setup_l3_models, using a multi-run 3-level setup with runs (l1), subjects (l2), sample (l3)")

    # in multi-run setup, an l2_model_setup must be present
    enforce_glms_complete(gpa, level = 2L, lg)

  } else {
    lg$info("In setup_l3_models, using a single run 2-level setup with subjects (l1), sample (l3)")
      # only retain good runs and subjects
      run_data <- gpa$run_data %>%
        dplyr::filter(exclude_run == FALSE & exclude_subject == FALSE)

      if (nrow(run_data) == 0L) {
        msg <- "In setup_l3_models, no runs survived the exclude_subject and exclude_run step."
        lg$warn(msg)
        warning(msg)
        return(NULL)
      }
  }

  # subjects and sessions to run at l3
  subj_df <- gpa$subject_data %>%
    dplyr::filter(exclude_subject==FALSE) %>%
    select(id, session)

  # loop over and setup all requested combinations of L1, L2, and L3 models
  feat_l3_df <- list()
  if (isTRUE(gpa$multi_run)) {
    model_set <- expand.grid(
      l1_model = l1_model_names, l2_model = l2_model_names,
      l3_model = l3_model_names, stringsAsFactors = FALSE
    )
  } else {
    model_set <- expand.grid(l1_model = l1_model_names, l3_model = l3_model_names, stringsAsFactors = FALSE)
  }

  # Get the copes for each contrast, respecting differences in l2 copes across subjects
  # This has the full combination of l1, l2, and l3 copes
  l3_cope_config <- get_fsl_l3_model_df(gpa, model_set, subj_df)

  # For obtaining inputs to l3 models, we don't need the multiple rows for each third level cope
  # since these are part of the l3 model (contrasts to be specified). Thus, just get the first row of
  # each l3 model for calculating inputs to l3.
  l3_cope_input_df <- l3_cope_config %>%
    dplyr::filter(l3_cope_number == 1L) %>%
    dplyr::select(-l3_cope_number, -l3_cope_name)

  to_run <- get_feat_l3_inputs(gpa, l3_cope_input_df, lg)

  if (length(to_run) == 0L) {
    lg$warn("No feat l3 inputs returned from get_feat_l3_inputs. Is it possible no lower-level analyses are complete?")
    return(gpa)
  }

  all_l3_list <- foreach(
    model_info = iter(to_run), .inorder = FALSE,
    .packages = c("fmri.pipeline", "dplyr", "data.table"), .export = c("lg", "gpa", "fsl_l3_model")
  ) %dopar% {
    model_info <- model_info # to avoid complaints about global variable binding in R CMD check

    l3_file_setup <- list(fsl = list(), spm = list(), afni = list())

    if ("fsl" %in% gpa$glm_software) {

      if (nrow(model_info) <= 3) {
        lg$warn(
          "Fewer than 4 complete feat input directories for l1 model %s, l2 model %s, l3 model %s",
          model_info$l1_model[1L], model_info$l2_model[1L], model_info$l3_model[1L]
        )
        l3_file_setup$fsl <- NULL
      }

      # setup Feat L3 files for each combination of lower-level models
      l3_file_setup$fsl <- tryCatch(fsl_l3_model(model_info, gpa = gpa),
        error = function(e) {
          lg$error(
            "Problem with fsl_l3_model. L1 Model: %s, L2 Model: %s, L3 model %s",
            model_info$l1_model[1L], model_info$l2_model[1L], model_info$l3_model[1L]
          )
          lg$error("Error message: %s", as.character(e))
          return(NULL)
        }
      )
    }

    if ("spm" %in% gpa$glm_software) {
      lg$warn("spm not supported in setup_l3_models")
    }

    if ("afni" %in% gpa$glm_software) {
      lg$warn("afni not supported in setup_l3_models")
    }

    return(l3_file_setup)
  }

  all_subj_l3_combined <- list(
    metadata = l3_cope_config,
    fsl = rbindlist(lapply(all_l3_list, "[[", "fsl"))
  )

  class(all_subj_l3_combined) <- c("list", "l3_setup")

  # Combine the generated l3_model_setup with any existing l3_model_setup information
  # This is important so that if a user requests a model subset (using input arguments, and from run_glm_pipeline),
  # we don't clear out information about many other models that may have already completed

  # Look for existing models in l3_model_setup that are identical to models that were just setup
  # Replace the rows corresponding to new models
  if (!is.null(gpa$l3_model_setup) && inherits(gpa$l3_model_setup, "l3_set")) {
    all_subj_l3_combined$fsl <- update_df(
      current = gpa$l3_model_setup$fsl, new = all_subj_l3_combined$fsl,
      id_cols = c("l1_model", "l1_cope_name", "l2_model", "l2_cope_name", "l3_model")
    )
  }

  # append l3 setup to gpa
  gpa$l3_model_setup <- all_subj_l3_combined

  # the expressions for l3 output locations should not generate any duplicate fsfs
  dupe_fsfs <- duplicated(gpa$l3_model_setup$fsl$feat_fsf)
  if (any(dupe_fsfs, na.rm = TRUE)) {
    lg$warn("There are duplicate fsfs in gpa$l3_model_setup$fsl.")
    lg$warn("This suggests a problem with gpa$output_locations$feat_l3_fsf and gpa$output_locations$feat_l3_directory settings.")
    lg$warn("gpa$output_locations$feat_l3_fsf: %s", gpa$output_locations$feat_l3_fsf)
    lg$warn("gpa$output_locations$feat_l3_directory: %s", gpa$output_locations$feat_l3_directory)
    lg$debug("%s", capture.output(print(gpa$l3_model_setup$fsl[dupe_fsfs, ])))
  }

  return(gpa)
}


############
# helper function to get a cope data.frame for all level 1 models
get_l1_cope_df <- function(gpa, model_set, subj_df=NULL) {
  if (is.null(subj_df)) {
    subj_df <- gpa$subject_data %>%
      dplyr::filter(exclude_subject == FALSE) %>%
      dplyr::select(id, session)
  }
  checkmate::assert_data_frame(model_set)
  dt <- data.table::rbindlist(
    lapply(unique(model_set$l1_model), function(mm) {
      data.frame(
        l1_model = mm,
        l1_cope_number = seq_along(gpa$l1_cope_names[[mm]]),
        l1_cope_name = gpa$l1_cope_names[[mm]]
      )
    })
  )

  #return subject-specific rows for each cope
  dt <- dt %>% tidyr::crossing(subj_df)
  return(dt)
}

# helper function to get a cope data.frame for all level 3 models
# handles the per-subject cope numbering problem
get_l2_cope_df <- function(gpa, model_set, subj_df=NULL) {
  if (is.null(subj_df)) {
    subj_df <- gpa$subject_data %>%
      dplyr::filter(exclude_subject == FALSE) %>%
      dplyr::select(id, session)
  }
  checkmate::assert_data_frame(model_set)

  data.table::rbindlist(
    lapply(unique(model_set$l2_model), function(mm) {
      if (!is.null(gpa$l2_models$models[[mm]]$by_subject)) {
        # combine as single data frame from nested list columns
        l2_df <- rbindlist(gpa$l2_models$models[[mm]]$by_subject$cope_list)
        l2_df$l2_model <- mm #retain model name
      } else {
        cope_names <- rownames(gpa$l2_models$models[[mm]]$contrasts)
        l2_df <- data.frame(
          l2_model = mm, l2_cope_number = seq_along(cope_names),
          l2_cope_name = cope_names
        )
        l2_df <- l2_df %>% tidyr::crossing(subj_df)
      }
      return(l2_df)

    })
  )
}

# helper function to get a cope data.frame for all level 3 models
get_l3_cope_df <- function(gpa, model_set, subj_df=NULL) {
  if (is.null(subj_df)) {
    subj_df <- gpa$subject_data %>%
      dplyr::filter(exclude_subject == FALSE) %>%
      dplyr::select(id, session)
  }
  checkmate::assert_data_frame(model_set)
  dt <- data.table::rbindlist(
    lapply(unique(model_set$l3_model), function(mm) {
      cope_names <- rownames(gpa$l3_models$models[[mm]]$contrasts)
      data.frame(
        l3_model = mm,
        l3_cope_number = seq_along(cope_names),
        l3_cope_name = cope_names
      )
    })
  )

  # return subject-specific rows for each cope
  dt <- dt %>% tidyr::crossing(subj_df)
  return(dt)

}


# data.table cross-join (tidyr::crossing is a bit faster and already exists)
# https://stackoverflow.com/questions/10600060/how-to-do-cross-join-in-r
# CJ.table <- function(X, Y) {
#   setkey(X[, c(k = 1, .SD)], k)[Y[, c(k = 1, .SD)], allow.cartesian = TRUE][, k := NULL]
# }

get_fsl_l3_model_df <- function(gpa, model_df, subj_df=NULL) {
  if (is.null(subj_df)) {
    subj_df <- gpa$subject_data %>%
      dplyr::filter(exclude_subject == FALSE) %>%
      dplyr::select(id, session)
  }
  model_df$model_id <- seq_len(nrow(model_df))

  l1_df <- get_l1_cope_df(gpa, model_df, subj_df)

  l3_df <- get_l3_cope_df(gpa, model_df, subj_df)

  if (isTRUE(gpa$multi_run)) {
    # model_df has l1_model, l2_model, l3_model
    l2_df <- get_l2_cope_df(gpa, model_df)

    combined <- model_df %>%
      tidyr::crossing(subj_df) %>%
      left_join(l1_df, by = c("id", "session", "l1_model")) %>%
      left_join(l2_df, by = c("id", "session", "l2_model")) %>%
      left_join(l3_df, by = c("id", "session", "l3_model"))
  } else {
    combined <- model_df %>%
      tidyr::crossing(subj_df) %>%
      left_join(l1_df, by = c("id", "session", "l1_model")) %>%
      left_join(l3_df, by = c("id", "session", "l3_model"))
  }

  return(combined)
}

get_feat_l3_inputs <- function(gpa, l3_cope_config, lg=NULL) {
  checkmate::assert_class(gpa, "glm_pipeline_arguments")
  checkmate::assert_data_frame(l3_cope_config)

  if (is.null(lg)) { lg <- lgr::get_logger() }
  if (isTRUE(gpa$multi_run)) {
    # feat directories in $l2_model_setup
    feat_inputs <- gpa$l2_model_setup$fsl %>%
      dplyr::filter(feat_complete == TRUE)

    #join up combination of all models with cope directories
    feat_inputs <- feat_inputs %>%
      dplyr::inner_join(l3_cope_config, by=c("id", "session", "l1_model", "l2_model"))

    # sort out expected cope files for each model combination
    feat_inputs <- feat_inputs %>%
      dplyr::mutate(cope_file = file.path(
        feat_dir,
        paste0("cope", l1_cope_number, ".feat"),
        "stats",
        paste0("cope", l2_cope_number, ".nii.gz")
      )) %>%
      dplyr::select(
        id, session, l1_model, l2_model, l3_model, l1_cope_name, l2_cope_name, feat_dir, cope_file
      )

    split_on <- c("l1_cope_name", "l2_cope_name", "l1_model", "l2_model", "l3_model")
  } else {
    # feat directories in $l1_model_setup
    feat_inputs <- gpa$l1_model_setup$fsl %>%
      dplyr::filter(feat_complete == TRUE)

    feat_inputs <- feat_inputs %>%
      dplyr::inner_join(l3_cope_config, by = c("id", "session", "l1_model"))

    feat_inputs <- feat_inputs %>%
      dplyr::mutate(cope_file = file.path(
        feat_dir,
        "stats",
        paste0("cope", l1_cope_number, ".nii.gz")
      )) %>%
      dplyr::select(
        id, session, l1_model, l3_model, l1_cope_name, feat_dir, cope_file
      )

    split_on <- c("l1_cope_name", "l1_model", "l3_model")

  }

  if (!is.data.table(feat_inputs)) data.table::setDT(feat_inputs)
  feat_inputs <- split(feat_inputs, by = split_on)

  return(feat_inputs)
}
UNCDEPENdLab/fmri.pipeline documentation built on April 3, 2025, 3:21 p.m.