Nothing
#' A function to create a design matrix, outcome, and penalty factor to be passed to a model fitting function
#'
#' @param data_file A filepath to rds file of processed data (data from `process_plink()` or `process_delim()`)
#' @param rds_dir The path to the directory in which you want to create the new '.rds' and '.bk' files.
#' @param obj The RDS object read in by `create_design()`
#' @param new_file User-specified filename (*without .bk/.rds extension*) for the to-be-created .rds/.bk files. Must be different from any existing .rds/.bk files in the same folder.
#' @param feature_id A string specifying the column in the data X (the feature data) with the row IDs (e.g., identifiers for each row/sample/participant/, etc.). No duplicates allowed.
#' - for PLINK data: a string specifying an ID column of the PLINK `.fam` file. Options are "IID" (default) and "FID"
#' - for all other filebacked data: a character vector of unique identifiers (IDs) for each row of the feature data (i.e., the data processed with `process_delim()`)
#' - if left NULL (default), X is assumed to have the same row-order as add_outcome.
#' **Note**: if this assumption is made in error, calculations downstream will be incorrect. Pay close attention here.
#' @param add_outcome A data frame or matrix with two columns: and ID column and a column with the outcome value (to be used as 'y' in the final design). IDs must be characters, outcome must be numeric.
#' @param outcome_id A string specifying the name of the ID column in 'add_outcome'
#' @param outcome_col A string specifying the name of the phenotype column in 'add_outcome'
#' @param na_outcome_vals A vector of numeric values used to code NA values in the outcome. Defaults to `c(-9, NA_integer)` (the -9 matches PLINK conventions).
#' @param add_predictor Optional (for PLINK data only): a matrix or data frame to be used for adding additional **unpenalized** covariates/predictors/features from an external file (i.e., not a PLINK file).
#' This matrix must have one column that is an ID column; all other columns aside the ID will be used as covariates in the design matrix. Columns must be named.
#' @param predictor_id Optional (for PLINK data only): A string specifying the name of the column in 'add_predictor' with sample IDs. **Required** if 'add_predictor' is supplied.
#' The names will be used to subset and align this external covariate with the supplied PLINK data.
#' @param unpen Optional (for delimited file data only): an optional character vector with the names of columns to mark as unpenalized (i.e., these features would always be included in a model).
#' **Note**: if you choose to use this option, X must have column names.
#' @param overwrite Logical: should existing .rds files be overwritten? Defaults to FALSE.
#' @param logfile Optional: name of the '.log' file to be written -- **Note:** do not append a `.log` to the filename; this is done automatically.
#' @param quiet Logical: should messages to be printed to the console be silenced? Defaults to FALSE
#'
#' @keywords internal
#'
#' @returns A filepath to the created .rds file containing all the information
#' for model fitting, including a standardized X and model design information
#'
create_design_filebacked <- function(data_file,
rds_dir,
obj,
new_file,
feature_id = NULL,
add_outcome,
outcome_id,
outcome_col,
na_outcome_vals = c(-9, NA_integer_),
add_predictor = NULL,
predictor_id = NULL,
unpen = NULL,
logfile = NULL,
overwrite = FALSE,
quiet = FALSE){
# check for input errors ----------------------------------------
if (any(add_outcome[,outcome_col] %in% na_outcome_vals)) {
stop('It appears that you have some missing values in the outcome data.
Please remove these samples; missing values are not permitted in the design.')
}
if (is.null(colnames(add_outcome))) {
stop('The columns of "add_outcome" must be named.')
}
if (grepl(pattern = 'fold', x = new_file)) {
warning("The string 'fold' is a keyword that is used to create intermediate files in cv_plmm().
If you call cv_plmm() on this design, there is a big possiblity that you will lose files unintentionally.
I recommend you either (1) choose a different 'new_file' name (best option) or (2)
double check that the folder where you will save your results from
downstream analysis is not the same folder where you are saving this design.")
}
# additional checks for case where add_predictor is specified
if (!is.null(add_predictor)) {
if (is.null(predictor_id)) {
stop("If add_predictor is specified, the user must also specify predictor_id")
}
if (is.null(colnames(add_predictor))){
stop('The columns of "add_predictor" must be named.')
}
if (any(is.na(add_predictor[,]))) {
stop('It appears that there are missing values in the predictor data.
Please remove these samples; missing values are not permitted in the design.')
}
if (!identical(add_outcome[,outcome_id], add_predictor[,predictor_id])) {
stop("Something is off in the supplied outcome and/or predictor data.
Make sure the indicated ID columns are character type, represent the same samples, and have the same order.
Note: 'create_design()' will align the two external data items (outcome and predictors) with
the PLINK data. It is your responsibility to align the external data items with each other.\n")
}
}
# initial setup --------------------------------------------------
existing_files <- list.files(rds_dir)
if(!is.null(logfile)){
logfile = file.path(rds_dir, logfile)
}
logfile <- create_log(outfile = logfile)
# create list to be returned
design <- list()
# check for files to be overwritten---------------------------------
if (overwrite){
# remove files with name pattern
to_remove <- paste0(file.path(rds_dir, new_file), c('.bk', '.rds', '.desc'))
if (any(file.exists(to_remove))) {
file.remove(to_remove)
}
# check for left over intermediate files
if (file.exists(file.path(rds_dir,'unstd_design_matrix.bk'))) {
file.remove(c(file.path(rds_dir,'unstd_design_matrix.bk'),
file.path(rds_dir,'unstd_design_matrix.desc')))
}
}
# attach the processed data -------------------------------
obj$X <- bigmemory::attach.big.matrix(obj$X)
# flag for data coming from plink
is_plink <- inherits(obj, "processed_plink")
# determine which IDs to use ---------------------------------
if (is.null(feature_id)){
if(!quiet) cat("No feature_id supplied; will assume data X are in same row-order as add_outcome.\n")
cat("No feature_id supplied; will assume data X are in same row-order as add_outcome.\n",
file = logfile, append = TRUE)
} else if (length(feature_id) == 1) {
if (feature_id == "IID"){
indiv_id <- "sample.ID"
og_ids <- obj$fam[,indiv_id] |> as.character()
} else if (feature_id == "FID"){
indiv_id <- "family.ID"
og_ids <- obj$fam[,indiv_id] |> as.character()
}
} else if (length(feature_id) == nrow(obj$X)) {
if (!inherits(feature_id, "character")) feature_id <- as.character(feature_id)
og_ids <- feature_id
} else {
stop("The feature_id argument is misspecified (see documentation for options).")
}
# check for any duplicated row IDs, if applicable
# TODO: update this syntax not to use 'exists()'
if (exists('og_ids')) {
if (any(duplicated(og_ids))) stop("Duplicated feature_id values detected.\n")
}
# save these original dim names
if (is_plink){
design$X_colnames <- obj$colnames <- obj$map$marker.ID
} else {
design$X_colnames <- colnames(obj$X)
}
if (exists('og_ids')) {
design$X_rownames <- og_ids
} else {
design$X_rownames <- paste0('row', 1:nrow(obj$X))
}
# save colnames of add_predictor (if supplied)
if (is.null(colnames(add_outcome))) {
stop("The matrix supplied to add_outcome must have column names.")
}
# save original dimensions
design$n <- obj$n
design$p <- obj$p # Note: p = # of features, not including any additional predictors!
# note whether data are from PLINK
design$is_plink <- is_plink
# index samples for subsetting ------------
# Note: this step uses the outcome (from external file) to determine which
# samples/observations should be pruned out; observations with no feature
# data will be removed from analysis
if (exists('og_ids')) {
sample_idx <- index_samples(obj = obj,
rds_dir = rds_dir,
indiv_id = og_ids,
add_outcome = add_outcome,
outcome_id = outcome_id,
outcome_col = outcome_col,
na_outcome_vals = na_outcome_vals,
outfile = logfile,
quiet = quiet)
# save items to return
design$outcome_idx <- sample_idx$outcome_idx # save indices of which rows in the feature data should be included in the design
design$y <- sample_idx$complete_samples[,outcome_col, with = FALSE] |> unlist()
design$std_X_rownames <- sample_idx$complete_samples$ID
} else {
# save items to return
design$outcome_idx <- 1:nrow(obj$X)
design$y <- unlist(add_outcome[,outcome_col])
design$std_X_rownames <- design$X_rownames
}
gc()
# align IDs between feature data and external data -------------------------
if (!is.null(add_predictor)) {
if (is.null(predictor_id)) stop('If add_predictor is supplied, the predictor_id argument must also be supplied')
aligned_add_predictor <- align_ids(id_var = predictor_id,
quiet = quiet,
add_predictor = add_predictor,
og_ids = og_ids)
gc()
# add predictors from external files --------------------------------------
unstd_X <- add_predictors(obj = obj,
add_predictor = aligned_add_predictor,
id_var = feature_id,
rds_dir = rds_dir,
quiet = quiet)
# save items to return
design$unpen <- unstd_X$unpen # save indices for unpenalized covariates
design$unpen_colnames <- setdiff(colnames(add_predictor), predictor_id)
gc()
} else {
# make sure 'unpen' was not specified by mistake
if (is_plink & !is_plink) stop("The 'unpen' argument is only for matrix data or delimited file data.
To create unpenalized covariates with PLINK file data,
see the documentation for the 'add_predictor' argument.")
if (is.null(unpen)) {
design$unpen <- NULL
design$unpen_colnames <- NULL
} else {
design$unpen <- which(design$X_colnames %in% unpen) # this will be used to index the columns which are unpenalized
design$unpen_colnames <- unpen
}
unstd_X <- obj
unstd_X$design_matrix <- obj$X
unstd_X$colnames <- design$X_colnames
}
if (is_plink){
design$fam <- unstd_X$obj$fam
design$map <- unstd_X$obj$map
}
# again, clean up to save space
rm(obj)
# index features for subsetting --------------------------------------------
design$ns <- count_constant_features(fbm = unstd_X$design_matrix,
outfile = logfile,
quiet = quiet)
gc()
# subsetting -----------------------------------------------------------------
subset_res <- subset_filebacked(X = unstd_X$design_matrix,
complete_samples = design$outcome_idx,
ns = design$ns,
rds_dir = rds_dir,
new_file = new_file,
outfile = logfile,
quiet = quiet)
# clean up
design$ns <- subset_res$ns
design$std_X_colnames <- unstd_X$colnames[subset_res$ns]
rm(unstd_X)
gc()
# standardization ------------------------------------------------------------
std_res <- standardize_filebacked(X = subset_res$subset_X,
new_file = new_file,
rds_dir = rds_dir,
outfile = logfile,
quiet = quiet,
overwrite = overwrite)
rm(subset_res)
gc()
# add meta data -------------------------------------------------------------
design$std_X <- std_res$std_X
design$std_X_n <- std_res$std_X_n
design$std_X_p <- std_res$std_X_p
design$std_X_center <- std_res$std_X_center
design$std_X_scale <- std_res$std_X_scale
design$penalty_factor <- c(rep(0, length(design$unpen)),
rep(1, design$std_X_p - length(design$unpen)))
#TODO: future work can add nuance to the way penalty.factor options are given
# below is one idea to hold onto...
# if penalty factor has 0s, these must be contiguous
# (necessary for subsetting later -- see setup_lambda(), for example)
# if (any(penalty_factor < 1e-8)){
# pf <- which(penalty_factor < 1e-8)
# if (!identical(pf, 1:length(pf))) {
# stop("It looks like you are trying to make some covariates *not* be penalized, as
# you have some penalty_factor values set to 0.\n
# If this is your intention, you must make all 'n' unpenalized covariates appear
# as the first 'n' columns in your design matrix.\n This is needed for subsetting later
# (for now, subsetting filebacked matrices requires a contiguous submatrix).\n")
# }
# }
# cleanup -------------------------------------------------------------------
list.files(rds_dir,
pattern=paste0('^unstd_design.*'),
full.names=TRUE) |> file.remove()
gc()
# return -------------------------------------------------------------
saveRDS(structure(design, class = "plmm_design"),
file.path(rds_dir, paste0(new_file, ".rds")))
return(file.path(rds_dir, paste0(new_file, ".rds")))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.