#######################################
## Functions to train a MOFA model ##
#######################################
#' @title Train a MOFA model
#' @name run_mofa
#' @description Function to train an untrained \code{\link{MOFA}} object.
#' @details This function is called once a MOFA object has been prepared (using \code{\link{prepare_mofa}})
#' In this step the R package calls the \code{mofapy2} Python package, where model training is performed. \cr
#' The interface with Python is done with the \code{\link{reticulate}} package.
#' If you have several versions of Python installed and R is not detecting the correct one,
#' you can change it using \code{reticulate::use_python} when loading the R session.
#' Alternatively, you can let us install mofapy2 for you using \code{basilisk} if you set use_basilisk to \code{TRUE}
#' @param object an untrained \code{\link{MOFA}} object
#' @param save_data logical indicating whether to save the training data in the hdf5 file.
#' This is useful for some downstream analysis (mainly functions with the prefix \code{plot_data}), but it can take a lot of disk space.
#' @param outfile output file for the model (.hdf5 format). If \code{NULL}, a temporary file is created.
#' @param use_basilisk use \code{basilisk} to automatically install a conda environment with mofapy2 and all dependencies?
#' If \code{FALSE} (default), you should specify the right python binary when loading R with \code{reticulate::use_python(..., force=TRUE)}
#' or the right conda environment with \code{reticulate::use_condaenv(..., force=TRUE)}.
#' @return a trained \code{\link{MOFA}} object
#' @import reticulate
#' @import basilisk
#' @export
#' @examples
#' # Load data (in data.frame format)
#' file <- system.file("extdata", "test_data.RData", package = "MOFA2")
#' load(file)
#'
#' # Create the MOFA object
#' MOFAmodel <- create_mofa(dt)
#'
#' # Prepare the MOFA object with default options
#' MOFAmodel <- prepare_mofa(MOFAmodel)
#'
#' # Run the MOFA model
#' \dontrun{ MOFAmodel <- run_mofa(MOFAmodel, use_basilisk = TRUE) }
run_mofa <- function(object, outfile = NULL, save_data = TRUE, use_basilisk = FALSE) {
# Sanity checks
if (!is(object, "MOFA"))
stop("'object' has to be an instance of MOFA")
if (object@status=="trained")
stop("The model is already trained! If you want to retrain, create a new untrained MOFA")
if (length(object@model_options)==0 | length(object@training_options)==0) {
stop("The model is not prepared for training, you have to run `prepare_mofa` before `run_mofa`")
}
# If no outfile is provided, store a file in a temporary folder with the respective timestamp
if (is.null(outfile) || is.na(outfile) || (outfile == "")) {
outfile <- object@training_options$outfile
if (is.null(outfile) || is.na(outfile) || (outfile == "")) {
outfile <- file.path(tempdir(), paste0("mofa_", format(Sys.time(), format = "%Y%m%d-%H%M%S"), ".hdf5"))
warning(paste0("No output filename provided. Using ", outfile, " to store the trained model.\n\n"))
}
}
if (file.exists(outfile))
message(paste0("Warning: Output file ", outfile, " already exists, it will be replaced"))
# Connect to mofapy2 using reticulate (default)
if (!use_basilisk) {
message("Connecting to the mofapy2 python package using reticulate (use_basilisk = FALSE)...
Please make sure to manually specify the right python binary when loading R with reticulate::use_python(..., force=TRUE) or the right conda environment with reticulate::use_condaenv(..., force=TRUE)
If you prefer to let us automatically install a conda environment with 'mofapy2' installed using the 'basilisk' package, please use the argument 'use_basilisk = TRUE'\n")
# Sanity checks
have_mofa2 <- py_module_available("mofapy2")
if (have_mofa2) {
mofa <- import("mofapy2")
tryCatch(tmp <- strsplit(mofa$version$`__version__`,"\\.")[[1]], error = function(e) { stop(sprintf("mofapy2 is not detected in the specified python binary, see reticulate::py_config(). Consider setting use_basilisk = TRUE to create a python environment with basilisk (https://bioconductor.org/packages/release/bioc/html/basilisk.html)")) })
v_major_reticulate = tmp[1]; v_minor_reticulate = tmp[2]; v_patch_reticulate = tmp[3]
tmp <- strsplit(.mofapy2_version,"\\.")[[1]]
v_major_pypi = tmp[1]; v_minor_pypi = tmp[2]; v_patch_pypi = tmp[3]
# return error if major or minor versions do not agree
if ((v_major_reticulate!=v_major_pypi) | (v_minor_reticulate!=v_minor_pypi)) {
warning(sprintf("The latest mofapy2 version is %s, you are using %s. Please upgrade with 'pip install mofapy2'",.mofapy2_version, mofa$version$`__version__`))
warning("Connecting to the latest mofapy2 python package using reticulate (use_basilisk = FALSE)")
have_mofa2 <- FALSE
}
# return warning if patch versions do not agree
if (v_patch_reticulate!=v_patch_pypi) {
warning(sprintf("The latest mofapy2 version is %s, you are using %s. Please upgrade with 'pip install mofapy2'",.mofapy2_version, mofa$version$`__version__`))
}
}
if (have_mofa2) {
.run_mofa_reticulate(object, outfile, save_data)
} else {
stop(sprintf("mofapy2_%s is not detected in the specified python binary, see reticulate::py_config(). Consider setting use_basilisk = TRUE to create a python environment with basilisk (https://bioconductor.org/packages/release/bioc/html/basilisk.html)", .mofapy2_version))
# use_basilisk <- TRUE
}
}
# Connect to mofapy2 using basilisk (optional)
if (use_basilisk) {
message("Connecting to the mofapy2 package using basilisk.
Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.")
proc <- basiliskStart(mofa_env)
on.exit(basiliskStop(proc))
tmp <- basiliskRun(proc, function(object, outfile, save_data) {
.run_mofa_reticulate(object, outfile, save_data)
}, object=object, outfile=outfile, save_data=save_data)
}
# Load the trained model
object <- load_model(outfile)
return(object)
}
.run_mofa_reticulate <- function(object, outfile, save_data) {
# sanity checks
if (!is(object, "MOFA")) stop("'object' has to be an instance of MOFA")
if (!requireNamespace("reticulate", quietly = TRUE)) {
stop("Package \"reticulate\" is required but is not installed.", call. = FALSE)
}
# Initiate reticulate
mofa <- import("mofapy2")
# Call entry point
mofa_entrypoint <- mofa$run.entry_point$entry_point()
# Set data options
mofa_entrypoint$set_data_options(
scale_views = object@data_options$scale_views,
scale_groups = object@data_options$scale_groups,
center_groups = object@data_options$center_groups,
use_float32 = object@data_options$use_float32
)
# Set samples metadata
if (.hasSlot(object, "samples_metadata")) {
mofa_entrypoint$data_opts$samples_metadata <- r_to_py(lapply(object@data_options$groups,
function(g) object@samples_metadata[object@samples_metadata$group == g,]))
}
# Set features metadata
if (.hasSlot(object, "features_metadata")) {
mofa_entrypoint$data_opts$features_metadata <- r_to_py(unname(lapply(object@data_options$views,
function(m) object@features_metadata[object@features_metadata$view == m,])))
}
# r_to_py will convert a list with a single name to a string,
# hence those are to be wrapped in `list()`
maybe_list <- function(xs) {
if (length(xs) > 1) {
xs
} else {
list(xs)
}
}
# Set the data
mofa_entrypoint$set_data_matrix(
data = r_to_py( unname(lapply(object@data, function(x) unname( lapply(x, function(y) r_to_py(t(y)) ))) ) ),
likelihoods = unname(object@model_options$likelihoods),
views_names = r_to_py(as.list(object@data_options$views)),
groups_names = r_to_py(as.list(object@data_options$groups)),
samples_names = r_to_py(lapply(unname(lapply(object@data[[1]], colnames)), maybe_list)),
features_names = r_to_py(lapply(unname(lapply(object@data, function(x) rownames(x[[1]]))), maybe_list))
)
# Set covariates
if (.hasSlot(object, "covariates") && !is.null(object@covariates)) {
sample_cov_to_py <- r_to_py(unname(lapply(object@covariates, function(x) unname(r_to_py(t(x))))))
cov_names_2_py <- r_to_py(covariates_names(object))
mofa_entrypoint$set_covariates(sample_cov_to_py, cov_names_2_py)
}
# Set model options
mofa_entrypoint$set_model_options(
factors = object@model_options$num_factors,
spikeslab_factors = object@model_options$spikeslab_factors,
spikeslab_weights = object@model_options$spikeslab_weights,
ard_factors = object@model_options$ard_factors,
ard_weights = object@model_options$ard_weights
)
# Set training options
mofa_entrypoint$set_train_options(
iter = object@training_options$maxiter,
convergence_mode = object@training_options$convergence_mode,
dropR2 = object@training_options$drop_factor_threshold,
startELBO = object@training_options$startELBO,
freqELBO = object@training_options$freqELBO,
seed = object@training_options$seed,
gpu_mode = object@training_options$gpu_mode,
gpu_device = object@training_options$gpu_device,
verbose = object@training_options$verbose,
outfile = object@training_options$outfile,
weight_views = object@training_options$weight_views,
save_interrupted = object@training_options$save_interrupted
)
# Set stochastic options
if (object@training_options$stochastic) {
mofa_entrypoint$set_stochastic_options(
learning_rate = object@stochastic_options$learning_rate,
forgetting_rate = object@stochastic_options$forgetting_rate,
batch_size = object@stochastic_options$batch_size,
start_stochastic = object@stochastic_options$start_stochastic
)
}
# Set mefisto options
if (.hasSlot(object, "covariates") && !is.null(object@covariates) & length(object@mefisto_options)>1) {
warping_ref <- which(groups_names(object) == object@mefisto_options$warping_ref)
mofa_entrypoint$set_smooth_options(
scale_cov = object@mefisto_options$scale_cov,
start_opt = as.integer(object@mefisto_options$start_opt),
n_grid = as.integer(object@mefisto_options$n_grid),
opt_freq = as.integer(object@mefisto_options$opt_freq),
model_groups = object@mefisto_options$model_groups,
sparseGP = object@mefisto_options$sparseGP,
frac_inducing = object@mefisto_options$frac_inducing,
warping = object@mefisto_options$warping,
warping_freq = as.integer(object@mefisto_options$warping_freq),
warping_ref = warping_ref-1, # 0-based python indexing
warping_open_begin = object@mefisto_options$warping_open_begin,
warping_open_end = object@mefisto_options$warping_open_end,
warping_groups = r_to_py(object@mefisto_options$warping_groups)
)
}
# Build the model
mofa_entrypoint$build()
# Run the model
mofa_entrypoint$run()
# Interpolate
if (.hasSlot(object, "covariates") && !is.null(object@covariates) & length(object@mefisto_options)>1) {
if(!is.null(object@mefisto_options$new_values)) {
new_values <- object@mefisto_options$new_values
if(is.null(dim(new_values))){
new_values <- matrix(new_values, nrow = 1)
}
mofa_entrypoint$predict_factor(new_covariates = r_to_py(t(new_values)))
}
}
# Save the model output as an hdf5 file
mofa_entrypoint$save(outfile, save_data = save_data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.