R/subset.R

Defines functions subset_features subset_samples subset_factors subset_views subset_groups

Documented in subset_factors subset_features subset_groups subset_samples subset_views

################################
## Functions to do subsetting ##
################################

#' @title Subset groups
#' @name subset_groups
#' @description Method to subset (or sort) groups
#' @param object a \code{\link{MOFA}} object.
#' @param groups character vector with the groups names, numeric vector with the groups indices
#' or logical vector with the groups to be kept as TRUE.
#' @return A \code{\link{MOFA}} object
#' @export
#' @examples
#' # Using an existing trained model on simulated data
#' file <- system.file("extdata", "model.hdf5", package = "MOFA2")
#' model <- load_model(file)
#' 
#' # Subset the first group
#' model <- subset_groups(model, groups = 1)
subset_groups <- function(object, groups) {
  
  # Sanity checks
  if (!is(object, "MOFA")) stop("'object' has to be an instance of MOFA")
  stopifnot(length(groups) <= object@dimensions[["G"]])
  
  # Define groups    
  groups <- .check_and_get_groups(object, groups)
  
  # Subset expectations
  if (length(object@expectations)>0) {
    if ("Z" %in% names(object@expectations) & length(object@expectations$Z)>0)
      object@expectations$Z <- object@expectations$Z[groups]
    if ("Y" %in% names(object@expectations) & length(object@expectations$Y)>0)
      object@expectations$Y <- sapply(object@expectations$Y, function(x) x[groups], simplify = FALSE, USE.NAMES = TRUE) 
  }
  
  # Subset data
  if (length(object@data)>0) {
    object@data <- sapply(object@data, function(x) x[groups], simplify = FALSE, USE.NAMES = TRUE) 
  }

  # Subset imputed data
  if (length(object@imputed_data)>0) {
    object@imputed_data <- sapply(object@imputed_data, function(x) x[groups], simplify = FALSE, USE.NAMES = TRUE) 
  }
  
  # Subset intercepts
  if (length(object@intercepts[[1]])>0) {
    object@intercepts <- sapply(object@intercepts, function(x) x[groups], simplify = FALSE, USE.NAMES = TRUE) 
  }
  
  # Update dimensionality
  object@dimensions[["G"]] <- length(groups)
  object@dimensions[["N"]] <- object@dimensions[["N"]][groups]
  
  # Subset sample metadata
  stopifnot(groups%in%unique(object@samples_metadata$group))
  object@samples_metadata <- object@samples_metadata[object@samples_metadata$group %in% groups,]
  object@samples_metadata$group <- factor(object@samples_metadata$group, levels=groups)
  
  # Re-order samples
  samples <- unname(unlist(lapply(object@data[[1]],colnames)))
  object@samples_metadata <- object@samples_metadata[match(samples, object@samples_metadata$sample),]
  
  # Sanity checks
  stopifnot(object@samples_metadata$sample == unlist(lapply(object@data[[1]],colnames)))
  stopifnot(object@samples_metadata$sample == unlist(lapply(object@expectations$Z,rownames)))
  stopifnot(object@samples_metadata$sample == unlist(lapply(object@expectations$Y,colnames)))
  
  # Update groups names
  # groups_names(object) <- groups # don't need to run this
  object@data_options$groups <- groups
  
  # Subset variance explained
  object@cache[["variance_explained"]]$r2_per_factor <- object@cache[["variance_explained"]]$r2_per_factor[groups]
  object@cache[["variance_explained"]]$r2_total <- object@cache[["variance_explained"]]$r2_total[groups]
  
  return(object)
}


#' @title Subset views
#' @name subset_views
#' @description Method to subset (or sort) views
#' @param object a \code{\link{MOFA}} object.
#' @param views character vector with the views names, numeric vector with the views indices,
#' or logical vector with the views to be kept as TRUE.
#' @return A \code{\link{MOFA}} object
#' @export
#' @examples
#' # Using an existing trained model on simulated data
#' file <- system.file("extdata", "model.hdf5", package = "MOFA2")
#' model <- load_model(file)
#' 
#' # Subset the first view
#' model <- subset_views(model, views = 1)
subset_views <- function(object, views) {
  
  # Sanity checks
  if (!is(object, "MOFA")) stop("'object' has to be an instance of MOFA")
  stopifnot(length(views) <= object@dimensions[["M"]])
  # warning("Removing views a posteriori is fine for an exploratory analysis, but you should removing them before training!")
  
  # Define views
  views <- .check_and_get_views(object, views)
  
  # Subset relevant slots
  if (length(object@expectations)>0) {
    object@expectations$W <- object@expectations$W[views]
    object@expectations$Y <- object@expectations$Y[views]
  }

  # Subset data
  if (length(object@data)>0) {
    object@data <- object@data[views]
  }
  
  # Subset imputed data
  if (length(object@imputed_data)>0) {
    object@imputed_data <- object@imputed_data[views]
  }
  
  # Subset intercepts
  if (length(object@intercepts[[1]])>0) {
    object@intercepts <- object@intercepts[views]
  }
  
  # Subset feature metadata
  if (length(object@features_metadata)>0) {
    object@features_metadata <- object@features_metadata[object@features_metadata$view %in% views,]
  }
  
  # Subset likelihoods
  object@model_options$likelihoods <- object@model_options$likelihoods[views]
  # Update dimensionality
  object@dimensions[["M"]] <- length(views)
  object@dimensions[["D"]] <- object@dimensions[["D"]][views]
  
  # Update view names
  object@data_options$views <- views
  
  # Subset variance explained
  if ((methods::.hasSlot(object, "cache")) && ("variance_explained" %in% names(object@cache))) {
    object@cache[["variance_explained"]]$r2_per_factor <- lapply(object@cache[["variance_explained"]]$r2_per_factor, function(x) x[,views,drop=FALSE])
    object@cache[["variance_explained"]]$r2_total <- lapply(object@cache[["variance_explained"]]$r2_total, function(x) x[views])
  }
  
  return(object)
}


#' @title Subset factors
#' @name subset_factors
#' @description Method to subset (or sort) factors
#' @param object a \code{\link{MOFA}} object.
#' @param factors character vector with the factor names, or numeric vector with the index of the factors.
#' @param recalculate_variance_explained logical indicating whether to recalculate variance explained values. Default is \code{TRUE}.
#' @export
#' @return A \code{\link{MOFA}} object
#' @examples
#' # Using an existing trained model on simulated data
#' file <- system.file("extdata", "model.hdf5", package = "MOFA2")
#' model <- load_model(file)
#' 
#' # Subset factors 1 to 3
#' model <- subset_factors(model, factors = 1:3)
subset_factors <- function(object, factors, recalculate_variance_explained = TRUE) {
  
  # Sanity checks
  if (!is(object, "MOFA")) stop("'object' has to be an instance of MOFA")
  stopifnot(length(factors) <= object@dimensions[["K"]])
  
  # Define factors
  factors <- .check_and_get_factors(object, factors)
  
  # Subset expectations
  nodes_with_factors <- list(nodes = c("Z", "W", "Sigma", "AlphaZ", "AlphaW", "ThetaZ", "ThetaW"), axes = c(2, 2, 1, 0, 0, 0, 0))
  stopifnot(all(nodes_with_factors$axes %in% c(0, 1, 2)))

  if (length(object@expectations)>0) {
    for (i in seq_len(length(nodes_with_factors$nodes))) {
      node <- nodes_with_factors$nodes[i]
      axis <- nodes_with_factors$axes[i]
      if (node %in% names(object@expectations)) {
        if(node != "Sigma") {
        if (axis == 1) {
          object@expectations[[node]] <- sapply(object@expectations[[node]], function(x) x[factors,,drop=FALSE], simplify = FALSE, USE.NAMES = TRUE)
        } else if (axis == 2) {
          object@expectations[[node]] <- sapply(object@expectations[[node]], function(x) x[,factors,drop=FALSE], simplify = FALSE, USE.NAMES = TRUE)
        } else {
          object@expectations[[node]] <- sapply(object@expectations[[node]], function(x) x[factors], simplify = FALSE, USE.NAMES = TRUE)
        }
        } else {
          if (axis == 1) {
            object@expectations[[node]] <- sapply(object@expectations[[node]], function(x) x[factors,,,drop=FALSE], simplify = FALSE, USE.NAMES = TRUE)
          } else if (axis == 2) {
            object@expectations[[node]] <- sapply(object@expectations[[node]], function(x) x[,factors,,drop=FALSE], simplify = FALSE, USE.NAMES = TRUE)
          } else if (axis == 3) {
            object@expectations[[node]] <- sapply(object@expectations[[node]], function(x) x[,,factors,drop=FALSE], simplify = FALSE, USE.NAMES = TRUE)
          } 
      }
    }
    }
  }
  
  # Subset interpolations
  if(length(object@interpolated_Z) > 0) {
    object@interpolated_Z <- lapply(object@interpolated_Z, function(g) { 
      if(!is.null(g$mean)) {
        m <- g$mean[factors, , drop = FALSE]
      }
      if(!is.null(g$variance)) {
        v <- g$variance[factors, , drop = FALSE]
      }
      list(mean = m, variance = v, new_values = g$new_values)
    })
  }
  
  # Remove total variance explained estimates  
  if (length(factors)<object@dimensions[["K"]]) {
    object@cache[["variance_explained"]]$r2_total <- lapply(object@cache[["variance_explained"]]$r2_per_factor, colSums)
    if (recalculate_variance_explained) {
      message("Recalculating total variance explained values (r2_total)...")
      object@cache[["variance_explained"]]$r2_total <- calculate_variance_explained(object)[["r2_total"]]
    }
  }
  
  # # Relalculate total variance explained estimates (not valid for non-orthogonal factors)
  # if (length(factors) < object@dimensions[["K"]]) {
  #   object@cache[["variance_explained"]]$r2_total <- lapply(object@cache[["variance_explained"]]$r2_per_factor, colSums)
  
  # Subset per-factor variance explained estimates
  if ((methods::.hasSlot(object, "cache")) && ("variance_explained" %in% names(object@cache))) {
    object@cache[["variance_explained"]]$r2_per_factor <- lapply(object@cache[["variance_explained"]]$r2_per_factor, function(x) x[factors,,drop=FALSE])
  }
  
  # Subset lengthscales per factor
  if (!is.null(object@training_stats$structural_sig)) {
    object@training_stats$structural_sig <- object@training_stats$structural_sig[factors,drop=FALSE]
  }
  if (!is.null(object@training_stats$length_scales)) {
    object@training_stats$length_scales <- object@training_stats$length_scales[factors,drop=FALSE]
  }
  if (!is.null(object@training_stats$scales)) {
    object@training_stats$scales <- object@training_stats$scales[factors,drop=FALSE]
  }
  
  # Update dimensionality
  object@dimensions[["K"]] <- length(factors)
  
  # Update factor names
  factors_names(object) <- paste0("Factor", as.character(seq_len(object@dimensions[["K"]])))
  
  
  return(object)
}



#' @title Subset samples
#' @name subset_samples
#' @description Method to subset (or sort) samples
#' @param object a \code{\link{MOFA}} object.
#' @param samples character vector with the sample names or numeric vector with the sample indices.
#' @export
#' @return A \code{\link{MOFA}} object
#' @examples
#' # Using an existing trained model on simulated data
#' file <- system.file("extdata", "model.hdf5", package = "MOFA2")
#' model <- load_model(file)
#' 
#' # (TO-DO) Remove a specific sample from the model (an outlier)
subset_samples <- function(object, samples) {
  
  # Sanity checks
  if (!is(object, "MOFA")) stop("'object' has to be an instance of MOFA")
  
  # Define samples
  samples <- .check_and_get_samples(object, samples)
  
  # Check if an entire group needs to be removed
  groups <- as.character(unique(object@samples_metadata[match(samples, object@samples_metadata$sample),]$group))
  if (length(groups)<length(groups_names(object))) object <- subset_groups(object, groups)
  
  # Subset data and expectations
  groups <- groups_names(object)
  tmp <- lapply(groups, function(g) samples_names(object)[[g]][samples_names(object)[[g]] %in% samples])
  names(tmp) <- groups
  
  for (g in groups) {
    samples_g <- tmp[[g]]
    
    # Subset expectations
    if (length(object@expectations)>0) {
      if ("Z" %in% names(object@expectations) & length(object@expectations$Z)>0) {
        object@expectations$Z[[g]] <- object@expectations$Z[[g]][samples_g,, drop=FALSE]
      }
      
      if ("Y" %in% names(object@expectations) & length(object@expectations$Y)>0) {
        for (m in views_names(object)) {
          object@expectations$Y[[m]][[g]] <- object@expectations$Y[[m]][[g]][,samples_g,drop=FALSE]
        }  
      }
      if ("Tau" %in% names(object@expectations) & length(object@expectations$Tau)>0) {
        for (m in views_names(object)) {
          object@expectations$Tau[[m]][[g]] <- object@expectations$Tau[[m]][[g]][samples_g, , drop=FALSE]
        }  
      }
    if(g == groups[1]) {# only one Sigma node
      if ("Sigma" %in% names(object@expectations) & length(object@expectations$Sigma)>0) {
        samples <- unique(unlist(tmp)) # TODO - make Sigma live on covariate level or expand to group-level
        object@expectations$Sigma[[1]] <- object@expectations$Sigma[[1]][,samples,samples,drop=FALSE]
      }
    }
    }

    # Subset data
    if (length(object@data)>0) { 
      for (m in views_names(object)) {
        object@data[[m]][[g]] <- object@data[[m]][[g]][,samples_g,drop=FALSE]
      }
    }
    
    # Subset imputed data
    if (length(object@imputed_data)>0) { 
      for (m in views_names(object)) {
        object@imputed_data[[m]][[g]] <- object@imputed_data[[m]][[g]][,samples_g,drop=FALSE]
      }
    }
    
  }

  # Subset sample metadata
  object@samples_metadata <- object@samples_metadata[object@samples_metadata$sample %in% samples,]
  
  # Update dimensionality
  object@dimensions[["N"]] <- sapply(tmp, length)
  
  # Update sample names
  samples_names(object) <- tmp
  
  # Sanity checks
  stopifnot(object@samples_metadata$sample == unlist(lapply(object@data[[1]],colnames)))
  stopifnot(object@samples_metadata$sample == unlist(lapply(object@expectations$Z,rownames)))
  stopifnot(object@samples_metadata$sample == unlist(lapply(object@expectations$Y,colnames)))
  
  # Remove variance explained estimates  
  warning("After subsetting the samples the variance explained estimates are not valid anymore, removing them...")
  object@cache[["variance_explained"]] <- NULL
  
  return(object)
}


#' @title Subset features
#' @name subset_features
#' @description Method to subset (or sort) features
#' @param object a \code{\link{MOFA}} object.
#' @param view character vector with the view name or integer with the view index
#' @param features character vector with the sample names, numeric vector with the feature indices 
#' or logical vector with the samples to be kept as TRUE.
#' @return A \code{\link{MOFA}} object
#' @export

subset_features <- function(object, view, features) {
  
  # Sanity checks
  if (!is(object, "MOFA")) stop("'object' has to be an instance of MOFA")
  stopifnot(length(features) <= sapply(object@dimensions[["D"]], sum))
  warning("Removing features a posteriori is fine for an exploratory analysis, but we recommend removing them before training!")

  if (is.numeric(view)) view <- views_names(object)[view]
  stopifnot(all(view %in% views_names(object)))  

  # Define features
  if (is.character(features)) {
    stopifnot(all(features %in% features_names(object)[[view]]))
  } else {
    features <- features_names(object)[[view]][features]
  }
  
  # Subset relevant slots
  if (length(object@expectations)>0) {
    if ("W" %in% names(object@expectations) & length(object@expectations$W)>0)
      object@expectations$W <- lapply(object@expectations$W, function(x) x[features,, drop=FALSE])
    if ("Y" %in% names(object@expectations) & length(object@expectations$Y)>0)
      object@expectations$Y[[view]] <- lapply(object@expectations$Y[[view]], function(x) x[features,])

  if (length(object@data)>0)
    object@data <- lapply(object@data, function(x) sapply(x, function(y) y[features,], simplify = FALSE, USE.NAMES = TRUE))

  if (length(object@expectations)>0)
  object@intercepts <- lapply(object@intercepts, function(x) sapply(x, function(y) y[features], simplify = FALSE, USE.NAMES = TRUE))

  if (length(object@imputed_data) != 0) {
    stop()
    # object@imputed_data <- lapply(object@imputed_data, function(x) sapply(x, function(y) y[,samples], simplify = FALSE, USE.NAMES = TRUE)) 
    }
  }
  # Update dimensionality
  object@dimensions[["D"]][[view]] <- length(features)
  
  # Update features names
  features_names(object)[[view]] <- features
  
  # Remove variance explained estimates  
  warning("After subsetting the features the variance explained estimates are not valid anymore, removing them...")
  object@cache[["variance_explained"]] <- NULL
  
  return(object)
}
bioFAM/MOFA2 documentation built on Feb. 1, 2024, 6:41 a.m.