R/utils.R

Defines functions .merge_features .parse_taxonomy .set_feature_tab_dimnames .add_values_to_metadata .add_to_coldata .add_values_to_colData .check_and_get_altExp .check_MARGIN .check_altExp_present .check_colTree_present .check_rowTree_present .check_assay_present .get_name_in_parent .all_are_existing_files .is_function .is_numeric_string .are_whole_numbers .is_an_integer .is_a_string .is_non_empty_string .is_non_empty_character .is_a_bool .require_package

################################################################################
# internal methods loaded from other packages

.get_mat_from_sce <- scater:::.get_mat_from_sce
.get_mat_for_reddim <- scater:::.get_mat_for_reddim

################################################################################
# integration with other packages

.require_package <- function(pkg){
    if(!requireNamespace(pkg, quietly = TRUE)){
    stop("'",pkg,"' package not found. Please install the '",pkg,"' package ",
         "to use this function.", call. = FALSE)
    }
}

################################################################################
# testing

.is_a_bool <- function(x){
  is.logical(x) && length(x) == 1L && !is.na(x)
}

.is_non_empty_character <- function(x){
  is.character(x) && all(nzchar(x))
}

.is_non_empty_string <- function(x){
  .is_non_empty_character(x) && length(x) == 1L
}

.is_a_string <- function(x){
  is.character(x) && length(x) == 1L
}

.is_an_integer <- function(x){
    is.numeric(x) && length(x) == 1L && x%%1==0
}

.are_whole_numbers <- function(x){
  tol <- 100 * .Machine$double.eps
  abs(x - round(x)) <= tol && !is.infinite(x)
}

.is_numeric_string <- function(x){
  x <- as.character(x)
  suppressWarnings({x <- as.numeric(x)})
  !is.na(x)
}

.is_function <- function(x){
  typeof(x) == "closure" && is(x, "function")
}

.all_are_existing_files <- function(x){
  all(file.exists(x))
}

.get_name_in_parent <- function(x) {
  .safe_deparse(do.call(substitute, list(substitute(x), parent.frame())))
}

.safe_deparse <- function (expr, ...) {
  paste0(deparse(expr, width.cutoff = 500L, ...), collapse = "")
}

################################################################################
# checks

#' @importFrom SummarizedExperiment assays
.check_assay_present <- function(assay.type, x,
                                 name = .get_name_in_parent(assay.type)){
    if(!.is_non_empty_string(assay.type)){
        stop("'",name,"' must be a single non-empty character value.",
             call. = FALSE)
    }
    if(!(assay.type %in% names(assays(x)))){
        stop("'",name,"' must be a valid name of assays(x)", call. = FALSE)
    }
}

.check_rowTree_present <- function(tree_name, x,
                                   name = .get_name_in_parent(tree_name) ){
    if( !.is_non_empty_string(tree_name) ){
        stop("'", name, "' must be a single non-empty character value.",
             call. = FALSE)
    }
    if( !(tree_name %in% names(x@rowTree)) ){
        stop("'", name, "' must specify a tree from 'x@rowTree'.",
             call. = FALSE)
    }
}

.check_colTree_present <- function(tree_name, x,
                                   name = .get_name_in_parent(tree_name) ){
    if( !.is_non_empty_string(tree_name) ){
        stop("'", name, "' must be a single non-empty character value.",
             call. = FALSE)
    }
    if( !(tree_name %in% names(x@colTree)) ){
        stop("'", name, "' must specify a tree from 'x@colTree'.",
             call. = FALSE)
    }
}

# Check if alternative experiment can be found from altExp slot.
.check_altExp_present <- function(
        altexp, tse, altExpName = .get_name_in_parent(altexp),
        tse_name = .get_name_in_parent(tse), .disable.altexp = FALSE, ...){
    # Disable altExp if specified
    if( !.is_a_bool(.disable.altexp) ){
        stop("'.disable.altexp' must be TRUE or FALSE.", call. = FALSE)
    }
    if( .disable.altexp ){
        altexp <- NULL
    }
    # Check that altexp.name must be an integer or name
    if( !(.is_a_string(altexp) || .is_an_integer(altexp) || is.null(altexp)) ){
        stop(
            "'", altExpName, "' must be a string or an integer.", call. = FALSE)
    }
    # If is not NULL, but the object does not have altExp slot
    if( !is.null(altexp) && !is(tse, "SingleCellExperiment") ){
        stop(
            "'", altExpName, "', is specified but '", tse_name, "' does not ",
            "have altExp slot.", call. = FALSE)
    }
    # Then check that altExp can be found; name or index.
    if( !is.null(altexp) && !altexp %in% c(
            altExpNames(tse), seq_len(length(altExps(tse)))) ){
        stop(
          "'", altExpName, "', does not specify an experiment from altExp ",
          "slot of '", tse_name, "'.", call. = FALSE)
      }
}

# Check MARGIN parameters. Should be defining rows or columns.
.check_MARGIN <- function(MARGIN) {
    # Convert to lowcase if it is a string
    if( .is_non_empty_string(MARGIN) ) {
        MARGIN <- tolower(MARGIN)
    }
    # MARGIN must be one of the following options
    if( !(length(MARGIN) == 1L && MARGIN %in% c(
            1, 2, "1", "2", "features", "samples", "columns", "col", "row",
            "rows", "cols")) ) {
        stop("'MARGIN' must equal 1 or 2.", call. = FALSE)
    }
    # Convert MARGIN to numeric if it is not.
    MARGIN <- ifelse(MARGIN %in% c(
        "samples", "columns", "col", 2, "cols"), 2, 1)
    return(MARGIN)
}

################################################################################
# Internal wrappers for getters

# Input: (Tree)SE
# Output: (Tree)SE
.check_and_get_altExp <- function(
        x, altexp = NULL, ...){
    # If altexp is specified, check and get it.
    # Otherwise return the original object
    if( !is.null(altexp) ){
        # Check altexp
        .check_altExp_present(altexp, x, ...)
        # Get altExp and return it
        x <- altExp(x, altexp)
    }
    return(x)
}

################################################################################
# Internal wrappers for setters

#' @importFrom SummarizedExperiment colData colData<- rowData rowData<-
#' @importFrom S4Vectors DataFrame
.add_values_to_colData <- function(
        x, values, name, altexp = NULL, MARGIN = default.MARGIN,
        default.MARGIN = 2, transpose.MARGIN = FALSE, colname = "name",
        ...){
    #
    if( !.is_a_string(colname) ){
        stop("'colname' must be a string.", call. = FALSE)
    }
    #
    # Check if altExp can be found
    .check_altExp_present(altexp, x)
    # Check that MARGIN is correct
    MARGIN <- .check_MARGIN(MARGIN)
    #
    # If trasnpose.MARGIN is TRUE, transpose MARGIN, i.e. 1 --> 2, and 2 --> 1.
    # In certain functions, values calculated by rows (MARGIN=1) are stored to
    # colData (MARGIN=2) and vice versa.
    if( transpose.MARGIN ){
        MARGIN <- ifelse(MARGIN == 1, 2, 1)
    }
    # converts each value:name pair into a DataFrame
    values <- mapply(
        function(value, n){
            value <- DataFrame(value)
            colnames(value)[1L] <- n
            if(ncol(value) > 1L){
                i <- seq.int(2,ncol(value))
                colnames(value)[i] <- paste0(n,"_",colnames(value)[i])
            }
            value
        },
        values,
        name)
    values <- do.call(cbind, values)
    
    # Based on MARGIN, get rowDatra or colData
    FUN <- switch(MARGIN, rowData, colData)
    # If altexp.name was not NULL, then we know that it specifies correctly
    # altExp from the slot. Take the colData/rowData from experiment..
    if( !is.null(altexp) ){
        cd <- FUN( altExp(x, altexp) )
    } else{
        cd <- FUN(x)
    }
    
    # check for duplicated values
    f <- colnames(cd) %in% colnames(values)
    FUN_name <- switch(MARGIN, "rowData", "colData")
    if(any(f)) {
        warning(
            "The following values are already present in `", FUN_name,
            "` and will be overwritten: '",
            paste(colnames(cd)[f], collapse = "', '"),
            "'. Consider using the '", colname,
            "' argument to specify alternative names.",
            call. = FALSE)
    }
    # Keep only unique values
    cd <- cbind( (cd)[!f], values )
    
    # Replace colData with new one
    x <- .add_to_coldata(x, cd, altexp = altexp, MARGIN = MARGIN)
    return(x)
}

# Get feature or sample metadata. Allow hidden usage of MARGIN and altExp.
#' @importFrom SummarizedExperiment rowData colData
.add_to_coldata <- function(
        x, cd, altexp = NULL, .disable.altexp = FALSE,
        MARGIN = default.MARGIN, default.MARGIN = 1, ...){
    #
    if( !.is_a_bool(.disable.altexp) ){
        stop("'.disable.altexp' must be TRUE or FALSE.", call. = FALSE)
    }
    # Check if altExp can be found
    .check_altExp_present(altexp, x, ...)
    # Check that MARGIN is correct
    MARGIN <- .check_MARGIN(MARGIN)
    # Based on MARGIN, add result to rowData or colData
    FUN <- switch(MARGIN, `rowData<-`, `colData<-`)
    # If altexp was specified, add result to altExp. Otherwise add it directly
    # to x.
    if( !is.null(altexp) && !.disable.altexp ){
        altExp(x, altexp) <- FUN( altExp(x, altexp), value = cd )
    } else{
        x <- FUN(x, value = cd)
    }
    return(x)
}

#' @importFrom S4Vectors metadata metadata<-
.add_values_to_metadata <- function(
        x, names, values, altexp = NULL, metadata.name = "name", ...){
    #
    if( !.is_a_string(metadata.name) ){
        stop("'metadata.name' must be a string.", call. = FALSE)
    }
    # Check if altExp can be found
    .check_altExp_present(altexp, x)
    #
    # Create a list and name elements
    add_metadata <- list(values)
    names(add_metadata) <- names
    # Get old metadata
    if( !is.null(altexp) ){
        old_metadata <- metadata( altExp(x, altexp) )
    } else{
        old_metadata <- metadata(x)
    }
    # Check if names match with elements that are already present
    f <- names(old_metadata) %in% names(add_metadata)
    if( any(f) ){
        warning(
            "The following values are already present in `metadata` and will ",
            "be overwritten: '",
            paste(names(old_metadata)[f], collapse = "', '"),
            "'. Consider using the '", metadata.name,
            "' argument to specify alternative ", "names.", call. = FALSE)
    }
    # keep only unique values
    add_metadata <- c( old_metadata[!f], add_metadata )
    # Add metadata to altExp or directly to x
    if( !is.null(altexp) ){
        metadata( altExp(x, altexp) ) <- add_metadata
    } else{
        metadata(x) <- add_metadata
    }
    return(x)
}

################################################################################
# Other common functions

# keep dimnames of feature table (assay) consistent with the meta data 
# of sample (colData) and feature (rowData)
.set_feature_tab_dimnames <- function(feature_tab, 
                                      sample_meta, 
                                      feature_meta) {
    if (nrow(sample_meta) > 0 || ncol(sample_meta) > 0) {
        if (ncol(feature_tab) != nrow(sample_meta) 
            || !setequal(colnames(feature_tab), rownames(sample_meta))) {
            stop(
                "The sample ids in feature table are not incompatible ",
                "with those in sample meta",
                call. = FALSE
            )
        }
        if (!identical(colnames(feature_tab), rownames(sample_meta))) {
            feature_tab <- feature_tab[, rownames(sample_meta), drop = FALSE]
        }
    }
    
    if (nrow(feature_meta) > 0 || ncol(feature_meta) > 0) {
        if (nrow(feature_tab) != nrow(feature_meta)
            || !setequal(rownames(feature_tab), rownames(feature_meta))) {
            stop(
                "The feature names in feature table are not incompatible ",
                "with those in feature meta",
                call. = FALSE
            )
        }
        if (!identical(rownames(feature_tab), rownames(feature_meta))) {
            feature_tab <- feature_tab[rownames(feature_meta), , drop = FALSE]
        }
    }
  
    feature_tab
}

#' Parse taxa in different taxonomic levels
#' @param taxa_tab `data.frame` object.
#' 
#' @param sep character string containing a regular expression, separator
#'  between different taxonomic levels, defaults to one compatible with both
#'  GreenGenes and SILVA `; |;"`.
#'  
#' @param column_name a single \code{character} value defining the column of taxa_tab
#'  that includes taxonomical information.
#'  
#' @param remove.prefix {\code{TRUE} or \code{FALSE}: Should 
#'  taxonomic prefixes be removed? (default: \code{remove.prefix = FALSE})}
#'  
#' @return  a `data.frame`.
#' @keywords internal
#' @importFrom IRanges CharacterList IntegerList
#' @importFrom S4Vectors DataFrame
#' @noRd
.parse_taxonomy <- function(
    taxa_tab, sep = "; |;", column_name = "Taxon",
    remove.prefix = removeTaxaPrefixes, removeTaxaPrefixes = FALSE,
    returned.ranks = TAXONOMY_RANKS, ...) {
    ############################### Input check ################################
    # Check sep
    if(!.is_non_empty_string(sep)){
      stop("'sep' must be a single character value.",
           call. = FALSE)
    }
    # Check column_name
    if( !(.is_non_empty_string(column_name) && column_name %in% colnames(taxa_tab)) ){
      stop("'column_name' must be a single character value defining column that includes",
           " information about taxonomic levels.",
           call. = FALSE)
    }
    # Check remove.prefix
    if(!.is_a_bool(remove.prefix)){
        stop("'remove.prefix' must be TRUE or FALSE.", call. = FALSE)
    }
    # Check returned.ranks
    if( !is.character(returned.ranks) ){
        stop("'returned.ranks' must be a character vector.", call. = FALSE)
    }
    ############################## Input check end #############################
    
    #  work with any combination of taxonomic ranks available
    all_ranks <- c(
      "Kingdom","Phylum","Class","Order","Family","Genus","Species", "Strain")
    all_prefixes <- c("k__", "p__", "c__", "o__", "f__", "g__", "s__", "t__")
    
    # split the taxa strings
    taxa_split <- CharacterList(strsplit(taxa_tab[, column_name],sep))
    # extract present prefixes
    taxa_prefixes <- lapply(taxa_split, substr, 1L, 3L)
    # match them to the order given by present_prefixes
    taxa_prefixes_match <- lapply(taxa_prefixes, match, x = all_prefixes)
    taxa_prefixes_match <- IntegerList(taxa_prefixes_match)
    # get the taxa values
    if(remove.prefix){
        taxa_split <- lapply(
            taxa_split, gsub, pattern = "([kpcofgst]+)__", replacement = "")
      taxa_split <- CharacterList(taxa_split)
    }
    # extract by order matches
    taxa_split <- taxa_split[taxa_prefixes_match]
    #
    if(length(unique(lengths(taxa_split))) != 1L){
      stop("Internal error. Something went wrong while splitting taxonomic levels.",
           "Please check that 'sep' is correct.", call. = FALSE)
    }
    taxa_tab <- DataFrame(as.matrix(taxa_split))
    colnames(taxa_tab) <- all_ranks
    
    # Subset columns so that it includes TAXONOMY_RANKS columns by default.
    # If strain column has values, it it also returned.
    ind <- !(!tolower(colnames(taxa_tab)) %in% tolower(returned.ranks) &
        colSums(is.na(taxa_tab)) == nrow(taxa_tab))
    taxa_tab <- taxa_tab[ , ind, drop = FALSE]
    
    return(taxa_tab)
}

################################################################################
# internal wrappers for agglomerateByRank/agglomerateByVariable
.merge_features <- function(x, merge.by, ...) {
    # Check if merge.by parameter belongs to taxonomyRanks
    if (is.character(merge.by) && length(merge.by) == 1 && merge.by %in% taxonomyRanks(x)) {
         #Merge using agglomerateByRank
        x <- agglomerateByRank(x, rank = merge.by, ...)
    } else {
        # Merge using agglomerateByVariable
        x <- agglomerateByVariable(x, MARGIN = "rows", f = merge.by, ...)
    }
    return(x)
}
microbiome/mia documentation built on May 17, 2024, 2:18 a.m.