R/utilities.R

Defines functions get_eigValue default_color check_sample_names transpose_and_2df space check_confounder create_ps_from_mm calc_ef_md_f return_marker pre_ps_taxa_rank check_taxa_rank extract_rank create_pairwise_contrast create_contrast calc_coef create_design set_lvl check_contrast remove_na_samples check_samples get_norm_method select_quantile_func get_prefix fix_na_tax fix_duplicate_tax keep_taxa_in_rows phyloseq_qc preprocess_ps check_var_in_meta keep_prefix_desc get_available_prefix add_prefix_summarized upper_firstletter check_rank_names is_picrust2 check_tax_summarize

Documented in check_sample_names default_color get_eigValue

#' check whether tax abundance table is summarized or not
#' @noRd
check_tax_summarize <- function(ps) {
    taxa <- row.names(otu_table(ps))
    # whether taxa is separated by `|`,
    # may be required to add extra separate strings in the future
    has_separate <- any(grepl("[|]", taxa))

    has_separate
}


# whether picrust functional profile
is_picrust2 <- function(ps) {
    ps_ranks <- rank_names(ps)
    if ("Picrust_trait" %in% ps_ranks) TRUE else FALSE
}

#' check whether all names of taxonomic ranks include in available_ranks
#' @noRd
check_rank_names <- function(ps) {
    ps_ranks <- rank_names(ps)
    if (is_picrust2(ps)) {
        picrust_rank <- c("Picrust_trait", "Picrust_description")
        diff_rank <- setdiff(ps_ranks, picrust_rank)
        if (length(diff_rank)) {
            stop("ranks of picrust2 functional profile must be one of ",
                 paste(picrust_rank, collapse = ", "))
        }
    } else {
        if (!all(ps_ranks %in% available_ranks)) {
            stop(
                "ranks of taxonimic profile must be one of ",
                paste(available_ranks, collapse = ", ")
            )
        }
    }

    invisible(ps)
}

#' only first letter in lower case
#' @noRd
upper_firstletter <- function(x) {
    paste(toupper(substr(x, 1, 1)), tolower(substr(x, 2, nchar(x))), sep = "")
}

#' add prefix of taxonomic ranks for summarized data construct from original
#' lefse (galaxy lefse or python app) input, p__, k__
#' @param ps a [`phyloseq::phyloseq-class`] object
#' @param ranks character vector, prefix of ranks to add, e.g. "p", "c"
#' @importFrom phyloseq taxa_names<-
#' @noRd
add_prefix_summarized <- function(ps, ranks_prefix, sep = "|") {
    tax <- tax_table(ps)@.Data[, 1]
    tax_split <- strsplit(tax, split = sep, fixed = TRUE)

    if (max(lengths(tax_split)) != length(ranks_prefix)) {
        stop(
            "The length of `ranks_prefix` muste be",
            " equal to number of taxonomic ranks.",
            call. = FALSE
        )
    }

    # ensure the ranks_prefix is contained in available_ranks
    # and in descending order
    available_prefix <- get_available_prefix(available_ranks)
    if (!all(ranks_prefix %in% available_prefix)) {
        stop("all elements of ranks_prefix must be contained ", "
            in available_ranks")
    }
    ranks_prefix <- keep_prefix_desc(ranks_prefix, type = "ranks_prefix")

    tax_prefix <- purrr::map(
        tax_split,
        ~ paste0(ranks_prefix[seq_along(.x)], "__", .x) %>%
            paste0(collapse = sep)
    )
    tax_prefix <- do.call(rbind, tax_prefix)
    colnames(tax_prefix) <- paste0(ranks_prefix, collapse = sep)
    tax_table(ps) <- tax_table(tax_prefix)
    taxa_names(ps) <- tax_prefix[, 1]

    ps
}

# extract the first letter of taxonomic ranks as the prefixes of the ranks
get_available_prefix <- function(ranks) {
    substr(ranks, 1, 1) %>%
        tolower()
}

# keep prefix in descending order: "k" "p" "c" "o" "f" "g" "s"
keep_prefix_desc <- function(ranks_prefix, type = c("ranks", "ranks_prefix")) {
    type <- match.arg(type, choices = c("ranks", "ranks_prefix"))
    available_prefix <- get_available_prefix(available_ranks)
    idx_desc <- sort(match(ranks_prefix, available_prefix))

    if (type == "ranks") {
        return(available_ranks[idx_desc])
    } else {
        return(available_prefix[idx_desc])
    }
}

# check whether var in sample meta data, or raise an error
check_var_in_meta <- function(var, sample_meta) {
    stopifnot(inherits(sample_meta, "sample_data"))
    meta_nms <- names(sample_meta)
    if (!var %in% meta_nms) {
        stop(var, " must be one of variable of `sample_meta`", call. = FALSE)
    }
}

################################################################################
## preprocessing ps object
################################################################################

# preprocess of phyloseq object, including keep taxa in rows,
# filter taxa whose abundance is zero, fix duplicated tax
# filter samples whose library size is zero
#' @importFrom phyloseq prune_samples
#'
preprocess_ps <- function(ps) {
    zero_sample <- check_samples(ps)
    if (!is.null(zero_sample)) {
        warning(
            "The library size of sample(s): ",
            paste(zero_sample, collapse = ", "),
            " is/are zero, and will be removed in the subsequent analysis."
        )

        keep <- setdiff(sample_names(ps), zero_sample)
        ps <- prune_samples(keep, ps)
    }

    # keep taxa in rows
    ps <- keep_taxa_in_rows(ps)
    # filter the taxa whose abundance is zero
    ps <- phyloseq_qc(ps)
    # fix duplicated tax
    ps <- fix_duplicate_tax(ps)

    return(ps)
}

#' phyloseq quality control, remove otu/asv of which abundance is zero
#' @noRd
phyloseq_qc <- function(ps) {
    prune_taxa(taxa_sums(ps) > 0, ps)
}

#' Transpose the phyloseq object to ensure taxa are in rows
#' @param ps a [phyloseq::phyloseq-class] object
#' @importMethodsFrom phyloseq t
#' @keywords internal
#' @noRd
keep_taxa_in_rows <- function(ps) {
  if (!taxa_are_rows(ps)) {
    ps <- t(ps)
  }

  return(ps)
}

# https://github.com/lch14forever/microbiomeVizb
# /94cbfe452a735aadf88733b27b8221a03f450a55/R/utils.R#L68-L86
#
#' Duplicated taxa: e.g. maybe multiple species (s__uncultured)
#' belong to different genera. append the upper level taxa to the taxa to
#' distinguish this duplicated taxa
#' @param ps [phyloseq::phyloseq-class] object or
#'   [phyloseq::taxonomyTable-class] object
#' @importFrom phyloseq tax_table<-
#' @keywords internal
#' @noRd
fix_duplicate_tax <- function(ps) {

  # convert na to Unknown first
  ps <- fix_na_tax(ps)

  tax <- tax_table(ps)
  if (ncol(tax) == 1) {
      return(ps)
  }

  for (i in 2:ncol(tax)) {
    tax_uniq <- unique(tax[, i])
      for (j in seq_along(tax_uniq)) {
        if (is.na(tax_uniq[j])) next
        ind <- which(tax[, i] == as.character(tax_uniq[j]))
        if (length(unique(tax[ind, i - 1])) > 1) {
          tax[ind, i] <- paste(tax[ind, i - 1], tax[ind, i], sep = "_")
        }
      }
  }

  tax_table(ps) <- tax

  return(ps)
}

#' set NA (missing) tax to its prefix, e.g. s__ (or s__unknown?)
#' @keywords internal
#' @noRd
fix_na_tax <- function(ps) {

  tax <- as.data.frame(tax_table(ps))

  tax_fixed <- purrr::imap_dfc(
        tax,
        ~ ifelse(is.na(.x), get_prefix(.y), .x)
  ) %>%
      as.matrix()
  row.names(tax_fixed) <- taxa_names(ps)
  tax_table(ps) <- tax_fixed

  return(ps)
}

# extract the prefix of taxonomic ranks
get_prefix <- function(ranks) {

  prefix <- substr(ranks, 1, 1) %>%
    tolower() %>%
    paste("__", sep = "")

  prefix
}


# `metagenomeSeq::cumNormStatFast()` requires counts of all samples at least
# have two non zero features. Thus, if there are samples with only one non-zer
# features, `cumNormStat()` is taken to compute the pth quantile.
# This function was used to select the function to calculate the quantile used
# for CSS norm factors estimation in metagenomeSeq.
select_quantile_func <- function(counts) {
    if (sum(colSums(counts > 0) > 1) < ncol(counts)) {
        fun_p <- metagenomeSeq::cumNormStat
    } else {
        fun_p <- metagenomeSeq::cumNormStatFast
    }

    fun_p
}


get_norm_method <- function(norm) {
    new_norm <- ifelse(
        is.numeric(norm),
        paste("per-sample normalized (sum of all taxa) to", norm),
        norm
    )

    new_norm
}


# check samples in ps, make sure at least one non zero features in a sample
check_samples <- function(ps) {
    if (!taxa_are_rows(ps)) {
        ps <- t(ps)
    }
    lib_size <- colSums(otu_table(ps))
    zero_ind <- which(lib_size == 0)

    if (length(zero_ind) == 0) {
        return(NULL)
    }

    return(sample_names(ps)[zero_ind])
}


# remove samples with missing values for any variable specified in the group
remove_na_samples <- function(ps, group) {
    groups <- sample_data(ps)[[group]]
    na_idx <- is.na(groups)
    if (all(!na_idx)) {
        return(ps)
    }

    sample_nms <- sample_names(ps)
    warning(
        "remove sample(s): ", paste(sample_nms[na_idx], collapse = ","),
        ", which with missing value in `", group, "`"
    )
    ps <- phyloseq::prune_samples(sample_nms[!na_idx], ps)

    ps
}


## calculate coef for edgeR, metagenomeSeq
# if contrast is a two length character, set the first element as the first level
# (reference group), the second element as the second level, return a single
# integer
#
# if contrast is null, return a integer vector (number of levels - 1)
check_contrast <- function(contrast) {
    if (!is.null(contrast)) {
        if (!is.character(contrast) || length(contrast) != 2) {
            stop("`contrast` must be a two length character", call. = FALSE)
        }
    }

    contrast
}

set_lvl <- function(groups, contrast) {
    if (!is.factor(groups)) {
        stop("`groups` must be a factor", call. = FALSE)
    }
    # this will will change the elements simultaneously
    # levels(groups) <- c(contrast, setdiff(levels(groups), contrast))
    groups <- factor(
        groups,
        levels =  c(contrast, setdiff(levels(groups), contrast))
    )

    groups
}

create_design <- function(groups, meta, confounders = character(0)) {
    if (inherits(meta, "sample_data")) {
        meta <- data.frame(meta)
    }
    model_data <- data.frame(group = groups)
    if (!length(confounders)) {
        design <- stats::model.matrix(~ group, data = model_data)
    } else {
        model_data[confounders] <- meta[confounders]
        design <- stats::model.matrix(
            formula(paste(
                "~ + ",
                paste(c(confounders, "group"), collapse = " + "))),
            data = model_data
        )
    }

    design
}

calc_coef <- function(groups, design, contrast = NULL) {
    contrast <- check_contrast(contrast)
    groups <- set_lvl(groups, contrast)
    lvl <- levels(groups)
    n_lvl <- length(lvl)
    n_design <- ncol(design)

    if (n_lvl < 2) {
        stop("Differential analysis requires at least two groups.")
    }

    if (n_lvl == 2) { # two groups
        if (!is.null(contrast)) {
            warning(
                "`contrast` is ignored, you do not need to set it",
                call. = FALSE
            )
        }
        coef <- n_design
    } else { # multiple groups
        if (!is.null(contrast)) {
            coef <- n_design - n_lvl + 2L
        } else {
            coef <- (n_design - n_lvl + 2L):n_design
        }
    }

    coef
}

create_contrast <- function(groups, contrast = NULL) {
    if (!is.factor(groups)) {
        groups <- factor(groups)
    }
    lvl <- levels(groups)
    n_lvl <- length(lvl)
    if (n_lvl < 2) {
        stop("Differential analysis requires at least two groups.")
    }

    if (n_lvl == 2) { # two groups
        if (!is.null(contrast)) {
            warning(
                "`contrast` is ignored, you do not need to set it",
                call. = FALSE
            )
        }
        design <- rep(0, n_lvl)
        design[1] <- -1
        design[2] <- 1
    } else { # multiple groups
        if (!is.null(contrast)) {
            if (!is.character(contrast) || length(contrast) != 2) {
                stop("`contrast` must be a two length character", call. = FALSE)
            }

            idx <- match(contrast, lvl, nomatch = 0L)
            if (!all(idx)) {
                stop(
                    "all elements of `contrast` must be contained in `groups`",
                    call. = FALSE
                )
            }
            design <- rep(0, n_lvl)
            design[idx[1]] <- -1
            design[idx[2]] <- 1
            design <- matrix(design)
            row.names(design) <- lvl
            colnames(design) <- paste0(contrast[2], "-", contrast[1])
        } else {
            design <- create_pairwise_contrast(lvl)
        }
    }

    design
}

# create all pair-wise comparisons (contrasts) for anova-like test
create_pairwise_contrast <- function(groups) {
    groups <- factor(groups)
    lvl <- levels(groups)
    n <- length(lvl)

    design <- matrix(0, n, choose(n, 2))
    rownames(design) <- lvl
    colnames(design) <- seq_len(choose(n, 2))
    k <- 0
    for (i in seq_len(n - 1)) {
        for (j in (i + 1):n) {
            k <- k + 1
            design[j, k] <- 1
            design[i, k] <- -1
            colnames(design)[k] <- paste(lvl[j], "-", lvl[i], sep = "")
        }
    }
    design
}


# extract the specify rank of phyloseq object, return a phyloseq object
# with only one rank
extract_rank <- function(ps, taxa_rank) {
    ranks <- rank_names(ps)
    if (!taxa_rank %in% c("none", ranks)) {
        stop(
            "`taxa_rank` must be one of options: none, ",
            paste(rank_names(ps), collapse = ", "),
            call. = FALSE
        )
    }
    if (taxa_rank != "none") {
        ps <- aggregate_taxa(ps, taxa_rank)
        new_tax_table <- tax_table(ps)[, taxa_rank]
    } else {
        taxon <- taxa_names(ps)
        new_tax_table <- tax_table(matrix(taxon))
        colnames(new_tax_table) <- "otu"
        rownames(new_tax_table) <- taxon
    }

    tax_table(ps) <- new_tax_table

    # set the taxa names as the corresponding names
    if (taxa_rank != "none") {
        taxa_names(ps) <- new_tax_table[, 1]
    }

    ps
}

# only used for check the argument taxa_rank which is used to specify
# taxonomic rank to perform differential analysis on
check_taxa_rank <- function(ps, taxa_rank) {
    ranks <- rank_names(ps)
    all_taxa_rank <- c("all", "none", ranks)
    if (!taxa_rank %in% all_taxa_rank) {
        stop(
            "`taxa_rank` must be one of ",
            paste(all_taxa_rank, collapse = ", "),
            call. = FALSE
        )
    }

    invisible(ps)
}

# preprocess the ps according to para taxa_rank
pre_ps_taxa_rank <- function(ps, taxa_rank) {
    if (is_picrust2(ps)) {
        warning("para `taxa_rank` is not worked for picrust2 function profile",
                " and it will be ignored")
        return(ps)
    }

    ps <- check_taxa_rank(ps, taxa_rank)
    if (taxa_rank == "all") {
        ps_orig_summarized <- summarize_taxa(ps)
    } else if (taxa_rank == "none") {
        ps_orig_summarized <- extract_rank(ps, taxa_rank)
    } else {
        ps_orig_summarized <- aggregate_taxa(ps, taxa_rank) %>%
            extract_rank(taxa_rank)
    }

    return(ps_orig_summarized)
}

# return the marker_table, if no significant marker return NULL
return_marker <- function(sig_feature, all_feature) {
    if (nrow(sig_feature)) {
        row.names(sig_feature) <- paste0("marker", seq_len(nrow(sig_feature)))
        marker <- marker_table(sig_feature)

    } else {
        warning("No marker was identified", call. = FALSE)
        marker <- NULL
    }

    marker
}


# For multiple groups comparison of LRT test of DESeq2.
# Only fold changes of pair-wise comparisons are supported in DESse2.
# https://support.bioconductor.org/p/131272/#131450
# https://github.com/qiime2/q2-composition/blob/HEAD/q2_composition/_ancom.py
#'
#' Calculate effect size, mean differences for two groups, and f statistic of
#' one-way anova for multiple groups.
#' @noRd
#' @importFrom stats lm aov
calc_ef_md_f <- function(feature_abd, group) {
    group_n <- length(unique(group))
    if (group_n < 2) {
        stop("The number of group must be greater than 2", call. = FALSE)
    }

    if (group_n == 2) {
        ef <- abs(lm(feature_abd ~ group)$coefficients[2])
    } else if (group_n > 2) {
        # f statistic from aov
        ef <- summary(aov(feature_abd ~ group))[[1]]$`F value`[1]
    }

    ef
}

# create phyloseq from microbiomeMarker object,
# and keep only nodes correlated with significant features
create_ps_from_mm <- function(mm, only_marker = TRUE) {
    ot <- otu_table(mm)
    tt <- tax_table(mm)
    st <- sample_data(mm)
    mt <- marker_table(mm)
    sig_features <- mt$feature

    # extract all nodes correlated with the significant features
    # First, all parent nodes of marker
    down_nodes <- strsplit(sig_features, "|", fixed = TRUE) %>%
        purrr::map(~ purrr::map_chr(
            seq_along(.x), function(y) paste(.x[1:y], collapse = "|")))
    down_nodes <- unique(unlist(down_nodes))
    # Two, all children nodes of marker
    all_features <- tt@.Data[, 1]
    up_nodes <- purrr::map(sig_features,
               ~ all_features[grepl(.x, all_features, fixed = TRUE)])
    up_nodes <- unique(unlist(up_nodes))
    idx <- match(unique(c(down_nodes, up_nodes)), all_features)

    if (only_marker) {
        ot <- ot[idx, ]
        tt <- tt[idx, ]
    }
    ps <- phyloseq(ot, tt, st)

    ps
}

# check confounder
check_confounder <- function(ps, target_var, confounders = NULL) {
    meta <- sample_data(ps)
    vars <- names(meta)

    if (! target_var %in% vars) {
        stop(
            "the interested var `target_var` must be contained in the meta data",
            call. = FALSE
        )
    }

    other_vars <- setdiff(vars, target_var)

     if (is.null(confounders)) {
        confounders <- other_vars
        if (! length(confounders)) {
            stop("No confounding var in sample meta data")
        }
    } else {
        out_confounder <- setdiff(confounders, other_vars)
        if (length(out_confounder)) {
            stop("var(s) `", paste(out_confounder, collapse = "`, ` "),
                 "` not be contained in the sample meta data")
        }
    }

    confounders
}

# generate n spaces character
space <- function(n) {
    paste(rep(" ", each = n), collapse = "")
}


# transpose otu_table and then convert it to data.frame
#' @importMethodsFrom phyloseq t
transpose_and_2df <- function(ot) {
  t(ot) %>% as.data.frame()
}

######################################################################
#' check whether samples' names start with numeric and then paste "S_"
#'
#' @param object (Optional). a [`phyloseq::phyloseq-class`] or
#' [`Biobase::ExpressionSet`] or [`phyloseq::otu_table-class`] or
#' [`phyloseq::otu_table-class`] or matrix/metadata.
#'
check_sample_names <- function(object) {

  # phyloseq
  if (inherits(object, "phyloseq")) {
    if(length(grep("^\\d+", phyloseq::sample_names(object))) > 0) {
      message("samples' names are start with numeric")
      Names_index <- grep("^\\d+", phyloseq::sample_names(object))
      phyloseq::sample_names(object)[Names_index] <-
        paste0("S_", phyloseq::sample_names(object)[Names_index])
    }
  }

  # ExpressionSet
  if (inherits(object, "ExpressionSet")) {
    if (length(grep("^\\d+", Biobase::sampleNames(object))) > 0) {
      message("samples' names are start with numeric")
      Names_index <- grep("^\\d+", Biobase::sampleNames(object))
      Biobase::sampleNames(object)[Names_index] <-
        paste0("S_", Biobase::sampleNames(object)[Names_index])
    }
  }

  # otu table
  if (inherits(object, "otu_table")) {
    if (phyloseq::taxa_are_rows(object)) {
      if(length(grep("^\\d+", colnames(object))) > 0) {
        Names_index <- grep("^\\d+", colnames(object))
        colnames(object)[Names_index] <- paste0("S_", colnames(object)[Names_index])
      }
    } else {
      if(length(grep("^\\d+", rownames(object))) > 0) {
        Names_index <- grep("^\\d+", rownames(object))
        rownames(object)[Names_index] <- paste0("S_", rownames(object)[Names_index])
      }
    }
  }

  # matrix metadata
  if(length(grep("^\\d+", rownames(object))) > 0) {
    Names_index <- grep("^\\d+", rownames(object))
    rownames(object)[Names_index] <- paste0("S_", rownames(object)[Names_index])
  }
  if(length(grep("^\\d+", colnames(object))) > 0) {
    Names_index <- grep("^\\d+", colnames(object))
    colnames(object)[Names_index] <- paste0("S_", colnames(object)[Names_index])
  }

  return(object)
}


#' @title generate multiple colors for visualization
#'
#' @description
#' This funciton is to generate multiple colors for visualization
#'
#' @param number (Required). integer. the number of colors (default: 74).
#'
#' @importFrom RColorBrewer brewer.pal.info brewer.pal
#'
default_color <- function(number = 74) {

  dat_color <- RColorBrewer::brewer.pal.info
  qual_col_pals <- dat_color[dat_color$category == "qual", ]
  total_colors <-  unlist(mapply(RColorBrewer::brewer.pal,
                                 qual_col_pals$maxcolors,
                                 rownames(qual_col_pals)))

  res_colors <- total_colors[1:number]

  return(res_colors)
}


#' @title Extract and visualize the eigenvalues/variances of dimensions
#'
#' @description
#' This funciton is to obtain eigenvalues/variances of dimensions
#'
#' @param X (Required). Ordination result.
#'
get_eigValue <- function(X) {

  if (inherits(X, c("PCA", "CA", "MCA", "FAMD", "MFA", "HMFA",
                    "sPCA", "sCA", "sMCA", "sMFA", "sHMFA"))) {
    eig <- X$eig
  } else {
    if (inherits(X, "prcomp") | inherits(X, "princomp")){
      eig <- (X$sdev)^2
    } else if (inherits(X, c("pca", "coa", "acm")) &
               inherits(X, "dudi")) {
      eig <- X$eig
    } else if (inherits(X, "ca")) {
      eig <- X$sv^2
    } else if (inherits(X, "mjca")) {
      eig <- X$inertia.e
    } else if (inherits(X, "correspondence")) {
      eig <- X$cor^2
    } else if (inherits(X, "expoOutput")) {
      eig <- X$ExPosition.Data$eigs
    } else {
      stop("An object of class : ", class(X), " can't be handled by the function get_eigenvalue()")
    }

    variance <- eig * 100/sum(eig)
    cumvar <- cumsum(variance)
    eig <- data.frame(eigenvalue = eig, variance = variance,
                      cumvariance = cumvar)
  }
  colnames(eig) <- c("eigenvalue", "variance.percent", "cumulative.variance.percent")
  rownames(eig) <- paste0("Dim.", 1:nrow(eig))

  return(eig)
}
HuaZou/MicrobiomeAnalysis documentation built on May 13, 2024, 11:10 a.m.