#' @title Normalize RNA-seq or miRNA-seq dataset
#'
#' @description \code{tcgaNormalizer} normalizes an expression matrix in four
#'   steps: 1) filtering, 2) replacement of zeros, 3) log2 transformation and
#'   4) between-array normalization.
#'
#'   Filtering keeps a gene/microRNA (row) only when its proportion of zero
#'   values across samples is below the zero-proportion threshold AND its
#'   standard deviation across samples is above the variation threshold. For
#'   \code{dataType = 'mRNA'} the defaults are 70\% zeros and sd 0.2, and
#'   samples (columns) whose proportion of zeros across genes is 30\% or more
#'   are removed as well. For \code{dataType = 'microRNA'} the defaults are
#'   95\% zeros and sd 0.2, and no samples are removed.
#'
#'   Remaining zeros are replaced by the smallest positive value of the
#'   filtered matrix so that the log2 transformation stays finite. The result
#'   is passed to \code{\link[limma]{normalizeBetweenArrays}} with its default
#'   settings.
#'
#' @param data A numeric data matrix (or a data frame coercible to one), with rows referring to genes/microRNAs and columns to samples, can be the output from \code{\link[mirNet]{tcgaTableGenerator}} or \code{\link[mirNet]{tcgaConvRownames}}.
#' @param dataType A string, 'microRNA' for microRNA-seq data or 'mRNA' for RNA-seq data. Any other value raises an error.
#' @param mir_na_thre The threshold used to remove microRNAs with too many zeros across samples, default to 95\%.
#' @param mir_sd_thre The threshold used to remove microRNAs with low variation across samples, default to 0.2.
#' @param plotFig Logical, 'TRUE' if you want to plot scatter plots of zero proportions, 'FALSE' otherwise.
#' @param filename The name (without the '.pdf' extension) of the output PDF holding the scatter plots of zero proportions across samples and genes/microRNAs. If \code{NULL} (the default), the plots are drawn on the active graphics device instead of being written to a file.
#' @param mrna_na_thre The threshold used to remove genes with too many zeros across samples (mRNA data only), default to 70\%.
#' @param mrna_sd_thre The threshold used to remove genes with low variation across samples (mRNA data only), default to 0.2.
#' @param sample_na_thre The threshold used to remove samples with too many zeros across genes (mRNA data only), default to 30\%.
#'
#' @return A data matrix with normalized gene/microRNA expression data. As a side effect, when \code{plotFig} is 'TRUE', scatter plots of zero proportions across samples and genes/microRNAs are produced.
#'
#' @seealso \code{\link[mirNet]{tcgaTableGenerator}} for generating a gene expression data matrix from single FPKM files downloaded from GDC Data Portal, \code{\link[mirNet]{tcgaConvRownames}} for converting rownames of a data matrix.
#'
#' @importFrom limma normalizeBetweenArrays
#' @importFrom graphics plot
#' @importFrom grDevices pdf dev.off
#' @importFrom stats sd
#'
#' @export tcgaNormalizer
#'
#' @examples
#' tcgaNormalizer(gen.luad.m, dataType = 'mRNA', filename = 'scatter plot of genes')
#' tcgaNormalizer(mir.luad.m, dataType = 'microRNA', filename = 'scatter plot of miRNAs')
tcgaNormalizer <- function(data, dataType, mir_na_thre = 0.95,
                           mir_sd_thre = 0.2, plotFig = TRUE,
                           filename = NULL, mrna_na_thre = 0.7,
                           mrna_sd_thre = 0.2, sample_na_thre = 0.3) {
  # Fail early with a clear message instead of the "object not found" error
  # an unsupported dataType used to trigger further down.
  dataType <- match.arg(dataType, c("mRNA", "microRNA"))

  if (is.data.frame(data)) {
    data <- as.matrix(data)
  }
  if (!is.matrix(data) || !is.numeric(data)) {
    stop("'data' must be a numeric matrix (genes/microRNAs x samples).",
         call. = FALSE)
  }

  # Proportion of zeros per sample (column) and per gene/microRNA (row),
  # and the per-row standard deviation across all samples.
  zero_frac_col <- colSums(data == 0) / nrow(data)
  zero_frac_row <- rowSums(data == 0) / ncol(data)
  sd_row <- apply(data, 1, sd)

  if (isTRUE(plotFig)) {
    feature_label <- if (dataType == "mRNA") "gene" else "miRNA"
    draw_plots <- function() {
      plot(zero_frac_col, main = "Percentage of 0s in each sample",
           ylim = c(0, 1))
      plot(zero_frac_row,
           main = paste0("Percentage of 0s in each ", feature_label),
           ylim = c(0, 1))
    }
    if (is.null(filename)) {
      # No file requested: draw on the active device rather than silently
      # creating a hidden file called ".pdf".
      draw_plots()
    } else {
      pdf(paste0(filename, ".pdf"), width = 8, height = 7)
      # Always close the PDF device, even when plotting fails.
      tryCatch(draw_plots(), finally = dev.off())
    }
  }

  # which() drops NA comparisons, so rows/columns containing NA values are
  # excluded from the selection.
  if (dataType == "mRNA") {
    keep_rows <- which(zero_frac_row < mrna_na_thre & sd_row > mrna_sd_thre)
    keep_cols <- which(zero_frac_col < sample_na_thre)
  } else {
    keep_rows <- which(zero_frac_row < mir_na_thre & sd_row > mir_sd_thre)
    keep_cols <- seq_len(ncol(data))
  }
  # drop = FALSE keeps a matrix even when a single row or column survives.
  data_sel <- data[keep_rows, keep_cols, drop = FALSE]

  # Replace zeros with the smallest positive value so log2() stays finite.
  zero_idx <- which(data_sel == 0)
  if (length(zero_idx) > 0) {
    data_sel[zero_idx] <- min(data_sel[which(data_sel > 0)])
  }

  normalizeBetweenArrays(log2(data_sel))
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.