#' edgeR analysis of omicsData objects
#'
#' Differential abundance analysis of count data using edgeR
#'
#' @param omicsData an object of the class 'seqData' created by \code{\link{as.seqData}}.
#' @param norm_factors Named vector of normalization parameters to put into DESeq2. If NULL, will use DESeq2's inbuilt normalization. Default is NULL.
#' @param pairs a matrix dictating which pairwise comparisons to make
#' @param test name of which differential expression test to use, options are "qcml", "lrt", "qlftest", or "paired". Default is "qcml". See details for further explanation.
#' @param adj multiple comparison adjustment method to use.
#' @param thresh p-value threshold for significance.
#'
#' @details Perform pairwise differential abundance analysis of count data using edgeR. For test parameter: "qcml" refers to quantile-adjusted conditional maximum likelihood method, which is the most reliable in terms of bias on a wide range of conditions and specifically performs best in the situation of many small samples with a common dispersion. "lrt" refers to generalized linear model likelihood ratio test, best used for cases where there a multiple treatment groups and provides inferences with GLMs. "qlftest" refers to the QL F-test, preferred as it reflects the uncertainty in estimating the dispersion for each gene and provides a more robust and reliable error rate control when the number of replicates is small. "paired" refers to a paired test and performs the generalized liner model likelihood ratio test for paired data (this should also be used for experiments with a block or batch effect). In the "paired" test, 'group1' should be the main effect of interest and 'group2' should be the covariate/block/batch. Currently, this can be used only if there are 2 main effects of interest.
#'
#' @return A matrix containing the log2 fold change (logFC), log-average concentration/abundance (logCPM), likelihood ratio (LR), exact p-value for differential expression using the negative binomial model (PValue), and the p-value adjusted for multiple testing (FDR) for every pairwise comparison and every feature.
#'
#' @examples
#' \dontrun{
#' library(mintJansson)
#' data(cDNA_hiseq_data)
#' mycdnadata <- group_designation(omicsData = cDNA_hiseq_data, main_effects = c("treatment"), time_course=NULL)
#' mycdnadata_norm <- normalize_data(omicsData = mycdnadata, norm_fn = "percentile")
#' mycdnadata_edgeR <- pmartRseq_edgeR(omicsData = mycdnadata_norm, test="qcml", pairs = cbind(list("Neg","Plus")), adj = "BH", thresh = 0.05)
#' mycdnadata_edgeR <- mint_edgeR(omicsData = mycdnadata_norm, test="qcml", pairs = cbind(list("Neg","Plus")), adj = "BH", thresh = 0.05)
#' }
#'
#' @author Allison Thompson
#'
#' @references Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics, 26, pp. -1.
#' McCarthy, J. D, Chen, Yunshun, Smyth and K. G (2012). Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research, 40(10), pp. -9.
#' Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing differences in tag abundance. Bioinformatics, 23, pp. -6.
#' Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with applications to SAGE data. Biostatistics, 9, pp. -11.
#' Zhou X, Lindsay H and Robinson MD (2014). Robustly detecting differential expression in RNA sequencing data using observation weights. Nucleic Acids Research, 42, pp. e91.
#'
pmartRseq_edgeR <- function(omicsData, norm_factors=NULL, test="qcml", pairs, adj, thresh){
library(edgeR)
if(!(class(omicsData) %in% c("seqData"))){
stop("Data must be of class 'seqData'")
}
if(!(tolower(test) %in% c("qcml","lrt","qlftest","paired"))){
stop("test must be qcml, lrt, qlftest, or paired")
}
# Make features be rownames and remove that column from the data, so the data is all numeric
rownames(omicsData$e_data) <- omicsData$e_data[,attr(omicsData,"cnames")$edata_cname]
omicsData$e_data <- omicsData$e_data[,-which(colnames(omicsData$e_data) == attr(omicsData,"cnames")$edata_cname)]
# Change NA's to 0 for edgeR
omicsData$e_data[is.na(omicsData$e_data)] <- 0
# Check if group1 and/or group2 is null
if(is.null(attributes(omicsData)$group_DF)){
stop("run group designation function first")
}
# format metadata
metadata <- attributes(omicsData)$group_DF
rownames(metadata) <- metadata[,attr(omicsData,"cnames")$fdata_cname]
metadata <- metadata[which(rownames(metadata)%in%colnames(omicsData$e_data)),]
metadata <- metadata[match(colnames(omicsData$e_data), metadata[,attr(omicsData,"cnames")$fdata_cname]),]
# Creates object to be used in differential expression tests
y <- edgeR::DGEList(counts = omicsData$e_data, group = metadata$Group)
# Set own sizeFactors(normalization factors), after running normalization
if(is.null(norm_factors)){
warning("using edgeR's inbuilt TMM normalization - if want to use own normalization, run normalization function first")
y <- edgeR::calcNormFactors(y)
}else{
# check that normalization (size) factors are present for every sample
if(all(colnames(omicsData$e_data) %in% names(norm_factors))){
norm_factors <- norm_factors[match(colnames(omicsData$e_data), names(norm_factors))]
y$samples$norm.factors <- norm_factors
}else{
stop("Must have a normalization (size) factor for every sample in the data")
}
}
if(test == "qcml"){
# quantile adjusted conditional maximum likelihood test
group <- metadata$Group
design <- model.matrix(~0+group, data=y$samples)
y <- edgeR::estimateDisp(y, design)
# get results from all pairwise comparisons
results <- apply(pairs, 2, function(x){
x <- as.character(x)
eres <- edgeR::topTags(exactTest(y, pair=c(x[2],x[1])), n=nrow(omicsData$e_data), adjust.method=adj)
})
# add a column for easy formatting later
if(adj == "none"){
results <- lapply(results, function(x){
x@.Data[[1]]$FDR=x@.Data[[1]]$PValue
return(x)
})
}
# format results for all comparisons
res <- lapply(results, function(x){
res <- as.data.frame(x@.Data[[1]])
res$Flag <- ifelse(res$FDR <= thresh, 1, 0)
res$Flag[which(res$logFC < 0)] <- res$Flag[which(res$logFC < 0)] * -1
names(res) <- unlist(lapply(names(res), function(y) paste(y,"_",x@.Data[[3]][2],"_vs_",x@.Data[[3]][1],sep="")))
res <- res[order(rownames(res)),]
return(res)
})
# combine results #
res2 <- do.call(cbind, res)
}else if(test == "lrt"){
# generalized linear model likelihood ratio test
group <- metadata$Group
design <- model.matrix(~0+group, data=y$samples)
y <- edgeR::estimateDisp(y, design)
fit <- edgeR::glmFit(y, design)
# get results from all comparisons
results <- apply(pairs, 2, function(x){
x <- as.character(x)
cntrst <- rep(0, length(levels(metadata$Group)))
cntrst[which(levels(metadata$Group) == x[1])] <- 1
cntrst[which(levels(metadata$Group) == x[2])] <- -1
lrt <- edgeR::glmLRT(fit, contrast=cntrst)
results <- edgeR::topTags(lrt, n=nrow(omicsData$e_data), adjust.method=adj)
return(results)
})
# add a column for easy formatting later
if(adj == "none"){
results <- lapply(results, function(x){
x@.Data[[1]]$FDR=x@.Data[[1]]$PValue
return(x)
})
}
# format results for all comparisons
res <- lapply(results, function(x){
res <- as.data.frame(x@.Data[[1]])
res$Flag <- ifelse(res$FDR <= thresh, 1, 0)
res$Flag[which(res$logFC < 0)] <- res$Flag[which(res$logFC < 0)] * -1
grps <- unlist(strsplit(x@.Data[[3]], " "))
grp2 <- strsplit(grps[grep("-1",grps)],"group")[[1]][2]
grp1 <- strsplit(grps[-grep("-1",grps)],"group")[[1]][2]
names(res) <- unlist(lapply(names(res), function(y) paste(y,"_",grp1,"_vs_",grp2,sep="")))
res <- res[order(rownames(res)),]
return(res)
})
# combine results
res2 <- do.call(cbind, res)
}else if(test == "qlftest"){
# QL F-test
group <- metadata$Group
design <- model.matrix(~0+group, data=y$samples)
y <- edgeR::estimateDisp(y, design)
fit <- edgeR::glmQLFit(y, design)
# get results from all comparisons
results <- apply(pairs, 2, function(x){
x <- as.character(x)
cntrst <- rep(0, length(levels(metadata$Group)))
cntrst[which(levels(metadata$Group) == x[1])] <- 1
cntrst[which(levels(metadata$Group) == x[2])] <- -1
qlf <- edgeR::glmQLFTest(fit, contrast=cntrst)
results <- edgeR::topTags(qlf, n=nrow(omicsData$e_data), adjust.method=adj)
return(results)
})
# add a column for easy formatting later
if(adj == "none"){
results <- lapply(results, function(x){
x@.Data[[1]]$FDR=x@.Data[[1]]$PValue
return(x)
})
}
# format results for all comparisons
res <- lapply(results, function(x){
res <- as.data.frame(x@.Data[[1]])
res$Flag <- ifelse(res$FDR <= thresh, 1, 0)
res$Flag[which(res$logFC < 0)] <- res$Flag[which(res$logFC < 0)] * -1
grps <- unlist(strsplit(x@.Data[[3]], " "))
grp2 <- strsplit(grps[grep("-1",grps)],"group")[[1]][2]
grp1 <- strsplit(grps[-grep("-1",grps)],"group")[[1]][2]
names(res) <- unlist(lapply(names(res), function(y) paste(y,"_",grp1,"_vs_",grp2,sep="")))
res <- res[order(rownames(res)),]
return(res)
})
# combine results
res2 <- do.call(cbind, res)
}else if(test == "paired"){
# paired generalized linear model likelihood ratio test
grp1 <- factor(attributes(omicsData)$group_DF[,pairs[1]])
grp2 <- factor(attributes(omicsData)$group_DF[,pairs[2]])
design <- model.matrix(~grp2+grp1)
y <- edgeR::estimateDisp(y, design)
fit <- edgeR::glmFit(y, design)
lrt <- edgeR::glmLRT(fit)
results <- edgeR::topTags(lrt, n=nrow(omicsData$e_data), adjust.method=adj)
# add a column for easy formatting later
if(adj == "none"){
results@.Data[[1]]$FDR = results@.Data[[1]]$PValue
# results <- lapply(results, function(x){
# x@.Data[[1]]$FDR=x@.Data[[1]]$PValue
# return(x)
# })
}
# format results
res <- as.data.frame(results@.Data[[1]])
res$Flag <- ifelse(res$FDR <= thresh, 1, 0)
res$Flag[which(res$logFC < 0)] <- res$Flag[which(res$logFC < 0)] * -1
names(res) <- unlist(lapply(names(res), function(x) paste(x,"_",levels(grp1)[2],"_vs_",levels(grp1)[1],sep="")))
res2 <- res[order(rownames(res)),]
}else{
stop("test must be one of 'qcml', 'lrt', 'qlftest', or 'paired")
}
return(res2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.