#' A wrapper to DESeq2 that apply differential expression analysis on a PAC
#' object
#'
#' \code{PAC_deseq} DESeq2 analysis on PAC_object.
#'
#' Given a PAC object this function will apply a differential expression
#' analysis using DESeq2.
#'
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#' package.
#'
#' @param PAC PAC object containing a Pheno data.frame with samples as row
#' names, an Anno data.frame with sequences as row names and a Counts table
#' with raw counts. The Counts table must have the sample names as column
#' names and sequences as row names.
#'
#' @param model Character vector describing the statistical model based on
#' column names in Pheno.
#'
#' @param deseq_norm Logical whether to return deseq normalized values or raw
#' counts (default=FALSE).
#'
#' @param pheno_target (optional) List with: 1st object being a character
#' indicating the main target column in Pheno. 2nd object being a character
#' vector of the target group(s) in the target Pheno column (1st object).
#'
#' Important: In \code{PAC_deseq}, pheno_target controls the main comparative
#' factor category from which a summarized table and plots will be generated.
#' If, for instance, a target column named "groups" in pheno(PAC) contains
#' "control" and "treatment" categories, setting pheno_target=list("groups",
#' c("treatment", "controls") ensures that "treatment" is presented 1st in the
#' factor levels, making for example log2FC appear as "treatment vs control".
#' As default, pheno_target=NULL will result in the factor levels being
#' automatically sorted in ascending order, which in the example above would
#' result in control vs treatment. If no pheno_target is given, the
#' first feature in the model will also be the main comparison presented in
#' the graphs and summary tables.
#'
#' @param test Character parsed directly to \code{\link[DESeq2]{DESeq}} that
#' controls what type of statistical test that should be used. Alternatives
#' are either "Wald" (Wald significance test) or "LTR" (likelihood ratio
#' test/chi-squared test). See \code{\link[DESeq2]{DESeq}} for more details.
#' (Default="Wald")
#'
#' @param fitType Character parsed directly to \code{\link[DESeq2]{DESeq}} that
#' controls what type of dispersion fit that should be used. Alternatives are
#' either "parametric" (dispersion-mean relation), "local" (local regression
#' of log dispersions), "mean" (mean of gene-wise dispersion). See
#' \code{\link[DESeq2]{DESeq}} for more details. (Default="local")
#'
#' @param threads Integer number of threads to run in parallel.
#'
#' @return A list of objects:
#'
#' 1st object - Summarized result table merged with anno(PAC)
#'
#' 2nd object - Target graphs (p-val distribution and volcano)
#'
#' 3rd object - All output from \code{\link[DESeq2]{DESeq}}
#'
#' @examples
#'
#'
#'# Note, these examples will generate some warnings since data is based on
#'# heavily down-sampled fastq files, where many sequences receives low counts
#'# in specific groups.
#'
#'## Load test data
#'load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#'
#'## Simple model with embryonic stages using Wald test with local fit (default)
#'table(pheno(pac)$stage)
#'output_deseq <- suppressWarnings(PAC_deseq(pac, model= ~stage, threads=2))
#'
#'## Batch corrected, graphs are generated for 'stage' (=first in the model)
#'output_deseq <- suppressWarnings(PAC_deseq(pac, model= ~stage + batch,
#' threads=2))
#'
#'## Using pheno_target
#'output_deseq <- suppressWarnings(PAC_deseq(pac,model= ~stage + batch,
#' pheno_target=list("batch"),
#' threads=2))
#'
#'## With pheno_target we can change the direction for the comparison
#'# Stage5 vs Stage3 (reverse order):
#'output_deseq <- suppressWarnings(PAC_deseq(pac, model= ~stage + batch,
#' pheno_target = list("stage", c("Stage5", "Stage3")),
#' threads=2))
#'
#'## In the output you find PAC merged results, target plots and output_deseq
#'names(output_deseq)
#'tibble::as_tibble(output_deseq$result)
#'
#' @importFrom S4Vectors mcols
#' @importFrom stats terms.formula
#'
#' @export
PAC_deseq <- function(PAC, model, deseq_norm=FALSE, test="Wald",
fitType="local", threads=1, pheno_target=NULL){
## Check S4
if(isS4(PAC)){
tp <- "S4"
PAC <- as(PAC, "list")
}else{
tp <- "S3"
}
cat("\n")
##### Prepare data
pval <- log2FC <- neglog_padj <- DE <- NULL
PAC_sub <- PAC
PAC_sub$Counts <- apply(PAC_sub$Counts, 2, as.integer)
rownames(PAC_sub$Counts) <- rownames(PAC$Anno)
anno <- PAC_sub$Anno
pheno <- PAC_sub$Pheno
# Make factors of model columns
cols <- attr(terms.formula(model), "term.labels")
cols <- unique(unlist(strsplit(cols, ":")))
compr<-pheno[,colnames(pheno) %in% cols]
if(class(compr)=="data.frame"){
if((any(apply(combn(ncol(compr), 2), 2, function(x) identical(compr[, x[1]], compr[, x[2]]))))==TRUE) {
stop(cat="The column names in model appears to be repeated. \nThis may cause unwanted comparisons. To ensure a correct comparison, please check the colnames in Pheno!")
}}
for (i in seq.int(length(cols))){
#Sometime model terms ar complex
# search for best pheno columns
trm <- cols[i]
colnam <- colnames(pheno)
logi <- NULL
for(k in seq.int(length(colnam))){
logi <- c(logi, grepl(colnam[k], trm))
}
# Refactor best hit
colnam_hits <- colnam[logi]
bst <- nchar(trm) - nchar(colnam_hits)
log_bst <- bst == min(bst)
pheno[,colnam_hits[log_bst]] <- as.factor(pheno[,colnam_hits[log_bst]])
}
# Prepare pheno target and order factor
if(is.null(pheno_target)){
pheno_target <- list(cols[1])
}
if(length(pheno_target)==1){
pheno_target[[2]] <- as.character(unique(PAC$Pheno[,pheno_target[[1]]]))
}
trg <- pheno[,colnames(pheno) == pheno_target[[1]]]
if(any(duplicated(trg) | duplicated(trg, fromLast = TRUE))==TRUE){
warning(cat="The values in designated pheno_target are not unique. \nThis may cause unwanted comparisons. To ensure a correct comparison, please check the values in Pheno!")
}
mis <- !levels(trg) %in% pheno_target[[2]]
pheno[,colnames(pheno) == pheno_target[[1]]] <- factor(
trg, levels=c(rev(pheno_target[[2]]),levels(trg)[mis]))
dds <- DESeq2::DESeqDataSetFromMatrix(countData = PAC_sub$Counts,
colData = droplevels(pheno),
design=model)
dds <- DESeq2::estimateSizeFactors(dds)
### DEseq analysis and extract result table
BiocParallel::register(BiocParallel::MulticoreParam(workers=threads))
dds_fit <- DESeq2::DESeq(dds, test=test, fitType=fitType, parallel = TRUE)
res_nam <- DESeq2::resultsNames(dds_fit)
if(!is.null(pheno_target)){
target_nam <- res_nam[grepl(pheno_target[[1]], res_nam)]
target_nam <- target_nam[1]
}else{
target_nam <- res_nam[2]
}
res_DESeq2 <- DESeq2::results(dds_fit, name=target_nam)
comp <- strsplit(S4Vectors::mcols(res_DESeq2)[2][,1][2], ": ")[[1]][2]
cat("\n")
cat("\n")
cat(paste0("** ", comp, " **"))
cat("\n")
# Print summary working with different DESeq versions
test <- try(cat(DESeq2::summary(res_DESeq2)), silent = TRUE)
if(methods::is(test,"try-error")){
cat(DESeq2::DESeqResults(res_DESeq2))
}else{
print(test)
}
res_DESeq2_df <- as.data.frame(res_DESeq2)
anno_filt <- anno[match(rownames(res_DESeq2), rownames(anno)),,drop=FALSE]
if(!identical(rownames(dds), rownames(res_DESeq2))){
stop("\nNot identical ids in result and dds.",
"\nHave you been messing with the code?\n")
}
if(!identical(rownames(dds), rownames(anno_filt))){
stop("\nNot identical ids in anno and dds.\n")
}
res_counts <- cbind(
data.frame(feature_ID=rownames(dds)),
res_DESeq2_df,
DESeq2::counts(dds, normalized=deseq_norm),
anno_filt)
colnames(res_counts)[colnames(res_counts)=="log2FoldChange"] <- paste(
"log2FC", res_nam[2], sep="_")
### Make plots
logi_thresh <- ifelse(rowSums(
cbind(
res_DESeq2_df$padj <= 0.1,
res_DESeq2_df$log2FoldChange <=-1 | res_DESeq2_df$log2FoldChange >=1))==2,
"pass", "not_pass")
df_plot <- data.frame(
pval=res_DESeq2_df$pvalue,
neglog_padj=-log10(res_DESeq2_df$padj),
log2FC=res_DESeq2_df$log2FoldChange,
DE=logi_thresh)
p <- ggplot2::ggplot(data=df_plot, ggplot2::aes(x=pval)) +
ggplot2::geom_histogram(breaks=seq(0.0, 1.0, by=0.025),
col="black", fill="green", alpha=1) +
ggplot2::labs(title="p-value distributions",
subtitle=comp, x="p-value", y = "Number of features") +
ggplot2::theme_classic()
vcano <- ggplot2::ggplot(df_plot, ggplot2::aes(x=log2FC, y=neglog_padj)) +
ggplot2::geom_hline(yintercept=1, col="black", size=0.1)+
ggplot2::geom_vline(xintercept=c(-1, 1), col="black", size=0.1)+
ggplot2::geom_point(ggplot2::aes(colour = DE), size=1) +
ggplot2::scale_colour_manual(values = c("not_pass"="grey", "pass"= "red")) +
ggplot2::labs(title="Volcano plot DE features",
subtitle="red points:\nlog2FC >=1 & p-adj <=0.1",
x="Log2 fold changes", y = "-log10 p-value") +
ggplot2::theme_classic() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 0),
legend.position = "none")
res_lst <- list(result=res_counts,
plots=list(histogram=p, volcano=vcano),
output_deseq= res_DESeq2)
print(cowplot::plot_grid(plotlist=res_lst$plots))
return(res_lst)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.