# Some useful keyboard shortcuts for package authoring:
#
# Build and Reload Package: 'Ctrl + Shift + B'
# Check Package: 'Ctrl + Shift + E'
# Test Package: 'Ctrl + Shift + T'
# Generate DOC: 'Ctrl + Shift + Alt + r'
#' Iniitialize a DESeq2 object from salmon output.
#'
#' @param salmon_file_list A two-column file with the first column as sample names and
#' second column containing the path of quant.sf generated by salmon. Header line
#' is required but column names do not matter.
#'
#' Below is an example (pay attention to the **path** of quant.sf)
#'
#' ```
#' Samp path_quant.sf
#' untrt_N61311 untrt_N61311/untrt_N61311.salmon.count/quant.sf
#' untrt_N052611 untrt_N052611/untrt_N052611.salmon.count/quant.sf
#' untrt_N080611 untrt_N080611/untrt_N080611.salmon.count/quant.sf
#' untrt_N061011 untrt_N061011/untrt_N061011.salmon.count/quant.sf
#' trt_N61311 trt_N61311/trt_N61311.salmon.count/quant.sf
#' trt_N052611 trt_N052611/trt_N052611.salmon.count/quant.sf
#' trt_N080611 trt_N080611/trt_N080611.salmon.count/quant.sf
#' trt_N061011 trt_N061011/trt_N061011.salmon.count/quant.sf
#' ```
#'
#' @param sampleFile A file containing at least two columns. The first column is sample name just
#' like the first column of `salmon_file_list`. Other columns are sample attributes.
#' Normally one of sample attributes should contain the group information each sample belongs to.
#'
#' One simple example (conditions represent group information)
#'
#' ```
#' Samp conditions
#' untrt_N61311 untrt
#' untrt_N052611 untrt
#' untrt_N080611 untrt
#' untrt_N061011 untrt
#' trt_N61311 trt
#' trt_N052611 trt
#' trt_N080611 trt
#' trt_N061011 trt
#' ```
#'
#' Another example (3rd column meaning samples from two batches)
#'
#' ```
#' Samp conditions batch
#' untrt_N61311 untrt A
#' untrt_N052611 untrt A
#' untrt_N080611 untrt B
#' untrt_N061011 untrt B
#' trt_N61311 trt A
#' trt_N052611 trt A
#' trt_N080611 trt B
#' trt_N061011 trt B
#' ```
#'
#' @param design A column name from "sampleFile" like "conditions" in example.
#' This will be used as group variable for DE tests. Currently only simple
#' design is allowed. If one wants to model multiple variables, construct
#' one representation of super variable as indicated in
#' \url{https://support.bioconductor.org/p/67600/#67612} may be useful.
#'
#' @param covariate Names of columns containing informations maybe covariates
#' like batch effects or other sample info. Multiple covariates should be
#' supplied as a vector.
#'
#'
#' @param tx2gene Optional and only used if one want to get gene expression
#' instead of transcript expression.
#' A two-column file with the first column as transcript names and
#' second column as gene names. Header line is required but column names do not matter.
#'
#' Below is an example of file contents.
#'
#' ```
#' txname gene
#' ENST00000456328 ENSG00000223972
#' ENST00000450305 ENSG00000223972
#' ENST00000488147 ENSG00000227232
#' ENST00000619216 ENSG00000278267
#' ENST00000473358 ENSG00000243485
#' ```
#'
#' @param filter Filter genes with low read counts. Default genes with total
#' reads count lower than half of number of samples will be filtered out.
#' One can give any number here. Normally default is OK. The DESeq2 will ao
#' auto filter too. Check \url{https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html}.
#'
#' @param rundeseq Default \code{TRUE}. The function will perfrom deseq analysis
#' using \code{\link[DESeq2]{DESeq}} and return analyzed DESeqDataSet object.
#' If \code{FALSE}, just return a DESeqDataSet object and one can
#' run \code{\link[DESeq2]{DESeq}}on it with more customed parameters.
#'
#' @return A DESeqDataSet object.
#' @export
#'
#' @examples
#'
#' dds <- salmon2deseq(salmon_file_list, sampleFile, "conditions")
#' dds <- salmon2deseq(salmon_file_list, tx2gene=tx2gene,
#' sampleFile, "conditions")
#'
#'
salmon2deseq <- function(salmon_file_list, sampleFile, design, covariate=NULL,
tx2gene=NULL, filter=NULL, rundeseq=T) {
salmon_file <- read.table(salmon_file_list, header=T, row.names=1, sep="\t")
salmon_file_rowname <- rownames(salmon_file)
salmon_file <- as.vector(salmon_file[,1])
names(salmon_file) <- salmon_file_rowname
exist_or_not <- file.exists(salmon_file)
if(any(exist_or_not)==FALSE){
stop(paste("The following files can not be found:",
paste(salmon_file[!exist_or_not], collapse="\n"), sep="\n"))
}
if(!is.null(covariate)){
covariate <- paste(covariate, collapse="+")
formula <- as.formula(paste("~", covariate,"+", design))
} else {
formula <- as.formula(paste("~", design))
}
if (!is.null(tx2gene)){
tx2gene <- read.table(tx2gene, header=T, row.names=NULL, sep="\t")
}
# 整合读入的salmon文件和transcript2gene文件
txi <- tximport::tximport(salmon_file, type = "salmon", tx2gene = tx2gene)
sample <- read.table(sampleFile, header=T, row.names=1, com='',
quote='', check.names=F, sep="\t")
sample <- sample[match(colnames(txi$counts), rownames(sample)),, drop=F]
dds <- DESeq2::DESeqDataSetFromTximport(txi, colData=sample, design=formula)
print(paste("Read in", nrow(dds),"genes"))
if(is.null(filter)){
filter = nrow(sample)/2
}
keep <- rowSums(DESeq2::counts(dds))>filter
dds <- dds[keep,]
print(paste(nrow(dds),"genes remained after filtering of genes with all counts less than", nrow(sample)/2, "in all samples."))
if(rundeseq){
print("Perform DESeq on given datasets.")
dds <- DESeq(dds)
}
return(dds)
}
#' Iniitialize a DESeq2 object from raw reads count matrix.
#'
#' @param count_matrix_file A multiple column file with the first column as gene
#' names (must be unique) and other columns as gene expression reads count in
#' related samples.
#'
#' ```
#' Gene untrt_N61311 untrt_N052611 ... trt_N61311 trt_N052611 ...
#' GeneA 2 3 ... 10 20 ...
#' GeneB 2 3 ... 100 220 ...
#' GeneC 12 33 ... 10 20 ...
#' GeneD 222 301 ... 10 20 ...
#' ```
#'
#' @inheritParams salmon2deseq
#'
#' @return A DESeqDataSet object.
#' @export
#'
#' @examples
#'
#'
#' dds <- readscount2deseq(count_matrix_file, sampleFile, "conditions")
#'
#'
readscount2deseq <- function(count_matrix_file, sampleFile, design, covariate=NULL,
filter=NULL, rundeseq=T) {
data <- read.table(count_matrix_file, header=T, row.names=1, com='', quote='',
check.names=F, sep="\t")
if(!is.null(covariate)){
covariate <- paste(covariate, collapse="+")
formula <- as.formula(paste("~", covariate,"+", design))
} else {
formula <- as.formula(paste("~", design))
}
sample <- read.table(sampleFile, header=T, row.names=1, com='',
quote='', check.names=F, sep="\t")
sample <- sample[match(colnames(data), rownames(sample)),, drop=F]
dds <- DESeqDataSetFromMatrix(countData = data,
colData = sample, design=formula)
print(paste("Read in", nrow(dds),"genes"))
if(is.null(filter)){
filter = nrow(sample)/2
}
keep <- rowSums(DESeq2::counts(dds))>filter
dds <- dds[keep,]
print(paste(nrow(dds),"genes remained after filtering of genes with all counts less than", nrow(sample)/2, "in all samples."))
if(rundeseq){
print("Perform DESeq on given datasets.")
dds <- DESeq(dds)
}
return(dds)
}
#' To output normalized results to files named by given "output_prefix",
#' and also return a list containing normalized counts for downstream analysis.
#'
#' @param dds \code{\link{salmon2deseq}} or \code{\link{readscount2deseq}} or
#' \code{\link[DESeq2]{DESeq}} generated dataset.
#' @param output_prefix A string, will be used as output file name prefix.
#' @param rlog Get "rlog" transformed value for downstream correlation like analysis.
#' @param vst Get "vst" transformed value for downstream correlation like analysis. Normally faster than "rlog".
#' @param savemat Save normalized and rlog/vst matrix to file. Default T.
#' The file would be named like `output_prefix.DESeq2.normalized.xls`,
#' `output_prefix.DESeq2.normalized.rlog.xls`.
#'
#' @return A list containing normalized expression values and/or rlog, vst transformed normalized expression values.
#' @export
#'
#' @examples
#'
#' nomrexpr <- deseq2normalizedExpr(dds)
#'
deseq2normalizedExpr <- function(dds, output_prefix='ehbio', rlog=T, vst=F, savemat=T){
#标准化后的结果按整体差异大小排序,同时输出对数转换的结果。
# Get normalized counts
normalized_counts <- DESeq2::counts(dds, normalized=TRUE)
# 标准化的结果按整体差异大小排序
normalized_counts_mad <- apply(normalized_counts, 1, mad)
normalized_counts <- normalized_counts[order(normalized_counts_mad, decreasing=T), ]
# 常规R输出忽略左上角(输出文件第一列也就是基因列的列名字)
# 输出结果为完整矩阵,保留左上角的id。
normalized_counts_output = data.frame(id=rownames(normalized_counts), normalized_counts)
if(savemat){
print("Output normalized counts")
write.table(normalized_counts_output, file=paste0(output_prefix,".DESeq2.normalized.xls"),
quote=F, sep="\t", row.names=F, col.names=T)
}
normexpr <- list(normalized=normalized_counts, normalizedSave=normalized_counts_output)
if (rlog) {
rld <- DESeq2::rlog(dds, blind=FALSE)
rlogMat <- assay(rld)
rlogMat_mad <- apply(rlogMat, 1, mad)
rlogMat <- rlogMat[order(rlogMat_mad, decreasing=T), ]
rlogMat_output = data.frame(id=rownames(rlogMat), rlogMat)
if(savemat) {
print("Output rlog transformed normalized counts")
write.table(rlogMat_output, file=paste0(output_prefix,".DESeq2.normalized.rlog.xls"),
quote=F, sep="\t", row.names=F, col.names=T)
}
normexpr$rlog <- rlogMat
normexpr$rlogSave <- rlogMat_output
}
if (vst) {
rld <- DESEq2::vst(dds, blind=FALSE)
vstMat <- assay(rld)
vstMat_mad <- apply(vstMat, 1, mad)
vstMat <- vstMat[order(vstMat_mad, decreasing=T), ]
vstMat_output = data.frame(id=rownames(vstMat), vstMat)
if(savemat){
print("Output vst transformed normalized counts")
write.table(vstMat_output, file=paste0(output_prefix,".DESeq2.normalized.vst.xls"),
quote=F, sep="\t", row.names=F, col.names=T)
}
normexpr$vst <- vstMat
normexpr$vstSave <- vstMat_output
}
return(normexpr)
}
#' Plot distribution of normalzied expression to check the normalization effect of DESeq2.
#'
#' @param normexpr A list returned by \code{\link{deseq2normalizedExpr}}.
#' @param saveplot Save plot to given file "a.pdf", "b.png".
#' @param ... Additional parameters given to \code{\link{ggsave}}.
#'
#' @return A ggplot2 object
#' @export
#'
#' @examples
#'
#'
#' normexpr <- deseq2normalizedExpr(dds)
#' normalizedExpr2DistribBoxplo(normexpr)
#'
#'
normalizedExpr2DistribBoxplot <- function(normexpr, saveplot=NULL, ...) {
if ('rlog' %in% names(normexpr)){
p <- widedataframe2boxplot(as.data.frame(normexpr$rlog), saveplot=saveplot, ylab="rLog transformed expression value")
} else if ('vst' %in% names(normexpr)){
p <- widedataframe2boxplot(as.data.frame(normexpr$vst), saveplot=saveplot, ylab="VST transformed expression value")
}
return(p)
}
#' DE genes analysis for two groups.
#'
#' @param dds \code{\link{DESeq}} function returned object.
#' @param groupA Group name 1.
#' @param groupB Group name 2.
#' @param design The group column name. Default "conditions".
#' @param padj Multiple-test corrected p-value. Default 0.05.
#' @param log2FC Log2 transformed fold change. Default 1.
#' @param dropCol Columns to drop in final output. Default \code{c("lfcSE", "stat")}.
#' Other options \code{"ID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj"}.
#' This has no specific usages except make the table clearer.
#' @param output_prefix A string as prefix of output files.
#' @param ... Additional parameters given to \code{\link{ggsave}}.
#'
#' @import ggplot2
#' @import pheatmap
#' @return NULL
#' @export
#'
#' @examples
#'
#' twoGroupDEgenes(dds, "trt", "untrt")
#'
twoGroupDEgenes <- function
(
dds,
groupA,
groupB,
design="conditions",
padj=0.05,
log2FC=1,
# "ID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj"
dropCol=c("lfcSE", "stat"),
output_prefix="ehbio",
...
){
#print(sampleV)
#groupA <- as.vector(sampleV$sampA)
#groupB <- as.vector(sampleV$sampB)
#groupA = 'trt'
#groupB = 'untrt'
#design = "conditions"
#padj = 0.05
#log2FC = 1
# "ID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj"
#dropCol = c("lfcSE", "stat")
#output_prefix = "ehbio"
# print(groupA)
# if(is.list(groupA)){
# groupA <- unlist(groupA)
# }
# print(groupA)
# print(groupB)
# if(is.list(groupB)){
# groupB <- unlist(groupB)
# }
# print(groupB)
print(paste("DE genes between", groupA, groupB, sep=" "))
contrastV <- c(design, groupA, groupB)
print(contrastV)
res <- DESeq2::results(dds, contrast=contrastV)
normalized_counts <- DESeq2::counts(dds, normalized=TRUE)
baseA <- normalized_counts[, colData(dds)[[design]] == groupA]
if (is.vector(baseA)){
baseMeanA <- as.data.frame(baseA)
} else {
baseMeanA <- as.data.frame(rowMeans(baseA))
}
baseMeanA <- round(baseMeanA, 3)
colnames(baseMeanA) <- groupA
baseB <- normalized_counts[, colData(dds)[[design]] == groupB]
if (is.vector(baseB)){
baseMeanB <- as.data.frame(baseB)
} else {
baseMeanB <- as.data.frame(rowMeans(baseB))
}
baseMeanB <- round(baseMeanB, 3)
colnames(baseMeanB) <- groupB
res <- cbind(baseMeanA, baseMeanB, as.data.frame(res))
res <- data.frame(ID=rownames(res), res)
res$baseMean <- round(rowMeans(cbind(baseA, baseB)),3)
res$padj[is.na(res$padj)] <- 1
res$pvalue[is.na(res$pvalue)] <- 1
res$log2FoldChange <- round(res$log2FoldChange,3)
res$padj <- as.numeric(formatC(res$padj))
res$pvalue <- as.numeric(formatC(res$pvalue))
res <- res[order(res$padj),]
comp314 <- paste(groupA, "_vs_", groupB, sep=".")
file_base <- paste(output_prefix, "DESeq2", comp314, sep=".")
file_base1 <- paste(file_base, "results.xls", sep=".")
res_output <- res[, !(names(res) %in% dropCol), drop=F]
write.table(res_output, file=file_base1, sep="\t", quote=F, row.names=F)
res_de <- res[res$padj<padj, !(names(res) %in% dropCol), drop=F]
res_de_up <- subset(res_de, res_de$log2FoldChange>=log2FC)
file <- paste(output_prefix, "DESeq2",groupA, "_higherThan_", groupB,
'xls', sep=".")
write.table(as.data.frame(res_de_up), file=file, sep="\t", quote=F, row.names=F)
res_de_up_id <- subset(res_de_up, select=c("ID"))
file <- paste(output_prefix, "DESeq2",groupA, "_higherThan_", groupB,
'id.xls', sep=".")
write.table(as.data.frame(res_de_up_id), file=file, sep="\t",
quote=F, row.names=F, col.names=F)
if(dim(res_de_up_id)[1]>0) {
res_de_up_id_l <- cbind(res_de_up_id, paste(groupA, "_higherThan_",groupB, sep="."))
write.table(as.data.frame(res_de_up_id_l),
file=paste0(output_prefix,".DESeq2.all.DE"),
sep="\t",quote=F, row.names=F, col.names=F, append=T)
}
res_de_dw <- subset(res_de, res_de$log2FoldChange<=(-1)*log2FC)
file <- paste(output_prefix, "DESeq2",groupA, "_lowerThan_", groupB,
'xls', sep=".")
write.table(as.data.frame(res_de_dw), file=file, sep="\t", quote=F, row.names=F)
res_de_dw_id <- subset(res_de_dw, select=c("ID"))
file <- paste(output_prefix, "DESeq2",groupA, "_lowerThan_", groupB,
'id.xls', sep=".")
write.table(as.data.frame(res_de_dw_id), file=file, sep="\t",
quote=F, row.names=F, col.names=F)
if(dim(res_de_dw_id)[1]>0) {
res_de_dw_id_l <- cbind(res_de_dw_id, paste(groupA, "_lowerThan_",groupB, sep="."))
write.table(as.data.frame(res_de_dw_id_l),
file=paste0(output_prefix,".DESeq2.all.DE"),
sep="\t",quote=F, row.names=F, col.names=F, append=T)
}
res_output$level <- ifelse(res_output$padj<=padj,
ifelse(res_output$log2FoldChange>=log2FC,
paste(groupA,"UP"),
ifelse(res_output$log2FoldChange<=(-1)*(log2FC),
paste(groupB,"UP"), "NoDiff")) , "NoDiff")
volcanoPlot(res_output, "log2FoldChange", "padj",
"level", saveplot=paste0(file_base1,".volcano.pdf"), ...)
rankPlot(res_output, label=10, saveplot=paste0(file_base1,".rankplot.pdf"), width=20, ...)
res_de_up_top20_id <- as.vector(head(res_de_up$ID,20))
res_de_dw_top20_id <- as.vector(head(res_de_dw$ID,20))
res_de_top20 <- c(res_de_up_top20_id, res_de_dw_top20_id)
res_de_top20_expr <- normalized_counts[res_de_top20,]
sample = as.data.frame(dds@colData)
pheatmap::pheatmap(res_de_top20_expr, cluster_row=T, scale="row",
annotation_col=sample,
filename=paste0(file_base1,".top20DEgenes.heatmap.pdf"))
res_de_top20_expr2 <- data.frame(Gene=rownames(res_de_top20_expr), res_de_top20_expr)
res_de_top20_expr2 <- reshape2::melt(res_de_top20_expr, id=c("Gene"))
colnames(res_de_top20_expr2) <- c("Gene", "Sample", "Expression")
res_de_top20_expr2$Group <- sample[match(res_de_top20_expr2$Sample, rownames(sample)),design]
p = ggplot(res_de_top20_expr2, aes(x=Gene, y=Expression)) +
geom_point(aes(color=Group), alpha=0.5) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank()) +
ylab("Normalized xpression value") + scale_y_log10()
ggsave(p, file=paste0(file_base1,".top20DEgenes.dotplot.pdf"), width=20,
height=14, units="cm", ...)
}
# geneExprColDotplot <- function(data,x,y,color,ylab){
# ggplot(res_de_top20_expr2, aes(x=Gene, y=Expression)) +
# geom_point(aes(color=Group), alpha=0.5) + theme_classic() +
# theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
# axis.title.x = element_blank()) + ylab("Normalized xpression value") +
# scale_y_log10()
# }
#' DE genes analysis for multiple groups.
#'
#' @param dds \code{\link{DESeq}} function returned object.
#' @param comparePairFile A file containing sample groups for comparing. Optional.
#' If not given, the function will use \code{colData} information in \code{dds}
#' and perform group compare for all possible combinations.
#'
#' ```
#' groupA groupB
#' groupA groupC
#' groupC groupB
#' ```
#' @inheritParams twoGroupDEgenes
#'
#' @return NULL
#' @export
#'
#' @examples
#'
#' multipleGroupDEgenes(dds)
#'
multipleGroupDEgenes <- function(
dds,
comparePairFile=NULL,
design="conditions",
padj=0.05,
log2FC=1,
# "ID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj"
dropCol=c("lfcSE", "stat"),
output_prefix="ehbio",
...
){
if (file.exists(paste0(output_prefix,".DESeq2.all.DE"))) {
file.remove(paste0(output_prefix,".DESeq2.all.DE"))
}
if(!is.null(comparePairFile)){
compare_data <- read.table(comparePairFile, sep="\t",
check.names=F, quote='', com='')
colnames(compare_data) <- c("sampA", "sampB")
} else {
sampleGroup <- as.data.frame(dds@colData)
compare_data <- as.vector(unique(sampleGroup[[design]]))
#compare_data <- letters[1:3]
len_compare_data <- length(compare_data)
compareL <- list()
count = 1
for(i in 1:(len_compare_data-1)) {
for(j in (i+1):len_compare_data) {
tmp_compare <- list(sampA=compare_data[i],sampB=compare_data[j])
compareL[[paste(i,j)]] <- tmp_compare
count = count + 1
}
}
compare_data <- as.data.frame(do.call(rbind, compareL))
}
unused <- by(compare_data, 1:nrow(compare_data), function (x)
twoGroupDEgenes(dds, groupA=unlist(x[1,1]), groupB=unlist(x[1,2]), design=design, padj=padj,
log2FC=log2FC, dropCol=dropCol,
output_prefix=output_prefix, ...))
#twoGroupDEgenes(dds, tmp_compare, design=design, padj=padj, log2FC=log2FC,
# dropCol=dropCol, output_prefix=output_prefix, ...)
}
#' One step DEseq2 DE genes analysis for salmon output.
#'
#' @param file A file containing salmon output file lists if "type=salmon"
#' with format described in \code{\link{salmon2deseq}}. Or
#' reads count matrix file if "type=readscount" with format described in
#' \code{\link{readscount2deseq}}.
#'
#' @param type Specify input file type, either "salmon" or "readscount".
#' "tx2gene" currently has no effects for "type=readscount".
#'
#' @inheritParams salmon2deseq
#' @inheritParams deseq2normalizedExpr
#' @inheritParams multipleGroupDEgenes
#'
#' @return NULL
#' @export
#'
#' @examples
#'
#' DESeq2_ysx(salmon_file_list, sampleFIle, conditions, type="salmon")
#' DESeq2_ysx(count_matrix_file, sampleFIle, conditions, type="readscount")
#'
DESeq2_ysx <- function(file, sampleFile, design, type,
covariate = NULL,
tx2gene=NULL, filter=NULL, output_prefix='ehbio',
rlog=T, vst=F, comparePairFile=NULL, padj=0.05,
log2FC=1, dropCol=c("lfcSE", "stat")){
if(type == "salmon"){
dds <- salmon2deseq(file, sampleFile, design, covariate=covariate,
tx2gene=tx2gene, filter=filter, rundeseq=T)
} else if(type == "readscount"){
dds <- readscount2deseq(file, sampleFile, design, covariate=covariate,
filter=filter, rundeseq=T)
}
normexpr <- deseq2normalizedExpr(dds, output_prefix, rlog=rlog, vst=vst)
normalizedExpr2DistribBoxplot(normexpr,
saveplot=paste(output_prefix, "DESeq2.normalizedExprDistrib.pdf", sep="."))
clusterSampleHeatmap2(normexpr$rlog,
cor_file=paste(output_prefix, "DESeq2.sampleCorrelation.txt", sep="."),
saveplot=paste(output_prefix, "DESeq2.sampleCorrelation.pdf", sep="."))
multipleGroupDEgenes(dds, comparePairFile=comparePairFile, design=design,
padj=padj, log2FC=log2FC, dropCol=dropCol,
output_prefix=output_prefix)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.