#* Copyright (C) 2018 the DEUS contributors.
#* Website: https://github.com/timjeske/DEUS
#*
#* This file is part of the DEUS R package.
#*
#* The DEUS R package is free software: you can redistribute it and/or modify
#* it under the terms of the GNU General Public License as published by
#* the Free Software Foundation, either version 3 of the License, or
#* (at your option) any later version.
#*
#* This program is distributed in the hope that it will be useful,
#* but WITHOUT ANY WARRANTY; without even the implied warranty of
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#* GNU General Public License for more details.
#*
#* You should have received a copy of the GNU General Public License
#* along with this program. If not, see <http://www.gnu.org/licenses/>.
#' Create a mapping of sequences to sequence identifiers
#'
#' This function creates a data frame mapping the row names of the input (sequence) count table to sequence identifiers.
#' In context of the DEUS package the sequence identifiers can be used as alternate short identifiers for each unique sequence.
#' @param count_table The count table generated by \link[DEUS]{createCountTableFromFastQs}
#' @return A data frame with sequences as row names and sequence identifiers in the column 'SequenceID'
#' @export
createMap <- function(count_table) {
map <- as.data.frame(paste("seq_", c(1:nrow(count_table)) , sep=""))
row.names(map) <- row.names(count_table)
names(map) <- c("SequenceID")
return(map)
}
#' Merge intermediate results of DEUS
#'
#' The function merges results created during DEUS analysis into a comprehensive summary table.
#' It is also possible to merge only a sub-set of all results or to generate the results with other methods than the provided ones.
#'
#' @param de_result Result of differential expression analysis generated by \link[DEUS]{runDESeq2} (columns 'Log2FoldChange', 'Pvalue' and 'IHWPvalue' are required)
#' @param count_stats Means and standard deviations of each condition in analysis generated by \link[DEUS]{getConditionCountStats}
#' @param blast_result Result of BLAST generated by \link[DEUS]{runBlast} (columns 'qseqid', 'sseqid', 'length' and 'evalue' are required)
#' @param clust_result Result of clustering generated by \link[DEUS]{runClustering}
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @return If all results are given, a data frame is created with sequences as row names and the columns 'SequenceID', 'Log2FoldChange', 'Pvalue', 'IHWPvalue', 'NormCounts_<cond1>_Mean', 'NormCounts_<cond1>_Sd', 'NormCounts_<cond2>_Mean', 'NormCounts_<cond2>_Sd', 'ClusterID', 'Length', 'BlastEvalue' and 'FeatureList'.
#' If any result is not given corresponding columns are missing.
#' @export
mergeResults <- function(de_result, count_stats, blast_result, clust_result, map) {
if(missing(de_result) && missing(count_stats) && missing(blast_result) && missing(clust_result)) {
stop("mergeResults requires at least de_result, count_stats, blast_result or clust_result!")
}
res <- map
colnames(res) <- c("SequenceID")
res$sequence <- row.names(map)
if(!missing(de_result)) {
if("Cl_Pvalue" %in% colnames(de_result)){
de_result <- de_result[c("Log2FoldChange","Pvalue","IHWPvalue","Cl_Log2FoldChange","Cl_Pvalue","Cl_IHWPvalue")]
} else {
de_result <- de_result[c("Log2FoldChange","Pvalue", "IHWPvalue")]
}
de_result$SequenceID <- map[row.names(de_result),1]
de_result$sequence <- row.names(de_result)
res <- de_result
}
if(!missing(count_stats)) {
count_stats$sequence <- row.names(count_stats)
res <- plyr::join(res, count_stats, type = "left")
}
if(!missing(clust_result)) {
res <- plyr::join(res, clust_result, type = "left")
}
if(!missing(blast_result)) {
blast_result <- blast_result[c("SequenceID", "Annotation", "Length", "BlastEvalue")]
res <- plyr::join(res, blast_result, type = "left")
group <- data.frame(FeatureList=c(by(res$Annotation, res$sequence, function(x)paste(x, collapse=","))))
group$sequence <- row.names(group)
group <- plyr::join(group, res, type = "full", match="first")
group <- group[-which(names(group)=="Annotation")]
# move featureList to last column
group <- group[,c(which(names(group)!="FeatureList"),which(names(group)=="FeatureList"))]
# move sequenceID to first column
group <- group[,c(which(names(group)=="SequenceID"),which(names(group)!="SequenceID"))]
group$Length <- nchar(group$sequence)
row.names(group) <- group$sequence
res <- group[-which(names(group)=="sequence")]
}
if("Pvalue" %in% colnames(res)) {
res <- res[order(res$Pvalue),]
}
return(res)
}
#' Compute mean and standard deviation of normalized counts for each condition
#'
#' @param count_table A table of (normalized) sequence counts.
#' A sequence count table can be generated by \link[DEUS]{createCountTableFromFastQs} and normalized by applying \link[DEUS]{runDESeq2}
#' @param pheno_info A data frame with sample names as row names and one or more condition columns.
#' @param condition_col Name of the column that should be used as the condition to group the samples.
#' @return A data frame with sequences as row names and the mean and standard deviation for each condition defined in pheno_info
#' @export
getConditionCountStats<-function(count_table, pheno_info, condition_col = "condition"){
#get group means
groups = unique(pheno_info[[condition_col]])
for(type in groups){
cols = row.names(pheno_info)[which(pheno_info[[condition_col]]==type)]
subset= count_table[,cols]
count_table=cbind(count_table,rowMeans(subset),data.frame(apply(subset,1,sd)))
names(count_table)[ncol(count_table)-1]=paste("NormCounts",type,"Mean",sep="_")
names(count_table)[ncol(count_table)]=paste("NormCounts",type,"Sd",sep="_")
}
index=length(groups)*2
#return only mean & sd columns
return(count_table[,c((ncol(count_table)-index+1):ncol(count_table))])
}
#' Count occurrences of feature classes
#'
#' As each unique sequence can have a lot of BLAST hits this function counts the number of hits grouped by user defined feature classes.
#'
#' @param summary Summary table generated by \link[DEUS]{mergeResults} (requires BLAST result generated by \link[DEUS]{runBlast})
#' @param feature_classes List of features representing classes to be counted.
#' Features can be defined as regular expressions, as described in \link[stringi]{stringi-search-regex}.
#' @return The summary table with additional columns for each feature term given by the user.
#' Each column holds the number of features in 'FeatureList' that match the corresponding feature term.
#' The column 'Other' reflects the number of all features in the 'FeatureList' after substracting the sum of feature class matches.
#' @export
addCountsOfFeatureClasses<- function(summary, feature_classes) {
if(!("FeatureList" %in% names(summary))) stop('Feature classes can only be counted if blast results have been merged to DE results!')
res <- summary
v_features <- strsplit(paste(summary$FeatureList),",")
sum <- 0
for(i in feature_classes) {
#res[i] <- sum(stringr::str_detect(string=v_features[[1]], i))
tmp_count <- lapply(v_features,stringr::str_detect,pattern=i)
res[i] <- t(as.data.frame(lapply(tmp_count,sum)))
sum <- sum + res[[i]]
}
res$"Other" <- as.numeric(lapply(v_features, function(x) length(x[! x == "NA" ]))) - sum
res <- res[,c(which(names(res)!="FeatureList"),which(names(res)=="FeatureList"))]
if("Pvalue" %in% colnames(res)) {
res <- res[order(res$Pvalue),]
}
return(res)
}
#' Write summary tables for DEUS result
#'
#' @param summary Summary table generated by \link[DEUS]{mergeResults} (requires BLAST result generated by \link[DEUS]{runBlast})
#' @param out_dir Directory where summary tables are written
#' @param expressed_only If true (default), only sequences with an IHWPvalue are provided in the output file. Sequences without IHWPvalue were not part of the DE analysis and therefore are assumed to have low expression values. If filtering is turned off, output files might include a large number of rows.
#' @return Files are written to the output directory. 'SummaryTable.tsv' corresponds to the table generated by \link[DEUS]{mergeResults}.
#' The table is split into a file 'SummaryTable_withBlast.tsv' and a file 'SummaryTable_noBlast.tsv' representing the sequences that have a BLAST hit and those with no BLAST hit.
#' @export
writeSummaryFiles <- function(summary, out_dir, expressed_only=TRUE) {
#Remove lines without IHWPvalue. Indicates that sequence expression is below cutoff used during DE analysis
if(expressed_only==TRUE){
summary <- summary[!is.na(summary$IHWPvalue),]
}
summary$Sequence <- row.names(summary)
summary <- summary[,c(ncol(summary),1:ncol(summary)-1)]
write.table(summary, paste(out_dir, "SummaryTable.tsv", sep="/"), sep="\t", quote=F, row.names=F, col.names=T)
filtered <- summary[!summary$FeatureList=="NA",]
write.table(filtered, paste(out_dir, "SummaryTable_withBlast.tsv", sep="/"), sep="\t", quote=F, row.names=F, col.names=T)
filtered <- summary[summary$FeatureList=="NA" | is.na(summary$FeatureList),]
write.table(filtered, paste(out_dir,"SummaryTable_noBlast.tsv",sep="/"), sep="\t", quote=F,row.names=F,col.names=T)
}
#' Create a vector of sequence identifiers and sequences in FASTA format
#'
#' @param de_result A data frame with sequences as row names.
#' In the context of DEUS, the data frame is the resulting table of differential expression analysis with significant sequences as row names.
#' Differential expression analysis can be performed via \link[DEUS]{runDESeq2}
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @return A vector of alternating sequence identifiers and nucleotide sequences
#' @export
sequencesAsFasta <- function(de_result, map) {
res <- as.vector(rbind(paste(">",map[row.names(de_result),1],sep=""),row.names(de_result)))
return(res)
}
#' Remove temporary FASTA file required for clustering
#'
#' This function is internally called by \link[DEUS]{runClustering}
#'
#' @param out_dir Directory where output files of clustering are saved
#' @return Deletes the file sig_sequences.fa
#' @export
deleteTmp <- function(out_dir){
tmp <- paste(out_dir,"sig_sequences.fa",sep="/")
if(file.exists(tmp)){
file.remove(tmp)
}
}
#' Aggregates sequence counts for each sequence cluster
#'
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @param cl_count_table Count table created by \link[DEUS]{createCountTableFromFastQs}
#' @param clust_result Dataframe containing clustering results as created by \link[DEUS]{runClustering}
#' @return A dataframe that includes sequence counts for each sequence cluster. Can be used as input for DE analysis
#' @export
mergeAndAggregate <- function(map, cl_count_table, clust_result){
#Add SequenceID
cl_count_table$"SequenceID" <- map[row.names(cl_count_table),]
#Add ClusterID
cl_count_table <- plyr::join(cl_count_table,clust_result)
#Remove extra cols before aggregation
cl_count_table <- cl_count_table[ , -which(names(cl_count_table) %in% c("SequenceID"))]
#Sum counts by cluster
cl_counts <- aggregate(cl_count_table[-(ncol(cl_count_table))], by=list(Category=cl_count_table$ClusterID), FUN=sum)
#Rownames indicate ClusterID
row.names(cl_counts) <- cl_counts$Category
cl_counts <- cl_counts[ , -which(names(cl_counts) %in% c("Category"))]
return(cl_counts)
}
#' Merges clustering results as well as DE results for single sequence and cluster approach
#'
#' @param cl_sig_results A data frame as created by \link[DEUS]{runDESeq2}. Includes DE pvalues for each cluster
#' @param clust_result Dataframe containing clustering results as created by \link[DEUS]{runClustering}
#' @param sig_results A data frame as created by \link[DEUS]{runDESeq2}. Includes DE pvalues for each individual sequence
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @return A combined data frame which includes all pvalues and clusterIDs
#' @export
mergeSingleAndClusterResults <- function(cl_sig_results, clust_result, sig_results, map){
#Adjust map to work here
map$sequences <- row.names(map)
names(map)[1]="SequenceID"
#Add prefix for cl_sig_results to ensure unique columns
names(cl_sig_results)=paste("Cl_",names(cl_sig_results),sep="")
#Add ClusterID column for joining
cl_sig_results$ClusterID <- row.names(cl_sig_results)
#combine cl_deseq and clustering results
cl_sig_results <- plyr::join(cl_sig_results,clust_result,type="inner",by="ClusterID")
#Combine both deseq results
sig_results$"SequenceID" <- map[row.names(sig_results),"SequenceID"]
sig_results <- plyr::join(sig_results,cl_sig_results,type="full",by="SequenceID")
#Add sequence as row name
sig_results <- plyr::join(sig_results,map,type="inner",by="SequenceID")
rownames(sig_results) <- sig_results$sequences
sig_results <- sig_results[ , -which(names(sig_results) %in% c("sequences"))]
return(sig_results)
}
#' Prints a summary of the number of significant sequences and clusters
#'
#' @param summary Summary table as created by \link[DEUS]{mergeResults}
#' @param sig_threshold Significance threshold used for filtering according to the IHW P-value of the sequences and clusters
#' @return Prints summary values to standard output.
#' @export
printClusterSummary <- function(summary, sig_threshold = 0.05) {
sig_seq_summary <- summary[(!is.na(summary$IHWPvalue) & summary$IHWPvalue < sig_threshold),]
sig_seqs <- length(sig_seq_summary$SequenceID)
sig_seqs_not_in_cluster <- length(sig_seq_summary[is.na(sig_seq_summary$ClusterID),1])
clust_incl_sig_seqs <- length(unique(sig_seq_summary$ClusterID))
#test if Cluster cols exist
if("Cl_IHWPvalue" %in% colnames(summary)){
sig_clust_summary <- summary[(!is.na(summary$Cl_IHWPvalue) & summary$Cl_IHWPvalue < sig_threshold),]
sig_clusts <- length(unique(sig_clust_summary$ClusterID))
sig_clust_sig_seq <- sig_clust_summary[(!is.na(sig_clust_summary$IHWPvalue) & sig_clust_summary$IHWPvalue < sig_threshold),]
sig_clust_no_sig_seq <- sig_clust_summary[!(!is.na(sig_clust_summary$IHWPvalue) & sig_clust_summary$IHWPvalue < sig_threshold),]
sig_clust_no_sig_seq_low_exp <- sig_clust_no_sig_seq[is.na(sig_clust_no_sig_seq$IHWPvalue),]
sig_clust_no_sig_seq_weak_DE <- sig_clust_no_sig_seq[!is.na(sig_clust_no_sig_seq$IHWPvalue) & !(sig_clust_no_sig_seq$IHWPvalue < sig_threshold),]
sig_seq_no_sig_clust <- summary[(!is.na(summary$IHWPvalue) & summary$IHWPvalue < sig_threshold) & (!is.na(summary$Cl_IHWPvalue) & summary$Cl_IHWPvalue >= sig_threshold),]
clusters_no_sig_seq <- length(setdiff(unique(sig_clust_summary$ClusterID), unique(sig_seq_summary$ClusterID)))
}
writeLines("## Sequence summary ##")
writeLines(sprintf("Significant sequences: %d", sig_seqs))
writeLines(sprintf("Significant sequences not clustered: %d", sig_seqs_not_in_cluster))
if(exists("sig_clust_summary")) {
writeLines(sprintf("Sequences in significant clusters: %d", length(sig_clust_summary$SequenceID)))
writeLines(sprintf("Non-significant sequences in significant clusters: %d", length(sig_clust_no_sig_seq$SequenceID)))
writeLines(sprintf("- too low expressed: %d", length(sig_clust_no_sig_seq_low_exp$SequenceID)))
writeLines(sprintf("- too weak differentially expressed: %d", length(sig_clust_no_sig_seq_weak_DE$SequenceID)))
writeLines(sprintf("Significant sequences in non-significant clusters: %d", length(sig_seq_no_sig_clust$SequenceID)))
}
writeLines("## Cluster summary ##")
if(exists("sig_clust_summary")) {
writeLines(sprintf("Significant clusters: %d", sig_clusts))
}
writeLines(sprintf("Clusters including significant sequences: %d", clust_incl_sig_seqs))
if(exists("sig_clust_summary")) {
writeLines(sprintf("Significant clusters including no significant sequences: %d", clusters_no_sig_seq))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.