DEUS: Differential expression of unique sequences

#*  Copyright (C) 2018 the DEUS contributors.
#*  Website: https://github.com/timjeske/DEUS
#*
#*  This file is part of the DEUS R package.
#*
#*  The DEUS R package is free software: you can redistribute it and/or modify
#*  it under the terms of the GNU General Public License as published by
#*  the Free Software Foundation, either version 3 of the License, or
#*  (at your option) any later version.
#*
#*  This program is distributed in the hope that it will be useful,
#*  but WITHOUT ANY WARRANTY; without even the implied warranty of
#*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#*  GNU General Public License for more details.
#*
#*  You should have received a copy of the GNU General Public License
#*  along with this program.  If not, see <http://www.gnu.org/licenses/>.


#' Create a mapping of sequences to sequence identifiers
#'
#' This function creates a data frame mapping the row names of the input (sequence) count table to sequence identifiers.
#' In context of the DEUS package the sequence identifiers can be used as alternate short identifiers for each unique sequence.
#' @param count_table The count table generated by \link[DEUS]{createCountTableFromFastQs}
#' @return A data frame with sequences as row names and sequence identifiers in the column 'SequenceID'
#' @export

createMap <- function(count_table) {
  map <- as.data.frame(paste("seq_", c(1:nrow(count_table)) , sep=""))
  row.names(map) <- row.names(count_table)
  names(map) <- c("SequenceID")
  return(map)
}

#' Merge intermediate results of DEUS
#'
#' The function merges results created during DEUS analysis into a comprehensive summary table.
#' It is also possible to merge only a sub-set of all results or to generate the results with other methods than the provided ones.
#'
#' @param de_result Result of differential expression analysis generated by \link[DEUS]{runDESeq2} (columns 'Log2FoldChange', 'Pvalue' and 'IHWPvalue' are required)
#' @param count_stats Means and standard deviations of each condition in analysis generated by \link[DEUS]{getConditionCountStats}
#' @param blast_result Result of BLAST generated by \link[DEUS]{runBlast} (columns 'qseqid', 'sseqid', 'length' and 'evalue' are required)
#' @param clust_result Result of clustering generated by \link[DEUS]{runClustering}
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @return If all results are given, a data frame is created with sequences as row names and the columns 'SequenceID', 'Log2FoldChange', 'Pvalue', 'IHWPvalue', 'NormCounts_<cond1>_Mean', 'NormCounts_<cond1>_Sd', 'NormCounts_<cond2>_Mean', 'NormCounts_<cond2>_Sd', 'ClusterID', 'Length', 'BlastEvalue' and 'FeatureList'.
#' If any result is not given corresponding columns are missing.
#' @export

mergeResults <- function(de_result, count_stats, blast_result, clust_result, map) {

  if(missing(de_result) && missing(count_stats) && missing(blast_result) && missing(clust_result)) {
    stop("mergeResults requires at least de_result, count_stats, blast_result or clust_result!")
  }

  res <- map
  colnames(res) <- c("SequenceID")
  res$sequence <- row.names(map)

  if(!missing(de_result)) {

    if("Cl_Pvalue" %in% colnames(de_result)){
      de_result <- de_result[c("Log2FoldChange","Pvalue","IHWPvalue","Cl_Log2FoldChange","Cl_Pvalue","Cl_IHWPvalue")]
    } else {
      de_result <- de_result[c("Log2FoldChange","Pvalue", "IHWPvalue")]
    }
    de_result$SequenceID <- map[row.names(de_result),1]
    de_result$sequence <- row.names(de_result)
    res <- de_result
  }

  if(!missing(count_stats)) {
    count_stats$sequence <- row.names(count_stats)
    res <- plyr::join(res, count_stats, type = "left")
  }

  if(!missing(clust_result)) {
    res <- plyr::join(res, clust_result, type = "left")
  }

  if(!missing(blast_result)) {
    blast_result <- blast_result[c("SequenceID", "Annotation", "Length", "BlastEvalue")]
    res <- plyr::join(res, blast_result, type = "left")
    group <- data.frame(FeatureList=c(by(res$Annotation, res$sequence, function(x)paste(x, collapse=","))))
    group$sequence <- row.names(group)
    group <- plyr::join(group, res, type = "full", match="first")
    group <- group[-which(names(group)=="Annotation")]
    # move featureList to last column
    group <- group[,c(which(names(group)!="FeatureList"),which(names(group)=="FeatureList"))]
    # move sequenceID to first column
    group <- group[,c(which(names(group)=="SequenceID"),which(names(group)!="SequenceID"))]
    group$Length <- nchar(group$sequence)
    row.names(group) <- group$sequence
    res <- group[-which(names(group)=="sequence")]
  }

  if("Pvalue" %in% colnames(res)) {
    res <- res[order(res$Pvalue),]
  }
  return(res)
}

#' Compute mean and standard deviation of normalized counts for each condition
#'
#' @param count_table A table of (normalized) sequence counts.
#' A sequence count table can be generated by \link[DEUS]{createCountTableFromFastQs} and normalized by applying \link[DEUS]{runDESeq2}
#' @param pheno_info A data frame with sample names as row names and one or more condition columns.
#' @param condition_col Name of the column that should be used as the condition to group the samples.
#' @return A data frame with sequences as row names and the mean and standard deviation for each condition defined in pheno_info
#' @export

getConditionCountStats<-function(count_table, pheno_info, condition_col = "condition"){

  #get group means
  groups = unique(pheno_info[[condition_col]])
  for(type in groups){
    cols  = row.names(pheno_info)[which(pheno_info[[condition_col]]==type)]
    subset= count_table[,cols]
    count_table=cbind(count_table,rowMeans(subset),data.frame(apply(subset,1,sd)))
    names(count_table)[ncol(count_table)-1]=paste("NormCounts",type,"Mean",sep="_")
    names(count_table)[ncol(count_table)]=paste("NormCounts",type,"Sd",sep="_")
  }
  index=length(groups)*2
  #return only mean & sd columns
  return(count_table[,c((ncol(count_table)-index+1):ncol(count_table))])
}

#' Count occurrences of feature classes
#'
#' As each unique sequence can have a lot of BLAST hits this function counts the number of hits grouped by user defined feature classes.
#'
#' @param summary Summary table generated by \link[DEUS]{mergeResults} (requires BLAST result generated by \link[DEUS]{runBlast})
#' @param feature_classes List of features representing classes to be counted.
#' Features can be defined as regular expressions, as described in \link[stringi]{stringi-search-regex}.
#' @return The summary table with additional columns for each feature term given by the user.
#' Each column holds the number of features in 'FeatureList' that match the corresponding feature term.
#' The column 'Other' reflects the number of all features in the 'FeatureList' after substracting the sum of feature class matches.
#' @export

addCountsOfFeatureClasses<- function(summary, feature_classes) {
  if(!("FeatureList" %in% names(summary))) stop('Feature classes can only be counted if blast results have been merged to DE results!')
  res <- summary
  v_features <- strsplit(paste(summary$FeatureList),",")
  sum <- 0
  for(i in feature_classes) {
    #res[i] <- sum(stringr::str_detect(string=v_features[[1]], i))
    tmp_count <- lapply(v_features,stringr::str_detect,pattern=i)
    res[i] <- t(as.data.frame(lapply(tmp_count,sum)))
    sum <- sum + res[[i]]
  }
  res$"Other" <- as.numeric(lapply(v_features, function(x) length(x[! x == "NA" ]))) - sum
  res <- res[,c(which(names(res)!="FeatureList"),which(names(res)=="FeatureList"))]

  if("Pvalue" %in% colnames(res)) {
    res <- res[order(res$Pvalue),]
  }
  return(res)
}

#' Write summary tables for DEUS result
#'
#' @param summary Summary table generated by \link[DEUS]{mergeResults} (requires BLAST result generated by \link[DEUS]{runBlast})
#' @param out_dir Directory where summary tables are written
#' @param expressed_only If true (default), only sequences with an IHWPvalue are provided in the output file. Sequences without IHWPvalue were not part of the DE analysis and therefore are assumed to have low expression values. If filtering is turned off, output files might include a large number of rows.
#' @return Files are written to the output directory. 'SummaryTable.tsv' corresponds to the table generated by \link[DEUS]{mergeResults}.
#' The table is split into a file 'SummaryTable_withBlast.tsv' and a file 'SummaryTable_noBlast.tsv' representing the sequences that have a BLAST hit and those with no BLAST hit.
#' @export

writeSummaryFiles <- function(summary, out_dir, expressed_only=TRUE) {

  #Remove lines without IHWPvalue. Indicates that sequence expression is below cutoff used during DE analysis
  if(expressed_only==TRUE){
    summary <- summary[!is.na(summary$IHWPvalue),]
  }

  summary$Sequence <- row.names(summary)
  summary <- summary[,c(ncol(summary),1:ncol(summary)-1)]
  write.table(summary, paste(out_dir, "SummaryTable.tsv", sep="/"), sep="\t", quote=F, row.names=F, col.names=T)

  filtered <- summary[!summary$FeatureList=="NA",]
  write.table(filtered, paste(out_dir, "SummaryTable_withBlast.tsv", sep="/"), sep="\t", quote=F, row.names=F, col.names=T)

  filtered <- summary[summary$FeatureList=="NA" | is.na(summary$FeatureList),]
  write.table(filtered, paste(out_dir,"SummaryTable_noBlast.tsv",sep="/"), sep="\t", quote=F,row.names=F,col.names=T)

}

#' Create a vector of sequence identifiers and sequences in FASTA format
#'
#' @param de_result A data frame with sequences as row names.
#' In the context of DEUS, the data frame is the resulting table of differential expression analysis with significant sequences as row names.
#' Differential expression analysis can be performed via \link[DEUS]{runDESeq2}
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @return A vector of alternating sequence identifiers and nucleotide sequences
#' @export

sequencesAsFasta <- function(de_result, map) {
  res <- as.vector(rbind(paste(">",map[row.names(de_result),1],sep=""),row.names(de_result)))
  return(res)
}

#' Remove temporary FASTA file required for clustering
#'
#' This function is internally called by \link[DEUS]{runClustering}
#'
#' @param out_dir Directory where output files of clustering are saved
#' @return Deletes the file sig_sequences.fa
#' @export

deleteTmp <- function(out_dir){
  tmp <- paste(out_dir,"sig_sequences.fa",sep="/")
  if(file.exists(tmp)){
    file.remove(tmp)
  }
}

#' Aggregates sequence counts for each sequence cluster
#'
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @param cl_count_table Count table created by \link[DEUS]{createCountTableFromFastQs}
#' @param clust_result Dataframe containing clustering results as created by \link[DEUS]{runClustering}
#' @return A dataframe that includes sequence counts for each sequence cluster. Can be used as input for DE analysis
#' @export

mergeAndAggregate <- function(map, cl_count_table, clust_result){
  #Add SequenceID
  cl_count_table$"SequenceID" <- map[row.names(cl_count_table),]

  #Add ClusterID
  cl_count_table <- plyr::join(cl_count_table,clust_result)

  #Remove extra cols before aggregation
  cl_count_table <- cl_count_table[ , -which(names(cl_count_table) %in% c("SequenceID"))]

  #Sum counts by cluster
  cl_counts <- aggregate(cl_count_table[-(ncol(cl_count_table))], by=list(Category=cl_count_table$ClusterID), FUN=sum)

  #Rownames indicate ClusterID
  row.names(cl_counts) <- cl_counts$Category
  cl_counts <- cl_counts[ , -which(names(cl_counts) %in% c("Category"))]

  return(cl_counts)
}


#' Merges clustering results as well as DE results for single sequence and cluster approach
#'
#' @param cl_sig_results A data frame as created by \link[DEUS]{runDESeq2}. Includes DE pvalues for each cluster
#' @param clust_result Dataframe containing clustering results as created by \link[DEUS]{runClustering}
#' @param sig_results A data frame as created by \link[DEUS]{runDESeq2}. Includes DE pvalues for each individual sequence
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @return A combined data frame which includes all pvalues and clusterIDs
#' @export

mergeSingleAndClusterResults <- function(cl_sig_results, clust_result, sig_results, map){
  #Adjust map to work here
  map$sequences <- row.names(map)
  names(map)[1]="SequenceID"

  #Add prefix for cl_sig_results to ensure unique columns
  names(cl_sig_results)=paste("Cl_",names(cl_sig_results),sep="")

  #Add ClusterID column for joining
  cl_sig_results$ClusterID <- row.names(cl_sig_results)

  #combine cl_deseq and clustering results
  cl_sig_results <- plyr::join(cl_sig_results,clust_result,type="inner",by="ClusterID")

  #Combine both deseq results
  sig_results$"SequenceID" <- map[row.names(sig_results),"SequenceID"]
  sig_results <- plyr::join(sig_results,cl_sig_results,type="full",by="SequenceID")

  #Add sequence as row name
  sig_results <- plyr::join(sig_results,map,type="inner",by="SequenceID")
  rownames(sig_results) <- sig_results$sequences
  sig_results <- sig_results[ , -which(names(sig_results) %in% c("sequences"))]

  return(sig_results)
}

#' Prints a summary of the number of significant sequences and clusters
#'
#' @param summary Summary table as created by \link[DEUS]{mergeResults}
#' @param sig_threshold Significance threshold used for filtering according to the IHW P-value of the sequences and clusters
#' @return Prints summary values to standard output.
#' @export

printClusterSummary <- function(summary, sig_threshold = 0.05) {
  sig_seq_summary <- summary[(!is.na(summary$IHWPvalue) & summary$IHWPvalue < sig_threshold),]
  sig_seqs <- length(sig_seq_summary$SequenceID)
  sig_seqs_not_in_cluster <- length(sig_seq_summary[is.na(sig_seq_summary$ClusterID),1])
  clust_incl_sig_seqs <- length(unique(sig_seq_summary$ClusterID))

  #test if Cluster cols exist
  if("Cl_IHWPvalue" %in% colnames(summary)){
    sig_clust_summary <- summary[(!is.na(summary$Cl_IHWPvalue) & summary$Cl_IHWPvalue < sig_threshold),]
    sig_clusts <- length(unique(sig_clust_summary$ClusterID))
    sig_clust_sig_seq <- sig_clust_summary[(!is.na(sig_clust_summary$IHWPvalue) & sig_clust_summary$IHWPvalue < sig_threshold),]
    sig_clust_no_sig_seq <- sig_clust_summary[!(!is.na(sig_clust_summary$IHWPvalue) & sig_clust_summary$IHWPvalue < sig_threshold),]
    sig_clust_no_sig_seq_low_exp <- sig_clust_no_sig_seq[is.na(sig_clust_no_sig_seq$IHWPvalue),]
    sig_clust_no_sig_seq_weak_DE <- sig_clust_no_sig_seq[!is.na(sig_clust_no_sig_seq$IHWPvalue) & !(sig_clust_no_sig_seq$IHWPvalue < sig_threshold),]
    sig_seq_no_sig_clust <- summary[(!is.na(summary$IHWPvalue) & summary$IHWPvalue < sig_threshold) & (!is.na(summary$Cl_IHWPvalue) & summary$Cl_IHWPvalue >= sig_threshold),]
    clusters_no_sig_seq <- length(setdiff(unique(sig_clust_summary$ClusterID), unique(sig_seq_summary$ClusterID)))
  }

  writeLines("## Sequence summary ##")
  writeLines(sprintf("Significant sequences: %d", sig_seqs))
  writeLines(sprintf("Significant sequences not clustered: %d", sig_seqs_not_in_cluster))
  if(exists("sig_clust_summary")) {
    writeLines(sprintf("Sequences in significant clusters: %d", length(sig_clust_summary$SequenceID)))
    writeLines(sprintf("Non-significant sequences in significant clusters: %d", length(sig_clust_no_sig_seq$SequenceID)))
    writeLines(sprintf("- too low expressed: %d", length(sig_clust_no_sig_seq_low_exp$SequenceID)))
    writeLines(sprintf("- too weak differentially expressed: %d", length(sig_clust_no_sig_seq_weak_DE$SequenceID)))
    writeLines(sprintf("Significant sequences in non-significant clusters: %d", length(sig_seq_no_sig_clust$SequenceID)))
  }

  writeLines("## Cluster summary ##")
  if(exists("sig_clust_summary")) {
    writeLines(sprintf("Significant clusters: %d", sig_clusts))
  }
  writeLines(sprintf("Clusters including significant sequences: %d", clust_incl_sig_seqs))
  if(exists("sig_clust_summary")) {
    writeLines(sprintf("Significant clusters including no significant sequences: %d", clusters_no_sig_seq))
  }
}