R/clustering.R

#*  Copyright (C) 2018 the DEUS contributors.
#*  Website: https://github.com/timjeske/DEUS
#*
#*  This file is part of the DEUS R package.
#*
#*  The DEUS R package is free software: you can redistribute it and/or modify
#*  it under the terms of the GNU General Public License as published by
#*  the Free Software Foundation, either version 3 of the License, or
#*  (at your option) any later version.
#*
#*  This program is distributed in the hope that it will be useful,
#*  but WITHOUT ANY WARRANTY; without even the implied warranty of
#*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#*  GNU General Public License for more details.
#*
#*  You should have received a copy of the GNU General Public License
#*  along with this program.  If not, see <http://www.gnu.org/licenses/>.



#' Run CD-HIT clustering
#'
#' Sequentially, each sequency is either assinged to an existing cluster or is classified as a new
#' cluster representative if no matching cluster can be found.
#'
#' @param cdhit_path Path to cd-hit-est executable
#' @param sequences Vector of sequences in FASTA style generated by the \link[DEUS]{sequencesAsFasta}
#' @param out_dir Directory to save output files of clustering
#' @param identity_cutoff Sequence identity cutoff used for clustering
#' @param length_cutoff Length difference cutoff
#' @param wordlength CD-Hit word length
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @param write_fastas Boolean that indicates whether a fasta file will be generated for each cluster
#' @param optional Optional execution parameters
#' @return A data frame with the columns 'SequenceID' and 'ClusterID' assigning each sequence to a cluster of similar sequences via their identifiers.
#' Additionally, a file CD-HIT.fa, CD-HIT.fa.clstr and a folder Clusters is generated in the given output directory.
#' The CD-HIT.fa file is the FASTA file of all cluster representatives.
#' The CD-HIT.fa.clustr file lists all identified clusters and the assigned sequence identifiers together with the percentage of overlapping sequence with the cluster representative.
#' In the Clusters directory there is a FASTA file for each cluster.
#' @export

runClustering <- function(cdhit_path, sequences, out_dir, identity_cutoff, length_cutoff, wordlength, map, write_fastas=FALSE, optional="") {
  # tmp fasta file
  seq <- paste(out_dir, "sig_sequences.fa", sep="/")
  write.table(sequences, seq,quote = F, row.names = F, col.names = F)
  # run clustering
  clusters <- paste(out_dir,"CD-HIT.fa",sep="/")
  command <- paste("-i",seq,"-c",identity_cutoff,"-s",length_cutoff,"-n",wordlength,"-g 1",optional,"-o",clusters,sep=" ")
  message("CD-HIT configuration:")
  message(paste(cdhit_path,command))
  cluster.out <- system2(cdhit_path,command, stdout=TRUE)
  # remove tmp files
  deleteTmp(out_dir)

  # generate result
  out.df <- processClusters(map, clusters, out_dir, write_fastas)
  out.df["sequences"] <- NULL
  names(out.df)<-c("SequenceID","ClusterID")
  row.names(out.df)=out.df$SequenceID
  return(out.df)
}

#' Processes CD-HIT result file
#'
#' This function is internally called by \link[DEUS]{runClustering}
#'
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @param clusters The path to CD-HIT.fa
#' @param out_dir Directory of CD-HIT result file and where generated files will be saved
#' @param write_fastas Boolean that indicates whether a fasta file will be generated for each cluster
#' @return A data frame with the columns 'qseqid', 'cl_id' and 'sequences' containing the sequence identifier, the sequence and the assigned cluster identifier.
#' @export
processClusters <- function(map, clusters, out_dir, write_fastas) {

  #Get cluster file
  cluster.file <- paste(clusters,".clstr",sep="")

  #Read file and set -1 as default cluster
  clst <- readLines(cluster.file)
  clst.index <- unlist(lapply(clst,startsWith,prefix=">"))
  clst.df <- as.data.frame(clst)
  blank <- -1
  clst.df$"cl_id" <- blank

  #Set correct cluster ids for first cluster member
  clst.df[clst.index,2] <- as.numeric(gsub(">Cluster ","",clst.df[clst.index,1]))

  #Fill correct cluster ids for remaining members
  cl_ids <- clst.df$cl_id
  isnotblank <- cl_ids != blank
  cl_ids <- cl_ids[which(isnotblank)][cumsum(isnotblank)]
  clst.df$cl_id <- cl_ids

  #Grep sequence id
  seqIDsIndex=lapply(clst.df$clst,regexpr,pattern="seq_[0-9]+",perl=TRUE)
  clst.df$SequenceID[!clst.index] <- unlist(regmatches(clst.df$clst,seqIDsIndex))
  clst.df <- clst.df[-1]
  clst.df <- clst.df[!is.na(clst.df$SequenceID),]

  map$sequences<-row.names(map)
  out.df<-merge(x = clst.df, y = map, by = "SequenceID", all.x = TRUE)

  #Write fasta for each cluster if required
  if(write_fastas){
    cl.dir <- paste(out_dir, "Clusters",sep="/")
    dir.create(cl.dir, showWarnings = FALSE)
    by(out.df,out.df$cl_id,printCluster,cl_dir=cl.dir)
  }

  return(out.df)
}


#' Print FASTA file for cluster of sequences
#'
#' This function is internally called by \link[DEUS]{processClusters}
#'
#' @param data Subset of overall data provided within the \link[DEUS]{processClusters} function
#' @param cl_dir Directory of CD-HIT result file and where generated files will be saved
#' @return A FASTA file in the Clusters folder in cl_dir representing the sequences of a cluster.
#' @export

printCluster<-function(data,cl_dir){
  cl_id<-data[1,2]
  outfile <- paste(cl_dir,"/Cluster",cl_id,".fa",sep="")
  out<-paste(">",data[,1],"\n",data[,3],sep="")
  write.table(out,outfile,col.names=F,row.names=F,quote=F)
}
timjeske/DEUS documentation built on June 6, 2019, 12:59 p.m.