#* Copyright (C) 2018 the DEUS contributors.
#* Website: https://github.com/timjeske/DEUS
#*
#* This file is part of the DEUS R package.
#*
#* The DEUS R package is free software: you can redistribute it and/or modify
#* it under the terms of the GNU General Public License as published by
#* the Free Software Foundation, either version 3 of the License, or
#* (at your option) any later version.
#*
#* This program is distributed in the hope that it will be useful,
#* but WITHOUT ANY WARRANTY; without even the implied warranty of
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#* GNU General Public License for more details.
#*
#* You should have received a copy of the GNU General Public License
#* along with this program. If not, see <http://www.gnu.org/licenses/>.
#' Run CD-HIT clustering
#'
#' Sequentially, each sequency is either assinged to an existing cluster or is classified as a new
#' cluster representative if no matching cluster can be found.
#'
#' @param cdhit_path Path to cd-hit-est executable
#' @param sequences Vector of sequences in FASTA style generated by the \link[DEUS]{sequencesAsFasta}
#' @param out_dir Directory to save output files of clustering
#' @param identity_cutoff Sequence identity cutoff used for clustering
#' @param length_cutoff Length difference cutoff
#' @param wordlength CD-Hit word length
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @param write_fastas Boolean that indicates whether a fasta file will be generated for each cluster
#' @param optional Optional execution parameters
#' @return A data frame with the columns 'SequenceID' and 'ClusterID' assigning each sequence to a cluster of similar sequences via their identifiers.
#' Additionally, a file CD-HIT.fa, CD-HIT.fa.clstr and a folder Clusters is generated in the given output directory.
#' The CD-HIT.fa file is the FASTA file of all cluster representatives.
#' The CD-HIT.fa.clustr file lists all identified clusters and the assigned sequence identifiers together with the percentage of overlapping sequence with the cluster representative.
#' In the Clusters directory there is a FASTA file for each cluster.
#' @export
runClustering <- function(cdhit_path, sequences, out_dir, identity_cutoff, length_cutoff, wordlength, map, write_fastas=FALSE, optional="") {
# tmp fasta file
seq <- paste(out_dir, "sig_sequences.fa", sep="/")
write.table(sequences, seq,quote = F, row.names = F, col.names = F)
# run clustering
clusters <- paste(out_dir,"CD-HIT.fa",sep="/")
command <- paste("-i",seq,"-c",identity_cutoff,"-s",length_cutoff,"-n",wordlength,"-g 1",optional,"-o",clusters,sep=" ")
message("CD-HIT configuration:")
message(paste(cdhit_path,command))
cluster.out <- system2(cdhit_path,command, stdout=TRUE)
# remove tmp files
deleteTmp(out_dir)
# generate result
out.df <- processClusters(map, clusters, out_dir, write_fastas)
out.df["sequences"] <- NULL
names(out.df)<-c("SequenceID","ClusterID")
row.names(out.df)=out.df$SequenceID
return(out.df)
}
#' Processes CD-HIT result file
#'
#' This function is internally called by \link[DEUS]{runClustering}
#'
#' @param map A data frame with sequences as row names and sequence identifiers in first column.
#' Can be generated by \link[DEUS]{createMap}
#' @param clusters The path to CD-HIT.fa
#' @param out_dir Directory of CD-HIT result file and where generated files will be saved
#' @param write_fastas Boolean that indicates whether a fasta file will be generated for each cluster
#' @return A data frame with the columns 'qseqid', 'cl_id' and 'sequences' containing the sequence identifier, the sequence and the assigned cluster identifier.
#' @export
processClusters <- function(map, clusters, out_dir, write_fastas) {
#Get cluster file
cluster.file <- paste(clusters,".clstr",sep="")
#Read file and set -1 as default cluster
clst <- readLines(cluster.file)
clst.index <- unlist(lapply(clst,startsWith,prefix=">"))
clst.df <- as.data.frame(clst)
blank <- -1
clst.df$"cl_id" <- blank
#Set correct cluster ids for first cluster member
clst.df[clst.index,2] <- as.numeric(gsub(">Cluster ","",clst.df[clst.index,1]))
#Fill correct cluster ids for remaining members
cl_ids <- clst.df$cl_id
isnotblank <- cl_ids != blank
cl_ids <- cl_ids[which(isnotblank)][cumsum(isnotblank)]
clst.df$cl_id <- cl_ids
#Grep sequence id
seqIDsIndex=lapply(clst.df$clst,regexpr,pattern="seq_[0-9]+",perl=TRUE)
clst.df$SequenceID[!clst.index] <- unlist(regmatches(clst.df$clst,seqIDsIndex))
clst.df <- clst.df[-1]
clst.df <- clst.df[!is.na(clst.df$SequenceID),]
map$sequences<-row.names(map)
out.df<-merge(x = clst.df, y = map, by = "SequenceID", all.x = TRUE)
#Write fasta for each cluster if required
if(write_fastas){
cl.dir <- paste(out_dir, "Clusters",sep="/")
dir.create(cl.dir, showWarnings = FALSE)
by(out.df,out.df$cl_id,printCluster,cl_dir=cl.dir)
}
return(out.df)
}
#' Print FASTA file for cluster of sequences
#'
#' This function is internally called by \link[DEUS]{processClusters}
#'
#' @param data Subset of overall data provided within the \link[DEUS]{processClusters} function
#' @param cl_dir Directory of CD-HIT result file and where generated files will be saved
#' @return A FASTA file in the Clusters folder in cl_dir representing the sequences of a cluster.
#' @export
printCluster<-function(data,cl_dir){
cl_id<-data[1,2]
outfile <- paste(cl_dir,"/Cluster",cl_id,".fa",sep="")
out<-paste(">",data[,1],"\n",data[,3],sep="")
write.table(out,outfile,col.names=F,row.names=F,quote=F)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.