R/uprot.R

#' uprot
#'
#' Identifies and extracts a set of 40 prokaryotic universal proteins used as phylogenetic markers.
#' @param path is the full path to where genome annotations in fasta format are placed.
#' @param pattern a pattern (like '.faa') for recognizing the annotation files.
#' @param outdir a name for the output directory that will be created for results.
#' @param align takes a logical (default, TRUE) indicating if multiple sequence alignment is performed.
#' @param phylogeny takes 'NJ' (default) for a Neighbor-Joining tree, 'ML' for a Maximum-Likelihood tree or 'NO' for avoiding tree reconstruction.
#' @param proc is the number of threads used for protein searches.
#' @keywords ribosomal proteins universal markers
#' @export
#' @examples
#' uprot(path='./prodigal_output',pattern='.faa',outdir='uprot_output',proc=2)

uprot<-function(pattern='.faa',
			    path='.',
			    outdir='uprot_output',
			    align=TRUE,
			    phylogeny='NJ',
			    proc=2)
			   
			    {

	# Options #
	
	options(getClass.msg=FALSE)
	gw<-getwd()
	os<-.getOS()
	
	# Dependencies #
	
	suppressMessages(require(seqinr,quietly=T))
	suppressMessages(require(foreach,quietly=T))
	suppressMessages(require(doMC,quietly=T))
	#suppressMessages(require(msa,quietly=T))
	suppressMessages(require(plyr,quietly=T))
	suppressMessages(require(ape,quietly=T))


	
	# Internal functions #
	
	chunker<-function(m,n){

		s<-split(m,cut(seq_along(m),n,labels=F))
		return(s)
	}
				    	
	# Select OS #
	
	if (os=='linux'){
	
		clustalo<-paste(system.file('clustalo',package='taxxo'),'/linux/clustalo',sep='')
		hmmsearch<-paste(system.file('hmmer3',package='taxxo'),'/linux/hmmsearch',sep='')
		bldbcd<-paste(system.file('blast',package='taxxo'),'/linux/blastdbcmd',sep='')
		mkbldb<-paste(system.file('blast',package='taxxo'),'/linux/makeblastdb',sep='')
		fasttree<-paste(system.file('fasttree',package='taxxo'),'/linux/FastTree',sep='')
		
	} else if (os=='darwin'){
		
		clustalo<-paste(system.file('clustalo',package='taxxo'),'/darwin/clustalo',sep='')
		hmmsearch<-paste(system.file('hmmer3',package='taxxo'),'/darwin/hmmsearch',sep='')
		bldbcd<-paste(system.file('blast',package='taxxo'),'/darwin/blastdbcmd',sep='')
		mkbldb<-paste(system.file('blast',package='taxxo'),'/darwin/makeblastdb',sep='')
		fasttree<-paste(system.file('fasttree',package='taxxo'),'/darwin/FastTree',sep='')
		
	} else {
		
		stop('ERROR: Unknown OS, unable to proceed.')
	}

	# List HMMs and genome annotations
	
	phmm<-paste(system.file('exdata',package='taxxo'),'/uprot',sep='')
	thrs<-paste(system.file('exdata',package='taxxo'),'/uprot/tresholds.csv',sep='')
	
	flist<-gsub('//','/',list.files(path=path,pattern=pattern,full.names=T))
	fnams<-list.files(path=path,pattern=pattern)		
	hmms<-list.files(phmm,pattern='.hmm',full.names=T)
	hmm2<-list.files(phmm,pattern='.hmm')
	
	# Find proteins
	
	result<-matrix(ncol=length(hmms),nrow=length(flist),NA)
	colnames(result)<-gsub('.hmm','',hmm2)
	rownames(result)<-gsub(pattern,'',fnams)
	
	tresholds<-read.table(thrs,sep='\t',header=F,skip=1)
	
	for (f in 1:length(flist)){
		
			genome<-gsub(pattern,'',fnams[f])
					
		for (h in 1:length(hmms)){
			
			model<-gsub('.hmm','',hmm2[h])
			
			if (model%in%tresholds$V1){
			
				thr<-tresholds[which(tresholds[,1]==model),2]
			
				cmd<-paste(hmmsearch,
				           ' --noali -T ',thr,
				           ' --cpu ',proc,
				           ' -o search_tmp',
			    	       ' --domtblout out_tmp',
			        	   ' ',hmms[h],' ',flist[f],sep='')

			} else {
				
				cmd<-paste(hmmsearch,
				           ' --noali',
				           ' --cpu ',proc,
				           ' -o search_tmp',
			    	       ' --domtblout out_tmp',
			        	   ' ',hmms[h],' ',flist[f],sep='')
			}
			
			system(cmd)
			
			rl<-readLines('out_tmp')
			rl<-rl[which(!grepl("\\#",rl))]
			rl<-gsub('[ ]+',' ',rl)
			
			system('rm -rf out_tmp search_tmp')
			
			lr<-length(rl)
			
			if (lr>0){
			
				lst<-strsplit(rl,' ')
					
				hit<-sapply(lst,function(x){x[1]})
  				pfmID<-sapply(lst,function(x){x[2]})
  				query<-sapply(lst,function(x){x[4]})
  				evalu<-as.numeric(sapply(lst,function(x){x[13]}))
  				score<-as.numeric(sapply(lst,function(x){x[14]}))
  			
				htab<-data.frame(Query=query,
								 Hit=hit,
								 PfamID=pfmID,
								 Evalue=evalu,
								 Score=score,
        	                  	 stringsAsFactors=F)
                          	 
        	    		dimi<-dim(htab)[1]
  			
  				if (dimi>1){
  				
  					sc<-htab$Score
  				
  					if ((sc[1]!=sc[2]) & dimi==2){
  				
  						maxi<-max(htab$Score)
  						gene<-as.vector(htab[which(htab$Score==maxi),2])
  						
  					} else {
  						
  						gene<-as.vector(htab[1,2])
  					}
  				
  				} else if (dimi==1){
  				
  					gene<-as.vector(htab[1,2])
  				}
  				
  				result[f,h]<-gene
  			}
		}
	}
  			
  	system(paste('mkdir',outdir))
  			
  	presence_absence_uprot<-result		
  	save(presence_absence_uprot,file='presence_absence_uprot.Rdata')				
 	system(paste('mv presence_absence_uprot.Rdata',outdir),ignore.stdout=T)
	
 	# Make databases
 	
	cmd2<-paste('cat ',paste(flist,collapse=' '),' > all_genomes_uprot.faa',sep='')
	system(cmd2)
	
	cmd3<-paste(mkbldb,
		    ' -in all_genomes_uprot.faa',
		    ' -dbtype prot -hash_index -parse_seqids -title',
		    ' uprotdb -out uprotdb',sep='')
	system(cmd3,ignore.stdout=T)
	system('rm -rf all_genomes_uprot.faa')
	
 	# Extract sequences
 		 		
 	uprots<-colnames(presence_absence_uprot)
 	genoms<-rownames(presence_absence_uprot)
 		
 	for (u in 1:length(uprots)){
 			
 		outfile<-paste(uprots[u],'.uprot.faa',sep='')
		#absents<-paste(uprots[u],'.absent',sep='')
 			
 		prots<-as.vector(presence_absence_uprot[,u])
			
		for (p in 1:length(prots)){
			
			system(paste('touch',outfile))

			prot<-prots[p]
				
			if (is.na(prot)==F){
					
				cmd<-paste(bldbcd,' -entry ',prot,' -db uprotdb >> ',outfile,sep='')
					
				system(cmd)
					
			} else {
				
				cat(paste('>',genome[p],'_absent',sep=''),sep='\n',file=outfile,append=T)
				#cat(paste(genoms[p],'_absent','\t',p,sep=''),sep='\n',file=outfile,append=T)
				cat(c2s(rep('-',10)),sep='\n',file=outfile,append=T)
			}
		}
	}
 		
	# Align sequences
	
	if (align==TRUE){
 		
 		multi<-list.files(pattern='.uprot.faa')
 		
 		for (m in multi){
			
			fasta<-read.fasta(m)
			sequs<-lapply(getSequence(fasta),toupper)
			rline<-readLines(m)
			namos<-gsub('>','',rline[grep('>',rline)])
			
			whn<-which(lapply(lapply(sequs,unique),length)==1)
			whs<-setdiff(1:length(namos),whn)
			
			sequs[whn]<-sequs[whs[[1]]]
			namos2<-paste('n',rep(1:length(namos)),sep='')
			
			evalu<-unique(unlist(lapply(sequs,length)))
			leval<-length(evalu)
			
			if (leval>1){
				
				write.fasta(sequs,names=namos2,file='auxalign.fa')
			
				cmd<-paste(clustalo,
					   ' -i auxalign.fa -o auxalign.ali ',
					   '--threads ',proc,
					   ' --output-order=input-order',
					   sep='')
				
				system(cmd)
				
				#aux<-capture.output(
				#alignment<-msa(inputSeqs='auxalign.fa',method='ClustalOmega',type='protein',order='input'))
				#aliconver<-msaConvert(alignment,type='seqinr::alignment')
			
				fali<-read.fasta('auxalign.ali')
				
				sequs2<-lapply(getSequence(fali),toupper)
				lesequ<-length(sequs2[[1]])
				gaps<-rep('-',lesequ)
			
				sequs2[whn]<-replicate(length(whn),list(gaps))
			
				write.fasta(sequs2,names=namos,file=gsub('.faa','.ali',m))
				
				system('rm -rf auxalign.*')
				
			} else {
				
				lesequ<-length(sequs[whs][[1]])
				gaps<-rep('-',lesequ)
				
				sequs[whn]<-replicate(length(whn),list(gaps))
				
				write.fasta(sequs,names=namos,file=gsub('.faa','.ali',m))
			}	
		}

 		# Concatenate alignment
 		
 		alis<-list.files(pattern='.uprot.ali')
 		
 		catmat<-NULL
 		
 		for (a in alis){
			
			if (file.info(a)$size>0){ 
 			
				fasta<-read.fasta(a)
 				sequs<-lapply(getSequence(fasta),toupper)
 				smatx<-do.call(rbind,sequs)
 			
 				catmat<-cbind(catmat,smatx)
 				catlis<-alply(catmat,1)
 		
				nams<-gsub(pattern,'',fnams)
				#nams<-gsub('>','',system(paste('grep ">" ',a,sep=''),intern=T))
				#nams<-getName(fasta)
 				#nams<-unlist(lapply(nams,function(x){strsplit(x,'_')[[1]][1]}))
 			
 				write.fasta(catlis,names=nams,file='universal_proteins.ali')
			}
 		}
 	
		system(paste('mv *uprot.faa',outdir))
		system(paste('mv *uprot.ali',outdir))
		system(paste('mv universal_proteins.ali',outdir))
		system('rm -rf uprotdb*')
	
	} else {
		
		system(paste('mv uprot.faa',outdir))
		system('rm -rf uprotdb*')
	}
			
	if (align==TRUE & phylogeny!='NO'){
		
		setwd(outdir)
		
		if (length(flist)<3){
			
			stop('Makes sense to build a tree with two tips?')
			
		} else {
		
			if (phylogeny=='NJ'){
		
				upali<-read.fasta('universal_proteins.ali')
				asali<-as.alignment(upali)
				dista<-dist.alignment(asali)^2			
				tre<-nj(dista)

				write.tree(tre,file='NJ.uprot.tree.nwk')
					
			} else if (phylogeny=='ML'){
			
				cmd<-paste(fasttree,'universal_proteins.ali > ML.uprot.tree.nwk')
				
				system(cmd,ignore.stderr=T)
			
			} else {
				
				stop('Parameter "phylogeny" must be set to NJ or ML')
			}
		}
	
	} else if (align==FALSE & phylogeny!='NO'){
		
		stop('For tree building both "align" and "phylogeny" must be set!')
	}
	
 	setwd(gw)
}
giraola/taxxo documentation built on May 17, 2019, 5:25 a.m.