R/orf.lcam.R

#--code for processing the output of LCAMapper when applied to ORF sequences from a metagenome assembly--
#--RW on 30 October 2015--
#--see vignette document for comments on reading in gff files from MetaGeneMark or similar programs--difficult to standardise this step!

#--FUNCTIONS FOR HANDLING AND PROCESSING .TAX OUTPUT--
#--here 'tax' refers to the .tax output from LCAMapper
#--this function reads a *-tax.txt file...set to export in NAMESPACE
read.tax<-function(file)
{
 #--read in a .tax file, named as 'file'
 filecon<-file(description=file,open="rt")
 taxScan<-readLines(con=filecon,n=-1)
 close(filecon)
 return(taxScan)
}

#--function strips everything after the pipe character in the ORF-annotation string
#--internal
strip.after.first.pipe<-function(S)
{
 #--'S' is a character vector
 res<-sapply(strsplit(S,"|",fixed=T),FUN=function(x){x[[1]][1]})
 return(res)
}

#--we also need a function to in a ORF .fasta file from MetaGeneMark and process all the information we need (ORFs,contigs)
#--formatting on '>' headers of 'orf.fa' may change, so keep an eye on performance and errors!
#--set to export in NAMESPACE
process.orf.fasta.file<-function(orf.fa,mainSplitString)
{
 #--'orf.fa' is a .fasta file generated by MetaGeneMark, containing the DNA sequence from each predicted ORF.
 #--'mainSplitString' gives you flexibility about how to split up the header...seems to be some variability in how MetaGeneMark encodes this?
 #--process our information about ORFs and contigs...and return a (contig-named) list whose elements are vectors of ORF-ids
 orfSet<-Biostrings::readDNAStringSet(filepath=orf.fa)
 headerSplitList<-strsplit(names(orfSet),mainSplitString)
 orfSet.contigids<-sapply(headerSplitList,FUN=function(x){strsplit(x[2]," ")[[1]][1]})
 orfSet.contiglen<-sapply(headerSplitList,FUN=function(x){as.numeric(substring(strsplit(x[2]," ")[[1]][2],8))})
 orfSet.orfids<-sapply(headerSplitList,FUN=function(x){strsplit(x[1],"|",fixed=T)[[1]][1]})
 combTable<-data.frame(contig=orfSet.contigids,len=orfSet.contiglen,orf=orfSet.orfids,stringsAsFactors=F)
 return(combTable)
}

#--write a function to process these data into a node-edge list...
#--internal
process.tax.to.nel<-function(taxScan)
{
 #--'taxScan' is an imported (readLines) version of a .tax output from LCAmapper (see 'read.tax' function above)
 #--entities (reads,ORFs) that do not have annotations are returned to res$null in this function
 res<-list(nel=NULL,ew=NULL,null=NULL)
    
 #--extract the read names...
 taxScanList<-strsplit(taxScan,";")
 taxScan.reads<-sapply(taxScanList,FUN=function(x){x[1]})
 #--and the annotations into a character vector; odd elements are taxa, even elements are percentages
 taxScanAnnotationList<-lapply(taxScanList,FUN=function(x){gdata::trim(x[c(-1,-2)])})
 
 for(r in (1:length(taxScan.reads)))
 {
  curread<-taxScan.reads[r]
  curset<-taxScanAnnotationList[[r]]
  curlen<-length(curset)
  if(curlen>0)
  {
   curanno<-curset[seq(1,curlen,2)]
   curweights<-curset[seq(2,curlen,2)]
   res$nel<-rbind(res$nel,cbind(first=rep(curread,length(curanno)),second=curanno))
   res$ew<-c(res$ew,as.numeric(curweights))
  }
  else
  if(curlen==0)
  {
   res$null<-c(res$null,curread)
  }
 }
 return(res)
}

#--this is a variant of 'strip.after.first.pipe' that extract taxonomic strings as well
#--internal
convert.tax.anno.strings.to.list<-function(orfTaxAnno)
{
 res<-list()
 O<-length(orfTaxAnno)
 for(o in (1:O))
 {
  curorf<-orfTaxAnno[o]
  curorftag<-strip.after.first.pipe(curorf)
  print(paste(curorftag,":",o,"/",O,sep=""))
  curres<-process.tax.to.nel(curorf)
  curout<-curres$ew
  names(curout)<-curres$nel[,"second"]
  res[[curorftag]]<-curout
 }
 return(res)
}

#--the next two are minor functions
#--internal
extract.at.level<-function(x,l)
{
 tags<-substring(names(x),1,3)
 res<-names(x)[tags==l]
 return(res)
}

#--internal
process.annoListResults<-function(x)
{
 cdata<-table(unlist(x))
 res<-as.vector(cdata)
 names(res)<-names(cdata)
 res<-res[order(res,decreasing=T)]
 return(res)
}

#--import ORF annotations, a contig-id/len/ORF-id table ('contig2orfTable') and a set of contigs ('contigsToUse') and generate an ORF-level output at the 100% hit rate for a given taxonomic level ('level')
#--set for export in NAMESPACE
process.ORF.annotations<-function(taxScan,contig2orfTable,contigsToUse,level=c("d","p","c","o","f","g","s"))
{
 #--returns a list with 3 elements that are required for further bin- or neighborhood-level processing
 #--res$contig2orfList is a (contig-named) list, whose elements are congnate ORF-ids
 #--res$taxaToExtract is a vector containing the number of ORF/s annotated to a given taxonomic entity at the 100% hit rate level.
 #--res$taxScanSelected is subset of 'taxScan' that you have selected via 'contigsToUse'.
 res<-list(contig2orfList=NULL,taxaToExtract=NULL,taxScanSelected=NULL)
 
 #--clean up ORF-names from original annotations
 taxScan.orfids<-strip.after.first.pipe(taxScan)
 
 #--subset table using contigs of interest
 contig2orfTableCut<-contig2orfTable[contig2orfTable$contig%in%contigsToUse,]
 res$contig2orfList<-tapply(as.character(contig2orfTableCut$orf),INDEX=contig2orfTableCut$contig,FUN=function(x){x})
 
 #--retain only those ORFs are contained on the contigs of interest
 mind<-which((taxScan.orfids%in%contig2orfTableCut$orf)==T)
 taxScanToUse<-taxScan[mind]
 names(taxScanToUse)<-taxScan.orfids[mind]
 
 #--generate gene level results...
 taxScanToUse.annoList<-convert.tax.anno.strings.to.list(taxScanToUse)
 taxScanToUse.annoList.cut100<-lapply(taxScanToUse.annoList,FUN=function(x){x[x==100]})

 taxScanToUse.annoList.cut100.level<-lapply(taxScanToUse.annoList.cut100,FUN=extract.at.level,l=paste(level,"__",sep=""))
 res$taxaToExtract<-process.annoListResults(taxScanToUse.annoList.cut100.level)
 
 res$taxScanSelected<-taxScanToUse

 return(res)
}

#--FUNCTIONS FOR PROCESSING .TAX DATA AGAINST CONTIG SETS ("BINS")--

#--this function extracts all ORFs that are annotated to a taxon of interest(e.g. g__Candidatus Competibacter or g__Candidatus Accumulibacter)
#--internal
extract.orf2tax.graph.for.taxa.of.interest<-function(orfTaxAnno,taxonOfInterest)
{
 #--'orfTaxAnno' is an R character vector containing the import data from a .tax file from LCAMapper
 #--'taxonOfInterest' is a string, which selects the taxon you are interested in analysing
 #--NOTE THAT we are searching for "100" cases only e.g. we grep for "g__Candidatus Accumulibacter; 100"
 #--returns an augmented version of the output of 'define.id2id.mapping.as.graph'

 #--make pattern argument for greg...
 taxonOfInterestPattern<-paste(taxonOfInterest,"; 100",sep="")

 print(paste("Searching for ORFs annotated to",taxonOfInterestPattern,sep=" "))
 taxonOfInterestInd<-grep(taxonOfInterestPattern,orfTaxAnno,fixed=T)
 #--subset 'orfTaxAnno' accordingly...
 orfTaxAnnoCut<-orfTaxAnno[taxonOfInterestInd]
 #--check length, stop if zero
 if(length(orfTaxAnnoCut)==0)
 {
  stop(paste("No ORFs are annotated to",taxonOfInterest,sep=" "))
 }
 print(paste(length(orfTaxAnnoCut),"ORFs annotated to",taxonOfInterest,sep=" "))
 print("Process to node edge list...")
 orfTaxAnnoCut2nel<-process.tax.to.nel(orfTaxAnnoCut)

 #--process the gene column of the nel....
 nelNewFirstCol<-strip.after.first.pipe(orfTaxAnnoCut2nel$nel[,1])
 orfTaxAnnoCut2nel$nel[,1]<-nelNewFirstCol

 #--generate id2id mapping object...
 print("Generating ORF-to-taxon graph...")
 res<-define.id2id.mapping.as.graph(orfTaxAnnoCut2nel$nel)

 #--add the 'taxonOfInterest' and the nel data as elements of 'res'
 res$taxonOfInterest<-taxonOfInterest
 res$nelResList<-orfTaxAnnoCut2nel
 #--we also need the 1-neighbourhood of 'taxonOfInterest' here
 res$orf2taxonOfInterest<-names(igraph::neighborhood(res$id2idGraph,1,res$taxonOfInterest,mindist=1)[[1]])

 return(res)
}

#--internal only
list2df<-function(L)
{
 res=NULL
 for(e in (1:length(L)))
 {
  curres<-data.frame(contig=rep(names(L)[e],length(L[[e]])),ORF=L[[e]],stringsAsFactors=F)
  res<-rbind(res,curres)
 }
 row.names(res)<-res[,2]
 return(res)
}

#--for each contig, count the number of ORFs that hit the taxon-of-interest, and the maximum, mean and minimum proportion of hits across the ORF set...
#--internal
generate.toi.summary.results<-function(contig2orfList,orf2taxResList)
{
 #--for each contig, calculate summary statistics in tabular form for the /t/axon /o/f /i/nterest ('toi': a character vector of ORF ids)
 #--'contig2orfList' is (contig-named) list, whose elements are ORF-ids that belong on the contig
 #--'orf2taxResList' is an output from the function 'extract.orf2tax.graph.for.taxa.of.interest'.
 res=NULL
 
 print(paste("Constructing summary results for",orf2taxResList$taxonOfInterest,sep=" "))
 
 #--prep nelResList$ew with names for later use...
 print("Formatting edge weight data")
 ew<-orf2taxResList$nelResList$ew
 names(ew)<-apply(orf2taxResList$nelResList$nel,1,paste,collapse="|")

 #--extract contigs that contain at least one such ORF
 print(paste("Finding contigs that contain at least one ORF annotated to",orf2taxResList$taxonOfInterest,sep=" "))
 curcontigOverlap<-lapply(contig2orfList,FUN=intersect,y=orf2taxResList$orf2taxonOfInterest)
 curcontigOverlapNonEmpty<-curcontigOverlap[which(sapply(curcontigOverlap,length)>0)]
 curcontigOverlapNonEmptyNames<-names(curcontigOverlapNonEmpty)
 #--for each such contig, run 'list2df' to convert 'curcontigOverlapNonEmpty' to a data frame, augmented with ew data
 print("Collating results...")
 curL2df<-list2df(contig2orfList[curcontigOverlapNonEmptyNames])
 #--and make an extra column to store the edge-weights for the taxon of interest
 curew<-rep(0,nrow(curL2df))
 names(curew)<-paste(curL2df[,"ORF"],orf2taxResList$taxonOfInterest,sep="|")
 #--populate the non-zero values
 curorfnz<-names(curew)[names(curew)%in%names(ew)]
 curew[curorfnz]<-ew[curorfnz]
 #--and add as an extra column to the data frame
 curL2df<-cbind(curL2df,ew=curew)

 #--calculate summary statistics per contig
 print("Calculating statistics...")
 curmax<-tapply(curL2df$ew,INDEX=curL2df$contig,FUN=max)
 curmean<-tapply(curL2df$ew,INDEX=curL2df$contig,FUN=mean)
 curmin<-tapply(curL2df$ew,INDEX=curL2df$contig,FUN=min)
 #--and tabulate...
 print("Formating output")
 res<-data.frame(taxonOfInterest=rep(orf2taxResList$taxonOfInterest,length(curcontigOverlapNonEmptyNames)),
                    contig=curcontigOverlapNonEmptyNames,
                    num.orf=sapply(contig2orfList[curcontigOverlapNonEmptyNames],length),
                    num.anno=sapply(curcontigOverlapNonEmpty[curcontigOverlapNonEmptyNames],length),
                    max=curmax[curcontigOverlapNonEmptyNames],
                    mean=curmean[curcontigOverlapNonEmptyNames],
                    min=curmin[curcontigOverlapNonEmptyNames])
 return(res)
}

#--internal
collate.annotation.statistics.across.bins<-function(binMembershipList,annoStatsDf)
{
 #--'binMembershipList' is a (bin-names) list, whose elements contain the contigs who are members of a given bin
 #--'annoStatsDf' is an element from the function 'generate.toi.summary.results'
 #--for each bin, use the "mean" column from 'annoStatsDf' to define mean annotation statistics for the bin (contigs that are not in 'annoStatsDf' are assigned zero)
 #--returns a data frame, with bins indexed in rows, columns are as follows: bin-id, size (contigs), and min, mean, median and max annotation statistics
 meanStatsList<-list()
 annotatedContigsList<-list()
 #--first, extract annotated contigs and associated statistics for each bin...
 for(curbin in names(binMembershipList))
 {
  print(curbin)
  curannstats<-rep(0,length(binMembershipList[[curbin]]))
  names(curannstats)<-binMembershipList[[curbin]]
  annoContigsInBin<-intersect(binMembershipList[[curbin]],rownames(annoStatsDf))
  annotatedContigsList[[curbin]]<-annoContigsInBin
  if(length(annoContigsInBin)>0)
  {
   curannstats[annoContigsInBin]<-annoStatsDf[annoContigsInBin,"mean"]
  }
  meanStatsList[[curbin]]<-curannstats
 }
 
 #--next compute statistics and collate into a data frame...
 res<-data.frame(taxonOfInterest=rep(as.character(annoStatsDf$taxonOfInterest[1]),length(meanStatsList)),
                    bin.id=names(meanStatsList),
                    bin.size=sapply(meanStatsList,length),
                    num.anno.contigs=sapply(annotatedContigsList,length),
                    min.annoStat=sapply(meanStatsList,min),
                    median.annoStat=sapply(meanStatsList,stats::median),
                    mean.annoStat=sapply(meanStatsList,mean),
                    max.annoStat=sapply(meanStatsList,max),stringsAsFactors=F)
 return(res)
}


#--this function calculates down to per-bin results for every element of the character vector 'taxaToExtract'
#--set for export in NAMESPACE
extract.bin.level.results.across.taxa<-function(orfAnnoList,binMembershipList)
{
 #--'orfAnnoList' is a list output from 'process.ORF.annotations'
 #--'binMembershipList' is a (bin-named) list, whose elements contain cognate contig-ids
 res<-list()
 for(curtaxon in names(orfAnnoList$taxaToExtract))
 {
  print(paste(curtaxon,":",match(curtaxon,names(orfAnnoList$taxaToExtract)),"/",length(orfAnnoList$taxaToExtract),sep=""))
  curres1<-extract.orf2tax.graph.for.taxa.of.interest(orfAnnoList$taxScanSelected,curtaxon)
  curres2<-generate.toi.summary.results(orfAnnoList$contig2orfList,curres1)
  curres3<-collate.annotation.statistics.across.bins(binMembershipList,curres2)
  res[[curtaxon]]<-curres3
 }
 return(res)
}

#--FUNCTIONS FOR HANDLING AND PROCESSING .KO OUTPUT--
#--here 'ko' refers to the .tax output from LCAMapper
#--basically, we need to construct a weighted graph that contains connects KO to ORFs

#--start with a function to read in the output file--
#--curently internal
read.ko<-function(file)
{
 #--read in a .ko file, named as 'file'
 filecon<-file(description=file,open="rt")
 koScan<-readLines(con=filecon,n=-1)
 close(filecon)
 return(koScan)
}

#--write a function to process these data into a node-edge list
#--currently internal
process.ko.to.nel<-function(koScan)
{
 #--'koScan' is an imported (readLines) version of a .ko output from LCAmapper (see 'read.ko' function above)
 #--entities (ORFs) that do not have annotations are returned to res$null in this function
 #--for an entire ORF set, this calculation will take a long time...consider computing on a cluster machine!
 res<-list(nel=NULL,ew=NULL,nhits=NULL)
    
 #--extract the read names...the " ;" tag returns a simple output (element is length 1 if no annotation)
 koScanList<-strsplit(koScan," ;")
 koScanList.len<-sapply(koScanList,length)
 print(paste("Processing",length(koScanList.len),"orfs..."))
 koScanListWithAnno<-koScanList[which(koScanList.len>1)]
 print(paste("..of which",length(koScanListWithAnno),"have annotations at this threshold"))
 #--for the ORFs, we have to remove the end ";" char
 orfVec<-sapply(koScanListWithAnno,FUN=function(x){substring(x[1],1,(nchar(x[1])-1))})
 #--the annotations are more complicated, see below for further required processing; here, we just trim the white-spaces
 annoVec<-sapply(koScanListWithAnno,FUN=function(x){gdata::trim(x[2])})
 
 for(r in (1:length(annoVec)))
 {
  curannoElement<-annoVec[r]
  curorf<-orfVec[r]
  print(paste("Extracting ",curorf,"--",r,"/",length(annoVec),sep=""))
  curannoElementSplit<-strsplit(curannoElement," ")[[1]]
  curannoElementSplitLen<-length(curannoElementSplit)
  #--remove the last char which contains the number-of-hits-above-threshold
  curhitnum<-as.numeric(curannoElementSplit[curannoElementSplitLen])
  res$nhits<-c(res$nhits,curhitnum)
  #--next extract the KO, and remove the ":" a the end...
  curannoElementSplitKo<-curannoElementSplit[seq(2,curannoElementSplitLen-2,3)]
  curannoElementSplitKo<-sapply(curannoElementSplitKo,FUN=function(x){substring(x,1,nchar(x)-1)})
  #--and then extract the edge-weights...
  curannoElementSplitEw<-as.numeric(curannoElementSplit[seq(3,curannoElementSplitLen-2,3)])
  #--form outputs...
  curnel<-data.frame(orf=rep(curorf,length(curannoElementSplitKo)),ko=curannoElementSplitKo,stringsAsFactors=T)
  res$nel<-rbind(res$nel,curnel)
  res$ew<-c(res$ew,curannoElementSplitEw)
 }
 
 #--name res$nhits with 'orfVec'
 names(res$nhits)<-orfVec
 
 return(res)

}

#--and we'll need a function to filter on the best hit...
#--currently internal
filter.nel.for.best.hit<-function(L)
{
 #--filter L (a list containing L$nel and L$ew, as generated by 'process.list2nel') for the most common hit
 #--note that other components of L that do not reference matches will be unaffected
 res<-L
 
 #--we'll need the complete set of unique entities (mostly, reads)....
 uniqueEntities<-unique(L$nel[,1])
 #--then find their match in the L$new[,1], and extract the corresponding L$nel (the top hit will be the first one)
 ind<-match(uniqueEntities,L$nel[,1])
 res$nel<-L$nel[ind,]
 res$ew<-L$ew[ind]
 
 #--return as a new list containing 'nel' and 'ew' components...(and anything else defined in 'L')
 return(res)
 
}
rbhwilliams/orf.lcam documentation built on May 27, 2019, 3 a.m.