#--code for processing the output of LCAMapper when applied to ORF sequences from a metagenome assembly--
#--RW on 30 October 2015--
#--see the vignette document for comments on reading in .gff files from MetaGeneMark or similar programs; this step is difficult to standardise!
#--FUNCTIONS FOR HANDLING AND PROCESSING .TAX OUTPUT--
#--here 'tax' refers to the .tax output from LCAMapper
#--this function reads a *-tax.txt file...set to export in NAMESPACE
read.tax<-function(file)
{
#--read in a .tax file, named as 'file'
filecon<-file(description=file,open="rt")
taxScan<-readLines(con=filecon,n=-1)
close(filecon)
return(taxScan)
}
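#--usage sketch (the file name is hypothetical):
#  taxScan<-read.tax("sample1-tax.txt")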
#--function strips everything after the pipe character in the ORF-annotation string
#--internal
strip.after.first.pipe<-function(S)
{
#--'S' is a character vector
res<-sapply(strsplit(S,"|",fixed=T),FUN=function(x){x[1]})
return(res)
}
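#--example (the pipe-delimited header layout is an assumption about MetaGeneMark output):
#  strip.after.first.pipe("gene_123|GeneMark.hmm|300_nt") #--returns "gene_123"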
#--we also need a function to read in an ORF .fasta file from MetaGeneMark and process all the information we need (ORFs, contigs)
#--formatting on '>' headers of 'orf.fa' may change, so keep an eye on performance and errors!
#--set to export in NAMESPACE
process.orf.fasta.file<-function(orf.fa,mainSplitString)
{
#--'orf.fa' is a .fasta file generated by MetaGeneMark, containing the DNA sequence from each predicted ORF.
#--'mainSplitString' gives you flexibility about how to split up the header...there seems to be some variability in how MetaGeneMark encodes this
#--process our information about ORFs and contigs...and return a data frame of contig-ids, contig lengths and ORF-ids
orfSet<-Biostrings::readDNAStringSet(filepath=orf.fa)
headerSplitList<-strsplit(names(orfSet),mainSplitString)
orfSet.contigids<-sapply(headerSplitList,FUN=function(x){strsplit(x[2]," ")[[1]][1]})
orfSet.contiglen<-sapply(headerSplitList,FUN=function(x){as.numeric(substring(strsplit(x[2]," ")[[1]][2],8))})
orfSet.orfids<-sapply(headerSplitList,FUN=function(x){strsplit(x[1],"|",fixed=T)[[1]][1]})
combTable<-data.frame(contig=orfSet.contigids,len=orfSet.contiglen,orf=orfSet.orfids,stringsAsFactors=F)
return(combTable)
}
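#--usage sketch: the header layout below is an assumption (check your MetaGeneMark version);
#--for headers like "gene_1|GeneMark.hmm|... >contig_1 length=5341", split on " >":
#  combTable<-process.orf.fasta.file("orfs.fa",mainSplitString=" >")
#  head(combTable) #--columns: contig, len, orf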
#--write a function to process these data into a node-edge list...
#--internal
process.tax.to.nel<-function(taxScan)
{
#--'taxScan' is an imported (readLines) version of a .tax output from LCAMapper (see 'read.tax' function above)
#--entities (reads, ORFs) that do not have annotations are collected in res$null in this function
res<-list(nel=NULL,ew=NULL,null=NULL)
#--extract the read names...
taxScanList<-strsplit(taxScan,";")
taxScan.reads<-sapply(taxScanList,FUN=function(x){x[1]})
#--and the annotations into a character vector (dropping the first two fields); odd elements are taxa, even elements are percentages
taxScanAnnotationList<-lapply(taxScanList,FUN=function(x){gdata::trim(x[c(-1,-2)])})
for(r in (1:length(taxScan.reads)))
{
curread<-taxScan.reads[r]
curset<-taxScanAnnotationList[[r]]
curlen<-length(curset)
if(curlen>0)
{
curanno<-curset[seq(1,curlen,2)]
curweights<-curset[seq(2,curlen,2)]
res$nel<-rbind(res$nel,cbind(first=rep(curread,length(curanno)),second=curanno))
res$ew<-c(res$ew,as.numeric(curweights))
}
else
{
res$null<-c(res$null,curread)
}
}
return(res)
}
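#--example (the line layout is an assumption about LCAMapper .tax output; note the code
#--drops the second ';'-separated field): a line "gene_1|...; 2; g__A; 75; g__B; 25;" yields
#--nel rows (gene_1|..., g__A) and (gene_1|..., g__B) with res$ew=c(75,25):
#  nelRes<-process.tax.to.nel(read.tax("sample1-tax.txt"))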
#--this function combines 'strip.after.first.pipe' and 'process.tax.to.nel' to extract the taxonomic strings as well, returning an (ORF-named) list of taxon-percentage vectors
#--internal
convert.tax.anno.strings.to.list<-function(orfTaxAnno)
{
res<-list()
O<-length(orfTaxAnno)
for(o in (1:O))
{
curorf<-orfTaxAnno[o]
curorftag<-strip.after.first.pipe(curorf)
print(paste(curorftag,":",o,"/",O,sep=""))
curres<-process.tax.to.nel(curorf)
curout<-curres$ew
names(curout)<-curres$nel[,"second"]
res[[curorftag]]<-curout
}
return(res)
}
#--the next two are minor functions
#--internal
extract.at.level<-function(x,l)
{
tags<-substring(names(x),1,3)
res<-names(x)[tags==l]
return(res)
}
#--internal
process.annoListResults<-function(x)
{
cdata<-table(unlist(x))
res<-as.vector(cdata)
names(res)<-names(cdata)
res<-res[order(res,decreasing=T)]
return(res)
}
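#--example (hypothetical taxon names): for x<-c("g__Accumulibacter"=100,"p__Proteobacteria"=100),
#--extract.at.level(x,"g__") returns "g__Accumulibacter"; process.annoListResults then tallies
#--taxon names across ORFs into a decreasing-sorted count vector, e.g.
#  process.annoListResults(list(orf1="g__Accumulibacter",orf2="g__Accumulibacter"))
#--returns c("g__Accumulibacter"=2)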
#--import ORF annotations, a contig-id/len/ORF-id table ('contig2orfTable') and a set of contigs ('contigsToUse') and generate an ORF-level output at the 100% hit rate for a given taxonomic level ('level')
#--set for export in NAMESPACE
process.ORF.annotations<-function(taxScan,contig2orfTable,contigsToUse,level=c("d","p","c","o","f","g","s"))
{
#--returns a list with 3 elements that are required for further bin- or neighborhood-level processing
#--res$contig2orfList is a (contig-named) list, whose elements are cognate ORF-ids
#--res$taxaToExtract is a vector containing the number of ORFs annotated to a given taxonomic entity at the 100% hit rate level
#--res$taxScanSelected is the subset of 'taxScan' that you have selected via 'contigsToUse'
#--resolve 'level' to a single tag (the c("d",...) default follows the usual match.arg idiom)
level<-match.arg(level)
res<-list(contig2orfList=NULL,taxaToExtract=NULL,taxScanSelected=NULL)
#--clean up ORF-names from original annotations
taxScan.orfids<-strip.after.first.pipe(taxScan)
#--subset table using contigs of interest
contig2orfTableCut<-contig2orfTable[contig2orfTable$contig%in%contigsToUse,]
res$contig2orfList<-tapply(as.character(contig2orfTableCut$orf),INDEX=contig2orfTableCut$contig,FUN=function(x){x})
#--retain only those ORFs that are contained on the contigs of interest
mind<-which(taxScan.orfids%in%contig2orfTableCut$orf)
taxScanToUse<-taxScan[mind]
names(taxScanToUse)<-taxScan.orfids[mind]
#--generate gene level results...
taxScanToUse.annoList<-convert.tax.anno.strings.to.list(taxScanToUse)
taxScanToUse.annoList.cut100<-lapply(taxScanToUse.annoList,FUN=function(x){x[x==100]})
taxScanToUse.annoList.cut100.level<-lapply(taxScanToUse.annoList.cut100,FUN=extract.at.level,l=paste(level,"__",sep=""))
res$taxaToExtract<-process.annoListResults(taxScanToUse.annoList.cut100.level)
res$taxScanSelected<-taxScanToUse
return(res)
}
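#--usage sketch (hypothetical file names; adjust 'mainSplitString' and 'contigsToUse' to your data):
#  taxScan<-read.tax("sample1-tax.txt")
#  contig2orfTable<-process.orf.fasta.file("orfs.fa",mainSplitString=" >")
#  orfAnnoList<-process.ORF.annotations(taxScan,contig2orfTable,
#                                       contigsToUse=unique(contig2orfTable$contig),level="g")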
#--FUNCTIONS FOR PROCESSING .TAX DATA AGAINST CONTIG SETS ("BINS")--
#--this function extracts all ORFs that are annotated to a taxon of interest (e.g. g__Candidatus Competibacter or g__Candidatus Accumulibacter)
#--internal
extract.orf2tax.graph.for.taxa.of.interest<-function(orfTaxAnno,taxonOfInterest)
{
#--'orfTaxAnno' is an R character vector containing the import data from a .tax file from LCAMapper
#--'taxonOfInterest' is a string, which selects the taxon you are interested in analysing
#--NOTE THAT we are searching for "100" cases only, e.g. we grep for "g__Candidatus Accumulibacter; 100"
#--returns an augmented version of the output of 'define.id2id.mapping.as.graph'
#--make the pattern argument for grep...
taxonOfInterestPattern<-paste(taxonOfInterest,"; 100",sep="")
print(paste("Searching for ORFs annotated to",taxonOfInterestPattern,sep=" "))
taxonOfInterestInd<-grep(taxonOfInterestPattern,orfTaxAnno,fixed=T)
#--subset 'orfTaxAnno' accordingly...
orfTaxAnnoCut<-orfTaxAnno[taxonOfInterestInd]
#--check length, stop if zero
if(length(orfTaxAnnoCut)==0)
{
stop(paste("No ORFs are annotated to",taxonOfInterest,sep=" "))
}
print(paste(length(orfTaxAnnoCut),"ORFs annotated to",taxonOfInterest,sep=" "))
print("Process to node edge list...")
orfTaxAnnoCut2nel<-process.tax.to.nel(orfTaxAnnoCut)
#--process the gene column of the nel....
nelNewFirstCol<-strip.after.first.pipe(orfTaxAnnoCut2nel$nel[,1])
orfTaxAnnoCut2nel$nel[,1]<-nelNewFirstCol
#--generate id2id mapping object...
print("Generating ORF-to-taxon graph...")
res<-define.id2id.mapping.as.graph(orfTaxAnnoCut2nel$nel)
#--add the 'taxonOfInterest' and the nel data as elements of 'res'
res$taxonOfInterest<-taxonOfInterest
res$nelResList<-orfTaxAnnoCut2nel
#--we also need the 1-neighbourhood of 'taxonOfInterest' here
res$orf2taxonOfInterest<-names(igraph::neighborhood(res$id2idGraph,1,res$taxonOfInterest,mindist=1)[[1]])
return(res)
}
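#--usage sketch (the taxon string must match the .tax annotations exactly; note this relies
#--on 'define.id2id.mapping.as.graph', which is defined elsewhere in the package):
#  curres<-extract.orf2tax.graph.for.taxa.of.interest(taxScan,"g__Candidatus Accumulibacter")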
#--internal only
list2df<-function(L)
{
res=NULL
for(e in (1:length(L)))
{
curres<-data.frame(contig=rep(names(L)[e],length(L[[e]])),ORF=L[[e]],stringsAsFactors=F)
res<-rbind(res,curres)
}
row.names(res)<-res[,2]
return(res)
}
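#--example: list2df(list(c1=c("orf1","orf2"),c2="orf3")) returns a 3-row data frame
#--(columns 'contig' and 'ORF') with row names "orf1","orf2","orf3"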
#--for each contig, count the number of ORFs that hit the taxon-of-interest, and the maximum, mean and minimum proportion of hits across the ORF set...
#--internal
generate.toi.summary.results<-function(contig2orfList,orf2taxResList)
{
#--for each contig, calculate summary statistics in tabular form for the taxon of interest ('toi')
#--'contig2orfList' is a (contig-named) list, whose elements are the ORF-ids that belong on the contig
#--'orf2taxResList' is an output from the function 'extract.orf2tax.graph.for.taxa.of.interest'.
res=NULL
print(paste("Constructing summary results for",orf2taxResList$taxonOfInterest,sep=" "))
#--prep nelResList$ew with names for later use...
print("Formatting edge weight data")
ew<-orf2taxResList$nelResList$ew
names(ew)<-apply(orf2taxResList$nelResList$nel,1,paste,collapse="|")
#--extract contigs that contain at least one such ORF
print(paste("Finding contigs that contain at least one ORF annotated to",orf2taxResList$taxonOfInterest,sep=" "))
curcontigOverlap<-lapply(contig2orfList,FUN=intersect,y=orf2taxResList$orf2taxonOfInterest)
curcontigOverlapNonEmpty<-curcontigOverlap[which(sapply(curcontigOverlap,length)>0)]
curcontigOverlapNonEmptyNames<-names(curcontigOverlapNonEmpty)
#--for each such contig, run 'list2df' to convert 'curcontigOverlapNonEmpty' to a data frame, augmented with ew data
print("Collating results...")
curL2df<-list2df(contig2orfList[curcontigOverlapNonEmptyNames])
#--and make an extra column to store the edge-weights for the taxon of interest
curew<-rep(0,nrow(curL2df))
names(curew)<-paste(curL2df[,"ORF"],orf2taxResList$taxonOfInterest,sep="|")
#--populate the non-zero values
curorfnz<-names(curew)[names(curew)%in%names(ew)]
curew[curorfnz]<-ew[curorfnz]
#--and add as an extra column to the data frame
curL2df<-cbind(curL2df,ew=curew)
#--calculate summary statistics per contig
print("Calculating statistics...")
curmax<-tapply(curL2df$ew,INDEX=curL2df$contig,FUN=max)
curmean<-tapply(curL2df$ew,INDEX=curL2df$contig,FUN=mean)
curmin<-tapply(curL2df$ew,INDEX=curL2df$contig,FUN=min)
#--and tabulate...
print("Formating output")
res<-data.frame(taxonOfInterest=rep(orf2taxResList$taxonOfInterest,length(curcontigOverlapNonEmptyNames)),
contig=curcontigOverlapNonEmptyNames,
num.orf=sapply(contig2orfList[curcontigOverlapNonEmptyNames],length),
num.anno=sapply(curcontigOverlapNonEmpty[curcontigOverlapNonEmptyNames],length),
max=curmax[curcontigOverlapNonEmptyNames],
mean=curmean[curcontigOverlapNonEmptyNames],
min=curmin[curcontigOverlapNonEmptyNames])
return(res)
}
#--internal
collate.annotation.statistics.across.bins<-function(binMembershipList,annoStatsDf)
{
#--'binMembershipList' is a (bin-named) list, whose elements contain the contigs that are members of a given bin
#--'annoStatsDf' is a data frame output by the function 'generate.toi.summary.results'
#--for each bin, use the "mean" column from 'annoStatsDf' to define mean annotation statistics for the bin (contigs that are not in 'annoStatsDf' are assigned zero)
#--returns a data frame, with bins indexed in rows; columns are: taxon-of-interest, bin-id, bin size (contigs), number of annotated contigs, and the min, median, mean and max annotation statistics
meanStatsList<-list()
annotatedContigsList<-list()
#--first, extract annotated contigs and associated statistics for each bin...
for(curbin in names(binMembershipList))
{
print(curbin)
curannstats<-rep(0,length(binMembershipList[[curbin]]))
names(curannstats)<-binMembershipList[[curbin]]
annoContigsInBin<-intersect(binMembershipList[[curbin]],rownames(annoStatsDf))
annotatedContigsList[[curbin]]<-annoContigsInBin
if(length(annoContigsInBin)>0)
{
curannstats[annoContigsInBin]<-annoStatsDf[annoContigsInBin,"mean"]
}
meanStatsList[[curbin]]<-curannstats
}
#--next compute statistics and collate into a data frame...
res<-data.frame(taxonOfInterest=rep(as.character(annoStatsDf$taxonOfInterest[1]),length(meanStatsList)),
bin.id=names(meanStatsList),
bin.size=sapply(meanStatsList,length),
num.anno.contigs=sapply(annotatedContigsList,length),
min.annoStat=sapply(meanStatsList,min),
median.annoStat=sapply(meanStatsList,stats::median),
mean.annoStat=sapply(meanStatsList,mean),
max.annoStat=sapply(meanStatsList,max),stringsAsFactors=F)
return(res)
}
#--this function drills down to per-bin results for every taxon named in 'orfAnnoList$taxaToExtract'
#--set for export in NAMESPACE
extract.bin.level.results.across.taxa<-function(orfAnnoList,binMembershipList)
{
#--'orfAnnoList' is a list output from 'process.ORF.annotations'
#--'binMembershipList' is a (bin-named) list, whose elements contain cognate contig-ids
res<-list()
for(curtaxon in names(orfAnnoList$taxaToExtract))
{
print(paste(curtaxon,":",match(curtaxon,names(orfAnnoList$taxaToExtract)),"/",length(orfAnnoList$taxaToExtract),sep=""))
curres1<-extract.orf2tax.graph.for.taxa.of.interest(orfAnnoList$taxScanSelected,curtaxon)
curres2<-generate.toi.summary.results(orfAnnoList$contig2orfList,curres1)
curres3<-collate.annotation.statistics.across.bins(binMembershipList,curres2)
res[[curtaxon]]<-curres3
}
return(res)
}
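#--usage sketch for the whole bin-level pipeline (hypothetical inputs; 'binMembershipList'
#--would typically come from your binning tool's contig-to-bin assignments):
#  orfAnnoList<-process.ORF.annotations(taxScan,contig2orfTable,contigsToUse,level="g")
#  binResList<-extract.bin.level.results.across.taxa(orfAnnoList,binMembershipList)
#  binResList[["g__Candidatus Accumulibacter"]] #--per-bin summary for one taxon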
#--FUNCTIONS FOR HANDLING AND PROCESSING .KO OUTPUT--
#--here 'ko' refers to the .ko output from LCAMapper
#--basically, we need to construct a weighted graph that connects KOs to ORFs
#--start with a function to read in the output file--
#--currently internal
read.ko<-function(file)
{
#--read in a .ko file, named as 'file'
filecon<-file(description=file,open="rt")
koScan<-readLines(con=filecon,n=-1)
close(filecon)
return(koScan)
}
#--write a function to process these data into a node-edge list
#--currently internal
process.ko.to.nel<-function(koScan)
{
#--'koScan' is an imported (readLines) version of a .ko output from LCAMapper (see 'read.ko' function above)
#--entities (ORFs) that do not have annotations are returned to res$null in this function
#--for an entire ORF set, this calculation will take a long time...consider computing on a cluster machine!
res<-list(nel=NULL,ew=NULL,nhits=NULL)
#--extract the ORF names...splitting on " ;" gives a simple test (the element has length 1 if there is no annotation)
koScanList<-strsplit(koScan," ;")
koScanList.len<-sapply(koScanList,length)
print(paste("Processing",length(koScanList.len),"orfs..."))
koScanListWithAnno<-koScanList[which(koScanList.len>1)]
print(paste("..of which",length(koScanListWithAnno),"have annotations at this threshold"))
#--for the ORFs, we have to remove the trailing ";" character
orfVec<-sapply(koScanListWithAnno,FUN=function(x){substring(x[1],1,(nchar(x[1])-1))})
#--the annotations are more complicated, see below for further required processing; here, we just trim the white-spaces
annoVec<-sapply(koScanListWithAnno,FUN=function(x){gdata::trim(x[2])})
for(r in (1:length(annoVec)))
{
curannoElement<-annoVec[r]
curorf<-orfVec[r]
print(paste("Extracting ",curorf,"--",r,"/",length(annoVec),sep=""))
curannoElementSplit<-strsplit(curannoElement," ")[[1]]
curannoElementSplitLen<-length(curannoElementSplit)
#--the last token contains the number-of-hits-above-threshold
curhitnum<-as.numeric(curannoElementSplit[curannoElementSplitLen])
res$nhits<-c(res$nhits,curhitnum)
#--next extract the KOs, and remove the ":" at the end of each...
curannoElementSplitKo<-curannoElementSplit[seq(2,curannoElementSplitLen-2,3)]
curannoElementSplitKo<-sapply(curannoElementSplitKo,FUN=function(x){substring(x,1,nchar(x)-1)})
#--and then extract the edge-weights...
curannoElementSplitEw<-as.numeric(curannoElementSplit[seq(3,curannoElementSplitLen-2,3)])
#--form outputs (note stringsAsFactors=F, for consistency with the rest of the file)...
curnel<-data.frame(orf=rep(curorf,length(curannoElementSplitKo)),ko=curannoElementSplitKo,stringsAsFactors=F)
res$nel<-rbind(res$nel,curnel)
res$ew<-c(res$ew,curannoElementSplitEw)
}
#--name res$nhits with 'orfVec'
names(res$nhits)<-orfVec
return(res)
}
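#--usage sketch (the token layout assumed by the parsing above is specific to LCAMapper's
#--.ko output, so inspect a few lines of your file first):
#  koNel<-process.ko.to.nel(read.ko("sample1.ko"))
#  head(koNel$nel) #--one (orf,ko) row per annotation; weights in koNel$ew, hit counts in koNel$nhits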
#--and we'll need a function to filter on the best hit...
#--currently internal
filter.nel.for.best.hit<-function(L)
{
#--filter L (a list containing L$nel and L$ew, as generated by 'process.ko.to.nel') for the top hit
#--note that other components of L that do not reference matches will be unaffected
res<-L
#--we'll need the complete set of unique entities (mostly, reads)....
uniqueEntities<-unique(L$nel[,1])
#--then find their match in L$nel[,1], and extract the corresponding L$nel rows (the top hit will be the first one)
ind<-match(uniqueEntities,L$nel[,1])
res$nel<-L$nel[ind,]
res$ew<-L$ew[ind]
#--return as a new list containing 'nel' and 'ew' components...(and anything else defined in 'L')
return(res)
}
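#--usage sketch (continuing the hypothetical 'koNel' from above):
#  koNelBest<-filter.nel.for.best.hit(koNel)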