ncbitax: Simple base R parser for NCBI Taxonomy

#!/usr/bin/Rscript

## TODO: fix this, doesnt work anymore in ncbitax
## due to taxon IDs as characters?

## calculates taxonomic relations
## USAGE eg.
#datpath="/home/raim/data/introns/prokaryotes_2018/taxonomy"
#outpath="/home/raim/work/CoilProject/halime/taxonomy/species2silva"
#infile="/data/topoisomerases/cyanoids.txt"
##
#scripts/cmpTaxa.R  -txd $datpath  -tf ${datpath}/LTPs132_SSU_tree2tax.dat -sf $infile -out $outpath -xf ${outpath}/species2silva_tree.RData 


library(ncbitax)

## parse commandline into normal variables
# logical parameters, override with --<parameter>, e.g, --inc
dbug <- FALSE
sub <- FALSE

args  <- R.utils::commandArgs(excludeReserved=TRUE, asValues=TRUE)
for ( i in 2:length(args) ) {
  arg <- names(args)[i]
  #cat(paste(arg, as.character(args[arg]), "\n"))
  assign(arg, as.character(args[arg]))
}

#tf <- "taxonomy/phispy_taxons.dat"
#sf <- "genomes/bacteria/taxonomy.dat"
#xf <- "taxonomy/phispyXspecies.phy"

if ( !exists("tf",mode="character") ) {
  cat(paste("no target taxons provided, use --tf=<filename>\n"))
  if ( !interactive() ) quit(save="no")
}
if ( !exists("sf",mode="character") ) {
  cat(paste("no source taxons provided, use --sf=<filename>\n"))
  if ( !interactive() ) quit(save="no")
}
#if ( !exists("txd",mode="character") ) {
#  cat(paste("no taxononmy dir, use -txd=<dir>\n"),file=stderr())
#  if ( !interactive() ) quit(save="no")
#}
#if ( !exists("xf",mode="character") ) xf <- ""
if ( !exists("out",mode="character") ) out <- "./sp2ph"
## other columns to add
if ( !exists("tn",mode="character") ) tn <- "" ## eg. ame, acc colun
if ( !exists("ti",mode="character") ) ti <- "" ## 
dbug <- as.logical(dbug)
sub <- as.logical(sub)

## check if either taxonomy newick tree or NCBI taxonomy directory
## are present

if ( !exists("txd",mode="character") & !exists("xf",mode="character") )
    stop("either newick tree or directory with NCBI taxonomy files required!")


## find the closest relatives between two lists of ncbi taxon ids

## map species to 16S rRNA,
## where species is not present, load taxonomy tree to find closest relative
## that is present in 16S rRNA

### PARSE QUERY AND TARGET TAXON IDs

targt <- read.csv(tf, header=TRUE, sep="\t")
sourc <- read.csv(sf, header=TRUE, sep="\t")

if ( !"taxon"%in%colnames(targt) | !"taxon"%in%colnames(sourc) )
    stop("both query and target require a column with header `taxon'")

## only one per source taxa
sourc <- sourc[!duplicated(sourc[,"taxon"]),,drop=FALSE]

## taxon IDs, can be several!?
targtx <- lapply(strsplit(as.character(targt[,"taxon"]),";"),
                 function(x) gsub(" +","",x))
srcetx <- lapply(strsplit(as.character(sourc[,"taxon"]),";"),
                 function(x) gsub(" +","",x))

## source taxon IDs
allsrc <- unlist(srcetx)
## several taxon IDs, per target
alltar <- unlist(targtx) 

## combined list of taxon IDs
taxids <- unique(c(alltar,allsrc))

## TODO: update IDs before comparing,
## requires to load taxonomy before constructing
## saved tree, or update in tree and afterwards
## OR: update in input


## READ OR GENERATE TAXONOMY

## read or generate common taxonomy tree
## get tree for all taxa from NCBI taxonomy, including singletons!
## (cutting the full taxonomy tree here saves lots of time below)


## TODO: write/read newick tree instead
##       * the written tree can not be parsed!?
##       read tree with singleton nodes?
##       * rank information missing from tree, only provided by
##       taxonomy parser

## check if an RData file with the tree exists already
xfile <- ""
if ( exists("xf",mode="character") )
    if ( file.exists(xf) )
        xfile <- xf

if ( xfile != "" ) {

    cat(paste("loading previously generated common taxonomy tree:",xfile,"\n"))

    ## load previously parsed taxonomy as RData
    ## TODO: load tree
    load(xfile)
    
    ##stop("TREE PARSING NOT IMPLEMENTED YET",
    ##     "please provide taxonomy directory")
    ##require("ape")
    ##tree <- read.tree(file=xf)

} else if ( exists("txd",mode="character") ) {

    cat(paste("generating common taxonomy tree, can take long:",txd,"\n"))

    ## generate common taxomony tree for all taxonomy IDs
    taxo <- tax2newick(taxids, txd, names=FALSE,
                       ranks=TRUE, full=TRUE,
                       verb=dbug, dbug=dbug)
    ## store as RData
    ## if xf was provided but didn't exist as an RData file yet
    ## TODO: store as newick tree such that parsing above works!?
    if ( exists("xf",mode="character") ) {
        ##require("ape");
        ##write.tree(tree, file=xf)
        save(taxo, file=xf)
    } 
}

## get tree and rank from tax2newick object
tree <- taxo$tree
rank <- taxo$rank
## get all nodes. the order is used as index in tree$edge
nodes <- c(tree$tip.label,tree$node.label)


## TODO: generate new NCBI tax object, and use this below instead
## of tree$edges
ntax <- list()
ntax$parents <- edg2idx(taxo$tree$edge)
ntax$rank <- taxo$rank

#warnings(file=stderr()) ## reports missing taxon ids

## filter not found nodes
if ( sum(!alltar %in% nodes)>0 )
  cat(paste("\nMISSING TARGET SPECIES:",
            paste(alltar[!alltar %in% nodes],collapse=";"), "\n"))
alltar <- alltar[alltar %in% nodes]

cat(paste("taxonomy loaded; searching closest neighbors ...\n"),file=stderr())


## get closest target species for each source
## (and collect names, IDs and common ancestor)
src2tar <- src2nam <- src2tax <- rep(list(NA), length(allsrc))
src2anc <- s2t.dst <- reltype <- rep(NA,length(allsrc))

## node index of target taxa
idst <- unique(alltar)
tid <- rep(NA,length(idst))
for ( j in 1:length(idst) )
    if ( idst[j]%in%nodes ) {
        tid[j] <- which(nodes==idst[j])
    } else {warning("target", j, ", taxon ID", idst[j],
                    " not found in nodes; obsolete tax id?")}

for ( i in 1:length(allsrc) ) {
    
    if ( dbug )
        cat(paste0("searching targets for taxon #",
                   i, "/", round(i/length(allsrc),2),
                   ", ID ", allsrc[i],"\t"))
    
    ## first, check if query taxon is present at all in taxonomy
    ## and get numeric index of its node, as used in tree$edge
    ## (note: only required for option sub, which we may skip alltogether)
    nid <- which(nodes==allsrc[i])
    if ( length(nid)!=1 ) {
        warning("taxon ID ", allsrc[i], " not found in nodes; obsolete tax id?")
        next
    }
    
    
    ## 1) is any target equal to the source?
    if ( allsrc[i] %in% alltar ) {
        
        src2tax[[i]] <- src2anc[i] <- allsrc[i]
        s2t.dst[i] <- 0
        reltype[i] <- "ident"
        if ( dbug ) cat("found direct hit")
    } else {
        
        ## if not, scan taxonomy tree
        
        ## get target taxa and ancestor
        ## via the lowest branch between src <-> target
        pars <- as.numeric(get.parents(nid, ntax))
        
        ## 2) is any target a direct ancestor? <- find parent species of strains
        if ( any(pars%in%tid) )  {
            
            ## take the first parent!
            src2tax[[i]] <- src2anc[i] <-
                nodes[pars[pars%in%tid]][1] ## ancestor=target
            s2t.dst[i] <- which(pars%in%tid)[1] - 1 ## taxonomic distance
            reltype[i] <- "parent"
            if ( dbug ) cat("found parent")
            
        } else {
            
            ## 3) scan children of ancestors <- find sibling strains
            ## TODO: record siblings, even if parent is present?
            
            ## to save time ??, extract a subtree
            ## NOTE: this doesn't save time apparently
            ## TODO: what is root here? <- should be character?
            if ( sub ) 
                st <- getAllParents(c(nid,tid),
                                    ntax,
                                    root=length(tree$tip.label)+1,
                                    dbug=FALSE)
            else st <- ntax
            
            ## note: first "parent" is query taxon
            for ( j in 2:length(pars) ) {
                skip <- pars[1:(j-1)] # skip prev. to save time!
                cdr <- as.numeric(get.children(pars[j], st, skip=skip))
                present <- cdr %in% tid
                if ( any(present) ) {
                    src2tax[[i]] <- nodes[cdr[present]] #nodes[tid[tid%in%cdr]] 
                    src2anc[i] <- nodes[pars[j]]
                    ## taxonomic distance to ancestor
                    ## TODO: add distance ancestor-sibling ?
                    s2t.dst[i] <- -j # -1 
                    reltype[i] <- "sibling"
                    if ( dbug ) cat(paste("found", sum(present),
                                          "sibling(s) at", -j))
                    break
                }
            }
        }
    }
    
    ## get names and rank of target taxa
    idx <- unlist(lapply(targtx, function(x) any(src2tax[[i]] %in% x)))
    if ( sum(idx)==0 )
        stop("taxon ", i, " ID ", alltar[i], ": NOTHING FOUND, SHOULDN'T",
             "HAPPEN - BUG??")

    if ( ti%in%colnames(targt) )
        src2tar[[i]] <- c(as.character(targt[idx,ti]))
    if ( tn%in%colnames(targt) )
        src2nam[[i]] <- c(as.character(targt[idx,tn]))
    
    ## if target has "no rank" get next ancestor with rank
    if ( ntax$rank[src2anc[i]] == "no rank" ) {
        id <- nodes[as.numeric(get.parents(id=which(nodes==src2anc[i]), ntax))]
        idx <- ntax$rank[id] != "no rank"
        if ( any(idx) ) {
            ## take first with rank
            src2anc[i] <- id[which(idx)[1]]
            ##s2t.dst[i] <- s2t.dst[i] + which(idx)[1] - 1
        }
    }
    
    if ( dbug ) cat("; done\n")
}

txids <- unlist(lapply(src2tax, function(x) paste(x,collapse=";")))

cat(paste("... done. writing results ....\n"), file=stderr())
results <- cbind.data.frame(sourc,
                            taxdist=s2t.dst,
                            relation=reltype,
                            ancestor=src2anc,
                            ancestor.rank=ntax$rank[src2anc],
                            target.taxon=txids,
                            stringsAsFactors=FALSE)

trgs <- rep(NA,nrow(sourc))
if ( tn != "" ) {
    trgs <- unlist(lapply(src2nam, function(x) paste(x,collapse=";")))
    results <- cbind(results, target.name=trgs)
}
tids <- rep(NA,nrow(sourc))
if ( ti != "" ) {
    tids <- unlist(lapply(src2tar, function(x) paste(x,collapse=";")))
    results <- cbind(results, target.id=tids)
}

## sort by distance to ancestor
#results <- results[order(as.numeric(results[,"taxdist"])),]

file.name <- paste(out,".dat",sep="")
if ( file.exists(file.name) ){
  cat(paste("WARNING: overwriting", file.name, "\n"),file=stderr())
  unlink(out)
}
write.table(results,file.name,row.names=FALSE,quote=FALSE,sep="\t",na="")

cat(paste("... done.\n"),file=stderr())

if (!interactive() ) {
  file.name <- paste(out,".RData",sep="")
  save.image(file=file.name)
  quit(save="no")
}