R/taxTools.R
In clstutils: Tools for performing taxonomic assignment.

Documented in findOutliers maxDists refpkgContents taxonomyFromRefpkg

maxDists <- function(mat, idx=NA, N=1, exclude=rep(FALSE,nrow(mat)), include.center=TRUE){

  ## Return indices of N objects with a maximal sum of pairwise
  ## distances.
  ##
  ## * mat - square distance matrix
  ## * idx - starting indices; if is.na, starts with the object with the
  ##   minimum median distance to all other objects.
  ## * N - total number of selections; length of idx is subtracted.
  ## * exclude - boolean vector indicating elements to exclude from the calculation.
  ## * include.center - includes the most central object in the output if TRUE

  ## TODO: unit tests
  
  N <- min(N, nrow(mat))
  m <- mat
  m[exclude,] <- NA
  
  if(length(idx) == 0 || is.na(idx)){
    idx <- integer(0)
  }
  
  ## add most central element
  if(include.center){
    ## exclude rows corresponding to already selected objects
    m[idx,] <- NA 
    idx <- c(idx, which.min(apply(m,1,median)))
  }

  remaining <- N - length(idx)

  ## accumulate a total of N elements
  if(remaining > 0){
    for(x in seq(remaining)){    
      m[idx,] <- NA
      cols <- m[,idx,drop=FALSE]
      idx <- c(idx, which.max(apply(cols,1,sum)))
    }
  }
  
  ## pairs <- combn(idx, 2)
  ## print(apply(pairs, 2, function(p) mat[p[1],p[2]]))

  return(idx)
}

findOutliers <- function(mat, quant, cutoff){

  ## Outliers are defined as elements with edge length to the
  ## centermost element > cutoff.
  ##
  ## Input
  ## -----
  ##
  ## * mat - square matrix of distances
  ## * quant - given all pairwise distances x, calculate distance
  ##   threshold as quantile(x, quant).  Values closer to 0 are more
  ##   stringent.
  ## * cutoff - an absolute cutoff overriding quant
  ##
  ## Value
  ## -----
  ##
  ## Returns a boolean vector corresponding to margin of mat; outliers
  ## have a value of TRUE.
  ##
  
  if(missing(cutoff)){
    cutoff <- quantile(mat[lower.tri(mat)], quant, na.rm=TRUE)
  }

  ## find index of most central element
  center <- which.min(apply(mat,1,median))

  ## distance from each element to most central element
  dd <- mat[center,]

  return(dd > cutoff)  
}


## nodeTable <- function(taxData, tre, tax_ids){

##   ## * taxData - list of taxa, nodes, ranks (output of getTaxonomyFromDB)
##   ## * tre - ape phylo object read from a placefile (output of placeTree)
##   ## * tax_ids - character vector of NCBI tax ids in tree order

##   ## path from the root (first element of each vector) to each tip (last element)
##   tipsToRoot <- .Call("seq_root2tip",
##                       tre$edge, length(tre$tip.label),
##                       tre$Nnode, PACKAGE = "ape")

##   ## list of descendants of each node
##   nodes <- descendants(tre, tipsToRoot)

##   ## functionality of classify16s::nodemap here: generate a list
##   ## describing parents, children, and descendant tax_ids at each node

##   ## restrict taxTable to a subset of ranks defined elsewhere
##   rankNames <- taxData$ranks
##   taxTable <- taxData$taxa[tax_ids,rankNames]

##   ## add ancestry of the root node
##   edges <- rbind(tre$edge, c(NA, length(tre$tip.label)+1))
##   ## now the parent of each node i is given by parents[i] 
##   parents <- edges[order(edges[,2]),1]
  
##   nmap <- lapply(
##                  seq_along(nodes),
##                  function(i){
##                    leaves <- nodes[[i]]
##                    if(length(leaves)==0){
##                      leaves <- i
##                    }

##                    L1 <- list(
##                               i=i,
##                               parent=parents[i],
##                               children=leaves,
##                               nChildren=length(leaves),
##                               tax_id=setdiff(unique(tax_ids[leaves]),NA)
##                               )
                   
##                    ## add set of taxa at each rank
##                    L2 <- lapply(
##                                 rankNames,
##                                 function(r){setdiff(unique(taxTable[[r]][leaves]),NA)}
##                                 )
##                    names(L2) <- rankNames
##                    c(L1,L2)
##                  }
##                  )
  
##   ## functionality of classify16s:::nodeNames here: define
##   ## rank-specific taxonomic names for each node

##   ## define ranks to include

##   ## singleRankOk lists ranks that are permitted to have a single value;
##   ## this is useful to prevent confusion generated by rarely-defined
##   ## ranks (such as species group) while allowing higher-level ranks
##   ## (such as kingdom) to have the same value in all rows.
##   singleRankOk=c('superkingdom','kingdom', 'superphylum','phylum','class')
##   ndefined <- apply(taxTable, 2, function(x) length(setdiff(unique(x),NA)))
##   excludedRanks <- setdiff(names(ndefined)[ndefined < 2], singleRankOk)
##   ## get rid of 'no_rank' while we're at it
##   ranks <- setdiff(rankNames, c(excludedRanks,'no_rank'))

##   ## taxonomic names corresponding to tax_ids
##   taxnames <- taxData$nodes$tax_name
##   names(taxnames) <- taxData$nodes$tax_id

##   ## name each node and return a data.frame
##   nnames <- do.call(rbind, lapply(nmap,
##                                   function(x){
##                                     nodeNamer(x, taxnames, ranks)
##                                   }))

##   ## nnames lacks mapping to pplacer node numbers; need to provide
##   ## this using pmap
##   pmap <- placeMap(tre)
##   nnames$at <- as.integer(pmap$pplacer[match(nnames$node, pmap$node)])
  
##   return(nnames)
## }

## nodeNamer <- function(node, taxnames, ranks){
##   ## Define a name for the given node.
##   ##
##   ## Input
##   ## -----
##   ## * node - a list with named elements ...
##   ## * taxnames - named character vector of taxonomic names (names are tax_ids)
##   ## * ranks - character vector of ranks to consider in constructing the name

##   thisNode <- node[ranks]
  
##   stopifnot(all(unlist(thisNode) %in% names(taxnames)))
##   nranks <- length(ranks)

##   ## positions along ranks with a single taxon assignment
##   oneName <- which(sapply(thisNode,length) == 1)
##   tax_ids <- unlist(thisNode)[ranks]

##   if(length(oneName) > 0){
##     ##if(max(oneName) == nranks){
##     ## the most specific rank has only one tax_id - we are essentially done    
##     tab <- data.frame(
##                node=rep(node$i, nranks),
##                rank=ranks,
##                thisRank=ranks,
##                tax_id=tax_ids,
##                tax_name=taxnames[tax_ids]
##                )

##     if(max(oneName) < length(ranks)){
##       ## restrict to ranks with a single taxon and assign names for this
##       ## subset; fill in lower ranks with the most specific
##       ## classification available.
##       mostSpecific <- max(oneName)
##       ii <- seq(mostSpecific + 1, length(ranks))
      
##       tab$thisRank[ii] <- tab$thisRank[mostSpecific]
##       tab$tax_id[ii] <- tab$tax_id[mostSpecific]
##       tab$tax_name[ii] <- tab$tax_name[mostSpecific]
##     }    
##   }else{
##     ## not even the most general rank is uniquely defined; here we punt
##     thisRank <- ranks[1]    
##     tab <- data.frame(
##                node=rep(node$i, nranks),
##                rank=ranks,
##                thisRank=rep(thisRank, nranks),
##                tax_id=rep(paste(node[[thisRank]], collapse=' '), nranks),
##                tax_name=rep(paste(node[[thisRank]], collapse=' '), nranks)
##                )    
##   }

##   return(tab)
  
## }

refpkgContents <- function(path, manifest='CONTENTS.json'){
  ## Read the file `manifest` from a refpackage located at `path` and
  ## return a list containing the package contents.

  ## TODO: import(rjson) in NAMESPACE fails - how do I use this
  ## package properly?
  library(rjson)
  contents <- fromJSON(file=file.path(path, manifest))

  ## TODO: validate md5 checksums
  
  files <- lapply(file.path(path, contents$files), normalizePath)
  names(files) <- names(contents$files)
  contents$files <- files
    
  return(contents)
}


taxonomyFromRefpkg <- function(path, seqnames, lowest_rank=NA){

  ## Construct a data.frame providing the lineage of each sequence
  ## described in seq_info.
  ##
  ## Input
  ## -----
  ##
  ## * path - path to a refpkg directory
  ## * seqnames - optional character vector of sequence names. If
  ##   provided, determines the order of $taxTab
  ## * lowest_rank - name of the most specific (ie, rightmost) rank to
  ##   include. Default is the name of the rightmost column in
  ##   refpkg_contents$taxonomy
  ##
  ## Value
  ## -----
  ##
  ## A list with the following elements
  ## * taxNames - a named character vector of taxonomic names (names
  ##   are tax_ids)
  ## * taxTab - a data.frame in which each row corresponds to a
  ##   reference sequence and contains a tax_id followed by the
  ##   corresponding lineage (columns are root...lowest_rank)
  
  first_rank <- 'root'
  
  contents <- refpkgContents(path)

  ## taxonomic assignments of reference sequences
  seq_info <- read.csv(contents$files$seq_info, as.is=TRUE)

  ## TODO: taxtable.py should represent Non/NA as ,, not ,"",
  taxonomy <- read.csv(contents$files$taxonomy, colClasses='character', na.strings=c(""))

  taxNames <- taxonomy$tax_name
  names(taxNames) <- taxonomy$tax_id

  ## this merge should preserve the order of seq_info
  taxTab <- merge(seq_info[,c('seqname','tax_id')], taxonomy, sort=FALSE)
  rownames(taxTab) <- taxTab[['seqname']]
  
  ## define the columns to be included as those bounded by first_rank
  ## and lowest_rank
  if(is.na(lowest_rank)){
    lowest_rank <- colnames(taxTab)[ncol(taxTab)]
  }
  
  cols <- do.call(seq,
                  lapply(c(first_rank,lowest_rank),
                         function(n) match(n, colnames(taxTab)))
                  )

  taxTab <- cbind(tax_id=taxTab$tax_id,
                  taxTab[,cols],
                  stringsAsFactors = FALSE)

  if(!missing(seqnames)){
    taxTab <- taxTab[match(seqnames, rownames(taxTab)),]
  }

  ## sanity checks for taxTab:

  ## will fail if not all sequences in seq_info have corresponding
  ## entries in taxonomy table
  if(any(is.na(taxTab$tax_id))){
    stop('Every value of tax_id in the reference package seq_info.csv must be represented in taxonomy.csv')
  }
  
  return(list(taxNames=taxNames, taxTab=taxTab))
}