Nothing
classifyPlacements <- function(taxdata, treedists, placetab, ..., verbose=FALSE, debug=FALSE){
## Perform classification of placements by pplacer on a phylogenetic
## tree using clst.
##
## Input
## -----
##
vcat <- function(msg){if(verbose){cat(msg)}}
vprint <- function(msg){if(verbose){print(msg)}}
## dmat is the square matrix of distances between tips in the
## reference tree
dmat <- treedists$dmat
## taxonomic data
taxNames <- taxdata$taxNames
taxTab <- taxdata$taxTab
## TODO: define leftmost column as root in clst::classifyIter
taxTab$tax_id <- NULL
## distances: dvects is a rectangular matrix containing distances
## between nodes (rows) and tips (columns)
requiredCols <- c('at','edge','branch')
if(!all(requiredCols %in% colnames(placetab))){
stop(cat(paste('placetab must have all of the columns', requiredCols, '\n')))
}
## TODO: need to consider how to correct for long branch lengths of
## placements.
## * One approach would be to ignore the branch lengths of the
## placements, and instead add an offset, perhaps calculated from
## the lengths of the terminal branches of the reference
## sequences.
## * another approach might be to transform the branch term below,
## somehow - take the log, perhaps?
## * or, calculate the pairwise distances between reference
## sequences by ignoring terminal branch lengths as well?
## Of course, the problem might arise mostly from not having
## representative reference sequences to start with.
dvects <- with(placetab, {
treedists$dists[at+1,,drop=FALSE] + treedists$paths[at+1,,drop=FALSE]*edge + branch
})
## compatibility checks for input data:
if(!nrow(taxTab) == nrow(dmat) | !nrow(taxTab) == ncol(dmat)){
stop('treedists$dmat must have an edge length equal to the number of rows in taxdata$taxTab')
}
stopifnot(setequal(rownames(taxTab), rownames(dmat)))
## taxTab and margins of dmat must be in the same order
taxTab <- taxTab[match(rownames(dmat),rownames(taxTab)),]
stopifnot(all(rownames(taxTab) == rownames(dmat)))
## perform the classification. each list element contains results
## from a single atgroup; perform the classification and summarize the
## results
## classified <- lapply(1:10,
classified <- lapply(seq(nrow(dvects)),
function(i){
vcat(gettextf('*** i = %s ***\n',i))
cc <- clst::classifyIter(dmat, taxTab, dvects[i,], ..., verbose=debug)
last <- cc[[length(cc)]]
d <- with(last, {
if(length(matches) > 0){
tids <- matches
}else{
## if no match, save the tally for the taxon
## with the lowest median distance to the query
tids <- rownames(tally)[which.min(tally$median)]
}
cbind(data.frame(tax_id=tids,
tax_name=taxNames[tids],
rank=names(rank)),
tally[tids,,drop=FALSE])
})
d$d <- last$thresh$D
d$at <- placetab$at[i]
d$atgroup <- placetab$atgroup[i]
## d$tally <- tab$tally[i]
vprint(d)
return(d)
})
cdata <- do.call(rbind, classified)
rownames(cdata) <- NULL
return(cdata)
}
placeMat <- function(file){
## Parses a lower triangular matrix of between-edge distances
## produced by placeutil --distmat
##
## Input
## -----
##
## * file - name of input file
##
## Output
## ------
##
## A list(ddist,paths) of two square matrices
##
## * ddist - distances between distal ends of pairs of edges.
## * paths - Describes the path between edge i,j as parallel (-1) or
## serial (1).
## some notes from Erick:
## a placement on an edge looks like this:
## proximal
## |
## | d_p
## |
## |---- x
## |
## | d_d
## |
## |
## distal
## d_p is the distance from the placement x to the proximal side of the
## edge, and d_d the distance to the distal side.
## If the distance from x to a leaf y is an S-distance Q, then the
## path from x to y will go through the distal side of the edge and
## we will need to add d_d to Q to get the distance from x to y. If
## the distance from x to a leaf y is a P-distance Q, then the path
## from x to y will go through the proximal side of the edge, and we
## will need to subtract off d_d from Q to get the distance from x
## to y. In either case, we always need to add the
## length of the pendant edge, which is the second column.
## To review, say the values of the two leftmost columns are a and b
## for a given placement x, and that it is on an edge i. We are
## interested in the distance of x to a leaf y, which is on edge j.
## We look at the distance matrix, entry (i,j), and say it is an
## S-distance Q. Then our distance is Q+a+b. If it is a P-distance
## Q, then the distance is Q-a+b.
## The distances between leaves should always be P-distances, and there
## we need no trickery.
v <- scan(file, what="character", quiet=TRUE)
N <- ceiling(sqrt(length(v)*2))
stopifnot(N*(N-1)/2 == length(v))
m <- matrix(nrow=N, ncol=N)
m[upper.tri(m)] <- v
## numeric values
ddist <- substring(m,2)
mode(ddist) <- "numeric"
lt <- lower.tri(ddist)
ddist[lt] <- t(ddist)[lt]
diag(ddist) <- 0
## textual values (P,S)
pstrs <- substring(m,1,1)
## these values will be applied to the position of placement on an
## edge; P (parallel) leaves are corrected by multiplying by -1, S
## (serial) by 1
paths <- ifelse(pstrs == 'P', -1L, 1L)
paths[lt] <- t(paths)[lt]
diag(paths) <- 1L
rownames(paths) <- NULL
list(ddist=ddist, paths=paths)
}
edgeMap <- function(placefile){
treestr <- fromJSON(file=placefile)$tree
spl <- strsplit(treestr, '[,(]')[[1]]
edgelist <- lapply(spl, function(s){
data.frame(
name = strsplit(s, ':')[[1]][1],
edge = as.numeric(strsplit(s, '(\\]|\\[)')[[1]][2])
)
})[grepl(':[^[]+\\[',spl)]
edgemap <- do.call(rbind, edgelist)
## i corresponds to margins of placeMat output
edgemap$i <- edgemap$edge + 1
return(edgemap)
}
treeDists <- function(placefile, distfile){
## Provides objects (dists, paths) that can be used to calculate
## vectors of distances between an internal node and each leaf
## node. Also returns a square matrix of distances between leaf
## nodes.
##
## Input
## -----
##
## * placefile - path to pplacer output
## * distfile - path to output of 'placeutil --distmat'
##
## Value
## -----
##
## A list with named elements $dists, $paths, and $dmat
##
## * $dists, $paths - rectangular matrices with rows corresponding
## to all nodes, and named columns corresponding to tips.
## * $dmat - square matrix containing distances between pairs of tips.
##
edgemap <- edgeMap(placefile)
## TODO: if distfile is missing, create using `guppy distmat`
pmat <- placeMat(distfile)
leaves <- edgemap$i
dists <- pmat$ddist[,leaves]
paths <- pmat$paths[,leaves]
colnames(dists) <- colnames(paths) <- edgemap$name
dmat <- pmat$ddist[leaves,leaves]
colnames(dmat) <- rownames(dmat) <- edgemap$name
list(dists=dists, paths=paths, dmat=dmat)
}
placeData <- function(file){
## Input
## -----
##
## * file - output of pplacer (.place file); may be the name of a
## file or an open connection.
##
## Output
## ------
##
## * a data.frame with columns
## 'name','hit','at','mlwr','ppost','mlll','bml','edge','branch'
## (see Details)
##
## Details
## -------
##
## Columns in the data.frame output include 'name' (name of the
## sequence), 'hit' (integer corresponding to each row of output for
## a given sequence in the pplacer file); plus the following columns
## as described in
## (http://matsen.fhcrc.org/pplacer/manual.html#placefile):
##
## 1. Placement edge number, numbered as in a postorder traversal
## 2. ML likelihood weight ratio (i.e. the normalized ML likelihood values)
## 3. Posterior probability (or a dash if the -p option was not set)
## 4. ML log likelihood
## 5. Bayes marginal likelihood (or a dash if the -p option was not set)
## 6. The ML distance from the distal (farthest from the root) side of the edge
## 7. The ML pendant branch length
cnames <- c('at','mlwr','ppost','mlll','bml','edge','branch','tax_id','x')
lines <- readLines(file)
commentIdx <- grep('^#', lines, value=FALSE, perl=TRUE)
nameIdx <- grep('^>', lines, value=FALSE, perl=TRUE)
names <- c('none', substring(lines[nameIdx], 2))
## remove lines with comments, names and sequences
discard <- c(commentIdx, nameIdx, nameIdx+1)
lines[discard] <- paste(rep('\t',length(cnames)-1),collapse='')
con <- textConnection(lines)
tab <- read.table(con, sep='\t')
close(con)
colnames(tab) <- cnames
## rows corresponding to each name extend from here....
first <- c(1, nameIdx)
## ... to here
last <- c(nameIdx - 1, length(lines))
each <- last - first + 1
## add sequence names; no idea why tab$name <- rep(names, each=last -
## first + 1) doesn't give the expected behavior here
tab$name <- unlist(sapply(seq_along(names), function(i) rep(names[i],each[i])))
## subtract 2 from each value of each: first two lines are name, sequence
tab$hit <- as.integer(unlist(sapply(seq_along(each), function(i) seq(each[i]))) - 2)
## delete unneeded rows and rearrange columns
tab <- tab[-discard, c('name','hit', setdiff(cnames,'x'))]
tab$tax_id <- substring(tab$tax_id, 3)
return(tab)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.