Nothing
LearnTaxa <- function(train,
taxonomy,
rank=NULL,
K=NULL,
N=500,
minFraction=0.01,
maxFraction=0.06,
maxIterations=10,
multiplier=100,
maxChildren=200,
alphabet=AA_REDUCED[[139]],
verbose=TRUE) {
# error checking
if (!is(train, "DNAStringSet") && !is(train, "RNAStringSet") && !is(train, "AAStringSet"))
stop("train must be an AAStringSet, DNAStringSet, or RNAStringSet.")
l <- length(train)
if (l < 2)
stop("At least two training sequences are required.")
if (!is.character(taxonomy))
stop("taxonomy must be a character vector.")
if (length(taxonomy) != l)
stop("taxonomy must be the same length as train.")
if (!is.null(rank)) {
if (!is.data.frame(rank))
stop("rank must be a data.frame.")
ns <- c('Index', 'Name', 'Parent', 'Level', 'Rank')
w <- which(!(ns %in% names(rank)))
if (length(w) > 0)
stop("rank is missing columns named: ",
paste(ns[w], collapse=", "))
if (any(duplicated(rank$Index)))
stop("All values of Index within rank must be unique.")
}
if (any(!grepl("Root;", taxonomy, fixed=TRUE)))
stop("All elements of taxonomy must contain 'Root;'.")
if (is(train, "AAStringSet")) {
if (any(alphabet==""))
stop("No elements of alphabet can be empty.")
r <- strsplit(alphabet, "", fixed=TRUE)
alphabet <- setNames(rep(0L, 20),
AA_STANDARD)
for (i in seq_along(r)) {
w <- which(!(r[[i]] %in% AA_STANDARD))
if (length(w) > 0)
stop("Unrecognized letter(s) found in alphabet: ",
paste(r[[i]][w], collapse=", "),
".")
w <- which(alphabet[r[[i]]] != 0L)
if (length(w) > 0)
stop("Repeated amino acids found in alphabet: ",
paste(r[[i]][w], collapse=", "),
".")
alphabet[r[[i]]] <- i
}
w <- which(alphabet==0L)
if (length(w) > 0)
stop("Standard amino acids missing from alphabet: ",
paste(names(w), collapse=", "),
".")
size <- max(alphabet)
if (size==1)
stop("More than one grouping of amino acids is required in the alphabet.")
alphabet <- alphabet - 1L
} else {
alphabet <- NULL
size <- 4
}
if (is.null(K)) {
if (!is.numeric(N))
stop("N must be a numeric.")
if (N <= 1)
stop("N must be greater than one.")
quant <- quantile(width(train), 0.99)
if (max(width(train)) > N/50*quant)
warning("Extra long sequences in train may negatively affect accuracy.")
if (is(train, "AAStringSet")) {
K <- floor(log(N*quant,
.Call("alphabetSizeReducedAA",
train,
alphabet,
PACKAGE="DECIPHER")))
} else {
K <- floor(log(N*quant,
.Call("alphabetSize",
train,
PACKAGE="DECIPHER")))
}
if (is(train, "AAStringSet")) {
maxK <- as.integer(floor(log(1e8, size))) # theoretically 4294967295, but may encounter memory errors
} else {
maxK <- 13 # theoretically 15, but may encounter memory errors
}
if (K < 1) {
K <- 1
} else if (K > maxK) {
K <- maxK
warning("K may be smaller than ideal.")
}
} else {
if (!is.numeric(K))
stop("K must be a numeric.")
if (length(K) != 1)
stop("K must be a single numeric.")
if (K != floor(K))
stop("K must be a whole number.")
if (K < 1)
stop("K must be at least one.")
if (is(train, "AAStringSet")) {
maxK <- as.integer(floor(log(4294967295, size)))
} else {
maxK <- 15
}
if (K > maxK)
stop(paste("K can be at most ", maxK, ".", sep=""))
}
if (!is.numeric(minFraction))
stop("minFraction must be a numeric.")
if (minFraction <= 0)
stop("minFraction must be greater than zero.")
if (!is.numeric(maxFraction))
stop("maxFraction must be a numeric.")
if (maxFraction <= minFraction)
stop("maxFraction must be greater than minFraction.")
if (maxFraction >= 1)
stop("maxFraction must be less than one.")
if (!is.numeric(maxIterations))
stop("maxIterations must be a numeric.")
if (maxIterations < 1)
stop("maxIterations must be at least one.")
if (maxIterations != floor(maxIterations))
stop("maxIterations must be a whole number.")
if (!is.numeric(multiplier))
stop("multiplier must be a numeric.")
if (multiplier <= 0)
stop("multiplier must be greater than zero.")
if (!is.numeric(maxChildren))
stop("maxChildren must be a numeric.")
if (maxChildren < 1)
stop("maxChildren must be at least one.")
if (maxChildren != floor(maxChildren))
stop("maxChildren must be a whole number.")
if (!is.logical(verbose))
stop("verbose must be a logical.")
a <- vcountPattern("-", train)
if (any(a > 0))
stop("Gap characters ('-') are not permitted in train.")
a <- vcountPattern(".", train)
if (any(a > 0))
stop("Unknown characters ('.') are not permitted in train.")
if (verbose) {
num <- 0L
time.1 <- Sys.time()
pBar <- txtProgressBar(style=ifelse(interactive(), 3, 1))
}
# set default values
nKmers <- size^K
B <- 100L
# get the full classification beginning from "Root"
classes <- gsub(" *; *", ";", taxonomy)
classes <- sapply(strsplit(classes, "Root;"),
`[`,
2L)
classes <- gsub(" +$", "", classes)
classes <- paste(classes,
ifelse(endsWith(classes, ";"),
"",
";"),
sep="")
# index and sort all unique (non-ambiguous) k-mers
if (is(train, "AAStringSet")) {
kmers <- .Call("enumerateSequenceReducedAA",
train,
K,
alphabet,
PACKAGE="DECIPHER")
} else {
kmers <- .Call("enumerateSequence",
train,
K,
PACKAGE="DECIPHER")
}
kmers <- lapply(kmers,
function(x)
sort(unique(x + 1L), na.last=NA))
if (any(lengths(kmers)==0))
stop("All training sequences must have at least one k-mer.")
# create lists of:
# taxonomy = all distinct taxa ranks
# taxa = the taxon added in each taxonomy
# crossIndex = index of taxonomy for each sequence
# children = index (in taxonomy) of immediate children for each taxa
# parents = index (in taxonomy) of the parent level
# sequences = index (in classes/train/kmers) of sequences belonging to each taxa
u_classes <- unique(classes)
if (length(u_classes) <= 1)
stop("More than one taxonomic group is required.")
taxonomy <- strsplit(u_classes, ";", fixed=TRUE)
taxonomy <- lapply(taxonomy,
function(x) {
sapply(seq_along(x),
function(n)
paste(x[seq_len(n)],
";",
sep="",
collapse=""))
})
taxonomy <- unique(unlist(taxonomy))
crossIndex <- match(classes, taxonomy) + 1L # offset by 1 to add "Root"
taxonomy <- c("Root;", paste("Root;", taxonomy, sep=""))
s <- strsplit(taxonomy, ";", fixed=TRUE)
taxa <- unlist(lapply(s,
function(x) x[length(x)]))
levels <- lengths(s)
maxL <- max(levels) - 1L # max level with children
levs <- vector("list", maxL)
for (i in seq_len(maxL))
levs[[i]] <- which(levels==(i + 1L))
starts <- rep(1L, maxL)
children <- vector("list", length(taxonomy))
for (i in seq_along(children)) {
j <- levels[i]
if (j <= maxL) {
while (starts[j] <= length(levs[[j]]) &&
levs[[j]][starts[j]] <= i)
starts[j] <- starts[j] + 1L
if (starts[j] <= length(levs[[j]])) {
w <- levs[[j]][starts[j]:length(levs[[j]])]
w <- w[startsWith(taxonomy[w], taxonomy[i])]
} else {
w <- integer()
}
} else {
w <- integer()
}
children[[i]] <- w
}
parents <- numeric(length(taxonomy))
for (i in seq_along(children))
parents[children[[i]]] <- i
# record the rank of each taxon
if (is.null(rank)) {
ranks <- NULL
} else {
ranks <- character(length(taxa))
for (i in seq_along(ranks)) {
w <- which(rank$Name==taxa[i])
if (length(w)==0) {
stop("rank is missing the name: ",
taxa[i])
} else if (length(w) > 1) {
w <- w[which(rank$Level[w]==(levels[i] - 1L))]
if (length(w)==0) {
stop("rank is missing the name: ",
taxa[i],
" at level ",
levels[i] - 1L)
} else if (length(w) > 1) {
stop("rank contains a duplicated name: ",
taxa[i],
" at level ",
levels[i] - 1L)
}
} else {
if (rank$Level[w] != (levels[i] - 1L))
stop("rank matches a name at the wrong level: ",
taxa[i])
}
ranks[i] <- rank$Rank[w]
# confirm the correct parent
w <- rank$Parent[w] # Index of Parent
if (w > 0) { # not Root
w <- which(rank$Index==w) # Index matching Parent
if (length(w)==0)
stop("All values of Parent in rank must have a corresponding Index.")
if (rank$Name[w] != taxa[parents[i]])
stop("rank contains an unexpected Parent for Name: ",
taxa[i])
}
}
}
# find the sequences corresponding to each taxonomy
end_taxonomy <- substring(taxonomy, 6) # without "Root;"
sequences <- vector("list", length(end_taxonomy))
nchars <- nchar(end_taxonomy)
u_nchars <- unique(nchars)
nchars <- match(nchars, u_nchars)
nchars <- tapply(seq_along(nchars), nchars, c, simplify=FALSE)
for (i in seq_along(u_nchars)) {
m <- match(substring(classes, 0, u_nchars[i]),
end_taxonomy[nchars[[i]]])
m <- tapply(seq_along(m), m, c, simplify=FALSE)
sequences[nchars[[i]]] <- m
}
nSeqs <- lengths(sequences)
# record the most distinctive k-mers at each node in the taxonomic tree
.createTree <- function(k) {
l <- length(children[[k]])
if (l > 0 && l <= maxChildren) {
# obtain relative k-mer frequency by group
profile <- vector("list", l)
descendants <- integer(l)
for (i in seq_len(l)) {
results <- .createTree(children[[k]][i])
profile[[i]] <- results[[1]]
descendants[i] <- results[[2]]
}
profile <- matrix(unlist(profile),
ncol=l)
# calculate cross-entropy between parent and children
q <- colSums(t(profile)*descendants) # weighted k-mer frequencies
descendants <- sum(descendants) # total descendant groups
q <- q/descendants # relative k-mer frequency overall
H <- -profile*log(q) # entropy_i = -p_i*log(q_i)
# select the most distinguishing k-mers for each group
o <- apply(H, 2, order, decreasing=TRUE)
keep <- logical(length(q))
count <- 0L # number of k-mers
i <- 1L # k-mer
j <- 0L # group
recordKmers <- ceiling(max(colSums(profile > 0)))*0.10
while (count < recordKmers) {
j <- j + 1L
if (j > l) {
j <- 1L
i <- i + 1L
}
if (!keep[o[i, j]]) {
count <- count + 1L
keep[o[i, j]] <- TRUE
}
}
keep <- which(keep)
decision_kmers[[k]] <<- list(keep, t(profile[keep,]))
invisible(list(q, descendants))
} else {
w <- sequences[[k]]
profile <- tabulate(unlist(kmers[w]),
nKmers)
profile <- profile/sum(profile) # normalize
invisible(list(profile, 1L))
}
}
decision_kmers <- vector("list", length(taxonomy))
.createTree(1) # traverse the taxonomic tree
# learn the appropriate fraction for sampling
fraction <- rep(maxFraction, length(taxonomy))
incorrect <- rep(TRUE, l)
predicted <- character(l)
# allow each sequence maxIterations*delta control over fraction
delta <- (maxFraction - minFraction)*multiplier
for (it in seq_len(maxIterations)) {
remainingSeqs <- which(incorrect)
if (length(remainingSeqs)==0)
break
# test whether the sequences can be correctly classified
for (i in remainingSeqs) {
if (length(kmers[[i]])==0)
next # no k-mers to test
k <- 1L
correct <- TRUE
repeat {
subtrees <- children[[k]]
n <- length(decision_kmers[[k]][[1]])
if (n==0) { # no decision k-mers
break
} else if (length(subtrees) > 1) {
# set number of k-mers to choose each bootstrap replicate
if (is.na(fraction[k])) {
s <- ceiling(n*minFraction)
} else {
s <- ceiling(n*fraction[k])
}
# sample the decision k-mers
sampling <- matrix(sample(n, s*B, replace=TRUE), B, s)
hits <- matrix(0,
nrow=length(subtrees),
ncol=B)
matches <- .Call("intMatch",
decision_kmers[[k]][[1]],
kmers[[i]],
PACKAGE="DECIPHER")
myweights <- decision_kmers[[k]][[2]]
for (j in seq_along(subtrees)) {
hits[j,] <- .Call("vectorSum",
matches,
myweights[j,],
sampling,
B,
PACKAGE="DECIPHER")
}
maxes <- apply(hits, 2, max) # max hits per bootstrap replicate
hits <- colSums(t(hits)==maxes & maxes > 0)
w <- which.max(hits)
if (hits[w] < B*0.8) # require 80% confidence to further descend the taxonomic tree
break
} else { # only one child
w <- 1
}
if (!startsWith(classes[i], end_taxonomy[subtrees[w]])) {
correct <- FALSE
break
}
if (length(children[[subtrees[w]]])==0)
break
k <- subtrees[w]
}
if (correct) { # correct group
incorrect[i] <- FALSE
if (verbose) {
num <- num + 1L
setTxtProgressBar(pBar, num/l)
}
} else { # incorrect group
predicted[i] <- taxonomy[subtrees[w]]
if (is.na(fraction[k])) {
incorrect[i] <- NA
if (verbose) {
num <- num + 1L
setTxtProgressBar(pBar, num/l)
}
} else {
fraction[k] <- fraction[k] - delta/nSeqs[k]
if (fraction[k] <= minFraction) {
incorrect[i] <- NA
fraction[k] <- NA
if (verbose) {
num <- num + 1L
setTxtProgressBar(pBar, num/l)
}
}
}
}
}
}
# record problematic taxonomic groups and sequences
w <- which(incorrect | is.na(incorrect))
if (length(w) > 0) {
problemSequences <- data.frame(Index=w,
Expected=paste("Root;", classes[w], sep=""),
Predicted=predicted[w],
#row.names=names(train)[w],
stringsAsFactors=FALSE)
} else {
problemSequences <- data.frame(Index=w,
Expected=character(),
Predicted=predicted[w],
stringsAsFactors=FALSE)
}
w <- which(is.na(fraction))
if (length(w) > 0) {
problemGroups <- taxonomy[w]
} else {
problemGroups <- character()
}
# count the number of representatives per class
classTable <- table(classes)
weight <- 1/classTable[classes] # weigh each class equally
# compute Inverse Document Frequency (IDF) weights
counts <- numeric(nKmers)
for (i in seq_along(classes)) {
counts[kmers[[i]]] <- counts[kmers[[i]]] + weight[i]
}
counts <- length(classTable)/(1 + counts) # Max/(1 + F)
counts <- log(counts) # IDF weight is log(Max/(1 + F))
# prepare the result to return
result <- list(taxonomy=taxonomy,
taxa=taxa,
ranks=ranks,
levels=levels,
children=children,
parents=parents,
fraction=fraction,
sequences=sequences,
kmers=kmers,
crossIndex=crossIndex,
K=K,
IDFweights=counts,
decisionKmers=decision_kmers,
problemSequences=problemSequences,
problemGroups=problemGroups,
alphabet=alphabet)
class(result) <- c("Taxa", "Train")
if (verbose) {
setTxtProgressBar(pBar, 1)
close(pBar)
cat("\n")
time.2 <- Sys.time()
print(round(difftime(time.2,
time.1,
units='secs'),
digits=2))
cat("\n")
}
invisible(result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.