R/IdTaxa.R


IdTaxa <- function(test,
	trainingSet,
	type="extended",
	strand="both",
	threshold=60,
	bootstraps=100,
	samples=L^0.47,
	minDescend=0.98,
	fullLength=0,
	processors=1,
	verbose=TRUE) {
	
	# error checking
	if (!is(test, "DNAStringSet") && !is(test, "RNAStringSet") && !is(test, "AAStringSet"))
		stop("test must be an AAStringSet, DNAStringSet, or RNAStringSet.")
	if (is(test, "AAStringSet"))
		strand <- "top"
	# de-replicate sequences
	ns <- names(test)
	d <- !duplicated(test)
	map <- match(test, test[d]) # mapping of unique sequences
	# ensure that the strand vector is consistently mapped
	if (length(strand)==length(test)) {
		# check for different strand with same sequence
		w <- strand[d][map] != strand
		if (any(w)) { # include both strands
			d <- d | w
			map <- match(test, test[d])
		}
		strand <- strand[d]
	} else if (length(strand) > 1) {
		stop("strand must be length 1 or the length of test.")
	}
	test <- test[d]
	l <- length(test)
	if (l < 1)
		stop("At least one test sequence is required.")
	if (!is(trainingSet, "Taxa") || !is(trainingSet, "Train"))
		stop("trainingSet must be an object of class 'Taxa' (subclass 'Train').")
	if (is(test, "AAStringSet") && is.null(trainingSet$alphabet))
		stop("trainingSet was built from amino acid sequences but test contains nucleotide sequences.")
	TYPES <- c("collapsed", "extended")
	type <- pmatch(type[1], TYPES)
	if (is.na(type))
		stop("Invalid type.")
	if (type == -1)
		stop("Ambiguous type.")
	if (length(strand) != 1 &&
		length(strand) != length(test))
		stop("strand must be length 1 or the length of test.")
	STRANDS <- c("both", "top", "bottom")
	strand <- pmatch(strand, STRANDS, duplicates.ok=TRUE) # allow repeated values when strand is a vector
	if (any(is.na(strand)))
		stop("Invalid strand.")
	if (any(strand==-1))
		stop("Ambiguous strand.")
	if (length(strand)==1)
		strand <- rep(strand, length(test))
	w <- which(strand==3)
	if (length(w) > 0) # use opposite orientation
		test[w] <- reverseComplement(test[w])
	if (!is.numeric(threshold))
		stop("threshold must be a numeric.")
	if (threshold < 0 || threshold > 100)
		stop("threshold must be between 0 and 100 (inclusive).")
	if (!is.numeric(minDescend))
		stop("minDescend must be a numeric.")
	if (minDescend < 0.5 || minDescend > 1)
		stop("minDescend must be between 0.5 and 1 (inclusive).")
	sexpr <- substitute(samples)
	if (!is.numeric(sexpr)) {
		if (is.name(sexpr)) { # function name
			sexpr <- call(as.character(sexpr),
				as.name("L")) # pass in 'L'
		} else if (!(is.call(sexpr) && # call
			"L" %in% all.vars(sexpr))) { # containing 'L'
			stop("samples must be a call containing 'L'.")
		}
	}
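	# note: 'samples' is kept unevaluated here and is evaluated below with 'L'
	# set to the number of non-ambiguous k-mers in each query sequence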
	if (!is.numeric(fullLength))
		stop("fullLength should be a numeric.")
	if (length(fullLength)==1L && fullLength==0) {
		fullLength <- c(0, Inf)
	} else {
		if (any(fullLength < 0))
			stop("fullLength must not be negative.")
		if (length(fullLength)==1L) {
			if (fullLength < 1L) { # upper/lower quantiles
				fullLength <- sort(c(fullLength,
					1 - fullLength))
			} else { # fold-differences
				fullLength <- c(1/fullLength,
					fullLength)
			}
		} else if (length(fullLength)==2L) {
			if (isTRUE(all.equal(fullLength[1], fullLength[2])))
				stop("fullLength must be 2 different numerics.")
			fullLength <- sort(fullLength)
		} else {
			stop("fullLength must be 1 or 2 numerics.")
		}
	}
	if (!is.logical(verbose))
		stop("verbose must be a logical.")
	if (!is.null(processors) && !is.numeric(processors))
		stop("processors must be a numeric.")
	if (!is.null(processors) && floor(processors)!=processors)
		stop("processors must be a whole number.")
	if (!is.null(processors) && processors < 1)
		stop("processors must be at least 1.")
	if (is.null(processors)) {
		processors <- detectCores()
	} else {
		processors <- as.integer(processors)
	}
	a <- vcountPattern("-", test)
	if (any(a > 0))
		stop("Gap characters ('-') are not permitted in test.")
	a <- vcountPattern(".", test)
	if (any(a > 0))
		stop("Unknown characters ('.') are not permitted in test.")
	
	# set default values
	taxonomy <- trainingSet$taxonomy
	taxa <- trainingSet$taxa
	children <- trainingSet$children
	parents <- trainingSet$parents
	fraction <- trainingSet$fraction
	sequences <- trainingSet$sequences
	kmers <- trainingSet$kmers
	crossIndex <- trainingSet$crossIndex
	K <- trainingSet$K
	counts <- trainingSet$IDFweights
	decision_kmers <- trainingSet$decisionKmers
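	# total number of possible k-mers in the (possibly reduced) alphabet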
	if (is(test, "AAStringSet")) {
		nKmers <- (max(trainingSet$alphabet) + 1)^K
	} else {
		nKmers <- 4^K
	}
	
	Ls <- lengths(kmers) # number of k-mers per sequence
	if (all(fullLength < 1)) { # infer length quantiles
		t <- tapply(Ls,
			crossIndex,
			function(x) {
				if (length(x)==1L) {
					NA_real_
				} else {
					x/mean(x)
				}
			})
		fullLength <- quantile(unlist(t),
			fullLength,
			na.rm=TRUE)
		if (is.na(fullLength[1]))
			stop("fullLength cannot be inferred from trainingSet.")
	}
	
	if (verbose) {
		time.1 <- Sys.time()
		pBar <- txtProgressBar(style=ifelse(interactive(), 3, 1))
	}
	
	# index and sort all unique (non-ambiguous) k-mers
	if (is(test, "AAStringSet")) {
		testkmers <- .Call("enumerateSequenceReducedAA",
			test,
			K,
			trainingSet$alphabet,
			PACKAGE="DECIPHER")
	} else {
		testkmers <- .Call("enumerateSequence",
			test,
			K,
			PACKAGE="DECIPHER")
	}
	
	# determine the total number of k-mers per sequence
	notNAs <- unlist(lapply(testkmers,
		function(x)
			sum(!is.na(x))))
	# calculate the number of k-mers to subsample
	S <- eval(sexpr,
		envir=list(L=notNAs),
		enclos=parent.frame())
	if (!is.numeric(S))
		stop("samples did not evaluate to a numeric.")
	if (any(is.na(S)))
		stop("samples evalated to NA.")
	if (any(S < 0))
		stop("samples evaluated to a negative number of samples.")
	if (length(S)==1) {
		S <- rep(S, length(notNAs))
	} else if (length(S) != length(notNAs)) {
		stop("samples did not evaluate to a vector of length equal to the number of sequences in test.")
	}
	S <- ceiling(S)
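	# e.g., with the default samples=L^0.47, a query yielding 1,500 usable
	# k-mers is subsampled to ceiling(1500^0.47) = 32 k-mers per replicate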
	
	# require a minimum sample size
	L <- quantile(Ls, 0.9)
	for (minS in seq(2L, 100L, 2L)) {
		# probability of observing half of k-mers by chance
		P <- 1 - pbinom(minS*0.5 - 1,
			minS,
			L/nKmers)
		# probability of occurrence in one or more sequences
		P <- 1 - pbinom(0, # observed once or more
			length(kmers), # number of sequences
			P)
		if (P < 0.01) # less than 1% of bootstrap replicates
			break
	}
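	# minS is the smallest sample size at which chance matches to at least half
	# of the sampled k-mers are expected in < 1% of bootstrap replicates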
	S <- ifelse(S < minS, minS, S)
	
	testkmers <- lapply(testkmers,
		function(x)
			sort(unique(x + 1L), na.last=NA))
	
	if (!is.numeric(bootstraps))
		stop("bootstraps must be a numeric.")
	if (bootstraps != floor(bootstraps))
		stop("bootstraps must be a whole number.")
	if (bootstraps < 1)
		stop("bootstraps must be at least one.")
	# set B such that > 99% of k-mers are sampled
	B <- 5*lengths(testkmers)/S # 1 - P(0) = 1 - exp(-5)
	B <- ifelse(B > bootstraps, bootstraps, B)
	B <- as.integer(B)
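	# at B = 5*L/S, each query k-mer is expected to be drawn about 5 times
	# across replicates, so the chance it is never sampled is ~exp(-5) < 1%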
	
	boths <- which(strand==1)
	if (length(boths) > 0) {
		revkmers <- .Call("enumerateSequence",
			reverseComplement(test[boths]),
			K,
			PACKAGE="DECIPHER")
		revkmers <- lapply(revkmers,
			function(x)
				sort(unique(x + 1L), na.last=NA))
	}
	
	if (type==1L) {
		if (is.null(trainingSet$ranks)) {
			results <- "Root [0%]; unclassified_Root [0%]"
		} else {
			results <- paste("Root [",
				trainingSet$ranks[1],
				", 0%]; unclassified_Root [",
				trainingSet$ranks[2],
				", 0%]",
				sep="")
		}
	} else {
		if (is.null(trainingSet$ranks)) {
			results <- list(list(taxon=c("Root",
					"unclassified_Root"),
				confidence=rep(0, 2)))
		} else {
			results <- list(list(taxon=c("Root",
					"unclassified_Root"),
				confidence=rep(0, 2),
				rank=c(trainingSet$ranks[1],
					trainingSet$ranks[2])))
		}
	}
	results <- rep(results, length(testkmers))
	I <- c(seq_along(testkmers),
		boths)
	O <- c(rep(0L, length(testkmers)),
		seq_along(boths))
	confs <- sims <- numeric(length(testkmers))
	for (i in seq_along(I)) {
		if (O[i]) { # use revkmers
			mykmers <- revkmers[[O[i]]]
		} else { # use testkmers
			mykmers <- testkmers[[I[i]]]
		}
		
		if (length(mykmers) <= S[I[i]]) # not enough k-mers to test
			next
		
		# choose which training sequences to use for benchmarking
		k <- 1L
		repeat {
			subtrees <- children[[k]]
			n <- length(decision_kmers[[k]][[1]])
			
			if (n==0 || is.na(fraction[k])) { # use every subtree
				w <- seq_along(subtrees)
				break
			} else if (length(subtrees) > 1) {
				# set the number of k-mers to sample in each bootstrap replicate
				s <- ceiling(n*fraction[k])
				
				# sample the decision k-mers
				sampling <- matrix(sample(n, s*B[I[i]], replace=TRUE), B[I[i]], s)
				
				hits <- matrix(0,
					nrow=length(subtrees),
					ncol=B[I[i]])
				matches <- .Call("intMatch",
					decision_kmers[[k]][[1]],
					mykmers,
					PACKAGE="DECIPHER")
				myweights <- decision_kmers[[k]][[2]]
				for (j in seq_along(subtrees)) {
					hits[j,] <- .Call("vectorSum",
						matches,
						myweights[j,],
						sampling,
						B[I[i]],
						PACKAGE="DECIPHER")
				}
				
				maxes <- apply(hits, 2, max) # max hits per bootstrap replicate
				hits <- colSums(t(hits)==maxes & maxes > 0)
				w <- which(hits >= minDescend*B[I[i]])
				if (length(w) != 1) { # zero or multiple groups with 100% confidence
					w <- which(hits >= B[I[i]]*0.5) # require 50% confidence to use a subset of groups
					if (length(w)==0) {
						w <- seq_along(hits) # use all groups
						break
					}
					
					w <- which(hits > 0) # use any group with hits
					if (length(w)==0)
						w <- seq_along(hits) # use all groups
					break
				}
			} else { # only one child
				w <- 1
			}
			
			if (length(children[[subtrees[w]]])==0)
				break
			
			k <- subtrees[w]
		}
		
		keep <- unlist(sequences[children[[k]][w]])
		if (fullLength[1] > 0 || is.finite(fullLength[2])) {
			keep <- keep[Ls[keep] >= fullLength[1]*length(mykmers) &
				Ls[keep] <= fullLength[2]*length(mykmers)]
			if (length(keep)==0L) # no sequences
				next
		}
		
		# determine the number of k-mers per bootstrap replicate
		s <- S[I[i]] # subsample size
		
		# sample the same query k-mers for all comparisons
		sampling <- matrix(sample(mykmers, s*B[I[i]], replace=TRUE), B[I[i]], s)
		
		# only match unique k-mers to improve speed
		uSampling <- sort(unique(as.vector(sampling)))
		m <- match(sampling, uSampling)
		
		# lookup IDF weights for sampled k-mers
		myweights <- matrix(counts[sampling], B[I[i]], s)
		
		# record the matches to each group
		hits <- .Call("parallelMatch",
			uSampling,
			kmers,
			keep,
			m,
			myweights,
			B[I[i]],
			processors,
			PACKAGE="DECIPHER")
		
		# find the index of the top hit per group
		sumHits <- hits[[2]] # score per sequence
		hits <- hits[[1]] # matrix of hits
		lookup <- crossIndex[keep] # indices in taxonomy
		index <- sort(unique(lookup)) # index of each tested group
		o <- order(lookup)
		topHits <- o[.Call("groupMax",
			sumHits[o],
			lookup[o],
			index,
			PACKAGE="DECIPHER")]
		hits <- hits[, topHits, drop=FALSE] # only look at the top sequence per group
		
		# compute confidence from the number of hits per group
		totHits <- numeric(length(topHits))
		davg <- mean(rowSums(myweights))
		maxes <- max.col(hits)
		for (j in seq_len(B[I[i]])) # each bootstrap replicate
			totHits[maxes[j]] <- totHits[maxes[j]] + hits[j, maxes[j]]/davg
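		# each replicate credits its winning group with that group's hit weight
		# (scaled by davg), so totHits/B approximates the fraction of bootstrap
		# replicates supporting each group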
		
		# choose the group with highest confidence
		w <- which(totHits==max(totHits))
		if (length(w) > 1) {
			selected <- sample(w, 1)
		} else {
			selected <- w
		}
		if (O[i]) { # second pass
			if (sum(hits[, selected])/davg <= sims[I[i]]) {
				if (verbose)
					setTxtProgressBar(pBar, i/length(I))
				next # other strand was higher similarity
			}
		} else { # first pass (record result)
			confs[i] <- totHits[selected] # confidence
			sims[i] <- sum(hits[, selected])/davg # weighted k-mer similarity
		}
		
		# record confidences up the hierarchy
		predicteds <- index[selected]
		p <- parents[predicteds]
		while (p > 0) {
			predicteds <- c(p, predicteds)
			p <- parents[p]
		}
		confidences <- rep(totHits[selected]/B[I[i]]*100, length(predicteds))
		w <- totHits > 0
		w[selected] <- FALSE
		w <- which(w)
		for (j in seq_along(w)) {
			p <- parents[index[w[j]]]
			while (p > 0) {
				m <- match(p, predicteds)
				if (!is.na(m))
					confidences[m] <- confidences[m] + totHits[w[j]]/B[I[i]]*100
				p <- parents[p]
			}
		}
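		# support for competing groups is also credited to taxa they share with
		# the selected lineage, so confidence never decreases toward the Root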
		
		# record the results for this sequence
		w <- which(confidences >= threshold)
		if (type==1L) {
			if (length(w)==length(predicteds)) {
				confidences <- formatC(confidences,
					digits=1,
					format="f")
				if (is.null(trainingSet$ranks)) {
					results[I[i]] <- paste(taxa[predicteds],
						paste(" [",
							confidences,
							"%]",
							sep=""),
						sep="",
						collapse="; ")
				} else {
					results[I[i]] <- paste(taxa[predicteds],
						paste(" [",
							trainingSet$ranks[predicteds],
							", ",
							confidences,
							"%]",
							sep=""),
						sep="",
						collapse="; ")
				}
			} else {
				if (length(w)==0)
					w <- 1 # assign to Root
				confidences <- formatC(c(confidences[w],
						confidences[w[length(w)]]),
					digits=1,
					format="f")
				if (is.null(trainingSet$ranks)) {
					results[I[i]] <- paste(c(taxa[predicteds[w]],
							paste("unclassified",
								taxa[predicteds[w[length(w)]]],
								sep="_")),
						paste(" [",
							confidences,
							"%]",
							sep=""),
						sep="",
						collapse="; ")
				} else {
					results[I[i]] <- paste(c(taxa[predicteds[w]],
							paste("unclassified",
								taxa[predicteds[w[length(w)]]],
								sep="_")),
						paste(" [",
							c(trainingSet$ranks[predicteds[w]],
								trainingSet$ranks[predicteds[w[length(w)] + 1]]),
							", ",
							confidences,
							"%]",
							sep=""),
						sep="",
						collapse="; ")
				}
			}
		} else { # type==2L
			if (length(w)==length(predicteds)) {
				if (is.null(trainingSet$ranks)) {
					results[[I[i]]] <- list(taxon=taxa[predicteds],
						confidence=confidences)
				} else {
					results[[I[i]]] <- list(taxon=taxa[predicteds],
						confidence=confidences,
						rank=trainingSet$ranks[predicteds])
				}
			} else {
				if (length(w)==0)
					w <- 1 # assign to Root
				if (is.null(trainingSet$ranks)) {
					results[[I[i]]] <- list(taxon=c(taxa[predicteds[w]],
							paste("unclassified",
								taxa[predicteds[w[length(w)]]],
								sep="_")),
						confidence=c(confidences[w],
							confidences[w[length(w)]]))
				} else {
					results[[I[i]]] <- list(taxon=c(taxa[predicteds[w]],
							paste("unclassified",
								taxa[predicteds[w[length(w)]]],
								sep="_")),
						confidence=c(confidences[w],
							confidences[w[length(w)]]),
						rank=c(trainingSet$ranks[predicteds[w]],
							trainingSet$ranks[predicteds[w[length(w)] + 1]]))
				}
			}
		}
		
		if (verbose)
			setTxtProgressBar(pBar, i/length(I))
	}
	results <- results[map] # re-replicate
	names(results) <- ns # inherit names from test
	if (type==2L) # extended
		class(results) <- c("Taxa", "Test")
	
	if (verbose) {
		close(pBar)
		cat("\n")
		time.2 <- Sys.time()
		print(round(difftime(time.2,
				time.1,
				units='secs'),
			digits=2))
		cat("\n")
	}
	
	return(results)
}
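
A minimal usage sketch (illustrative, not part of the package source): the FASTA
paths and the taxonomy vector below are placeholders, and a real training set
would typically be built with LearnTaxa() or loaded from a published classifier.

library(DECIPHER)

# reference sequences whose names hold "Root; ...; Genus" taxonomic assignments
seqs <- readDNAStringSet("<training sequences>.fasta")
trainingSet <- LearnTaxa(seqs, names(seqs))

# classify query sequences against the training set
test <- readDNAStringSet("<query sequences>.fasta")
ids <- IdTaxa(test, trainingSet, strand="top", threshold=60, processors=NULL)
ids # print the assignments
plot(ids, trainingSet) # plot the classifications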
