R/PoisMixClus.R
In HTSCluster: Clustering High-Throughput Transcriptome Sequencing (HTS) Data

Documented in PoisMixClus

PoisMixClus <- function(y, g, conds, norm="TMM",  
	init.type = "small-em", init.runs = 1, init.iter = 10, alg.type = "EM", cutoff = 10e-6, 
	iter = 1000, fixed.lambda = NA, equal.proportions = FALSE, prev.labels = NA, 
	prev.probaPost = NA, verbose = FALSE, interpretation = "sum", EM.verbose = FALSE, wrapper=FALSE,
	subset.index = NA) {

	## fixed.lambda should be a list of length (number of fixed clusters)
	## g gives the number of clusters IN ADDITION to the fixed clusters
	## 	specified by fixed.lambda
	## equal.proportions should be TRUE or FALSE

	## Next loop only run if PoisMixClus is called directly (otherwise called by PoisMixClusWrapper)
	if(wrapper==FALSE) {
		if(is.matrix(y) == FALSE & is.data.frame(y) == FALSE) 
			stop(paste(sQuote("y"), "must be a matrix"))
		if(min(y) < 0 | sum(as.numeric(round(y))) != sum(as.numeric(y))) 
			stop(paste(sQuote("y"), "must be a matrix made up of nonnegative counts"))
		if(min(rowSums(y)) == 0)
			stop(paste("at least one observation in", sQuote("y"), 
			"contains all 0's and must be removed from the data"))
		if(is.vector(conds) == FALSE | length(conds) != ncol(y))
			stop(paste(sQuote("conds"), "must be a vector the same length as the number of columns in", sQuote("y")))
		if(length(init.type) > 1)
			stop(paste(sQuote("init.type"), "must be of length 1"))
		if(init.type != "small-em" & init.type != "kmeans" & init.type != "split.small-em") 
			stop(paste(sQuote("init.type"), "must be one of", dQuote("small-em"), "or", 
				dQuote("kmeans"), "or", dQuote("split.small-em")))
		if(alg.type != "EM" & alg.type != "CEM")
			stop(paste(sQuote("alg.type"), "must be one of", dQuote("EM"), "or", dQuote("CEM")))
		if(length(alg.type) > 1)
			stop(paste(sQuote("alg.type"), "must be one of", dQuote("EM"), "or", dQuote("CEM")))
		if(is.logical(verbose) == FALSE)
			stop(paste(sQuote("verbose"), "must be", dQuote("TRUE"), "or", dQuote("FALSE")))
		if(class(fixed.lambda) != "list" & is.na(fixed.lambda[1]) == FALSE)
			stop(paste(sQuote("fixed.lambda"), "must be", dQuote("NA") , "or a list."))
		if(is.vector(prev.labels) == FALSE & is.na(prev.labels[1]) == FALSE)
			stop(paste(sQuote("prev.labels"), "must be", dQuote("NA") , "or a vector of labels."))

		## Grouping columns of y in order of condition (all replicates put together)
		o.ycols <- order(conds)
		y <- y[,o.ycols]
		conds <- conds[o.ycols]
		conds.names <- unique(conds)
		d <- length(unique(conds))
		r <- as.vector(table(conds))
		if(length(rownames(y)) == 0) rn <- 1:nrow(y);
		if(length(rownames(y)) > 0) rn <- rownames(y);
		y <- as.matrix(y, nrow = nrow(y), ncol = ncol(y))
		rownames(y) <- rn;
		

		## Only calculate s values if they are not provided
		if(length(norm) != 1 & length(norm) != length(conds)) stop(paste(sQuote("norm"), "must be one of
		the following: none, TC, UQ, Med, DESeq, TMM, or a vector of the same length as", sQuote("conds")))
		## If estimated from data, all genes should be used
		if(length(norm) == 1) {
			if(norm == "none") 	s <- rep(1, cols);
			if(norm == "TC") s <- colSums(y) / sum(as.numeric(y));
			if(norm == "UQ") s <- apply(y, 2, quantile, 0.75) / sum(apply(y, 2, quantile, 0.75));
			if(norm == "Med") s <- apply(y, 2, median) / sum(apply(y, 2, median));
			if(norm == "DESeq") {
				## Code from DESeq, v1.8.3
				loggeomeans <- rowMeans(log(y))
				s <- apply(y, 2, function(x) exp(median((log(x)-loggeomeans)[is.finite(loggeomeans)])))
				s <- s / sum(s)
			}
			if(norm == "TMM") {
				f <- calcNormFactors(as.matrix(y), method = "TMM")
				s <- colSums(y)*f / sum(colSums(y)*f)
			} 
		}
		if(length(norm) == length(conds)) {
			s <- norm / sum(norm)
		}
		s.dot <- rep(NA, d) 
		for(j in 1:d) {
			s.dot[j] <- sum(s[which(conds == (unique(conds))[j])])
		}
		
		## In case only a subset of data are to be used for analysis
		if(is.na(subset.index)[1] == FALSE) {
			y <- y[subset.index,]
			n <- dim(y)[1];cols <- dim(y)[2]
			w <- rowSums(y)
		}
		if(is.na(subset.index)[1] == TRUE) {
			n <- dim(y)[1];cols <- dim(y)[2]
			w <- rowSums(y)
		}
	}
	
	if(wrapper==TRUE) {
		conds.names <- unique(conds)
		d <- length(unique(conds))
		r <- as.vector(table(conds))
		n <- dim(y)[1];cols <- dim(y)[2]
		w <- rowSums(y)
		K <- g
		s <- norm
		s.dot <- rep(NA, d) 
		for(j in 1:d) {
			s.dot[j] <- sum(s[which(conds == (unique(conds))[j])])
		}
	}

	K <- g
	if(class(fixed.lambda) == "list") {
		for(ll in 1:length(fixed.lambda)) {
			if(is.vector(fixed.lambda[[ll]]) == FALSE |
				length(fixed.lambda[[ll]]) != d)
				stop(paste(sQuote("fixed.lambda"), "must be", dQuote("NA") , 
					"or a list of vectors with length equal to the number of conditions."))
			if(length(which(fixed.lambda[[ll]] == 0)) > 0) {
				if(length(which(fixed.lambda[[ll]] == 1)) + 
					length(which(fixed.lambda[[ll]] == 0)) == length(fixed.lambda[[ll]])) {

					tmp <- 1/sum(s.dot[which(fixed.lambda[[ll]] == 1)])
					new <- fixed.lambda[[ll]]
					new[which(fixed.lambda[[ll]] == 1)] <- tmp
					message(cat("Fixed lambda\n", fixed.lambda[[ll]], "\n", "modified to\n",
						new, "\n", "to satisfy imposed parameter constraints.\n"))
					fixed.lambda[[ll]][which(fixed.lambda[[ll]] == 1)] <- tmp
				}
			}
			if(abs(as.numeric(fixed.lambda[[ll]] %*% s.dot)-1) > 10e-8)
				warning(paste("Check that constraint on lambda*s.dot is upheld for",sQuote("fixed.lambda")))
		}
		K <- g + length(fixed.lambda);	
	}
	diff <- 100 ## Convergence criterion
	index <- 0; go <- 1;

	## Inital values
	## init.type: "kmeans", "small-em", "split.small-em"

	if(init.type == "kmeans") {
		init.alg <- "kmeanInit";
		init.args <- list(y = y, g = K, conds = conds, norm=s, 
			fixed.lambda = fixed.lambda, equal.proportions = equal.proportions)
	}
	if(init.type == "small-em") {
		init.alg <- "emInit"
		init.args <- list(y = y, g = g, conds = conds, norm=s,
			alg.type = alg.type, init.run = init.runs,
			init.iter = init.iter, fixed.lambda = fixed.lambda, 
			equal.proportions = equal.proportions, verbose = verbose)
	}
	if(init.type == "split.small-em") {
		init.alg <- "splitEMInit"
		init.args <- list(y = y, g = g, conds = conds, norm=s,
			alg.type = alg.type,
			fixed.lambda = fixed.lambda,
			equal.proportions = equal.proportions, 
			prev.labels = prev.labels, prev.probaPost = prev.probaPost,
			init.iter = init.iter, init.runs = init.runs, verbose = verbose)
	}
	## Adding quote = TRUE to speed up do.call
	param.init <- do.call(init.alg, init.args, quote=TRUE)

	if(equal.proportions == FALSE) {
		pi <- pi.old <- param.init$pi.init
	}
	if(equal.proportions == TRUE) {
		pi <- pi.old <- rep(1/K, K)
	}
	lambda <- lambda.old <- param.init$lambda.init
	mean.calc <- mean.old <- PoisMixMean(y = y, g = K, conds = conds, 
		s = s, lambda = lambda)

	while(go == 1) {

		############
		## E-step ##
		############
		t <- probaPost(y, K, conds, pi, s, lambda)

		############
		## C-step ##
		############
		if(alg.type == "CEM") {
			## If two values of t_{ik} are map, 
			## arbitrarily choose the first
			partition <- unlist(apply(t, 1, 
				function(x) which(x == max(x, na.rm = TRUE))[1]))
			partition.mat <- matrix(0, nrow = n, ncol = K)
			for(i in 1:n) partition.mat[i,partition[i]] <- 1;
		}

		############
		## M-step ##
		############
		if(alg.type == "CEM") {
			if(equal.proportions == FALSE) {
				for(k in 1:K) {
					pi[k] <- length(which(partition == k))/n
				}
			}
			if(equal.proportions == TRUE) {
				pi <- rep(1/K, K)
			}
			denom <- colSums(partition.mat * w)
			if(class(fixed.lambda) != "list") {
				for(j in 1:d) {
					denom.bis <- denom * s.dot[j]
					num <- colSums(partition.mat * 
					rowSums(as.matrix(y[,which(conds == (unique(conds))[j])])))
					lambda[j,] <- num / denom.bis
				}
			}
			if(class(fixed.lambda) == "list") {
				for(ll in 1:length(fixed.lambda)) {
					lambda[,ll] <- fixed.lambda[[ll]]
				}
				for(j in 1:d) {
					denom.bis <- denom * s.dot[j]
					denom.bis <- denom.bis[-c(1:length(fixed.lambda))]
					num <- colSums(partition.mat * 
						rowSums(as.matrix(y[,which(conds == (unique(conds))[j])])))
					num <- num[-c(1:length(fixed.lambda))]
					lambda[j,-c(1:length(fixed.lambda))] <- num / denom.bis
				}
			}
		}

		if(alg.type == "EM") {
			if(equal.proportions == FALSE) {
				pi <- colSums(t)/n
			}
			if(equal.proportions == TRUE) {
				pi <- rep(1/K, K)
			}
			denom <- colSums(t * w)
			if(class(fixed.lambda) != "list") {
				denom.bis <- matrix(rep(denom, length(s.dot)) * rep(s.dot, each=K), byrow=T, ncol=K)
				num <- matrix(rowsum(matrix(y, ncol=n, nrow=length(conds), byrow=T), group=conds), 
					nrow=n, ncol=d, byrow=T)
				num <- matrix(unlist(lapply(lapply(1:d, function(x) t*num[,x]), colSums), recursive=FALSE,
					use.names=FALSE), nrow=d, ncol=K, byrow=T)
				lambda <- num/denom.bis
			}
			if(class(fixed.lambda) == "list") {
				for(ll in 1:length(fixed.lambda)) {
					lambda[,ll] <- fixed.lambda[[ll]]
				}
				## This loop could be improved for speed as above
				for(j in 1:d) {
					denom.bis <- denom * s.dot[j]
					denom.bis <- denom.bis[-c(1:length(fixed.lambda))]
					num <- colSums(t * 
						matrix(rep(rowSums(as.matrix(y[,which(conds == (unique(conds))[j])])),K), 
						ncol = K))
					num <- num[-c(1:length(fixed.lambda))]
					lambda[j,-c(1:length(fixed.lambda))] <- num / denom.bis
				}
			}	
		}

		#################
		## Convergence ##
		#################
		mean.calc <- PoisMixMean(y, g = K, conds, s, lambda)
		diff <- abs(logLikePoisMixDiff(y, mean.calc, pi, mean.old, pi.old))
		lambda.old <- lambda; pi.old <- pi; mean.old <- mean.calc;

		index <- index + 1
		if(verbose == TRUE) print(paste("Log-like diff:", diff))
		if(diff < cutoff) go <- 0;
		if(iter != FALSE & iter == index) go <- 0;
	}
	
	if(EM.verbose == TRUE) {
		cat("#####################################\n")
		cat("Number of EM iterations:", index, "\n")
		cat("Last log-likelihood difference:", diff, "\n")
		cat("#####################################\n")
	}

	#####################################
	## Final estimates of lambda and p ##
	#####################################
	names(pi) <- paste("Cluster", 1:K)
	colnames(lambda) <- paste("Cluster", 1:K)
	rownames(lambda) <- conds.names
	lambda.final <- lambda
	pi.final <- pi

	## Check to make sure one of the components is not degenerate
	if(min(pi) == 0 | is.nan(sum(lambda)) == TRUE) {
		probaPost <- NA
		labels <- NA
		BIC <- NA
		ICL <- NA
	}

	if(min(pi) > 0 | is.nan(sum(lambda)) == FALSE) {

		mean.calc <- PoisMixMean(y, g = K, conds, s, lambda)
		LL.tmp <- logLikePoisMix(y, mean.calc, pi)
		LL <- LL.tmp$ll
	
		######################
		## Determine labels ##
		######################
		t <- probaPost(y, K, conds, pi, s, lambda)
		## If two clusters have exactly identical map estimators,
		## arbitrarily choose the first one
		map <- unlist(apply(t, 1, function(x) which(x == max(x, 
			na.rm = TRUE))[1]))
		z <- matrix(0, nrow = n, ncol = K)
		for(i in 1:n) z[i,map[i]] <- 1;
		probaPost <- t
		labels <- map

		##############################
		## Calculate BIC, ICL       ##
		##############################
		if(equal.proportions == FALSE & class(fixed.lambda) != "list") {
#			np <- (K-1) + n + (d-1)*K 	# pi + w + lambda
			## CHANGE September 25, 2013: w is not considered as a parameter
			np <- (K-1) + (d-1)*K 	# pi + lambda
		}
		if(equal.proportions == TRUE & class(fixed.lambda) != "list") {
#			np <- n + (d-1)*K 	# w + lambda
			## CHANGE September 25, 2013: w is not considered as a parameter
			np <- (d-1)*K 		# lambda
		}
		if(equal.proportions == FALSE & class(fixed.lambda) == "list") {
#			np <- (K-1) + n + (d-1)*(K-length(fixed.lambda))	# pi + w + lambda not fixed
			## CHANGE September 25, 2013: w is not considered as a parameter
			np <- (K-1) + (d-1)*(K-length(fixed.lambda))		# pi + lambda not fixed
		}
		if(equal.proportions == TRUE & class(fixed.lambda) == "list") {
#			np <- n + (d-1)*(K-length(fixed.lambda))			# w + lambda not fixed
			## CHANGE September 25, 2013: w is not considered as a parameter
			np <- (d-1)*(K-length(fixed.lambda))				# lambda not fixed

		}
		BIC <- -LL + (np/2) * log(n)
#		entropy <- -2*sum(z*log(t), na.rm = TRUE)
		## CHANGE October 18, 2013: replace z with t in the entropy calculation for ICL
#		entropy <- -2*sum(t*log(t), na.rm = TRUE)
		## CHANGE July 25, 2014: typo in entropy (thanks, Melina Gallopin!)
		entropy <- -sum(t*log(t), na.rm = TRUE)
		ICL <- BIC + entropy
	}
	## Should cluster behavior (lambda) be interpretated wrt the gene means or sums?	
	if(interpretation == "mean") {
		s <- s * q
		w <- w / q
	}
	results <- list(lambda = lambda.final, pi = pi.final, labels = labels, 
		probaPost = probaPost, log.like = LL, BIC = -BIC, ICL = -ICL, 
		alg.type = alg.type, norm = s,
		conds = conds, iterations = index, logLikeDiff = diff, model.selection = NA,
		subset.index = subset.index)

	class(results) <- "HTSCluster"
	return(results)
}