R/dexss.R

Defines functions dexss

Documented in dexss

# Copyright (C) 2013 Klambauer Guenter and Thomas Unterthiner
# <klambauer@bioinf.jku.at>


#' @title Detection of Differential Expression in a semi-supervised Setting

#' @description Performs the DEXSS algorithm for detection of differentially 
#' expressed genes in RNA-seq data for a semi-supervised setting, i.e. that 
#' the condition of some samples is known, and for some samples the condition
#' is unkown.
#' 
#' @details The read count \eqn{x} is explained by
#' a finite mixture of negative binomials:
#' 
#' \deqn{ 
#' p(x) = \sum_{i=1} ^n \alpha_i\  \mathrm{NB}(x;\ \mu_i, r_i),
#' }
#' 
#' where \eqn{\alpha_i} is the weight of the mixture component, \eqn{\mathrm{NB}}
#' is the negative binomial with mean parameter \eqn{\mu_i} and size parameter
#' \eqn{r_i}. The parameters are selected by an EM algorithm in a Baysian 
#' framework.
#' 
#' Each component in the mixture model corresponds to one condition.
#' 
#' \itemize{
#' \item If the groups, conditions, replicate status, or labels are unknown, DEXUS
#' tries to estimate these conditions. For each transcript DEXUS tries to
#' explain the read counts by one negative binomial distribution. If this is
#' possible, the transcript is explained by one condition and therefore it 
#' is not differentially expressed. If more than one negative binomial 
#' distribution is needed to explain the read counts of a transcript, this
#' transcript indicates that it is differentially expressed. Evidence for
#' differential expression is strong if a large amount of samples participate
#' in each condition and the mean expression values are well separated. Both
#' of these criteria are measured by the informative/non-informative (I/NI)
#' call. 
#' 
#' \item If there are more than two groups given by the vector \env{labels}, 
#' DEXUS uses a generalized linear model to explain the data in analogy to
#' (McCarthy, 2012). 
#' 
#' \item If there are two groups given by the vector \env{labels}, DEXUS
#' uses the exact test for count data to test between the 
#' sample groups, as implemented by (Anders and Huber, 2010) in the package 
#' "DESeq".
#' }
#' 
#'  
#' 
#' @param X either a vector of counts or a raw data matrix, where
#' columns are interpreted as samples and rows as genomic regions. An instance
#' of "countDataSet" is also accepted.
#' @param nclasses The number of conditions, i.e. mixture components. (Default = 2)
#' @param G The weight of the prior distribution of the mixture weights. 
#' Not used in the supervised case. (Default = 1).
#' @param cyc Positive integer that sets the number of cycles of the EM algorithm. 
#' (Default =  20).
#' @param alphaInit The initial estimates of the condition sizes, i.e., mixture weights.
#' Not used in the  supervised case. (Default = c(0.5,0.5)) .
#' @param labels The labels for the classes, will be coerced into an integer.
#' For this semi-supervised version the known labels/conditions 
#' must be coded as integers starting with 1. The samples with the label 1 will
#' be considered as being in the "major condition".
#' For the samples with unknown labels/conditions an "NA" must be set. 
#' @param normalization method used for normalizing the reads. 
#' "RLE" is the method used by (Anders and Huber, 2010),
#' "upperquartile" is the Upper-Quartile method by (Bullard et al., 2010), and none
#' deactivates normalization. (Default = "RLE").
#' @param kmeansIter number of times the K-Means algorithm is run. (Default = 10).
#' @param ignoreIfAllCountsSmaller Ignores transcript for which all read counts
#' are smaller than this value. These transcripts are considered as "not expressed" 
#' (Default = 1).
#' @param theta The weight of the prior on the size parameter or inverse
#' dispersion parameter. Theta is adjusted to each transcript by dividing
#' by the mean read count of the transcript. The higher \code{theta}, the lower \code{r} and
#' the higher the overdispersion will be. (Default = 2.5).
#' @param minMu Minimal mean for all negative binomial distributions. (Default = 0.5).
#' @param rmax Maximal value for the size parameter. The inverse of this parameter
#' is the lower bound on the dispersion. In analogy to (Anders and Huber, 2010) we
#' use 13 as default. (Default = 13).
#' @param initialization Method used to find the initial clusters. 
#' Dexus can either use the quantiles of the readcounts of each gene or 
#' run k-means on the counts. (Default = "kmeans").
#' @param multiclassPhiPoolingFunction In "multiClass" mode the dispersion 
#' is either estimated across all classes at once (NULL), or separately for 
#' each condition, i.e., class. The size parameters or dispersion per class are
#' then joined to one estimate by the mean ("mean"), minimum ("min") or maximum ("max").
#' In our investigations estimation across all classes at once performed best.
#' (Default = NULL). 
#' @param quiet Logical that indicates whether dexus should report the steps
#' of the algorithm. Supresses messages from the program if set to TRUE. (Default = FALSE). 
#' @param resultObject Type of the result object; can either be a list ("list")
#' or an instance of "DEXUSResult" ("S4"). (Default="S4").
#' @return "list" or "DEXUSResult".  
#' A list containing the results and the parameters of the algorithm or an
#' instance of "DEXUSResult".
#' @references 
#' 
#' Anders, S. and Huber, W. (2010). \emph{Differential expression analysis
#' for sequence count data.} Genome Biol, 11(10), R106.
#' 
#' Bullard, J. H., Purdom, E., Hansen, K. D., and Dudoit, S. (2010).
#' \emph{Evaluation of statistical methods for normalization and differential
#'  expression in mRNA-seq experiments.} 
#' BMC Bioinformatics, 11, 94.
#' 
#' McCarthy, D. J., Chen, Y., and Smyth, G. K. (2012). 
#' \emph{Differential expression analysis of multifactor RNA-Seq experiments
#' with respect to biological variation.} Nucleic Acids Res, 40(10),
#' 4288-4297.
#' 
#' @examples 
#' data(dexus)
#' labels1 <- substr(colnames(countsBottomly),1,2)
#' labels2 <- c()
#' labels2[which(labels1=="D2")] <- 1
#' labels2[which(labels1=="B6")] <- 2
#' labels2[c(3,7,8,10,12,15)] <- NA
#' res <- dexss(countsBottomly[1:100, ],labels=labels2,nclasses=2,G=0)
#' @aliases DEXSS, dexss
#' @useDynLib dexus
#' @author Guenter Klambauer \email{klambauer@@bioinf.jku.at} and Thomas 
#' Unterthiner \email{unterthiner@@bioinf.jku.at}
#' @export

dexss <- function(X, nclasses=2, G=1, alphaInit,
		cyc=20, labels, normalization = "RLE", 
		kmeansIter=10, ignoreIfAllCountsSmaller=1, theta=2.5, minMu=0.5,rmax=13.0,
		initialization="kmeans", 
		multiclassPhiPoolingFunction=NULL, quiet=FALSE, resultObject="S4"){
	
	### check input ############################################################
	if (is.matrix(X)){
		if (is.null(rownames(X))){
			rownames(X) <- paste("Transcript_",1:nrow(X),sep="")
		} else {
			if (length(unique(rownames(X)))!=length(rownames(X)))
				stop("Transcript names must be unique.")
		}
		if (is.null(colnames(X))){
			colnames(X) <- paste("Sample_",1:ncol(X),sep="")
		} else {
			if (length(unique(colnames(X)))!=length(colnames(X)))
				stop("Sample names must be unique.")
		}
	} else if (is.vector(X)) {
		X <- matrix(X,nrow=1)
		rownames(X) <- "Transcript1"
		colnames(X) <- paste("Sample_",1:ncol(X),sep="")
		normalization <- "none"
	} else if (inherits(X,"CountDataSet")) {
		object <- X
		X <- DESeq::counts(X)
		labels <- DESeq::conditions(object)
		if (is.null(rownames(X))){
			rownames(X) <- paste("Transcript_",1:nrow(X),sep="")
		} else {
			if (length(unique(rownames(X)))!=length(rownames(X)))
				stop("Transcript names must be unique.")
		}
		if (is.null(colnames(X))){
			colnames(X) <- paste("Sample_",1:ncol(X),sep="")
		} else {
			if (length(unique(colnames(X)))!=length(colnames(X)))
				stop("Sample names must be unique.")
		}
	} else {
		stop("Input data types must be a matrix or vector!")
	}
	if (!is.numeric(nclasses) | nclasses < 1 | nclasses!=as.integer(nclasses)){
		stop("\"nclasses\" must be integer greater or equal to 1.")
	}
	
	if (is.null(labels)){
		stop("Labels must be given for DEXSS!")
	}
	if (!(is.integer(labels) | is.numeric(labels))){
		stop("Labels must be integers!")	
	} 
	if (!any(is.na(labels))){
		stop("Some labels must be NA.")
	}
	
	if(length(labels)!=ncol(X)){
		stop("Length of labels must be equal to number of rows in count matrix!")
	}
	
	labels <- labels-1
	labels[which(is.na(labels))] <- (-1)
	labels <- as.integer(labels)
	
	if ((length(unique(labels))-1) > nclasses){
		stop("\"nclasses\" must be greater or equal number of different labels!")
	}
	

	if (nclasses > ncol(X))
		stop("Number of conditions greater than the number of observations.")
	## if (!is.numeric(alphaInit) | any(alphaInit < 0) | length(alphaInit)!=nclasses){
	##     message("\"alphaINIT\" must be numeric greater and all values greater 0.")
	##     stop("Length of this vector must be \"nclasses\"")
	## }
	if (!is.numeric(G) | length(G) !=1 | G < 0){
		stop("\"G\" must be a numeric greater or equal to 0.")
	}
	if (!is.numeric(cyc) | length(cyc) !=1 | cyc < 1){
		stop("\"cyc\" must be an integer greater or equal to 1.")
	}
	
	if (missing(alphaInit))
		alphaInit <- rep(1,nclasses)
	if (!is.numeric(alphaInit) | any(alphaInit < 0) | length(alphaInit)!=nclasses){
		message("\"alphaINIT\" must be numeric greater and all values greater 0.")
		stop("Length of this vector must be \"nclasses\"")
	}
	
	if (!(normalization %in% c("none","RLE","upperquartile"))){
		stop("\"normalization\" must be \"none\",\"RLE\" or \"upperquartile\".")
	}
	if (!is.numeric(kmeansIter) | length(kmeansIter) !=1 | kmeansIter < 1){
		stop("\"kmeansIter\" must be an integer greater or equal to 1.")
	}
	if (!is.numeric(ignoreIfAllCountsSmaller) | length(ignoreIfAllCountsSmaller) !=1){
		stop("\"ignoreIfAllCountsSmaller\" must be a numeric value.")
	}
	if (!is.numeric(theta) | length(theta) !=1 | theta < 0){
		stop("\"theta\" must be a positive numeric value.")
	}
	if (!is.numeric(minMu) | length(minMu) !=1 | minMu < 0){
		stop("\"minMu\" must be a positive numeric value.")
	}
	if (!is.numeric(rmax) | length(rmax) !=1){
		stop("\"rmax\" must be a positive numeric value.")
	}
	if (rmax==Inf) rmax <- -1
	if (!(initialization %in% c("kmeans","quantiles"))){
		stop("\"initialization\" must be \"kmeans\" or \"quantiles\".")
	}
	if (!(is.null(multiclassPhiPoolingFunction))){
		if (!(multiclassPhiPoolingFunction %in% c("min","max","mean"))){
			stop(paste("\"multiclassPhiPoolingFunction\" must be \"min\"",
							"\"max\", or \"mean\" or NULL"))
		}
	}
	if (!is.logical(quiet)){
		stop("\"quiet\" must be a logical.")
	}
	
	
	
	############################################################################
	
	
	# Normalization
	X.raw <- X
	norm <- normalizeData(X, normalization)
	X <- norm$X
	
	gamma <- rep(1,nclasses)
	gamma[1] <- 1+G
	#eps1 <- 10^(-(1:nclasses)-6)
	#gamma <- gamma + eps1
	mainClassIdx <- 1
	trows <- nrow(X)
	exclIdx <- (apply(X,1,max) < ignoreIfAllCountsSmaller)
	
	if (all(exclIdx)){
		message("All transcripts filtered out because of low counts.")
		stop("Check your count matrix or change \"ignoreIfAllCountsSmaller\".")
	}
	
	if (length(exclIdx) & !quiet) 
		message(paste("Filtered out",100*round(mean(exclIdx),4),
						"% of the genes due to low counts"))
	X.all <- X
	X <- pmax(X,minMu)
	X <- X[!exclIdx, ,drop=FALSE]
	N <- ncol(X)
	g <- nrow(X)
	etaPerGene <- theta/((1+rowMeans(X)))
	varToMeanT <- 1
	
	### Semi-supervised############################################################
	mode <- "semi-supervised"
	
	if (!quiet) message("Semi-supervised mode.")
	clusterFunc <- switch(initialization,
			kmeans=initialize.clusters.kmeans,
			quantiles=initialize.clusters.quantiles)
	
	
	clusterResults <- (apply(X,1,clusterFunc, n=nclasses, 
						mainClassIdx=mainClassIdx, rmax=rmax,kmeansIter))
	
	meansINIT <- sapply(clusterResults, function(x) x$means)
	rINIT <- sapply(clusterResults, function(x) x$r)
	alphaInit <- as.numeric(alphaInit/sum(alphaInit))
	
	
	res <- .Call("dexss",as.vector(t(X)), nrow(X), ncol(X),
			alphaInit, as.vector(rINIT), as.vector(meansINIT),
			as.numeric(gamma), as.integer(nclasses), as.integer(cyc),
			as.numeric(varToMeanT), as.integer(labels), 
			as.numeric(etaPerGene), as.numeric(minMu),as.numeric(rmax))	
	
	## rerun if the major class is not mainClassIdx after learning
	alpha <- matrix(res$alpha,nrow=g,byrow=TRUE)
	A <- array(res$A,dim=c(nclasses,N,g))  ##right!
	r <- matrix(res$r,nrow=g,byrow=TRUE)
	means <-  matrix(res$means,nrow=g,byrow=TRUE)
	resp <- t(sapply(1:g,function(i) apply(A[,,i],2,which.max) ))	
	rerunIdx <- which(apply(resp,1,function(x) any(table(x) > table(x)[mainClassIdx])))
	if (length(rerunIdx)>0){
		reorder <- t(apply(resp,1,function(x){
							h <- hist(x,breaks=seq(0.5,nclasses+0.5,1),plot=FALSE)$counts
							order(h,decreasing=TRUE)[order(gamma,decreasing=TRUE)]
						}
				))
		res2 <- .Call("dexss",as.vector(t(X[rerunIdx, ,drop=FALSE])), 
				length(rerunIdx), ncol(X),
				alphaInit, 
				as.vector(t(r[rerunIdx,reorder[rerunIdx, ,drop=FALSE]])), 
				as.vector(t(means[rerunIdx,reorder[rerunIdx, ,drop=FALSE] ])),
				as.numeric(gamma), as.integer(nclasses), as.integer(cyc),
				as.numeric(varToMeanT), as.integer(labels), as.numeric(etaPerGene),
				as.numeric(minMu),as.numeric(rmax))
		
		#implant res2 into res... 
		implantIdx <- as.vector(sapply(rerunIdx*nclasses,
						function(x) (x-nclasses+1):x ))
		res$r[implantIdx] <- res2$r
		res$means[implantIdx] <- res2$means
		res$alpha[implantIdx] <- res2$alpha
		
		implantIdx2 <- as.vector(sapply((rerunIdx-1)*N*nclasses,
						function(x) (x+1):(x+N*(nclasses) )))
		res$A[implantIdx2] <- res2$A
		#check: res$A[implantIdx2]==as.vector(A[,,rerunIdx])
	}		
	pval <- rep(NA,nrow(X))
	
	# extracting from result object
	alpha <- matrix(res$alpha,nrow=g,byrow=TRUE)
#	 if (mode=="semi-supervised" & nclasses >2){
#	     oo <- t(apply(alpha,1,order,decreasing=TRUE))
#	     alpha <- t(sapply(seq(g), function(x) alpha[x, oo[x, ]] ))
#	 }
#	 
	A <- array(res$A,dim=c(nclasses,N,g))  ##right!
#	if (mode=="semi-supervised" & nclasses >2){
#		A <- lapply(seq(g), function(x) A[oo[x, ], ,x] )
#		A <- array(unlist(A),dim=c(nclasses,N,g))
#	}
#	
#	
	poisIdx <- which(res$r < 0) 
	res$r[poisIdx] <- Inf
	r <- matrix(res$r,nrow=g,byrow=TRUE)
#	if (mode=="semi-supervised" & nclasses >2)
#		r <- t(sapply(seq(g), function(x) r[x, oo[x, ]] ))
#	
	means <-  matrix(res$means,nrow=g,byrow=TRUE)
#	if (mode=="semi-supervised" & nclasses >2)
#		means <- t(sapply(seq(g), function(x) means[x, oo[x, ]] ))
#	
	p <- res$means / (res$means + res$r)
	sds <- sqrt(res$means * (res$means + res$r) / res$r)
	sds[which(r==Inf)] <- sqrt(res$means[which(r==Inf)])
	sds <- matrix(sds,nrow=g,byrow=TRUE)
	resp <- t(sapply(1:g,function(i) apply(A[,,i],2,which.max) ))
	fc <- log((means+1e-4)/(means[,mainClassIdx]+1e-4))
	INI <- sapply(1:g, function(i) sum(alpha[i, ] * abs(fc)[i, ]))
	
	### reorder 
	
	
	### re-insert excluded genes
	pFinal <- matrix(NA,nrow=trows,ncol=nclasses)
	pFinal[exclIdx, ] <- rep(NA,nclasses)
	pFinal[!exclIdx, ] <- p
	rownames(pFinal) <- rownames(X.all)
	colnames(pFinal) <- paste("Condition",1:nclasses,sep="_")
	
	
	alphaFinal <- matrix(NA,nrow=trows,ncol=nclasses)
	alphaFinal[exclIdx, ] <- rep(0,nclasses)
	alphaFinal[exclIdx,mainClassIdx] <- 1
	alphaFinal[!exclIdx, ] <- alpha
	rownames(alphaFinal) <- rownames(X.all)
	colnames(alphaFinal) <- paste("Condition",1:nclasses,sep="_")
	
	
	AFinal <- array(NA,dim=c(nclasses,N,trows))
	AFinal[, ,exclIdx] <- matrix(NA,nrow=nclasses,ncol=N)
	AFinal[, ,!exclIdx] <- A
	
	respFinal <- matrix(NA,nrow=trows,ncol=N)
	respFinal[exclIdx, ] <- rep(mainClassIdx,N)
    #browser()
	#respFinal[exclIdx, ] <- matrix(rep(labels,length(which(exclIdx))),nrow=length(which(exclIdx)))
	respFinal[!exclIdx, ] <- resp
	rownames(respFinal) <- rownames(X.all)
	colnames(respFinal) <- colnames(X.all)
	
	
	rFinal <- matrix(NA,nrow=trows,ncol=nclasses)
	rFinal[exclIdx, ] <- rep(NA,nclasses)
	rFinal[!exclIdx, ] <- r
	rownames(rFinal) <- rownames(X.all)
	colnames(rFinal) <- paste("Condition",1:nclasses,sep="_")
	
	meansFinal <- matrix(NA,nrow=trows,ncol=nclasses)
	if (any(exclIdx))
		meansFinal[exclIdx, ] <- 
				t(sapply(rowMeans(X.all[exclIdx, ,drop=FALSE]),rep,nclasses))
	meansFinal[!exclIdx, ] <- means
	rownames(meansFinal) <- rownames(X.all)
	colnames(meansFinal) <- paste("Condition",1:nclasses,sep="_")
	
	sdsFinal <- matrix(NA,nrow=trows,ncol=nclasses)
	sdsFinal[exclIdx, ] <- rep(NA,nclasses)
	sdsFinal[!exclIdx, ] <- sds
	rownames(sdsFinal) <- rownames(X.all)
	colnames(sdsFinal) <- paste("Condition",1:nclasses,sep="_")
	
	fcFinal <- matrix(NA,nrow=trows,ncol=nclasses)
	fcFinal[exclIdx, ] <- rep(NA,nclasses)
	fcFinal[!exclIdx, ] <- fc
	rownames(fcFinal) <- rownames(X.all)
	colnames(fcFinal) <- paste("Condition",1:nclasses,sep="_")
	
	INIFinal <- vector("numeric", length=trows)
	INIFinal[!exclIdx] <- INI
	if (length(which(exclIdx))>0)
		INIFinal[exclIdx] <- runif(n=length(which(exclIdx)),min=0,max=min(INI))
	names(INIFinal) <- rownames(X.all)
	
	pvalFinal <- rep(1, length=trows)
	pvalFinal[!exclIdx] <- pval
	if (all(is.na(pval))){
		pvalFinal[exclIdx] <- NA
	} else{
		pvalFinal[exclIdx] <- runif(n=length(which(exclIdx)),0.01,1.00)
	}
	names(pvalFinal) <- rownames(X.all)
	
	
	params <- list(nclasses, alphaInit, G, cyc, labels, normalization,
			kmeansIter, ignoreIfAllCountsSmaller, theta, minMu, rmax, initialization,
			mode, multiclassPhiPoolingFunction,quiet,resultObject)
	names(params) <- c("nclasses","alphaINIT","G","cyc","labels",
			"normalization","kmeansIter","ignoreIfAllCountsSmaller",
			"theta","minMu","rmax","initialization","mode",
			"multiclassPhiPoolingFunction","quiet","resultObject")
	
	
	if (resultObject!="S4"){
		res <- list(params, X.raw, X.all, norm$sizeFactors,
				AFinal, respFinal, fcFinal, INIFinal, pvalFinal,
				t(rINIT), t(meansINIT), pFinal, alphaFinal, 
				rFinal, meansFinal, sdsFinal)
		
		names(res) <- c("params","X","X.norm", "sizeFactors",
				"A","resp","logfc","INI","pval",
				"rINIT", "meansINIT","p","alpha",
				"r","means","sds")
		return(res)
	} else {
		res <- new("DEXUSResult")
		res@transcriptNames	 <- rownames(X.all)
		res@sampleNames		 <- colnames(X.all)
		res@inputData		 <- X.raw
		res@normalizedData   <- X.all  
		res@sizeFactors		 <- norm$sizeFactors
		res@INIValues        <- INIFinal
		res@INICalls		 <- INIFinal>0.1
		res@INIThreshold 	 <- 0.1
		res@pvals			 <- pvalFinal
		res@responsibilities <- respFinal	 
		res@posteriorProbs	 <- AFinal	
		res@logFC			 <- fcFinal 	
		res@conditionSizes	 <- alphaFinal	
		res@sizeParameters   <- rFinal		
		res@means            <- meansFinal			
		res@dispersions	     <- 1/rFinal		
		res@params           <- params 
		return(res)
	}
}

Try the dexus package in your browser

Any scripts or data that you put into this service are public.

dexus documentation built on Nov. 8, 2020, 11:08 p.m.