# R/ihw_convex.R

#' @export
ihw <- function(...)
{
    UseMethod( "ihw" )
}




#' ihw: Main function for Independent Hypothesis Weighting
#'
#' Given a vector of p-values, a vector of covariates which are independent of the p-values under the null hypothesis and
#' a nominal significance level alpha, IHW learns multiple testing weights and then applies the weighted Benjamini-Hochberg
#' (or Bonferroni) procedure.
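#' Each hypothesis is assigned to a bin based on its covariate; hypotheses within a bin share a
#' weight w_g (the weights average to 1 across all tests), and the adjustment is applied to the
#' weighted p-values p_i / w_g.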
#'
#'
#' @param pvalues  Numeric vector of unadjusted p-values.
#' @param covariates  Vector of one-dimensional covariates, one per test; these must be independent of the
#'                p-values under the null hypothesis. Can be numeric or a factor. (If numeric, it will be converted into a factor by binning.)
#' @param alpha   Numeric, sets the nominal level for FDR control.
#' @param covariate_type  "ordinal" or "nominal" (i.e. whether covariates can be sorted in increasing order or not)
#' @param nbins  Integer, number of groups into which p-values will be split based on covariate. Use "auto" for
#'             automatic selection of the number of bins. Only applicable when covariates is not a factor.
#' @param m_groups Integer vector of length equal to the number of levels of the covariates (only to be specified
#'               when the latter is a factor/categorical). Each entry corresponds to the number of hypotheses to be tested in
#'               each group (stratum). This argument needs to be given when the complete vector of p-values is
#'               not available, but only p-values below a given threshold, for example because of memory reasons.
#'               See the vignette for additional details and an example of how this principle can be applied with
#'               numerical covariates.
#' @param folds  Integer vector or NULL. Pre-specify assignment of hypotheses into folds.
#' @param quiet  Logical; if FALSE, messages are printed during the fitting stages.
#' @param nfolds Number of folds into which the p-values will be split for the pre-validation procedure.
#' @param nfolds_internal  Within each fold, a second (nested) layer of cross-validation can be conducted to choose a good
#'              regularization parameter. This parameter controls the number of nested folds.
#' @param nsplits_internal  Integer, how many times to repeat the nfolds_internal splitting. Can lead to better regularization
#'              parameter selection but makes ihw a lot slower.
#' @param lambdas  Numeric vector which defines the grid of possible regularization parameters.
#'              Use "auto" for automatic selection.
#' @param seed Integer or NULL. Split of hypotheses into folds is done randomly. To have the output of the function be reproducible, 
#'	the seed of the random number generator is set to this value at the start of the function. Use NULL if you don't want to set the seed.
#' @param distrib_estimator  Character ("grenander" or "ECDF"). Only use this if you know what you are doing. ECDF with nfolds > 1
#'              or lp_solver == "lpsymphony" will in general be excessively slow, except for very small problems.
#' @param lp_solver  Character ("lpsymphony" or "gurobi"). Internally, IHW solves a sequence of linear programs, which
#'        can be solved with either of these solvers.
#' @param adjustment_type Character ("BH" or "bonferroni") depending on whether you want to control FDR or FWER.
#' @param null_proportion Logical; if TRUE (default is FALSE), a modified version of Storey's estimator
#'        is used within each bin to estimate the proportion of null hypotheses.
#' @param null_proportion_level Numeric, threshold for Storey's pi0 estimation procedure; defaults to 0.5.
#' @param return_internal Logical; if TRUE, returns a lower-level representation of the output (only useful for debugging).
#' @param ... Arguments passed to internal functions.
#'
#' @return An ihwResult object.
#' @seealso ihwResult, plot,ihwResult-method, ihw.DESeqResults
#'
#' @examples
#'
#' save.seed <- .Random.seed; set.seed(1)
#' X   <- runif(20000, min=0, max=2.5)   # covariate
#' H   <- rbinom(20000,1,0.1)            # hypothesis true or false
#' Z   <- rnorm(20000, H*X)              # Z-score
#' .Random.seed <- save.seed
#' pvalue <- 1-pnorm(Z)                  # pvalue
#'
#' ihw_fdr <- ihw(pvalue, X, .1)        # Standard IHW for FDR control
#' ihw_fwer <- ihw(pvalue, X, .1, adjustment_type = "bonferroni")    # FWER control
#'
#' table(H[adj_pvalues(ihw_fdr) <= 0.1] == 0)  # how many false rejections?
#' table(H[adj_pvalues(ihw_fwer) <= 0.1] == 0)
#'
#'
#' @export
#' @aliases ihw
#'

ihw.default <- function(pvalues, covariates, alpha,
						covariate_type = "ordinal",
						nbins = "auto",
						m_groups = NULL,
						folds = NULL,
						quiet = TRUE,
						nfolds = 5L,
						nfolds_internal = 5L,
						nsplits_internal = 1L,
						lambdas = "auto",
						seed = 1L,
						distrib_estimator = "grenander",
						lp_solver = "lpsymphony",
						adjustment_type = "BH",
						null_proportion = FALSE,
						null_proportion_level = 0.5,
						return_internal = FALSE,
						...){

	# This function essentially wraps the lower-level function ihw_internal:
	# it takes care of NAs, sorts the p-values and then
	# returns a nice ihwResult object.



	if (!adjustment_type %in% c("BH","bonferroni")){
		stop("IHW currently only works with BH or bonferroni types of multiple testing corrections")
	}

	if (adjustment_type == "bonferroni" & distrib_estimator != "grenander"){
		stop("For Bonferroni-like FWER adjustment currently only the grenander estimator is supported.")
	}

	if (is.null(folds)){
		nfolds <- as.integer(nfolds)
	} else {
		nfolds <- length(unique(folds))
	}

	nfolds_internal <- as.integer(nfolds_internal)


	if (nfolds==1){
		message("Using only 1 fold! Only use this if you want to learn the weights, but NEVER for testing!")
	}


	# gracefully handle NAs
   	nna <- !is.na(pvalues)

   	if (all(nna)){
        nna <- TRUE
   	}

   	# filter out NA p-values
    pvalues <- pvalues[nna]
    covariates <- covariates[nna]
    weights <- rep(NA, length(pvalues))

    if (any(is.na(covariates))){
    	stop("Covariates corresponding to non-NA p-values should never be NA. Aborting.")
    }


	if (covariate_type =="ordinal" & is.numeric(covariates)){

		if (!is.null(m_groups)){
			stop("m_groups should only be specified when the covariates are a factor")
		}

		if (nbins == "auto"){
			nbins <- max(1,min(40, floor(length(pvalues)/1500))) # rule of thumb..
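			# e.g., with 20000 p-values this gives floor(20000/1500) = 13 bins (capped at 40, floored at 1)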
		}
		groups <- as.factor(groups_by_filter(covariates, nbins, seed=seed))
		penalty <- "total variation"

	} else if (is.factor(covariates)){
		groups <- covariates
		if (nbins != "auto" & nbins != nlevels(groups)){
			warning("Overwriting manually specified nbins, since it has to equal the number
				of levels for categorical covariates")
		}

		nbins <- nlevels(groups)

		if (covariate_type == "nominal"){
			penalty <- "uniform deviation"
		} else if (covariate_type == "ordinal") {
			penalty <- "total variation"
		}
	} else {
		stop("Covariates are not of the appropriate type")
	}

	if ((length(lambdas) == 1) & (lambdas[1] == "auto")){
		# just a few for now until I get warm starts of the LP solvers to work
		lambdas <- c(0, 1, nbins/8, nbins/4, nbins/2, nbins, Inf)
	}

	if (nbins < 1) {
		stop("Cannot have less than one bin.")
	}


	group_levels <- levels(droplevels(groups))
	# sort pvalues globally
	order_pvalues <- order(pvalues) #TODO BREAK ties by filter statistic rank
	reorder_pvalues <- order(order_pvalues)

	sorted_groups <- groups[order_pvalues]
	sorted_pvalues <- pvalues[order_pvalues]

	if (!is.null(folds)){
		sorted_folds <- folds[order_pvalues]
	} else {
		sorted_folds <- NULL
	}

	if (is.null(m_groups)) {
		if (is.null(folds)){
			m_groups <- table(sorted_groups)
		} else {
			m_groups <- table(sorted_groups, sorted_folds)
		}
	} else {
		# TODO: also check if m_groups entered by user is plausible based on observed m_groups and max p_value
	}

    #--- check if m_groups is correctly specified----------------------
    if (is.null(folds)){
		if (length(group_levels) != length(m_groups)){
			stop("Number of unique levels should be equal to length of m_groups.")
		}
	} else {
		if (any(dim(m_groups) != c(length(group_levels), nfolds))){
			stop("m_groups should be of dimension nbins*nfolds")
		}
	}
	# once we have groups, check whether they include enough p-values
	if (nbins > 1 & any(m_groups < 1000)){
		message("We recommend that you supply (many) more than 1000 p-values for meaningful data-driven hypothesis weighting results.")
	}

	# start by making sure seed is correctly specified
	if (!is.null(seed)){
		#http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
		tmp <- runif(1)
		old <- .Random.seed
  		on.exit( { .Random.seed <<- old } )
  		set.seed(as.integer(seed)) #seed
	}

	if (nbins == 1){
		nfolds <- 1L
		nfolds_internal <- 1L
		nsplits_internal <- 1L
		message("Only 1 bin; IHW reduces to Benjamini Hochberg (uniform weights)")
		sorted_adj_p <- p.adjust(sorted_pvalues, method=adjustment_type, n=sum(m_groups))
		rjs <- sum(sorted_adj_p <= alpha)
		res <- list(lambda=0, fold_lambdas=0, rjs=rjs, sorted_pvalues=sorted_pvalues,
					sorted_weighted_pvalues = sorted_pvalues,
					sorted_adj_p=sorted_adj_p, sorted_weights=rep(1, length(sorted_pvalues)),
					sorted_groups=as.factor(rep(1, length(sorted_pvalues))),
					sorted_folds=as.factor(rep(1, length(sorted_pvalues))),
					weight_matrix = matrix(1))
	} else if (nbins > 1) {
		res <- ihw_internal(sorted_groups, sorted_pvalues, alpha, lambdas,
						m_groups,
						penalty = penalty,
						quiet = quiet,
						sorted_folds = sorted_folds,
						nfolds = nfolds,
						nfolds_internal = nfolds_internal,
						nsplits_internal = nsplits_internal,
						seed = NULL,
						distrib_estimator = distrib_estimator,
						lp_solver = lp_solver,
						adjustment_type = adjustment_type,
						null_proportion = null_proportion,
						null_proportion_level = null_proportion_level,
						...)
	}

	if (return_internal){
		return(res)
	}

	# resort back to original form
	weights <- fill_nas_reorder(res$sorted_weights, nna, reorder_pvalues)
	pvalues <- fill_nas_reorder(res$sorted_pvalues, nna, reorder_pvalues)
	weighted_pvalues <- fill_nas_reorder(res$sorted_weighted_pvalues, nna, reorder_pvalues)
	adj_pvalues <- fill_nas_reorder(res$sorted_adj_p, nna, reorder_pvalues)
	groups     <- factor(fill_nas_reorder(res$sorted_groups, nna, reorder_pvalues),levels=group_levels)
	folds      <- factor(fill_nas_reorder(res$sorted_folds, nna, reorder_pvalues),levels=1:nfolds)
	covariates <- fill_nas_reorder(covariates, nna, 1:length(covariates))


	df <- data.frame(pvalue = pvalues,
					 adj_pvalue = adj_pvalues,
				     weight = weights,
				     weighted_pvalue = weighted_pvalues,
					 group= groups,
					 covariate = covariates,
					 fold=as.factor(folds))

	ihw_obj <- new("ihwResult",
		 			df = df,
		 			weights = res$weight_matrix,
		 			alpha = alpha,
					nbins = as.integer(nbins),
					nfolds = nfolds,
					regularization_term = res$fold_lambdas,
					m_groups = as.integer(m_groups),
					penalty = penalty,
					covariate_type = covariate_type,
					adjustment_type = adjustment_type,
					reg_path_information = data.frame(),
		 			solver_information = list())

	ihw_obj

}

# operates on p-values which have already been sorted and contain no NAs
ihw_internal <- function(sorted_groups, sorted_pvalues, alpha, lambdas,
								m_groups,
							    penalty="total variation",
							    seed=NULL,
								quiet=TRUE,
								sorted_folds=NULL,
								nfolds = 10L,
								nfolds_internal = nfolds,
								nsplits_internal = 1L,
								distrib_estimator = "distrib_estimator",
								lp_solver="lpsymphony",
								adjustment_type="BH",
								null_proportion = FALSE,
								null_proportion_level = 0.5,
								debug_flag=FALSE,
								...){

	folds_prespecified <- !is.null(sorted_folds)

	m <- length(sorted_pvalues)
	split_sorted_pvalues <- split(sorted_pvalues, sorted_groups)
	m_groups_available <- sapply(split_sorted_pvalues, length)


	#  do the k-fold strategy
	if (!is.null(seed)){
		#http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
		old <- .Random.seed
  		on.exit( { .Random.seed <<- old } )
  		set.seed(as.integer(seed)) #seed
	}

	if (!folds_prespecified){
		sorted_folds <- sample(1:nfolds, m, replace = TRUE)
	}

	sorted_weights <- rep(NA, m)
	fold_lambdas <- rep(NA, nfolds)
	weight_matrix <- matrix(NA, nlevels(sorted_groups), nfolds)

	if (debug_flag==TRUE){
		rjs_path_mat <- matrix(NA, nrow=nfolds, ncol=length(lambdas))
	}
	for (i in 1:nfolds){

		if (!quiet) message(paste("Estimating weights for fold", i))
		# don't worry about inefficiencies right now, they are only O(n) and sorting dominates this
		if (nfolds == 1){
			filtered_sorted_groups <- sorted_groups
			filtered_sorted_pvalues <- sorted_pvalues
			filtered_split_sorted_pvalues <- split(filtered_sorted_pvalues, filtered_sorted_groups)

			m_groups_holdout_fold <- m_groups
			m_groups_other_folds <- m_groups
		} else {
			filtered_sorted_groups <- sorted_groups[sorted_folds!=i]
			filtered_sorted_pvalues <- sorted_pvalues[sorted_folds!=i]
			filtered_split_sorted_pvalues <- split(filtered_sorted_pvalues, filtered_sorted_groups)

			if (!folds_prespecified){
				# TODO: add check to see if for common use case the first term is 0
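				# p-values that were censored before being passed in (m_groups - m_groups_available)
				# are assumed to spread evenly across folds, so the held-out fold gets a 1/nfolds
				# share of them plus its own observed p-values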
				m_groups_holdout_fold <- (m_groups-m_groups_available)/nfolds +
						 m_groups_available - sapply(filtered_split_sorted_pvalues, length)
				m_groups_other_folds <- m_groups - m_groups_holdout_fold
			} else {
				m_groups_holdout_fold <- m_groups[,i]
				m_groups_other_folds <- rowSums(m_groups[,-i,drop=FALSE])
			}
		}
		# within each fold do iterations to also find lambda

		# TODO replace for loop by single call to ihw_convex with warm starts
		if (length(lambdas) > 1){
			rjs  <- matrix(NA, nrow=length(lambdas), ncol=nsplits_internal)
			for (k in 1:length(lambdas)){
				lambda <- lambdas[k]
				for (l in 1:nsplits_internal){

					rjs[k,l] <- ihw_internal(filtered_sorted_groups, filtered_sorted_pvalues, alpha,
											lambda, m_groups_other_folds,
									    	seed=NULL, quiet=quiet,
									    	nfolds=nfolds_internal,
									    	distrib_estimator = distrib_estimator,
									    	lp_solver=lp_solver,
									    	adjustment_type=adjustment_type,...)$rjs
				}
				if (debug_flag==TRUE){
					rjs_path_mat[i,k] <- rjs[k] # only valid for nsplits_internal == 1 currently
				}
			}
			lambda <- lambdas[which.max(apply(rjs,1, mean))]
		} else if (length(lambdas) == 1){
			lambda <- lambdas
		} else {
			stop("something went wrong, need at least 1 value for regularization parameter!")
		}
		fold_lambdas[i] <- lambda

		# ok we have finally picked lambda and can proceed

		if (distrib_estimator=="grenander"){
			res <- ihw_convex(filtered_split_sorted_pvalues, alpha,
						   m_groups_holdout_fold, m_groups_other_folds,
						   penalty=penalty, lambda=lambda, lp_solver=lp_solver, 
						   adjustment_type=adjustment_type, quiet=quiet,...)
		} else if (distrib_estimator == "ECDF"){
			res <- ihw_milp(filtered_split_sorted_pvalues, alpha, m_groups_holdout_fold,
				           penalty=penalty, lambda=lambda, lp_solver=lp_solver, ...)
		} else {
			stop("This type of distribution estimator is not available.")
		}

		sorted_weights[sorted_folds == i] <- res$ws[sorted_groups[sorted_folds==i]]
		weight_matrix[,i] <- res$ws
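		# Optional pi0-adaptivity (the if-block below): dividing the held-out fold's weights by a
		# Storey-type null-proportion estimate makes the weighted procedure less conservative
		# when many hypotheses are non-null.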

		if (null_proportion){
			pi0_est <- weighted_storey_pi0(sorted_pvalues[sorted_folds==i],
				                  sorted_weights[sorted_folds == i],
				                  tau= null_proportion_level,
				                  m=sum(m_groups_holdout_fold))

			sorted_weights[sorted_folds == i] <- sorted_weights[sorted_folds == i]/pi0_est
			weight_matrix[,i] <- weight_matrix[,i]/pi0_est

		}

	}

	sorted_weighted_pvalues <- mydiv(sorted_pvalues, sorted_weights)


	sorted_adj_p <- p.adjust(sorted_weighted_pvalues, method = adjustment_type, n = sum(m_groups))
	rjs   <- sum(sorted_adj_p <= alpha)
	lst <- list(lambda=lambda, fold_lambdas=fold_lambdas, rjs=rjs, sorted_pvalues=sorted_pvalues,
					sorted_weighted_pvalues = sorted_weighted_pvalues,
					sorted_adj_p=sorted_adj_p, sorted_weights=sorted_weights,
					sorted_groups=sorted_groups, sorted_folds=sorted_folds,
					weight_matrix = weight_matrix)

	if (debug_flag == TRUE) {
		lst$rjs_path_mat <- rjs_path_mat
	}
	lst
}


#' @rdname ihw.default
#' @param formula \code{\link{formula}}, specified in the form pvalue~covariate (only 1D covariate supported)
#' @param data data.frame from which the variables in formula should be taken
#' @export
ihw.formula <- function(formula, data=parent.frame(), ...){
	if (length(formula) != 3){
		stop("expecting formula of the form pvalue~covariate")
	}
	pv_name <- formula[[2]]
	cov_name <- formula[[3]]
	pvalues <- eval(pv_name, data, parent.frame())
	covariates <- eval(cov_name, data, parent.frame())
	ihw(pvalues, covariates, ...)
}
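
# A minimal usage sketch of the formula interface (with a hypothetical data frame):
#   dat <- data.frame(pvalue = runif(1000), cov = rnorm(1000))
#   ihw_res <- ihw(pvalue ~ cov, data = dat, alpha = 0.1)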


#' ihw.DESeqResults: IHW method dispatching on DESeqResults objects
#'
#' @param deseq_res "DESeqResults" object
#' @param filter Vector of length equal to number of rows of deseq_res object. This is used
#'         for the covariates in the call to ihw. Can also be a character,
#'         in which case deseq_res[[filter]] is used as the covariate
#' @param alpha   Numeric, sets the nominal level for FDR control.
#' @param adjustment_type Character ("BH" or "bonferroni") depending on whether you want to control FDR or FWER.
#' @param ... Other optional keyword arguments passed to ihw.
#'
#' @return A "DESeqResults" object, which includes weights and adjusted p-values returned
#'         	by IHW. In addition, includes a metadata slot with an "ihwResult" object.
#' @seealso ihw, ihwResult
#'
#' @examples \dontrun{
#'    library("DESeq2")
#'    library("airway")
#'    data("airway")
#'    dds <- DESeqDataSet(se = airway, design = ~ cell + dex)
#'    dds <- DESeq(dds)
#'    deseq_res <- results(dds)
#'    deseq_res <- ihw(deseq_res, alpha=0.1)
#'    #equivalent: deseq_res2 <- results(dds, filterFun = ihw)
#' }
#'
#' @export
#'
ihw.DESeqResults <- function(deseq_res, filter="baseMean",
								alpha=0.1, adjustment_type="BH",...){

    if (missing(filter)) {
    	filter <- deseq_res$baseMean
  	} else if (is.character(filter)){
  		stopifnot(filter %in% colnames(deseq_res))
  		filter <- deseq_res[[filter]]
  	}

  	stopifnot(length(filter) == nrow(deseq_res))

	ihw_res <- ihw(deseq_res$pvalue, filter, alpha=alpha,
							adjustment_type=adjustment_type,...)

	deseq_res$padj <- adj_pvalues(ihw_res)
 	deseq_res$weight <- weights(ihw_res)

	mcols(deseq_res)$type[names(deseq_res)=="padj"] <- "results"
	mcols(deseq_res)$description[names(deseq_res)=="padj"] <- paste("Weighted", adjustment_type,"adjusted p-values")

	mcols(deseq_res)$type[names(deseq_res)=="weight"] <- "results"
	mcols(deseq_res)$description[names(deseq_res)=="weight"] <- "IHW weights"

	metadata(deseq_res)[["alpha"]] <- alpha
	metadata(deseq_res)[["ihwResult"]] <- ihw_res
 	deseq_res
}

#' @importFrom slam simple_triplet_zero_matrix simple_triplet_matrix
#' @importFrom lpsymphony lpsymphony_solve_LP
ihw_convex <- function(split_sorted_pvalues, alpha, m_groups, m_groups_grenander,
		penalty="total variation", lambda=Inf, lp_solver="gurobi",
		adjustment_type= "BH", grenander_binsize = 1, quiet=quiet){

	# m_groups used for balancing (weight budget)
	# m_groups_grenander (used for grenander estimator)
	# Practically always it will hold: m_groups \approx m_groups_grenander

	nbins <- length(split_sorted_pvalues)

	if (lambda==0){
		ws <- rep(1, nbins)
		return(list(ws=ws))
	}


	# preprocessing:  Set very low p-values to 0, otherwise LP solvers have problems due to numerical instability
	# Note: This only affects the internal optimization, the higher level functions will
	# still return adjusted pvalues based on the original p-values

	split_sorted_pvalues <- lapply(split_sorted_pvalues, function(x) ifelse(x > 10^(-20), x, 0))
	m <- sum(m_groups)

	if (nbins != length(m_groups)){
		stop("length of m_groups should be equal to number of bins")
	}

	#lapply grenander...
	if (!quiet) message("Applying Grenander estimator within each bin.")
	grenander_list <- mapply(presorted_grenander,
		                    split_sorted_pvalues,
		                    m_groups_grenander,
		                    grenander_binsize = grenander_binsize,
		                    quiet=quiet,
		                    SIMPLIFY=FALSE)


	#set up LP
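	# Each Grenander fit is concave and piecewise linear, so y_g <= Fhat_g(t_g) is
	# equivalent to one linear inequality per knot k of bin g:
	#     y_g - slope_k * t_g <= y_k - slope_k * x_k.
	# Variables are ordered as (y_1, ..., y_G, t_1, ..., t_G, <penalty auxiliaries>).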
	nconstraints_per_bin <- sapply(grenander_list, function(x) x$length)
	nconstraints <- sum(nconstraints_per_bin)
	i_yi <- 1:nconstraints
	j_yi <- rep(1:nbins,  times=nconstraints_per_bin)
	v_yi <- rep(1,  nconstraints)

	i_ti <- 1:nconstraints
	j_ti <- nbins + rep(1:nbins, times=nconstraints_per_bin)
	v_ti <- unlist(lapply(grenander_list, function(x) -x$slope.knots))

	constr_matrix <- slam::simple_triplet_matrix(c(i_yi, i_ti), c(j_yi,j_ti), c(v_yi, v_ti))
	rhs <- unlist(lapply(grenander_list, function(x) x$y.knots-x$slope.knots*x$x.knots))

	obj <- c(m_groups/m*nbins*rep(1,nbins), rep(0,nbins))

	if (lambda < Inf){
		if (penalty == "total variation"){
			# -f + t_g - t_{g-1} <= 0
			i_fi <- rep(1:(nbins-1),3)
			j_fi <-	c((nbins+2):(2*nbins), (nbins+1):(2*nbins-1), (2*nbins+1):(3*nbins-1))
			v_fi <- c(rep(1,nbins-1), rep(-1,nbins-1), rep(-1, nbins-1))
			absolute_val_constr_matrix_1 <- slam::simple_triplet_matrix(i_fi,j_fi,v_fi)

			# -f - t_g + t_{g-1} <= 0
			i_fi <- rep(1:(nbins-1),3)
			j_fi <-	c((nbins+2):(2*nbins), (nbins+1):(2*nbins-1), (2*nbins+1):(3*nbins-1))
			v_fi <- c(rep(-1,nbins-1), rep(1,nbins-1), rep(-1, nbins-1))
			absolute_val_constr_matrix_2 <- slam::simple_triplet_matrix(i_fi,j_fi,v_fi)

			constr_matrix <- rbind(cbind(constr_matrix, slam::simple_triplet_zero_matrix(nconstraints, nbins-1,mode="double")),
							    absolute_val_constr_matrix_1,
								absolute_val_constr_matrix_2)

			obj <- c(obj, rep(0, nbins-1))

			total_variation_constr <- matrix(c(rep(0,nbins), -lambda*m_groups/m, rep(1,nbins-1)),nrow=1)
			constr_matrix <- rbind(constr_matrix, total_variation_constr)

			rhs <- c(rhs,rep(0, 2*(nbins-1)),0) #add RHS for absolute differences and for total variation penalty

		} else if (penalty == "uniform deviation"){
			# -f + m t_g - sum_g m_i t_i <= 0

			absolute_val_constr_matrix_1 <- matrix(rep(-m_groups,nbins), nbins,nbins, byrow=TRUE)
			diag(absolute_val_constr_matrix_1) <- -m_groups + m
			absolute_val_constr_matrix_1 <- cbind( slam::simple_triplet_zero_matrix(nbins, nbins,mode="double"),
												   absolute_val_constr_matrix_1,
										           -diag(nbins))

			# -f - m t_g +  sum_g m_i t_i  <= 0

			absolute_val_constr_matrix_2 <- matrix(rep(+m_groups,nbins), nbins,nbins, byrow=TRUE)
			diag(absolute_val_constr_matrix_2) <- m_groups - m
			absolute_val_constr_matrix_2 <- cbind( slam::simple_triplet_zero_matrix(nbins, nbins,mode="double"),
												   absolute_val_constr_matrix_2,
										           -diag(nbins))

			constr_matrix <- rbind(cbind(constr_matrix, slam::simple_triplet_zero_matrix(nconstraints, nbins,mode="double")),
							    absolute_val_constr_matrix_1,
								absolute_val_constr_matrix_2)

			obj <- c(obj, rep(0, nbins))

			total_variation_constr <- matrix(c(rep(0,nbins), -lambda*m_groups, rep(1,nbins)),nrow=1)
			constr_matrix <- rbind(constr_matrix, total_variation_constr)

			rhs <- c(rhs,rep(0, 2*nbins),0) #add RHS for absolute differences and for total variation penalty

		}
	}

	if (adjustment_type == "BH"){
		# incorporate the FDR constraint
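		# Written out:  sum_g m_g * t_g <= alpha * sum_g m_g * y_g,
		# i.e. the plug-in estimate of false discoveries (m_g * t_g under uniform nulls)
		# may not exceed alpha times the estimated number of rejections.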
		fdr_constr<- matrix(c(rep(-alpha,nbins)*m_groups,
						      rep(1,nbins)*m_groups,
						      rep(0,ncol(constr_matrix)-2*nbins)), nrow=1)
		constr_matrix <- rbind(constr_matrix, fdr_constr)
		rhs <- c(rhs,0)
	} else if (adjustment_type == "bonferroni"){
		# incorporate the FWER constraint
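		# Written out:  sum_g m_g * t_g <= alpha,
		# a union bound on the probability of making any false rejection.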
		fwer_constr<- matrix(c(rep(0,nbins),
						      rep(1,nbins)*m_groups,
						      rep(0,ncol(constr_matrix)-2*nbins)), nrow=1)
		constr_matrix <- rbind(constr_matrix, fwer_constr)
		rhs <- c(rhs,alpha)
	} else {
		stop("Only BH-like and bonferroni-like adjustments available.")
	}
	nvars <- ncol(constr_matrix)

	if (!quiet) message("Starting to solve LP.")

	if (lp_solver == "gurobi"){

		#if (!require("gurobi", quietly=TRUE)){
		#	stop("Gurobi solver appears not be installed. Please use lpsymphony or install gurobi.")
		#}

		if (!requireNamespace("Matrix", quietly=TRUE)){
			stop("Matrix package required to use gurobi in IHW.")
		}

		model <- list()
		model$A <- Matrix::sparseMatrix(i=constr_matrix$i, j=constr_matrix$j, x=constr_matrix$v,
           		dims=c(constr_matrix$nrow, constr_matrix$ncol))
		model$obj <- obj
		model$modelsense <- "max"
		model$rhs        <- rhs
		model$lb         <- 0
		model$ub         <- 2
		model$sense      <- '<'

		params <- list(OutputFlag = if (quiet) 0 else 1)
		res <- gurobi(model, params)
		sol <- res$x
		solver_status <- res$status

	} else if (lp_solver=="lpsymphony") {

		#rsymphony_bounds <- list(lower=list(ind=1:nvars, val=rep(0,nvars)),
		#					 upper=list(ind=1:nvars, val=rep(1,nvars)))

		#if (is.infinite(time_limit)) time_limit <- -1
		#if (is.infinite(node_limit)) node_limit <- -1
		res <- lpsymphony::lpsymphony_solve_LP(obj, constr_matrix, rep("<=", nrow(constr_matrix)),
			rhs, #bounds= rsymphony_bounds,
			max = TRUE, verbosity = -2, first_feasible = FALSE)
		sol <- res$solution
		solver_status <- res$status
	} else {
		stop("Only gurobi and lpsymphony solvers currently supported.")
	}

	# catch negative thresholds due to numerical rounding
	ts <- pmax(sol[(1:nbins)+nbins],0)
	ws <- thresholds_to_weights(ts, m_groups)	#\approx ts/sum(ts)*nbins

	# return the weight vector ws (length nbins)
	return(list(ws=ws))
}


#' @importFrom fdrtool gcmlcm
presorted_grenander <- function(sorted_pvalues, m_total=length(sorted_pvalues),
							grenander_binsize = 1,
							quiet=TRUE){

  	unique_pvalues <- unique(sorted_pvalues)
  	ecdf_values <- cumsum(tabulate(match(sorted_pvalues, unique_pvalues)))/m_total

  	if (min(unique_pvalues) > 0){
  		# I think fdrtool neglects this borderline case and this causes returned object
  		# to be discontinuous hence also not concave
  		unique_pvalues <- c(0,unique_pvalues)
  		ecdf_values   <- c(0, ecdf_values)
  	}

  	if (max(unique_pvalues) < 1){
  		unique_pvalues <- c(unique_pvalues,1)
  		ecdf_values    <- c(ecdf_values,1)
  	}

  	if (grenander_binsize != 1){
  		nmax <- length(unique_pvalues)
  		idx_thin <- c(seq(1, nmax-1, by=grenander_binsize), nmax)
  		unique_pvalues <- unique_pvalues[idx_thin]
  		ecdf_values <- ecdf_values[idx_thin]
  	}

  	ll <- fdrtool::gcmlcm(unique_pvalues, ecdf_values, type="lcm")
	ll$length <- length(ll$slope.knots)
	ll$x.knots <- ll$x.knots[-ll$length]
  	ll$y.knots <- ll$y.knots[-ll$length]
	if (!quiet) message(paste("Grenander fit with", ll$length, "knots."))
	ll
}
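
# A minimal sketch of what presorted_grenander returns (kept as a comment so that
# sourcing this file stays side-effect free); assumes sorted input p-values:
#   p <- sort(runif(1000)^2)                  # p-values enriched near 0
#   g <- presorted_grenander(p, quiet = FALSE)
#   # g$x.knots, g$y.knots: knots of the least concave majorant of the ECDF
#   # g$slope.knots: the piecewise-constant (decreasing) Grenander density estimate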


# Mostly deprecated function (users should not be using it);
# needed for reproducibility of the manuscript,
# to demonstrate that the "naive IHW" algorithm does not control type-I error.

ihw_milp <- function(split_sorted_pvalues, alpha, m_groups, lambda=Inf, lp_solver="gurobi",
		quiet = TRUE,
		optim_pval_threshold=1,
		penalty="total variation",
		lp_relaxation=FALSE,
		t_bh=0,
		bh_rejections =0,
		rejections_bound=NULL,
		time_limit=Inf,
		node_limit=Inf,
		threads=0,
		solution_limit=Inf,
		mip_gap_abs=Inf,
		mip_gap = 10^(-4),
		MIPFocus=0){



	nbins <- length(split_sorted_pvalues)

	if (lambda==0){
		ws <- rep(1, nbins)
		return(list(ws=ws))
	}

	m <- sum(m_groups)
	mtests <- m

	if (nbins != length(m_groups)){
		stop("length of m_groups should be equal to number of bins")
	}


	if (optim_pval_threshold < 1){
		# keep at least 2 p-values in each group, so we do not have to add special cases for n=0 or n=1
		split_sorted_pvalues <- lapply(split_sorted_pvalues,
			function(p) p[p <= max(p[2],optim_pval_threshold)])
	}

	pmax <- max(unlist(split_sorted_pvalues))
	split_sorted_pvalues <- lapply(split_sorted_pvalues, function(p) p/pmax)
	#ts_bh <- sapply(split_sorted_pvalues, function(ps) max(0, ps[ps <= t_bh]))


	# create a simple_triplet_matrix (sparse matrix) representation,

	# function to create sparse-representation of group-wise blocks enforcing the constraints
	# z_1 >= z_2 >= .. in each group  i.e. [(m_group -1) X m_group] matrix of the form:
	#
	# 1 -1  0  ......
	# 0  1 -1  ......
	# 0  0  1  ......
 	# ...............
 	#
 	# Also parameters to make it easy to diagonally combine these blocks afterwards

	single_block_triple_representation <- function(start, p_length,prev_block){
		one_block <- cbind(start:(start+p_length-2), (prev_block+start):(prev_block+start+p_length-2), rep(1,p_length-1))
		minus_one_block <- cbind(start:(start+p_length-2), (prev_block+1+start):(prev_block+start+p_length-1), rep(-1, p_length-1))
	    rbind(one_block, minus_one_block)
	}



	# create, combine above blocks and store them as simple triplet matrix
	p_lengths <- sapply(split_sorted_pvalues,length)
	prev_blocks <- 0:(nbins-1)
	starts <- cumsum(c(1,p_lengths[-nbins]-1))

	full_triplet <- do.call(rbind, mapply(single_block_triple_representation, starts,p_lengths, prev_blocks, SIMPLIFY=FALSE))
	full_triplet <- slam::simple_triplet_matrix(full_triplet[,1],full_triplet[,2], full_triplet[,3])


	# add final constraint to matrix which corresponds to control of plug-in FDR estimate
	diff_coefs <- unlist(mapply(function(p, m_group) diff(c(0,p))* m_group , split_sorted_pvalues, m_groups))
	#plugin_fdr_constraint <- (alpha - diff_coefs * mtests/m)/m #alpha - diff_coefs * mtests/m
	plugin_fdr_constraint <- (alpha/pmax - diff_coefs * mtests/m)/m #alpha - diff_coefs * mtests/m

	full_triplet <- rbind(full_triplet, matrix(plugin_fdr_constraint, nrow=1))

	z_vars <- ncol(full_triplet) #number of binary variables

	obj        <- rep(1, z_vars) # maximize number of rejections

	if (is.finite(lambda)){
		# start with new code which introduces regularization

		# first we introduce #nbins new random variables T_g, such that
		# t_{g} <= T_g < t'_{g} [in practice the second < has to be turned into a <=]
		# this reflects the fact that for all these values of T_g, still the same hypotheses will be rejected
		# in stratum g.

		full_triplet <- cbind(full_triplet, matrix(0, nrow(full_triplet), nbins))

		# Below we modify objective a tiny bit by enforcing some minimization on \sum m_g T_g
		# because 1/(4m)* \sum m_g T_g << 1 we ensure that our objective of interest (number of rejections) does not change
		# Motivation for this soft constraint:
		# 1) Make final solution (in terms of threshold T_g) "somewhat" unique
    	# 2) (more importantly): assist solver in preventing possible violation of plugin FDR control due to numerical issues
    	# due to small p-values.

		obj <- c(obj, -.25*m_groups/m)


		# first g constraints representing t_{g} <= T_g can be represented by [-A I_g] with A:
		#
		#   1 1 |          |          |                  # t_1
		#       |  1  1  1 |          |					 # t_2
		#       |          |  1  1  1 |                  # t_3
		#
		#	 (Replace 1s by diff(c(0,p)) values)

		pdiff_list <- lapply(split_sorted_pvalues, function(p) diff(c(0,p)))

		left_ineq_matrix_block <- function(g, p_length, pdiff){
			mat <- 	matrix(0, nbins, p_length)
			mat[g,] <- - pdiff
			mat
 		}


 		# now put above blocks together
 		ineq_matrix <-  rbind( cbind(do.call(cbind, mapply(left_ineq_matrix_block, 1:nbins, p_lengths, pdiff_list, SIMPLIFY=FALSE)),
 							  	diag(1, nbins)))

 		# for these T_g's we also require the plug-in FDR estimate to be controlled
 		# (in fact we could drop the old constraint on the t_g's, but the extra constraint could help with the relaxations)

 		# we also divide the constraint by m (arbitrary) so that the coefficient range of the matrix is not too large
 		# which could cause numerical problems
		#plugin_fdr_constraint2 <- matrix(c(rep(alpha/m,z_vars), -m_groups/m), nrow=1)
		plugin_fdr_constraint2 <- matrix(c(rep(alpha/m/pmax,z_vars), -m_groups/m), nrow=1)

		# put these constraints together..
		full_triplet <- rbind(full_triplet, ineq_matrix, plugin_fdr_constraint2)


		if (penalty=="total variation"){
			# this implements a penalty of the form: \sum_{g=2}^G |w_g - w_{g-1}| <= reg_par

			# |T_g - T_{g-1}| = f_g  introduce absolute value constants, i.e. introduce nbins-1 new continuous variables

			full_triplet <- cbind(full_triplet, matrix(0,nrow(full_triplet),nbins-1))
			obj <- c(obj, rep(0, nbins-1))

			# we now need inequalities: T_g - T_{g-1} + f_g >= 0    and -T_g + T_{g-1} + f_g >= 0
			# start with matrix diff_matrix (nbins-1) X nbins as a scaffold i.e.
			#
			#  -1  1
			#     -1  1
			#        -1  1


			diff_matrix <- diag(-1,nbins-1,nbins) + cbind(rep(0,nbins-1), diag(1,nbins-1))
			abs_constraint_matrix <- rbind(cbind(slam::simple_triplet_zero_matrix(nbins-1, z_vars,mode="double"),
											diff_matrix, slam::simple_triplet_diag_matrix(1, nrow=nbins-1)),
									   cbind(slam::simple_triplet_zero_matrix(nbins-1, z_vars,mode="double"),
											-diff_matrix, slam::simple_triplet_diag_matrix(1, nrow=nbins-1)))


 			# add final regularization inequality:
 			# 			\sum |w_g - w_{g-1}| <= reg_par
 			# <=>       \sum |T_g - T_{g-1}|*m <= reg_par * \sum m_g T_g
 			regularization_constraint <- matrix(c(rep(0,z_vars), lambda/m*m_groups, rep(-1, nbins-1)), nrow=1)

 			# to be used for ub afterwards
 			regularization_ub <- c(rep(1, nbins), rep(2, nbins-1))

 		} else if (penalty=="uniform deviation"){
 			# this implements a penalty of the form: \sum_{g=1}^G |w_g -1| <= reg_par

 			# |m*T_g - \sum_{i=1}^G m_i T_i| = f_g  introduce absolute value constants, i.e. introduce nbins new continuous variables

			full_triplet <- cbind(full_triplet, matrix(0,nrow(full_triplet),nbins))
			obj <- c(obj, rep(0, nbins))

			# we now need inequalities: m*T_g - \sum_{i=1}^G m_i T_i + f_g >= 0   and -m*T_g  +  \sum_{i=1}^G m_iT_i + f_g >= 0
			# start with matrix diff_matrix (nbins X nbins) as a scaffold i.e.
			#
			#  (m-m_1)    -m_2      -m_3   ......
			#    -m_1  (m - m_2)    -m_3   ......
			#    -m_1     -m_2    (m-m_3)  ......
			#     .         .         .    .
			#     .         .         .      .
			#     .         .         .        .



			diff_matrix <- diag(m, nbins, nbins) - matrix(rep(m_groups, each=nbins), nrow=nbins) 

			abs_constraint_matrix <- rbind(cbind(slam::simple_triplet_zero_matrix(nbins, z_vars,mode="double"),
											diff_matrix, slam::simple_triplet_diag_matrix(1, nrow=nbins)),
									   cbind(slam::simple_triplet_zero_matrix(nbins, z_vars,mode="double"),
											-diff_matrix, slam::simple_triplet_diag_matrix(1, nrow=nbins)))


 			# add final regularization inequality:
 			# 			\sum |w_g - 1| <= reg_par
 			# <=>       \sum |m*T_g - \sum m_i T_i| <= reg_par * \sum m_i T_i
 			regularization_constraint <- matrix(c(rep(0,z_vars), lambda*m_groups, rep(-1, nbins)), nrow=1)

 			# to be used for ub afterwards
 			regularization_ub <- c(rep(1, nbins), rep(2*m, nbins))

 		} else {
 			stop("No such regularization penalty currently supported.")
 		}
 		# put all constraints together
 		full_triplet <- rbind(full_triplet, abs_constraint_matrix, regularization_constraint)

 	}

    # finally add an inequality ensuring that we get at least as many rejections as BH
    # added this feature as an ad-hoc solution to a previous problematic regularization implementation
    # but actually seems to also speed up the optimization problem!

    bh_rj_inequality <- matrix(c(rep(1, z_vars), rep(0, ncol(full_triplet)-z_vars)), nrow=1)
    full_triplet <- rbind(full_triplet, bh_rj_inequality)

    if (!is.null(rejections_bound)){
    	full_triplet <- rbind(full_triplet, -bh_rj_inequality)
    }

    nrows <- nrow(full_triplet)
    model_rhs <- c(rep(0,nrows-1), bh_rejections)

    if (!is.null(rejections_bound)){
    	model_rhs <- c(rep(0,nrows-2), bh_rejections, -rejections_bound)
    }

    if (lp_relaxation){
    	model_vtype <- rep('C', ncol(full_triplet))
    } else {
    	model_vtype <- c(rep('B', z_vars), rep('C', ncol(full_triplet)-z_vars))
    }
    # model ub will actually depend on which penalty we choose
    # model ub = 1 for binary variables, 1 for thresholds, x for f_g, where
    # x=2 for total variation, x=2*m for uniform deviation
    model_ub <- c(rep(1, z_vars))
    if (is.finite(lambda)){
    	model_ub <- c(model_ub, regularization_ub)
    }

	if (lp_solver=="lpsymphony") {

		rsymphony_ub <- list(upper = list(ind = 1:ncol(full_triplet), val = model_ub))

		if (is.infinite(time_limit)) time_limit <- -1
		if (is.infinite(node_limit)) node_limit <- -1
		res<- lpsymphony::lpsymphony_solve_LP(obj, full_triplet, rep(">=", nrows), 
			model_rhs,
    		types = model_vtype, bounds= rsymphony_ub,
			max = TRUE, verbosity = -2, time_limit = time_limit,
			node_limit = node_limit, gap_limit = ifelse(mip_gap <= 10^(-4), -1, mip_gap*100), #now default mip_gap=10^(-4) as in Gurobi while -1 default for Rsymphony
			first_feasible = FALSE)
		sol <- res$solution
		if (!quiet) str(res)
		solver_status <- res$status

	} else if (lp_solver=="gurobi"){


		#if (!require("gurobi", quietly=TRUE)){
		#	stop("Gurobi solver appears not be installed. Please use lpsymphony or install gurobi.")
		#}

		if (!requireNamespace("Matrix", quietly=TRUE)){
			stop("Matrix package required to use gurobi in IHW.")
		}
		# keep code for commercial solver for now
		#speed up compared to Symphony appears to be at least ~2-10, depending on problem

		# convert simple_triplet_matrix of pkg Slam to sparse matrix of pkg Matrix
		# gurobi has problems with simple_triplet_matrix for an (undocumented) reason (?bug)
		full_triplet <-  Matrix::sparseMatrix(i=full_triplet$i, j=full_triplet$j, x=full_triplet$v,
           dims=c(full_triplet$nrow, full_triplet$ncol))

		model <- list()
		model$A <- full_triplet
		model$obj <- obj
		model$modelsense <- "max"
		model$rhs        <- model_rhs
		model$lb         <- 0
		model$ub         <- model_ub
		model$sense      <- '>'
		model$vtype      <- model_vtype

		# the parameters below make the optimization procedure MUCH slower
		# but they are actually necessary... (need to check what happens with open source solvers)
		# turns out that enforcing T_g >= t_g is numerically very hard in the context of this problem

		params <- list(NumericFocus=3, FeasibilityTol=10^(-9), ScaleFlag=0 , Quad=1, IntFeasTol=10^(-9),
							OutputFlag = 0, #Presolve=0,#10,
							Threads=threads,MIPFocus=MIPFocus,
							MIPGap = mip_gap)

		if (is.finite(time_limit)) params$TimeLimit <- time_limit
		if (is.finite(node_limit)) params$NodeLimit <- node_limit
		if (is.finite(solution_limit)) params$SolutionLimit <- solution_limit
		if (is.finite(mip_gap_abs)) params$MIPGapAbs <- mip_gap_abs

		res <- gurobi(model, params)
		sol <- res$x
		solver_status <- res$status

	} else {
		stop("Only gurobi and lpsymphony solvers are currently supported.")
	}

	# next try to extract the thresholds/weights from solver output
	lengths <- cumsum(sapply(split_sorted_pvalues, length))
	start_lengths <- c(1, lengths[-nbins]+1)

	pidx_list <- mapply(function(start,end) sol[start:end], start_lengths,lengths, SIMPLIFY=FALSE)

	get_threshold <- function(plist, pidx){
		max_idx <- which.max(which(pidx==1))
		if (length(max_idx) == 0) 0 else plist[max_idx]
	}

	#get_threshold2 <- function(pdiff, pidx){
	#	sum(pdiff*pidx)
	#}

	t_thresholds1 <- mapply(get_threshold, split_sorted_pvalues, pidx_list)     # to be stored in ddhw object
	#t_thresholds2 <- mapply(get_threshold2, pdiff_list, pidx_list)
	t_thresholds <- if (is.infinite(lambda)){
					 	t_thresholds1
					} else {
						sol[(z_vars+1):(z_vars+nbins)]  # to be used for calculating weights
					}

    ####################################################################################################
    #	Possible checks of whether solver did not do (important) numerical errors: #TODO
    #
    # 1) t_thresholds >= t_thresholds1
    # 2) Plugin FDR has to be controlled (with respect to both t_thresholds, t_thresholds2)
    # 3) total variation of weights has to be less than lambda
    #
    #####################################################################################################

	ws <-  if (all(t_thresholds1 == .0) || all(t_thresholds == .0)){
				rep(1,nbins)
			} else {
				t_thresholds*m/sum(m_groups*t_thresholds) #might need adjustment if mtests > m
			}

	return(list(ws=ws))
}