#' Main function for happi, p=q=1; this script contains the modularized version of happi with correct implementation of log likelihood
#'
#' @param outcome length-n vector; this is the vector of a target gene's presence/absence; should be coded as 0 or 1
#' @param covariate n x p matrix; this is the matrix for the primary predictor/covariate of interest
#' @param h0_param the column index in covariate that has beta=zero under the null
#' @param quality_var length-n vector; this is the quality variable vector, currently p = 1 TODO(turn into n x q matrix)
#' @param covariate_formula alternative to \code{covariate} argument, a formula for covariates of the form \eqn{~ covariate1 + covariate2 + ...}, requires \code{data} argument
#' @param covariate_formula_h0 alternative to \code{h0_param} argument, a formula for covariates in the null model, takes the form \eqn{ ~ 1} for an intercept-only model, requires \code{data} argument
#' @param quality_var_formula alternative to \code{quality_var} argument, a formula for quality variable of the form \eqn{~ quality_var}, requires \code{data} argument
#' @param data required with \code{formula} arguments, a data frame including covariates and the quality variable
#' @param max_iterations the maximum number of EM steps that the algorithm will run for
#' @param min_iterations the minimum number of EM steps that the algorithm will run for
#' @param nstarts number of starts; Integer. Defaults to \code{1}. Number of starts for optimization.
#' @param change_threshold algorithm will terminate early if the likelihood changes by this percentage or less for 5 iterations in a row for both the alternative and the null
#' @param epsilon probability of observing a gene when it should be absent; probability between 0 and 1; default is 0. Either a single value or a vector of length n.
#' @param method method for estimating f. Defaults to "splines" which fits a monotone spline with df determined by
#' argument spline_df; "isotone" for isotonic regression fit
#' @param random_starts whether to pick the starting values of beta's randomly. Defaults to FALSE.
#' @param firth use firth penalty? Default is TRUE.
#' @param spline_df degrees of freedom (in addition to intercept) to use in
#' monotone spline fit; default 3
#' @param seed numeric number to set seed for random multiple starts
#' @param norm_sd positive number to set as the standard deviation for the Normal distribution used to draw initial parameter values from.
#' @param run_npLRT logical, if TRUE, non-parametric permutation LRT test will also be run.
#' @param P if \code{run_npLRT} is TRUE, number of permutations to run
#' @param verbose TRUE to return all information generated by happi, FALSE to only return effect size and p-value
#'
#' @importFrom msos logdet
#' @importFrom utils tail
#' @import tibble
#' @import stats
#' @import splines2
#' @importFrom isotone activeSet fSolver
#' @importFrom logistf logistf
#'
#' @return An object of class \code{happi}.
#'
#' @examples
#' data(TM7_data)
#' x_matrix <- model.matrix(~tongue, data = TM7_data)
#' happi_results <- happi (outcome = TM7_data$`Cellulase/cellobiase CelA1`,
#' covariate=x_matrix,
#' quality_var=TM7_data$mean_coverage,
#' max_iterations=1000,
#' change_threshold=0.1,
#' epsilon=0,
#' nstarts = 1,
#' spline_df = 3)
#' @export
happi <- function(outcome,
covariate = NULL,
h0_param = 2,
quality_var = NULL,
covariate_formula = NULL,
covariate_formula_h0 = NULL,
quality_var_formula = NULL,
data = NULL,
max_iterations = 1000,
min_iterations = 15,
change_threshold = 0.05,
epsilon = 0,
method = "splines",
random_starts = FALSE,
firth = TRUE,
spline_df = 3,
nstarts = 1,
seed = 13,
norm_sd = 1,
run_npLRT = FALSE,
P = NULL,
verbose = TRUE
) {
# number of observations
nn <- length(outcome)
# check that all inputs are provided in either form
if (is.null(covariate)) {
if (is.null(covariate_formula) | is.null(covariate_formula_h0) |
is.null(quality_var_formula) | is.null(data)) {
stop("If using formulas, please include `covariate_formula`, `covariate_formula_h0`, `quality_val_formula`, and `data`.")
}
} else {
if (is.null(h0_param) | is.null(quality_var)) {
stop("If using covariate matrix, please also include `h0_param` and `quality_var`.")
}
}
# make covariate matrices from formulas
if (!(is.null(covariate_formula))) {
if (nrow(data) != nn) {
stop("Make sure that `outcome` and `data` have the same number of rows.")
}
covariate <- stats::model.matrix(covariate_formula, data)
pp <- ncol(covariate)
covariate_null <- stats::model.matrix(covariate_formula_h0, data)
if ((pp - ncol(covariate_null)) > 1) {
stop("Currently happi cannot test multiple parameters at once.")
}
quality_var_mat <- stats::model.matrix(quality_var_formula, data)
# if quality variable matrix has intercept, just take column with quality variable
if (ncol(quality_var_mat) == 1) {
quality_var <- quality_var_mat
} else {
quality_var <- quality_var_mat[, -1]
}
# otherwise use covariate matrices provided
} else {
if (nrow(covariate) != nn | length(quality_var) != nn) {
stop("Make sure that `outcome`, `covariate_data`, and `quality_variable` have the same number of observations.")
}
pp <- ncol(covariate)
if (pp == 1) {
covariate <- matrix(covariate, ncol = 1, nrow = nn)
covariate_null <- matrix(1, ncol = 1, nrow = nn)
} else {
covariate_null <- covariate[, -h0_param]
}
if (!is.matrix(covariate_null)) covariate_null <- matrix(covariate_null, nrow=nn)
}
# stop if there is any missing data
stopifnot(all(!is.na(c(outcome, covariate, quality_var))))
# check that epsilon is either a single value or a vector of length n
if (length(epsilon) == 1) {
epsilon_vec <- rep(epsilon, nn)
} else if (length(epsilon == nn)) {
epsilon_vec <- epsilon
} else {
error("epsilon should be a single number or a length n vector.")
}
# if(h0_param != 2) warning("Amy hasn't properly checked that testing a different parameter results in sensible output")
## reorder all elements of all data by ordering in quality_var
## TODO(change back at end)
my_order <- order(quality_var)
quality_var <- quality_var[my_order]
outcome <- outcome[my_order]
covariate <- covariate[my_order, ]
epsilon_vec <- epsilon_vec[my_order]
if (pp == 1) {
covariate <- matrix(covariate, ncol = 1, nrow = nn)
covariate_null <- matrix(1, ncol = 1, nrow = nn)
} else {
covariate_null <- covariate[, -h0_param]
} #TODO(PT) take in formula
if (!is.matrix(covariate_null)) covariate_null <- matrix(covariate_null, nrow=nn)
# generate beta initial starts for alternative model
inits <- happi::genInits(num_covariate = pp, nstarts = nstarts, seed = seed, norm_sd = norm_sd)
# generate beta initial starts for null model
inits_null <- happi::genInits(num_covariate = pp - 1, nstarts = nstarts, seed = seed, norm_sd = norm_sd)
# generate f initial starts for both alternative and null models
inits_f <- happi::genInits_f(nn = nn, nstarts = nstarts, seed = seed, outcome = outcome)
bestOut <- NULL
bestOut_null <- NULL
### Start for loop through multiple starts
# save information from each start
starts_df <- data.frame(starts = 1:nstarts,
alt_ll = NA,
null_ll = NA)
for (i in 1:nstarts) {
####################################
## Create matrices to hold results #
####################################
my_estimates <- tibble::tibble("iteration" = 0:max_iterations,
"epsilon" = ifelse(length(epsilon) == 1, epsilon, "multiple"),
"loglik" = NA,
"loglik_nopenalty" = NA)
my_estimates_null <- tibble::tibble("iteration" = 0:max_iterations,
"epsilon" = ifelse(length(epsilon) == 1, epsilon, "multiple"),
"loglik" = NA,
"loglik_nopenalty" = NA)
my_estimated_beta <- matrix(NA, nrow = max_iterations + 1, ncol = pp)
my_estimated_beta_null <- matrix(NA, nrow = max_iterations + 1, ncol = max(1, pp - 1))
my_fitted_xbeta <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_fitted_xbeta_null <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_f <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_f_null <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_ftilde <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_ftilde_null <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_p <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_p_null <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_basis_weights <- matrix(NA, nrow = max_iterations + 1, ncol = spline_df + 1)
my_estimated_basis_weights_null <- matrix(NA, nrow = max_iterations + 1, ncol = spline_df + 1)
##################
## Input starts ##
##################
my_estimated_beta[1, ] <- inits[i,]
my_estimated_beta_null[1, ] <- inits_null[i,]
my_fitted_xbeta[1, ] <- c(covariate %*% my_estimated_beta[1, ])
my_fitted_xbeta_null[1, ] <- c(covariate_null %*% my_estimated_beta_null[1, ])
my_estimated_f[1, ] <- inits_f[i,]
my_estimated_f_null[1, ] <- inits_f[i,]
# f-tilde = logit(f)
my_estimated_ftilde[1, ] <- happi::logit(my_estimated_f[1, ])
my_estimated_ftilde_null[1, ] <- happi::logit(my_estimated_f_null[1, ])
my_estimated_p[1, ] <- happi::calculate_p(xbeta = my_fitted_xbeta[1, ],
ff = my_estimated_f[1, ],
epsilon = epsilon_vec,
outcome = outcome)
my_estimated_p_null[1, ] <- happi::calculate_p(xbeta = my_fitted_xbeta_null[1, ],
ff = my_estimated_f_null[1, ],
epsilon = epsilon_vec,
outcome = outcome)
my_estimates[1, "loglik"] <- happi::incomplete_loglik(xbeta = my_fitted_xbeta[1, ],
ff = my_estimated_f[1, ],
firth = firth,
outcome = outcome,
epsilon = epsilon_vec,
covariate = covariate)
my_estimates[1, "loglik_nopenalty"] <- happi::incomplete_loglik(xbeta = my_fitted_xbeta[1, ],
ff = my_estimated_f[1, ],
firth = F,
outcome = outcome,
epsilon = epsilon_vec,
covariate = covariate)
my_estimates_null[1, "loglik"] <- happi::incomplete_loglik(xbeta = my_fitted_xbeta_null[1, ],
ff = my_estimated_f_null[1, ],
firth = firth,
outcome = outcome,
epsilon = epsilon_vec,
covariate = covariate_null)
my_estimates_null[1, "loglik_nopenalty"] <- happi::incomplete_loglik(xbeta = my_fitted_xbeta_null[1, ],
ff = my_estimated_f_null[1, ],
firth = F,
outcome = outcome,
epsilon = epsilon_vec,
covariate = covariate_null)
###############################################################################################################
## E-M algorithm for penalized maximum likelihood estimation of our parameter estimates for alternative model #
###############################################################################################################
mlout <- tryCatch(happi::run_em(outcome = outcome,
quality_var = quality_var,
max_iterations = max_iterations,
min_iterations = min_iterations,
change_threshold = change_threshold,
epsilon = epsilon_vec,
method = method,
firth = firth,
spline_df = spline_df,
nn = nn,
em_covariate = covariate,
em_estimated_p = my_estimated_p,
em_fitted_xbeta = my_fitted_xbeta,
em_estimated_f = my_estimated_f,
em_estimates = my_estimates,
em_estimated_beta = my_estimated_beta,
em_estimated_basis_weights = my_estimated_basis_weights,
em_estimated_ftilde = my_estimated_ftilde),
error = function(e) {cat("WARNING alternative model E-M at initial start row index", paste(i),":", conditionMessage(e),"\n")})
########################################################################################################
## E-M algorithm for penalized maximum likelihood estimation of our parameter estimates for null model #
########################################################################################################
mlout_null <- tryCatch(happi::run_em(outcome = outcome,
quality_var = quality_var,
max_iterations = max_iterations,
min_iterations = min_iterations,
change_threshold = change_threshold,
epsilon = epsilon_vec,
method = method,
firth = firth,
spline_df = spline_df,
nn = nn,
em_covariate = covariate_null,
em_estimated_p = my_estimated_p_null,
em_fitted_xbeta = my_fitted_xbeta_null,
em_estimated_f = my_estimated_f_null,
em_estimates = my_estimates_null,
em_estimated_beta = my_estimated_beta_null,
em_estimated_basis_weights = my_estimated_basis_weights_null,
em_estimated_ftilde = my_estimated_ftilde_null),
error = function(e) {cat("WARNING null model E-M at initial start row index", paste(i),":", conditionMessage(e),"\n")}) # END WHILE loop that stops when convergence is met
############################################################
# Evaluate for best set of results for alternative model ###
############################################################
## if we get a converged result then...
## check if bestOut is NULL.
## If NULL then replace with the converged result
## If NOT NUlL then replace only if LL is better than what is currently in bestOut
tryCatch(if(!is.na(tail(mlout$loglik$loglik[!is.na(mlout$loglik$loglik)],1))){ # check that not NA for alternative model
starts_df[i, "alt_ll"] <- tail(mlout$loglik$loglik[!is.na(mlout$loglik$loglik)],1)
if (is.null(bestOut)) {
bestOut <- mlout
} else if (!is.null(bestOut)){
if(tail(mlout$loglik$loglik[!is.na(mlout$loglik$loglik)],1) > tail(bestOut$loglik$loglik[!is.na(bestOut$loglik$loglik)],1)){
bestOut <- mlout
}
}
}, error = function(e) {cat("WARNING alternative model did not converge at initial start row index", paste(i),":", conditionMessage(e),"\n")})
#####################################################
# Evaluate for best set of results for null model ###
#####################################################
## if we get a converged result then...
## check if bestOut_null model is NULL.
## If bestOut_null is NULL then replace with the converged result
## If bestOut_null is NOT NUlL then replace only if LL is better than what is currently in bestOut_null
tryCatch(if(!is.na(utils::tail(mlout_null$loglik$loglik[!is.na(mlout_null$loglik$loglik)],1))){ # check that not NA for null model
starts_df[i, "null_ll"] <- tail(mlout_null$loglik$loglik[!is.na(mlout_null$loglik$loglik)],1)
if (is.null(bestOut_null)) {
bestOut_null <- mlout_null
} else if (!is.null(bestOut_null)){
if(utils::tail(mlout_null$loglik$loglik[!is.na(mlout_null$loglik$loglik)],1) > tail(bestOut_null$loglik$loglik[!is.na(bestOut_null$loglik$loglik)],1)){
bestOut_null <- mlout_null
}
}
}, error = function(e) {cat("WARNING null model did not converge at initial start row index", paste(i),":", conditionMessage(e),"\n")})
} # END -- nstarts
if (is.null(bestOut)) stop("Full model could not be optimized! Try increasing the number of nstarts or simplifying your model.")
if (is.null(bestOut_null)) stop("Null model could not be optimized! Try increasing the number of nstarts or simplifying your model.")
#############################################
### If alternative LL is smaller than null:
## restart estimation of alternative model using the null model
#############################################
if (utils::tail(bestOut$loglik$loglik_nopenalty[!is.na(bestOut$loglik$loglik_nopenalty)],1) <
tail(bestOut_null$loglik$loglik_nopenalty[!is.na(bestOut_null$loglik$loglik_nopenalty)],1)) {
message("Likelihood greater under null; restarting...")
my_estimates <- tibble::tibble("iteration" = 0:max_iterations,
"epsilon" = ifelse(length(epsilon) == 1, epsilon, "multiple"),
"loglik" = NA,
"loglik_nopenalty" = NA)
my_estimated_beta <- matrix(NA, nrow = max_iterations + 1, ncol = pp)
my_fitted_xbeta <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_f <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_ftilde <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
my_estimated_p <- matrix(NA, nrow = max_iterations + 1, ncol = nn)
if (pp == 1){
my_estimated_beta[1, 1] <- 0
} else if (pp == 2) {
my_estimated_beta[1, 1] <- utils::tail(bestOut_null$beta[!is.na(bestOut_null$beta[,1]),],1)[1] ## start at converged null
my_estimated_beta[1, h0_param] <- 0
} else if (pp == 3){
my_estimated_beta[1, 1] <- utils::tail(bestOut_null$beta[!is.na(bestOut_null$beta[,1]),],1)[1] ## start at converged null
my_estimated_beta[1, h0_param] <- 0
my_estimated_beta[1, 3] <- tail(bestOut_null$beta[!is.na(bestOut_null$beta[,1]),],1)[2] ## start at converged null
} ## TO DO (PT): need to take in pp > 3 if needed
stopifnot(h0_param == 2)
my_fitted_xbeta[1, ] <- c(covariate %*% my_estimated_beta[1, ])
my_estimated_f[1, ] <- utils::tail(bestOut_null$f[!is.na(bestOut_null$f[,1]),],1)
my_estimated_ftilde[1, ] <- happi::logit(my_estimated_f[1, ])
my_estimated_p[1, ] <- happi::calculate_p(xbeta = my_fitted_xbeta[1, ],
ff = my_estimated_f[1, ],
epsilon = epsilon_vec,
outcome = outcome)
my_estimates[1, "loglik"] <- happi::incomplete_loglik(xbeta = my_fitted_xbeta[1, ],
ff = my_estimated_f[1, ],
outcome = outcome,
epsilon = epsilon_vec,
covariate = covariate)
my_estimates[1, "loglik_nopenalty"] <- happi::incomplete_loglik(xbeta = my_fitted_xbeta[1, ],
ff = my_estimated_f[1, ],
outcome = outcome,
epsilon = epsilon_vec,
covariate = covariate,
firth = F)
## restart at null model if likelihood greater under null than alternative
tryOut <- tryCatch(happi::run_em(outcome = outcome,
quality_var = quality_var,
max_iterations = max_iterations,
min_iterations = min_iterations,
change_threshold = change_threshold,
epsilon = epsilon_vec,
method = method,
firth = firth,
nn = nn,
spline_df = spline_df,
em_covariate = covariate,
em_estimated_p = my_estimated_p,
em_fitted_xbeta = my_fitted_xbeta,
em_estimated_f = my_estimated_f,
em_estimates = my_estimates,
em_estimated_beta = my_estimated_beta,
em_estimated_basis_weights = my_estimated_basis_weights,
em_estimated_ftilde = my_estimated_ftilde),
error = function(e) {cat("WARNING alternative model E-M at null model estimates:", conditionMessage(e),"\n")})
if (!(is.null(tryOut))) {
# use tryOut as new bestOut if it has a higher likelihood value
if (utils::tail(tryOut$loglik$loglik[!is.na(tryOut$loglik$loglik)],1) > utils::tail(bestOut_null$loglik$loglik[!is.na(bestOut_null$loglik$loglik)],1)) {
bestOut <- tryOut
}
}
if (utils::tail(bestOut$loglik$loglik[!is.na(bestOut$loglik$loglik)],1) < utils::tail(bestOut_null$loglik$loglik[!is.na(bestOut_null$loglik$loglik)],1)) {
message("Restarting to estimate beta_alt didn't work. Penalized likelihood is still greater under the null than alt. pvalue = 1")
# message(paste("Had not converged after", tt_restart - 1, "iterations; LL % change:", round(pct_change_llks, 3)))
}
# if (tail(bestOut$loglik$loglik_nopenalty[!is.na(bestOut$loglik$loglik_nopenalty)],1) < tail(bestOut_null$loglik$loglik_nopenalty[!is.na(bestOut_null$loglik$loglik_nopenalty)],1)) {
# message("Weird! Nonpenalized Likelihood is also greater under the null than alt. pvalue = 1")
# } else {
# message("Phew! Nonpenalized Likelihood is not greater under the null than alt.")
# }
} ## End restart if likelihood is greater under the null
###############################
### Output the best estimates##
###############################
my_estimates$loglik <- bestOut$loglik$loglik
my_estimates$loglik_nopenalty <- bestOut$loglik$loglik_nopenalty
my_estimates$iteration <- bestOut$loglik$iteration
my_estimates$loglik_null <- bestOut_null$loglik$loglik
my_estimates$loglik_null_nopenalty <- bestOut_null$loglik$loglik_nopenalty
my_estimates$iteration_null <- bestOut_null$loglik$iteration
my_estimated_beta <- bestOut$beta
my_estimated_beta_null <- bestOut_null$beta
my_estimated_f <- bestOut$f
my_estimated_f_null <- bestOut_null$f
my_estimated_basis_weights <- bestOut$basis_weights
my_estimated_basis_weights_null <- bestOut_null$basis_weights
my_estimated_p <- bestOut$p
my_estimated_p_null <- bestOut_null$p
########################
# LRT based on best LLs
########################
my_estimates$LRT <- 2*(utils::tail(my_estimates$loglik[!is.na(my_estimates$loglik)],1) - utils::tail(my_estimates$loglik_null[!is.na(my_estimates$loglik_null)],1))
my_estimates$pvalue <- 1 - pchisq(my_estimates$LRT, df=1)
my_estimates$LRT_nopenalty <- 2*(utils::tail(my_estimates$loglik_nopenalty[!is.na(my_estimates$loglik_nopenalty)],1) - utils::tail(my_estimates$loglik_null_nopenalty[!is.na(my_estimates$loglik_null_nopenalty)],1))
my_estimates$pvalue_nopenalty <- 1 - pchisq(my_estimates$LRT_nopenalty, df=1)
full_output <- list("loglik" = my_estimates,
"beta" = my_estimated_beta,
"beta_null" = my_estimated_beta_null,
"f" = my_estimated_f,
"f_null" = my_estimated_f_null,
"basis_weights" = my_estimated_basis_weights,
"basis_weights_null" = my_estimated_basis_weights_null,
"p" = my_estimated_p,
"p_null" = my_estimated_p_null,
"quality_var" = quality_var,
"outcome" = outcome,
"covariate" = covariate,
"starts_df" = starts_df)
if (run_npLRT) {
npLRT_p <- npLRT(happi_out = full_output,
max_iterations = max_iterations,
min_iterations = min_iterations,
h0_param = h0_param,
change_threshold = change_threshold,
epsilon = epsilon,
method = method,
firth = firth,
spline_df = spline_df,
nstarts = nstarts,
P = P)
full_output$npLRT_p <- npLRT_p
}
pvalue <- tail(my_estimates$pvalue_nopenalty
[!is.na(my_estimates$pvalue_nopenalty)], 1)
beta <- tail(my_estimated_beta[!is.na(my_estimated_beta[, 1]), ], 1)
LRT <- tail(my_estimates$LRT_nopenalty[
!is.na(my_estimates$LRT_nopenalty)], 1)
small_results <- list(pvalue_LRT = pvalue, beta = beta, LRT = LRT)
if (run_npLRT) {
small_results$pvalue_npLRT <- npLRT_p
}
full_output$summary <- small_results
if (verbose) {
return(full_output)
} else {
return(full_output$summary)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.