Nothing
#' Semiparametric score function for distance-based kernel
#'
#' Description of the semiparametric score function for distance-based kernel function and binary outcome.
#'
#' This is the main function that calculates the p-value associated with a semiparametric kernel test
#' of association between the kernel and binary outcome variable. A null model (where the kernel is not
#' associated with the outcome) is initially fit. Then, the variance of
#' \eqn{Y_{i}|X_{i}}{Yi | Xi}
#' is used as the basis for the score test,
#' \deqn{S\left(\rho\right) = \frac{Q_{\tau}\left(\hat{\beta_0},\rho\right)-\mu_Q}{\sigma_Q}}{S (rho) = ( Q_tau (beta0, rho) ) / sigma_Q}.
#' However, because \eqn{\rho}{rho} disappears under the null hypothesis, we
#' run a grid search over a range of values of \eqn{\rho}{rho} (the bounds
#' of which were derived by Liu et al. in 2008). This grid search gets the upper bound for the score test's p-value.
#' This function is tailored for the underlying model
#' \deqn{y_{i} = x_{i}^{T}\beta + h\left(z_{i}\right) + e_{i},}{y_i = x_i ^ T beta + h(z_i)}
#' where \eqn{h\left(\cdot\right)}{h(.)} is
#' the kernel function, \eqn{z_{i}}{z_i} is a multidimensional array of variables,
#' \eqn{x_{i}}{x_i} is a vector or matrix of covariates, \eqn{\beta}{beta} is a vector
#' of regression coefficients, and \eqn{y_{i}}{y_i} is a binary outcome taking
#' values in {0, 1}.
#'
#' The function returns an numeric p-value for the kernel score test of association.
#'
#' @param outcome a numeric vector containing the binary outcome variable, 0/1 (in the same ID order as dist_mat)
#' @param covars a dataframe containing the covariates to be modeled parametrically (should NOT include an ID variable)
#' @param dist_mat a square distance matrix
#' @param grid_gran a numeric value specifying the grid search length, preset to 5000
#'
#' @seealso \code{\link{hms}}, \code{\link{ext_distance}}, \code{\link{ham_distance}}
#' \code{\link{score_log_nonparam}} for nonparametric score function of distance-based kernel functions and binary outcome.
#' \code{\link{score_cont_nonparam}} for nonparametric score function of distance-based kernel function and continuous outcome.
#' \code{\link{score_cont_semiparam}} for semiparametric score function of distance-based kernel function and continuous outcome.
#'
#' @return the score function p-value
#'
#' @references Liu D, Ghosh D, and Lin X (2008) "Estimation and testing for the effect of a
#' genetic pathway on a disease outcome using logistic kernel machine regression via
#' logistic mixed models." BMC Bioinformatics, 9(1), 292. ISSN 1471-2105.
#' \doi{10.1186/1471-2105-9-292}.
#'
#' @examples
#'
#' data(simasd_hamil_df)
#' data(simasd_covars)
#'
#' hamil_matrix <- ham_distance(simasd_hamil_df)
#' covars_df <- simasd_covars[,3:4]
#'
#'\donttest{
#' score_log_semiparam(
#' outcome = simasd_covars$dx_group,
#' covars = covars_df,
#' dist_mat = hamil_matrix,
#' grid_gran = 5000
#' )
#' }
#'
#' @export
score_log_semiparam <- function(outcome, covars, dist_mat, grid_gran = 5000) {
if (grid_gran <= 1) {
stop("Need to specify a grid search length of at least 2")
}
if (is.vector(outcome) == FALSE) {
stop("Outcome must be specified as a vector")
}
if (is.numeric(outcome) == FALSE) {
stop("Outcome vector must be numeric")
}
if (nrow(dist_mat) != ncol(dist_mat)) {
stop("The distance matrix must be a square matrix")
}
if (nrow(dist_mat) != length(outcome)) {
stop("The number of rows in the distance matrix must be equal to the length of the outcome vector")
}
if (nrow(dist_mat) != nrow(covars)) {
stop("The number of rows in the distance matrix must be equal to the number of rows of the covariate dataframe")
}
n <- ncol(dist_mat)
xnam <- colnames(covars)
y <- outcome
flma <- stats::as.formula(paste("y ~ ", paste(xnam, collapse = "+")))
null_mod <- stats::glm(formula = flma, family = "binomial", data = covars) #Fitting the null model without the kernel
pred_null <- stats::predict(null_mod, type = "response") #Pulls the predicted values from the logistic regression
D_0 <- diag(pred_null * (1 - pred_null))
X <- as.matrix(stats::model.matrix(null_mod))
P_0 <- D_0 - D_0 %*% X %*% solve(t(X) %*% D_0 %*% X) %*% t(X) %*% D_0
bounds <- up_low(dist_mat)
U <- max(bounds) * 100
L <- min(bounds[bounds > 0]) * 0.1
rho <- seq(L, U, length = grid_gran) #Grid of rho values for kernel
S <- rep(0, grid_gran) #Row vector of zeros of grid length
for (i in 1:grid_gran) {
k <- kernel(dist_mat, rho[i])
Q <- t(outcome - pred_null) %*% k %*% (outcome - pred_null) #Test statistic
mu <- tr(P_0 %*% k)
sigma <- sqrt(2 * tr(P_0 %*% k %*% P_0 %*% k))
S[i] <- (Q - mu)/sigma #Standardized version of test statistic for each value of kernel
}
M <- max(S) #Max value of standardized test statistic
W <- 0
for (j in 1:(grid_gran - 1)) {
W <- W + abs(S[j + 1] - S[j]) #Total variation of S in grid
}
p_value <- stats::pnorm(-M) + W * exp(-M^2/2)/sqrt(8 * pi) #(12) in Liu et al. (2008)
return(p_value)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.