R/bootstrap.en_lr.R

# The 'bootstrap.en_lr' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-04

# Table of contents
# 1) bootstrap.en_lr

###
### 1) bootstrap.en_lr
###

bootstrap.en_lr = function( fit, y, X,
                            param, nRep = 1000, ci_width = .95 ) {
  # Purpose:
  # Computes p-values (with confidence intervals) for glmnet
  # coefficients using Monte Carlo sampling.
  # Arguments:
  # fit      - The output from the 'glmnet' function
  # y        - The dependent variable
  # X        - The matrix of predictors
  # param    - The best parameters for the mixing probability
  #            and the penalty term
  # nRep     - Number of iterations for Monte Carlo sampling
  # ci_width - Width of confidence interval around p-values
  # Returns:
  # A list containing
  # 1) A data frame with the original coefficient values,
  #    the frequency for which they were exceeded by the
  #    coefficients based on the shuffled data, the estimated
  #    p-value, and the lower and upper boundaries of the
  #    confidence interval for the p-values.
  # 2) The duration it took the algorithm to run.
  # 3) The width of the confidence intervals around the p-values.

  # Estimate run_time
  start = ert()

  # Create index to shuffle
  i = 1:length(y)

  # Create matrix to store estimates
  all_cf = matrix( 0, ncol(X), nRep )
  rownames( all_cf ) = colnames( X )

  # Extract parameters for glmnet estimation
  best_alpha = param[1]
  best_lambda = param[2]

  # Extract original estimates
  orig_cf = rep( 0, ncol(X) )
  names( orig_cf ) = colnames(X)

  cf = coef( fit, s = best_lambda )
  orig_cf[ rownames( cf )[-1] ] = cf[-1,1]

  # Loop over iterations
  for ( nr in 1:nRep ) {
    si = sample( i, replace = T )

    btstrp = glmnet::glmnet(
      x = X,
      y = y[si],
      family = 'binomial',
      standardize = T,
      alpha = best_alpha,
      lambda = fit$lambda
    )

    cf = coef( btstrp, s = best_lambda )
    sel = rownames( cf )[-1]
    all_cf[sel,nr] = cf[-1,1]
  }

  # Report original coefficients
  out = data.frame(
    Coefficients = orig_cf
  )
  # Frequency that estimates based on reshuffled
  # data exceed original values
  out$Frequency_exceed = rowSums( all_cf >= orig_cf )

  # Compute approximate p-value
  out$p_value = out$Frequency_exceed/nRep
  # Confidence interval around p-values
  interval = (1 - ci_width)/2
  interval = c( interval, interval + ci_width )

  out$p_value_LB = qbinom( interval[1], nRep,
                             out$Frequency_exceed/nRep )/nRep
  out$p_value_UB = qbinom( interval[2], nRep,
                           out$Frequency_exceed/nRep )/nRep

  sel = out$p_value > .5
  if ( any(sel) ) {
    out$p_value[sel] = 1 - out$p_value[sel]
    out$p_value_LB[sel] = 1 - out$p_value_LB[sel]
    out$p_value_UB[sel] = 1 - out$p_value_UB[sel]
  }

  run_time = ert(start)

  out = list(
    CT = out,
    run_time = run_time,
    CI_width = ci_width
  )

  return( out )
}
rettopnivek/binclass documentation built on May 13, 2019, 4:46 p.m.