R/bc_simulate.R

# The 'bv_simulate' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-04

# Table of contents
# 1) bc_simulate                     |
#   1.1) is.bc_simulate              | tested
#   1.2) subset.bc_simulate          | tested
#   1.3) levels.bc_simulate          | tested
#   1.4) dimnames.bc_simulate        | tested
#   1.5) coef.bc_simulate            | tested
#   1.6) mean.bc_simulate            | tested
#   1.7) print.bc_simulate           | tested
#   1.8) features.bc_simulate        |
#   1.9) size.bc_simulate            |

###
### 1) bc_simulate
###

#' Simulate Data for Binary Classification
#'
#' Simulates a matrix of independent variables and a
#' binary dependent variable that is predicted by
#' a subset of the independent variables.
#'
#' @param n The number of subjects to simulate.
#' @param K The number of predictors to include.
#' @param K_sig The number of non-zero coefficients for
#'   predictors.
#' @param correlated Logical; if TRUE, a correlation
#'   matrix is randomly created so that predictors have
#'   non-zero correlations between each other. Optionally,
#'   a user-defined correlation matrix can instead be
#'   submitted.
#' @param intercept The value of the model intercept in terms of
#'  the log-oods.
#' @param param A vector with the lower and upper bounds bewteen
#'   which non-zero predictor coefficients should fall.
#' @param levels The terms to use for the factor describing the
#'   binary dependent variable.
#' @param outliers An optional vector indicating the
#'   proportion of outliers that should occur, and the
#'   lower and upper boundaries between which each outlier should
#'   fall.
#' @param rng_seed An optional list with seeds controlling the
#'   RNG state for 1) selecting which predictors will have non-zero
#'   coefficients, 2) the sign, followed by the values of the
#'   coefficients, and 3) the correlation matrix.
#'
#' @details The method \code{subset} can be used to extract
#'   the simulated dependent variable (\code{y = TRUE}), or
#'   the simulated matrix of predictors. The variables
#'   can also be extracted directly from the list of outputs
#'   (see examples). The method \code{coef} extracts the
#'   parameters used in the logistic regression to simulate
#'   data. The method \code{as.integer} can be used to
#'   convert the dependent variable into binary values.
#'   The method \code{features} extracts the labels for
#'   the non-zero predictors.
#'
#' @return An R object of class 'bc_simulate'.
#'
#' @export
#' @examples
#' sim = bc_simulate( 100, 8, 4 )
#' # Extract the dependent variable
#' y = subset( sim ); y = sim$y
#' # Extract the predictors
#' X = subset( sim, y = F ); X = sim$X

bc_simulate = function( n,
                        K,
                        K_sig,
                        correlated = F,
                        intercept = -.6,
                        param = c( .5, 1.5 ),
                        levels = c( 'Yes', 'No' ),
                        outliers = NULL,
                        rng_seed = list(
                          feature = NULL,
                          coef = NULL,
                          cor = NULL
                        ) ) {

  # Starting time point
  start = ert()

  ### Simulate values for predictors ###

  if ( is.logical( correlated ) ) {

    if ( correlated ) {

      # Generate a random correlation matrix
      if ( !is.null( rng_seed[[3]] ) ) set.seed( rng_seed[[3]] )
      A = matrix( runif( K^2 )*2 - 1, ncol = K )
      Sigma = t(A) %*% A
      Omega = cov2cor( Sigma )

    } else {

      # All correlations set to 0
      Omega = diag( K )

    }

  }
  if ( is.matrix( correlated ) ) {
    # Set correlation matrix to specified value
    Omega = correlated
  }

  # Generate random values for predictors
  X = MASS::mvrnorm( n, rep( 0, K ), Sigma = Omega )

  # Set names for predictors
  colnames( X ) = paste( 'P', 1:K, sep = '' )

  # Initialize vector of parameters
  bt = rep( 0, K )

  sig_pred = c( rep( T, K_sig ), rep( F, K - K_sig ) )
  if ( !is.null( rng_seed[[1]] ) ) set.seed( rng_seed[[1]] )
  sig_pred = sample( sig_pred )

  # Determine sign for significant parameters
  if ( !is.null( rng_seed[[2]] ) ) set.seed( rng_seed[[2]][1] )
  coef_sign = ( rbinom( K_sig, 1, .5 ) - .5 )*2
  # Generate coefficient values for non-zero predictors
  if ( !is.null( rng_seed[[2]] ) ) set.seed( rng_seed[[2]][2] )
  bt[ sig_pred ] = runif( K_sig, param[1], param[2] ) * coef_sign
  # Set names for coefficients
  names( bt ) = colnames( X )

  # Set mean over values of predictors
  alpha = cbind( 1, X ) %*% cbind( c( intercept, bt ) )
  # Convert to probability
  theta = 1/( 1 + exp( -alpha ) )
  # Simulate binary data
  y_raw = rbinom( n, 1, theta )
  # Convert to factor
  y = rep( levels[2], n )
  y[ y_raw == 1 ] = levels[1]
  y = factor( y,
              levels = levels )

  # Add noise in form of outliers
  if ( !is.null( outliers ) ) {

    # Simulate desired percentage of outliers
    yes_out = rbinom( length( y ), 1, outliers[1] )
    # Select random predictors to adjust
    which_k = sample( 1:K,
                      sum( yes_out ),
                      replace = T )
    # Loop over selected observations to
    # convert to outliers
    inc = 1
    for ( i in 1:length( y ) ) {
      if ( yes_out[i] == 1 ) {

        # Inflate value of X
        sn = sign( X[ i, which_k[inc] ] )
        X[ i, which_k[inc] ] = runif( 1, outliers[2], outliers[3] )*sn

      }
    }

  }

  # Compute run time
  run_time = ert( start )

  # Output
  out = list(
    y = y, # Dependent variable
    X = X, # Matrix of predictors
    beta = bt, # Regression coefficients
    intercept = intercept,
    Omega = Omega, # Correlation matrix
    run_time = run_time
  )

  # Set class for output
  class( out ) = "bc_simulate"

  return( out )
}

# 1.1)
#' @rdname bc_simulate
#' @export

is.bc_simulate = function( x ) {
  # Purpose:
  # Checks if an object is of class 'bc_simulate'
  # Arguments:
  # x - An R object
  # Returns:
  # TRUE if the object is of class 'bc_simulate',
  # FALSE otherwise.

  return( inherits( x, 'bc_simulate' ) )
}

# 1.2)
#' @rdname bc_simulate
#' @export

subset.bc_simulate = function( x, y = T ) {
  # Purpose:
  # Extracts the dependent variable or
  # matrix of predictors from the
  # 'bc_simulate' object.
  # Arguments:
  # x - An R object of class 'bc_simulate'
  # y - Logical; if TRUE, returns the
  #     dependent variable
  # Returns:
  # The specified subset from the simulated results.

  # Initialize output
  out = NULL

  if ( y ) {
    # Extract dependent variable
    out = x$y
  } else {
    # Matrix of predictors
    out = x$X
  }

  return( out )
}

# 1.3)
#' @rdname bc_simulate
#' @export

levels.bc_simulate = function( x ) {
  # Purpose:
  # Extracts the binary levels for the
  # factor representing the dependent variable.
  # Arguments:
  # x - An R object of class 'bc_simulate'
  # Returns:
  # The binary levels for the factor.

  # Extract levels of dependent variable
  y = subset( x )
  out = levels( y )

  return( out )
}

# 1.4)
#' @rdname bc_simulate
#' @export

dimnames.bc_simulate = function( x ) {
  # Purpose:
  # Extracts the column labels for the matrix
  # of predictors.
  # Arguments:
  # x - An R object of class 'bc_simulate'
  # Returns:
  # A vector with the column labels for the
  # predictors.

  # Extract labels for predictors
  out = colnames( subset( x, F ) )

  return( out )
}

# 1.5)
#' @rdname bc_simulate
#' @export

coef.bc_simulate = function( x, int = FALSE ) {
  # Purpose:
  # Extracts coefficients from
  # an object of class 'bc_simulate'.
  # Arguments:
  # x   - An R object of class 'bc_simulate'
  # int - Logical; if TRUE, includes
  #       the intercept
  # Returns:
  # The regression coefficients used to generate data
  # from a logistic regression model.

  if ( int ) {
    # If intercept should be included in values reported

    out = c( x$intercept, x$beta )
    names( out ) = c(
      'Intercept',
      names( x$beta )
    )

  } else {

    out = x$beta

  }

  return( out )
}

# 1.6)
#' @rdname bc_simulate
#' @export

as.integer.bc_simulate = function( x ) {
  # Purpose:
  # Converts the dependent variable into binary
  # values (1, 0).
  # Arguments:
  # x - An R object of class 'bc_simulate'
  # Returns:
  # A binary vector.

  # Convert to numeric version
  y = as.numeric( subset( x, T ) == levels( x )[1] )

  return( y )
}

# 1.7)
#' @rdname bc_simulate
#' @export

print.bc_simulate = function( x ) {
  # Purpose:
  # Prints a brief summary of the number predictors,
  # sample size, and proportion of positive trials
  # for the simulated data.
  # Arguments:
  # x - An R object of class 'bc_simulate'

  n = length( x$y )
  levs = levels( x )
  p = mean( as.integer( x ) )
  K = ncol( x$X )

  string = paste( 'Sample size:', n )
  cat( string, '\n' )
  string = paste( 'Number of predictors:', K )
  cat( string, '\n' )
  string = paste( round( 100*p ), '% = ', levs[1], sep = '' )
  cat( string, '\n' )

}

# 1.8
#' @rdname bc_simulate
#' @export

features.bc_simulate = function( x ) {
  # Purpose:
  # Returns the labels for the predictors
  # that had non-zero coefficients when
  # generating the observations for the
  # dependent variable.
  # Arguments:
  # x - An R object of class 'bc_simulate'
  # Returns:
  # A character vector with the labels for
  # the selected predictors.

  out = NULL

  cf = coef( x )
  sel = cf != 0
  if ( any(sel) ) out = names( cf )[sel]

  return( out )
}

# 1.9
#' @rdname bc_simulate
#' @export

size.bc_simulate = function( x, y = T ) {
  # Purpose:
  # Extracts the size of the dependent variable
  # or matrix of predictors.
  # Arguments:
  # x     - An R object of class 'bc_simulate'
  # y     - Logical; if TRUE, returns the size of
  #         the dependent variable
  # Returns:
  # Either the length of the dependent variable
  # or dimensions of the matrix of predictors.

  out = NULL
  if ( y ) {
      out = length( x$y )
  } else {
    out = dim( x$X )
  }

  return( out )
}
rettopnivek/binclass documentation built on May 13, 2019, 4:46 p.m.