# The 'bv_simulate' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-04
# Table of contents
# 1) bc_simulate |
# 1.1) is.bc_simulate | tested
# 1.2) subset.bc_simulate | tested
# 1.3) levels.bc_simulate | tested
# 1.4) dimnames.bc_simulate | tested
# 1.5) coef.bc_simulate | tested
# 1.6) mean.bc_simulate | tested
# 1.7) print.bc_simulate | tested
# 1.8) features.bc_simulate |
# 1.9) size.bc_simulate |
###
### 1) bc_simulate
###
#' Simulate Data for Binary Classification
#'
#' Simulates a matrix of independent variables and a
#' binary dependent variable that is predicted by
#' a subset of the independent variables.
#'
#' @param n The number of subjects to simulate.
#' @param K The number of predictors to include.
#' @param K_sig The number of non-zero coefficients for
#' predictors.
#' @param correlated Logical; if TRUE, a correlation
#' matrix is randomly created so that predictors have
#' non-zero correlations between each other. Optionally,
#' a user-defined correlation matrix can instead be
#' submitted.
#' @param intercept The value of the model intercept in terms of
#' the log-oods.
#' @param param A vector with the lower and upper bounds bewteen
#' which non-zero predictor coefficients should fall.
#' @param levels The terms to use for the factor describing the
#' binary dependent variable.
#' @param outliers An optional vector indicating the
#' proportion of outliers that should occur, and the
#' lower and upper boundaries between which each outlier should
#' fall.
#' @param rng_seed An optional list with seeds controlling the
#' RNG state for 1) selecting which predictors will have non-zero
#' coefficients, 2) the sign, followed by the values of the
#' coefficients, and 3) the correlation matrix.
#'
#' @details The method \code{subset} can be used to extract
#' the simulated dependent variable (\code{y = TRUE}), or
#' the simulated matrix of predictors. The variables
#' can also be extracted directly from the list of outputs
#' (see examples). The method \code{coef} extracts the
#' parameters used in the logistic regression to simulate
#' data. The method \code{as.integer} can be used to
#' convert the dependent variable into binary values.
#' The method \code{features} extracts the labels for
#' the non-zero predictors.
#'
#' @return An R object of class 'bc_simulate'.
#'
#' @export
#' @examples
#' sim = bc_simulate( 100, 8, 4 )
#' # Extract the dependent variable
#' y = subset( sim ); y = sim$y
#' # Extract the predictors
#' X = subset( sim, y = F ); X = sim$X
bc_simulate = function( n,
K,
K_sig,
correlated = F,
intercept = -.6,
param = c( .5, 1.5 ),
levels = c( 'Yes', 'No' ),
outliers = NULL,
rng_seed = list(
feature = NULL,
coef = NULL,
cor = NULL
) ) {
# Starting time point
start = ert()
### Simulate values for predictors ###
if ( is.logical( correlated ) ) {
if ( correlated ) {
# Generate a random correlation matrix
if ( !is.null( rng_seed[[3]] ) ) set.seed( rng_seed[[3]] )
A = matrix( runif( K^2 )*2 - 1, ncol = K )
Sigma = t(A) %*% A
Omega = cov2cor( Sigma )
} else {
# All correlations set to 0
Omega = diag( K )
}
}
if ( is.matrix( correlated ) ) {
# Set correlation matrix to specified value
Omega = correlated
}
# Generate random values for predictors
X = MASS::mvrnorm( n, rep( 0, K ), Sigma = Omega )
# Set names for predictors
colnames( X ) = paste( 'P', 1:K, sep = '' )
# Initialize vector of parameters
bt = rep( 0, K )
sig_pred = c( rep( T, K_sig ), rep( F, K - K_sig ) )
if ( !is.null( rng_seed[[1]] ) ) set.seed( rng_seed[[1]] )
sig_pred = sample( sig_pred )
# Determine sign for significant parameters
if ( !is.null( rng_seed[[2]] ) ) set.seed( rng_seed[[2]][1] )
coef_sign = ( rbinom( K_sig, 1, .5 ) - .5 )*2
# Generate coefficient values for non-zero predictors
if ( !is.null( rng_seed[[2]] ) ) set.seed( rng_seed[[2]][2] )
bt[ sig_pred ] = runif( K_sig, param[1], param[2] ) * coef_sign
# Set names for coefficients
names( bt ) = colnames( X )
# Set mean over values of predictors
alpha = cbind( 1, X ) %*% cbind( c( intercept, bt ) )
# Convert to probability
theta = 1/( 1 + exp( -alpha ) )
# Simulate binary data
y_raw = rbinom( n, 1, theta )
# Convert to factor
y = rep( levels[2], n )
y[ y_raw == 1 ] = levels[1]
y = factor( y,
levels = levels )
# Add noise in form of outliers
if ( !is.null( outliers ) ) {
# Simulate desired percentage of outliers
yes_out = rbinom( length( y ), 1, outliers[1] )
# Select random predictors to adjust
which_k = sample( 1:K,
sum( yes_out ),
replace = T )
# Loop over selected observations to
# convert to outliers
inc = 1
for ( i in 1:length( y ) ) {
if ( yes_out[i] == 1 ) {
# Inflate value of X
sn = sign( X[ i, which_k[inc] ] )
X[ i, which_k[inc] ] = runif( 1, outliers[2], outliers[3] )*sn
}
}
}
# Compute run time
run_time = ert( start )
# Output
out = list(
y = y, # Dependent variable
X = X, # Matrix of predictors
beta = bt, # Regression coefficients
intercept = intercept,
Omega = Omega, # Correlation matrix
run_time = run_time
)
# Set class for output
class( out ) = "bc_simulate"
return( out )
}
# 1.1)
#' @rdname bc_simulate
#' @export
is.bc_simulate = function( x ) {
# Purpose:
# Checks if an object is of class 'bc_simulate'
# Arguments:
# x - An R object
# Returns:
# TRUE if the object is of class 'bc_simulate',
# FALSE otherwise.
return( inherits( x, 'bc_simulate' ) )
}
# 1.2)
#' @rdname bc_simulate
#' @export
subset.bc_simulate = function( x, y = T ) {
# Purpose:
# Extracts the dependent variable or
# matrix of predictors from the
# 'bc_simulate' object.
# Arguments:
# x - An R object of class 'bc_simulate'
# y - Logical; if TRUE, returns the
# dependent variable
# Returns:
# The specified subset from the simulated results.
# Initialize output
out = NULL
if ( y ) {
# Extract dependent variable
out = x$y
} else {
# Matrix of predictors
out = x$X
}
return( out )
}
# 1.3)
#' @rdname bc_simulate
#' @export
levels.bc_simulate = function( x ) {
# Purpose:
# Extracts the binary levels for the
# factor representing the dependent variable.
# Arguments:
# x - An R object of class 'bc_simulate'
# Returns:
# The binary levels for the factor.
# Extract levels of dependent variable
y = subset( x )
out = levels( y )
return( out )
}
# 1.4)
#' @rdname bc_simulate
#' @export
dimnames.bc_simulate = function( x ) {
# Purpose:
# Extracts the column labels for the matrix
# of predictors.
# Arguments:
# x - An R object of class 'bc_simulate'
# Returns:
# A vector with the column labels for the
# predictors.
# Extract labels for predictors
out = colnames( subset( x, F ) )
return( out )
}
# 1.5)
#' @rdname bc_simulate
#' @export
coef.bc_simulate = function( x, int = FALSE ) {
# Purpose:
# Extracts coefficients from
# an object of class 'bc_simulate'.
# Arguments:
# x - An R object of class 'bc_simulate'
# int - Logical; if TRUE, includes
# the intercept
# Returns:
# The regression coefficients used to generate data
# from a logistic regression model.
if ( int ) {
# If intercept should be included in values reported
out = c( x$intercept, x$beta )
names( out ) = c(
'Intercept',
names( x$beta )
)
} else {
out = x$beta
}
return( out )
}
# 1.6)
#' @rdname bc_simulate
#' @export
as.integer.bc_simulate = function( x ) {
# Purpose:
# Converts the dependent variable into binary
# values (1, 0).
# Arguments:
# x - An R object of class 'bc_simulate'
# Returns:
# A binary vector.
# Convert to numeric version
y = as.numeric( subset( x, T ) == levels( x )[1] )
return( y )
}
# 1.7)
#' @rdname bc_simulate
#' @export
print.bc_simulate = function( x ) {
# Purpose:
# Prints a brief summary of the number predictors,
# sample size, and proportion of positive trials
# for the simulated data.
# Arguments:
# x - An R object of class 'bc_simulate'
n = length( x$y )
levs = levels( x )
p = mean( as.integer( x ) )
K = ncol( x$X )
string = paste( 'Sample size:', n )
cat( string, '\n' )
string = paste( 'Number of predictors:', K )
cat( string, '\n' )
string = paste( round( 100*p ), '% = ', levs[1], sep = '' )
cat( string, '\n' )
}
# 1.8
#' @rdname bc_simulate
#' @export
features.bc_simulate = function( x ) {
# Purpose:
# Returns the labels for the predictors
# that had non-zero coefficients when
# generating the observations for the
# dependent variable.
# Arguments:
# x - An R object of class 'bc_simulate'
# Returns:
# A character vector with the labels for
# the selected predictors.
out = NULL
cf = coef( x )
sel = cf != 0
if ( any(sel) ) out = names( cf )[sel]
return( out )
}
# 1.9
#' @rdname bc_simulate
#' @export
size.bc_simulate = function( x, y = T ) {
# Purpose:
# Extracts the size of the dependent variable
# or matrix of predictors.
# Arguments:
# x - An R object of class 'bc_simulate'
# y - Logical; if TRUE, returns the size of
# the dependent variable
# Returns:
# Either the length of the dependent variable
# or dimensions of the matrix of predictors.
out = NULL
if ( y ) {
out = length( x$y )
} else {
out = dim( x$X )
}
return( out )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.