# The 'train_test' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-10
# Table of contents
# 1) train_test | tested
# 1.1) is.train_test | tested
# 1.2) subset.train_test | tested
# 1.3) levels.train_test | tested
# 1.4) dimnames.train_test | tested
# 1.5) as.integer.train_test | tested
# 1.6) print.train_test | tested
# 1.7) as.data.frame.train_test | tested
# 1.8) size.train_test |
###
### 1) train_test
###
#' Create Training and Test Data Subsets
#'
#' Creates a list of lists, containing the dependent
#' variable and matrix of predictors for the
#' training set and the test set, respectively.
#'
#' @param current_fold The index for data to withhold and use
#' for the test subset.
#' @param index A vector of indices indicating which observations belong
#' to which fold for cross-validation (see \code{\link{cv_index}}).
#' @param y A vector for the dependent variable. Converted to a
#' factor if it is not already.
#' @param X A matrix of predictors.
#' @param scale Logical; if \code{TRUE}, standardizes the
#' predictors.
#'
#' @details The method \code{subset} can be used to extract the
#' dependent variable (\code{y = TRUE}) or matrix of predictors from the
#' training (\code{train = TRUE}) or test subsets, respectively. The
#' method \code{as.data.frame} converts the specified subset into a
#' data frame (useful for \code{glm}). The methods \code{dimnames} and
#' \code{levels} can extract the column labels for predictors and
#' the levels for the dependent variable, respectively. The method
#' \code{as.integer} can convert the subset for the dependent variable
#' into integer values. When standardizing predictors, the training
#' set is standardized, and then the test set is scaled relative
#' to the training set.
#'
#' @return A list of lists, of class 'train_test'. The element
#' \code{train} contains the training subset for the dependent
#' variable \code{y} and matrix of predictors \code{X}, while
#' the element \code{test} contains the test subset.
#'
#' @export
#' @examples
#' # Example data
#' y = sample( c( 'Yes', 'No' ), 1000, replace = T, prob = c(.4,.6) )
#' y = as.factor(y)
#' X = matrix( rnorm( 1000*8 ), 1000, 8 )
#' index = cv_index( 10, 1000 )
#' ex = train_test( 1, index, y, X )
#' ex
#' # Extract dependent variable from test set
#' y_test = subset( ex, y = T, train = F )
#' # Extract matrix of predictors from training set
#' X_train = subset( ex, y = F, train = T )
#' # Extract column names for predictors
#' names( ex )
#' # Extract levels for dependent variable
#' levels( ex )
train_test = function( current_fold,
index,
y,
X,
scale = T ) {
# Check if dependent variable is a factor
if ( !is.factor( y ) ) {
y = as.factor( y )
}
# Determine length of y
n = length(y)
# Check structures of inputs
if ( !is.matrix(X) )
stop( 'Need matrix of predictors', call. = F )
if ( nrow(X) != n )
stop( 'Mismatch between number of observations and rows for predictors',
call. = F )
if ( length( index ) != n )
stop( 'Mismatch between number of observations and indices',
call. = F )
if ( !(current_fold %in% index) )
stop( 'Must provide value contained in index',
call. = F )
# If no column names are provided for X,
# create default names
if ( is.null( colnames( X ) ) ) {
colnames( X ) = paste( 'X', 1:ncol( X ), sep = '' )
}
# Training subset
sel = index != current_fold
y_train = y[sel]
X_train = X[sel,]
# Test subset
sel = index == current_fold
y_test = y[sel]
X_test = X[sel,]
# Ensure that X variable is a matrix
if ( !is.matrix( X_train ) ) {
X_train = as.matrix( X_train )
colnames( X_train ) = colnames( X )
}
if ( !is.matrix( X_test ) ) {
X_test = as.matrix( X_test )
colnames( X_test ) = colnames( X )
}
# If specified, standardize predictors
if ( scale ) {
# First standardize training set
m = colMeans( X_train )
s = apply( X_train, 2, sd )
X_train = X_train - matrix( m, nrow(X_train), ncol(X), byrow = T )
X_train = X_train/matrix( s, nrow(X_train), ncol(X), byrow = T )
# Then scale test sample relative to traning set
X_test = X_test - matrix( m, nrow(X_test), ncol(X), byrow = T )
X_test = X_test/matrix( s, nrow(X_test), ncol(X), byrow = T )
}
# Output
out = list(
train = list( y = y_train,
X = X_train ),
test = list( y = y_test,
X = X_test )
)
class( out ) = 'train_test'
return( out )
}
# 1.1)
#' @rdname train_test
#' @export
is.train_test = function( x ) {
# Purpose:
# Checks if an object is of class 'train_test'
# Arguments:
# x - An R object
# Returns:
# TRUE if the object is of class 'train_test',
# FALSE otherwise.
return( inherits( x, 'train_test' ) )
}
# 1.2)
#' @rdname train_test
#' @export
subset.train_test = function( x, y = T, train = T ) {
# Purpose:
# Extracts the dependent variable or
# matrix of predictors from the training
# or test data subsets.
# Arguments:
# x - An R object of class 'train_test'
# y - Logical; if TRUE, returns the
# dependent variable
# train - Logical; if TRUE, returns the
# training data subset
# Returns:
# The specified subset for the data set.
# Initialize output
out = NULL
if ( y ) {
# Extract dependent variable
if ( train ) {
out = x$train$y
} else {
out = x$test$y
}
} else {
# Extract matrix of predictors
if ( train ) {
out = x$train$X
} else {
out = x$test$X
}
}
return( out )
}
# 1.3)
#' @rdname train_test
#' @export
levels.train_test = function( x ) {
# Purpose:
# Extracts the binary levels for the
# factor representing the dependent variable.
# Arguments:
# x - An R object of class 'train_test'
# Returns:
# The binary levels for the factor.
# Extract levels of dependent variable
y = subset( x, T, T )
out = levels( y )
return( out )
}
# 1.4)
#' @rdname train_test
#' @export
dimnames.train_test = function( x ) {
# Purpose:
# Extracts the column labels for the matrix
# of predictors.
# Arguments:
# x - An R object of class 'train_test'
# Returns:
# A vector with the column labels for the
# predictors.
# Extract labels for predictors
X = subset( x, F, T )
out = colnames( X )
return( out )
}
# 1.7)
#' @rdname train_test
#' @export
as.integer.train_test = function( x, train = T ) {
# Purpose:
# Converts the specified subset for the
# dependent variable into binary values (1, 0).
# Arguments:
# x - An R object of class 'train_test'
# train - Logical; if TRUE, returns the
# training data subset
# Returns:
# A binary vector.
y = subset( x, y = T, train = train )
out = rep( 0, length( y ) )
out[ y == levels( x )[1] ] = 1
return( out )
}
# 1.5)
#' @rdname train_test
#' @export
print.train_test = function( x ) {
# Purpose:
# Provides a brief summary of the 'train_test'
# object. Reports the number of predictors,
# the percentage of positive trials, and the
# sample size.
# Arguments:
# x - An R object of class 'train_test'
# Display number of predictors
K = ncol( subset( x, F, T ) )
string = paste( 'Number of predictors: ', K )
names( string ) = ' '
print( string, quote = F )
# Report percentage for binary variable
# and sample size for training and
# test sets
train = c( T, F )
ttl = c( 'Training subset', 'Test subset' )
names( ttl ) = rep( ' ', 2 )
for ( i in 1:2 ) {
# Type of subset
print( ttl[i], quote = F )
y = subset( x, T, train[i] )
n = length( y )
lev = levels( x )
percent = round( mean( y == lev[1] )* 100 )
string = c(
paste( percent, '%', sep = '' ),
as.character( n ) )
names( string ) = c(
paste( 'Category: ', lev[1] ),
'Sample size'
)
print( string, quote = F )
}
}
# 1.7)
#' @rdname train_test
#' @export
as.data.frame.train_test = function( x, train = T ) {
# Purpose:
# Combines the subset for the dependent variable
# and matrix of predictors into a single
# data frame.
# Arguments:
# x - An R object of class 'train_test'
# train - Logical; if TRUE, returns the
# training data subset
# Returns:
# A data frame with columns for the
# dependent variable (as integers) and
# each of the predictors.
# Extract data
y = as.integer( x, train = train )
X = subset( x, F, train )
# Create data frame
out = data.frame(
y = y
)
# Add predictors
out = cbind( out, X )
return( out )
}
# 1.8)
#' @rdname train_test
#' @export
size.train_test = function( x, y = T, train = T ) {
# Purpose:
# Extracts the size of the dependent variable
# or matrix of predictors for the training or
# test subsets.
# Arguments:
# x - An R object of class 'train_test'
# y - Logical; if TRUE, returns the size of
# the dependent variable
# train - Logical; if TRUE, returns the
# dimensions of the training data subset
# Returns:
# Either the length of the dependent variable
# or dimensions of the matrix of predictors
# for either the training or test subsets.
out = NULL
if ( y ) {
if ( train ) {
out = length( x$train$y )
} else {
out = length( x$test$y )
}
} else {
if ( train ) {
out = dim( x$train$X )
} else {
out = dim( x$test$X )
}
}
return( out )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.