R/train_test.R

# The 'train_test' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-10

# Table of contents
# 1) train_test                   | tested
#   1.1) is.train_test            | tested
#   1.2) subset.train_test        | tested
#   1.3) levels.train_test        | tested
#   1.4) dimnames.train_test      | tested
#   1.5) as.integer.train_test    | tested
#   1.6) print.train_test         | tested
#   1.7) as.data.frame.train_test | tested
#   1.8) size.train_test          |

###
### 1) train_test
###

#' Create Training and Test Data Subsets
#'
#' Creates a list of lists, containing the dependent
#' variable and matrix of predictors for the
#' training set and the test set, respectively.
#'
#' @param current_fold The index for data to withhold and use
#'   for the test subset.
#' @param index A vector of indices indicating which observations belong
#'   to which fold for cross-validation (see \code{\link{cv_index}}).
#' @param y A vector for the dependent variable. Converted to a
#'   factor if it is not already.
#' @param X A matrix of predictors.
#' @param scale Logical; if \code{TRUE}, standardizes the
#'   predictors.
#'
#' @details The method \code{subset} can be used to extract the
#' dependent variable (\code{y = TRUE}) or matrix of predictors from the
#' training (\code{train = TRUE}) or test subsets, respectively. The
#' method \code{as.data.frame} converts the specified subset into a
#' data frame (useful for \code{glm}). The methods \code{dimnames} and
#' \code{levels} can extract the column labels for predictors and
#' the levels for the dependent variable, respectively. The method
#' \code{as.integer} can convert the subset for the dependent variable
#' into integer values. When standardizing predictors, the training
#' set is standardized, and then the test set is scaled relative
#' to the training set.
#'
#' @return A list of lists, of class 'train_test'. The element
#'   \code{train} contains the training subset for the dependent
#'   variable \code{y} and matrix of predictors \code{X}, while
#'   the element \code{test} contains the test subset.
#'
#' @export
#' @examples
#' # Example data
#' y = sample( c( 'Yes', 'No' ), 1000, replace = T, prob = c(.4,.6) )
#' y = as.factor(y)
#' X = matrix( rnorm( 1000*8 ), 1000, 8 )
#' index = cv_index( 10, 1000 )
#' ex = train_test( 1, index, y, X )
#' ex
#' # Extract dependent variable from test set
#' y_test = subset( ex, y = T, train = F )
#' # Extract matrix of predictors from training set
#' X_train = subset( ex, y = F, train = T )
#' # Extract column names for predictors
#' names( ex )
#' # Extract levels for dependent variable
#' levels( ex )

train_test = function( current_fold,
                       index,
                       y,
                       X,
                       scale = T ) {

  # Check if dependent variable is a factor
  if ( !is.factor( y ) ) {
    y = as.factor( y )
  }
  # Determine length of y
  n = length(y)

  # Check structures of inputs
  if ( !is.matrix(X) )
    stop( 'Need matrix of predictors', call. = F )
  if ( nrow(X) != n )
    stop( 'Mismatch between number of observations and rows for predictors',
          call. = F )
  if ( length( index ) != n )
    stop( 'Mismatch between number of observations and indices',
          call. = F )
  if ( !(current_fold %in% index) )
    stop( 'Must provide value contained in index',
          call. = F )

  # If no column names are provided for X,
  # create default names
  if ( is.null( colnames( X ) ) ) {
    colnames( X ) = paste( 'X', 1:ncol( X ), sep = '' )
  }

  # Training subset
  sel = index != current_fold
  y_train = y[sel]
  X_train = X[sel,]

  # Test subset
  sel = index == current_fold
  y_test = y[sel]
  X_test = X[sel,]

  # Ensure that X variable is a matrix
  if ( !is.matrix( X_train ) ) {
    X_train = as.matrix( X_train )
    colnames( X_train ) = colnames( X )
  }
  if ( !is.matrix( X_test ) ) {
    X_test = as.matrix( X_test )
    colnames( X_test ) = colnames( X )
  }

  # If specified, standardize predictors
  if ( scale ) {

    # First standardize training set
    m = colMeans( X_train )
    s = apply( X_train, 2, sd )
    X_train = X_train - matrix( m, nrow(X_train), ncol(X), byrow = T )
    X_train = X_train/matrix( s, nrow(X_train), ncol(X), byrow = T )

    # Then scale test sample relative to traning set
    X_test = X_test - matrix( m, nrow(X_test), ncol(X), byrow = T )
    X_test = X_test/matrix( s, nrow(X_test), ncol(X), byrow = T )

  }

  # Output
  out = list(
    train = list( y = y_train,
                  X = X_train ),
    test = list( y = y_test,
                 X = X_test )
  )

  class( out ) = 'train_test'

  return( out )
}


# 1.1)
#' @rdname train_test
#' @export

is.train_test = function( x ) {
  # Purpose:
  # Checks if an object is of class 'train_test'
  # Arguments:
  # x - An R object
  # Returns:
  # TRUE if the object is of class 'train_test',
  # FALSE otherwise.

  return( inherits( x, 'train_test' ) )
}

# 1.2)
#' @rdname train_test
#' @export

subset.train_test = function( x, y = T, train = T ) {
  # Purpose:
  # Extracts the dependent variable or
  # matrix of predictors from the training
  # or test data subsets.
  # Arguments:
  # x     - An R object of class 'train_test'
  # y     - Logical; if TRUE, returns the
  #         dependent variable
  # train - Logical; if TRUE, returns the
  #         training data subset
  # Returns:
  # The specified subset for the data set.

  # Initialize output
  out = NULL

  if ( y ) {
    # Extract dependent variable
    if ( train ) {
      out = x$train$y
    } else {
      out = x$test$y
    }
  } else {
    # Extract matrix of predictors
    if ( train ) {
      out = x$train$X
    } else {
      out = x$test$X
    }
  }

  return( out )
}

# 1.3)
#' @rdname train_test
#' @export

levels.train_test = function( x ) {
  # Purpose:
  # Extracts the binary levels for the
  # factor representing the dependent variable.
  # Arguments:
  # x - An R object of class 'train_test'
  # Returns:
  # The binary levels for the factor.

  # Extract levels of dependent variable
  y = subset( x, T, T )
  out = levels( y )

  return( out )
}

# 1.4)
#' @rdname train_test
#' @export

dimnames.train_test = function( x ) {
  # Purpose:
  # Extracts the column labels for the matrix
  # of predictors.
  # Arguments:
  # x - An R object of class 'train_test'
  # Returns:
  # A vector with the column labels for the
  # predictors.

  # Extract labels for predictors
  X = subset( x, F, T )
  out = colnames( X )

  return( out )
}

# 1.7)
#' @rdname train_test
#' @export

as.integer.train_test = function( x, train = T ) {
  # Purpose:
  # Converts the specified subset for the
  # dependent variable into binary values (1, 0).
  # Arguments:
  # x     - An R object of class 'train_test'
  # train - Logical; if TRUE, returns the
  #         training data subset
  # Returns:
  # A binary vector.

  y = subset( x, y = T, train = train )
  out = rep( 0, length( y ) )
  out[ y == levels( x )[1] ] = 1

  return( out )
}

# 1.5)
#' @rdname train_test
#' @export

print.train_test = function( x ) {
  # Purpose:
  # Provides a brief summary of the 'train_test'
  # object. Reports the number of predictors,
  # the percentage of positive trials, and the
  # sample size.
  # Arguments:
  # x - An R object of class 'train_test'

  # Display number of predictors
  K = ncol( subset( x, F, T ) )
  string = paste( 'Number of predictors: ', K )
  names( string ) = ' '
  print( string, quote = F )

  # Report percentage for binary variable
  # and sample size for training and
  # test sets
  train = c( T, F )
  ttl = c( 'Training subset', 'Test subset' )
  names( ttl ) = rep( ' ', 2 )
  for ( i in 1:2 ) {
    # Type of subset
    print( ttl[i], quote = F )

    y = subset( x, T, train[i] )
    n = length( y )
    lev = levels( x )
    percent = round( mean( y == lev[1] )* 100 )
    string = c(
      paste( percent, '%', sep = '' ),
      as.character( n ) )
    names( string ) = c(
      paste( 'Category: ', lev[1] ),
      'Sample size'
    )
    print( string, quote = F )
  }

}

# 1.7)
#' @rdname train_test
#' @export

as.data.frame.train_test = function( x, train = T ) {
  # Purpose:
  # Combines the subset for the dependent variable
  # and matrix of predictors into a single
  # data frame.
  # Arguments:
  # x     - An R object of class 'train_test'
  # train - Logical; if TRUE, returns the
  #         training data subset
  # Returns:
  # A data frame with columns for the
  # dependent variable (as integers) and
  # each of the predictors.

  # Extract data
  y = as.integer( x, train = train )
  X = subset( x, F, train )

  # Create data frame
  out = data.frame(
    y = y
  )

  # Add predictors
  out = cbind( out, X )

  return( out )
}

# 1.8)
#' @rdname train_test
#' @export

size.train_test = function( x, y = T, train = T ) {
  # Purpose:
  # Extracts the size of the dependent variable
  # or matrix of predictors for the training or
  # test subsets.
  # Arguments:
  # x     - An R object of class 'train_test'
  # y     - Logical; if TRUE, returns the size of
  #         the dependent variable
  # train - Logical; if TRUE, returns the
  #         dimensions of the training data subset
  # Returns:
  # Either the length of the dependent variable
  # or dimensions of the matrix of predictors
  # for either the training or test subsets.

  out = NULL
  if ( y ) {

    if ( train ) {
      out = length( x$train$y )
    } else {
      out = length( x$test$y )
    }

  } else {

    if ( train ) {
      out = dim( x$train$X )
    } else {
      out = dim( x$test$X )
    }

  }

  return( out )
}
rettopnivek/binclass documentation built on May 13, 2019, 4:46 p.m.