#' General neural network for regression with backpropagation on the weights
#'
#' Trains a single-hidden-layer neural network by gradient descent
#' (real numbers for regression).
#'
#' @param X.mat (feature matrix, n_observations x n_features)
#' @param y.vec (label vector, n_observations x 1)
#' @param max.iterations (integer scalar >= 1, the number of gradient descent steps)
#' @param step.size (the step size used when descending)
#' @param n.hidden.units (number of hidden units)
#' @param is.train (logical vector of length n_observations,
#' TRUE if the observation is in the train set, FALSE for the validation set)
#'
#' @return pred.mat (n_observations x max.iterations matrix of predicted values, or n x k)
#' @return v.mat: final hidden-layer weight matrix (n_features+1 x n.hidden.units, or p+1 x u);
#' the first row holds the intercept terms.
#' @return w.vec: final output weight vector (n.hidden.units+1, or u+1);
#' the first element is the intercept term.
#' @return prediction(testX.mat):
#' a function that takes a test feature matrix and returns a vector of predictions
#' (real numbers for regression).
#'
#' @export
#'
#' @examples
#'
#' # Hard-coded example
#' hard_incidence <- as.numeric(c(84.106289446623, 55.328513822534, 38.106289446623, 45.345737285, 24.324, 30.324))
#' hard_emission <- as.numeric(c(38.288719431206, 0.13392142633291, 4.573728543534, 23.737285372, 54.345, 14.4))
#' hard_phase <- as.numeric(c(120.59515694473, 55.329415379509, 100.1339214473, 154.9728542, 55.57, 47.00))
#'
#' NNtestX.mat <- as.matrix(data.frame(hard_incidence, hard_emission, hard_phase))
#' NNtestY.vec <- as.numeric(c(0.0607816, 0.078306 , 0.098325, 0.052368, .0163620, 0.0757853))
#' max.iterations <- as.integer(10)
#' step.size <- as.numeric(0.05)
#' NNtest.is.train <- as.logical(c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE))
#'
#' # 3 hidden units
#' results <- NNetIterations(NNtestX.mat, NNtestY.vec, max.iterations, step.size, as.integer(3), NNtest.is.train)
#' results$prediction(NNtestX.mat)
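#'
#' # A minimal sketch of inspecting the result (uses the objects defined above;
#' # column i of pred.mat holds the predictions after i gradient descent steps):
#' train.error <- colMeans(abs(
#'   results$pred.mat[NNtest.is.train, ] - NNtestY.vec[NNtest.is.train]))
#' plot(train.error, type = "l", xlab = "iteration", ylab = "mean absolute error")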
#'
#' # TODO: add another example that works for modern R
NNetIterations <- function(X.mat,y.vec,max.iterations,step.size,n.hidden.units,is.train){
# stop if X.mat is not a numeric matrix
if(!all(is.matrix(X.mat),is.double(X.mat))){
stop("X.mat must be a numeric matrix!")
}
# stop if the label vector is not numeric or its length does not match the number of rows in X.mat
if (!all(is.vector(y.vec), is.double(y.vec),length(y.vec) == nrow(X.mat))) {
stop("y.vec must be a numeric vector with one element per row of X.mat!")
}
# stop if max.iterations is not an integer or is too small to be useful for learning
if(!all(max.iterations>=1, is.integer(max.iterations))){
stop("max.iterations must be an integer greater than or equal to 1!")
}
# stop if step.size is not a number in (0, 1]
if(!all(is.numeric(step.size), 0<step.size, step.size<=1)){
stop("step.size must be a number greater than 0 and at most 1!")
}
# stop if the number of hidden units is not a valid integer
if(!all(n.hidden.units>=1, is.integer(n.hidden.units))){
stop("n.hidden.units must be an integer greater than or equal to 1!")
}
# stop if is.train is not a logical vector with one element per row of X.mat
if(!all(is.logical(is.train), length(is.train)==nrow(X.mat))){
stop("is.train must be a logical vector with one element per row of X.mat!")
}
# stop if the labels are binary; this model only supports regression
if(length(unique(y.vec))==2){
stop("This model is not capable of binary classification")
}
# get the total data dimensions
n.observations <- nrow(X.mat)
n.features <- ncol(X.mat)
# find the train and validation indices using which(), which returns the vector of row indices needed
train.index = which(is.train==TRUE)
validation.index = which(is.train!=TRUE)
# split the data into the two sets using the indices
X.train = X.mat[train.index,]
y.train = y.vec[train.index]
# split the validation or test data using the indices vector
X.validation = X.mat[validation.index,]
y.validation = y.vec[validation.index]
#compute a scaled input matrix, which has mean=0 and sd=1 for each column
X.scaled.train = scale(X.train,center = TRUE,scale = TRUE)
#X.scaled.validation = scale(X.validation,center = TRUE,scale = TRUE)
X.scaled.mat = scale(X.mat,center = TRUE,scale = TRUE)
# initialize all variables needed for gradient descent
pred.mat = matrix(0,n.observations, max.iterations)
v.mat = matrix(runif((n.features+1)*n.hidden.units),n.features+1,n.hidden.units)
w.vec = runif(n.hidden.units+1)
w.gradient=rep(0,n.hidden.units+1)
v.gradient=matrix(0,n.features+1,n.hidden.units)
# loop over all gradient descent iterations (the predictions at every iteration are stored, so the best one can be chosen later)
for(iteration in 1:max.iterations){
# forward pass: multiply the scaled train matrix (with an intercept column) by the hidden-layer weights
A.mat = (cbind(1,X.scaled.train))%*%v.mat
# apply the sigmoid activation so the hidden-unit outputs lie in (0, 1)
X.zeta.mat = sigmoid(A.mat)
# output layer: combine the hidden-unit outputs (with an intercept) using the output weights
X.bias.vec = as.numeric((cbind(1,X.zeta.mat)) %*% w.vec)
# store predictions for every observation at this iteration (plain linear output, no Minnaert model here)
pred.mat[,iteration] = cbind(1,sigmoid(cbind(1,X.scaled.mat)%*%v.mat))%*%w.vec
# output-layer error: residuals of the train predictions
delta.w = X.bias.vec - y.train
# backpropagate the error through the sigmoid: elementwise derivative zeta*(1-zeta),
# scaled column-wise by the corresponding output weight
sigmoid.deriv.mat = X.zeta.mat * (1-X.zeta.mat)
delta.v = delta.w * sigmoid.deriv.mat * matrix( w.vec[-1], nrow(sigmoid.deriv.mat), ncol(sigmoid.deriv.mat), byrow = TRUE )
# gradients of the squared error for both weight sets, averaged over observations
w.gradient = (t(cbind(1,X.zeta.mat))%*%delta.w)/n.observations
v.gradient = (t(cbind(1,X.scaled.train))%*%delta.v)/ n.observations
# apply the gradient step to each
w.vec = w.vec - step.size*as.vector(w.gradient)
v.mat = v.mat - step.size*v.gradient
}
# return the list of values and a function to predict on new data
result.list = list(
pred.mat = pred.mat,
v.mat = v.mat,
w.vec = w.vec,
prediction = function(testX.mat){
# scale the test features with the training center and scale so they match the learned weights
testX.scaled = scale(testX.mat, center = attr(X.scaled.mat, "scaled:center"),
scale = attr(X.scaled.mat, "scaled:scale"))
prediction.vec = cbind(1,sigmoid(cbind(1,testX.scaled)%*%v.mat))%*%w.vec
return (prediction.vec)
}
)
return(result.list)
}
#' Neural network training with K-fold cross validation and early stopping
#'
#' Uses K-fold cross validation based on the fold IDs provided in fold.vec.
#'
#' For each validation/train split, NNetIterations is used to compute the predictions
#' for all observations.
#'
#' mean.validation.loss.vec is a vector (with max.iterations elements) of the
#' mean validation loss over all K folds.
#'
#' mean.train.loss.vec is computed analogously for the train data.
#'
#' The mean validation loss is minimized to determine selected.steps,
#' the optimal number of gradient descent steps/iterations.
#'
#' The caller can then run NNetIterations(max.iterations=selected.steps) on the whole training data set for the optimal fit.
#'
#' @param X.mat : (n x p matrix of observations)
#' @param y.vec : (vector of n labels aligned with the rows of X.mat)
#' @param fold.vec : (vector of fold IDs, one per observation, e.g.
#' fold.vec = sample(1:n.folds, length(y.vec), replace = TRUE))
#' @param max.iterations (the maximum number of gradient descent iterations for each fold)
#' @param step.size (the step size used in the gradient descent)
#' @param n.hidden.units (the number of hidden units in the network)
#' @param n.folds (default = 4; this is the number of folds used to split the data.
#' Warning: make sure it is not too high, or empty folds will cause failures)
#'
#' @return mean.validation.loss.vec (mean validation loss over folds, one element per iteration)
#' @return mean.train.loss.vec (mean train loss over folds, one element per iteration)
#' @return selected.steps (the number of iterations that minimizes the mean validation loss)
#'
#' @export
#'
#' @examples
#'
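#' # A minimal synthetic sketch (hypothetical data, chosen only to illustrate
#' # the expected argument types; 2 folds, 2 hidden units):
#' set.seed(1)
#' X.demo <- matrix(runif(40), 20, 2)
#' y.demo <- as.numeric(X.demo %*% c(1, -1) + rnorm(20, sd = 0.1))
#' fold.demo <- sample(rep(1:2, l = 20))
#' cv.result <- NNetEarlyStoppingCV(X.demo, y.demo, fold.demo,
#'   as.integer(20), 0.05, as.integer(2), n.folds = 2)
#' cv.result$selected.steps
#' # refit on the full data with the selected number of steps
#' final.fit <- NNetIterations(X.demo, y.demo, cv.result$selected.steps,
#'   0.05, as.integer(2), rep(TRUE, 20))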
#'
NNetEarlyStoppingCV <-
function(X.mat, y.vec, fold.vec, max.iterations, step.size, n.hidden.units, n.folds = 4){
# initialize the loss vectors and the logical train indicator vector
mean.train.loss.vec = rep(0,max.iterations)
mean.validation.loss.vec = rep(0,max.iterations)
is.train = rep(TRUE, length(y.vec))
for(fold.number in 1:n.folds){
# mark observations in the current fold as validation, all others as train
is.train = fold.vec != fold.number
# get the indices of the train and validation data separately
train.index = as.numeric(which(is.train==TRUE))
validation.index = as.numeric(which(is.train==FALSE))
# get the separate data matrices using the index vectors
X.train = X.mat[train.index,]
y.train = y.vec[train.index]
X.validation = X.mat[validation.index,]
y.validation = y.vec[validation.index]
# run the training function
return.list.i = NNetIterations( X.mat, y.vec, max.iterations, step.size, n.hidden.units, is.train )
# extract the train and validation predictions produced by the training function
prediction.train = return.list.i$pred.mat[train.index,]
prediction.validation = return.list.i$pred.mat[validation.index,]
# accumulate this fold's loss: take the mean error of each iteration and
# add it to the running vector of losses over iterations
# This uses mean absolute loss
mean.train.loss.vec = mean.train.loss.vec + colMeans( abs(prediction.train - y.train) )
mean.validation.loss.vec = mean.validation.loss.vec + colMeans( abs(prediction.validation - y.validation) )
# This uses root mean squared loss
#mean.train.loss.vec = mean.train.loss.vec + sqrt(colMeans( (prediction.train - y.train)^2 ) )
#mean.validation.loss.vec = mean.validation.loss.vec + sqrt( colMeans( (prediction.validation - y.validation)^2 ) )
}
# average the accumulated losses over all folds, then pick the iteration with the minimum validation loss
mean.train.loss.vec = mean.train.loss.vec / n.folds
mean.validation.loss.vec = mean.validation.loss.vec / n.folds
selected.steps = which.min(mean.validation.loss.vec)
result.list = list(
mean.train.loss.vec = mean.train.loss.vec,
mean.validation.loss.vec = mean.validation.loss.vec,
selected.steps = selected.steps
)
return(result.list)
}
## Helpers
# sigmoid function
sigmoid <- function(x){
return(1/(1+exp(-x)))
}
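# The backprop step in NNetIterations relies on the sigmoid derivative identity
# d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)); a quick numerical sanity check
# (illustrative only, not used by the package):
# x <- seq(-3, 3, by = 0.5)
# all.equal(sigmoid(x) * (1 - sigmoid(x)),
#           (sigmoid(x + 1e-6) - sigmoid(x - 1e-6)) / 2e-6, tolerance = 1e-4)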