#############################################################################
#
#  This file is a part of the R package "gradDescent".
#
#  Author: Galih Praja Wijaya
#  Co-author: Dendi Handian, Imam Fachmi Nasrulloh
#  Supervisors: Lala Septem Riza, Rani Megasari, Enjun Junaeti
#  Copyright (c) Department of Computer Science Education, Indonesia University of Education.
#
#  This package is free software: you can redistribute it and/or modify it under
#  the terms of the GNU General Public License as published by the Free Software
#  Foundation, either version 2 of the License, or (at your option) any later version.
#
#  This package is distributed in the hope that it will be useful, but WITHOUT
#  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
#  A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
#############################################################################
#' A function to build a prediction model using the Gradient Descent method.
#'
#' This function builds a prediction model using the Gradient Descent (GD) method.
#' Gradient Descent is a first-order optimization algorithm that finds a local
#' minimum of an objective function by searching along the steepest-descent
#' direction. In machine learning, it is mostly used for supervised learning,
#' i.e., regression tasks. Using GD, we construct a model represented as a
#' linear equation that maps the relationship between the input variables and
#' the output variable. In other words, GD determines a suitable coefficient for
#' each variable so that the equation expresses the mapping correctly.
#'
#' @title Gradient Descent (GD) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with GD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with GD
#' GDmodel <- GD(splitedDataSet$dataTrain)
#' #show result
#' print(GDmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{MBGD}}
#'
#' @references
#' L.A. Cauchy,
#' "Methode generale pour la resolution des systemes d equations",
#' Compte Rendu a l Academie des Sciences 25,
#' pp. 536-538 (1847)
#'
#' @export

GD <- function(dataTrain, alpha=0.1, maxIter=10, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle data train
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	#updateRule <- matrix(0, ncol=length(theta), nrow=1)
	#constant variables
	rowLength <- nrow(dataTrain)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		error <- (inputData %*% t(theta)) - outputData
		for(column in 1:length(theta)){
			term <- error * inputData[,column]
			#calculate gradient
			gradient <- sum(term) / rowLength
			temporaryTheta[1,column] = theta[1,column] - (alpha*gradient)
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
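
#Illustrative sketch (not part of the package API): the theta returned by GD()
#is a one-row matrix whose first entry is the intercept, so predictions for new
#inputs are obtained by binding a column of ones and multiplying by t(theta).
#The helper name below is hypothetical and only given as an example.
predictWithTheta <- function(theta, inputFrame){
  #convert the inputs (features only, no output column) to a matrix
  inputMatrix <- matrix(unlist(inputFrame), ncol=ncol(inputFrame), byrow=FALSE)
  #bind the intercept column and compute the linear predictions
  return(cbind(1, inputMatrix) %*% t(theta))
}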

#' A function to build a prediction model using the Mini-Batch Gradient Descent (MBGD) method.
#'
#' This function is based on the \code{\link{GD}} method, with an optimization
#' that uses only part of the training data in each iteration. MBGD splits the
#' training data into nBatch batches and processes one batch per iteration.
#'
#' @title Mini-Batch Gradient Descent (MBGD) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param nBatch an integer value representing the number of batches the training
#'        data is split into.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with MBGD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with MBGD using 2 batches (nBatch=2)
#' MBGDmodel <- MBGD(splitedDataSet$dataTrain, nBatch=2)
#' #show result
#' print(MBGDmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{GD}}
#'
#' @references
#' A. Cotter, O. Shamir, N. Srebro, K. Sridharan
#' Better Mini-Batch Algorithms via Accelerated Gradient Methods,
#' NIPS,
#' pp. 1647- (2011)
#'
#' @export

MBGD <- function(dataTrain, alpha=0.1, maxIter=10, nBatch=2, seed=NULL){
  #convert nBatch to batchRate
  batchRate <- 1/nBatch;
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	#updateRule <- matrix(0, ncol=length(theta), nrow=1)
	#loop the gradient descent
	for(iteration in 1:maxIter){
	  #split dataTrain to Batch
	  if(iteration %% nBatch == 1 || nBatch == 1){
	    temp <- 1
	    x <- nrow(dataTrain)*batchRate
	    temp2 <- x
	  }
	  #dataTrain batch
	  batch <- dataTrain[temp:temp2,]
	  #parse dataTrain into input and output
	  inputData <- batch[, 1:(ncol(batch)-1)]
	  outputData <- batch[,ncol(batch)]
	  #constant variables
	  rowLength <- nrow(batch)
	  #next batch
	  temp <- temp + x
	  temp2 <- temp2 + x

		error <- (inputData %*% t(theta)) - outputData
		for(column in 1:length(theta)){
			term <- error * inputData[,column]
			#calculate gradient
			gradient <- sum(term) / rowLength
			temporaryTheta[1,column] = theta[1,column] - (alpha*gradient)
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
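
#Illustrative sketch (not part of the package): how the batch window used by
#MBGD() cycles through the rows. With nBatch batches, each batch covers
#nrow/nBatch rows and the window is reset every nBatch iterations, so with
#10 rows and nBatch=2, iteration 1 uses rows 1..5, iteration 2 uses rows 6..10,
#iteration 3 uses rows 1..5 again, and so on. The helper name is hypothetical.
batchBounds <- function(iteration, nRows, nBatch){
  batchSize <- nRows / nBatch
  #position of this iteration inside the cycle of nBatch batches
  position <- (iteration - 1) %% nBatch
  firstRow <- (position * batchSize) + 1
  lastRow <- firstRow + batchSize - 1
  return(c(firstRow, lastRow))
}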

#' A function to build a prediction model using the Stochastic Gradient Descent (SGD) method.
#'
#' This function is based on the \code{\link{GD}} method, with an optimization that
#' uses only one randomly chosen instance of the training data per iteration. As a
#' result, SGD computes each update quickly, but the learning path toward the
#' minimum cost is more unstable.
#'
#' @title Stochastic Gradient Descent (SGD) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with SGD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with SGD
#' SGDmodel <- SGD(splitedDataSet$dataTrain)
#' #show result
#' print(SGDmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{SAGD}}
#'
#' @references
#' N. Le Roux, M. Schmidt, F. Bach
#' A Stochastic Gradient Method with an Exponential Convergence Rate for Finite Training Sets,
#' Advances in Neural Information Processing Systems,
#' (2011)
#'
#' @export

SGD <- function(dataTrain, alpha=0.1, maxIter=10, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	# updateRule <- matrix(0, ncol=length(theta), nrow=1)
	#constant variables
	rowLength <- nrow(dataTrain)
	set.seed(seed)
	stochasticList <- sample(1:rowLength, maxIter, replace=TRUE)
	set.seed(NULL)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		error <- (inputData[stochasticList[iteration],] %*% t(theta)) - outputData[stochasticList[iteration]]
		for(column in 1:length(theta)){
			#calculate gradient
			gradient <- error * inputData[stochasticList[iteration], column]
			temporaryTheta[1,column] = theta[1,column] - (alpha*gradient)
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
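
#Illustrative sketch (not part of the package): a single SGD update on one
#randomly chosen training row, mirroring the inner update of SGD() above.
#theta is a one-row coefficient matrix, inputRow already contains the leading 1
#for the intercept, and outputValue is the corresponding observed output.
#The helper name is hypothetical.
sgdStep <- function(theta, inputRow, outputValue, alpha=0.1){
  #prediction error on the single instance
  error <- as.numeric(inputRow %*% t(theta)) - outputValue
  #gradient of the squared error and the coefficient update
  return(theta - alpha * (error * matrix(inputRow, nrow=1)))
}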

#' A function to build a prediction model using the Stochastic Average Gradient Descent (SAGD) method.
#'
#' This function is based on \code{\link{SGD}}, which computes only one instance of
#' the training data per iteration. In addition, \code{SAGD} has an averaging control
#' that randomly decides whether or not to update the coefficients in a given
#' iteration. This optimization speeds up the learning, because iterations that
#' skip the update perform no computation.
#'
#' @title Stochastic Average Gradient Descent (SAGD) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with SAGD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with SAGD
#' SAGDmodel <- SAGD(splitedDataSet$dataTrain)
#' #show result
#' print(SAGDmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{SGD}}
#'
#' @references
#' M. Schmidt, N. Le Roux, F. Bach
#' Minimizing Finite Sums with the Stochastic Average Gradient,
#' INRIA-SIERRA Project - Team Departement d'informatique de l'Ecole Normale Superieure,
#' (2013)
#'
#' @export

SAGD <- function(dataTrain, alpha=0.1, maxIter=10, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	# updateRule <- matrix(0, ncol=length(theta), nrow=1)
	#constant variables
	rowLength <- nrow(dataTrain)
	set.seed(seed)
	stochasticList <- sample(1:rowLength, maxIter, replace=TRUE)
	set.seed(NULL)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		#stochastic average randomization
		if(sample(0:1,1) == 1){
			error <- (inputData[stochasticList[iteration],] %*% t(theta)) - outputData[stochasticList[iteration]]
			for(column in 1:length(theta)){
				#calculate gradient
				gradient <- error * inputData[stochasticList[iteration], column]
				temporaryTheta[1,column] = theta[1,column] - (alpha*gradient)
			}
			#update all theta in the current iteration
			theta <- temporaryTheta
		}
	}
	result <- theta
	return(result)
}

#' A function to build a prediction model using the Momentum Gradient Descent (MGD) method.
#'
#' This function is based on \code{\link{SGD}} with an optimization that speeds up
#' the learning by adding a constant momentum term to each update.
#'
#' @title Momentum Gradient Descent (MGD) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param momentum a float value representing the momentum term, which gives a
#'        constant speed to the learning process. Default value is 0.9.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with MGD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with MGD
#' MGDmodel <- MGD(splitedDataSet$dataTrain)
#' #show result
#' print(MGDmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{AGD}}
#'
#' @references
#' N. Qian
#' On the momentum term in gradient descent learning algorithms,
#' Neural Networks: The Official Journal of the International Neural Network Society,
#' pp. 145-151 (1999)
#'
#' @export

MGD <- function(dataTrain, alpha=0.1, maxIter=10, momentum=0.9, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	updateRule <- matrix(0, ncol=length(theta), nrow=1)
	#constant variables
	rowLength <- nrow(dataTrain)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		error <- (inputData %*% t(theta)) - outputData
		for(column in 1:length(theta)){
			term <- error * inputData[,column]
			#calculate gradient
			gradient <- sum(term) / rowLength
			updateRule[1,column] <-  (momentum*updateRule[1,column]) + (alpha*gradient)
			temporaryTheta[1,column] = theta[1,column] - updateRule[1,column]
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
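
#Illustrative sketch (not part of the package): the classical momentum update
#used inside MGD() above, written for the whole coefficient row at once.
#velocity plays the role of the updateRule matrix; the helper name is hypothetical.
momentumStep <- function(theta, gradient, velocity, alpha=0.1, momentum=0.9){
  #accumulate a decaying sum of past gradients
  velocity <- (momentum * velocity) + (alpha * gradient)
  #move the coefficients against the accumulated direction
  return(list(theta=theta - velocity, velocity=velocity))
}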

#' A function to build a prediction model using the Accelerated Gradient Descent (AGD) method.
#'
#' This function is based on \code{\link{SGD}} and \code{\link{MGD}}, with an
#' optimization that accelerates the learning by applying the momentum constant to
#' a look-ahead step before the gradient is computed in each iteration.
#'
#' @title Accelerated Gradient Descent (AGD) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param momentum a float value representing the momentum term, which gives a
#'        constant speed to the learning process. Default value is 0.9.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with AGD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with AGD
#' AGDmodel <- AGD(splitedDataSet$dataTrain)
#' #show result
#' print(AGDmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{MGD}}
#'
#' @references
#' Y. Nesterov
#' A method for unconstrained convex minimization problem with the rate of convergence O(1/k^2),
#' Soviet Mathematics Doklady 27 (2),
#' pp. 543-547 (1983)
#'
#' @export

AGD <- function(dataTrain, alpha=0.1, maxIter=10, momentum=0.9, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	updateRule <- matrix(0, ncol=length(theta), nrow=1)
	#constant variables
	rowLength <- nrow(dataTrain)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		#accelerate
		theta <- theta - (updateRule * momentum)
		error <- (inputData %*% t(theta)) - outputData
		for(column in 1:length(theta)){
			term <- error * inputData[,column]
			#calculate gradient
			gradient <- sum(term) / rowLength
			updateRule[1,column] <-  (momentum*updateRule[1,column]) + (alpha*gradient)
			temporaryTheta[1,column] = theta[1,column] - updateRule[1,column]
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
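
#Illustrative sketch (not part of the package): the accelerated step used by
#AGD() above, where the gradient is evaluated at a look-ahead point
#theta - momentum*velocity before the velocity is updated. gradientAt is a
#user-supplied function returning the gradient at a given coefficient row;
#the helper name is hypothetical.
nesterovStep <- function(theta, velocity, gradientAt, alpha=0.1, momentum=0.9){
  lookAhead <- theta - (momentum * velocity)
  gradient <- gradientAt(lookAhead)
  velocity <- (momentum * velocity) + (alpha * gradient)
  return(list(theta=lookAhead - velocity, velocity=velocity))
}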

#' A function to build a prediction model using the ADAGRAD method.
#'
#' This function is based on \code{\link{SGD}} with an optimization that creates an
#' adaptive learning rate by accumulating the squared gradients of previous
#' iterations.
#'
#' @title ADAGRAD Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with ADAGRAD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with ADAGRAD
#' ADAGRADmodel <- ADAGRAD(splitedDataSet$dataTrain)
#' #show result
#' print(ADAGRADmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{ADADELTA}}, \code{\link{RMSPROP}}, \code{\link{ADAM}}
#'
#' @references
#' J. Duchi, E. Hazan, Y. Singer
#' Adaptive Subgradient Methods for Online Learning and Stochastic Optimization,
#' Journal of Machine Learning Research 12,
#' pp. 2121-2159 (2011)
#'
#' @export

ADAGRAD <- function(dataTrain, alpha=0.1, maxIter=10, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	updateRule <- matrix(0, ncol=length(theta), nrow=1)
	gradientList <- matrix(nrow=1, ncol=0)
	#constant variables
	rowLength <- nrow(dataTrain)
	set.seed(seed)
	stochasticList <- sample(1:rowLength, maxIter, replace=TRUE)
	set.seed(NULL)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		error <- (inputData[stochasticList[iteration],] %*% t(theta)) - outputData[stochasticList[iteration]]
		for(column in 1:length(theta)){
			#calculate gradient
			gradient <- error * inputData[stochasticList[iteration], column]
			#adagrad update rule calculation
			gradientList <- cbind(gradientList, gradient)
			gradientSum <- sqrt(gradientList %*% t(gradientList))
			updateRule[1,column] <- (alpha / gradientSum) * gradient
			temporaryTheta[1,column] = theta[1,column] - updateRule[1,column]
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
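
#Illustrative sketch (not part of the package): the textbook per-coordinate
#AdaGrad rule, in which every coefficient keeps its own accumulator of squared
#gradients. For comparison, ADAGRAD() above accumulates the squared gradients of
#all coefficients into one shared sum. The helper name is hypothetical.
adagradStep <- function(theta, gradient, accumulator, alpha=0.1, smooth=1e-7){
  #accumulate squared gradients per coefficient
  accumulator <- accumulator + gradient^2
  #scale each coefficient's learning rate by its own gradient history
  theta <- theta - (alpha / sqrt(accumulator + smooth)) * gradient
  return(list(theta=theta, accumulator=accumulator))
}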

#' A function to build a prediction model using the ADADELTA method.
#'
#' This function is based on \code{\link{SGD}} with an optimization that creates an
#' adaptive learning rate using a Hessian approximation correction, with a lower
#' computational load than \code{\link{ADAGRAD}}. This method derives its own
#' learning rate and therefore does not need an \code{alpha} parameter, but it uses
#' a momentum parameter, the same as \code{\link{MGD}} and \code{\link{AGD}}.
#'
#' @title ADADELTA Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param momentum a float value representing the momentum term, which gives a
#'        constant speed to the learning process. Default value is 0.9.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with ADADELTA
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with ADADELTA
#' ADADELTAmodel <- ADADELTA(splitedDataSet$dataTrain)
#' #show result
#' print(ADADELTAmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{ADAGRAD}}, \code{\link{RMSPROP}}, \code{\link{ADAM}}
#'
#' @references
#' M. D. Zeiler
#' Adadelta: An Adaptive Learning Rate Method,
#' arXiv: 1212.5701v1,
#' pp. 1-6 (2012)
#'
#' @export

ADADELTA <- function(dataTrain, maxIter=10, momentum=0.9, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	updateRule <- matrix(0, ncol=length(theta), nrow=1)
	ESG <- 0
	ESR <- 0
	RMSUpdate <- 0
	smooth <- 0.0000001
	#constant variables
	rowLength <- nrow(dataTrain)
	set.seed(seed)
	stochasticList <- sample(1:rowLength, maxIter, replace=TRUE)
	set.seed(NULL)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		error <- (inputData[stochasticList[iteration],] %*% t(theta)) - outputData[stochasticList[iteration]]
		for(column in 1:length(theta)){
			#calculate gradient
			gradient <- error * inputData[stochasticList[iteration], column]
			#adadelta update rule calculation
			ESG <- (momentum*ESG) + (1-momentum)*gradient^2
			RMSGradient <- sqrt(ESG + smooth)
			ESR <- (momentum*ESR) + (1-momentum)*updateRule[1,column]^2
			updateRule[1,column] <- (RMSUpdate / RMSGradient) * gradient
			#temporary change
			temporaryTheta[1,column] = theta[1,column] - updateRule[1,column]
			#adadelta temporary change
			RMSUpdate <- sqrt(ESR + smooth)
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
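
#Illustrative sketch (not part of the package): one AdaDelta step following
#Zeiler (2012), with exponentially decaying averages of squared gradients (ESG)
#and squared updates (ESR). No explicit learning rate is needed; the ratio of
#the two RMS terms plays that role, as in ADADELTA() above. The helper name
#is hypothetical.
adadeltaStep <- function(theta, gradient, ESG, ESR, momentum=0.9, smooth=1e-7){
  ESG <- (momentum * ESG) + (1 - momentum) * gradient^2
  update <- (sqrt(ESR + smooth) / sqrt(ESG + smooth)) * gradient
  ESR <- (momentum * ESR) + (1 - momentum) * update^2
  return(list(theta=theta - update, ESG=ESG, ESR=ESR))
}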

#' A function to build a prediction model using the RMSPROP method.
#'
#' This function is based on \code{\link{SGD}} with an optimization that creates an
#' adaptive learning rate from the root mean square (RMS) of recent gradients,
#' combined with the Hessian approximation correction approach. In other words,
#' this method combines the \code{\link{ADAGRAD}} and \code{\link{ADADELTA}}
#' approaches.
#'
#' @title RMSPROP Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param momentum a float value representing the momentum term, which gives a
#'        constant speed to the learning process. Default value is 0.9.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with RMSPROP
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with RMSPROP
#' RMSPROPmodel <- RMSPROP(splitedDataSet$dataTrain)
#' #show result
#' print(RMSPROPmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{ADAGRAD}}, \code{\link{ADADELTA}}, \code{\link{ADAM}}
#'
#' @references
#' M. D. Zeiler
#' Adadelta: An Adaptive Learning Rate Method,
#' arXiv: 1212.5701v1,
#' pp. 1-6 (2012)
#'
#' @export

RMSPROP <- function(dataTrain, alpha=0.1, maxIter=10, momentum=0.9, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	updateRule <- matrix(0, ncol=length(theta), nrow=1)
	ESG <- 0
	smooth <- 0.0000001
	#constant variables
	rowLength <- nrow(dataTrain)
	set.seed(seed)
	stochasticList <- sample(1:rowLength, maxIter, replace=TRUE)
	set.seed(NULL)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		error <- (inputData[stochasticList[iteration],] %*% t(theta)) - outputData[stochasticList[iteration]]
		for(column in 1:length(theta)){
			#calculate gradient
			gradient <- error * inputData[stochasticList[iteration], column]
			#rmsprop update rule calculation
			ESG <- (momentum*ESG) + (1-momentum)*gradient^2
			RMSGradient <- sqrt(ESG + smooth)
			updateRule[1,column] <- (alpha / RMSGradient) * gradient
			#temporary change
			temporaryTheta[1,column] = theta[1,column] - updateRule[1,column]
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
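
#Illustrative sketch (not part of the package): one RMSProp step, matching the
#update used inside RMSPROP() above. ESG is an exponentially decaying average
#of squared gradients, and its square root divides the fixed learning rate.
#The helper name is hypothetical.
rmspropStep <- function(theta, gradient, ESG, alpha=0.1, momentum=0.9, smooth=1e-7){
  ESG <- (momentum * ESG) + (1 - momentum) * gradient^2
  theta <- theta - (alpha / sqrt(ESG + smooth)) * gradient
  return(list(theta=theta, ESG=ESG))
}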

#' A function to build a prediction model using the ADAM method.
#'
#' This function is based on \code{\link{SGD}} with an optimization that creates an
#' adaptive learning rate from two moment estimates of the gradient, the mean and
#' the uncentered variance.
#'
#' @title ADAM Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with ADAM
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with ADAM
#' ADAMmodel <- ADAM(splitedDataSet$dataTrain)
#' #show result
#' print(ADAMmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{ADAGRAD}}, \code{\link{RMSPROP}}, \code{\link{ADADELTA}}
#'
#' @references
#' D. P. Kingma, J. Lei Ba
#' Adam: A Method for Stochastic Optimization,
#' International Conference on Learning Representations,
#' pp. 1-13 (2015)
#'
#' @export

ADAM <- function(dataTrain, alpha=0.1, maxIter=10, seed=NULL){
	#convert data.frame dataSet in matrix
	dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
	#shuffle dataTrain
	set.seed(seed)
	dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
	set.seed(NULL)
	#initialize theta
	theta <- getTheta(ncol(dataTrain), seed=seed)
	#bind 1 column to dataTrain
	dataTrain <- cbind(1, dataTrain)
	#parse dataTrain into input and output
	inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
	outputData <- dataTrain[,ncol(dataTrain)]
	#temporary variables
	temporaryTheta <- matrix(ncol=length(theta), nrow=1)
	updateRule <- matrix(0, ncol=length(theta), nrow=1)
	beta1 <- 0.9
	beta2 <- 0.999
	meanMoment <- 0
	varianceMoment <- 0
	smooth <- 0.0000001
	#constant variables
	rowLength <- nrow(dataTrain)
	set.seed(seed)
	stochasticList <- sample(1:rowLength, maxIter, replace=TRUE)
	set.seed(NULL)
	#loop the gradient descent
	for(iteration in 1:maxIter){
		error <- (inputData[stochasticList[iteration],] %*% t(theta)) - outputData[stochasticList[iteration]]
		for(column in 1:length(theta)){
			#calculate gradient
			gradient <- error * inputData[stochasticList[iteration], column]
			#adam update rule calculation
			meanMoment <- (beta1*meanMoment) + (1-beta1)*gradient
			varianceMoment <- (beta2*varianceMoment) + (1-beta2)*(gradient^2)
			mean.hat <- meanMoment/(1-beta1)
			variance.hat <- varianceMoment/(1-beta2)
			updateRule[1,column] <- (alpha/(sqrt(variance.hat)+smooth)) * mean.hat
			#temporary change
			temporaryTheta[1,column] = theta[1,column] - updateRule[1,column]
		}
		#update all theta in the current iteration
		theta <- temporaryTheta
	}
	result <- theta
	return(result)
}
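
#Illustrative sketch (not part of the package): one Adam step as described by
#Kingma and Ba (2015), with bias corrections that depend on the step counter t.
#ADAM() above uses the same moment estimates but applies a fixed (1 - beta)
#correction instead of (1 - beta^t). The helper name is hypothetical.
adamStep <- function(theta, gradient, m, v, t, alpha=0.1,
                     beta1=0.9, beta2=0.999, smooth=1e-7){
  #first moment (mean) and second moment (uncentered variance) of the gradient
  m <- (beta1 * m) + (1 - beta1) * gradient
  v <- (beta2 * v) + (1 - beta2) * gradient^2
  #bias-corrected moment estimates
  mHat <- m / (1 - beta1^t)
  vHat <- v / (1 - beta2^t)
  theta <- theta - (alpha / (sqrt(vHat) + smooth)) * mHat
  return(list(theta=theta, m=m, v=v))
}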

#' A function to build a prediction model using the SVRG method.
#'
#' This function is based on \code{\link{SGD}} with an optimization that accelerates
#' convergence by reducing the variance of the stochastic gradient used in
#' \code{\link{SGD}}.
#'
#' @title Stochastic Variance Reduced Gradient (SVRG) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations in the outer loop.
#'
#' @param innerIter the maximal number of iterations in the inner loop.
#'
#' @param option an option for choosing theta at the end of each outer loop.
#'        Option 1 sets theta to the last theta of the inner loop; option 2 (the
#'        default) sets theta to a randomly chosen theta from the inner loop.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with SVRG
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with SVRG
#' SVRGmodel <- SVRG(splitedDataSet$dataTrain)
#' #show result
#' print(SVRGmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{SSGD}}, \code{\link{SARAH}}, \code{\link{SARAHPlus}}
#'
#' @references
#' Rie Johnson, Tong Zhang
#' Accelerating Stochastic Gradient Descent using Predictive Variance Reduction,
#' Advances in Neural Information Processing Systems,
#' pp. 315-323 (2013)
#'
#' @export

#new variant (SVRG)
SVRG <- function(dataTrain, alpha=0.1, maxIter=10, innerIter=10, option=2, seed=NULL){
  #convert data.frame dataSet in matrix
  dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
  #shuffle dataTrain
  set.seed(seed)
  dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
  set.seed(NULL)
  #initialize theta
  theta <- getTheta(ncol(dataTrain), seed=seed)
  #print(theta[1,1])
  #bind 1 column to dataTrain
  dataTrain <- cbind(1, dataTrain)
  #parse dataTrain into input and output
  inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
  outputData <- dataTrain[,ncol(dataTrain)]
  #temporary variables
  temporaryTheta <- matrix(ncol=length(theta), nrow=1)
  temporaryTheta2 <- matrix(ncol=length(theta), nrow=1)
  temporaryThetaList <- matrix(ncol=length(theta))
  #updateRule <- matrix(0, ncol=length(theta), nrow=1)
  gradient <- matrix(0, ncol=length(theta), nrow=1)
  #constant variables
  rowLength <- nrow(dataTrain)
  set.seed(seed)
  randRowList <- sample(1:rowLength, innerIter, replace=TRUE)
  set.seed(NULL)
  #loop the gradient descent
  for(iteration in 1:maxIter){

    temporaryTheta <- theta

    error <- (inputData %*% t(temporaryTheta)) - outputData
    for(column in 1:length(temporaryTheta)){
      term <- error * inputData[,column]
      #calculate gradient
      gradient[,column] <- sum(term) / rowLength
    }

    temporaryTheta2 <- temporaryTheta
    temporaryThetaList <- temporaryTheta2

    for(innerIteration in 1:(innerIter)){
      error2 <- (inputData[randRowList[innerIteration],] %*% t(temporaryTheta2)) - outputData[randRowList[innerIteration]]
      error3 <- (inputData[randRowList[innerIteration],] %*% t(temporaryTheta)) - outputData[randRowList[innerIteration]]

      for(column in 1:length(theta)){
        term2 <- error2 * inputData[randRowList[innerIteration],column]
        term3 <- error3 * inputData[randRowList[innerIteration],column]

        totalGrad <- term2 - term3 + gradient[,column]

        temporaryTheta2[1,column] <- temporaryTheta2[1,column] - (alpha*totalGrad)
      }

      temporaryThetaList <- rbind(temporaryThetaList, temporaryTheta2)
    }

    randInnerIter <- sample(innerIter,1)
    if(option == 1){
      theta[1,] <- temporaryThetaList[innerIter,]
    }else{
      theta[1,] <- temporaryThetaList[randInnerIter,]
    }
  }

  result <- theta
  return(result)
}
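
#Illustrative sketch (not part of the package): the variance-reduced gradient
#used inside the SVRG() inner loop. For a randomly picked row, the stochastic
#gradient at the current iterate is corrected by the stochastic gradient at the
#outer-loop snapshot plus the full gradient computed at that snapshot.
#The helper name is hypothetical.
svrgGradient <- function(gradCurrentRow, gradSnapshotRow, fullGradSnapshot){
  #unbiased estimate of the full gradient with reduced variance
  return(gradCurrentRow - gradSnapshotRow + fullGradSnapshot)
}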

#' A function to build a prediction model using the SSGD method.
#'
#' This function combines elements from both \code{\link{GD}} and \code{\link{SGD}}.
#' \code{\link{SSGD}} starts each outer iteration by computing the full gradient once
#' and then proceeds with stochastic updates, choosing one instance at a time.
#'
#' @title Semi Stochastic Gradient Descent (SSGD) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations in the outer loop.
#'
#' @param innerIter the maximal number of iterations in the inner loop.
#'
#' @param lamda a float value used, together with alpha, to randomly draw the
#'        number of inner-loop iterations from innerIter. Default value is 0.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with SSGD
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with SSGD
#' SSGDmodel <- SSGD(splitedDataSet$dataTrain)
#' #show result
#' print(SSGDmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{SVRG}}, \code{\link{SARAH}}, \code{\link{SARAHPlus}}
#'
#' @references
#' George Papamakarios
#' Comparison of Modern Stochastic Optimization Algorithms,
#' (2014)
#'
#' @export

#new variant (SSGD)
SSGD <- function(dataTrain, alpha=0.1, maxIter=10, lamda=0, innerIter=10, seed=NULL){
  start_time <- Sys.time()
  #convert data.frame dataSet in matrix
  dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
  #shuffle dataTrain
  set.seed(seed)
  dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
  set.seed(NULL)
  #initialize theta
  theta <- getTheta(ncol(dataTrain), seed=seed)
  #bind 1 column to dataTrain
  dataTrain <- cbind(1, dataTrain)
  #parse dataTrain into input and output
  inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
  outputData <- dataTrain[,ncol(dataTrain)]
  #temporary variables
  temporaryTheta <- matrix(ncol=length(theta), nrow=1)
  #updateRule <- matrix(0, ncol=length(theta), nrow=1)
  gradient <- matrix(0, ncol=length(theta), nrow=1)
  #constant variables
  rowLength <- nrow(dataTrain)
  #loop the gradient descent
  for (iteration in 1:maxIter) {
    error <- (inputData %*% t(theta)) - outputData
    for(column in 1:length(theta)){
      term <- error * inputData[,column]
      #calculate gradient
      gradient[,column] <- sum(term) / rowLength
    }

    #temporary theta
    themp <- theta

    #random innerIteration
    randIter <- getRandomProb(innerIter,lamda,alpha)
    #print(randIter)

    for (innerIteration in 1:randIter) {
      #choose random of row
      randRow <- sample(rowLength,1)

      error2 <- (inputData[randRow,] %*% t(themp)) - outputData[randRow]
      error3 <- (inputData[randRow,] %*% t(theta)) - outputData[randRow]

      for(column in 1:length(theta)){
        term2 <- error2 * inputData[randRow,column]
        term3 <- error3 * inputData[randRow,column]

        totalGrad <- gradient[,column] - term2 + term3

        themp[1,column] <- themp[1,column] - (alpha*totalGrad)
      }
    }
    theta <- themp
  }
  result <- theta
  end_time <- Sys.time()
  print(end_time - start_time)
  return(result)
}

#' A function to build a prediction model using the SARAH method.
#'
#' Similarly to \code{\link{SVRG}}, \code{\link{SARAH}} iterations are divided into
#' an outer loop, where a full gradient is computed, and an inner loop, where only a
#' stochastic gradient is computed. Unlike \code{\link{SVRG}}, the inner-loop steps
#' of \code{\link{SARAH}} are based on accumulated stochastic information.
#'
#' @title Stochastic Recursive Gradient Algorithm (SARAH) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations in the outer loop.
#'
#' @param innerIter the maximal number of iterations in the inner loop.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with SARAH
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with SARAH
#' SARAHmodel <- SARAH(splitedDataSet$dataTrain)
#' #show result
#' print(SARAHmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{SVRG}}, \code{\link{SSGD}}, \code{\link{SARAHPlus}}
#'
#' @references
#' Lam M. Nguyen, Jie Lu, Katya Scheinberg, Martin Takac
#' SARAH: A Novel Method for Machine Learning Problems Using Stochastic Recursive Gradient,
#' arXiv preprint arXiv:1703.00102,
#' (2017)
#'
#' @export

#new variant (SARAH)
SARAH <- function(dataTrain, alpha=0.1, maxIter=10, innerIter=10, seed=NULL){
  start_time <- Sys.time()
  #convert data.frame dataSet in matrix
  dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
  #shuffle dataTrain
  set.seed(seed)
  dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
  set.seed(NULL)
  #initialize theta
  theta <- getTheta(ncol(dataTrain), seed=seed)
  #print(theta[1,1])
  #bind 1 column to dataTrain
  dataTrain <- cbind(1, dataTrain)
  #parse dataTrain into input and output
  inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
  outputData <- dataTrain[,ncol(dataTrain)]
  #temporary variables
  temporaryTheta <- matrix(ncol=length(theta), nrow=1)
  temporaryTheta2 <- matrix(ncol=length(theta), nrow=1)
  temporaryThetaList <- matrix(ncol=length(theta))
  #updateRule <- matrix(0, ncol=length(theta), nrow=1)
  gradient <- matrix(0, ncol=length(theta), nrow=1)
  #constant variables
  rowLength <- nrow(dataTrain)
  set.seed(seed)
  randRowList <- sample(1:rowLength, innerIter, replace=TRUE)
  set.seed(NULL)
  #loop the gradient descent
  for(iteration in 1:maxIter){

    temporaryTheta <- theta

    error <- (inputData %*% t(temporaryTheta)) - outputData
    for(column in 1:length(temporaryTheta)){
      term <- error * inputData[,column]
      #calculate gradient
      gradient[,column] <- sum(term) / rowLength
      temporaryTheta2[,column] <- temporaryTheta[,column] - (alpha*gradient[,column])
    }

    temporaryThetaList <- temporaryTheta

    for(innerIteration in 1:(innerIter-1)){
      error2 <- (inputData[randRowList[innerIteration],] %*% t(temporaryTheta2)) - outputData[randRowList[innerIteration]]
      error3 <- (inputData[randRowList[innerIteration],] %*% t(temporaryTheta)) - outputData[randRowList[innerIteration]]

      temporaryTheta <- temporaryTheta2

      for(column in 1:length(theta)){
        term2 <- error2 * inputData[randRowList[innerIteration],column]
        term3 <- error3 * inputData[randRowList[innerIteration],column]

        gradient[,column] <- term2 - term3 + gradient[,column]

        temporaryTheta2[1,column] <- temporaryTheta2[1,column] - (alpha*gradient[,column])
      }

      temporaryThetaList <- rbind(temporaryThetaList, temporaryTheta2)
    }

    randInnerIter <- sample(innerIter,1)
    theta[1,] <- temporaryThetaList[randInnerIter,]
  }

  result <- theta
  end_time <- Sys.time()
  print(end_time - start_time)
  return(result)
}
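
#Illustrative sketch (not part of the package): the recursive gradient estimate
#used inside the SARAH() inner loop. Unlike SVRG, the correction is taken against
#the previous iterate and the previous estimate, so stochastic information
#accumulates over the inner loop instead of always referring to the snapshot.
#The helper name is hypothetical.
sarahGradient <- function(gradCurrentRow, gradPreviousRow, previousEstimate){
  return(gradCurrentRow - gradPreviousRow + previousEstimate)
}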

#' A function to build a prediction model using the SARAH+ method.
#'
#' This function is a practical variant of \code{\link{SARAH}}. \code{\link{SARAHPlus}}
#' provides the possibility of earlier termination, avoids the need for a careful
#' choice of the maximum inner-loop size, and also covers classical gradient descent
#' when gammaS = 1 is set (since the while loop does not proceed).
#'
#' @title Stochastic Recursive Gradient Algorithm+ (SARAH+) Method Learning Function
#'
#' @param dataTrain a data.frame representing the training data (\eqn{m \times n}),
#'        where \eqn{m} is the number of instances and \eqn{n} is the number
#'        of variables, and the last column is the output variable. dataTrain
#'        must have at least two columns and ten rows of data that contain
#'        only numbers (integer or float).
#'
#' @param alpha a float value representing the learning rate. Default value is 0.1.
#'
#' @param maxIter the maximal number of iterations in the outer loop.
#'
#' @param innerIter the maximal number of iterations in the inner loop.
#'
#' @param gammaS a float value that sets the sufficient-reduction threshold used to
#'        stop the inner loop early. Default value is 0.125.
#'
#' @param seed an integer value used to seed the random number generator so that
#'        results are reproducible. Default value is NULL, which means no fixed
#'        seed is used.
#'
#' @examples
#' ##################################
#' ## Learning and Build Model with SARAH+
#' ## load R Package data
#' data(gradDescentRData)
#' ## get z-factor data
#' dataSet <- gradDescentRData$CompressilbilityFactor
#' ## split dataset
#' splitedDataSet <- splitData(dataSet)
#' ## build model with SARAH+
#' SARAHPlusmodel <- SARAHPlus(splitedDataSet$dataTrain)
#' #show result
#' print(SARAHPlusmodel)
#'
#' @return a one-row matrix of theta (coefficients) for the linear model.
#'
#' @seealso \code{\link{SVRG}}, \code{\link{SSGD}}, \code{\link{SARAH}}
#'
#' @references
#' Lam M. Nguyen, Jie Lu, Katya Scheinberg, Martin Takac
#' SARAH: A Novel Method for Machine Learning Problems Using Stochastic Recursive Gradient,
#' arXiv preprint arXiv:1703.00102,
#' (2017)
#'
#' @export

#new variant (SARAH+)
SARAHPlus <- function(dataTrain, alpha=0.1, maxIter=10, innerIter=10, gammaS=0.125, seed=NULL){
  start_time <- Sys.time()
  #convert data.frame dataSet in matrix
  dataTrain <- matrix(unlist(dataTrain), ncol=ncol(dataTrain), byrow=FALSE)
  #shuffle dataTrain
  set.seed(seed)
  dataTrain <- dataTrain[sample(nrow(dataTrain)), ]
  set.seed(NULL)
  #initialize theta
  theta <- getTheta(ncol(dataTrain), seed=seed)
  #print(theta[1,1])
  #bind 1 column to dataTrain
  dataTrain <- cbind(1, dataTrain)
  #parse dataTrain into input and output
  inputData <- dataTrain[, 1:(ncol(dataTrain)-1)]
  outputData <- dataTrain[,ncol(dataTrain)]
  #temporary variables
  temporaryTheta <- matrix(ncol=length(theta), nrow=1)
  temporaryTheta2 <- matrix(ncol=length(theta), nrow=1)
  temporaryThetaList <- matrix(ncol=length(theta))
  #updateRule <- matrix(0, ncol=length(theta), nrow=1)
  gradient <- matrix(0, ncol=length(theta), nrow=1)
  #constant variables
  rowLength <- nrow(dataTrain)
  set.seed(seed)
  randRowList <- sample(1:rowLength, innerIter, replace=TRUE)
  set.seed(NULL)
  #loop the gradient descent
  for(iteration in 1:maxIter){

    temporaryTheta <- theta

    error <- (inputData %*% t(temporaryTheta)) - outputData
    for(column in 1:length(temporaryTheta)){
      term <- error * inputData[,column]
      #calculate gradient
      gradient[,column] <- sum(term) / rowLength
      temporaryTheta2[,column] <- temporaryTheta[,column] - (alpha*gradient[,column])
    }
    firstGradient <- gradient

    temporaryThetaList <- temporaryTheta

    iter <- 1

    while(((abs(colSums(gradient)/3))^2) > (gammaS*((abs(colSums(firstGradient)/3))^2)) && iter < innerIter){

      error2 <- (inputData[randRowList[iter],] %*% t(temporaryTheta2)) - outputData[randRowList[iter]]
      error3 <- (inputData[randRowList[iter],] %*% t(temporaryTheta)) - outputData[randRowList[iter]]

      temporaryTheta <- temporaryTheta2

      for(column in 1:length(theta)){
        term2 <- error2 * inputData[randRowList[iter],column]
        term3 <- error3 * inputData[randRowList[iter],column]

        gradient[,column] <- term2 - term3 + gradient[,column]

        temporaryTheta2[1,column] <- temporaryTheta2[1,column] - (alpha*gradient[,column])
      }

      temporaryThetaList <- rbind(temporaryThetaList, temporaryTheta2)

      iter <- iter + 1
    }

    theta[1,] <- temporaryThetaList[iter,]
  }

  result <- theta
  end_time <- Sys.time()
  print(end_time - start_time)
  return(result)
}