knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
options(rmarkdown.html_vignette.check_title = FALSE)

Contents of the package

This package includes two principal functions: GradD() and SteepD(). They implement the Gradient Descent and Steepest Descent optimization algorithms, respectively, and are specifically built to find the $\beta$ parameters of a linear model, i.e. $$ Y=X\beta + \varepsilon $$
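For reference, here is a minimal sketch of the optimization problem, assuming the usual least-squares loss (the loss actually used by the package is implemented in LossD()):

$$ L(\beta) = \lVert Y - X\beta \rVert^2, \qquad \beta^{(t+1)} = \beta^{(t)} - \gamma \, \nabla L\big(\beta^{(t)}\big) = \beta^{(t)} + 2\gamma\, X^\top\big(Y - X\beta^{(t)}\big), $$

where $\gamma$ corresponds to the stepsize argument of GradD().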

library(DescendMethods)

Question 1 - Implement a gradient descent optimization function

To answer this question, we implemented the function GradD(). It takes as input the following arguments (fully explained in the function documentation, accessible with ?GradD):

GradD(data,
      stepsize = 1e-04,
      init = NULL,
      tol = 1e-04,
      maxit = 1000L,
      verb = F,
      check_loss = F)

and returns a list composed of:

- Beta_hat: the estimated $\beta$ coefficients of interest
- Minimum: the value of the loss function at the convergence point (only if verb = TRUE)
- Final_error: the value of the error at the convergence point
- Num_iter: the number of iterations used to reach the minimum
- Time: the time elapsed to perform the optimization (increased by 2 seconds to make it traceable even with small data)

Note that the stopping criterion considers only the absolute variation of the $\beta$ coefficients, which we think can be misleading in certain cases. For this reason we introduced a boolean flag called check_loss. If it is set to TRUE, the absolute difference of the loss function is evaluated instead of that of the parameters.
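As a quick illustration, here is a minimal sketch on a toy simulated dataset, using only the arguments and return values described above:

set.seed(123)
X <- cbind(1, runif(50))                      # toy design matrix with intercept
Y <- X %*% c(2, -1) + rnorm(50, sd = 0.1)     # simulated response
fit <- GradD(data = list(X = X, Y = Y), check_loss = TRUE, maxit = 3000)
fit$Beta_hat                                  # estimated coefficients
fit$Num_iter                                  # iterations used to converge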

Question 2 - Steepest descent method

To answer this question, we implemented the function SteepD(). It takes as input the following arguments (fully explained in the function documentation, accessible with ?SteepD):

SteepD(data,
       init = NULL,
       tol = 1e-04,
       maxit = 1000L,
       verb = F,
       check_loss = F)

and returns a list composed of:

- Beta_hat: the estimated $\beta$ coefficients of interest
- Minimum: the value of the loss function at the convergence point (only if verb = TRUE)
- Final_error: the value of the error at the convergence point
- Num_iter: the number of iterations used to reach the minimum
- Time: the time elapsed to perform the optimization (increased by 2 seconds to make it traceable even with small data)

As for GradD(), since the stopping criterion based on the absolute variation of the $\beta$ coefficients can be misleading in certain cases, the boolean flag check_loss, when set to TRUE, evaluates the absolute difference of the loss function instead of that of the parameters.
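As background on where the two methods differ, under the least-squares loss sketched above the steepest descent step size can be obtained by exact line search along the negative gradient (we state this as the classical formula, not as the exact internals of SteepD()):

$$ \gamma_t = \arg\min_{\gamma \ge 0} L\big(\beta^{(t)} - \gamma\, g_t\big) = \frac{g_t^\top g_t}{2\, g_t^\top X^\top X\, g_t}, \qquad g_t = \nabla L\big(\beta^{(t)}\big), $$

which is consistent with the fact that SteepD() takes no stepsize argument.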

Question 3 - Considerations

To answer this question, we compare the two functions on some examples. We chose to use simulated datasets, dividing the examples according to the amount of data and the dimensionality of the problem.

Low dimensionality

Starting with a small dataset

set.seed(1)
k <- 2
n <- 100
x <- cbind(1, matrix(runif(n*k), nrow = n, ncol = k))
beta <- runif(k+1, -10, 10)
y <- x %*% beta + rnorm(n)
df <- list(X = x, Y = y)

g <- GradD(data = df, verb = F, maxit = 3000)
s <- SteepD(data = df, verb = F)

cat('\n Estimated values - GradD \n')
as.numeric(t(g$Beta_hat))

cat('\n Estimated values - SteepD \n')
as.numeric(t(s$Beta_hat))

cat('\n Real values \n')
beta

cat('\n Elapsed time - GradD \n')
g$Time

cat('\n Elapsed time - SteepD \n')
s$Time

Continuing with a bigger dataset

set.seed(1)
k <- 2
n <- 10^5
x <- cbind(1, matrix(runif(n*k), nrow = n, ncol = k))
beta <- runif(k+1, -10, 10)
y <- x %*% beta + rnorm(n)
df <- list(X = x, Y = y)

g <- GradD(data = df, verb = F, stepsize = 1e-06, maxit = 3000)
s <- SteepD(data = df, verb = F) 

cat('\n Estimated values - GradD \n')
as.numeric(t(g$Beta_hat))

cat('\n Estimated values - SteepD \n')
as.numeric(t(s$Beta_hat))

cat('\n Real values \n')
beta

cat('\n Elapsed time - GradD \n')
g$Time

cat('\n Elapsed time - SteepD \n')
s$Time

High dimensionality

Starting with a small dataset

set.seed(1)
k <- 25
n <- 100
x <- cbind(1, matrix(runif(n*k), nrow = n, ncol = k))
beta <- runif(k+1, -10, 10)
y <- x %*% beta + rnorm(n)
df <- list(X = x, Y = y)

g <- GradD(data = df, verb = F, stepsize = 1e-03, maxit = 3000)
s <- SteepD(data = df, verb = F)

cat('\n Estimated values - GradD \n')
as.numeric(t(g$Beta_hat))

cat('\n Estimated values - SteepD \n')
as.numeric(t(s$Beta_hat))

cat('\n Real values \n')
beta

cat('\n Elapsed time - GradD \n')
g$Time

cat('\n Elapsed time - SteepD \n')
s$Time

Continuing with a bigger dataset

set.seed(1)
k <- 25
n <- 10^5
x <- cbind(1, matrix(runif(n*k), nrow = n, ncol = k))
beta <- runif(k+1, -10, 10)
y <- x %*% beta + rnorm(n)
df <- list(X = x, Y = y)

g <- GradD(data = df, verb = F, stepsize = 1e-06, maxit = 3000)
s <- SteepD(data = df, verb = F) 

cat('\n Estimated values - GradD \n')
as.numeric(t(g$Beta_hat))

cat('\n Estimated values - SteepD \n')
as.numeric(t(s$Beta_hat))

cat('\n Real values \n')
beta

cat('\n Elapsed time - GradD \n')
g$Time

cat('\n Elapsed time - SteepD \n')
s$Time

From these examples it is easy to see that SteepD(), which implements the steepest descent method, is faster than GradD(), which implements the gradient descent method, in every respect.

An important disadvantage of GradD() compared to SteepD() is its dependence on the stepsize argument, to which it is very sensitive. SteepD(), on the contrary, is not affected by this problem.

Hence, SteepD() is not only faster, but also more robust, since its results do not depend on an additional argument (the stepsize) that critically affects the optimization algorithm in the case of GradD().

Benchmark on simulated data

library(microbenchmark)

set.seed(445)
n <- 100
k <- 5
X <- cbind(1, matrix(runif(n*(k-1), 0, 10), nrow = n, ncol = (k-1)))
beta_true <- rnorm(n = k)
y <- X %*% beta_true + rnorm(n)
df <- list(X = X, Y = y)

microbenchmark(
  BFGS = optim(par = rep(1, k), fn = LossD, X = X, Y = y, method = c("BFGS")),
  CG = optim(par = rep(1, k), fn = LossD, X = X, Y = y, method = c("CG")),
  SteepD = SteepD(data = df),
  GradD = GradD(data = df)
)

Question 4 - Parallel cross-validation

The package also provides two functions that perform a cross-validation of the predictive capability of a linear model whose parameters are obtained through an optimization process. They are CrossVD() and PcrossVD(), written following two different flows of operation: sequential and parallel, respectively.

The call for the sequential version is the following one:

CrossVD(data, 
        K = NULL, 
        get_mean = T,
        verb = F,
        OPT, ...)

As usual, all the arguments are explained in the function documentation. Here we just point out that the function takes the name of the optimization function to be used as an argument; at the moment, OPT accepts only GradD and SteepD. Moreover, if get_mean is TRUE, the CV-MSE is returned; otherwise the function returns a vector containing the MSE computed for each fold.
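For clarity, the returned quantities can be written as follows (a sketch in the linear-model setting above, where $F_k$ is the set of indices in fold $k$ and $\hat\beta_{-k}$ are the coefficients estimated without that fold):

$$ \mathrm{MSE}_k = \frac{1}{|F_k|} \sum_{i \in F_k} \big(y_i - x_i^\top \hat\beta_{-k}\big)^2, \qquad \text{CV-MSE} = \frac{1}{K} \sum_{k=1}^{K} \mathrm{MSE}_k. $$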

The call for the parallel version is the following one:

PcrossVD(data,
         K = NULL,
         get_mean = T,
         n_clust = NULL,
         OPT, ...)

The only difference with respect to the previous one is the additional argument n_clust, which sets the number of cores/clusters the user wants to use. In both functions we rely on the map-reduce paradigm; in the parallel version, only the map step has been parallelized.
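To give an idea of the approach, here is a simplified sketch of a parallel map-reduce cross-validation. It is not the actual implementation of PcrossVD(): the fold construction, the helper name cv_mse_sketch and the per-fold MSE computation are illustrative only (the sketch assumes Y is stored as a one-column matrix, as in the examples of this vignette).

library(parallel)

cv_mse_sketch <- function(df, K = 10, n_clust = 2) {
  n <- nrow(df$X)
  # assign each observation to one of K folds
  folds <- split(sample(n), rep(1:K, length.out = n))
  cl <- makeCluster(n_clust)
  on.exit(stopCluster(cl))
  clusterEvalQ(cl, library(DescendMethods))   # make SteepD available on the workers
  # map step (parallelized): one MSE per fold
  mse_per_fold <- parLapply(cl, folds, function(idx, df) {
    train <- list(X = df$X[-idx, , drop = FALSE], Y = df$Y[-idx, , drop = FALSE])
    beta_hat <- SteepD(data = train)$Beta_hat
    mean((df$Y[idx, ] - df$X[idx, , drop = FALSE] %*% beta_hat)^2)
  }, df = df)
  # reduce step: average the fold MSEs
  mean(unlist(mse_per_fold))
}

In PcrossVD() the same idea applies: computing each fold's MSE is the map step, and taking their mean is the reduce step.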

Prediction

The package also provides specific functions for prediction in linear models. PredictD takes as input the estimated coefficients and the new covariates, and returns the pointwise predictions. To emulate the usage of the well-known stats::predict function, the new data to be predicted must be a data.frame. However, no check is done on the order of the variables; the user is therefore expected to use the same ordering of the variables as in the vector of estimated coefficients. MseD takes the estimated coefficients and the data and returns the Mean Squared Error.
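In terms of the underlying computations, the two functions boil down to the following sketch (a toy example; for brevity the new covariates are kept as a matrix here, whereas PredictD expects a data.frame):

set.seed(7)
X <- cbind(1, runif(20), rnorm(20))            # toy design matrix with intercept
Y <- X %*% c(1, 2, -3) + rnorm(20, sd = 0.1)
beta_hat <- SteepD(data = list(X = X, Y = Y))$Beta_hat

# pointwise predictions on new covariates, in the same column order as beta_hat
new_x <- cbind(1, runif(5), rnorm(5))
pred  <- new_x %*% beta_hat                    # conceptually, what PredictD returns

# mean squared error on the observed data
mse <- mean((Y - X %*% beta_hat)^2)            # conceptually, what MseD returns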

Cross Validation

Usage example for leave-one-out cross validation.

set.seed(445)
n <- 1000
x1 <- runif(n)
x2 <- rnorm(n)
x3 <- rnorm(n)
x4 <- rnorm(n)
y <- 1+3.6*x1 + 5.3*x2 + 4.2*x3 + 2.1*x4 + rnorm(n)
x <- cbind(1, x1, x2, x3, x4)
df <- list(X = x, Y = y)

tictoc::tic()
DescendMethods::CrossVD(data = df, OPT = SteepD)
tictoc::toc()

tictoc::tic()
DescendMethods::PcrossVD(data = df, OPT = SteepD)
tictoc::toc()

Usage example for K-fold cross validation.

set.seed(445)
n <- 10000
x1 <- runif(n)
x2 <- rnorm(n)
x3 <- rnorm(n)
x4 <- rnorm(n)
y <- 1+3.6*x1 + 5.3*x2 + 4.2*x3 + 2.1*x4 + rnorm(n)
x <- cbind(1, x1, x2, x3, x4)
df <- list(X = x, Y = y)

tictoc::tic()
CrossVD(data = df, K = 10, OPT = SteepD)
tictoc::toc()

tictoc::tic()
PcrossVD(data = df, K = 10, OPT = SteepD)
tictoc::toc()

The K-fold cross-validation is less computationally expensive than its leave-one-out counterpart, so the fixed costs of parallelization exceed its gains. However, if we increase the dimensionality of the problem, the advantages become clear.

set.seed(445)
n <- 100000
k <- 50
X <- matrix( rnorm( n*(k-1) ), nrow = n, ncol = (k-1))
X <- cbind(1,X)
beta_true <- rnorm(n = k)
y <- X %*% beta_true + rnorm(n)
df <- list(X = X, Y = y)

tictoc::tic()
CrossVD(data = df, K = 10, OPT = SteepD)
tictoc::toc()

tictoc::tic()
PcrossVD(data = df, K = 10, OPT = SteepD)
tictoc::toc()

Examples

Below are some example code chunks in which the functions are used.

Simulated data:

First of all, we test the functions on simulated data in a low-dimensional scenario.

set.seed(1)
n <- 100
k <- 4
x <- cbind(1, matrix(runif(n*(k-1), 0, 10), nrow = n, ncol = (k-1)))
beta <- rnorm(k, 5, 5)
y <- x %*% beta + rnorm(n)
df <- list(X = x, Y = y)
res_D   <- GradD(data = df, verb = F, maxit = 3000)
res_SD  <- SteepD(data = df, verb = F) 
res_D_loss   <- GradD(data = df, verb = F, check_loss = T , maxit = 3000)
res_SD_loss  <- SteepD(data = df, verb = F,  check_loss = T ) 
res_opt <- optim(par = rep(1, 4), fn = LossD, X = x, Y = y, method = c("CG"))


cat('\n Estimated values - GradD \n')
t(res_D$Beta_hat)

cat('\n Estimated values - SteepD \n')
t(res_SD$Beta_hat)

cat('\n Estimated values - GradD - loss \n')
t(res_D_loss$Beta_hat)

cat('\n Estimated values - SteepD - loss \n')
t(res_SD_loss$Beta_hat)

cat('\n Estimated values - optim \n')
t(res_opt$par)

cat('\n Real values \n')
beta

Both stopping criteria arrive at the same results.

Real data

The next experiment uses real data. First of all, we fit a simple linear regression without the intercept.

x <- as.matrix(trees$Height)
y <- trees$Girth
df <- list(X = x, Y = y)

res_D   <- GradD(data = df, verb = F, stepsize = 1e-8 ,maxit = 3000)
res_SD  <- SteepD(data = df, verb = F) 
res_D_loss   <- GradD(data = df, verb = F, stepsize = 1e-8, check_loss = T , maxit = 3000)
res_SD_loss  <- SteepD(data = df, verb = F,  check_loss = T ) 
res_opt <- optim(par = rep(1, ncol(x)), fn = LossD, X = x, Y = y, method = c("BFGS"))
res_lm  <- lm(Girth ~ Height - 1, data = trees)


cat('\n Estimated values - GradD \n')
t(res_D$Beta_hat)

cat('\n Estimated values - SteepD \n')
t(res_SD$Beta_hat)

cat('\n Estimated values - GradD - loss \n')
t(res_D_loss$Beta_hat)

cat('\n Estimated values - SteepD - loss \n')
t(res_SD_loss$Beta_hat)

cat('\n Estimated values - optim \n')
t(res_opt$par)

cat('\n Estimated values - lm \n')
coef(res_lm)

We noticed that GradD is sensitive to the choice of the stepsize. Indeed, if it is not small enough, the error may even explode to $+\infty$.
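This behaviour is expected: for a quadratic loss, gradient descent with a fixed step size converges only when the step size is small relative to the curvature. Stated as background (assuming the least-squares loss sketched earlier):

$$ \gamma < \frac{2}{\lambda_{\max}(H)}, \qquad H = \nabla^2 L(\beta) = 2\,X^\top X, $$

otherwise the iterates, and hence the error, diverge. The relatively large values of the Height covariate inflate $\lambda_{\max}(X^\top X)$, which is why such a small stepsize is needed here.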

Then, we also introduce the intercept.

x <- cbind(1,as.matrix(trees[, -1]))
y <- trees$Girth
df <- list(X = x, Y = y)

res_D   <- GradD(data = df, verb = F, stepsize = 1e-8 ,maxit = 3000)
res_SD  <- SteepD(data = df, verb = F) 
res_D_loss   <- GradD(data = df, verb = F, stepsize = 1e-8, check_loss = T , maxit = 3000)
res_SD_loss  <- SteepD(data = df, verb = F,  check_loss = T ) 
res_opt <- optim(par = rep(1, ncol(x)), fn = LossD, X = x, Y = y, method = c("BFGS"))
res_lm  <- lm(Girth ~ ., data = trees)


cat('\n Estimated values - GradD \n')
t(res_D$Beta_hat)

cat('\n Estimated values - SteepD \n')
t(res_SD$Beta_hat)

cat('\n Estimated values - GradD - loss \n')
t(res_D_loss$Beta_hat)

cat('\n Estimated values - SteepD - loss \n')
t(res_SD_loss$Beta_hat)

cat('\n Estimated values - optim \n')
t(res_opt$par)

cat('\n Estimated values - lm \n')
coef(res_lm)

