# Configure knitr chunk output: collapse source and output into one block,
# and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(linr)
Example for Simple Linear Regression with Big Data
Here we simulate a simple linear regression with a sample size of 5,000,000.
# Simulate a simple linear regression with n = 5,000,000 observations,
# then fit it with both linr() and lm() for the comparisons below.
n.sim <- 5000000L
y.big <- rnorm(n.sim)
x.big <- rnorm(n.sim)
linr.fit <- linr(y.big ~ x.big)
lm.fit <- lm(y.big ~ x.big)
# NOTE(review): `sum` shadows base::sum; the name is kept because the
# correctness checks below reference it.
sum <- summary(lm.fit)
We then test the correctness of the `linr` function against the `lm`
function.
# Testing Correctness: every statistic produced by linr() must agree with
# the corresponding value from lm()/summary().
# Each all.equal() is wrapped in isTRUE(): on a mismatch all.equal()
# returns a character description, which would make all() error instead
# of returning FALSE.
all(c(
  isTRUE(all.equal(lm.fit$coefficients, linr.fit$coefficients)),
  isTRUE(all.equal(lm.fit$fitted.values, linr.fit$fitted.values)),
  isTRUE(all.equal(lm.fit$residuals, linr.fit$residuals)),
  isTRUE(all.equal(linr.fit$F_statistic, sum$fstatistic[[1]])),
  isTRUE(all.equal(linr.fit$R.square, sum$r.squared)),
  isTRUE(all.equal(linr.fit$Adj.R.square, sum$adj.r.squared)),
  isTRUE(all.equal(linr.fit$std.error, sum$coefficients[, 2])),
  isTRUE(all.equal(linr.fit$T_statistic, sum$coefficients[, 3])),
  isTRUE(all.equal(linr.fit$p_value.T, sum$coefficients[, 4]))
))
We then test the efficiency of the `linr` function when
# Testing Efficiency: time each fitting method on the same big-data
# simple linear regression.
time.lm <- system.time(lm(y.big ~ x.big))                         # lm()
time.linr <- system.time(linr(y.big ~ x.big))                     # default method (Cholesky)
time.linr.qr <- system.time(linr(y.big ~ x.big, method = "qr"))   # QR decomposition
time.linr.svd <- system.time(linr(y.big ~ x.big, method = "svd")) # SVD decomposition
time.linr.cholesky <- system.time(
  linr(y.big ~ x.big, method = "cholesky")                        # Cholesky decomposition
)
# Report elapsed (wall-clock) times, i.e. element 3 of proc_time.
data.frame(elapsed_time = c(
  lm = time.lm[3],
  linr = time.linr[3],
  linr.qr = time.linr.qr[3],
  linr.svd = time.linr.svd[3],
  linr.cholesky = time.linr.cholesky[3]
))
As shown in the above result, all methods perform with high efficiency when conducting simple linear regression on big data.
Example for Multiple Linear Regression with Big Data
Here we simulate a multiple linear regression with a sample size of 10,000 and 500 covariates.
# Simulate a multiple linear regression design: n observations, p covariates
# with pairwise correlation rho, of which only the first q have nonzero
# effects. Dimensions may be overridden from the command line as: n p q rho.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) == 0) {
  n <- 10000L
  p <- 500L
  q <- 10L
  rho <- 0.99
} else {
  n <- as.integer(args[1])
  p <- as.integer(args[2])
  q <- as.integer(args[3])
  rho <- as.numeric(args[4])
}
# Equicorrelated design: independent column noise plus a row-common factor
# (the length-n draw is recycled column-wise across all p columns).
X.big <- matrix(rnorm(p * n, sd = sqrt(1 - rho)), nrow = n, ncol = p) +
  matrix(rnorm(n, sd = sqrt(rho)), nrow = n, ncol = p)
beta <- c(rep(c(1, -1), length = q), rep(0, length = p - q)) # True coefficients
epsi <- rnorm(n, sd = 1)                                     # True error term
Y.big <- X.big %*% beta + epsi
We then test the correctness of the `linr` function against the `lm`
function.
# Testing Correctness: fit the same MLR with each linr method and with lm(),
# then check that all reported quantities agree.
linr.fit.mul <- linr(Y.big ~ X.big)
linr.fit.mul.qr <- linr(Y.big ~ X.big, method = "qr")
linr.fit.mul.svd <- linr(Y.big ~ X.big, method = "svd")
lm.fit.mul <- lm(Y.big ~ X.big)
sum.mul <- summary(lm.fit.mul)
# isTRUE() wrapping: all.equal() returns a character description on
# mismatch, which would make all() misbehave instead of returning FALSE.
all(c(
  # BUG FIX: the original compared lm.fit.mul$coefficients with itself,
  # so the default-method coefficients were never actually checked.
  isTRUE(all.equal(linr.fit.mul$coefficients, lm.fit.mul$coefficients)),
  isTRUE(all.equal(linr.fit.mul.qr$coefficients, lm.fit.mul$coefficients)),
  isTRUE(all.equal(linr.fit.mul.svd$coefficients, lm.fit.mul$coefficients)),
  isTRUE(all.equal(lm.fit.mul$fitted.values, linr.fit.mul$fitted.values)),
  isTRUE(all.equal(lm.fit.mul$residuals, linr.fit.mul$residuals)),
  isTRUE(all.equal(linr.fit.mul$F_statistic, sum.mul$fstatistic[[1]])),
  isTRUE(all.equal(linr.fit.mul$R.square, sum.mul$r.squared)),
  isTRUE(all.equal(linr.fit.mul$Adj.R.square, sum.mul$adj.r.squared)),
  isTRUE(all.equal(linr.fit.mul$std.error, sum.mul$coefficients[, 2])),
  isTRUE(all.equal(linr.fit.mul$T_statistic, sum.mul$coefficients[, 3])),
  isTRUE(all.equal(linr.fit.mul$p_value.T, sum.mul$coefficients[, 4]))
))
We then test the efficiency of the `linr` function when
# Testing Efficiency: time each linr method and lm() on the MLR problem.
MLR.time.linr <- system.time(linr(Y.big ~ X.big))                     # default (Cholesky)
MLR.time.linr.qr <- system.time(linr(Y.big ~ X.big, method = "qr"))   # QR decomposition
MLR.time.linr.svd <- system.time(linr(Y.big ~ X.big, method = "svd")) # SVD decomposition
MLR.time.lm <- system.time(lm(Y.big ~ X.big))
# BUG FIX: the original swapped the lm and linr timings in the labels
# (lm = MLR.time.linr[3], linr = MLR.time.lm[3]), misreporting both rows.
data.frame(elapsed_time = c(
  lm = MLR.time.lm[3],
  linr = MLR.time.linr[3],
  linr.qr = MLR.time.linr.qr[3],
  linr.svd = MLR.time.linr.svd[3]
))
As shown in the above result, the Cholesky decomposition method (the default, also selectable with `method = "cholesky"`) performs with very high efficiency compared to all other methods.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.