#' @title Function to Fit Linear Models with High Efficiency
#'
#' @description "linr" is used to fit a linear model. In this function, linear regression can be done by three matrix decomposition methods, which are the QR decomposition, Cholesky decomposition and the singular value decomposition (SVD). The defalt fitting method used is the Cholesky decomposition method. All three decomposition methods can fit linear model with high efficiency.
#'
#' @details See {\code{vignette("Intro_to_linr", package = "linr")}} for an overview of the package. Also see {\code{vignette("Efficiency_tests", package = "linr")}} for efficiency testing of linr.
#'
#' @param formula an object of class 'formula' (or one that can be coerced to that class): a symbolic description of the model to be fitted. (e.g. Y ~ X + Z, Y is the outcome, X and Z are predictors)
#' @param data an optional data frame, list or environment containing the variables in the model. If not found in data, the variables are defultly taken from formula (environment), typically the environment from which lm is called.
#' @param method an optional character string specifying the fitting method of matrix decomposition. It must be one of the strings "qr", "cholesky", or "svd".
#'
#' @return linr returns an object of class "linr", a list containing at least the following components:
#' \itemize{
#' \item {Call} - {The fitted linear model fomula and the corresponding data.}
#' \item {coefficients} - {A vector of coefficients estimates. Containing the estimated regression parameters for intercept and each covariates.}
#' \item {fitted.values} - {The fitted mean values.}
#' \item {residuals} - {A vector of the difference between the observed value and the fitted mean values for that observation}
#' \item {MSE} - {The residual standard error, the square root of the residual sum of squares divided by the residual degrees of freedom. It is a measure used to assess how well a linear regression model fits the data.}
#' \item {R.square} - {The coefficient determination, which indicates fraction of variance explained by the fitted model.}
#' \item {Adj.R.square} - {A modified version of R-squared that has been adjusted for the number of predictors in the model.}
#' \item {std.error} - {A vector of standatd error corresponds to each estimated coefficient.}
#' \item {T_statistic} - {A vector of T-statistic corresponds to each estimated coefficient.}
#' \item {p_value.T} - {The p-value (two-sided) for the T-statistic}
#' \item {F_statistic} - {F-statistic, The ratio of the mean regression sum of squares divided by the mean error sum of squares.}
#' \item {p_value.F} - {The p-value for the F-statistic}
#' }
#' @seealso Useful links:
#' \itemize{
#' \item {Github page} {\url{https://github.com/SelinaSong0412/linr}}
#' \item {Report bug at} {\url{https://github.com/SelinaSong0412/linr/issues}}
#' }
#'
#' @examples
#' # you can fit linear model by existing datasets
#' Linear_model.linr = linr(dist~speed, data=cars)
#' print(Linear_model.linr$Call) # the fitted model
#' print(Linear_model.linr$coefficients) # the coeeficient estimates
#' head(Linear_model.linr$residuals) # the first 6 residuals
#' head(Linear_model.linr$fitted.values) # the first 6 fitted values
#'
#'
#' # you can also use formula and predefined outcome and predictor to fit
#' y = rnorm(300)
#' x = matrix(rnorm(600), 300, 2)
#' model.linr = linr(y ~ x)
#' print(model.linr$Call) # the fitted model
#' print(model.linr$MSE) # the mean square error of the fit
#' print(model.linr$std.error) # the standard errer of estimates
#' print(model.linr$T_statistic) # the T statistics of estimates
#' print(model.linr$p_value.T) # the p value of T-test
#' print(model.linr$F_statistic) # the F statistics of estimates
#' print(model.linr$p_value.F) # the p value of F-test
#' print(model.linr$R.square) # The coefficient determination
#' print(model.linr$Adj.R.square) # The adjusted coefficient determination
#'
#' # you can use 3 type of fitting methods as follows
#' Y = rnorm(100)
#' X = matrix(rnorm(600), 100, 6)
#' fit1 = linr(Y ~ X, method = "qr")
#' fit2 = linr(Y ~ X, method = "svd")
#' fit3 = linr(Y ~ X, method = "cholesky")
#'
#' @importFrom methods hasArg
#' @importFrom stats na.omit
#' @importFrom stats pf
#' @importFrom stats pt
#' @importFrom stats model.frame
#'
#' @docType package
#' @name linr
#' @export
linr <- function(formula, data, method = "cholesky") {
cl = match.call()
if (hasArg(data)) {
mf = model.frame(formula, na.omit(data))
} else {
mf = model.frame(formula)
}
# Defining the outcome matrix Y and the Observation matrix X.
Y = as.matrix(mf[, 1])
X = as.matrix(mf[, -1])
colnames(X) = NULL
rownames(X) = NULL
if (!is.numeric(Y)) {
stop("The outcomes should be numeric")
} else if (dim(Y)[2] != 1) {
stop("Please input valid regression outcome")
}
# Checking dimension of X. If wrong, return error.
if (nrow(X) < ncol(X)) {
stop("Number of observations is less than predictors.")
}
n = nrow(Y)
p = ncol(X) + 1
if (p == 2L) { # High Efficient Simple Linear Regression.
x = as.vector(X) - mean(X)
y = as.vector(Y) - mean(Y)
SSX = sum(x * x)
SSY = sum(y * y)
SSXY = sum(x * y)
R.XY = SSXY / sqrt(SSY * SSX) # Corr(X, Y)
betahat1 = SSXY / SSX
betahat0 = mean(Y) - betahat1 * mean(X)
betahat = c(betahat0, betahat1) # Coefficient estimates
fit.val = betahat0 + betahat1 * as.vector(X) # Fitted values
residuals = as.vector(Y) - fit.val # Residuals
SSE = sum(residuals ^ 2)
MSE = SSE / (n - 2) # Mean Square Error
SE.betahat1 = sqrt(MSE / SSX)
SE.betahat0 = sqrt(MSE * (1 / n + mean(X) ^ 2 / SSX))
SE.betahat = c(SE.betahat0, SE.betahat1) # Coefficient Standard Error
}
else { # High Efficient Multiple Linear Regression partly wrote with Rcpp
X = cbind(rep(1, n), X)
if (method != "svd" & method != "qr" & method != "cholesky") {
stop(gettextf("method = '%s' is not supported. Using 'qr', 'svd' or 'cholesky'", method), domain = NA)
}
else if (method == "svd") { # if using SVD decomposition method
svd_decom = svd(X)
tuy = crossprod(svd_decom$u, Y)
betahat = crossprod(t(svd_decom$v), (tuy / svd_decom$d))
}
else if (method == "qr") { # if using QR decomposition method
res_qr <- qr(X, lapack=TRUE)
betahat = qr.coef(res_qr, Y)
}
else { # Else, do Cholesky decomposition as defult.
XtX = crossprod(X)
XtY = crossprod(X,Y)
R = chol(XtX)
z = forwardsolve(R, XtY, upper.tri = TRUE, transpose = TRUE)
betahat = backsolve(R, z)
}
fit.val = as.vector(crossprod(t(X), betahat)) # Fitted value
residuals = as.vector(Y) - fit.val # Residuals
SSE = sum(residuals ^ 2) # residual sum of square
SSY = sum((Y - mean(Y)) ^ 2) # total sum of square
MSE = SSE / (n - p) # mean square error
varmat = solve(crossprod(X), diag(MSE, p, p), tol = 0, transpose = FALSE)
SE.betahat = sqrt(diag(varmat)) # standard error of estimates
betahat = as.vector(betahat)
}
R.square = 1 - SSE / SSY # Coefficient determination R^2
Adj.R.square = 1 - (n - 1) * MSE / SSY # Adjusted coefficient determination R_adj^2
T.stat = betahat / SE.betahat # T statistics
p.val.T = pt(abs(T.stat), n - p, lower.tail = FALSE) * 2 # p value of T test
F.stat = (SSY - SSE) / (p - 1) / MSE # F statistics
p.val.F = pf(F.stat, (p - 1), (n - p), lower.tail = FALSE)# p value of F test
# Give names to variables and observations.
vars = colnames(mf)[-1]
if (hasArg(data)) { # If have input data
variab = c("(Intercept)", vars)
observ = rownames(mf)
} else { # If not have input data
if (p > 2) {
variab = c("(Intercept)", paste0(vars, 1:(p - 1)))
} else {
variab = c("(Intercept)", vars)
}
observ = 1:n
}
names(betahat) = variab
names(fit.val) = observ
names(residuals) = observ
names(SE.betahat) = variab
names(T.stat) = variab
names(p.val.T) = variab
fit = list(Call = cl, coefficients = betahat, fitted.values = fit.val,
residuals = residuals, MSE = MSE, std.error = SE.betahat,
R.square = R.square, Adj.R.square = Adj.R.square,
T_statistic = T.stat, p_value.T = p.val.T,
F_statistic = F.stat, p_value.F = p.val.F)
class(fit) = "linr"
return(fit)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.