R/linreg.R

#' A multiple linear regression function.
#'
#' Multiple linear regression function which returns the coeficients (the regression weights) of the independent variables with their error,
#' t-value and p-value also an a estimate for a residual variance and the degrees of freedom in the model.
#' The description of the linear model can be found here: \url{https://en.wikipedia.org/wiki/Regression_analysis}
#'
#' @field formula: an object of class object, the model to fitted
#' @field data: an data frame, contain the variables in the model
#' @field coefficients: an matrix, regressions coefficients
#' @field fitted_values: an matrix, fitted values
#' @field residuals: an matrix, the residulas
#' @field df: an integer, the degrees of freedom
#' @field residual_var: numeric, the residual variance
#' @field coefficients_var: numeric, the variance of the regression coefficients
#' @field t_values: an matrix, the t-values for each coefficient
#'
#' @import methods
#' @export linreg
#' @exportClass linreg
linreg <- setRefClass("linreg",
                      fields = list(
                        formula = "formula",
                        data = "character",
                        coefficients = "matrix",
                        fitted_values = "matrix",
                        residuals = "matrix",
                        df = "integer",
                        residual_var = "numeric",
                        coefficients_var = "numeric",
                        t_values = "matrix"
                      ),
                      methods = list(

                        ##### initialize all fields #####
                        initialize = function(formula, data) {
                          "Initialize all regression variables"

                          if (class(formula) != "formula") {
                            stop("invalid formula")
                          }
                          if (class(data) != "data.frame") {
                            stop("invalid data, data has to be data.frame")
                          }

                          # independent variables
                          X <- model.matrix(formula, data)
                          # dependent variables
                          y <- data[, all.vars(formula)[1]]

                          # QR decomposition
                          qr_factor <- qr(X)
                          qr_Q <- qr.Q(qr_factor)
                          qr_R <- qr.R(qr_factor)

                          # Regressions coefficients, βˆ = inverse(R) t(Q) yˆ (where X = QR)
                          coefficients <<- solve(qr_R) %*% t(qr_Q) %*% y

                          # Fitted Values, yˆ = X βˆ
                          fitted_values <<- X %*% coefficients

                          # The residuals, eˆ = y − yˆ = y − X βˆ
                          residuals <<- y - fitted_values

                          # The degrees of freedom, df = n − k - 1
                          # n:number of observations, k:number of independent variables
                          df <<- nrow(data) - ncol(X)

                          # The residual variance, σˆ2 = eTe/df
                          residual_var <<- as.numeric((t(residuals) %*% residuals) / df)

                          # The variance of the regression coefficients
                          coefficients_var <<- residual_var * diag(solve(qr_R) %*% solve(t(qr_R)))

                          # The t-values for each coefficient
                          t_values <<- coefficients / sqrt(coefficients_var)

                          formula <<- formula
                          data <<- deparse(substitute(data))
                        },
                        #############################################
                        ##### print function #####
                        print = function() {
                          "Show linreg formula and coefficients of indepentends"

                          cat("Call:\n")
                          cat(paste0("linreg(formula = ", format(formula),
                                     ", data = ", data, ")\n\n"))

                          cat("Coefficients:\n")
                          cat(row.names(coefficients)[1])

                          for (i in 2:(length(coefficients))) {
                            cat(sprintf("%20s", row.names(coefficients)[i]))
                          }
                          cat("\n     ")
                          cat(round(coefficients[1], 3))
                          for (i in 2:(length(coefficients))) {
                            coef <- as.character(round(coefficients[i], 3))
                            cat(sprintf("%20s", coef))
                          }
                        },
                        #############################################
                        ##### plot function #####
                        plot = function() {
                          "Show residuals vs fitted plot and scale-location plot"

                          # data.frame for plot variables
                          plot.df <- data.frame(fitted_values = fitted_values,
                                                residuals = residuals,
                                                standar_residuals = (abs(residuals/(residual_var^(1/2))))^(1/2))

                         # Residuals vs Fitted
                          p1 <- ggplot2::ggplot(plot.df, ggplot2::aes(x = fitted_values, y = residuals)) +
                            ggplot2::theme_classic() +
                            ggplot2::theme(axis.title = ggplot2::element_text(face = "bold"), # axis names size
                                           plot.title = ggplot2::element_text(face = "bold", hjust = 0.5),
                                           panel.border = ggplot2::element_rect(fill = NA, size = 1)) +
                            ggplot2::geom_point() +
                            ggplot2::geom_smooth(method = "loess", se = FALSE, col = "red", lwd = 0.5) +
                            ggplot2::geom_hline(yintercept = 0, linetype = "dotted", color = "grey") +
                            ggplot2::scale_x_continuous(name = paste0("Fitted Values\nlinreg(", format(formula), ")")) +
                            ggplot2::scale_y_continuous(name = "Residuals") +
                            ggplot2::ggtitle("Residuals vs Fitted")

                          # Scale-Location
                          p2 <- ggplot2::ggplot(plot.df, ggplot2::aes(x = fitted_values, y = standar_residuals)) +
                            ggplot2::theme_classic() +
                            ggplot2::theme(axis.title = ggplot2::element_text(face = "bold"), # axis names size
                                           plot.title = ggplot2::element_text(face = "bold", hjust = 0.5),
                                           panel.border = ggplot2::element_rect(fill = NA, size = 1)) +
                            ggplot2::geom_point() +
                            ggplot2::geom_smooth(method = "loess", se = FALSE, col = "red", lwd = 0.5) +
                            ggplot2::scale_x_continuous(name = paste0("Fitted Values\nlinreg(", format(formula), ")")) +
                            ggplot2::scale_y_continuous(name = expression(sqrt(abs("Standar residuals")))) +
                            ggplot2::ggtitle("Scale-Location")

                          gridExtra::grid.arrange(p1, p2)

                        },
                        #############################################
                        ##### resid function #####
                        resid = function() {
                          "Return residuals"
                          return(as.vector(residuals))
                        },
                        #############################################
                        ##### pred function #####
                        pred = function() {
                          "Return fitted values"
                          return(as.vector(fitted_values))
                        },
                        #############################################
                        ##### coef function #####
                        coef = function() {
                          "Return coeficients"
                          coef <- as.vector(coefficients)
                          names(coef) <- row.names(coefficients)
                          return(coef)
                        },
                        #############################################
                        ##### summary function #####
                        summary = function() {
                          "Show resituals, coefficients with their estimate, standard error, t-value and p-value"
                          ##### Calls #####
                          cat("Call:\n")
                          cat(paste0("linreg(formula = ", format(formula),
                                     ", data = ", data, ")\n\n"))
                          #############################################
                          ##### Residuals #####
                          cat("Residuals:\n")
                          residual_quan <- round(quantile(residuals), 3)
                          cat(format(c("Min", "1Q", "Median", "3Q", "Max"), width = 10, justify = "right"))
                          cat("\n")
                          cat(format(residual_quan, width = 10, justify = "right"))
                          #############################################
                          ##### Coefficients #####
                          cat("\n")
                          cat("\nCoefficients:\n")

                          estimate <- round(coefficients, 3)
                          std_error <- round(sqrt(as.numeric(coefficients_var)), 3)
                          t_value <- round(t_values, 3)
                          p_value <- round(2*pt(-abs(t_values), df = nrow(residuals)-1), 6)

                          # signif marks
                          signif <- c()
                          for (i in p_value) {
                            if (i < 0.001) {
                              signif <- c(signif, "***")
                              next()
                            }
                            if (i > 0.001 & i < 0.01) {
                              signif <- c(signif, "**")
                              next()
                            }
                            if (i > 0.01 & i < 0.05) {
                              signif <- c(signif, "*")
                              next()
                            }
                            if (i > 0.05 & i < 0.1) {
                              signif <- c(signif, ".")
                              next()
                            } else {
                              signif <- c(signif, " ")
                            }
                          }

                          coef_df <- data.frame(estimate, std_error, t_value, p_value, signif)
                          coef_df$signif <- as.character(coef_df$signif)
                          width <- max(nchar(row.names(coef_df)))
                          cat("             ")
                          cat(format(c("Estimate", "Std. Error", "t value", "Pr(>|t|)"), width = width, justify = "right"))

                          cat("\n")

                          for (i in 1:nrow(coef_df)) {
                            cat(format(c(rownames(coef_df)[i], coef_df[i, ]), width = width, justify = "right"))
                            cat("\n")
                          }

                          #############################################
                          ##### Residual standard error #####
                          cat("\n")
                          cat(paste0("Residual standard error: ",
                                     round(sqrt(residual_var), 3),
                                     " on ", df, " degrees of freedom \n"))
                          #############################################

                        }

                      )
)

#' @importFrom ggplot2 ggplot
#' @importFrom gridExtra grid.arrange
NULL
shihs/LiUAdRLab4 documentation built on May 9, 2019, 8:19 a.m.