TMA4315-Project-1: MULTIPLE REGRESSION MODELS

# Select Build, Build and reload to build and lode into the R-session.

get.signif.code <- function(pvals)
{
  # Use "ifelse" instead of traditional if-else statements since these
  # will parallelize. In the event of many p-values, this will come in handy.
  signif.code <- ifelse(pvals < 0.001, "***",
                        ifelse(pvals < 0.01, "**",
                               ifelse(pvals < 0.05, "*",
                                      ifelse(pvals < 0.1, ".", " "))))
  return(signif.code)
}

mylm <- function(formula, data = list(), contrasts = NULL, ...){
  # Extract model matrix & responses
  mf <- model.frame(formula = formula, data = data)
  X  <- model.matrix(attr(mf, "terms"), data = mf, contrasts.arg = contrasts)
  X.T <- t(X)
  y  <- model.response(mf)
  y.T <- t(y)
  terms <- attr(mf, "terms")

  # Important dimensions
  n <- nrow(X)
  p <- ncol(X)
  k <- ifelse("(Intercept)" %in% colnames(X), p - 1, p)
  # Add code here to calculate coefficients, residuals, fitted values, etc...
  # and store the results in the list est
  xtxinv <- solve(X.T %*% X)
  betahat <- xtxinv %*% X.T %*% y
  sse <- t(y - (X %*% betahat)) %*% (y - (X %*% betahat))

  df <- n - p
  sigmasqhat <- sse / df  # unbiased sigmasq-est

  stderror <- sqrt(sigmasqhat)
  betahatstderrors <- stderror[1,1] * sqrt(diag(xtxinv))

  betahatcov <- sigmasqhat[1,1] * xtxinv
  # z-test of each estimated coefficient
  zhat <- (betahat - 0) / sqrt(diag(betahatcov))
  betahatpvals <- 2 * pnorm(abs(zhat), lower.tail = FALSE)

  yhat <- X %*% betahat
  epsilon <- y - yhat

  iminush <- diag(n) - (X %*% xtxinv %*% t(X)) # I-H
  stdepsilon <- epsilon / (stderror[1,1] * sqrt(diag(iminush)))
  studentepsilon <- stdepsilon * sqrt((n-p-1) / (n-p-stdepsilon^2))
  # anova


  # rsq
  sst <- y.T %*% (diag(n) - 1/n * matrix(1, n, n)) %*% y
  ssr <- sst - sse
  rsq <- ssr / sst
  rsqadj<- 1 - ((n - 1) / (n - p)) * (1 - rsq)
  msqr<-ssr/p
  msqe<-sse/df

  #Chisquare approximation
  chisqhat<-ssr/(sse/(n-p))
  p.val<- pchisq(chisqhat,k, lower.tail = FALSE)

  est <- list(terms = terms,
              model = mf,
              X = X,
              betahat = betahat,
              sse = sse,
              df = df,
              n=n,
              p=p,
              k=k,
              sigmasqhat = sigmasqhat,
              betahatcov = betahatcov,
              stderror = stderror,
              betahatstderrors = betahatstderrors,
              zhat = zhat,
              betahatpvals = betahatpvals,
              yhat = yhat,
              epsilon = epsilon,
              studentepsilon = studentepsilon,
              sst=sst,
              ssr=ssr,
              rsq=rsq,
              rsqadj=rsqadj,
              msqr=msqr,
              msqe=msqe,
              chisqhat=chisqhat,
              p.val=p.val,
              y=y)


  # Store call and formula used
  est$call <- match.call()
  est$formula <- formula

  # Set class name. This is very important!
  class(est) <- 'mylm'

  # Return the object with all results
  return(est)
}

print.mylm <- function(object, ...){
  cat('Call\n')
  print(object$call)
  cat("\n")
  cat('Coefficients:\n')

  output.betahat <- as.vector(object$betahat)
  names(output.betahat) <- rownames(object$betahat)

  print(output.betahat)
}

summary.mylm <- function(object, ...){

  cat("Call:\n")
  print(object$call)
  cat("\n")

  residuals.output.vec <- quantile(object$epsilon)
  names(residuals.output.vec) <- c("Min", "1Q", "Median", "3Q", "Max")

  cat("Residuals:\n")
  print(round(residuals.output.vec, digits = 2))

  cat("\n")

  cat('Coefficients:\n')

  coeff.df <- data.frame("Estimate" = object$betahat)

  coeff.df$`Std. Error` <- object$betahatstderrors
  coeff.df$`z-value` <- object$zhat
  coeff.df$`Pr(>|z|)` <- object$betahatpvals
  coeff.df$` ` <- get.signif.code(object$betahatpvals)

  print(format(coeff.df, digits = 3))
  cat("---\n")
  cat("Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n\n")

  cat(c("Residual standard error: ", format(object$stderror, digits = 4),
        " on ", object$df, " degrees of freedom", "\n"))
  cat(c("Multiple R-squared: ", format(object$rsq, digits = 4),
        ", \t", "Adjusted R-squared: ", format(object$rsqadj, digits = 4)), "\n")
  cat(c("Chi Square Statistic: ", format(object$chisqhat, digits = 4),
        "on ", object$k, " degrees of freedom",",\t",c("Pr(>|X|):",format(object$p.val,digits=4)), "\n"))
}

plot.mylm <- function(object, ...){
  library(ggplot2)

  # ggplot requires that the data is in a data.frame, this must be done here
  df <- data.frame(yhat = object$yhat, epsilon = object$epsilon)
  ggplot(df, aes(x=yhat, y=epsilon)) +
    geom_point() + geom_smooth(method = "loess") +
    labs(x = "Fitted values", y = "Residuals") +
    ggtitle("Residual vs Fitted Values")
}

anova.mylm <- function(object, ...){
  # Code here is used when anova(object) is used on objects of class "mylm"

  # Name of response
  response <- deparse(object$terms[[2]])

  # Components to test
  comp <- attr(object$terms, "term.labels")

  # Add Intercept to the list of covariates IF it is in fact used in the model.
  if ("(Intercept)" %in% colnames(object$X))
  {
    covariates <- c("(Intercept)", comp)
    txtFormula <- paste(response, "~", 1, sep = "")
  }
  else
  {
    covariates <- comp
    txtFormula <- paste(response, "~", covariates[1])
  }

  if (length(covariates) == 1)
  {
    stop("ANOVA can not be calculated because only one covariate was used.")
  }

  formula <- formula(txtFormula)

  # Growing lists in loops is slow in R. As such, we initialize our lists with a length.
  model <- vector("list", length(covariates))
  model[[1]] <- mylm(formula = formula, data = object$model)

  # Again, initialize list with pre-set length so it's slightly faster.
  output.df.list <- vector("list", length(covariates) - 1)

  # Fit the sequence of models
  # Start iterating from 2 because we already fit the first model above.
  for(numCovar in 2:length(covariates)){

    txtFormula <- paste(txtFormula, covariates[numCovar], sep = "+")
    formula <- formula(txtFormula)

    model[[numCovar]] <- mylm(formula = formula, data = object$model)

    model1 <- model[[numCovar - 1]]
    model2 <- model[[numCovar]]

    df <- model1$df - model2$df
    output.df <- data.frame("Df" = df)
    output.df$`Sum Sq` <- as.numeric(model1$sse - model2$sse)
    output.df$`Mean Sq` <- as.numeric(output.df$`Sum Sq` / output.df$Df)
    output.df$`Chisq Value` <- as.numeric(output.df$`Sum Sq` / (object$sse / object$df))
    output.df$`Pr(>X)` <- as.numeric(pchisq(output.df$`Chisq Value`, df = output.df$Df, lower.tail = FALSE))

    output.df.list[[numCovar - 1]] <- output.df
  }

  output.df <- do.call(rbind, output.df.list)


  # Prepare output.df for printing
  output.df$` ` <- get.signif.code(output.df$`Pr(>X)`)
  output.df <- format(output.df, digits = 3)

  # Add line for Residuals:
  residuals.output <- c(object$df, object$sse, object$sse / object$df, NA, NA, NA)
  output.df <- rbind(output.df, round(residuals.output))
  output.df[is.na(output.df)] <- ""

  # Name rows
  row.names(output.df) <- c(comp, "Residuals")

  # Print Analysis of Variance Table
  cat('Analysis of Variance Table\n\n')
  cat(c('Response: ', response, '\n'), sep = '')
  print(format(output.df, digits = 3))
  cat('---\n')
  cat("Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n")
}