R/regression.R

Defines functions logisticregsimulate logisticregsummary reg_crossval regsimulate pred_interval_reg regsummary

Documented in logisticregsimulate logisticregsummary pred_interval_reg reg_crossval regsimulate regsummary

#' Summarize the results from a regression analysis
#'
#' Alternative to `summary.lm` to summarize a regression from `lm`.
#' Prints a table similar to the one generated by SAS and Minitab.
#' @param lmobject a fitted regression model from `lm`.
#' @param anova `TRUE` if an ANOVA table is computed.
#' @param fit_measures `TRUE` if measures of fit (R² etc) is computed.
#' @param param `TRUE` if parameter estimates, standard errors etc is computed.
#' @param conf_intervals `TRUE` if confidence intervals for parameters.
#' @param vif_factors `TRUE` if variance inflation factors are to be printed.
#' @return list with three tables: param, anova and fit_measures
#' @export
#' @examples
#' library(regkurs)
#' lmfit = lm(nRides ~ temp + hum + windspeed, data = bike)
#' regsumm = regsummary(lmfit, anova = T, conf_intervals = T, vif_factors = T)
#' regsumm$param
#' regsumm$anova
#' regsumm$fit_measures
regsummary <- function(lmobject, anova = T,  fit_measures = T, param = T,
                       conf_intervals = F, vif_factors = F){

  if ("(Intercept)" %in% names(lmobject$coefficients)) intercept = 1 else intercept = 0

  lmsummary = summary(lmobject)
  df_regr = lmsummary$df[1] - intercept
  df_error = lmsummary$df[2]
  df_total = df_error + df_regr
  sse = (lmsummary$sigma^2)*df_error
  sst = var(lmobject$model[,1])*df_total
  ssr = sst - sse

  # Anova table
  anova_table = NA
  if (anova){
    anova_table = matrix(rep(NA,5*3),3,5)
    anova_table[,1] = c(df_regr,df_error,df_total)
    anova_table[,2] = c(ssr,sse,sst)
    anova_table[1:2,3] = c(ssr/df_regr, sse/df_error)
    anova_table[1,4] = lmsummary$fstatistic[1]
    anova_table[1,5] = pf(lmsummary$fstatistic[1], lmsummary$fstatistic[2], lmsummary$fstatistic[3], lower.tail = F)
    rownames(anova_table) <- c("Regr","Error","Total")
    colnames(anova_table) <- c("df","SS","MS","F","Pr(>F)")
    {cat("\nAnalysis of variance - ANOVA\n------------------------------------------------\n");
    print(anova_table, digits = 5, na.print = "")}
  }

  fit_table = NA
  if (fit_measures){
    fit_table = c(sqrt(sse/df_error), lmsummary$r.squared, lmsummary$adj.r.squared)
    names(fit_table) <- c("Root MSE","R2","R2-adj")
    {cat("\nMeasures of model fit\n------------------------------------------------\n");
    print(fit_table, digits = 5, na.print = "")}
  }

  # Table with estimated coefficients etc
  if (param){
    # Confidence intervals on parameters
    if (conf_intervals){
      param_table = cbind(lmsummary$coefficients, confint(lmobject))
    }else
    {
      param_table = lmsummary$coefficients
    }

    # Variance inflation factors
    if (vif_factors & df_regr>1){
      data = model.matrix(lmobject)
      X = data[,-1]
      vif = rep(NA,df_regr)
      for (j in 1:df_regr){
        vif[j] = 1/(1-summary(lm(X[,j] ~ data.matrix(X[,-j])))$r.squared)
      }
      if (intercept) vif = c(NA,vif)
      param_table = cbind(param_table,vif)
      colnames(param_table)[ncol(param_table)] = "VIF"
    }

    {cat("\nParameter estimates\n------------------------------------------------\n");
    print(param_table, digits = 5, na.print = "")}

  }else{param_table = NA}

  invisible(list(param = param_table, anova = anova_table, fit_measures = fit_table))
}


#' Plot prediction intervals for simple linear regression \cr
#'
#' @param lmobject a fitted regression model from `lm`.
#' @param conf_int_line if TRUE, then conf intervals for regression line are plotted.
#' @param pred_interval if TRUE, then prediction intervals are plotted.
#' @param level confidence level, default is level = 0.95
#' @param ngrid number of grid points over which the intervals are computed
#' @return data frame with confidence and prediction intervals over xGrid
#' @export
#' @examples
#' library(regkurs)
#' lmfit = lm(mpg ~ hp, data = mtcars)
#' res = pred_interval_reg(lmfit)
pred_interval_reg <- function(lmobject, conf_int_line = T, pred_interval = T,
                              level = 0.95, ngrid = 100){
  y = lmobject$model[,1]
  x = lmobject$model[,2]
  yname = names(lmobject$model)[1]
  xname = names(lmobject$model)[2]
  xmingrid = min(x) - 0.01*(max(x)-min(x))
  xmaxgrid = max(x) + 0.01*(max(x)-min(x))
  datagrid = data.frame(seq(xmingrid, xmaxgrid, length = ngrid))
  names(datagrid) <- xname
  CI =  predict.lm(lmobject, newdata = datagrid,
                   interval = "confidence", level = level)
  PI =  predict.lm(lmobject, newdata = datagrid,
                   interval = "prediction", level = level)

  plot(x, y, type="n", xlab = xname, ylab = yname, cex = 0.7,
       ylim = c(min(CI,PI), max(CI,PI)), xlim = c(xmingrid,xmaxgrid),
       main = "Regression - Confidence and Prediction intervals"
  )
  if (pred_interval){
    polygon(c(datagrid[,1], rev(datagrid[,1])), c(PI[,2], rev(PI[,3])),
            col = rgb(173/255, 216/255, 230/255, 0.4), border = 0)
  }
  if (conf_int_line){
    lines(datagrid[,1], CI[,2], col = rgb(160/255, 32/255, 240/255), lwd = 2)
    lines(datagrid[,1], CI[,3], col = rgb(160/255, 32/255, 240/255), lwd = 2)
  }
  lines(c(xmingrid, xmaxgrid),
        c(lmobject$coef[1] + lmobject$coef[2]*xmingrid,
          lmobject$coef[1] + lmobject$coef[2]*xmaxgrid),
        col = rgb(0/255, 0/255, 139/255), lwd = 3)
  points(x, y, pch = 19, cex = 0.7)

  legend(x = "topright", inset=.05,
         legend = c("Data", "Regression line","C.I.", "P.I."),
         pch = c(19,NA,NA,NA), pt.lwd = c(1,3,2,2), lty = c(0,1,1,0),
         fill = c(NA,NA,NA, rgb(173/255, 216/255, 230/255, 0.4)),
         border = c(0,0,0,0),
         col = c("black",
                 rgb(0/255, 0/255, 139/255),
                 rgb(160/255, 32/255, 240/255),
                 rgb(173/255, 216/255, 230/255, 0.4)),
         box.lty=1
  )


  # Table with estimated coefficients etc
  if (param){
    # Confidence intervals on parameters
    if (conf_intervals){
      param_table = cbind(glmsummary$coefficients, suppressMessages(confint(glmobject)))
    }else
    {
      param_table = glmsummary$coefficients
    }

    # Variance inflation factors
    if (vif_factors & k>1){
      vif = rep(NA,k)
      for (j in 1:k){
        vif[j] = 1/(1-summary(lm(X[,j] ~ data.matrix(X[,-j])))$r.squared)
      }
      if (intercept) vif = c(NA,vif)
      param_table = cbind(param_table,vif)
      colnames(param_table)[ncol(param_table)] = "VIF"
    }

    {cat("\nParameter estimates\n------------------------------------------------\n");
      print(param_table, digits = 5, na.print = "")}

  }else{param_table = NA}



  # Table with odds ratios etc
  if (odds_ratio){
    # Confidence intervals on parameters
    if (conf_intervals){
      odds_ratio_table = cbind(exp(glmsummary$coef[,1:2]),glmsummary$coef[,3:4], exp(suppressMessages(confint.default(glmobject))))
    }else
    {
      odds_ratio_table = cbind(exp(glmsummary$coef[,1:2]),glmsummary$coef[,3:4])
    }

    # Variance inflation factors
    if (vif_factors & k>1){
      vif = rep(NA,k)
      for (j in 1:k){
        vif[j] = 1/(1-summary(lm(X[,j] ~ data.matrix(X[,-j])))$r.squared)
      }
      if (intercept) vif = c(NA,vif)
      odds_ratio_table = cbind(odds_ratio_table,vif)
      colnames(odds_ratio_table)[ncol(odds_ratio_table)] = "VIF"
    }

    {cat("\nOdds ratio estimates\n------------------------------------------------\n");
      print(odds_ratio_table, digits = 5, na.print = "")}

  }else{odds_ratio_table = NA}


  invisible(list(param = param_table, odds_ratio = odds_ratio_table))

  return(data.frame(xgrid = datagrid[,1],
                    CI_low = CI[,2], CI_high = CI[,3],
                    PI_low = PI[,2], PI_high = PI[,3]
  )
  )

}

#' Simulate from a linear regression model
#'
#' Simulates a dataset with `n` observation from the linear regression model
#' \deqn{y = \beta_0 + \beta_1 x_1 + \ldots + \beta_k x_k + \epsilon, \epsilon \sim N(0, \sigma_\epsilon^2)}{y = \beta_0 + \beta_1 * x_1 + ... + \beta_k * x_k + \epsilon, \epsilon ~ N(0, \sigma_ \epsilon^2)}
#' with covariates (x) simulated from a normal distribution with the same correlation `rho_x` \cr
#' between all pairs of covariates. Covariate \eqn{x_j}{x_j} has standard deviation `sigma_x[j]`. \cr
#' Alternatively the covariate can follow a uniform distribution.
#' @param n the number of observations in the simulated dataset.
#' @param betavect a vector with regression coefficients
#' c(beta_0,beta_1,...beta_k). First element is intercept if `intercept = TRUE`
#' @param sigma_eps standard deviation of the error terms, epsilon.
#' @param intercept if `TRUE` an intercept is added to the model.
#' @param covdist distribution of the covariates. Options: `'normal'` or `'uniform'`.
#' @param rho_x correlation among the covariates. Same for all covariate pairs.
#' @param sigma_x vector with standard deviation of the covariates.
#' @return dataframe with simulated data (y, X1, X2, ..., XK) (no intercept included).
#' @export
#' @examples
#' library(regkurs)
#' simdata <- regsimulate(n = 500, betavect = c(1, -2, 1, 0), sigma_eps = 2)
#' lmfit <- lm(y ~ X1 + X2 + X3, data = simdata)
#' regsummary(lmfit, anova = F)
regsimulate <- function(n, betavect, sigma_eps, intercept = TRUE, covdist = 'normal',
                        rho_x = 0, sigma_x = rep(1,length(betavect)-intercept)){

  k = length(betavect) - intercept

  # Generate covariates
  if (covdist == 'normal'){
    rho = matrix(rho_x, k, k)
    diag(rho) <- 1
    sigma = diag(sigma_x)%*%rho%*%diag(sigma_x)
    X = mvtnorm::rmvnorm(n, sigma = sigma)
  }else{
    X = matrix(runif(n*k), n, k)
    if (rho_x != 0) warning("uniformly distributed covariates are always uncorrelated")
  }
  if (intercept) X = cbind(1,X)

  # Simulate responses
  y = X%*%betavect + rnorm(n, sd = sigma_eps)

  if (intercept) X = X[,-1] # remove intercept in the returned dataset
  data = data.frame(cbind(y,X))
  xnames = rep(NA,k)
  for (j in 1:k) xnames[j] = paste("X",j, sep = "")
  names(data) <- c("y", xnames)
  return(data)
}

#' K-fold cross-validation of regression models estimated with lm()
#'
#' @param formula an object of class "formula": a symbolic description of the model to be fitted.
#' @param data a data frame with the data used for fitting the models.
#' @param nfolds the number of folds in the cross-validation.
#' @param obs_order order of the observations when splitting the data. obs_order = "random" gives a random order.
#' @return RMSE Root mean squared prediction error on test data
#' @export
#' @examples
#' library(regkurs)
#' RMSE_CV = reg_crossval(mpg ~ hp, data = mtcars, nfolds = 4, obs_order = 1:32)
#' print(RMSE_CV)
reg_crossval <- function(formula, data, nfolds, obs_order = "random"){

  n = dim(data)[1]
  if (is.character(obs_order)) obs_order = sample(1:n)

  obs_per_fold = ceiling(n/nfolds)
  yhat = matrix(NA, obs_per_fold, nfolds)
  if (n %% nfolds == 0){
    test_obs_matrix = matrix(obs_order, obs_per_fold) # k:th column contains test for fold k
  }else{
    nobs_last_fold = n-obs_per_fold*(nfolds-1)
    test_obs_matrix = matrix(NA, obs_per_fold, nfolds)
    test_obs_matrix[,1:(nfolds-1)] = matrix(obs_order[obs_per_fold*(nfolds-1)], obs_per_fold)
    test_obs_matrix[1:nobs_last_fold, nfolds] = obs_order[(obs_per_fold*(nfolds-1)+1):n]
  }
  for (k in 1:nfolds){
    testfold = test_obs_matrix[,k][!is.na(test_obs_matrix[,k])]
    trainingfold = setdiff(obs_order,testfold)
    fit = lm(formula, data = data[trainingfold,])
    yhat[1:length(testfold),k] = predict(fit, newdata = data[testfold,])
  }
  yhat = c(yhat)[!is.na(c(yhat))]
  yordered = data[all.names(formula)[2]][obs_order,]
  RMSE = sqrt(sum((yordered - yhat)^2)/n)
}


#' Summarize the results from a logistic regression analysis
#'
#' Alternative to `summary.glm` to summarize a regression from `glm`.
#' Prints a table similar to the one generated by SAS and Minitab.
#' @param glmobject a fitted regression model from `glm`.
#' @param odds_ratio `TRUE` if odds ratios for parameters is computed.
#' @param param `TRUE` if parameter estimates, standard errors etc is computed.
#' @param conf_intervals `TRUE` if confidence intervals for parameters.
#' @return list with two tables: param, odds_ratio
#' @export
#' @examples
#' library(regkurs)
#' glmfit <- glm(survived ~ age + sex + firstclass, data = titanic, family = binomial)
#' logisticregsummary(glmfit)
logisticregsummary <- function(glmobject, odds_ratio = T, param = T, conf_intervals = F){

  if ("(Intercept)" %in% names(glmobject$coefficients)) intercept = 1 else intercept = 0

  glmsummary = summary(glmobject)
  data = model.matrix(glmobject)
  X = data[,-1]
  k = ncol(X) # number of covariates, excluding intercept

  # Table with estimated coefficients etc
  if (param){
    # Confidence intervals on parameters
    if (conf_intervals){
      param_table = cbind(glmsummary$coefficients, suppressMessages(confint(glmobject)))
    }else
    {
      param_table = glmsummary$coefficients
    }

    {cat("\nParameter estimates\n------------------------------------------------\n");
      print(param_table, digits = 5, na.print = "")}

  }else{param_table = NA}



  # Table with odds ratios etc
  if (odds_ratio){
    # Confidence intervals on parameters
    if (conf_intervals){
      odds_ratio_table = cbind(exp(glmsummary$coef[,1:2]),glmsummary$coef[,3:4], exp(suppressMessages(confint.default(glmobject))))
    }else
    {
      odds_ratio_table = cbind(exp(glmsummary$coef[,1:2]),glmsummary$coef[,3:4])
    }

    {cat("\nOdds ratio estimates\n------------------------------------------------\n");
      print(odds_ratio_table, digits = 5, na.print = "")}

  }else{odds_ratio_table = NA}


  invisible(list(param = param_table, odds_ratio = odds_ratio_table))
}







#' Simulate from a logistic regression model
#'
#' Simulates a dataset with `n` observation from the logistic regression model \cr
#' \deqn{\mathrm{Pr}(y = 1 | x) = \frac{1}{1 + \exp(-(\beta_0 + \beta_1x_1 + \ldots + \beta_k x_k))}}{Pr(y = 1 | x) = 1/(1 + exp(-(\beta_0 + \beta_1 * x_1 + ... + \beta_k * x_k)))}
#' with covariates (x) simulated from a normal distribution with the same correlation `rho_x` \cr
#' between all pairs of covariates. Covariate x_j has standard deviation `sigma_x[j]`. \cr
#' Alternatively the covariate can follow a uniform distribution.
#' @param n the number of observations in the simulated dataset.
#' @param betavect a vector with regression coefficients
#' c(beta_0,beta_1,...beta_k). First element is intercept if `intercept = TRUE`
#' @param intercept if `TRUE` an intercept is added to the model.
#' @param covdist distribution of the covariates. Options: `'normal'` or `'uniform'`.
#' @param rho_x correlation among the covariates. Same for all covariate pairs.
#' @param sigma_x vector with standard deviation of the covariates.
#' @return dataframe with simulated data (y, X1, X2, ..., XK) (no intercept included).
#' @export
#' @examples
#' library(regkurs)
#' simdata <- logisticregsimulate(n = 500, betavect = c(1, -2, 1, 0))
#' glmfit <- glm(y ~ X1 + X2 + X3, data = simdata, family = binomial)
#' logisticregsummary(glmfit, odds_ratio = F)
logisticregsimulate <- function(n, betavect, intercept = TRUE, covdist = 'normal',
                        rho_x = 0, sigma_x = rep(1,length(betavect)-intercept)){

  k = length(betavect) - intercept

  # Generate covariates
  if (covdist == 'normal'){
    rho = matrix(rho_x, k, k)
    diag(rho) <- 1
    sigma = diag(sigma_x)%*%rho%*%diag(sigma_x)
    X = mvtnorm::rmvnorm(n, sigma = sigma)
  }else{
    X = matrix(runif(n*k), n, k)
    if (rho_x != 0) warning("uniformly distributed covariates are always uncorrelated")
  }
  if (intercept) X = cbind(1,X)

  # Simulate binary responses
  y = rbinom(n, 1, prob = 1/(1 + exp(-X%*%betavect)))

  if (intercept) X = X[,-1] # remove intercept in the returned dataset
  data = data.frame(cbind(y,X))
  xnames = rep(NA,k)
  for (j in 1:k) xnames[j] = paste("X",j, sep = "")
  names(data) <- c("y", xnames)
  return(data)
}
StatisticsSU/regkurs documentation built on Jan. 29, 2023, 4:54 p.m.