# R/regSel_code.R In EvanWie/regSel:

#### Documented in back_selectlin_model

#' Linear regression model
#'
#' Performs linear regression using Ordinary Least Squares (or Weighted Least Squares when weights are specified)
#'
#' @param formula Model formula using specified columns of DataFrame 'data'. Can include interactions and select no intercept
#' with -1
#' @param data Dataframe from which model variables are pulled.
#' @param weights Vector of weights to be used in Weighted Least Squares regression estimates. Default = NULL.
#' @return List containing the following elements
#' \itemize{
#' \item beta estimates
#' \item estimate standard errors
#' \item test statistics
#' \item p-values
#' \item residuals
#' }
#' @examples  x = rnorm(10)
#' x2 = rnorm(10)
#' y = rnorm(10)
#' df = data.frame(y,x,x2)
#'
#' lin_model(formula = y~x*x2, data = df, weights = 1:10)
#'
#' lin_model(formula = y~1, data = df)
#'
#' @export
lin_model = function(formula, data, weights= NULL){
if(length(formula[[2]]) != 1){
stop("y must be univariate")
}

#Parse formula into its response and covariates
modframe = model.frame(formula = formula, data = data)

y <- model.response(modframe, "numeric")

x = model.matrix(formula, data = data)

n = length(y)

#OLS beta, variance calculations
if(is.null(weights)){
beta = solve(t(x)%*%x)%*%t(x)%*%y

res = y - x%*%beta

sigma_sq = sum(res^2)/(n-ncol(x))

varcov_beta = solve(t(x)%*%x)*sigma_sq
} else {
#WLS beta, variance calculations

if(!is.vector(weights) | length(weights) != n){
stop("Weights must be written as a vector with length equal to length y")
}

w = diag(weights)
beta = solve(t(x) %*% w %*% x) %*% t(x) %*% w %*% y

res = y - x%*%beta

sigma_sq <- sum(weights*res^2)/(n - ncol(x))

varcov_beta = sigma_sq* solve(t(x)%*%w%*%x)
}

se_beta =sqrt(diag(varcov_beta))

#Calculate test stats and p-values and gather results
test_stat = beta/se_beta

p = 2*pt(-abs(test_stat), n-1)

results = list(betas = as.vector(beta), se_beta = as.vector(se_beta),
test_statistic = as.vector(test_stat), p_value = as.vector(p), res = as.vector(res))

names(results$betas) = colnames(x) names(results$se_beta) = names(results$betas) names(results$test_statistic) = names(results$betas) names(results$p_value) = names(results$betas) return(results) } #' Backwards selcection of linear regression model #' #' Performs backwards selection of model parameters. Removes parameter with greatest p-value above "prem" threshold. #' P-values are calculated using Ordinary Least Squares (no weighting option). Must have at least one non-intercept #' covariate in the model. #' #' @param formula Model formula using specified columns of DataFrame 'data'. Can include interactions and select no intercept #' with -1. #' @param data Dataframe from which model variables are pulled. #' @param prem Threshold at which a parameter will be removed from the model if it has the highest p-value above the threshold. #' The default value is .1. #' @return List containing the following elements #' \itemize{ #' \item beta estimates #' \item estimate standard errors #' \item test statistics #' \item p-values #' \item residuals #' \item covariates removed from model #' } #' @examples x = rnorm(10) #' x2 = rnorm(10) #' y = rnorm(10) #' df = data.frame(y,x,x2) #' #' back_select(formula = y~x*x2, data = df, prem = .1) #' #' @export back_select = function(formula, data, prem = .1){ if(length(formula[[2]]) != 1){ stop("y must be univariate") } if(formula[[3]]==1){ stop("Model must have at least one covariate") } #Parse formula into response and covariates modframe = model.frame(formula = formula, data = data) y <- model.response(modframe, "numeric") x = model.matrix(formula, data = data) n = length(y) #Calculate initial OLS beta, variance, and p-values beta = solve(t(x)%*%x)%*%t(x)%*%y res = y - x%*%beta q = ifelse(is.matrix(x), ncol(x), 1) sigma_sq = sum(res^2)/(n-q) varcov_beta = solve(t(x)%*%x)*sigma_sq se_beta =sqrt(diag(varcov_beta)) test_stat = beta/se_beta p = 2*pt(-abs(test_stat), n-1) removed_vars = c() #Repeat parameter estimation and variance calculations until maximum p-value less than removal threshold while(max(p[2:max(2,q)]) > prem){ rem_col = which(p == max(p[2:ncol(x)])) removed_vars = c(removed_vars, colnames(x)[rem_col]) x = x[,-rem_col] beta = solve(t(x) %*% x) %*% t(x) %*% y res = y - x%*%beta q = ifelse(is.matrix(x), ncol(x), 1) sigma_sq = sum(res^2)/(n-q) varcov_beta = solve(t(x)%*%x)*sigma_sq se_beta =sqrt(diag(varcov_beta)) test_stat = beta/se_beta p = 2*pt(-abs(test_stat), n-1) if(q == 1){break} } #Gather results from final model results = list(betas = as.vector(beta), se_beta = as.vector(se_beta), test_statistic = as.vector(test_stat), p_value = as.vector(p), res = as.vector(res), removed_vars = removed_vars) names(results$betas) = c("(Intercept)",colnames(x)[2:max(2,q)])
names(results$se_beta) = names(results$betas)
names(results$test_statistic) = names(results$betas)
names(results$p_value) = names(results$betas)

return(results)
}

EvanWie/regSel documentation built on Nov. 26, 2019, 2:11 a.m.