#'mylm
#'
#'fit a linear model with basic model diagnose
#'
#'@param dat input data frame with no NAs, should include all the data that will be used
#'
#'@param response name string for response
#'
#'@param covariates string vector of the chosen variables in the input data frame
#'
#'@param inter specify the pairwise interaction in regression , e.g.:c('Age*Sex', 'Age*R_E')
#'
#'@param category specify the categorical variables by input the names vector
#'
#'@param cat_method choose the coding method for categorical variables, should be 'reference' or 'cellmeans'
#'
#'@param ref specify the reference level for each categorical variable
#'
#'@param model.diag default is True, activate the model diagnose function
#'
#'@param intercept default is True, make the model with an intercept
#'
#'@param cutoff specify the level of significance for the test
#'
#'@return similar output table as summary(lm), residuals,R.square, R.square.adj, SSE
#'
#'@import dplyr
#'@import Matrix
#'@importFrom matrixStats rowProds
#'@import tidyverse
#'@importFrom stringr str_split
#'@import ggplot2
#'
#'@examples
#'data(mydata)
#'covar2 = c('Fatalism', 'Sex', 'R_E', 'Age_4Cat', 'NIHSS_4Cat')
#'covar3 = c('Fatalism', 'Sex', 'R_E', 'Age_4Cat')
#'t4 = mylm(mydata, 'Depression', covar2, category = c('Age_4Cat', 'NIHSS_4Cat'), ref = c(1,1))
#'t5 = mylm(mydata, 'Depression', covar3, category = c('Age_4Cat'),
#'cat_method = 'cellmeans', intercept = FALSE)
#'
#'@export
#'
mylm = function(dat, response, covariates,inter = c(),
category = c(), cat_method = 'reference', ref = c(),
model.diag = T, intercept = T, cutoff = 0.05){
model_dat = subset(dat, select = c(response, covariates))
# checking the data
if (any(is.na(model_dat))){
stop("Nas exist among the selected variables")
}
# describe the data
obs = nrow(model_dat)
if(obs <= 1){
stop('do not input empty data or Need more observations ')
}
tmp_dat = subset(model_dat, select = covariates)
# categorize variables
if(length(category) >= 1 ){
model_dat = categorize(data = tmp_dat, vars = category, method = cat_method, ref = ref)
tmp_dat = model_dat
}
# process the interaction term
inter_n = length(inter)
if(inter_n >= 1){
inter_matrix = matrix(rep(0,inter_n*obs), nrow = obs, ncol = inter_n)
colnames(inter_matrix) = inter
i = 1
while( i <= inter_n){
# parse the parameter
new_inter_vec = str_split(inter[i], ":")[[1]]
tmp_group = subset(tmp_dat, select = new_inter_vec)
inter_matrix[, i] =rowProds(as.matrix(tmp_group))
i = i + 1
}
tmp_dat = cbind(tmp_dat, inter_matrix)
}
beta_n = ncol(tmp_dat)
X_matrix = as.matrix(tmp_dat)
covariates = colnames(tmp_dat)
if (intercept){
X_matrix = cbind(rep(1, nrow(model_dat)), X_matrix)
beta_n = beta_n + 1
colnames(X_matrix) = c('intercept', covariates)
}
Y_matrix = as.matrix(subset(dat, select = response))
X_T = t(X_matrix)
# store the value of the (t(X)X)-1 to save time
pre_useful_matrix = X_T %*% X_matrix
flag = min(nrow(pre_useful_matrix), ncol(pre_useful_matrix))
if (rankMatrix(pre_useful_matrix) != flag ){
warning('t(X) %*% X is singular, stop the process')
return(c())
}
useful_matrix = solve(pre_useful_matrix)
# calculate betas
betas = useful_matrix %*% X_T %*% Y_matrix
# fitting the model
Hat = X_matrix %*% useful_matrix %*% X_T
Y_fitted = Hat %*% Y_matrix
res = Y_matrix - Y_fitted
res_T = t(res)
# estimating variance
MSE = ((res_T %*% res)/(obs - beta_n))[1,1]
SSE = MSE * (obs - beta_n)
SSY = sum((Y_matrix-mean(Y_matrix))^2)
R_square = 1 - SSE/SSY
R_adj_square = 1 - MSE/(SSY/(obs-1))
betas_var = MSE * useful_matrix
betas_sd = sqrt(diag(betas_var))
# t-statistic
t_stats = c(betas/betas_sd)
p_val = c(2 * (1 - pt(q = abs(t_stats), df = obs-beta_n)))
ifsign = ifelse(p_val <= cutoff, ifelse(p_val <= 0.01, ifelse(p_val<=0.001, '***', '**'),'*'), '-')
results = data.frame(coef = c(betas), std. = betas_sd, t.val = t_stats,
p.val = p_val, significant = ifsign)
out_list = list(results = results, residuals = res, R.square = R_square,
R.square.adj = R_adj_square, SSE = SSE)
if (model.diag == T){
# Linearity diagnose
# consider the SLR scenario
if(beta_n <= 2){
AXIS_Y = res
if (intercept){
AXIS_X = X_matrix[,2]
}
else{
AXIS_X = X_matrix[,1]
}
xlab = 'X'
ylab = 'residuals'
title = 'Residuals Plot'
df_pic = data.frame(x = AXIS_X, y = AXIS_Y)
colnames(df_pic) = c('x', 'y')
tmp_p = ggplot(df_pic, aes(x = x, y = y)) +
geom_point(shape = 5) + geom_smooth(method = lm)+
labs(x=xlab,y=ylab,title=title)+
theme_bw()+
theme(
plot.title = element_text(size = 14,face = "bold", vjust = 0.5, hjust = 0.5),
axis.title.x = element_text(size = 10,face = "bold"),
axis.title.y = element_text(size = 10,face = "bold"),
axis.text.x=element_text(size = 10,face = "bold"),
axis.text.y=element_text(size = 10,face = "bold"),
)
print(tmp_p)
}
else{
model_dignose(Y_matrix, covariates, tmp_dat)
}
# equal variance test
df_pic = data.frame(x = Y_fitted, y = res)
colnames(df_pic) = c('x', 'y')
xlab = 'Fitted values'
ylab = 'residuals'
title = "Plot for Constant Variance"
tmp_p2 = ggplot(df_pic, aes(x = x, y = y)) +
geom_point(shape = 5) +
labs(x=xlab,y=ylab,title=title)+
theme_bw()+
theme(
plot.title = element_text(size = 14,face = "bold", vjust = 0.5, hjust = 0.5),
axis.title.x = element_text(size = 10,face = "bold"),
axis.title.y = element_text(size = 10,face = "bold"),
axis.text.x=element_text(size = 10,face = "bold"),
axis.text.y=element_text(size = 10,face = "bold"),
)
print(tmp_p2)
}
return(out_list)
}
# model diagnose using partial plots
model_dignose = function(Y, covariates, modeldat){
n = length(covariates)
i = 1
ylab = paste(colnames(Y), 'others', sep = '|')
title = 'Partial Regression Plot'
while (i <= n){
tmp_X = select(modeldat, -c(covariates[i]))
new_vars = colnames(tmp_X)
newdata = cbind(Y, tmp_X)
AXIS_Y = mylm(newdata, colnames(Y), new_vars, model.diag = F)
if (length(AXIS_Y) !=0 ){
AXIS_Y = AXIS_Y[['residuals']]
}
else{
print('matrix is singular in diagnose process')
break()
}
choosen = select(modeldat, c(covariates[i]))
newdata = cbind(choosen, tmp_X)
AXIS_X = mylm(newdata, colnames(choosen), new_vars, model.diag = F)[['residuals']]
#####
# No need to double check, because x-K has been singular or not
#if (length(AXIS_X) !=0){
#AXIS_X = AXIS_X[['residuals']]
#}
#else{
#print('matrix is singular in diagnose process')
#break()
#}
#####
df_pic = data.frame(x = AXIS_X, y = AXIS_Y)
colnames(df_pic) = c('x', 'y')
xlab = paste(covariates[i], 'others', sep = '|')
tmp_p = ggplot(df_pic, aes(x = x, y = y)) +
geom_point(shape = 5) + geom_smooth(method = lm)+
labs(x=xlab,y=ylab,title=title)+
theme_bw()+
theme(
plot.title = element_text(size = 14,face = "bold", vjust = 0.5, hjust = 0.5),
axis.title.x = element_text(size = 10,face = "bold"),
axis.title.y = element_text(size = 10,face = "bold"),
axis.text.x=element_text(size = 10,face = "bold"),
axis.text.y=element_text(size = 10,face = "bold"),
)
print(tmp_p)
i = i + 1
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.