knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
knitr::opts_chunk$set(message = FALSE, warning = FALSE) library(qwzLinear)
This package integrates the basic elements in the linear regression based on BIOSTAT650. The main function mylm will return a list of 5 items, They are:
results: dataframe, with similar format with summary(lm)
residuals: The residuals of the model
R.square: The R^2 which demonstrate the percentage of variance in Y explained by the model
R.square.adj: The adjusted of R^2
SSE: The sum of square errors in the model
Also, the function will automatically plot the partial regression plots for MLR and residual plot for SLR to test "linearity" assumption. $\hat{\epsilon}$ v.s. $\hat{Y}$ is also generated to test "constant variance" assumption"
The example dataset named with "mydata" was generated from the reference below:
Morgenstern, Lewis B., et al. "Fatalism, optimism, spirituality, depressive symptoms, and stroke outcome: a population-based analysis." Stroke 42.12 (2011): 3518-3523.
The main fucntion in this package is mylm and categorize. The details of input could be found by using "help(mylm)" or “help(categorize)".
1.Using the internal dataset
data(mydata) dim(mydata)
2.Basic MLR with/without model diagnose
covar1 = c('Fatalism', 'Sex', 'R_E', 'Age') t1 = mylm(mydata, 'Depression', covar1) # check the result table t1$results # could also fit a model without intercept and deactivate the diagnose function t2 = mylm(mydata, 'Depression', covar1, intercept = FALSE, model.diag = FALSE) t2$results
3.Categorize variables into dummy variables
tmp = categorize(mydata, c('R_E', 'NIHSS_4Cat'), ref = c(0,1)) # We get new dummy variables and the original columns were deleted head(tmp[,52:55]) # We could also use cellmeans coding method tmp = categorize(mydata, c('R_E', 'NIHSS_4Cat'), method = 'cellmeans') head(tmp[,52:56])
4.Users could also directly add categorical variables in the mylm funciont
covar2 = c('Fatalism', 'Sex', 'R_E', 'Age_4Cat', 'NIHSS_4Cat') covar3 = c('Fatalism', 'Sex', 'R_E', 'Age_4Cat') t4 = mylm(mydata, 'Depression', covar2, category = c('Age_4Cat', 'NIHSS_4Cat'), ref = c(1,1), model.diag = FALSE) t5 = mylm(mydata, 'Depression', covar3, category = c('Age_4Cat'), cat_method = 'cellmeans', intercept = F, model.diag = F)
# The results for model with categorical variables t4$results t5$results
5.Users could also add interaction term in the model
t8 = mylm(mydata, 'Depression', covar1, inter = c('Age:Sex','Age:Fatalism'), model.diag = FALSE) t8$results
6.To add interaction with categorical variables, using categorize() first
dat9 = categorize(mydata, c('NIHSS_4Cat', 'Comorbidity1'), ref = c(1,0)) covar5 = c('NIHSS_4Cat.2', 'NIHSS_4Cat.4', 'NIHSS_4Cat.3', 'Comorbidity1.4', 'Comorbidity1.3', 'Comorbidity1.6', 'Comorbidity1.5','Comorbidity1.2', 'Comorbidity1.1', 'Comorbidity1.7', 'Fatalism', 'Sex', 'R_E', 'Age', 'Optimism','HiChol') t9 = mylm(dat9, 'Depression', covar5, inter = c('Age:Sex','Age:Fatalism','HiChol:Fatalism'), model.diag = FALSE) t9$results
7.SLR also works
covar4 = c('Fatalism') t6 = mylm(mydata, 'Depression', covar4) t7 = mylm(mydata, 'Depression', covar4, intercept = F,model.diag = FALSE) t6$results t7$results
NOTE
High order interaction like 3-way interaction is not recommended dut to the hardness of interpretation. The algorithm estimating this kind of cases are also out the range of BIOSTAT650. And the transformation term could be realized by adding transformed values in the column.
1.Correctness Correctness is confirmed by passing all the unit tests. The tests are based on the comparing the results to lm() and anova(). Basic examples are shown below to show the Correctness.
m1 = lm(Depression ~ Fatalism + Sex + R_E + Age , data = mydata) summary(m1) anova(m1) t1$results
2.Batch comparisons
# function used to compare the output to the lm() function, if the results are same # the output of function will be 1 # The criteria of having same results is the errors between my function and lm and anova # less than 1e-6 mylm_test = function(my, R_out){ my_p = nrow(my) r_summy = summary(R_out) r_SSE = anova(R_out)[['Sum Sq']] r_sse_index = length(r_SSE) r_SSE = r_SSE[r_sse_index] # use sum to aviod different oders after categorizaion if (sum(abs(my$results[['coef']]))-sum(abs(R_out$coefficients)) > 1e-6){ return(0) } # then check std. else if (sum(my$results[['std.']])-sum(r_summy$coefficients[,2]) > 1e-6){ return(0) } # then check t-stats else if (sum(abs(my$results[['t.val']]))-sum(abs(r_summy$coefficients[,3])) > 1e-6){ return(0) } # then check p-val else if (sum(my$results[['p.val']])-sum(r_summy$coefficients[,4]) > 1e-6){ return(0) } # then check R square else if ((my$R.square - r_summy$r.squared > 1e-6) || (my$R.square.adj - r_summy$adj.r.squared > 1e-6)){ return(0) } # then check residuals else if (any(abs(my[['residuals']]-r_summy$residuals) > 1e-6)){ return(0) } # then check SSE else if (abs(my[['SSE']]-r_SSE <= 1e-6)){ return(1) } } covar5 = c('NIHSS_4Cat.2', 'NIHSS_4Cat.4', 'NIHSS_4Cat.3', 'Comorbidity1.4', 'Comorbidity1.3', 'Comorbidity1.6', 'Comorbidity1.5', 'Comorbidity1.2', 'Comorbidity1.1', 'Comorbidity1.7', 'Fatalism', 'Sex', 'R_E', 'Age', 'Optimism','HiChol') covar6 = c('NIHSS_4Cat.1','NIHSS_4Cat.2', 'NIHSS_4Cat.4', 'NIHSS_4Cat.3', 'Comorbidity1.4', 'Comorbidity1.3', 'Comorbidity1.6', 'Comorbidity1.5','Comorbidity1.2', 'Comorbidity1.1', 'Comorbidity1.7', 'Fatalism', 'Sex', 'R_E', 'Age', 'Optimism','HiChol') t3 = mylm(mydata, 'Depression', covar2, model.diag = FALSE) t9 = mylm(dat9, 'Depression', covar5, inter = c('Age:Sex','Age:Fatalism','HiChol:Fatalism'), model.diag = FALSE) dat10 = categorize(mydata, c('NIHSS_4Cat', 'Comorbidity1'), method = 'cellmeans') t10 = mylm(dat10, 'Depression', covar6, inter = c('Age:Sex','Age:Fatalism','HiChol:Fatalism'), intercept = F, model.diag = FALSE) m1 = lm(Depression ~ Fatalism + Sex + R_E + Age , data = mydata) m2 = lm(Depression ~ -1 + Fatalism + Sex + R_E + Age , data = mydata) m3 = lm(Depression ~ Fatalism + Sex + R_E + Age_4Cat + NIHSS_4Cat, data = mydata) m4 = lm(Depression ~ Fatalism + Sex + R_E + factor(Age_4Cat) + factor(NIHSS_4Cat), data = mydata) m5 = lm(Depression ~ -1 + Fatalism + Sex + R_E + factor(Age_4Cat) , data = mydata) m6 = lm(Depression ~ Fatalism, data = mydata) m7 = lm(Depression ~ -1 + Fatalism, data = mydata) m8 = lm(Depression ~ Fatalism + Sex + R_E + Age + Age*Sex + Age*Fatalism, data = mydata) m9 = lm(Depression ~ Fatalism + Sex + R_E + Age + factor(NIHSS_4Cat) + factor(Comorbidity1) + Optimism + HiChol + Age*Sex + Age*Fatalism + HiChol * Fatalism, data = mydata) # lm will only cell mean coding the first factor m10 = lm(Depression ~ -1 + Fatalism + Sex + R_E + Age + factor(NIHSS_4Cat) + factor(Comorbidity1) + Optimism + HiChol + Age*Sex + Age*Fatalism + HiChol * Fatalism, data = mydata) compare_vec = c(mylm_test(t1,m1), mylm_test(t2,m2), mylm_test(t3,m3), mylm_test(t4,m4), mylm_test(t5,m5), mylm_test(t6,m6), mylm_test(t7,m7), mylm_test(t8,m8), mylm_test(t9,m9), mylm_test(t10,m10)) print(compare_vec)
3.Model Diagnose comparison
# residual plots from car package car::avPlots(m1) car::residualPlots(m1)
# The patterns are the same t1 = mylm(mydata, 'Depression', covar1)
More Examples of same patterns
# SLR only need residuals plot car::residualPlots(m6) t6 = mylm(mydata, 'Depression', covar4)
car::avPlots(m4) car::residualPlots(m4) t4 = mylm(mydata, 'Depression', covar2, category = c('Age_4Cat', 'NIHSS_4Cat'), ref = c(1,1))
4.Benchmark
Because the benchmark function require the identical output for each function, I make some modifications for comparsion. However, basically my efficiency is lower than the lm function.
lm_and_car = function(form){ mx = lm(formula = form, data = mydata) return(as.numeric(mx$coefficients)) } bench1 = bench::mark(mylm(mydata, 'Depression', covar1, model.diag = FALSE)$results[['coef']],lm_and_car(Depression ~ Fatalism + Sex + R_E + Age)) bench1$expression[[1]] = 'mylm' bench1$expression[[2]] = 'lm' plot(bench1) bench2 = bench::mark(sum(mylm(dat9, 'Depression', covar5, inter = c('Age:Sex','Age:Fatalism','HiChol:Fatalism'), model.diag = FALSE)$results[['coef']]), sum(lm_and_car(Depression ~ Fatalism + Sex + R_E + Age + factor(NIHSS_4Cat) + factor(Comorbidity1) + Optimism + HiChol + Age * Sex + Age * Fatalism + HiChol * Fatalism))) bench2$expression[[1]] = 'mylm' bench2$expression[[2]] = 'lm' plot(bench2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.