```r
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
```
In the last lecture, we discussed:
What else should we consider for checking our data?
For parametric statistics, we should think about:
The procedure:
If two variables are not additive, they overlap too much - they are too highly correlated (multicollinear) - and that redundancy reduces power.
This check only applies when you have multiple continuous variables; with a single dependent variable, there is nothing to correlate.
```r
library(rio)
master <- import("data/data_screening.csv")
notypos <- master #update the dataset with each step

notypos$Sex <- factor(notypos$Sex,
                      levels = c(1,2), #no 3
                      labels = c("Women", "Men"))
notypos$SES <- factor(notypos$SES,
                      levels = c(1,2,3),
                      labels = c("Low", "Medium", "High"))
notypos$Grade[ notypos$Grade > 15 ] <- NA
notypos[ , 6:19][ notypos[ , 6:19] > 7 ] <- NA

percentmiss <- function(x){ sum(is.na(x))/length(x) * 100 }
missing <- apply(notypos, 1, percentmiss)
replace_rows <- subset(notypos, missing <= 5) #5%
noreplace_rows <- subset(notypos, missing > 5)

replace_columns <- replace_rows[ , -c(1,2,4)]
noreplace_columns <- replace_rows[ , c(1,2,4)] #notice these are both replace_rows

library(mice)
temp_no_miss <- mice(replace_columns)
nomiss <- complete(temp_no_miss, 1) #pick a dataset 1-5

all_columns <- cbind(noreplace_columns, nomiss)
all_rows <- rbind(noreplace_rows, all_columns)

mahal <- mahalanobis(all_columns[ , -c(1,4)],
                     colMeans(all_columns[ , -c(1,4)], na.rm = TRUE),
                     cov(all_columns[ , -c(1,4)], use = "pairwise.complete.obs"))
cutoff <- qchisq(1 - .001, ncol(all_columns[ , -c(1,4)]))
noout <- subset(all_columns, mahal < cutoff)
```
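Before subsetting, it can help to see how many cases the Mahalanobis cutoff actually flags. This quick check is a small addition for illustration, using the `mahal` and `cutoff` objects created above:

```r
#illustrative addition: tabulate how many cases fall below/above the cutoff
#(FALSE = flagged as a multivariate outlier)
summary(mahal < cutoff)
cutoff #the chi-square cutoff value that was used
```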
```r
str(noout)
cor(noout[ , -c(1,4)])
```
```r
library(corrplot)
corrplot(cor(noout[ , -c(1,4)]))
```
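If the corrplot is hard to read, you can flag problem pairs directly. The |r| > .90 cutoff below is a common rule of thumb used here for illustration; it is an assumption, not a threshold from the original notes:

```r
#hedged sketch: list variable pairs whose correlation exceeds |.90|,
#a common rule-of-thumb signal that additivity may be violated
r <- cor(noout[ , -c(1,4)])
which(abs(r) > .90 & row(r) != col(r), arr.ind = TRUE)
```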
```r
random <- rchisq(nrow(noout), 7) #why 7? It works, any number bigger than 2
fake <- lm(random ~ ., #Y is predicted by all variables in the data
           data = noout) #You can use categorical variables now!
standardized <- rstudent(fake)
fitvalues <- scale(fake$fitted.values)
```
```r
{qqnorm(standardized)
abline(0,1)}

plot(fake, 2) #shortcut: the second default plot for an lm object is a normal Q-Q plot of the residuals
```
Remember the Central Limit Theorem - at what point is the sample size large enough to assume normality?
Check the distribution of the standardized residuals as an approximation of multivariate normality.
```r
hist(noout$RS1)

library(moments)
skewness(noout[ , -c(1,4)])
kurtosis(noout[ , -c(1,4)]) - 3 #to get excess kurtosis
```
```r
hist(standardized, breaks = 15)
length(standardized)
```
Ways to check: formal significance tests, for which you do NOT want p < .001 (a significant result means the assumption is violated).
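The specific tests are not listed here; as one hedged illustration (an addition, not from the original notes), Levene's test from the car package checks whether variances are equal across groups, assuming the screened data retain the `Sex` factor alongside the `RS1` item:

```r
#hedged example, not from the original notes: Levene's test for
#equality of variances across groups; p < .001 would indicate a problem
library(car)
leveneTest(RS1 ~ Sex, data = noout)
```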
Sphericity - the assumption that, in repeated measures designs, the variances of the differences between each pair of measurements are approximately equal; a small sketch of this idea follows the figures below.
knitr::include_graphics("pictures/datascreen/2.png")
knitr::include_graphics("pictures/datascreen/3.png")
Create a scatterplot of the fake regression.
In theory, the residuals should be randomly distributed, which is why we created a random dependent variable to test with.
```r
{plot(fitvalues, standardized)
abline(0,0)
abline(v = 0)}
```
Homogeneity - is the spread above the horizontal zero line the same as the spread below it, in both directions from the (0, 0) point?
Homoscedasticity - is the spread equal all the way across the x-axis?
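Beyond eyeballing the plot, a formal check is possible. The following is a hedged addition (not part of the original lecture) using the Breusch-Pagan test from the lmtest package on the fake regression fitted above:

```r
#hedged addition, not from the original notes: the Breusch-Pagan test
#formally tests a fitted lm model for heteroscedasticity
library(lmtest)
bptest(fake) #a small p value suggests the spread is NOT equal across fitted values
```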
In this lecture, we have covered: