# Global knitr chunk options for this document: merge each chunk's source and
# its output into a single block, and prefix every output line with "#>".
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

Data Screening: Next Steps

Data Screening: Assumptions

Data Screening: Assumptions

Assumptions: Independence

Assumptions: Additivity

Assumptions: Additivity

library(rio)

# Read the raw file once. `master` stays untouched; every cleaning step below
# is applied to the working copy `notypos`.
master <- import("data/data_screening.csv")
notypos <- master

# Turn the numeric codes into labelled factors. Any code that is not listed in
# `levels` (e.g. a stray 3 for Sex) is converted to NA by factor().
notypos$Sex <- factor(
  notypos$Sex,
  levels = c(1, 2),
  labels = c("Women", "Men")
)
notypos$SES <- factor(
  notypos$SES,
  levels = c(1, 2, 3),
  labels = c("Low", "Medium", "High")
)

# Grades above 15 are treated as typos and blanked out.
grade_too_high <- notypos$Grade > 15
notypos$Grade[grade_too_high] <- NA

# Columns 6 to 19: any value above 7 is treated as a typo and blanked out.
# NOTE(review): presumably these are the scale items (RS1, ...) answered on a
# scale that tops out at 7 -- confirm against the codebook.
scale_cols <- 6:19
scale_items <- notypos[, scale_cols]
scale_items[scale_items > 7] <- NA
notypos[, scale_cols] <- scale_items
# Percentage (0-100) of the elements of `x` that are NA.
percentmiss <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percent of missing values for every row of the cleaned data.
missing <- apply(notypos, 1, percentmiss)

# Split the rows at the 5% threshold: rows with at most 5% missing are eligible
# for imputation, the rest are set aside unchanged.
replace_rows <- subset(notypos, missing <= 5)
noreplace_rows <- subset(notypos, missing > 5)

# Split the columns of the *eligible rows* (both pieces come from
# replace_rows): columns 1, 2 and 4 are held out of the imputation model,
# every other column goes into it.
replace_columns <- replace_rows[, -c(1, 2, 4)]
noreplace_columns <- replace_rows[, c(1, 2, 4)]

# Multiple imputation. A fixed seed makes the imputed values -- and therefore
# the outlier screen and every result derived from `noout` -- reproducible
# from one run to the next.
library(mice)
temp_no_miss <- mice(replace_columns, seed = 1234)
# Pick one of the imputed datasets (1-5). The call is namespace-qualified so a
# later library(tidyr) cannot mask it with tidyr::complete().
nomiss <- mice::complete(temp_no_miss, 1)

# Reassemble: held-out columns first, then the imputed columns; all_rows adds
# back the rows that were too incomplete to impute.
all_columns <- cbind(noreplace_columns, nomiss)
all_rows <- rbind(noreplace_rows, all_columns)

# Multivariate outlier screen on all_columns, leaving out columns 1 and 4 of
# the reassembled frame.
# NOTE(review): those positions are presumably the two factors (Sex, SES),
# which cannot enter a covariance matrix -- confirm against the column order.
screen_vars <- all_columns[, -c(1, 4)]
mahal <- mahalanobis(
  screen_vars,
  colMeans(screen_vars, na.rm = TRUE),
  cov(screen_vars, use = "pairwise.complete.obs")
)
# Chi-square cutoff at p < .001 with df = number of screened variables.
cutoff <- qchisq(1 - .001, ncol(screen_vars))
# Rows whose distance is NA (a screened variable is still missing) are dropped
# here too, because subset() discards rows where the condition is NA.
noout <- subset(all_columns, mahal < cutoff)
str(noout)
cor(noout[, -c(1, 4)])

Assumptions: Additivity

library(corrplot)

# Visualise the correlation matrix of the screened data (columns 1 and 4 of
# noout are left out, as in the cor() call on the same data).
screened_cor <- cor(noout[, -c(1, 4)])
corrplot(screened_cor)

Assumptions: Linearity

Assumptions: Fake Regression

Assumptions: Fake Regression

Assumptions: Fake Regression

# "Fake" regression: regress a random outcome on every variable in noout so
# the residuals can be used to screen the data as a whole.
# The outcome is drawn from a chi-square with df = 7; per the original note,
# the exact number is arbitrary -- anything bigger than 2 works.
random <- rchisq(n = nrow(noout), df = 7)

# `.` means "all columns of noout"; factors are fine as predictors here.
fake <- lm(random ~ ., data = noout)

# Studentized residuals and z-scored fitted values for the diagnostic plots.
standardized <- rstudent(model = fake)
fitvalues <- scale(x = fake$fitted.values)

Assumptions: Linearity

# Normal Q-Q plot of the studentized residuals with the y = x reference line.
# The braces keep both calls in one expression so the line is drawn on the
# same plot.
{
  qqnorm(standardized)
  abline(a = 0, b = 1)
}

# Built-in lm diagnostic number 2 (the normal Q-Q plot) for the fake model.
plot(fake, which = 2)

Assumptions: Linearity

Assumptions: Linearity

# Normal Q-Q plot of the studentized residuals with the y = x reference line.
# The braces keep both calls in one expression so the line is drawn on the
# same plot.
{
  qqnorm(standardized)
  abline(a = 0, b = 1)
}

Assumptions: Normality

Assumptions: Normality

library(moments)

# Distribution of a single item.
hist(noout$RS1)

# Skew and excess kurtosis for the screened columns (columns 1 and 4 of noout
# are left out). moments::kurtosis() returns raw kurtosis, so subtract 3 to
# express it as excess kurtosis.
screened_items <- noout[, -c(1, 4)]
skewness(screened_items)
kurtosis(screened_items) - 3

Assumptions: Normality

# Histogram of the studentized residuals from the fake regression (15 bins
# requested), followed by how many residuals there are.
hist(standardized, breaks = 15)
length(standardized)

Assumptions: Homogeneity

Assumptions: Homogeneity

# Embed the pre-rendered slide image in the knitted output.
knitr::include_graphics(path = "pictures/datascreen/2.png")

Assumptions: Homoscedasticity

Assumptions: Homogeneity and Homoscedasticity

# Embed the pre-rendered slide image in the knitted output.
knitr::include_graphics(path = "pictures/datascreen/3.png")

Assumptions: Homogeneity and Homoscedasticity

Assumptions: Homogeneity and Homoscedasticity

# Residual plot: z-scored fitted values (x) against studentized residuals (y),
# with reference lines through zero on both axes. The braces keep the three
# calls in one expression so the lines land on the same plot.
{
  plot(fitvalues, standardized)
  abline(a = 0, b = 0)
  abline(v = 0)
}

Assumptions: Homogeneity and Homoscedasticity

# Residual plot: z-scored fitted values (x) against studentized residuals (y),
# with reference lines through zero on both axes. The braces keep the three
# calls in one expression so the lines land on the same plot.
{
  plot(fitvalues, standardized)
  abline(a = 0, b = 0)
  abline(v = 0)
}

Summary



doomlab/learnSTATS documentation built on June 9, 2022, 12:54 a.m.