# Document-wide knitr chunk defaults: collapse option on, "#>" as the
# prefix for printed output.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Data Screening

An Important Note

What Order Should Be Followed?

Why is this the order?

Data Screening Example

library(rio)
# Read the example dataset from a path relative to the working directory;
# `master` is kept as the untouched raw copy for the rest of the script.
master <- import("data/data_screening.csv")
# Show column names, types, and first values to plan the screening steps.
str(master)

Accuracy

Accuracy: Categorical Variables

# Work on a copy so the raw import in `master` stays untouched; every
# screening step below updates `notypos`.
notypos <- master
# One frequency table per categorical variable. lapply() tabulates each
# column on its own and always returns a list; apply() would first coerce
# the data frame to a matrix and return a matrix or a list depending on
# whether the tables happen to have the same length. useNA = "ifany" also
# counts missing codes when there are any.
lapply(notypos[ , c("Sex", "SES")], table, useNA = "ifany")
# a code of 3 for Sex is probably incorrect

Accuracy: Categorical Variables

## fix the categorical labels and typos
# Any code not listed in `levels` (here: 3 for Sex) becomes NA.
notypos$Sex <- factor(notypos$Sex,
                      levels = c(1, 2), # no 3
                      labels = c("Women", "Men"))
notypos$SES <- factor(notypos$SES,
                      levels = c(1, 2, 3),
                      labels = c("Low", "Medium", "High"))
# Tabulate the factors themselves. apply() would turn them into a character
# matrix first, so the tables would be sorted alphabetically instead of by
# level and the NA created by dropping code 3 would not be shown.
lapply(notypos[ , c("Sex", "SES")], table, useNA = "ifany")

Accuracy: Continuous Variables

# Per-column summary of the current data; compare each continuous
# variable's minimum and maximum with the range it is allowed to take.
summary(notypos)

How do we "fix" issues?

Accuracy: Continuous Variables

# Grade: inspect, set every value above 12 to NA, inspect again.
summary(notypos$Grade)
notypos$Grade <- replace(notypos$Grade, which(notypos$Grade > 12), NA)
summary(notypos$Grade)

# Absences: same treatment with an upper limit of 15.
summary(notypos$Absences)
notypos$Absences <- replace(
  notypos$Absences,
  which(notypos$Absences > 15),
  NA
)
summary(notypos$Absences)

Accuracy: Continuous Variables

names(notypos)
head(notypos[ , 6:19]) # lots of ways to do this part!
# Columns 6 to 19: set every value above 7 to NA, one column at a time.
notypos[ , 6:19] <- lapply(
  notypos[ , 6:19],
  function(item) replace(item, which(item > 7), NA)
)
summary(notypos)

Accuracy: Continuous Variables

Accuracy: Continuous Variables

names(notypos)
# Mean and standard deviation of every column except columns 1 and 3,
# ignoring missing values. vapply() works on each column directly (no
# matrix coercion) and guarantees one number per column; TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
vapply(notypos[ , -c(1, 3)], mean, numeric(1), na.rm = TRUE)
vapply(notypos[ , -c(1, 3)], sd, numeric(1), na.rm = TRUE)

Missing Data

summary(notypos)
# Number of NA values in each column.
vapply(notypos, function(column) sum(is.na(column)), integer(1))

Missing Data

Types of Missing Data

What do I do with missing data?

# Embed the image stored at this relative path in the rendered document.
knitr::include_graphics("pictures/datascreen/missing.png")

What do I do with missing data?

What do I do with missing data?

Visualize Missing Data

# TRUE is spelled out in both calls: T is an ordinary variable that can be
# reassigned, TRUE cannot.
library(VIM, quietly = TRUE)
# Plot the missing-data patterns of `notypos`, with numbers shown.
aggr(notypos, numbers = TRUE)

Replacing Missing Data: Rows

#' Percentage of missing values
#'
#' @param x A vector (or one row/column handed over by `apply()`).
#' @return A single number between 0 and 100: the share of elements of `x`
#'   that are `NA`. Zero-length input gives `NaN` (0 / 0).
percentmiss <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percent of missing values for each row (MARGIN = 1) of `notypos`.
# NOTE(review): the variable name `missing` is also the name of a base R
# function; the later subset() calls rely on this variable.
missing <- apply(notypos, 1, percentmiss)
# How many rows fall at each distinct missing percentage.
table(missing)

Replacing Missing Data: Rows

# Rows missing at most 5% of their values are eligible for replacement;
# rows above 5% are set aside unchanged. which() drops NA comparisons,
# just as subset() does.
replace_rows <- notypos[which(missing <= 5), , drop = FALSE]
noreplace_rows <- notypos[which(missing > 5), , drop = FALSE]

# Row counts: the full data, then each of the two parts.
nrow(notypos)
nrow(replace_rows)
nrow(noreplace_rows)

Replacing Missing Data: Columns

# Percent of missing values in each column of the rows kept for replacement.
vapply(replace_rows, percentmiss, numeric(1))

Replacing Missing Data: Columns

# Column positions that are left out of the imputation; every other column
# is handed to mice().
skip_cols <- c(1, 2, 4)
replace_columns <- replace_rows[ , -skip_cols]
noreplace_columns <- replace_rows[ , skip_cols] # both come from replace_rows

Replacing Missing Data: Using mice

library(mice)
# Multiple imputation of the columns selected for replacement. The seed
# fixes the random draws, so the imputed values (and every result computed
# from them below) are the same on every run.
temp_no_miss <- mice(replace_columns, seed = 2022)

Replacing Missing Data: Using mice

# Extract one completed (imputed) dataset from the mice result.
nomiss <- complete(temp_no_miss, 1) #pick a dataset 1-5 

#combine back together
dim(notypos) #original data from previous step
dim(nomiss) #replaced data

#get all columns 
# Put the untouched columns back next to the imputed ones. Both come from
# replace_rows, so the rows line up; the column order now differs from
# notypos (untouched columns first).
all_columns <- cbind(noreplace_columns, nomiss)
dim(all_columns)

#get all rows
# Append the rows that were set aside for having too much missing data.
# rbind() on data frames matches columns by name, so the different column
# order of all_columns does not matter.
all_rows <- rbind(noreplace_rows, all_columns)
dim(all_rows)

Outliers

Outliers: Types

Outliers: Mahalanobis

Outliers: Analyze and Eliminate

## all_columns or all_rows can be used here; all_rows still contains
## missing data, and rows with missing values do not get a score
str(all_columns)
# Columns 1 and 4 are left out of the distance calculation (presumably the
# Sex and SES factors -- confirm with the str() output above).
mahal_vars <- all_columns[ , -c(1, 4)]
mahal <- mahalanobis(
  mahal_vars,
  center = colMeans(mahal_vars, na.rm = TRUE),
  cov = cov(mahal_vars, use = "pairwise.complete.obs")
)

Outliers: Analyze and Eliminate

## the degrees of freedom must equal the number of columns scored
n_vars <- ncol(all_columns[ , -c(1, 4)])
cutoff <- qchisq(1 - .001, n_vars)

## df and cutoff
n_vars
cutoff

## how many outliers? Look at FALSE
summary(mahal < cutoff)

## eliminate: keep rows below the cutoff; rows whose distance is NA are
## dropped as well because which() skips NA comparisons
noout <- all_columns[which(mahal < cutoff), , drop = FALSE]
dim(all_columns)
dim(noout)

Summary



doomlab/learnSTATS documentation built on June 9, 2022, 12:54 a.m.