library(learnr)
library(learnSEM)
knitr::opts_chunk$set(echo = FALSE)
data(datascreen)

Data Screening Practice

This exercise will give you practice working on data screening for multivariate analysis. You should work through data screening a particular order:

Data Screening Videos

You can use vignette("lecture_data_screen", "learnSEM") to view these notes in R.

Data Screening Data

Study: This datascreen dataset includes a male body dissatisfaction scale with the following questions:

  1. I think my body should be leaner
  2. I am concerned that my stomach is too flabby
  3. I feel dissatisfied with my overall body build
  4. I think I have too much fat on my body
  5. I think my abs are not thin enough
  6. I feel satisfied with the size and shape of my body
  7. Has eating sweets, cakes, or other high calorie food made you feel fat or weak?
  8. Have you felt excessively large and rounded (i.e., fat)?
  9. Have you felt ashamed of your body size or shape?
  10. Has seeing your reflection (e.g., in a mirror or window) made you feel badly about your size or shape?
  11. Have you been so worried about your body size or shape that you have been feeling that you ought to diet?

The datascreen dataset also includes a participant ID for each person in the study.

Exercises

In this next section, you will answer questions using the R code blocks provided. Be sure to use the solution option to see the answer if you need it!

Please enter your name for submission. If you do not need to submit, just type anything you'd like in this box.

question_text(
  "Student Name:",
  answer("Your Name", correct = TRUE),
  incorrect = "Thanks!",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Accuracy

Check the data for out of range scores. The scale ranges from 1 (never) to 6 (always). Use the summary() function to inspect the scores.

library(learnSEM)
data(datascreen)
library(learnSEM)
data(datascreen)
summary(datascreen)
question_text(
  "What are the problems you see in the dataset?",
  answer("There are no accuracy errors.", correct = TRUE),
  incorrect = "You should note that there are no accuracy errors.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Missing data

summary(datascreen)
question_text(
  "Using the summary function from above, what type of missing data do you appear to have?",
  answer("Missing Completely at Random", correct = TRUE),
  incorrect = "You should note that it is likely this data is missing completely at random.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)
# Create percent missing function

# Calculate missing percentages on datascreen

# How much missing data is there?
# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100 }
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
# How much missing data is there?
table(missing)

Exclude all participants with more than 5% missing data, and call this dataset nomissing.

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100 }
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)

nomissing <- subset(datascreen, missing <= 5)

Calculate missing scores by variable (columns) on the dataset with participants who have less than 5% missing.

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)

apply(nomissing, 2, percentmiss)
question_text(
  "Do you need to estimate any missing data? ",
  answer("No", correct = TRUE),
  incorrect = "No, it appears that the there is no missing data after excluding participants with MNAR data.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Outliers

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)

mahal <- mahalanobis(nomissing[ , -1],
                     colMeans(nomissing[ , -1]),
                     cov(nomissing[ , -1]))
cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))
table(mahal < cutoff)
question_text(
  "What is the *df* for your cut off score?",
  answer("11", correct = TRUE),
  incorrect = "We have 11 df because there are 11 columns as part of the calculation of mahalanobis.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)
question_text(
  "What is the cut off score? ",
  answer("31.26", correct = TRUE),
  incorrect = "The cut off score is 31.26.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)
question_text(
  "How many outliers did you have?",
  answer("16", correct = TRUE),
  incorrect = "We have 16 outliers (the FALSE answers in your table).",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Exclude the outliers, and name that dataset noout.

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)
mahal <- mahalanobis(nomissing[ , -1],
                     colMeans(nomissing[ , -1]),
                     cov(nomissing[ , -1]))
cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))

noout <- subset(nomissing, mahal < cutoff)

Additivity

Include a depiction of the correlation between scale items with corrplot, and be sure to use the noout dataset.

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)
mahal <- mahalanobis(nomissing[ , -1],
                     colMeans(nomissing[ , -1]),
                     cov(nomissing[ , -1]))
cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))
noout <- subset(nomissing, mahal < cutoff)
library(corrplot)
library(corrplot)
corrplot(cor(noout[ , -1]))
question_text(
  "Do you meet the assumption of additivity?",
  answer("Yes", correct = TRUE),
  incorrect = "Yes, it appears we do not have any perfect correlations, although some are highly correlated.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Assumption Set Up

Set up the rest of the assumption testing by doing the following:

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)
mahal <- mahalanobis(nomissing[ , -1],
                     colMeans(nomissing[ , -1]),
                     cov(nomissing[ , -1]))
cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))
noout <- subset(nomissing, mahal < cutoff)

random <- rchisq(nrow(noout), 7)
fake <- lm(random ~ ., data = noout[ , -1])
standardized <- rstudent(fake)
fitted <- scale(fake$fitted.values)

Normality

Include the multivariate normality histogram by using hist() on the standardized variable.

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)
mahal <- mahalanobis(nomissing[ , -1],
                     colMeans(nomissing[ , -1]),
                     cov(nomissing[ , -1]))
cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))
noout <- subset(nomissing, mahal < cutoff)
random <- rchisq(nrow(noout), 7)
fake <- lm(random ~ ., data = noout[ , -1])
standardized <- rstudent(fake)
fitted <- scale(fake$fitted.values)

hist(standardized)
question_text(
  "Interpret the graph.  Does it indicate multivariate normality?",
  answer("No, it appears the data may be a bit skewed.", correct = TRUE),
  incorrect = "You should note that the data appears a bit skewed because of the longer tail on the right, as the data is not even centered over 0.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Linearity

Include the multivariate QQ plot.

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)
mahal <- mahalanobis(nomissing[ , -1],
                     colMeans(nomissing[ , -1]),
                     cov(nomissing[ , -1]))
cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))
noout <- subset(nomissing, mahal < cutoff)
random <- rchisq(nrow(noout), 7)
fake <- lm(random ~ ., data = noout[ , -1])
standardized <- rstudent(fake)
fitted <- scale(fake$fitted.values)

{qqnorm(standardized)
abline(0,1)}

#or
plot(fake, 2)
question_text(
  "Interpret the graph.  Does it indicate multivariate linearity?",
  answer("Maybe, the data appears to bend away at the edges a bit.", correct = TRUE),
  incorrect = "You should note that the line mostly indicates linearity with a bit of non-linearity towards the edges (i.e., it bends away from the graph around -1 and 1.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Homogeneity/Homoscedasticity

Include the multivariate residuals plot.

# Create percent missing function
percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100}
# Calculate missing percentages on datascreen
missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)
mahal <- mahalanobis(nomissing[ , -1],
                     colMeans(nomissing[ , -1]),
                     cov(nomissing[ , -1]))
cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))
noout <- subset(nomissing, mahal < cutoff)
random <- rchisq(nrow(noout), 7)
fake <- lm(random ~ ., data = noout[ , -1])
standardized <- rstudent(fake)
fitted <- scale(fake$fitted.values)

{plot(standardized, fitted)
abline(v = 1)
abline(h = 1)}
question_text(
  "Interpret the graph.  Does it indicate multivariate linearity?",
  answer("Yes, it mostly appears to support homogeneity and homoscedasticity.", correct = TRUE),
  incorrect = "You should note that more of the data is below 0 on the left side, but in general, this plot shows an even distribution of residuals.",
  try_again_button = "Modify your answer",
  allow_retry = TRUE
)

Submit

On this page, you will create the submission for your instructor (if necessary). Please copy this report and submit using a Word document or paste into the text window of your submission page. Click "Generate Submission" to get your work!

encoder_logic()
encoder_ui()


doomlab/learnSEM documentation built on Jan. 25, 2024, 2 p.m.