library(learnr) library(learnSEM) knitr::opts_chunk$set(echo = FALSE) data(datascreen)
This exercise will give you practice working on data screening for multivariate analysis. You should work through data screening a particular order:
You can use vignette("lecture_data_screen", "learnSEM")
to view these notes in R.
Study: This datascreen
dataset includes a male body dissatisfaction scale with the following questions:
The datascreen
dataset also includes a participant ID for each person in the study.
In this next section, you will answer questions using the R code blocks provided. Be sure to use the solution
option to see the answer if you need it!
Please enter your name for submission. If you do not need to submit, just type anything you'd like in this box.
question_text( "Student Name:", answer("Your Name", correct = TRUE), incorrect = "Thanks!", try_again_button = "Modify your answer", allow_retry = TRUE )
Check the data for out of range scores. The scale ranges from 1 (never) to 6 (always). Use the summary()
function to inspect the scores.
library(learnSEM) data(datascreen)
library(learnSEM) data(datascreen) summary(datascreen)
question_text( "What are the problems you see in the dataset?", answer("There are no accuracy errors.", correct = TRUE), incorrect = "You should note that there are no accuracy errors.", try_again_button = "Modify your answer", allow_retry = TRUE )
summary(datascreen)
question_text( "Using the summary function from above, what type of missing data do you appear to have?", answer("Missing Completely at Random", correct = TRUE), incorrect = "You should note that it is likely this data is missing completely at random.", try_again_button = "Modify your answer", allow_retry = TRUE )
percentmiss
function to calculate the percent of missing data. apply()
function, and name that variable missing
.table()
function. # Create percent missing function # Calculate missing percentages on datascreen # How much missing data is there?
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100 } # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) # How much missing data is there? table(missing)
Exclude all participants with more than 5% missing data, and call this dataset nomissing
.
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100 } # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss)
nomissing <- subset(datascreen, missing <= 5)
Calculate missing scores by variable (columns) on the dataset with participants who have less than 5% missing.
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5)
apply(nomissing, 2, percentmiss)
question_text( "Do you need to estimate any missing data? ", answer("No", correct = TRUE), incorrect = "No, it appears that the there is no missing data after excluding participants with MNAR data.", try_again_button = "Modify your answer", allow_retry = TRUE )
nomissing
dataset, and be sure to exclude the first column Participant_ID
.cutoff
score using p < .001 as your criterion. table()
function to determine the number of outliers you have in the dataset. # Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5)
mahal <- mahalanobis(nomissing[ , -1], colMeans(nomissing[ , -1]), cov(nomissing[ , -1])) cutoff <- qchisq(1-.001, ncol(nomissing[ , -1])) table(mahal < cutoff)
question_text( "What is the *df* for your cut off score?", answer("11", correct = TRUE), incorrect = "We have 11 df because there are 11 columns as part of the calculation of mahalanobis.", try_again_button = "Modify your answer", allow_retry = TRUE )
question_text( "What is the cut off score? ", answer("31.26", correct = TRUE), incorrect = "The cut off score is 31.26.", try_again_button = "Modify your answer", allow_retry = TRUE )
question_text( "How many outliers did you have?", answer("16", correct = TRUE), incorrect = "We have 16 outliers (the FALSE answers in your table).", try_again_button = "Modify your answer", allow_retry = TRUE )
Exclude the outliers, and name that dataset noout
.
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5) mahal <- mahalanobis(nomissing[ , -1], colMeans(nomissing[ , -1]), cov(nomissing[ , -1])) cutoff <- qchisq(1-.001, ncol(nomissing[ , -1]))
noout <- subset(nomissing, mahal < cutoff)
Include a depiction of the correlation between scale items with corrplot
, and be sure to use the noout
dataset.
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5) mahal <- mahalanobis(nomissing[ , -1], colMeans(nomissing[ , -1]), cov(nomissing[ , -1])) cutoff <- qchisq(1-.001, ncol(nomissing[ , -1])) noout <- subset(nomissing, mahal < cutoff)
library(corrplot)
library(corrplot) corrplot(cor(noout[ , -1]))
question_text( "Do you meet the assumption of additivity?", answer("Yes", correct = TRUE), incorrect = "Yes, it appears we do not have any perfect correlations, although some are highly correlated.", try_again_button = "Modify your answer", allow_retry = TRUE )
Set up the rest of the assumption testing by doing the following:
random
variable that's the length of the noout
dataset using the rchisq()
function.fake
output using the lm()
function where in the random variable is predicted by the data.standardized
variable by using the rstudent()
function on the fake
regression.fitted
variable by using scale()
on the fitted.values
from the fake
regression. # Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5) mahal <- mahalanobis(nomissing[ , -1], colMeans(nomissing[ , -1]), cov(nomissing[ , -1])) cutoff <- qchisq(1-.001, ncol(nomissing[ , -1])) noout <- subset(nomissing, mahal < cutoff)
random <- rchisq(nrow(noout), 7) fake <- lm(random ~ ., data = noout[ , -1]) standardized <- rstudent(fake) fitted <- scale(fake$fitted.values)
Include the multivariate normality histogram by using hist()
on the standardized
variable.
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5) mahal <- mahalanobis(nomissing[ , -1], colMeans(nomissing[ , -1]), cov(nomissing[ , -1])) cutoff <- qchisq(1-.001, ncol(nomissing[ , -1])) noout <- subset(nomissing, mahal < cutoff) random <- rchisq(nrow(noout), 7) fake <- lm(random ~ ., data = noout[ , -1]) standardized <- rstudent(fake) fitted <- scale(fake$fitted.values)
hist(standardized)
question_text( "Interpret the graph. Does it indicate multivariate normality?", answer("No, it appears the data may be a bit skewed.", correct = TRUE), incorrect = "You should note that the data appears a bit skewed because of the longer tail on the right, as the data is not even centered over 0.", try_again_button = "Modify your answer", allow_retry = TRUE )
Include the multivariate QQ plot.
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5) mahal <- mahalanobis(nomissing[ , -1], colMeans(nomissing[ , -1]), cov(nomissing[ , -1])) cutoff <- qchisq(1-.001, ncol(nomissing[ , -1])) noout <- subset(nomissing, mahal < cutoff) random <- rchisq(nrow(noout), 7) fake <- lm(random ~ ., data = noout[ , -1]) standardized <- rstudent(fake) fitted <- scale(fake$fitted.values)
{qqnorm(standardized) abline(0,1)} #or plot(fake, 2)
question_text( "Interpret the graph. Does it indicate multivariate linearity?", answer("Maybe, the data appears to bend away at the edges a bit.", correct = TRUE), incorrect = "You should note that the line mostly indicates linearity with a bit of non-linearity towards the edges (i.e., it bends away from the graph around -1 and 1.", try_again_button = "Modify your answer", allow_retry = TRUE )
Include the multivariate residuals plot.
# Create percent missing function percentmiss <- function(x){ sum(is.na(x)) / length(x) * 100} # Calculate missing percentages on datascreen missing <- apply(datascreen, 1, percentmiss) nomissing <- subset(datascreen, missing <= 5) mahal <- mahalanobis(nomissing[ , -1], colMeans(nomissing[ , -1]), cov(nomissing[ , -1])) cutoff <- qchisq(1-.001, ncol(nomissing[ , -1])) noout <- subset(nomissing, mahal < cutoff) random <- rchisq(nrow(noout), 7) fake <- lm(random ~ ., data = noout[ , -1]) standardized <- rstudent(fake) fitted <- scale(fake$fitted.values)
{plot(standardized, fitted) abline(v = 1) abline(h = 1)}
question_text( "Interpret the graph. Does it indicate multivariate linearity?", answer("Yes, it mostly appears to support homogeneity and homoscedasticity.", correct = TRUE), incorrect = "You should note that more of the data is below 0 on the left side, but in general, this plot shows an even distribution of residuals.", try_again_button = "Modify your answer", allow_retry = TRUE )
On this page, you will create the submission for your instructor (if necessary). Please copy this report and submit using a Word document or paste into the text window of your submission page. Click "Generate Submission" to get your work!
encoder_logic()
encoder_ui()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.