# Setup: knitr chunk options, required packages, and the database connection.
library("knitr")
opts_chunk$set(comment = "#>", collapse = TRUE)
library("L2TDatabase")
library("dplyr", warn.conflicts = FALSE)

# connect to the database using my cnf file
# (credentials live outside the repository; "backend" is the schema name)
cnf_file <- file.path("../../l2t_db.cnf")
l2t <- l2t_connect(cnf_file, "backend")
First, we prepare the dataset by downloading the tables of child-study IDs, study names, and responses to the literacy survey. We combine those tables to generate the raw dataset.
# Download child, study mappings. The joins are "natural" joins on all
# shared column names (no explicit `by =`), so they rely on the two tables
# sharing exactly the intended key columns -- verify against the schema.
child_study <- "ChildStudy" %from% l2t %>%
  left_join("Study" %from% l2t) %>%
  collect

# Download literacy scores, attaching matching child-study columns
df_lit <- collect("Literacy" %from% l2t) %>%
  inner_join(child_study)

# Keep just the literacy-related columns
df_lit <- df_lit %>%
  select(Study, ShortResearchID, ReadingBedtime:TeachReading,
         Exclude, ExcludeNotes)
We next examine the kinds of responses in each literacy column. (A data.frame is a list of columns, so using `lapply(df, func)` lets us apply a function to each column.)
# Display the kinds of values in each column
# (each survey column is inspected independently via lapply)
df_lit %>%
  select(ReadingBedtime:TeachReading) %>%
  lapply(unique) %>%
  lapply(sort)

# Count the number of NA values in each column.
# sum(is.na(xs)) is the idiomatic NA count (replaces length(Filter(is.na, xs))).
df_lit %>%
  select(ReadingBedtime:TeachReading) %>%
  lapply(function(xs) sum(is.na(xs))) %>%
  str
Re-encode the literacy responses as ordinal measures. Conveniently, the ordering of each column's values from smallest to largest is also the alphabetical order of those values.
# Ignore reading onset
df_lit$ReadingOnset <- NULL

# Set each column from ReadingBedtime to TeachReading as ordinal.
# Fixes: the original passed `order = TRUE`, which only worked through
# partial matching of factor()'s `ordered` argument; and `mutate_each()`
# with `funs()` has been removed from dplyr -- `across()` is the
# supported replacement.
df_lit <- df_lit %>%
  mutate(across(ReadingBedtime:TeachReading, ~ factor(.x, ordered = TRUE)))

# Confirm level orderings (non-factor columns return NULL and are dropped)
df_lit %>%
  lapply(levels) %>%
  Filter(Negate(is.null), .)
Let's use the likert package to visualize the survey responses.
library("likert")
# Work on a copy so the likert-specific recoding below leaves df_lit intact
df_likert <- df_lit
First, let's define some constants for our "codebook". We define a look-up
vector of column names and column descriptions using the describe_tbl
function. Three of the questions ask about the frequency of events, so we recode
their 1:5 values as "Never":"Very Often". The other questions--about reading at
bed time, reading outside of bed time and number of books in child's
house--already have informative values, so we don't touch those.
# Used to convert column names to full-length questions
survey_description <- describe_tbl(l2t, "Literacy")
name_lookup <- survey_description$Description %>%
  setNames(survey_description$Field)

# Labels for the three 1:5 frequency questions, smallest to largest
frequency_labels <- c("Never", "Seldom", "Sometimes", "Often", "Very Often")

# Recode a 1:5 response vector as an ordered factor with frequency labels
as_freq_q <- function(xs) {
  factor(xs, levels = 1:5, labels = frequency_labels, ordered = TRUE)
}

# Recode the frequency questions. `mutate_each()`/`funs()` have been
# removed from dplyr; `across()` is the supported replacement.
df_likert <- df_likert %>%
  mutate(across(c(ReadingRequests, TeachPrinting, TeachReading), as_freq_q))
We analyze the frequency items together:
# Gather the three frequency questions into a plain data.frame (likert()
# does not accept tibbles), swap column names for the full question text,
# then summarize and plot.
freq_items <- as.data.frame(
  select(df_likert, ReadingRequests, TeachPrinting, TeachReading)
)
names(freq_items) <- name_lookup[names(freq_items)]

df_freq_data <- likert(freq_items)
plot(df_freq_data, wrap = 20)
plot(df_freq_data, type = "heat", wrap = 20)
Next, we can visualize the reading frequencies together.
# Visualize the two reading-frequency questions (bedtime and other)
# together, with full question text as column names.
reading_items <- as.data.frame(
  select(df_likert, ReadingBedtime, ReadingOther)
)
names(reading_items) <- name_lookup[names(reading_items)]

df_reading_data <- likert(reading_items)
plot(df_reading_data, wrap = 20)
plot(df_reading_data, type = "heat", wrap = 20)
Finally, we can visualize the number of books question.
# Single-item likert summary for the number-of-books question.
book_item <- as.data.frame(select(df_likert, NumChildBooks))
names(book_item) <- name_lookup[names(book_item)]

df_books_data <- likert(book_item)
plot(df_books_data, wrap = 30)

# Peek at the tabulated response proportions behind the plot
glimpse(df_books_data$results)
Impute missing values.
library("mice")
library("tidyr")

# Which rows have missing data: melt the survey items to long format,
# keep the NA rows, and record each affected child once.
kids_with_missing_data <- df_lit %>%
  gather(Item, Value, -Study, -ShortResearchID, -ExcludeNotes, -Exclude) %>%
  filter(is.na(Value)) %>%
  select(Study:ShortResearchID) %>%
  distinct
kids_with_missing_data$ShortResearchID

# Impute with default imputation methods (it will use "polr" (proportion odds
# model) because the values are ordinal)
df_values <- df_lit %>%
  select(ReadingBedtime:TeachReading)
imputed <- mice(df_values, m = 10, print = FALSE)
imputed

# Combine original and imputed values. "broad" format with include = TRUE
# yields one ColName.i column per imputation, where i = 0 is the raw data.
with_imputed <- df_lit %>%
  bind_cols(mice::complete(imputed, "broad", include = TRUE)) %>%
  # Drop unmarked columns since the ColName.0 columns have original values
  select(-(ReadingBedtime:ExcludeNotes)) %>%
  # Look only at rows with missing data
  semi_join(kids_with_missing_data)

# Convert to long format, separate ColName.ImputationNum into separate columns,
# convert back to wide to show how each NA was imputed.
# (Imp0 is relabeled "Raw" so the original value sits beside each imputation.)
each_imputation <- with_imputed %>%
  gather(Item, Value, -Study, -ShortResearchID) %>%
  separate(Item, into = c("Item", "Imputation")) %>%
  mutate(Imputation = paste0("Imp", Imputation),
         Imputation = ifelse(Imputation == "Imp0", "Raw", Imputation)) %>%
  spread(Imputation, Value) %>%
  filter(is.na(Raw)) %>%
  # Explicit column order keeps Imp10 after Imp9 (alphabetical would not)
  select(ShortResearchID, Item, Raw, Imp1, Imp2:Imp9, Imp10) %>%
  arrange(Item)
kable(each_imputation, format = "markdown")
Hmmm... the imputations look a little unstable. I need to see what else I can include here, like including a measure of age at survey administration or including other child-level measures.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.