library("knitr")
opts_chunk$set(comment = "#>", collapse = TRUE)

library("L2TDatabase")
library("dplyr", warn.conflicts = FALSE)

# connect to the database using my cnf file
cnf_file <- file.path("../../l2t_db.cnf")
l2t <- l2t_connect(cnf_file, "backend")

First, we prepare the data-set by downloading the tables of child-study IDs, of study names, and of responses to the literacy survey. We combine those tables together to generate the raw data-set.

# Download child, study mappings
child_study <- "ChildStudy" %from% l2t %>% 
  left_join("Study" %from% l2t) %>% 
  collect

# Download literacy scores, attaching matching child-study columns
df_lit <- collect("Literacy" %from% l2t) %>% 
  inner_join(child_study)

# Keep just the literarcy related columns
df_lit <- df_lit %>% 
  select(Study, ShortResearchID, ReadingBedtime:TeachReading, 
         Exclude, ExcludeNotes)

We next examine the kinds of responses in each literacy column. (A data.frame is a list of columns so using lapply(df, func) lets us apply a function to each column.)

# Display the kinds of values in each column
df_lit %>% 
  select(ReadingBedtime:TeachReading) %>% 
  lapply(unique) %>% 
  lapply(sort)

# Count the number of NA values in each column
df_lit %>% 
  select(ReadingBedtime:TeachReading) %>% 
  lapply(function(xs) length(Filter(is.na, xs))) %>% 
  str

Re-encode the literacy responses as ordinal measures. Conveniently, the ordering of each column's values from smallest to largest is also the alphabetical order of those values.

# Ignore reading onset
df_lit$ReadingOnset <- NULL

# Set each column between from ReadingBedtime to TeachReading as ordinal
df_lit <- df_lit %>% 
  mutate_each(funs(factor(., order = TRUE)), ReadingBedtime:TeachReading)

# Confirm level orderings
df_lit %>% 
  lapply(levels) %>% 
  Filter(Negate(is.null), .)

Item Responses

Let's use the likert package to visualize responses.

library("likert")
df_likert <- df_lit

First, let's define some constants for our "codebook". We define a look-up vector of column names and column descriptions using the describe_tbl function. Three of the questions ask about the frequency of events, so we recode their 1:5 values as "Never":"Very Often". The other questions--about reading at bed time, reading outside of bed time and number of books in child's house--already have informative values, so we don't touch those.

# Used to convert column names to full-length questions
survey_description <- describe_tbl(l2t, "Literacy")
name_lookup <- survey_description$Description %>% 
  setNames(survey_description$Field)

frequency_labels <- c("Never", "Seldom", "Sometimes", "Often", "Very Often")

as_freq_q <- function(xs) {
  factor(xs, levels = 1:5, labels = frequency_labels, ordered = TRUE)
}

df_likert <- df_likert %>% 
  mutate_each(funs(as_freq_q), ReadingRequests, TeachPrinting, TeachReading)

We analyze the frequency items together:

df_freq_data <- df_likert %>% 
  select(ReadingRequests, TeachPrinting, TeachReading) %>% 
  as.data.frame
names(df_freq_data) <- name_lookup[names(df_freq_data)]

df_freq_data <- likert(df_freq_data)
plot(df_freq_data, wrap = 20)
plot(df_freq_data, type = "heat", wrap = 20)

Next, we can visualize the reading frequencies together.

df_reading <- df_likert %>% 
  select(ReadingBedtime, ReadingOther) %>% 
  as.data.frame
names(df_reading) <- name_lookup[names(df_reading)]
df_reading_data <- likert(df_reading)
plot(df_reading_data, wrap = 20)
plot(df_reading_data, type = "heat", wrap = 20)

Finally, we can visualize the number of books question.

df_books <- df_likert %>% 
  select(NumChildBooks) %>% 
  as.data.frame
names(df_books) <- name_lookup[names(df_books)]

df_books_data <- likert(df_books)
plot(df_books_data, wrap = 30)
glimpse(df_books_data$results)

Imputation

Impute missing values.

library("mice")
library("tidyr")

# Which rows have missing data
kids_with_missing_data <- df_lit %>% 
  gather(Item, Value, -Study, -ShortResearchID, -ExcludeNotes, -Exclude) %>% 
  filter(is.na(Value)) %>% 
  select(Study:ShortResearchID) %>% 
  distinct
kids_with_missing_data$ShortResearchID

# Impute with default imputation methods (it will use "polr" (proportion odds
# model) because the values are ordinal)
df_values <- df_lit %>% select(ReadingBedtime:TeachReading)
imputed <- mice(df_values, m = 10, print = FALSE)
imputed

# Combine original and imputed values
with_imputed <- df_lit %>% 
  bind_cols(mice::complete(imputed, "broad", include = TRUE)) %>% 
  # Drop unmarked columns since the ColName.0 columns have original values
  select(-(ReadingBedtime:ExcludeNotes)) %>%
  # Look only at rows with missing data
  semi_join(kids_with_missing_data) 

# Convert to long format, separate ColName.ImputationNum into separate columns,
# convert back to wide to show how each NA was imputed.
each_imputation <- with_imputed %>% 
  gather(Item, Value, -Study, -ShortResearchID) %>% 
  separate(Item, into = c("Item", "Imputation")) %>% 
  mutate(Imputation = paste0("Imp", Imputation),
         Imputation = ifelse(Imputation == "Imp0", "Raw", Imputation)) %>% 
  spread(Imputation, Value) %>% 
  filter(is.na(Raw)) %>% 
  select(ShortResearchID, Item, Raw, Imp1, Imp2:Imp9, Imp10) %>% 
  arrange(Item)

kable(each_imputation, format = "markdown")

Hmmm... the imputations look a little unstable. I need to see what else I can include here, like including a measure of age at survey administration or including other child-level measures.



LearningToTalk/L2TDatabase documentation built on June 24, 2020, 3:45 a.m.