In LearningToTalk/L2TDatabase: Helper Functions for the Learning-To-Talk 'MySQL' Database

library("knitr")
opts_chunk$set(
  collapse = TRUE, 
  comment = "#>", 
  message = FALSE,
  fig.width = 8,
  fig.height = 6,
  dpi = 300,
  out.width = "80%")

wd <- rprojroot::find_rstudio_root_file()
opts_knit$set(root.dir = wd)

Download the matches

Load in the list of participants. The table of matches is stored in a table called CIMatching in the database. We connect to the database, grab that table and the test scores for the children in that table.

library(dplyr)
library(L2TDatabase)

# We use the main RStudio project directory as the working folder 
# for this session.
wd <- rprojroot::find_rstudio_root_file()
dir_here <- file.path(wd, "inst", "analyses", "rwl_ci_nh_matches")
cnf_file <- file.path(wd, "inst", "l2t_db.cnf")

# Connect to database
l2t_main <- l2t_connect(cnf_file, "l2t")

# Download matches and scores from various tests
df_matches <- tbl(l2t_main, "CIMatching") %>% 
  left_join(tbl(l2t_main, "EVT")) %>% 
  left_join(tbl(l2t_main, "PPVT")) %>% 
  left_join(tbl(l2t_main, "GFTA")) %>% 
  select(Group = Matching_Group, Matching_PairNumber, ChildStudyID, 
         Study, ResearchID, ChildID,
         Female, AAE, LateTalker, CImplant, 
         Maternal_Education, Maternal_Education_Level, 
         EVT_Age, EVT_Raw, EVT_GSV, EVT_Standard,
         PPVT_Age, PPVT_Raw, PPVT_GSV, PPVT_Standard, 
         GFTA_Age:GFTA_Standard) %>% 
  collect()
df_matches

We can confirm adequate matching on (EVT) Age and sex.

df_matches %>% 
  group_by(Group) %>% 
  summarise(
    N = n(),
    CI = sum(CImplant),
    N_Female = sum(Female),
    N_Male = sum(Female == 0),
    N_EVT = sum(!is.na(EVT_Age)),
    EVT_Age = mean(EVT_Age, na.rm = TRUE),
    N_PPVT = sum(!is.na(PPVT_Age)),
    PPVT_Age = mean(PPVT_Age, na.rm = TRUE),
    N_GFTA = sum(!is.na(GFTA_Age)),
    GFTA_Age = mean(GFTA_Age, na.rm = TRUE)) %>% 
  knitr::kable()

We also matched by general maternal education.

df_matches %>% 
  count(Group, Maternal_Education) %>% 
  ungroup() %>%
  tidyr::spread(Group, n) %>% 
  knitr::kable()

We can also compute summary statistics.

df_matches %>% 
  select(Group, EVT_Age, EVT_Standard, PPVT_Standard, GFTA_Standard) %>% 
  # Convert to long format to compute summaries by group by score type
  tidyr::gather(Variable, Value, -Group) %>% 
  tidyr::drop_na(Value) %>% 
  group_by(Variable, Group) %>% 
  summarise(N_Values = n(), Mean = mean(Value), SD = sd(Value), 
            Min = min(Value), Max = max(Value)) %>% 
  # Round off decimals
  mutate_each_(funs(round), vars = vars(Mean, SD, Min, Max)) %>% 
  ungroup() %>% 
  knitr::kable()

Download the eyetracking data

This is a tedious step. We need to download the eyetracking data. First, let's connect to the database and prepare some queries.

l2t_eyetracking <- l2t_connect(cnf_file, "eyetracking")

# We could do this with the prepared queries (q_BlocksByStudy, etc.), but the 
# query q_LooksByStudy takes forever to run. So we will select identify the
# blocks we want and get the trials and looks for just those blocks. That should
# be faster than start with all the data and narrowing down to the subset we
# want.

# Find the numbers of the blocks 
tbl_blocks <- tbl(l2t_eyetracking, "Blocks") %>% 
  filter(ChildStudyID %in% df_matches$ChildStudyID) %>% 
  select(BlockID, ChildStudyID, Block_Basename, 
         Block_DateTime, Block_Task, Block_Version, Block_Age)

# Get the attributes for these blocks
tbl_blocks_attrs <- tbl(l2t_eyetracking, "BlockAttributes") %>% 
  inner_join(tbl_blocks) %>% 
  select(ChildStudyID, BlockID, BlockAttribute_Name, BlockAttribute_Value)

# Get trial id numbers for these blocks
tbl_trials <- tbl(l2t_eyetracking, "Trials") %>% 
  inner_join(tbl_blocks) %>% 
  select(ChildStudyID, BlockID, TrialID, Trial_TrialNo)

# Get attributes of the trials
tbl_trials_attrs <- tbl(l2t_eyetracking, "TrialAttributes") %>% 
  inner_join(tbl_trials) %>% 
  select(ChildStudyID, BlockID, TrialID, 
         TrialAttribute_Name, TrialAttribute_Value)

# The big one. Get the looking data for these trials.
tbl_looks <- tbl(l2t_eyetracking, "Looks") %>% 
  inner_join(tbl_trials) %>% 
  select(ChildStudyID, BlockID, TrialID, Time, GazeByImageAOI, GazeByAOI)

Downloading the data takes forever, so I'm going set a flag called refresh. When refresh is true, it will redownload the eyetracking data with those queries. Otherwise, it will load the last copy that I saved.

refresh <- TRUE

if (!refresh) {
  df_looks <- readr::read_csv(file.path(dir_here, "looks.csv"))
  df_blocks <- df_looks %>% 
    select(Study, ShortResearchID, Block_Age, 
           Block_Basename, Block_Dialect) %>% 
    distinct()
} else {

  df_blocks <- collect(tbl_blocks) %>% 
    left_join(df_matches) %>%
    group_by(Block_Task, ChildStudyID) %>%
    # We want one age per child, so use earliest. This might be dubious.
    mutate(Block_Age = min(Block_Age)) %>%
    ungroup()

  # Get the dialect and stimulus version
  df_blocks_attrs <- collect(tbl_blocks_attrs) %>% 
    # Pivot from long to wide so have the attributes we want
    tidyr::spread(BlockAttribute_Name, BlockAttribute_Value) %>% 
    select(ChildStudyID, BlockID, 
           Block_Dialect = Dialect, StimulusSet)

  # Add dialect to block info
  df_blocks <- df_blocks %>% 
    select(Block_Task, Study, ResearchID, BlockID, Block_Age, 
           Block_Basename, Block_Version) %>% 
    left_join(df_blocks_attrs) 

  df_trials <- collect(tbl_trials)


  df_trials_attrs <- collect(tbl_trials_attrs, n = Inf)

  df_looks <- collect(tbl_looks, n = Inf)

  df_looks <- df_blocks %>% 
    left_join(df_trials) %>% 
    left_join(df_looks) %>% 
    select(-BlockID, -ChildStudyID, -TrialID)

  # readr::write_csv(df_looks, file.path(dir_here, "looks.csv"))
}

Data-screening for each eyetracking experiment

We exclude blocks with the TimePoint1 version of the experiment stimuli.

df_bad_version <- df_blocks %>% 
  filter(StimulusSet == "TP1") %>% 
  select(Block_Task:ResearchID, Block_Basename, StimulusSet) %>% 
  print()

We identify blocks with more than 50% missing data during some analysis window (here 300--1800 ms).

df_blocks_to_drop <- df_looks %>% 
  anti_join(df_bad_version) %>% 
  # Offset by 20 ms because the data binned into 50ms bins: I.e., the frame at
  # 285ms is part of the [285, 300, 315] ms bin, so that frame needs to part of
  # the data screening.
  filter(between(Time, 280, 1820)) %>% 
  lookr::AggregateLooks(Block_Task + Study + ResearchID + 
                          Block_Basename ~ GazeByImageAOI) %>% 
  as_data_frame() %>% 
  filter(PropNA > .5) %>% 
  select(Block_Task:Block_Basename, PropNA) %>% 
  mutate(PropNA = round(PropNA, 3)) %>% 
  print(n = Inf)

Next, we drop individual trials with more than 50% missing data.

df_trials_to_drop <- df_looks %>% 
  anti_join(df_blocks_to_drop) %>% 
  filter(between(Time, 280, 1820)) %>% 
  lookr::AggregateLooks(Block_Task + Study + ResearchID + Block_Basename + 
                          Trial_TrialNo ~ GazeByImageAOI) %>% 
  as_data_frame() %>% 
  filter(PropNA > .5) %>% 
  select(Block_Task:Trial_TrialNo, PropNA) %>% 
  mutate(PropNA = round(PropNA, 3)) %>% 
  print()

df_looks <- df_looks %>% 
  anti_join(df_blocks_to_drop) %>% 
  anti_join(df_trials_to_drop)

Next, we need to exclude participants who no longer have a match.

df_leftover <- df_looks %>% 
  distinct(Block_Task, Study, ResearchID) %>% 
  inner_join(df_matches)

df_leftover %>% 
  count(Block_Task, Group) %>% 
  ungroup() %>% 
  rename(NumChildren = n)

df_singletons <- df_leftover %>% 
  count(Block_Task, Matching_PairNumber) %>% 
  ungroup() %>% 
  rename(NumChildrenInPair = n) %>% 
  filter(NumChildrenInPair == 1)
df_singletons


df_looks <- df_looks %>% 
  inner_join(df_matches %>% 
               select(Group, Matching_PairNumber, Study, ResearchID)) %>% 
  anti_join(df_singletons)

Now there will be the same number of children in each group x task.

df_looks %>% 
  distinct(Block_Task, Group, Study, ResearchID) %>% 
  count(Block_Task, Group) %>% 
  ungroup() %>% 
  rename(NumChildren = n)

# Make sure there are 2 children in every matching pair
df_looks %>% 
  distinct(Matching_PairNumber, Block_Task, Group, Study, ResearchID) %>% 
  count(Block_Task, Matching_PairNumber) %>% 
  ungroup() %>% 
  rename(NumChildrenInPair = n) %>% 
  filter(NumChildrenInPair != 2)

Finally, we have to separate the RWL and the MP data, so that we can attach the information about the trials to eyetracking data.

df_mp <- df_looks %>% 
  filter(Block_Task == "MP")

df_rwl <- df_looks %>% 
  filter(Block_Task == "RWL")

df_trial_info <- df_trials_attrs %>% 
  left_join(df_trials) %>% 
  left_join(df_blocks) %>% 
  semi_join(df_looks)

# Include the MP trial information
df_mp_trial_info <- df_trial_info %>% 
  filter(Block_Task == "MP") %>% 
  tidyr::spread(TrialAttribute_Name, TrialAttribute_Value) %>% 
  select(Study, ResearchID, Block_Basename, Trial_TrialNo, 
         Condition = StimType, WordGroup, TargetWord, Bias_ImageAOI, 
         DistractorImage, FamiliarImage, UnfamiliarImage, 
         ImageL, ImageR, TargetImage)

df_mp <- df_mp %>% 
  left_join(df_mp_trial_info)


# Do the same for the RWL data
df_rwl_trial_info <- df_trial_info %>% 
  filter(Block_Task == "RWL") %>% 
  tidyr::spread(TrialAttribute_Name, TrialAttribute_Value) %>% 
  select(Study, ResearchID, Block_Basename, Trial_TrialNo, 
         TargetImage, Bias_ImageAOI,
         starts_with("Image"), starts_with("Stimulus"), starts_with("Word"))

df_rwl <- df_rwl %>% 
  left_join(df_rwl_trial_info)

Look at the RWL data

Let's downsample into 50ms bins, and aggregate the number of looks.

df_bins <- df_rwl %>% 
  distinct(Time) %>%
  # Need a number of frames divisible three. Time 0 should be center of its bin
  filter(between(Time, -917, 1975)) %>% 
  arrange(Time) %>% 
  mutate(Bin = lookr::AssignBins(Time, bin_width = 3)) %>%
  group_by(Bin) %>% 
  # Round to nearest 50ms
  mutate(BinTime = Time %>% median() %>% round(-1)) %>% 
  ungroup()

df_binned <- df_rwl %>% 
  inner_join(df_bins) %>%
  select(Study, ResearchID, Trial_TrialNo, 
         Time, BinTime, GazeByImageAOI, GazeByAOI) %>% 
  lookr::AggregateLooks(Study + ResearchID + BinTime ~ GazeByImageAOI) %>% 
  as_data_frame() %>% 
  mutate(Looks_Images = Target + Others,
         Prop_Target = Target / Looks_Images,
         Prop_PhonologicalFoil = PhonologicalFoil / Looks_Images,
         Prop_SemanticFoil = SemanticFoil / Looks_Images,
         Prop_Unrelated = Unrelated / Looks_Images)
df_binned

If we want to plot a growth curve for image type (target, semantic foil, etc.), we need to reshape into a long format to have a Proportion column and an Image type column.

df_looks_to_aois <- df_binned %>% 
  select(Study, ResearchID, Time = BinTime, starts_with("Prop_")) %>% 
  tidyr::gather(AOI, Proportion, -Study, -ResearchID, -Time) %>% 
  mutate(AOI = AOI %>% 
           stringr::str_replace("Prop_", "") %>% 
           stringr::str_replace("Foil", "")) 

df_looks_to_aois$AOI <- factor(
  df_looks_to_aois$AOI, 
  levels = c("Target", "Phonological", "Semantic", "Unrelated"))

Do some spaghetti plots

library(ggplot2)
df_looks_to_aois <- left_join(df_looks_to_aois, df_matches) %>% 
  mutate(LineGroup = interaction(Study, ResearchID, AOI))

curr_labs <- labs(
  x = "Time after target onset (ms.)", 
  y = "Proportion of looks", 
  color = "Image") 

theme_aligned <- theme(
  axis.title.x = element_text(hjust = .995), 
  axis.title.y = element_text(hjust = .995))

p_theme <- theme_grey(base_size = 11) + 
  theme(legend.position = "bottom") + 
  theme_aligned

p1 <- ggplot(df_looks_to_aois) + 
  aes(x = Time, y = Proportion, color = AOI, group = LineGroup) + 
  geom_hline(yintercept = .25, size = 2, color = "white") + 
  geom_line() + 
  viridis::scale_color_viridis(discrete = TRUE, option = "inferno", end = .9) +
  facet_wrap("Group") +
  p_theme + curr_labs
p1

Replace data-set in last plot with one with a narrow time window.

df_looks_to_aois_window <- df_looks_to_aois %>% 
  filter(between(Time, 300, 1800))

p1 %+% df_looks_to_aois_window

Show average of each participant's lines.

p2 <- ggplot(df_looks_to_aois) + 
  aes(x = Time, y = Proportion, color = AOI) + 
  geom_hline(yintercept = .25, size = 2, color = "white") + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  facet_wrap("Group") +
  p_theme + curr_labs + 
  viridis::scale_color_viridis(discrete = TRUE, option = "inferno", end = .9)
p2

p2b <- ggplot(df_looks_to_aois_window) + 
  aes(x = Time, y = Proportion, color = AOI) + 
  geom_hline(yintercept = .25, size = 2, color = "white") + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  facet_wrap("Group") +
  p_theme + curr_labs + 
  viridis::scale_color_viridis(discrete = TRUE, option = "inferno", end = .9)
p2b

Actually, we don't need to facet them.

p3 <- ggplot(df_looks_to_aois) + 
  aes(x = Time, y = Proportion, color = AOI, shape = Group) + 
  geom_hline(yintercept = .25, size = 2, color = "white") + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  p_theme + curr_labs + 
  viridis::scale_color_viridis(discrete = TRUE, option = "inferno", end = .9)
p3

And zoom in.

p4 <- ggplot(df_looks_to_aois_window) + 
  aes(x = Time, y = Proportion, color = AOI, shape = Group) + 
  geom_hline(yintercept = .25, size = 2, color = "white") + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  p_theme + curr_labs + 
  viridis::scale_color_viridis(discrete = TRUE, option = "inferno", end = .9)
p4

Compare each mean of curve.

p5 <- ggplot(df_looks_to_aois_window) + 
  aes(x = Time, y = Proportion, linetype = Group) + 
  geom_hline(yintercept = .25, size = 2, color = "white") + 
  stat_summary(fun.y = mean, geom = "line", size = 1.25) +
  facet_wrap("AOI") +
  p_theme + curr_labs
p5

Look at the MP data

df_mp %>% 
  distinct(Study, ResearchID, Group, Matching_PairNumber) %>% 
  inner_join(df_matches) %>% 
  distinct(Group, ChildID) %>% 
  count(Group)

df_mp %>% 
  distinct(Study, ResearchID, Group, Matching_PairNumber) %>% 
  inner_join(df_matches) %>% 
  count(Group, ChildID) %>% 
  ungroup() %>% 
  count(Group, n)

df_mp %>% 
  distinct(Study, ResearchID, Group, Matching_PairNumber) %>% 
  inner_join(df_matches) %>% 
  group_by(Group) %>% 
  summarise(MinAge = min(EVT_Age), MaxAge = max(EVT_Age))

Similar steps as above. Let's downsample into 50ms bins, and aggregate the number of looks.

df_bins <- df_mp %>% 
  distinct(Time) %>%
  # Need a number of frames divisible three. Time 0 should be center of its bin
  filter(between(Time, -917, 1975)) %>% 
  arrange(Time) %>% 
  mutate(Bin = lookr::AssignBins(Time, bin_width = 3)) %>%
  group_by(Bin) %>% 
  # Round to nearest 50ms
  mutate(BinTime = Time %>% median() %>% round(-1)) %>% 
  ungroup()

df_binned <- df_mp %>% 
  inner_join(df_bins) %>%
  select(Study, ResearchID, Condition, Trial_TrialNo, 
         Time, BinTime, GazeByImageAOI, GazeByAOI) %>% 
  lookr::AggregateLooks(Condition + Study + 
                          ResearchID + BinTime ~ GazeByImageAOI) %>% 
  as_data_frame() %>% 
  rename(Time = BinTime)
df_binned

Prepare for plotting.

df_plot <- df_binned %>% 
  inner_join(df_matches) %>% 
  mutate(PlotGroup = ifelse(Group == "NormalHearing", 
                            "Normal hearing", "Cochlear implant")) %>% 
  filter(between(Time, 0, 2000))

condition_labels <- c(
  "real" = "real word",
  "MP" = "mispronunication",
  "nonsense" = "nonword"
)

df_plot$heard <- df_plot$Condition %>% 
  factor(levels = names(condition_labels), labels = condition_labels)

curr_labs <- labs(
  x = "Time after target onset (ms.)", 
  y = "Proportion of looks to familiar image", 
  color = "Child hears") 

theme_aligned <- theme(
  axis.title.x = element_text(hjust = .995), 
  axis.title.y = element_text(hjust = .995))

p_theme <- theme_grey(base_size = 11) + 
  theme(legend.position = "top",
        legend.justification = "left") + 
  theme_aligned

Show average of each participant's lines.

p2 <- ggplot(df_plot) + 
  aes(x = Time, y = Proportion, color = heard) + 
  geom_hline(yintercept = .5, size = 2, color = "white") + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  facet_wrap("PlotGroup") +
  p_theme + curr_labs + 
  viridis::scale_color_viridis(discrete = TRUE, option = "viridis", end = .8)
p2

Actually, we don't need to facet them.

p3 <- ggplot(df_plot) + 
  aes(x = Time, y = Proportion, color = heard, shape = PlotGroup) + 
  geom_hline(yintercept = .5, size = 2, color = "white") + 
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  p_theme + curr_labs + 
  viridis::scale_color_viridis(discrete = TRUE, option = "viridis", end = .8) +
  labs(shape = "Group")
p3

df_model_a <- df_plot %>% 
  filter(between(Time, 250, 1500), 
         Condition != "nonword") %>% 
  mutate(Group = factor(Group, c("NormalHearing", "CochlearImplant")))

library(rstanarm)
options(mc.cores = parallel::detectCores())

m <- stan_glmer(
  cbind(Target, Distractor) ~ poly(Time, 3) * Condition * Group + 
    (poly(Time, 3) | ChildStudyID:Condition),
  family = binomial, 
  prior = normal(0, 40),
  prior_covariance = decov(regularization = 2),
  data = df_model_a)
save(m, "model.Rdata")

Mispronunciation effects

schemes <- tibble::tribble(
  ~ MPLabel, ~ WordGroup, ~ Contrast,
  "rice-[w]ice", "rice", "ɹ-w",
  "shoes-[s]oes", "shoes", "ʃ-s",
  "girl-[d]irl", "girl", "g-d",
  "soup-[ʃ]oup", "soup", "ʃ-s",
  "cake-[g]ake", "cake", "k-g",
  "duck-[g]uck", "duck", "g-d",
  "dog-[t]og", "dog", "t-d"
)

df_items <- df_mp %>% 
  inner_join(df_bins) %>%
  filter(Condition != "nonsense") %>% 
  select(WordGroup, Study, ResearchID, Condition, Trial_TrialNo, 
         Time, BinTime, GazeByImageAOI, GazeByAOI) %>% 
  lookr::AggregateLooks(WordGroup + Condition + Study + 
                          ResearchID + BinTime ~ GazeByImageAOI) %>% 
  as_data_frame() %>% 
  rename(Time = BinTime) %>% 
  left_join(schemes) %>% 
  filter(Contrast != "t-d")
df_items

df_items %>% count(WordGroup, Contrast)

Prepare for plotting.

df_plot_items <- df_items %>% 
  inner_join(df_matches) %>% 
  mutate(PlotGroup = ifelse(Group == "NormalHearing", 
                            "Normal hearing", "Cochlear implant")) %>% 
  filter(between(Time, 0, 2000))

condition_labels <- c(
  "real" = "real word",
  "MP" = "mispronunication",
  "nonsense" = "nonword"
)

df_plot_items$heard <- df_plot_items$Condition %>% 
  factor(levels = names(condition_labels), labels = condition_labels)

p2 <- ggplot(df_plot_items) + 
  aes(x = Time, y = Proportion, color = heard, linetype = PlotGroup) + 
  geom_hline(yintercept = .5, size = 2, color = "white") + 
  stat_summary(fun.y = mean, geom = "line", size = 1) +
  facet_wrap("MPLabel") +
  p_theme + curr_labs + labs(linetype = "Group") +
  viridis::scale_color_viridis(discrete = TRUE, option = "viridis", end = .8)
p2

LearningToTalk/L2TDatabase documentation built on June 24, 2020, 3:45 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

LearningToTalk/L2TDatabase
Helper Functions for the Learning-To-Talk 'MySQL' Database

In LearningToTalk/L2TDatabase: Helper Functions for the Learning-To-Talk 'MySQL' Database

Download the matches

Download the eyetracking data

Data-screening for each eyetracking experiment

Look at the RWL data

Look at the MP data

Mispronunciation effects

R Package Documentation

Browse R Packages

We want your feedback!

LearningToTalk/L2TDatabase Helper Functions for the Learning-To-Talk 'MySQL' Database

In LearningToTalk/L2TDatabase: Helper Functions for the Learning-To-Talk 'MySQL' Database

Download the matches

Download the eyetracking data

Data-screening for each eyetracking experiment

Look at the RWL data

Look at the MP data

Mispronunciation effects

R Package Documentation

Browse R Packages

We want your feedback!

LearningToTalk/L2TDatabase
Helper Functions for the Learning-To-Talk 'MySQL' Database