# Vignette setup ----
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
knitr::opts_chunk$set(echo = TRUE)

# Set a random seed
set.seed(3898934)

# Libraries necessary for this vignette
library(rio)
library(flextable)
library(dplyr)
library(tidyr)
library(psych)
library(semanticprimeR)

#' Simulate item-level standard errors across candidate sample sizes
#'
#' Computes the standard error (SE) of the dependent variable for every item,
#' takes the `decile` quantile of those SEs as the cutoff, and then repeatedly
#' resamples each item (with replacement) at every candidate sample size to
#' find the proportion of items whose simulated SE is at or below the cutoff.
#'
#' @param data A data frame with one row per observation.
#' @param dv_col Name of the dependent-variable column, as a character.
#' @param item_col Name of the item column, as a character.
#' @param nsim Number of simulation repetitions (kept small for CRAN).
#' @param sample_start,sample_stop,sample_increase Passed to `seq()` to build
#'   the per-item sample sizes to try.
#' @param decile Probability passed to `quantile()` to pick the cutoff SE.
#' @return A list with `SE` (per-item SEs), `cutoff` (the chosen quantile of
#'   `SE`), `DF` (the two-column working data), `sim_table` (one row per
#'   simulation x sample size, one column per item plus `sample_size` and
#'   `nsim`), and `final_sample` (mean proportion of items at or below the
#'   cutoff for each sample size, sorted ascending by that proportion).
item_power <- function(data,
                       dv_col,
                       item_col,
                       nsim = 10,
                       sample_start = 20,
                       sample_stop = 200,
                       sample_increase = 5,
                       decile = .5) {
  # `[[` yields a plain vector for data.frames and tibbles alike
  dv_values <- data[[dv_col]]
  item_values <- data[[item_col]]
  if (is.null(dv_values) || is.null(item_values)) {
    stop("`dv_col` and `item_col` must name columns of `data`.", call. = FALSE)
  }
  DF <- data.frame(dv = dv_values, items = item_values)

  # figure out the "sufficiently narrow" ci value
  SE <- tapply(DF$dv, DF$items, function(x) sd(x) / sqrt(length(x)))
  cutoff <- quantile(SE, probs = decile)

  # sequence of sample sizes to try
  samplesize_values <- seq(sample_start, sample_stop, sample_increase)

  # hoisted: the number of items does not change during the simulation
  n_items <- length(unique(DF$items))
  item_cols <- seq_len(n_items)

  # blank table to hold one row of item SEs per simulation x sample size
  sim_table <- as.data.frame(
    matrix(NA, nrow = length(samplesize_values) * nsim, ncol = n_items)
  )
  sim_table$sample_size <- NA
  sim_table$nsim <- NA

  row_index <- 1
  for (p in seq_len(nsim)) {
    # loop over sample sizes
    for (i in seq_along(samplesize_values)) {
      # resample every item with replacement, then summarize dv on items;
      # sample_n() is kept as-is so seeded results are not disturbed
      item_se <- DF %>%
        group_by(items) %>%
        sample_n(samplesize_values[i], replace = TRUE) %>%
        summarize(se = sd(dv) / sqrt(length(dv)))

      # the item order is the same for every draw, so name columns only once
      if (row_index == 1) {
        colnames(sim_table)[item_cols] <- item_se$items
      }

      sim_table[row_index, item_cols] <- item_se$se
      sim_table[row_index, "sample_size"] <- samplesize_values[i]
      sim_table[row_index, "nsim"] <- p
      row_index <- row_index + 1
    }
  }

  # proportion of items at or below the cutoff per simulation run,
  # then averaged over runs for each sample size
  final_sample <- sim_table %>%
    pivot_longer(cols = -c(sample_size, nsim)) %>%
    dplyr::rename(item = name, se = value) %>%
    group_by(sample_size, nsim) %>%
    summarize(
      percent_below = sum(se <= cutoff) / n_items,
      .groups = "drop"
    ) %>%
    dplyr::group_by(sample_size) %>%
    summarize(percent_below = mean(percent_below)) %>%
    dplyr::arrange(percent_below)

  list(
    SE = SE,
    cutoff = cutoff,
    DF = DF,
    sim_table = sim_table,
    final_sample = final_sample
  )
}
Overconfidence for picture cues in foreign language learning
Data provided by: Jason Geller
Previous research shows that participants are overconfident in their ability to learn foreign language vocabulary from pictures compared with English translations. The current study explored whether this tendency is due to processing fluency or beliefs about learning. Using self-paced study of Swahili words paired with either picture cues or English translation cues, 30 participants provided JOLs to each of the 42 English-Swahili word pairs from Carpenter and Olson’s (2012) Experiment 2. The English words were one-syllable nouns, ranging between three and six letters, with an average concreteness rating of 4.86 on a 5-point scale (SD = .16) (Brysbaert, Warriner, & Kuperman, 2014), and an average frequency of 106.52 per million (SD = 113.40) (Brysbaert & New, 2009).
Participants began the experiment with instructions informing them that they would be learning Swahili words paired with either pictures or English translations as cues. To illustrate each type of cue, they were given an example of an item (Train: Reli) that was not included among the 42 experimental items. They were informed that each pair of items (English-Swahili pairs or picture-Swahili pairs) would be presented one at a time, and they would have as much time as they needed to study it. Participants were encouraged to do their best to learn each pair, and to encourage full and meaningful processing of each, they were instructed to press the spacebar once they felt they had fully “digested” it. For each participant, 21 items were randomly selected to be presented as English-Swahili pairs, and 21 as picture-Swahili pairs. Participants saw each stimulus pair one at a time, in a unique random order with English-Swahili pairs and picture-Swahili pairs intermixed. Each pair was presented in the center of the computer screen and remained on screen until participants pressed the spacebar to move on to the next pair. After each of the 42 pairs was presented for self-paced study in this way, the same pairs were presented again for JOLs. During a JOL trial, each cue-target pair was presented on the screen and participants were asked to estimate—using a scale from 0% (definitely will NOT recall) to 100% (definitely will recall)—the likelihood of recalling the Swahili word from its cue (either the picture or English translation) after about 5 minutes. Participants entered a value between 0 and 100 and pressed the ENTER key to advance to the next item.
Data can be found here: https://osf.io/2byt9/.
# Read in the data, keeping only the columns used in this vignette
DF <- select(
  import("data/geller_data.xlsx"),
  Experiment,
  Subject,
  `CueType[1Word,2Pic]`,
  Stimulus,
  EncodeJOL
)

# Show the structure of the imported columns
str(DF)
No official publication, see citation below.
Carpenter, S. K., & Geller, J. (2020). Is a picture really worth a thousand words? Evaluating contributions of fluency and analytic processing in metacognitive judgements for pictures in foreign language vocabulary learning. Quarterly Journal of Experimental Psychology, 73(2), 211–224. https://doi.org/10.1177/1747021819879416
Brysbaert, M., Warriner, A. B., & Kuperman, V. (2014). Concreteness ratings for 40 thousand generally known English word lemmas. Behavior Research Methods, 46(3), 904–911. https://doi.org/10.3758/s13428-013-0403-5
Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis: A critical evaluation of current word frequency norms and the introduction of a new and improved word frequency measure for American English. Behavior Research Methods, 41(4), 977–990. https://doi.org/10.3758/BRM.41.4.977
Carpenter, S. K., & Geller, J. (2020). Is a picture really worth a thousand words? Evaluating contributions of fluency and analytic processing in metacognitive judgements for pictures in foreign language vocabulary learning. Quarterly Journal of Experimental Psychology, 73(2), 211–224. https://doi.org/10.1177/1747021819879416
Carpenter, S. K., & Olson, K. M. (2012). Are pictures good for learning new vocabulary in a foreign language? Only if you think they are not. Journal of Experimental Psychology: Learning, Memory, and Cognition, 38(1), 92–101. https://doi.org/10.1037/a0024828
Overconfidence, metacognition, processing fluency, analytic processing, foreign language learning
Open access with reference to original paper (Attribution-NonCommercial-ShareAlike CC BY-NC-SA)
Ames, Iowa
# Data dictionary for the columns used in this vignette, one row per variable
metadata <- tibble::tribble(
  ~Variable.Name, ~Variable.Description, ~`Type (numeric,.character,.logical,.etc.)`,
  "Experiment", "Experiment 1 (1) or 2 (2) ONLY USE 1", NA,
  "Subject", "Subject ID", "Numeric",
  "CueType", "Whether participant was presented with word translation (1) or word with picture (2)", "Numeric",
  "Stimulus", "Swahili words presented on each trial", "Character",
  "EncodeJOL", "JOL (1-100) 1=not likely to recall 100=very likely to recall", "Numeric"
)

# Render the dictionary as a table with column widths fitted to the content
flextable(metadata) %>%
  autofit()
# Keep only Experiment 1 rows that have a JOL rating
DF <- filter(DF, Experiment == 1, !is.na(EncodeJOL))

# Run the simulation on JOL ratings grouped by stimulus
var1 <- item_power(
  data = DF,
  dv_col = "EncodeJOL",
  item_col = "Stimulus",
  nsim = 10,
  sample_start = 20,
  sample_stop = 100,
  sample_increase = 5,
  decile = .4
)
What is the usual standard error for the data that could be considered for our stopping rule using the 40% decile?
# Standard error of each individual item
var1[["SE"]]

# Cutoff SE taken from the item SEs at the chosen decile
var1[["cutoff"]]
Using our 40% decile as a guide, we find that `r round(var1$cutoff, digits = 3)`
is our target standard error for an accurately measured item.
To estimate minimum sample size, we should figure out what number of participants it would take to achieve 80%, 85%, 90%, and 95% of the SEs for items below our critical score of `r round(var1$cutoff, digits = 3)`
?
# Cutoff as computed by semanticprimeR, bounded by the observed JOL range
cutoff <- calculate_cutoff(
  population = DF,
  grouping_items = "Stimulus",
  score = "EncodeJOL",
  minimum = as.numeric(min(DF[["EncodeJOL"]])),
  maximum = as.numeric(max(DF[["EncodeJOL"]]))
)

# Printed for comparison with the hand-calculated cutoff shown earlier
cutoff[["cutoff"]]

# Correct the simulated sample sizes using the pilot sample size
# (number of unique subjects) and the proportion variability from the cutoff
final_table <- calculate_correction(
  proportion_summary = var1[["final_sample"]],
  pilot_sample_size = length(unique(DF[["Subject"]])),
  proportion_variability = cutoff[["prop_var"]]
)

autofit(flextable(final_table))
Our minimum sample size is (n = `r final_table %>% filter(percent_below >= 80) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round()`
as the minimum at 80%). We could consider using 90% (n = `r final_table %>% filter(percent_below >= 90) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round()`
) or 95% (n = `r final_table %>% filter(percent_below >= 95) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round()`
).
While there are many considerations for maximum sample size (time, effort, resources), if we consider a higher value just for estimation's sake, we could use n = `r final_table %>% filter(percent_below >= 98) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round()`
at nearly 100%.
In any estimate for sample size, you should also consider the potential for missing data and/or unusable data due to any other exclusion criteria in your study (e.g., attention checks, speeding, getting the answer right, etc.). In this study, these values may be influenced by the pictures/word split in the study.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.