knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Vignette Setup:

knitr::opts_chunk$set(echo = TRUE)

# Libraries necessary for this vignette
library(rio)
library(flextable)
library(dplyr)
library(tidyr)
library(semanticprimeR)
set.seed(583902)

# Function for simulation
item_power <- function(data, # name of data frame
                       dv_col, # name of DV column as a character
                       item_col, # name of the items column as a character
                       nsim = 10, # small number of simulations to keep CRAN checks fast
                       sample_start = 20, 
                       sample_stop = 200, 
                       sample_increase = 5,
                       decile = .5){

  DF <- cbind.data.frame(
    "dv" = data[ , dv_col],
    "items" = data[ , item_col]
  )

  # just in case
  colnames(DF) <- c("dv", "items")

  # figure out the "sufficiently narrow" cutoff based on the item standard errors
  SE <- tapply(DF$dv, DF$items, function (x) { sd(x)/sqrt(length(x)) })
  cutoff <- quantile(SE, probs = decile)

  # sequence of sample sizes to try
  samplesize_values <- seq(sample_start, sample_stop, sample_increase)

  # create a blank table for us to save the values in 
  sim_table <- matrix(NA, 
                      nrow = length(samplesize_values)*nsim, 
                      ncol = length(unique(DF$items)))

  # make it a data frame
  sim_table <- as.data.frame(sim_table)

  # add a place for sample size values 
  sim_table$sample_size <- NA

  iterate <- 1

  for (p in 1:nsim){
    # loop over sample sizes
    for (i in 1:length(samplesize_values)){

      # sample each item with replacement and compute the SE
      temp <- DF %>% 
        group_by(items) %>% 
        sample_n(samplesize_values[i], replace = T) %>% 
        summarize(se = sd(dv)/sqrt(length(dv)))

      # store each item's SE plus the current sample size and simulation number
      colnames(sim_table)[1:length(unique(DF$items))] <- temp$items
      sim_table[iterate, 1:length(unique(DF$items))] <- temp$se
      sim_table[iterate, "sample_size"] <- samplesize_values[i]
      sim_table[iterate, "nsim"] <- p

      iterate <- iterate + 1
    }
  }

  # proportion of items below the cutoff at each sample size, averaged over simulations
  final_sample <- sim_table %>% 
    pivot_longer(cols = -c(sample_size, nsim)) %>% 
    dplyr::rename(item = name, se = value) %>% 
    group_by(sample_size, nsim) %>% 
    summarize(percent_below = sum(se <= cutoff)/length(unique(DF$items))) %>% 
    ungroup() %>% 
    # then summarize all down averaging percents
    dplyr::group_by(sample_size) %>% 
    summarize(percent_below = mean(percent_below)) %>% 
    dplyr::arrange(percent_below) %>% 
    ungroup()

  return(list(
    SE = SE, 
    cutoff = cutoff, 
    DF = DF, 
    sim_table = sim_table, 
    final_sample = final_sample
  ))

}
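
As a minimal illustration of what item_power() returns, the sketch below runs it on a small simulated data set; the toy data frame and its column names (rating, item) are made up for this example and are not part of the Barzykowski data.

# toy example: 10 hypothetical items with 30 simulated ratings each
toy <- data.frame(
  rating = rnorm(300, mean = 4, sd = 1.5),
  item = rep(paste0("item", 1:10), each = 30)
)

toy_run <- item_power(data = toy,
                      dv_col = "rating",
                      item_col = "item",
                      nsim = 5,
                      sample_start = 20,
                      sample_stop = 50,
                      sample_increase = 10)

# final_sample shows, for each candidate sample size, the average proportion
# of items whose simulated SE fell at or below the cutoff
toy_run$final_sample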

Project/Data Title:

Cue Word Triggered Memory’s Phenomenological

Data provided by: Krystian Barzykowski

Project/Data Description:

Participants completed a voluntary memory task in which they were presented with a word cue and asked to recall an autobiographical memory in response to it. The item set consists of 30 word cues that were rated/classified by 142 separate participants.

Methods Description:

Participants briefly described the content of the thought recalled in response to the word cue and rated it on 7-point scales for: (a) to what extent the content was accompanied by unexpected physiological sensations (henceforth, physiological sensation), (b) to what extent they had deliberately tried to bring the thought to mind (henceforth, effort), (c) clarity (i.e., how clearly and well they remembered the given memory/mental content), (d) how detailed the content was, (e) how specific and concrete the content was, (f) the intensity of emotions experienced in response to the content, (g) how surprising the content was, (h) how personal it was, and (i) its relevance to their current life situation (not included).

Data Location:

Data included within this vignette.

# import all 30 sheets of the data file and stack them into one data frame
DF <- bind_rows(
  lapply(1:30, function(sheet) import("data/barzykowski_data.xlsx", sheet = sheet))
)

str(DF)
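
As a quick sanity check (an addition here, not part of the original materials), we can confirm the 30 cues and the number of unique raters using the cue and participant ID columns referenced later in this vignette.

# number of unique cue words (should be 30) and number of unique participants
length(unique(DF$`Cue no`))
length(unique(DF$`Participant's ID`))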

Date Published:

No official publication, see citation below.

Dataset Citation:

The cues were used in study published here: Barzykowski, K., Niedźwieńska, A., & Mazzoni, G. (2019). How intention to retrieve a memory and expectation that it will happen influence retrieval of autobiographical memories. Consciousness and Cognition, 72, 31-48. DOI: https://doi.org/10.1016/j.concog.2019.03.011

Keywords:

cue-word, valence, memory retrieval

Use License:

Open access with reference to original paper (Attribution-NonCommercial-ShareAlike CC BY-NC-SA)

Geographic Description - City/State/Country of Participants:

Poland, Kraków

Column Metadata:

metadata <- import("data/barzykowski_metadata.xlsx")

flextable(metadata) %>% autofit()

AIPE Analysis:

Note that the data are already in long format (each row is one rating of one cue), and therefore, we do not need to restructure the data.

Stopping Rule

In this example, we have multiple variables to choose from for our analysis. We could include several to find the sample size rules for further study. Here, we use the variables with the least and the most variability and take the average of their cutoffs at the 40th percentile (decile = .4), as suggested in our manuscript. This choice is somewhat arbitrary: in a real study, you could use only the variables you were interested in and pick the most conservative values, or simply average together estimates from all variables.

apply(DF[ , -c(1,2)], 2, sd)

These are Likert-type items, and their variability appears roughly equal. The variable with the lowest variability appears to be r names(which.min(apply(DF[ , -c(1,2)], 2, sd))), and the one with the highest appears to be r names(which.max(apply(DF[ , -c(1,2)], 2, sd))).
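
The inline values above come from the chunk below, expanded here so it can be run directly; the scale_sds name is introduced only for this illustration.

# identify the least and most variable rating scales
scale_sds <- apply(DF[ , -c(1,2)], 2, sd)
names(which.min(scale_sds))
names(which.max(scale_sds))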

Run the function proposed in the manuscript:

# set seed
set.seed(8548)
# Function for simulation
var1 <- item_power(data = DF, # name of data frame
            dv_col = "How surprising", # name of DV column as a character
            item_col = "Cue no", # number of items column as a character
            nsim = 10,
            sample_start = 20, 
            sample_stop = 100, 
            sample_increase = 5,
            decile = .4)

var2 <- item_power(DF, # name of data frame
            "Personal nature", # name of DV column as a character
            item_col = "Cue no", # number of items column as a character
            nsim = 10, 
            sample_start = 20, 
            sample_stop = 100, 
            sample_increase = 5,
            decile = .4)

What is the usual standard error for the data that could be considered for our stopping rule?

# individual SEs for how surprising 
var1$SE
# var 1 cut off
var1$cutoff

# individual SEs for personal nature
var2$SE
# var 2 cut off
var2$cutoff

# overall cutoff
cutoff <- mean(c(var1$cutoff, var2$cutoff))
cutoff

The average SE cutoff across both variables is r round(cutoff, digits = 3).

Minimum Sample Size

How large does the sample have to be for 80% to 95% of the items to be below our stopping SE rule?

cutoff_personal <- calculate_cutoff(population = DF, 
                           grouping_items = "Cue no",
                           score = "Personal nature",
                           minimum = as.numeric(min(DF$`Personal nature`)),
                           maximum = as.numeric(max(DF$`Personal nature`)))
# show that semanticprimeR's calculate_cutoff() matches the cutoff calculated by hand above
cutoff_personal$cutoff
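
As an added comparison (not in the original code), printing the hand-calculated cutoff from item_power() next to the value above makes the equivalence noted in the comment explicit.

# 40th percentile cutoff for personal nature calculated by hand via item_power()
var2$cutoff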

final_table_personal <- calculate_correction(
  proportion_summary = var2$final_sample,
  pilot_sample_size = length(unique(DF$`Participant's ID`)),
  proportion_variability = cutoff_personal$prop_var
  )

flextable(final_table_personal) %>% 
  autofit()
cutoff_surprising <- calculate_cutoff(population = DF, 
                           grouping_items = "Cue no",
                           score = "How surprising",
                           minimum = as.numeric(min(DF$`How surprising`)),
                           maximum = as.numeric(max(DF$`How surprising`)))
# show that semanticprimeR's calculate_cutoff() matches the cutoff calculated by hand above
cutoff_surprising$cutoff

final_table_surprising <- calculate_correction(
  proportion_summary = var1$final_sample,
  pilot_sample_size = length(unique(DF$`Participant's ID`)),
  proportion_variability = cutoff_surprising$prop_var
  )

flextable(final_table_surprising) %>% 
  autofit()

In this scenario, we could go with the point at which both variables meet the 80% criterion, which is $n_{personal}$ = r final_table_personal %>% filter(percent_below >= 80) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round() to $n_{surprising}$ = r final_table_surprising %>% filter(percent_below >= 80) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round(). In these scenarios, it is probably better to plan for the larger sample.
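
For readers who prefer a full chunk over inline code, the following reproduces the 80% selection rule used above (same pipeline, no new assumptions).

# smallest corrected sample size at which at least 80% of items are
# expected to fall below the SE cutoff, for each variable
final_table_personal %>% 
  filter(percent_below >= 80) %>% 
  arrange(percent_below, corrected_sample_size) %>% 
  slice_head() %>% 
  pull(corrected_sample_size) %>% 
  round()

final_table_surprising %>% 
  filter(percent_below >= 80) %>% 
  arrange(percent_below, corrected_sample_size) %>% 
  slice_head() %>% 
  pull(corrected_sample_size) %>% 
  round()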

Maximum Sample Size

If you instead require that 95% of items fall below the cutoff, you would see that items need somewhere between $n_{personal}$ = r final_table_personal %>% filter(percent_below >= 95) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round() to $n_{surprising}$ = r final_table_surprising %>% filter(percent_below >= 95) %>% arrange(percent_below, corrected_sample_size) %>% slice_head() %>% pull(corrected_sample_size) %>% round() participants. In this case, you could make the larger of the two values your maximum sample size to ensure both variables reach the criterion.

Final Sample Size

You should also consider the potential for missing or unusable data given the requirements of your study. Given that participants are likely to see all items in this study, we could use the minimum, stopping rule, and maximum defined above. However, one should consider that not all participants will be able to respond to every cue with a memory, so some additional participants may be needed to offset that loss.


