multilex: Multilingual lexical assessment using online surveys

Documented in ml_vocabulary

# ml_vocabulary

#' Generate participant information and progress for each response
#' @export ml_vocabulary
#' @import dplyr
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr drop_na
#' @importFrom tidyr pivot_wider
#' @importFrom purrr reduce
#' @importFrom janitor clean_names
#' @importFrom rlang .data
#' @description This function generates a data frame with the vocabulary of each participant (keeping longitudinal data from the same participant in different rows). Comprehensive and productive vocabulary sizes are computed as raw counts (\code{vocab_count}) and as proportions \code{vocab_prop}, calculated from the total of items filled by the participant in the response \code{vocab_n}).
#' @param participants Participants data frame, as generated by \code{ml_participants}. If NULL (default), \code{ml_participants} is run.
#' @param responses Responses data frame, as generated by \code{ml_responses}. If NULL (default), \code{ml_responses} is run.
#' @param by A character vector that takes the name of the variable(s) to group data into. Vocabulary metrics will be calculated by aggregating responses within the groups that result from the combination of crossing of the variables provided in \code{by}. This variables can refer to item properties (see \code{pool}, e.g., "category") or to participant properties (see \code{ml_logs()}, e.g., "dominance").
#' @param scale A character vector that takes the value "count" and/or "prop". If "count" (default) vocabulary metrics are reported as counts (number of words). If "prop", vocabulary metrics are calculated as proportions?
#' @return A dataset (actually, a \code{\link[tibble]{tibble}}) with each participant's comprehensive and/or vocabulary size in each language. This data frame contains the following variables:
#' \describe{
#'      \item{id}{a character string indicating a participant's identifier. This value is always the same for each participant, so that different responses from the same participant share the same \code{id}.}
#'      \item{time}{a numeric value indicating how many times a given participant has been sent the questionnaire, regardless of whether they completed it or not.}
#'      \item{age}{a numeric value indicating the number of months elapsed since participants' birth date until they filled in the last item of their questionnaire response.}
#'      \item{type}{a character string indicating the vocabulary type computed: "understands" if option \emph{Understands} was selected, and "produces" if option \emph{Understands & Says} was selected.}
#'      \item{vocab_count_total}{integer indicating the number of items selected as \emph{Understands} or \emph{Understands and Says} in both languages.}
#'      \item{vocab_count_dominance_l1}{positive integer indicating the number of items selected as \emph{Understands} or \emph{Understands and Says} in the dominant language (L1).}
#'      \item{vocab_count_dominance_l2}{positive integer indicating the number of items selected as \emph{Understands} or \emph{Understands and Says} in the non-dominant language (L2).}
#'      \item{vocab_count_conceptual}{positive integer indicating the number of translation equivalents (aka. cross-language synonyms or doublets) in which  \emph{at list one of the items} was selected as \emph{Understands} or \emph{Understands and Says}. This is a measure of the number of lexicalised concepts.}
#'      \item{vocab_count_te}{positive integer indicating the number of translation equivalents (out of the total number of items the participant answered to) in which at \emph{both items} was selected as \emph{Understands} or \emph{Understands and Says}. This is a measure of the number of lexicalised concepts.}
#'      \item{vocab_prop_total}{numeric value ranging from 0 to 1 (both included) indicating the proportion of items selected as \emph{Understands} or \emph{Understands and Says} in both languages.}
#'      \item{vocab_prop_dominance_l1}{numeric value ranging from 0 to 1 (both included) indicating the proportion of of items selected as \emph{Understands} or \emph{Understands and Says} in the dominant language (L1).}
#'      \item{vocab_prop_dominance_l2}{numeric value ranging from 0 to 1 (both included) indicating the proportion of of items selected as \emph{Understands} or \emph{Understands and Says} in the non-dominant language (L2).}
#'      \item{vocab_prop_conceptual}{numeric value ranging from 0 to 1 (both included) indicating the proportion of of translation equivalents (aka. cross-language synonyms or doublets) in which  \emph{at list one of the items} was selected as \emph{Understands} or \emph{Understands and Says}. This is a measure of the number of lexicalised concepts.}
#'      \item{vocab_prop_te}{numeric value ranging from 0 to 1 (both included) indicating the proportion of of translation equivalents (aka. cross-language synonyms or doublets) in which at \emph{both items} was selected as \emph{Understands} or \emph{Understands and Says}. This is a measure of the number of lexicalised concepts.}
#' }
#' The specific subset of columns returned by \code{ml_vocabulary} depends on the arguments provided.
#' @author Gonzalo Garcia-Castro
#'
ml_vocabulary <- function(
  participants = NULL,
  responses = NULL,
  by = NULL,
  scale = "count"
) {

  suppressMessages({

    ml_connect() # get credentials to Google and formr

    if (!any(scale %in% c("count", "prop"))) {
      stop("Argument scale must be 'count' and/or 'prop'")
    }

    if (is.null(responses)) {
      if (is.null(participants)) {
        participants <- ml_participants()
      }
      responses <- ml_responses()
    }

    logs <- ml_logs(participants, responses) %>%
      filter(.data$id %in% unique(responses$id)) # get logs

    vocab_base <- responses %>%
      mutate(
        understands = ifelse(is.na(.data$response), NA, .data$response %in% c(2, 3)),
        produces = ifelse(is.na(.data$response), NA, .data$response %in% c(3))
      ) %>%
      select(-.data$response) %>%
      pivot_longer(
        c(.data$understands, .data$produces),
        names_to = "type",
        values_to = "response"
      ) %>%
      drop_na(.data$response) %>%
      left_join(select(pool, one_of("item", "te", "language", by))) %>%
      left_join(select(logs, one_of("id", "time", by))) %>%
      mutate(item_dominance = ifelse(.data$language==.data$dominance, "L1", "L2")) %>%
      select(one_of("id", "time", "age", "item_dominance", "type", "te", "item", by, "response"))

    # total vocabulary
    vocab_total <- vocab_base %>%
      group_by_at(c("id", "time", "age", "type", by)) %>%
      summarise(
        vocab_count_total = sum(.data$response, na.rm = TRUE),
        vocab_n_total = n(),
        .groups = "drop"
      ) %>%
      mutate(vocab_prop_total = ifelse(.data$vocab_n_total==0, 0, .data$vocab_count_total/.data$vocab_n_total))

    # total vocabulary in Catalan
    vocab_total_dominance <- vocab_base %>%
      group_by_at(c("id", "time", "age", "type", "item_dominance", by), .drop = FALSE) %>%
      summarise(
        vocab_count_dominance = sum(.data$response, na.rm = TRUE),
        n_total = sum(!is.na(.data$response)),
        .groups = "drop"
      ) %>%
      mutate(vocab_prop_dominance = ifelse(.data$n_total==0, 0, .data$vocab_count_dominance/.data$n_total)) %>%
      pivot_wider(names_from = .data$item_dominance, values_from = c(n_total, matches("vocab"))) %>%
      clean_names() %>%
      select(id, time, age, type, starts_with("vocab_count"), starts_with("vocab_prop"), any_of(by))

    # conceptual vocabulary
    n_total <- vocab_base %>%
      distinct_at(c("id", "time", "age", "te", by)) %>%
      group_by_at(c("id", "time", "age", by), .drop = FALSE) %>%
      mutate(n_total = n()) %>%
      ungroup() %>%
      select_at(c("id", "time", "te", by, "n_total"))

    vocab_conceptual <- vocab_base %>%
      left_join(n_total) %>%
      filter(response) %>%
      group_by_at(c("id", "time", "age", "type", "te", "n_total", by), .drop = FALSE) %>%
      summarise(n = n(), .groups = "drop") %>%
      group_by_at(c("id", "time", "type", "age", "n_total", by), .drop = FALSE) %>%
      summarise(n = n(), .groups = "drop")  %>%
      rename(vocab_count_conceptual = n) %>%
      mutate(
        vocab_count_conceptual = as.integer(vocab_count_conceptual),
        vocab_prop_conceptual = .data$vocab_count_conceptual/.data$n_total
      ) %>%
      select(id, time, age, type, starts_with("vocab_count"), starts_with("vocab_prop"), any_of(by))

    # TE vocabulary
    vocab_te <- vocab_base %>%
      left_join(n_total) %>%
      filter(response) %>%
      group_by_at(c("id", "time", "age", "type", "te", "n_total", by), .drop = FALSE) %>%
      summarise(n = n(), .groups = "drop") %>%
      filter(n > 1) %>%
      group_by_at(c("id", "time", "age", "type", "n_total", by), .drop = FALSE) %>%
      summarise(n = n(), .groups = "drop") %>%
      rename(vocab_count_te = n) %>%
      mutate(
        vocab_prop_te = .data$vocab_count_te/.data$n_total,
        vocab_count_te = as.integer(vocab_count_te)
      ) %>%
      select(id, time, age, type, starts_with("vocab_count"), starts_with("vocab_prop"), any_of(by))

    # merge all datasets
    vocab <- reduce(
      list(
        vocab_total,
        vocab_total_dominance,
        vocab_conceptual,
        vocab_te
      ),
      left_join,
      by = c("id", "time", "age", "type", by)
    ) %>%
      mutate(across(matches("conceptual|te"), ~ifelse(is.na(.), as.integer(0), .))) %>%
      select(one_of("id", "time", "age", "type", by), matches(scale)) %>%
      arrange(desc(id), time, type)

  })
  return(vocab)

}