# multilex: Multilingual lexical assessment using online surveys

# Documented in ml_norms

# ml_norms: Compute item norms

#' Generate item-level norms for age, sex, language profile and item dominance.
#' @export ml_norms
#' @importFrom googlesheets4 gs4_has_token
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @importFrom dplyr select
#' @importFrom dplyr left_join
#' @importFrom dplyr group_by_at
#' @importFrom dplyr between
#' @importFrom dplyr summarise
#' @importFrom dplyr rowwise
#' @importFrom dplyr arrange
#' @importFrom dplyr ungroup
#' @importFrom dplyr case_when
#' @importFrom dplyr one_of
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr drop_na
#' @importFrom rlang .data
#' @importFrom rlang .env
#' @importFrom Rdpack reprompt
#' @description This function generates a data frame with the estimated proportion
#'  of children that understand and/or produce some items for a selected age range
#'  and participant profiles. Estimated proportions and corresponding standard
#'  errors and confidence intervals are computed following
#'  \insertCite{gelman2020regression;textual}{multilex}'s adjustments to account
#'  for zero- and one-inflation (see functions \code{\link{prop_adj}},
#'  \code{\link{prop_adj_se}}, and \code{\link{prop_adj_ci}}).
#' @param responses Responses data frame, as generated by \code{ml_responses}.
#' If NULL (default), \code{ml_responses} is run.
#' @param participants Participants data frame, as generated by
#' \code{ml_participants}. If NULL (default), \code{ml_participants} is run.
#' @param item Character string indicating the item to compute norms for. If
#' left NULL (by default) norms will be computed for all items. You can check
#' the available items in the \code{pool} data set running \code{data("pool")}.
#' @param language Character string indicating the language to compute
#' vocabulary norms for: "Catalan" and/or "Spanish" (defaults to both).
#' @param type Character string indicating the vocabulary type to compute norms
#' for. Takes "understands" and/or "produces" (defaults to both).
#' @param age Numeric vector of length two (min-max) indicating the age range of
#'  participants to compute norms for.
#' @param lp character string indicating the language profile of participants to
#'  compute norms for: "Bilingual", "Monolingual", "Other" (defaults to all).
#' @param sex character string indicating the sex of participants to compute
#' norms for. Takes "Female" and/or "Male" (defaults to both).
#' @param category character string indicating the semantic/functional
#' category/ies to include items from. See available categories in the
#' \code{pool} data set by running \code{data("pool")}.
#' @param .width Numeric values ranging from 0 to 1 (not included) indicating
#' the confidence level of confidence intervals (defaults to 0.95).
#' @returns A data frame (actually, a \code{\link[tibble]{tibble}}) with the proportion of participants in the sample that understand or produce the items indicated in \code{item}, along with the standard error and confidence interval of the estimation. The output contains the following variables:
#' \describe{
#'      \item{te}{an integer identifying the Translation Equivalent (aka., pair of cross-language synonyms, doublets) the item belongs to.}
#'      \item{item}{character string indicating the item identifier (e.g., \emph{spa_mesa}). This value is unique for each item. Responses to the same item from different participants are linked by the same \code{item} value.}
#'      \item{language}{a character string indicating the language the item response belongs to: \emph{Catalan} if the item is in Catalan, \emph{Spanish} if the item is in Spanish.}
#'      \item{age_bin}{an integer indicating the age group participants for which the estimates have been computed belong to (2 months-wide bins by default).}
#'      \item{type}{a character string indicating the vocabulary type computed: "understands" if option \emph{Understands} was selected, and "produces" if option \emph{Understands & Says} was selected.}
#'      \item{lp}{a character string indicating participants' language profile, classified using parental reports of language exposure (see \code{doe_spanish}, \code{doe_catalan}, and \code{doe_others}), and the thresholds passed in the \code{bilingual_threshold} and \code{other_threshold}.}
#'      \item{category}{a character string indicating the semantic/function category the item belongs to (e.g., \emph{Vehicles}, \emph{Actions}).}
#'      \item{item_dominance}{a character string that takes the value \emph{L1} if the item belongs to participants' language of most exposure, and \emph{L2} if the item belongs to participants' language of least exposure.}
#'      \item{label}{a character string indicating the text presented to participants in the questionnaire (replacing the \code{item} identifier).}
#'      \item{yes}{a non-negative integer indicating the number of positive responses: \code{response} is 2 (\emph{Understands}) or 3 (\emph{Understands & Says}) if \code{type} is \emph{understands}, and 3 (\emph{Understands & Says}) if \code{type} is \emph{produces}.}
#'      \item{n}{a positive integer indicating the total number of non-missing responses (useful for computing proportions).}
#'      \item{proportion}{a numeric value ranging from 0 to 1 (both included) indicating the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj}}).}
#'      \item{se}{a numeric value indicating the standard error (\emph{SE}) of the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj_se}}).}
#'      \item{ci_lower}{a numeric value indicating the lower boundary of the confidence interval (\emph{CI}; level set by \code{.width}, 95\% by default) of the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj_ci}}).}
#'      \item{ci_upper}{a numeric value indicating the upper boundary of the confidence interval (\emph{CI}; level set by \code{.width}, 95\% by default) of the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj_ci}}).}
#' }
#' @author Gonzalo Garcia-Castro
#' @references
#' \insertRef{gelman2020regression}{multilex}
#' @examples
#' ml_norms(item = "cat_casa", type = "understands", age = c(20, 24))

ml_norms <- function(
    participants = NULL,
    responses = NULL,
    item = NULL,
    language = c("Catalan", "Spanish"),
    type = c("understands", "produces"),
    age = c(0, 100),
    lp = c("Bilingual", "Monolingual", "Other"),
    sex = c("Female", "Male"),
    category = NULL,
    .width = 0.95
) {
  # Authenticate only when no Google Sheets token is available yet.
  if (!gs4_has_token()) ml_connect()

  # Retrieve whatever the caller did not supply. `participants` is only
  # fetched here when `responses` has to be built from it.
  if (is.null(responses)) {
    if (is.null(participants)) participants <- ml_participants()
    responses <- ml_responses(participants = participants)
  }
  # NOTE(review): `participants` is still NULL at this point when only
  # `responses` was supplied; presumably ml_logs() copes with that -- confirm.
  logs <- ml_logs(participants = participants, responses = responses)

  # Columns that define one row of the output.
  group_vars <- c(
    "te", "item", "language", "age_bin", "type", "lp", "category",
    "item_dominance", "label"
  )

  # Load the item pool into this call's frame. data() defaults to
  # `envir = .GlobalEnv`, which would create (or overwrite) a `pool` object
  # in the user's workspace as a side effect.
  data("pool", envir = environment())

  # NULL means "no restriction" for items and categories.
  if (is.null(item)) item <- unique(responses$item)
  if (is.null(category)) category <- unique(pool$category)

  # `sex` is only applied when the caller narrows it down: with the default
  # (both levels) every row is kept, including rows with missing sex.
  restrict_sex <- !all(c("Female", "Male") %in% sex)

  # Attach the language profile and keep the requested items, profiles and
  # ages. Column names are passed as strings because `.data$` is deprecated
  # in selection contexts.
  eligible <- responses %>%
    left_join(select(logs, "id", "time", "lp"), by = c("id", "time")) %>%
    filter(
      .data$item %in% .env$item,
      .data$lp %in% .env$lp,
      between(.data$age, .env$age[1], .env$age[2])
    )
  if (restrict_sex) {
    eligible <- filter(eligible, .data$sex %in% .env$sex)
  }

  norms <- eligible %>%
    # Recode the raw response into two logical outcomes; NA stays NA.
    mutate(
      understands = ifelse(
        is.na(.data$response), NA, .data$response %in% c(2, 3)
      ),
      produces = ifelse(
        is.na(.data$response), NA, .data$response %in% c(3)
      )
    ) %>%
    select(
      one_of(
        "id", "age", "sex", "lp", "dominance", "item",
        "understands", "produces"
      )
    ) %>%
    # One row per participant x item x vocabulary type.
    pivot_longer(
      cols = c("understands", "produces"),
      names_to = "type",
      values_to = "response"
    ) %>%
    # Ages are binned into right-closed 2-month intervals over (0, 100];
    # `age_bin` is the upper edge of the interval the age falls into.
    mutate(
      age_bin = 2 * as.numeric(
        cut(.data$age, seq(0, 100, by = 2), labels = FALSE)
      )
    ) %>%
    # Add item-level information from the pool.
    left_join(
      select(
        pool,
        one_of(
          "te", "item", "language", "cognate", "label", "ipa",
          "frequency_zipf", "category"
        )
      ),
      by = "item"
    ) %>%
    filter(
      .data$language %in% .env$language,
      .data$type %in% .env$type,
      .data$category %in% .env$category
    ) %>%
    # L1 when the item's language equals the participant's `dominance`
    # value, L2 otherwise (NA when either side is missing).
    mutate(
      item_dominance = case_when(
        .data$language == .data$dominance ~ "L1",
        .data$language != .data$dominance ~ "L2"
      )
    ) %>%
    drop_na("response") %>%
    group_by_at(group_vars) %>%
    summarise(
      yes = sum(.data$response, na.rm = TRUE),
      n = sum(!is.na(.data$response)),
      .groups = "drop"
    ) %>%
    # The prop_adj* helpers are applied one row at a time.
    rowwise() %>%
    mutate(
      proportion = prop_adj(.data$yes, .data$n),
      se = prop_adj_se(.data$yes, .data$n),
      ci_lower = prop_adj_ci(.data$yes, .data$n, .width = .env$.width)[1],
      ci_upper = prop_adj_ci(.data$yes, .data$n, .width = .env$.width)[2]
    ) %>%
    ungroup() %>%
    arrange(
      .data$te, .data$item, .data$language, .data$lp, .data$item_dominance,
      .data$type, .data$age_bin, .data$proportion, .data$yes, .data$n,
      .data$se, .data$ci_lower, .data$ci_upper
    )

  norms
}