# ml_norms: Compute item norms
#' Generate item-level norms for age, sex, language profile and item dominance.
#' @export ml_norms
#' @importFrom dplyr arrange
#' @importFrom dplyr between
#' @importFrom dplyr case_when
#' @importFrom dplyr filter
#' @importFrom dplyr group_by_at
#' @importFrom dplyr left_join
#' @importFrom dplyr mutate
#' @importFrom dplyr one_of
#' @importFrom dplyr rowwise
#' @importFrom dplyr select
#' @importFrom dplyr summarise
#' @importFrom dplyr ungroup
#' @importFrom googlesheets4 gs4_has_token
#' @importFrom Rdpack reprompt
#' @importFrom rlang .data
#' @importFrom rlang .env
#' @importFrom tidyr drop_na
#' @importFrom tidyr pivot_longer
#' @importFrom utils data
#' @description This function generates a data frame with the estimated proportion
#' of children that understand and/or produce some items for a selected age range
#' and participant profiles. Estimated proportions and corresponding standard
#' errors and confidence intervals are computed following
#' \insertCite{gelman2020regression;textual}{multilex}'s adjustments to account
#' for zero- and one-inflation (see functions \code{\link{prop_adj}},
#' \code{\link{prop_adj_se}}, and \code{\link{prop_adj_ci}}).
#' @param responses Responses data frame, as generated by \code{ml_responses}.
#' If NULL (default), \code{ml_responses} is run.
#' @param participants Participants data frame, as generated by
#' \code{ml_participants}. If NULL (default), \code{ml_participants} is run.
#' @param item Character string indicating the item to compute norms for. If
#' left NULL (by default) norms will be computed for all items. You can check
#' the available items in the \code{pool} data set running \code{data("pool")}.
#' @param language Character string indicating the language to compute
#' vocabulary norms for: "Catalan" and/or "Spanish" (defaults to both).
#' @param type Character string indicating the vocabulary type to compute norms
#' for. Takes "understands" and/or "produces" (defaults to both).
#' @param age Numeric vector of length two (min-max) indicating the age range of
#' participants to compute norms for.
#' @param lp character string indicating the language profile of participants to
#' compute norms for: "Bilingual", "Monolingual", "Other" (defaults to all).
#' @param sex character string indicating the sex of participants to compute
#' norms for. Takes "Female" and/or "Male" (defaults to both).
#' @param category character string indicating the semantic/functional
#' category/ies to include items from. See available categories in the
#' \code{pool} data set by running \code{data("pool")}.
#' @param .width Numeric values ranging from 0 to 1 (not included) indicating
#' the confidence level of confidence intervals (defaults to 0.95).
#' @returns A data frame (actually, a \code{\link[tibble]{tibble}}) with the proportion of participants in the sample that understand or produce the items indicated in \code{item}, along with the standard error and confidence interval of the estimation. The output contains the following variables:
#' \describe{
#' \item{te}{an integer identifying the Translation Equivalent (aka., pair of cross-language synonyms, doublets) the item belongs to.}
#' \item{item}{character string indicating the item identifier (e.g., \emph{spa_mesa}). This value is unique for each item. Responses to the same item from different participants are linked by the same \code{item} value.}
#' \item{language}{a character string indicating the language the item response belongs to: \emph{Catalan} if the item is in Catalan, \emph{Spanish} if the item is in Spanish.}
#' \item{age_bin}{an integer indicating the age group participants for which the estimates have been computed belong to (2 months-wide bins by default).}
#' \item{type}{a character string indicating the vocabulary type computed: "understands" if option \emph{Understands} was selected, and "produces" if option \emph{Understands & Says} was selected.}
#' \item{lp}{a character string indicating participants' language profile, classified using parental reports of language exposure (see \code{doe_spanish}, \code{doe_catalan}, and \code{doe_others}), and the thresholds passed in the \code{bilingual_threshold} and \code{other_threshold}.}
#' \item{category}{a character string indicating the semantic/function category the item belongs to (e.g., \emph{Vehicles}, \emph{Actions}).}
#' \item{item_dominance}{a character string that takes the value \emph{L1} if the item belongs to participants' language of most exposure, and \emph{L2} if the item belongs to participants' language of least exposure.}
#' \item{label}{a character string indicating the text presented to participants in the questionnaire (replacing the \code{item} identifier).}
#' \item{yes}{a positive integer indicating the number of positive responses: \code{response} is 2 (\emph{Understands}) or 3 (\emph{Understands & Says}) if \code{type} is \emph{understands}, and 3 (\emph{Understands & Says}) if \code{type} is \emph{produces}.}
#' \item{n}{a positive integer indicating the total number of responses (useful for computing proportions).}
#' \item{proportion}{a numeric value ranging from 0 to 1 (both included) indicating the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj}}).}
#' \item{se}{a numeric value indicating the standard error (\emph{SE}) of the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj_se}}).}
#' \item{ci_lower}{a numeric value indicating the lower boundary of the confidence interval (\emph{CI}; its level is set by \code{.width}, 95\% by default) of the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj_ci}}).}
#' \item{ci_upper}{a numeric value indicating the upper boundary of the confidence interval (\emph{CI}; its level is set by \code{.width}, 95\% by default) of the estimated proportion of participants that provided a positive response, adjusted following \insertCite{gelman2020regression;textual}{multilex}'s method to account for zero- and one-inflation (see function \code{\link{prop_adj_ci}}).}
#' }
#' @author Gonzalo Garcia-Castro
#' @references
#' \insertRef{gelman2020regression}{multilex}
#' @examples
#' ml_norms(item = "cat_casa", type = "understands", age = c(20, 24))
ml_norms <- function(
  participants = NULL,
  responses = NULL,
  item = NULL,
  language = c("Catalan", "Spanish"),
  type = c("understands", "produces"),
  age = c(0, 100),
  lp = c("Bilingual", "Monolingual", "Other"),
  sex = c("Female", "Male"),
  category = NULL,
  .width = 0.95
) {
  # Make sure a Google Sheets token is available before calling the ml_*
  # helpers below (ml_connect() is expected to set it up).
  if (!gs4_has_token()) ml_connect()

  # Build whatever inputs the caller did not supply.
  if (is.null(responses)) {
    if (is.null(participants)) participants <- ml_participants()
    responses <- ml_responses(participants = participants)
  }
  logs <- ml_logs(participants = participants, responses = responses)

  # Columns that define one row of the output (one norm estimate each).
  group_vars <- c(
    "te", "item", "language", "age_bin", "type", "lp", "category",
    "item_dominance", "label"
  )

  # Load the item pool into this function's environment only.
  utils::data("pool", envir = environment())

  # NULL means "no restriction" for items and categories.
  if (is.null(item)) item <- unique(responses$item)
  if (is.null(category)) category <- unique(pool$category)

  # Only restrict by sex when the caller asks for a subset of the default
  # values; with the default (both sexes) every participant is kept,
  # including those whose sex is missing.
  restrict_sex <- !all(c("Female", "Male") %in% sex)

  # Languages are matched case-insensitively so that both "Catalan" and
  # "catalan" select the same items.
  language_lower <- tolower(language)

  pool_info <- select(
    pool,
    one_of(
      "te", "item", "language", "cognate", "label", "ipa",
      "frequency_zipf", "category"
    )
  )

  # 1. Attach each participant's language profile and keep the requested
  #    items, profiles, ages and (if restricted) sexes.
  selected <- responses %>%
    left_join(select(logs, "id", "time", "lp"), by = c("id", "time")) %>%
    filter(
      .data$item %in% .env$item,
      .data$lp %in% .env$lp,
      between(.data$age, .env$age[1], .env$age[2]),
      if (.env$restrict_sex) .data$sex %in% .env$sex else TRUE
    )

  # 2. Recode the raw response into two logical outcomes (NA stays NA) and
  #    reshape to one row per participant x item x vocabulary type.
  #    Response codes 2 and 3 count as "understands"; only 3 as "produces".
  long <- selected %>%
    mutate(
      understands = ifelse(
        is.na(.data$response), NA, .data$response %in% c(2, 3)
      ),
      produces = ifelse(
        is.na(.data$response), NA, .data$response %in% c(3)
      )
    ) %>%
    select(
      one_of(
        "id", "age", "sex", "lp", "dominance", "item",
        "understands", "produces"
      )
    ) %>%
    pivot_longer(
      c("understands", "produces"),
      names_to = "type",
      values_to = "response"
    ) %>%
    # 2-unit-wide age bins labelled by their upper edge; cut() uses
    # right-closed intervals, so ages outside (0, 100] get an NA bin.
    mutate(
      age_bin = 2 * as.numeric(
        cut(.data$age, seq(0, 100, by = 2), labels = FALSE)
      )
    )

  # 3. Add item-level information and keep the requested languages,
  #    vocabulary types and categories.
  labelled <- long %>%
    left_join(pool_info, by = "item") %>%
    filter(
      tolower(.data$language) %in% .env$language_lower,
      .data$type %in% .env$type,
      .data$category %in% .env$category
    ) %>%
    mutate(
      item_dominance = case_when(
        .data$language == .data$dominance ~ "L1",
        .data$language != .data$dominance ~ "L2"
      )
    ) %>%
    drop_na("response")

  # 4. Count positive and total responses per group, then compute the
  #    adjusted proportion, its standard error and confidence interval.
  #    rowwise() is needed because the CI helper is indexed per row.
  norms <- labelled %>%
    group_by_at(group_vars) %>%
    summarise(
      yes = sum(.data$response, na.rm = TRUE),
      n = sum(!is.na(.data$response), na.rm = TRUE),
      .groups = "drop"
    ) %>%
    rowwise() %>%
    mutate(
      proportion = prop_adj(.data$yes, .data$n),
      se = prop_adj_se(.data$yes, .data$n),
      ci_lower = prop_adj_ci(.data$yes, .data$n, .width = .env$.width)[1],
      ci_upper = prop_adj_ci(.data$yes, .data$n, .width = .env$.width)[2]
    ) %>%
    ungroup() %>%
    arrange(
      .data$te, .data$item, .data$language, .data$lp, .data$item_dominance,
      .data$type, .data$age_bin, .data$proportion, .data$yes, .data$n,
      .data$se, .data$ci_lower, .data$ci_upper
    )

  norms
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.