multilex: Multilingual lexical assessment using online surveys

Documented in ml_responses

#' Retrieve and update local and/or remote data from formr
#' @import dplyr
#' @importFrom tidyr drop_na
#' @importFrom lubridate as_date
#' @importFrom formr formr_connect
#' @importFrom stats time
#' @importFrom rlang .data
#' @export ml_responses
#' @details This function generates a data frame with participant's responses
#' to each item, along with some session-specific metadata. It takes \code{participants}
#' (the output of \code{ml_participants}) and \code{runs} (a character vector that can take
#' zero, one, or multiple of the following values: "formr2", "formr-short", "formr-lockdown")
#'  as arguments. Only responses from the versions indicated in \code{runs} input will
#'  be updated. For the rest, data will be retrieved from their last update.
#' @param participants Participants data frame, as generated by \code{ml_participants}. If NULL (default), \code{ml_participants} is run.
#' @param runs formr runs to update
#' @param longitudinal Should longitudinal data be included? If "all" (default), all responses (including repeated measures) are included. If "no", participants with more than one response to the questionnaire (regardless of the version) are excluded. If "first", only the first response of each participant is included. If "last", only the last response of each participant is included. If "only", only responses with repeated measures are included.
#' @param update Should data be recovered from last update, or should it be updated now?
#' @return A data frame (actually, a \code{\link[tibble]{tibble}}) containing participant's responses to each item, along with some session-specific metadata. The output includes the following variables:
#'  \describe{
#'      \item{id}{a character string indicating a participant's identifier. This value is always the same for each participant, so that different responses from the same participant share the same \code{id}.}
#'      \item{id_exp}{a character string indicating a participant's identifier in the context of the particular study in which the participant was tested and invited to fill in the questionnaire. This value is always the same for each participant within the same study, so that different responses from the same participant in the same study share \code{id_exp}. The same participant may have different \code{id_exp} across different studies.}
#'      \item{id_db}{a character string with five digits indicating a participant's identifier in the database from the \href{https://www.upf.edu/web/cbclab}{Laboratori de Recerca en Infància} at Universitat Pompeu Fabra. This value is always the same for each participant, so that different responses from the same participant share the same \code{id_db}.}
#'      \item{time}{a numeric value indicating how many times a given participant has been sent the questionnaire, regardless of whether they completed it or not.}
#'      \item{code}{a character string identifying a single response to the questionnaire. This value is always unique for each response to the questionnaire, even for responses from the same participant.}
#'      \item{study}{a character string indicating the study in which the participant was invited to fill in the questionnaire. Frequently, participants that filled in the questionnaire came to the lab to participate in a study, and were then invited to fill in the questionnaire later. This value indicates what study each participant was tested in before being sent the questionnaire.}
#'      \item{version}{a character string indicating what version of the questionnaire a given participant filled in. Different versions may contain a different subset of items, and the administration instructions might vary slightly (see formr questionnaire templates in the \href{https://github.com/gongcastro/multilex}{GitHub repository}). Also, different versions were designed, implemented, and administrated at different time points (e.g., before/during/after the COVID-related lockdown).}
#'      \item{time_stamp}{a date value (see lubridate package) in \code{yyyy/mm/dd} format indicating the date in which participants responded to the last item of their questionnaire response. Note: some participants took longer to complete the questionnaire since they started filling items in.}
#'      \item{language}{a character string indicating the language the item response belongs to: \emph{Catalan} if the item is in Catalan, \emph{Spanish} if the item is in Spanish.}
#'      \item{item}{character string indicating the item identifier (e.g., \emph{spa_mesa}). This value is unique for each item. Responses to the same item from different participants are linked by the same \code{item} value.}
#'      \item{response}{integer indicating the participant's response to a given item: 1 if \emph{No} (the participant does not understand or produce the word), 2 if \emph{Understands} (the participant understands the word), or 3 if \emph{Understands and Says} (the participant understands and produces the item).}
#'      \item{date_birth}{a date value (see lubridate package) in \code{yyyy/mm/dd} format indicating participants birth date.}
#'      \item{age}{a numeric value indicating the number of months elapsed since participants' birth date until they filled in the last item of their questionnaire response.}
#'      \item{sex}{a character string indicating participants' biological sex, as reported by the parents.}
#'      \item{postcode}{a character string indicating participants' household postcode.}
#'      \item{edu_parent1}{a character string indicating the educational attainment of one of the parents/caretakers.}
#'      \item{edu_parent2}{a character string indicating the educational attainment of the other parent/caretaker, if any.}
#'      \item{doe_spanish}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to Spanish, as estimated by parents/caretakers. This value aggregates participants' exposure to any variant of Spanish (e.g., European and American Spanish).}
#'      \item{doe_catalan}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to Catalan, as estimated by parents/caretakers. This value aggregates participants' exposure to any variant of Catalan (e.g., Catalan from Majorca or Barcelona).}
#'      \item{doe_others}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to languages other than Spanish or Catalan, as estimated by parents/caretakers, aggregating participants' exposure to all those other languages (e.g., Norwegian, Arab, Swahili).}
#'      \item{randomisation}{a character string indicating the specific list of items a participant was assigned to. Only applies in the case of short versions of multilex, such as BL-Short, BL-Short-2 or BL-Lockdown, where the list of items was partitioned into several versions.}
#'      \item{doe_spanish_lockdown}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to Spanish during the COVID-19 lockdown, as estimated by parents/caretakers. This value aggregates participants' exposure to any variant of Spanish (e.g., European and American Spanish).}
#'      \item{doe_catalan_lockdown}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to Catalan during the COVID-19 lockdown, as estimated by parents/caretakers. This value aggregates participants' exposure to any variant of Catalan (e.g., Catalan from Majorca or Barcelona).}
#'      \item{doe_others_lockdown}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to languages other than Spanish or Catalan during the COVID-19 lockdown, as estimated by parents/caretakers, aggregating participants' exposure to all those other languages (e.g., Norwegian, Arab, Swahili).}
#'      \item{dominance}{a character string indicating the language of highest exposure ("Catalan" or "Spanish"), as reported by parents. If exposure is identical for both languages, "Catalan" is assigned.}
#'  }
#' @author Gonzalo Garcia-Castro
#'
ml_responses <- function(
  participants = NULL,
  runs = c("BL-Long2", "BL-Lockdown", "BL-Short"), # c("Inhibition", "DevLex", "CBC", "BL-Short", "BL-Long-1", "BL-Long-2", "BL-Lockdown")
  longitudinal = "all",
  update = TRUE
) {
  # Returns the item-level responses, either read from the cached
  # "responses.rds" inside the installed package or rebuilt from the formr
  # imports, and then filtered according to `longitudinal`.
  #
  # NOTE(review): `runs` is accepted but never read in this body; every
  # import helper below is called unconditionally. Confirm whether selective
  # updating is still intended.

  #### import data ----
  # system.file() returns "" when the file is absent, and file.exists("") is
  # FALSE, so a missing cache is detected without a separate check.
  cache_path <- system.file("responses.rds", package = "multilex")
  responses_exists <- file.exists(cache_path)

  if (!update && responses_exists) {
    message(paste0("Loading last update (", file.info(cache_path)$mtime, ") ..."))
    responses <- readRDS(cache_path)
  } else {
    # Reached when an update was requested or when there is no cache to load.
    if (!responses_exists) {
      message("Data not available. Updating data...")
    } else {
      message("Updating data...")
    }

    ml_connect() # get credentials to Google and formr

    # get participant information
    if (is.null(participants)) participants <- ml_participants()

    # retrieve data from formr
    # NOTE(review): `participants` is not passed explicitly to these helpers;
    # presumably they look it up from the calling frame. Keep these calls
    # directly in this function body unless that is confirmed otherwise.
    formr2 <- import_formr2() # formr2
    formr_lockdown <- import_formr_lockdown() # formr-lockdown
    formr_short <- import_formr_short()

    # merge data
    # NOTE(review): `formr1` is not created in this function; presumably it
    # is a dataset shipped with the package. Confirm.
    suppressMessages({
      responses <- list(formr1, formr2, formr_short, formr_lockdown) %>%
        bind_rows() %>%
        # sort newest first so that distinct() keeps the most recent row for
        # each id/code/item combination
        arrange(desc(time_stamp)) %>%
        distinct(.data$id, .data$code, .data$item, .keep_all = TRUE) %>%
        mutate(
          across(c(date_birth, time_stamp), as_date),
          # a missing `time` is treated as 1
          time = ifelse(is.na(.data$time), 1, .data$time),
          # ties go to Catalan; rows where the comparison is NA stay NA
          dominance = case_when(
            .data$doe_catalan >= .data$doe_spanish ~ "Catalan",
            .data$doe_spanish > .data$doe_catalan ~ "Spanish"
          ),
          version = fix_version(version)
        ) %>%
        # cleaning helpers defined elsewhere in the package
        fix_item() %>%
        fix_doe() %>%
        # divide every doe_* column by 100
        mutate(across(starts_with("doe_"), ~./100)) %>%
        fix_postcode() %>%
        fix_sex() %>%
        fix_study() %>%
        fix_id_exp() %>%
        # a column name as a string: the `.data` pronoun is deprecated in
        # tidyselect contexts such as drop_na()
        drop_na("time_stamp")
    })

    # Cache the data *before* the longitudinal filter, so that a later call
    # with `update = FALSE` can still honour any value of `longitudinal`.
    saveRDS(responses, file = file.path(system.file(package = "multilex"), "responses.rds"))
  }

  # Apply the longitudinal filter on both paths (fresh and cached data).
  # Messages stay suppressed, as they were when this step ran in the pipeline.
  suppressMessages(get_longitudinal(responses, longitudinal = longitudinal))
}