# multilex: Multilingual lexical assessment using online surveys

# Documented in ml_logs

#### ml_logs: Generate logs

#' Generate participant information and progress for each response
#' @import dplyr
#' @importFrom scales label_percent
#' @importFrom lubridate as_date
#' @importFrom lubridate today
#' @importFrom lubridate as_datetime
#' @importFrom lubridate time_length
#' @importFrom tidyr drop_na
#' @importFrom rlang .data
#' @export ml_logs
#' @description This function generates a data frame that contains participant-level
#'  information. Each row is a given participant's response and each column is a
#'  variable. The same participant will always be identified with the same \code{id}.
#'  The variable \code{time} indexes how many times a participant has been sent the
#'   questionnaire, independently of whether a response was obtained from them later.
#' @param responses Responses data frame, as generated by \code{\link{ml_responses}}.
#'  If NULL (default), \code{\link{ml_responses}} is run.
#' @param participants Participants data frame, as generated by \code{\link{ml_participants}}.
#'  If NULL (default), \code{\link{ml_participants}} is run.
#' @param bilingual_threshold Numeric scalar ranging from 0 to 1 indicating the
#' minimum degree of exposure to Catalan or Spanish to consider a participant as
#' *Monolingual*.
#' @param other_threshold Numeric scalar ranging from 0 to 1 indicating the minimum
#'  degree of exposure to languages other than Catalan and Spanish to consider a
#'   participant as *Other*.
#' @returns A data frame (actually, a \code{\link[tibble]{tibble}}) with participant-level
#' information. Each row corresponds to a questionnaire response and each column
#'  represents a variable. The output includes the following variables:
#'  \describe{
#'      \item{id}{a character string indicating a participant's identifier. This value is always the same for each participant, so that different responses from the same participant share the same \code{id}.}
#'      \item{id_exp}{a character string indicating a participant's identifier in the context of the particular study in which the participant was tested and invited to fill in the questionnaire. This value is always the same for each participant within the same study, so that different responses from the same participant in the same study share \code{id_exp}. The same participant may have different \code{id_exp} across different studies.}
#'      \item{id_db}{a character string with five digits indicating a participant's identifier in the database from the \href{https://www.upf.edu/web/cbclab}{Laboratori de Recerca en Infància} at Universitat Pompeu Fabra. This value is always the same for each participant, so that different responses from the same participant share the same \code{id_db}.}
#'      \item{code}{a character string identifying a single response to the questionnaire. This value is always unique for each response to the questionnaire, even for responses from the same participant.}
#'      \item{time}{a numeric value indicating how many times a given participant has been sent the questionnaire, regardless of whether they completed it or not.}
#'      \item{study}{a character string indicating the study in which the participant was invited to fill in the questionnaire. Frequently, participants that filled in the questionnaire came to the lab to participate in a study, and were then invited to fill in the questionnaire later. This value indicates what study each participant was tested in before being sent the questionnaire.}
#'      \item{version}{a character string indicating what version of the questionnaire a given participant filled in. Different versions may contain a different subset of items, and the administration instructions might vary slightly (see formr questionnaire templates in the \href{https://github.com/gongcastro/multilex}{GitHub repository}). Also, different versions were designed, implemented, and administered at different time points (e.g., before/during/after the COVID-related lockdown).}
#'      \item{date_sent}{a date value (see lubridate package) in \code{yyyy/mm/dd} format indicating the date in which the questionnaire was sent to participants.}
#'      \item{days_from_sent}{a numeric value indicating the number of days elapsed since participants were sent the questionnaire (as indicated by \code{date_sent}) until the present day, as indicated by \code{\link[lubridate]{today}}.}
#'      \item{date_birth}{a date value (see lubridate package) in \code{yyyy/mm/dd} format indicating participants birth date.}
#'      \item{age}{a numeric value indicating the number of months elapsed since participants' birth date until they filled in the last item of their questionnaire response.}
#'      \item{age_today}{a numeric value indicating the number of months elapsed since participants' birth date until the present day, as indicated by \code{\link[lubridate]{today}}.}
#'      \item{months_from_last_response}{a numeric value indicating the number of months elapsed since participants' last questionnaire response (as indicated by \code{time_stamp}) until the present day, as indicated by \code{\link[lubridate]{today}}.}
#'      \item{sex}{a character string indicating participants' biological sex, as reported by the parents.}
#'      \item{postcode}{a character string indicating participants' household postcode.}
#'      \item{edu_parent1}{a character string indicating the educational attainment of one of the parents/caretakers.}
#'      \item{edu_parent2}{a character string indicating the educational attainment of the other parent/caretaker, if any.}
#'      \item{dominance}{a character string indicating the language of highest exposure ("Catalan" or "Spanish"), as reported by parents. If exposure is identical for both languages, "Catalan" is assigned.}
#'      \item{lp}{a character string indicating participants' language profile, classified using parental reports of language exposure (see \code{doe_spanish}, \code{doe_catalan}, and \code{doe_others}), and the thresholds passed in the \code{bilingual_threshold} and \code{other_threshold}.}
#'      \item{doe_spanish}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to Spanish, as estimated by parents/caretakers. This value aggregates participants' exposure to any variant of Spanish (e.g., European and American Spanish).}
#'      \item{doe_catalan}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to Catalan, as estimated by parents/caretakers. This value aggregates participants' exposure to any variant of Catalan (e.g., Catalan from Majorca or Barcelona).}
#'      \item{doe_others}{a numeric value ranging from 0 to 1 indicating participants' daily exposure to languages other than Spanish or Catalan, as estimated by parents/caretakers, aggregating participants' exposure to all those other languages (e.g., Norwegian, Arab, Swahili).}
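#'      \item{progress}{a character string indicating participants' progress filling the questionnaire, expressed as the percentage of items of the corresponding questionnaire version that have a response (formatted with \code{\link[scales]{label_percent}}). A value of zero percent indicates that the participant has not filled in any item yet. A value of fifty percent indicates that the participant is halfway through the questionnaire. A value of one hundred percent indicates that the participant has completed all items.}
#'      \item{completed}{a logical value that returns TRUE if at least 95 percent of the items in the questionnaire have a response, and FALSE otherwise.}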
#'  }
#' @author Gonzalo Garcia-Castro
ml_logs <- function(
    participants = NULL,
    responses = NULL,
    bilingual_threshold = 0.80,
    other_threshold = 0.10
) {

  ml_connect() # get credentials to Google and formr

  # Generate whichever input was not supplied. The two checks are independent
  # because `participants` is joined to the logs further down, so it is needed
  # even when the caller passes `responses` explicitly.
  if (is.null(participants)) {
    participants <- ml_participants()
  }
  if (is.null(responses)) {
    responses <- ml_responses(participants = participants)
  }

  # columns that identify one questionnaire response; the item-level rows of
  # `responses` are collapsed to one row per combination of these
  response_cols <- c(
    "id_db", "date_birth", "time", "age", "sex", "postcode",
    "edu_parent1", "edu_parent2", "dominance", "lp", "doe_spanish",
    "doe_catalan", "doe_others", "time_stamp", "code", "study", "version"
  )

  # columns kept in the output (after every column starting with "id"), in order
  output_cols <- c(
    "code", "time", "study", "version",
    "date_sent", "time_stamp", "days_from_sent", "date_birth", "age",
    "age_today", "months_from_last_response",
    "sex", "postcode", "edu_parent1", "edu_parent2",
    "dominance", "lp", "doe_spanish", "doe_catalan", "doe_others",
    "progress", "completed"
  )

  # the join with `participants` has no explicit `by`, so dplyr reports the
  # columns it joined on; that message is silenced here
  suppressMessages({

    # get n items answered by participants (depends on the questionnaire version)
    total_items <- studies %>%
      distinct(.data$version, .data$language, .data$n) %>%
      group_by(.data$version) %>%
      summarise(total_items = sum(.data$n), .groups = "drop")

    # generate logs
    logs <- responses %>%
      mutate(
        # define language profiles based on thresholds
        # NOTE(review): rows whose exposure values are all NA match none of the
        # first three conditions and fall through to "Bilingual" - confirm this
        # is intended.
        lp = case_when(
          .data$doe_catalan >= bilingual_threshold ~ "Monolingual",
          .data$doe_spanish >= bilingual_threshold ~ "Monolingual",
          .data$doe_others > other_threshold ~ "Other",
          TRUE ~ "Bilingual"
        ),
        # define language dominance; ties are resolved as "Catalan" (as stated
        # in the documentation) so that the output is reproducible. Rows with
        # missing exposure values get NA.
        dominance = case_when(
          .data$doe_catalan >= .data$doe_spanish ~ "Catalan",
          .data$doe_spanish > .data$doe_catalan ~ "Spanish"
        )
      ) %>%
      group_by(across(all_of(response_cols))) %>%
      # number of items with a response in each questionnaire response
      summarise(
        complete_items = sum(!is.na(.data$response)),
        .groups = "drop"
      ) %>%
      # total items to fill by each participant (varies across versions)
      left_join(total_items, by = "version") %>%
      # add participant-level information; `date_birth` and `version` are
      # dropped from `participants` so the values from `responses` are kept
      left_join(select(participants, -all_of(c("date_birth", "version")))) %>%
      # discard responses that could not be matched to a participant
      drop_na(all_of("id")) %>%
      mutate(across(all_of("time_stamp"), as_datetime)) %>%
      # compute participant's progress through the questionnaire; rowwise() is
      # kept so that each percentage is formatted on its own (presumably
      # label_percent() picks its rounding from the values it receives, so
      # formatting the whole column at once could change the labels)
      rowwise() %>%
      mutate(
        progress = label_percent()(.data$complete_items / .data$total_items),
        # a response counts as completed from 95% of the items onwards
        completed = (.data$complete_items / .data$total_items) >= 0.95
      ) %>%
      ungroup() %>%
      # compute time lapses between events, all relative to today's date
      mutate(
        across(all_of(c("date_sent", "time_stamp")), as_date),
        days_from_sent = time_length(
          difftime(today(), .data$date_sent), "days"
        ),
        age_today = time_length(
          difftime(today(), .data$date_birth), "months"
        ),
        # infinite ages are treated as missing
        age_today = ifelse(
          is.infinite(.data$age_today), NA_real_, .data$age_today
        ),
        months_from_last_response = time_length(
          difftime(today(), .data$time_stamp), "months"
        )
      ) %>%
      # select relevant columns and reorder them
      select(starts_with("id"), any_of(output_cols)) %>%
      # most recent responses first
      arrange(desc(.data$time_stamp))

  })

  logs

}