R/report_participants.R

Defines functions .find_race_in_data .find_country_in_data .find_education_in_data .find_gender_in_data .find_sex_in_data .find_age_in_data .report_participants .replace_names .check_df_names report_participants

Documented in report_participants

#' Reporting the participant data
#'
#' A helper function to help you format the participants data (age, sex, ...) in
#' the participants section.
#'
#' @param data A data frame.
#' @param age The name of the column containing the age of the participant.
#' @param sex The name of the column containing the sex of the participant. The
#'   classes should be one of `c("Male", "M", "Female", "F")`. Note that
#'   you can specify other characters here as well (e.g., `"Intersex"`), but
#'   the function will group all individuals in those groups as `"Other"`.
#' @param gender The name of the column containing the gender of the
#'   classes should be one of `c("Man", "M", "Male", Woman", "W", "F",
#'   "Female", Non-Binary", "N")`. Note that you can specify other characters
#'   here as well (e.g., `"Gender Fluid"`), but the function will group all
#'   individuals in those groups as `"Non-Binary"`.
#' @param education The name of the column containing education information.
#' @param country The name of the column containing country information.
#' @param race The name of the column containing race/ethnicity information.
#' @param threshold Percentage after which to combine, e.g., countries (default is 10%,
#' so countries that represent less than 10% will be combined in the "other" category).
#' @param participants The name of the participants' identifier column (for
#'   instance in the case of repeated measures).
#' @param by A character vector indicating the name(s) of the column(s) used
#'   for stratified description.
#' @param spell_n Logical, fully spell the sample size (`"Three participants"`
#'   instead of `"3 participants"`).
#' @inheritParams report.numeric
#' @param group Deprecated. Use `by` instead.
#'
#' @return A character vector with description of the "participants", based on
#'   the information provided in `data`.
#'
#' @examples
#' library(report)
#' data <- data.frame(
#'   "Age" = c(22, 23, 54, 21, 8, 42),
#'   "Sex" = c("Intersex", "F", "M", "M", "NA", NA),
#'   "Gender" = c("N", "W", "W", "M", "NA", NA)
#' )
#' report_participants(data, age = "Age", sex = "Sex")
#'
#' # Years of education (relative to high school graduation)
#' data$Education <- c(0, 8, -3, -5, 3, 5)
#' report_participants(data,
#'   age = "Age", sex = "Sex", gender = "Gender",
#'   education = "Education"
#' )
#'
#' # Education as factor
#' data$Education2 <- c(
#'   "Bachelor", "PhD", "Highschool",
#'   "Highschool", "Bachelor", "Bachelor"
#' )
#' report_participants(data, age = "Age", sex = "Sex", gender = "Gender", education = "Education2")
#'
#' # Country
#' data <- data.frame(
#'   "Age" = c(22, 23, 54, 21, 8, 42, 18, 32, 24, 27, 45),
#'   "Sex" = c("Intersex", "F", "F", "M", "M", "M", "F", "F", "F", "F", "F"),
#'   "Gender" = c("N", "W", "W", "M", "M", "M", "W", "W", "W", "W", "W"),
#'   "Country" = c(
#'     "USA", NA, "Canada", "Canada", "India", "Germany",
#'     "USA", "USA", "USA", "USA", "Canada"
#'   )
#' )
#' report_participants(data)
#'
#' # Country, control presentation treshold
#' report_participants(data, threshold = 5)
#'
#' # Race/ethnicity
#' data <- data.frame(
#'   "Age" = c(22, 23, 54, 21, 8, 42, 18, 32, 24, 27, 45),
#'   "Sex" = c("Intersex", "F", "F", "M", "M", "M", "F", "F", "F", "F", "F"),
#'   "Gender" = c("N", "W", "W", "M", "M", "M", "W", "W", "W", "W", "W"),
#'   "Race" = c(
#'     "Black", NA, "White", "Asian", "Black", "Arab", "Black",
#'     "White", "Asian", "Southeast Asian", "Mixed"
#'   )
#' )
#' report_participants(data)
#'
#' # Race/ethnicity, control presentation treshold
#' report_participants(data, threshold = 5)
#'
#' # Repeated measures data
#' data <- data.frame(
#'   "Age" = c(22, 22, 54, 54, 8, 8),
#'   "Sex" = c("I", "F", "M", "M", "F", "F"),
#'   "Gender" = c("N", "W", "W", "M", "M", "M"),
#'   "Participant" = c("S1", "S1", "s2", "s2", "s3", "s3")
#' )
#' report_participants(data, age = "Age", sex = "Sex", gender = "Gender", participants = "Participant")
#'
#' # Grouped data
#' data <- data.frame(
#'   "Age" = c(22, 22, 54, 54, 8, 8, 42, 42),
#'   "Sex" = c("I", "I", "M", "M", "F", "F", "F", "F"),
#'   "Gender" = c("N", "N", "W", "M", "M", "M", "Non-Binary", "Non-Binary"),
#'   "Participant" = c("S1", "S1", "s2", "s2", "s3", "s3", "s4", "s4"),
#'   "Condition" = c("A", "A", "A", "A", "B", "B", "B", "B")
#' )
#'
#' report_participants(data,
#'   age = "Age",
#'   sex = "Sex",
#'   gender = "Gender",
#'   participants = "Participant",
#'   by = "Condition"
#' )
#'
#' # Spell sample size
#' paste(
#'   report_participants(data, participants = "Participant", spell_n = TRUE),
#'   "were recruited in the study by means of torture and coercion."
#' )
#' @export
report_participants <- function(data,
                                age = NULL,
                                sex = NULL,
                                gender = NULL,
                                education = NULL,
                                country = NULL,
                                race = NULL,
                                participants = NULL,
                                by = NULL,
                                spell_n = FALSE,
                                digits = 1,
                                threshold = 10,
                                group = NULL,
                                ...) {
  ## TODO: deprecate later
  if (!is.null(group)) {
    insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
    by <- group
  }

  # Convert empty strings to NA
  data_list <- lapply(data, function(x) {
    x[which(x == "")] <- NA # nolint
    x
  })
  data <- as.data.frame(data_list, stringsAsFactors = FALSE)

  # find age variable automatically
  if (is.null(age)) {
    age <- .find_age_in_data(data)
  }

  # find sex variable automatically
  if (is.null(sex)) {
    sex <- .find_sex_in_data(data)
  }

  # find gender variable automatically
  if (is.null(gender)) {
    gender <- .find_gender_in_data(data)
  }

  # find education variable automatically
  if (is.null(education)) {
    education <- .find_education_in_data(data)
  }

  # find country variable automatically
  if (is.null(country)) {
    country <- .find_country_in_data(data)
  }

  # find race variable automatically
  if (is.null(race)) {
    race <- .find_race_in_data(data)
  }

  if (is.null(by)) {
    final_text <- .report_participants(
      data,
      age = age,
      sex = sex,
      gender = gender,
      education = education,
      country = country,
      race = race,
      participants = participants,
      spell_n = spell_n,
      digits = digits,
      threshold = threshold,
      ...
    )
  } else {
    final_text <- NULL
    data[[by]] <- as.character(data[[by]])
    for (i in split(data, data[by])) {
      current_text <- .report_participants(
        i,
        age = age,
        sex = sex,
        gender = gender,
        education = education,
        country = country,
        race = race,
        participants = participants,
        spell_n = spell_n,
        digits = digits,
        threshold = threshold
      )

      pre_text <- paste0(
        "the '",
        paste0(names(i[by]), " - ", vapply(i[by], unique, "character"), collapse = " and "),
        "' group: "
      )

      final_text <- c(final_text, paste0(pre_text, current_text))
    }
    final_text <- paste("For", datawizard::text_concatenate(final_text, sep = ", for ", last = " and for "))
  }
  final_text
}

#' @keywords internal
.check_df_names <- function(data, names) {
  data[names] <- lapply(names, function(x) {
    if (is.null(x) || !all(x %in% names(data))) {
      NA
    } else {
      data[[x]]
    }
  })
  data
}

#' @keywords internal
.replace_names <- function(data, x) {
  if (is.null(x) || !all(x %in% names(data))) {
    tools::toTitleCase(deparse(substitute(x)))
  } else {
    x
  }
}

#' @keywords internal
.report_participants <- function(data,
                                 age = "Age",
                                 sex = "Sex",
                                 gender = "Gender",
                                 education = "Education",
                                 country = "Country",
                                 race = "Race",
                                 participants = NULL,
                                 spell_n = FALSE,
                                 digits = 1,
                                 threshold = 10,
                                 ...) {
  # Sanity checks
  demo_names <- c("Age", "Sex", "Gender", "Education", "Country", "Race")
  data <- .check_df_names(data, names = demo_names)

  age <- .replace_names(data, age)
  sex <- .replace_names(data, sex)
  gender <- .replace_names(data, gender)
  education <- .replace_names(data, education)
  country <- .replace_names(data, country)
  race <- .replace_names(data, race)

  # Set age as numeric
  data[[age]] <- as.numeric(data[[age]])

  # Grouped data
  if (!is.null(participants)) {
    data <- data.frame(
      Age = stats::aggregate(data[[age]],
        by = list(data[[participants]]),
        FUN = mean
      )[[2]],
      Sex = stats::aggregate(data[[sex]],
        by = list(data[[participants]]),
        FUN = utils::head, n = 1
      )[[2]],
      Gender = stats::aggregate(data[[gender]],
        by = list(data[[participants]]),
        FUN = utils::head, n = 1
      )[[2]],
      Education = stats::aggregate(data[[education]],
        by = list(data[[participants]]),
        FUN = utils::head, n = 1
      )[[2]],
      Country = stats::aggregate(data[[country]],
        by = list(data[[participants]]),
        FUN = utils::head, n = 1
      )[[2]],
      Race = stats::aggregate(data[[race]],
        by = list(data[[participants]]),
        FUN = utils::head, n = 1
      )[[2]],
      stringsAsFactors = FALSE
    )
    age <- "Age"
    sex <- "Sex"
    gender <- "Gender"
    education <- "Education"
    country <- "Country"
    race <- "Race"
  }

  if (spell_n) {
    size <- tools::toTitleCase(insight::format_number(nrow(data)))
  } else {
    size <- nrow(data)
  }

  # Create text
  if (all(is.na(data[[age]]))) {
    text_age <- ""
  } else {
    text_age <- summary(
      report_statistics(
        data[[age]],
        n = FALSE,
        centrality = "mean",
        missing_percentage = TRUE,
        digits = digits,
        ...
      )
    )
    text_age <- sub("Mean =", "Mean age =", text_age, fixed = TRUE)
  }


  text_sex <- if (all(is.na(data[[sex]]))) {
    ""
  } else {
    paste0(
      "Sex: ",
      insight::format_value(length(data[[sex]][tolower(
        data[[sex]]
      ) %in% c("female", "f")]) / nrow(data) * 100, digits = digits),
      "% females, ",
      insight::format_value(length(data[[sex]][tolower(
        data[[sex]]
      ) %in% c("male", "m")]) / nrow(data) * 100, digits = digits),
      "% males, ",
      insight::format_value(100 - length(data[[sex]][tolower(
        data[[sex]]
      ) %in% c("male", "m", "female", "f", NA, "na")]) /
        nrow(data) * 100, digits = digits),
      "% other",
      if (insight::format_value(length(data[[sex]][tolower(data[[sex]]) %in% c(NA, "na")]) / nrow(data) * 100) != "0.00") { # nolint
        paste0(", ", insight::format_value(length(data[[sex]][tolower(
          data[[sex]]
        ) %in% c(NA, "na")]) / nrow(data) * 100), "% missing")
      }
    )
  }

  genders_woman <- c(
    "woman", "w", "female", "f", "women", "girl",
    "lady", "miss", "madam", "dame", "lass"
  )
  genders_man <- c(
    "man", "m", "male", "men", "boy",
    "guy", "dude", "lad", "sir"
  )
  both_genders <- c(genders_woman, genders_man, NA, "na")

  text_gender <- if (all(is.na(data[[gender]]))) {
    ""
  } else {
    paste0(
      "Gender: ",
      insight::format_value(length(data[[gender]][tolower(
        data[[gender]]
      ) %in% genders_woman]) / nrow(data) * 100, digits = digits),
      "% women, ",
      insight::format_value(length(data[[gender]][tolower(
        data[[gender]]
      ) %in% genders_man]) / nrow(data) * 100, digits = digits),
      "% men, ",
      insight::format_value(100 - length(data[[gender]][tolower(
        data[[gender]]
      ) %in% both_genders]) /
        nrow(data) * 100), "% non-binary",
      if (insight::format_value(length(data[[gender]][tolower(
        data[[gender]]
      ) %in% c(NA, "na")]) / nrow(data) * 100) != "0.00") {
        paste0(", ", insight::format_value(length(data[[gender]][tolower(
          data[[gender]]
        ) %in% c(NA, "na")]) / nrow(data) * 100), "% missing")
      }
    )
  }

  if (all(is.na(data[[education]]))) {
    text_education <- ""
  } else if (is.numeric(data[[education]])) {
    text_education <- summary(
      report_statistics(
        data[[education]],
        n = FALSE,
        centrality = "mean",
        missing_percentage = NULL,
        digits = digits,
        ...
      )
    )

    text_education <- sub("Mean =", "Mean education =", text_education, fixed = TRUE)
  } else {
    data[which(data[[education]] %in% c(NA, "NA")), education] <- "missing"
    txt <- summary(report_statistics(
      as.factor(data[[education]]),
      levels_percentage = TRUE,
      digits = digits,
      ...
    ))

    text_education <- paste0("Education: ", txt)
  }

  text_country <- if (all(is.na(data[[country]]))) {
    ""
  } else {
    data[[country]] <- as.character(data[[country]])
    data[which(data[[country]] %in% c(NA, "NA")), country] <- "missing"
    frequency_table <- as.data.frame(datawizard::data_tabulate(data[[country]]),
      stringsAsFactors = FALSE
    )[c(2, 4)]
    names(frequency_table)[2] <- "Percent"
    frequency_table <- frequency_table[-which(is.na(frequency_table$Value)), ]
    frequency_table <- frequency_table[order(-frequency_table$Percent), ]
    upper <- frequency_table[which(frequency_table$Percent >= threshold), ]
    lower <- frequency_table[which(frequency_table$Percent < threshold), ]
    if (nrow(lower) > 0) {
      lower_sum <- data.frame(
        Value = "other", Percent = sum(lower$Percent),
        stringsAsFactors = FALSE
      )
      combined <- rbind(upper, lower_sum)
    } else {
      combined <- upper
    }
    combined$Percent <- insight::format_value(combined$Percent)
    value_string <- paste0(combined$Percent, "% ", combined$Value, collapse = ", ")
    text_country <- paste("Country:", value_string)
  }

  text_race <- if (all(is.na(data[[race]]))) {
    ""
  } else {
    data[[race]] <- as.character(data[[race]])
    data[which(data[[race]] %in% c(NA, "NA")), race] <- "missing"
    frequency_table <- as.data.frame(datawizard::data_tabulate(data[[race]]),
      stringsAsFactors = FALSE
    )[c(2, 4)]
    names(frequency_table)[2] <- "Percent"
    frequency_table <- frequency_table[-which(is.na(frequency_table$Value)), ]
    frequency_table <- frequency_table[order(-frequency_table$Percent), ]
    upper <- frequency_table[which(frequency_table$Percent >= threshold), ]
    lower <- frequency_table[which(frequency_table$Percent < threshold), ]
    if (nrow(lower) > 0) {
      lower_sum <- data.frame(
        Value = "other", Percent = sum(lower$Percent),
        stringsAsFactors = FALSE
      )
      combined <- rbind(upper, lower_sum)
    } else {
      combined <- upper
    }
    combined$Percent <- insight::format_value(combined$Percent)
    value_string <- paste0(combined$Percent, "% ", combined$Value, collapse = ", ")
    text_race <- paste("Race:", value_string)
  }

  # nolint start
  paste0(
    size,
    " participants (",
    ifelse(text_age == "", "", text_age),
    ifelse(text_sex == "", "", paste0(ifelse(
      text_age == "", "", "; "
    ), text_sex)),
    ifelse(text_gender == "", "", paste0(ifelse(
      text_age == "" & text_sex == "", "", "; "
    ), text_gender)),
    ifelse(text_education == "", "", paste0(ifelse(
      text_age == "" & text_sex == "" & text_gender == "", "", "; "
    ), text_education)),
    ifelse(text_country == "", "", paste0(ifelse(
      text_education == "" & text_age == "" & text_sex == "" &
        text_gender == "", "", "; "
    ), text_country)),
    ifelse(text_race == "", "", paste0(ifelse(
      text_country == "" & text_education == "" & text_age == "" &
        text_sex == "" & text_gender == "", "", "; "
    ), text_race)),
    ")"
  )
  # nolint end
}

#' @keywords internal
.find_age_in_data <- function(data) {
  if ("Age" %in% colnames(data)) {
    "Age"
  } else if ("age" %in% colnames(data)) {
    "age"
  } else if (any(startsWith("Age", colnames(data)))) {
    grep("^Age", colnames(data), value = TRUE)[1]
  } else if (any(startsWith("age", colnames(data)))) {
    grep("^age", colnames(data), value = TRUE)[1]
  } else {
    ""
  }
}

#' @keywords internal
.find_sex_in_data <- function(data) {
  if ("Sex" %in% colnames(data)) {
    "Sex"
  } else if ("sex" %in% colnames(data)) {
    "sex"
  } else if (any(startsWith("Sex", colnames(data)))) {
    grep("^Sex", colnames(data), value = TRUE)[1]
  } else if (any(startsWith("sex", colnames(data)))) {
    grep("^sex", colnames(data), value = TRUE)[1]
  } else {
    ""
  }
}

#' @keywords internal
.find_gender_in_data <- function(data) {
  if ("Gender" %in% colnames(data)) {
    "Gender"
  } else if ("gender" %in% colnames(data)) {
    "gender"
  } else if (any(startsWith("Gender", colnames(data)))) {
    grep("^Gender", colnames(data), value = TRUE)[1]
  } else if (any(startsWith("gender", colnames(data)))) {
    grep("^gender", colnames(data), value = TRUE)[1]
  } else {
    ""
  }
}

#' @keywords internal
.find_education_in_data <- function(data) {
  if ("Education" %in% colnames(data)) {
    "Education"
  } else if ("education" %in% colnames(data)) {
    "education"
  } else if (any(startsWith("Education", colnames(data)))) {
    grep("^Education", colnames(data), value = TRUE)[1]
  } else if (any(startsWith("education", colnames(data)))) {
    grep("^education", colnames(data), value = TRUE)[1]
  } else if ("isced" %in% colnames(data)) {
    "isced"
  } else if (any(startsWith("isced", colnames(data)))) {
    grep("^isced", colnames(data), value = TRUE)[1]
  } else {
    ""
  }
}

#' @keywords internal
.find_country_in_data <- function(data) {
  if ("Country" %in% colnames(data)) {
    "Country"
  } else if ("country" %in% colnames(data)) {
    "country"
  } else if (any(startsWith("Country", colnames(data)))) {
    grep("^Country", colnames(data), value = TRUE)[1]
  } else if (any(startsWith("country", colnames(data)))) {
    grep("^country", colnames(data), value = TRUE)[1]
  } else {
    ""
  }
}

#' @keywords internal
.find_race_in_data <- function(data) {
  if ("Race" %in% colnames(data)) {
    "Race"
  } else if ("race" %in% colnames(data)) {
    "race"
  } else if (any(startsWith("Race", colnames(data)))) {
    grep("^Race", colnames(data), value = TRUE)[1]
  } else if (any(startsWith("race", colnames(data)))) {
    grep("^race", colnames(data), value = TRUE)[1]
  } else {
    ""
  }
}

Try the report package in your browser

Any scripts or data that you put into this service are public.

report documentation built on Sept. 11, 2024, 8:47 p.m.