R/helpers_regression.R

Defines functions valid_ids check_discord_errors make_mean_diffs check_sibling_order

Documented in check_discord_errors check_sibling_order make_mean_diffs

#' @title Check Sibling Order
#'
#' @description This function determines the order of sibling pairs based on an outcome variable.
#' It function checks which of the two kinship pairs has more of a specified outcome variable.
#' It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome.
#' If the two siblings have the same amount of the outcome, it randomly assigns one as having more.
#
#' @inheritParams discord_data
#' @param row The row number of the data frame
#'
#' @return A character string signifying which familial member (1, 2, or
#'   neither) has more of the outcome.
#'
check_sibling_order <- function(data, outcome, pair_identifiers, row) {
  # Select the row of interest from the data frame
  data <- data[row, ]

  # Get the value of the outcome variable for each sibling
  outcome1 <- data[, base::paste0(outcome, pair_identifiers[1])]
  outcome2 <- data[, base::paste0(outcome, pair_identifiers[2])]

  # Check if either sibling has missing (NA) outcome data
  if (is.na(outcome1) | is.na(outcome2)) {
    stop(paste0("There are missing data, encoded as `NA`, for at least one kinship pair in the '", outcome, "' variable and data cannot be prepped properly.\n Please remove or impute missing data."))
  }

  if (outcome1 > outcome2) {
    data$order <- "s1"
  } else if (outcome1 < outcome2) {
    data$order <- "s2"
  } else if (outcome1 == outcome2) {
    p <- stats::rbinom(1, 1, 0.5)

    if (p) {
      data$order <- "s1"
    } else if (!p) {
      data$order <- "s2"
    }
  }

  return(data)
}
#' @title Make Mean Differences
#'
#' @description This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order().
#' If the demographics parameter is set to "race", "sex", or "both", it also prepares demographic information accordingly,
#' swapping the order of demographics as per the order column.
#' @inheritParams discord_data
#' @inheritParams check_sibling_order
#' @param variable outcomes and predictors for manipulating the data
#'
make_mean_diffs <- function(data, id, sex, race, demographics,
                            variable, pair_identifiers, row,
                            coding_method = "none") {
  S1 <- base::paste0(variable, pair_identifiers[1])
  S2 <- base::paste0(variable, pair_identifiers[2])
  sexS1 <- base::paste0(sex, pair_identifiers[1])
  sexS2 <- base::paste0(sex, pair_identifiers[2])
  raceS1 <- base::paste0(race, pair_identifiers[1])
  raceS2 <- base::paste0(race, pair_identifiers[2])

  data <- data[row, ]


  # write the core of the of the make_mean_diffs
  # This always runs -- ignoring sex or race variables
  if (data[, "order"] == "s1") {
    # no need to be yelled at by r for subtracting strings)
    diff <- suppressWarnings(data[[S1]] - data[[S2]])
    mean <- suppressWarnings(base::mean(c(data[[S1]], data[[S2]])))

    output <- data.frame(
      id = data[[id]],
      variable_1 = data[[S1]],
      variable_2 = data[[S2]],
      variable_diff = diff,
      variable_mean = mean
    )
  } else if (data[, "order"] == "s2") {
    # no need to be yelled at by r for subtracting strings)
    diff <- suppressWarnings(data[[S2]] - data[[S1]])
    mean <- suppressWarnings(base::mean(c(data[[S1]], data[[S2]])))

    output <- data.frame(
      id = data[[id]],
      variable_1 = data[[S2]],
      variable_2 = data[[S1]],
      variable_diff = diff,
      variable_mean = mean
    )
  }

  names(output) <- c(
    "id",
    paste0(variable, "_1"),
    paste0(variable, "_2"),
    paste0(variable, "_diff"),
    paste0(variable, "_mean")
  )

  # check for whether or not race and sex are defined

  if (demographics == "race") {
    if (data[, "order"] == "s1") {
      output_demographics <- data.frame(
        race_1 = data[[raceS1]],
        race_2 = data[[raceS2]]
      )
    } else if (data[, "order"] == "s2") {
      output_demographics <- data.frame(
        race_1 = data[[raceS2]],
        race_2 = data[[raceS1]]
      )
    }

    names(output_demographics) <- paste0(race, c("_1", "_2"))
  } else if (demographics == "sex") {
    if (data[, "order"] == "s1") {
      output_demographics <- data.frame(
        sex_1 = data[[sexS1]],
        sex_2 = data[[sexS2]]
      )
    } else if (data[, "order"] == "s2") {
      output_demographics <- data.frame(
        sex_1 = data[[sexS2]],
        sex_2 = data[[sexS1]]
      )
    }

    names(output_demographics) <- paste0(sex, c("_1", "_2"))
  } else if (demographics == "both") {
    if (data[, "order"] == "s1") {
      output_demographics <- data.frame(
        sex_1 = data[[sexS1]],
        sex_2 = data[[sexS2]],
        race_1 = data[[raceS1]],
        race_2 = data[[raceS2]]
      )
    } else if (data[, "order"] == "s2") {
      output_demographics <- data.frame(
        sex_1 = data[[sexS2]],
        sex_2 = data[[sexS1]],
        race_1 = data[[raceS2]],
        race_2 = data[[raceS1]]
      )
    }

    names(output_demographics) <- c(paste0(sex, c("_1", "_2")), paste0(race, c("_1", "_2")))
  }
  if (coding_method != "none") {
    # New logic to handle race and sex as categorical variables
    if (demographics == "both" || demographics == "race") {
      race_1_name <- paste0(race, "_1")
      race_2_name <- paste0(race, "_2")
      output_demographics[[paste0(race, "_binarymatch")]] <- ifelse(output_demographics[[race_1_name]] == output_demographics[[race_2_name]],
        1, 0
      )
      output_demographics[[paste0(race, "_multimatch")]] <- ifelse(output_demographics[[race_1_name]] == output_demographics[[race_2_name]],
        as.character(output_demographics[[race_2_name]]), "mixed"
      )
    }
    if (demographics == "both" || demographics == "sex") {
      sex_1_name <- paste0(sex, "_1")
      sex_2_name <- paste0(sex, "_2")
      output_demographics[[paste0(sex, "_binarymatch")]] <- ifelse(output_demographics[[sex_1_name]] == output_demographics[[sex_2_name]],
        1, 0
      )
      output_demographics[[paste0(sex, "_multimatch")]] <- ifelse(output_demographics[[sex_1_name]] == output_demographics[[sex_2_name]], as.character(output_demographics[[sex_2_name]]), "mixed")
    }
  }
  if (exists("output_demographics")) {
    output <- base::cbind(output, output_demographics)
  }


  return(output)
}

#' @title Check Discord Errors
#'
#' @description This function checks for common errors in the provided data, including the correct specification of identifiers (ID, sex, race) and their existence in the data.
#'
#
#' @param data The data to perform a discord regression on.
#' @param id A unique kinship pair identifier.
#' @param sex A character string for the sex column name.
#' @param race A character string for the race column name.
#' @param pair_identifiers A character vector of length two that contains the variable identifier for each kinship pair.
#'
#' @return An error message if one of the conditions are met.
#'
check_discord_errors <- function(data, id, sex, race, pair_identifiers) {
  if (!is.null(id)) {
    if (!id %in% base::names(data)) {
      stop(paste0("The kinship pair ID \"", id, "\" is not valid. Please check that you have the correct column name."))
    }
  }

  if (!base::is.null(sex) && base::sum(base::grepl(sex, base::names(data))) == 0) {
    stop(paste0("The kinship pair sex identifier \"", sex, "\" is not appropriately defined. Please check that you have the correct column name."))
  }
  if (!base::is.null(race) && base::sum(base::grepl(race, base::names(data))) == 0) {
    stop(paste0("The kinship pair race identifier \"", race, "\" is not appropriately defined. Please check that you have the correct column name."))
  }
  if (base::sum(base::grepl(pair_identifiers[1], base::names(data))) == 0 | base::sum(base::grepl(pair_identifiers[2], base::names(data))) == 0) {
    stop(paste0("Please check that the kinship pair identifiers \"", pair_identifiers[1], "\" and \"", pair_identifiers[2], "\" are valid, i.e. ensure that you have the correct labels for each kin."))
  }
  if (!base::is.null(sex) & !base::is.null(race) && sex == race) {
    stop("Please check that your sex and race variables are not equal.")
  }
}

#' @title Validate IDs
#'
#' @description This function checks if the provided kinship pair IDs are unique.
#'
#' @param data The data frame to be checked.
#' @param id A string representing the column name for kinship pair IDs.
#'
#' @return A logical value. If TRUE, the IDs are unique for each kin-pair. If FALSE, there is at least one duplicate ID.
#'
#' @noRd
#'
valid_ids <- function(data, id) {
  if (!is.null(id)) {
    id_length <- length(unique(data[[id]]))
    if (id_length != nrow(data)) {
      dwarn("Specified id column does not contain unique values for each kin-pair.
Adding row-wise ID for restructuring data into paired format for analysis.
For more details, see <https://github.com/R-Computing-Lab/discord/issues/6>.")
      return(FALSE)
    } else if (id_length == nrow(data)) {
      return(TRUE)
    }
  } else if (is.null(id)) {
    return(FALSE)
  }
}
smasongarrison/discord documentation built on March 4, 2024, 12:55 p.m.