R/main.R

Defines functions plot.cond_maha proportion2percentile proportion_round is_singular p2label print.maha format.maha print.cond_maha format.cond_maha cond_maha

Documented in cond_maha is_singular p2label plot.cond_maha proportion2percentile proportion_round

#' Calculate the conditional Mahalanobis distance for any variables.
#'
#' @export
#' @param data Data.frame with the independent and dependent variables.
#' Unless mu and sigma are specified, data are assumed to be z-scores.
#' @param R Correlation among all variables.
#' @param v_dep Vector of names of the dependent variables in your profile.
#' @param v_ind Vector of names of independent variables you would like to
#' control for.
#' @param v_ind_composites Vector of names of independent variables that are
#' composites of dependent variables
#' @param mu A vector of means. A single value means that all variables have
#' the same mean.
#' @param sigma A vector of standard deviations. A single value means that
#' all variables have the same standard deviation
#' @param use_sample_stats If TRUE, estimate R, mu, and sigma from data.
#' Only complete cases are used (i.e., no missing values in v_dep, v_ind,
#' v_ind_composites).
#' @param label optional tag for labeling output
#' @return a list with the conditional Mahalanobis distance
#' \itemize{
#' \item{`dCM` = Conditional Mahalanobis distance}
#' \item{`dCM_df` = Degrees of freedom for the conditional Mahalanobis distance}
#' \item{`dCM_p` = A proportion that indicates how unusual this profile is
#' compared to profiles with the same independent variable values. For example,
#' if `dCM_p` = 0.88, this profile is more unusual than 88 percent of profiles
#' after controlling for the independent variables.}
#' \item{`dM_dep` = Mahalanobis distance of just the dependent variables}
#' \item{`dM_dep_df` = Degrees of freedom for the Mahalanobis distance of
#' the dependent variables}
#' \item{`dM_dep_p` = Proportion associated with the Mahalanobis distance
#' of the dependent variables}
#' \item{`dM_ind` = Mahalanobis distance of just the independent variables}
#' \item{`dM_ind_df` = Degrees of freedom for the Mahalanobis distance of
#' the independent variables}
#' \item{`dM_ind_p` = Proportion associated with the Mahalanobis distance
#' of the independent variables}
#' \item{`v_dep` = Dependent variable names}
#' \item{`v_ind` = Independent variable names}
#' \item{`v_ind_singular` = Independent variables that can be perfectly
#' predicted from the dependent variables (e.g., composite scores)}
#' \item{`v_ind_nonsingular` = Independent variables that are not perfectly
#' predicted from the dependent variables}
#' \item{`data` = data used in the calculations}
#' \item{`d_ind` = independent variable data}
#' \item{`d_inp_p` = Assuming normality, cumulative distribution function
#' of the independent variables}
#' \item{`d_dep` = dependent variable data}
#' \item{`d_dep_predicted` = predicted values of the dependent variables}
#' \item{`d_dep_deviations = d_dep - d_dep_predicted` (i.e., residuals of
#' the dependent variables)}
#' \item{`d_dep_residuals_z` = standardized residuals of the dependent
#' variables}
#' \item{`d_dep_cp` = conditional proportions associated with
#' standardized residuals}
#' \item{`d_dep_p` = Assuming normality, cumulative distribution function
#' of the dependent variables}
#' \item{`R2` = Proportion of variance in each dependent variable explained
#' by the independent variables}
#' \item{`zSEE` = Standardized standard error of the estimate
#' for each dependent variable}
#' \item{`SEE` = Standard error of the estimate for each dependent variable}
#' \item{`ConditionalCovariance` = Covariance matrix of the dependent
#' variables after controlling for the independent variables}
#' \item{`distance_reduction = 1 - (dCM / dM_dep)` (Degree to which the
#' independent variables decrease the Mahalanobis distance of the dependent
#' variables. Negative reductions mean that the profile is more unusual
#' after controlling for the independent variables. Returns 0
#' if `dM_dep` is 0.)}
#' \item{`variability_reduction = 1 - sum((X_dep - predicted_dep) ^ 2) /
#' sum((X_dep - mu_dep) ^ 2)` (Degree to which the independent variables
#' decrease the variability the dependent variables (`X_dep`).
#' Negative reductions mean that the profile is more variable after
#' controlling for the independent variables. Returns 0 if `X_dep == mu_dep`)}
#' \item{`mu` = Variable means}
#' \item{`sigma` = Variable standard deviations}
#' \item{`d_person` = Data frame consisting of Mahalanobis distance data for
#' each person}
#' \item{`d_variable` = Data frame consisting of variable characteristics}
#' \item{`label` = label slot}
#' }
#'
#' @examples
#' library(unusualprofile)
#' library(simstandard)
#'
#' m <- "
#' Gc =~ 0.85 * Gc1 + 0.68 * Gc2 + 0.8 * Gc3
#' Gf =~ 0.8 * Gf1 + 0.9 * Gf2 + 0.8 * Gf3
#' Gs =~ 0.7 * Gs1 + 0.8 * Gs2 + 0.8 * Gs3
#' Read =~ 0.66 * Read1 + 0.85 * Read2 + 0.91 * Read3
#' Math =~ 0.4 * Math1 + 0.9 * Math2 + 0.7 * Math3
#' Gc ~ 0.6 * Gf + 0.1 * Gs
#' Gf ~ 0.5 * Gs
#' Read ~ 0.4 * Gc + 0.1 * Gf
#' Math ~ 0.2 * Gc + 0.3 * Gf + 0.1 * Gs"
#' # Generate 10 cases
#' d_demo <- simstandard::sim_standardized(m = m, n = 10)
#'
#' # Get model-implied correlation matrix
#' R_all <- simstandard::sim_standardized_matrices(m)$Correlations$R_all
#'
#' cond_maha(data = d_demo,
#'           R = R_all,
#'           v_dep = c("Math", "Read"),
#'           v_ind = c("Gf", "Gs", "Gc"))
cond_maha <- function(data,
                      R,
                      v_dep,
                      v_ind = NULL,
                      v_ind_composites = NULL,
                      mu = 0,
                      sigma = 1,
                      use_sample_stats = FALSE,
                      label = NA) {
  # Convert data to matrix
  if (is.vector(data)) {
    data <- matrix(data, nrow = 1, dimnames = list(NULL, names(data)))
  }

  data <- as.matrix(data)


  v_ind <- unique(c(v_ind,v_ind_composites))

  # Checks ----

  # Check if v_dep are in colnames of data and R
  if (!is.null(v_dep)) {
    if (!all(v_dep %in% colnames(data))) {
      v_dep_not_in_d <- paste(v_dep[!(v_ind_composites %in%
                                        colnames(data))],
                              collapse = ", ")
      stop(paste0("Some variables in v_dep are not in data: ",
                  v_dep_not_in_d))
    }

    if (!all(v_dep %in% colnames(R))) {
      v_dep_not_in_R <- paste(v_dep[!(v_dep %in%
                                        colnames(R))],
                              collapse = ", ")
      stop(paste0("Some variables in v_dep are not in R: ",
                  v_dep_not_in_R))
    }
  }



  # Check if v_ind_composites are in colnames of data
  if (!is.null(v_ind_composites)) {
    if (!all(v_ind_composites %in% colnames(data))) {
      v_ind_not_in_d <- paste(v_ind_composites[!(v_ind_composites %in%
                                                   colnames(data))],
                              collapse = ", ")
      stop(paste0(
        "Some variables in v_ind_composites are not in data: ",
        v_ind_not_in_d
      ))
    }
  }

  # Check if v_ind_composites are in R
  if (!is.null(v_ind_composites)) {
    if (!all(v_ind_composites %in% colnames(R))) {
      v_ind_not_in_d <- paste(v_ind_composites[!(v_ind_composites %in%
                                                   colnames(R))],
                              collapse = ", ")
      stop(paste0(
        "Some variables in v_ind_composites are not in R: ",
        v_ind_not_in_d
      ))
    }
  }

  # Check if v_ind are in colnames of data
  if (!is.null(v_ind)) {
    if (!all(v_ind %in% colnames(data))) {
      v_ind_not_in_d <- paste(v_ind[!(v_ind %in% colnames(data))],
                              collapse = ", ")
      stop(paste0("Some variables in v_ind are not in data: ",
                  v_ind_not_in_d))
    }


    if (!all(v_ind %in% colnames(R))) {
      v_ind_not_in_R <- paste(v_ind[!(v_ind %in% colnames(R))],
                              collapse = ", ")
      stop(paste0("Some variables in v_ind are not in R: ",
                  v_ind_not_in_R))
    }



    # Check if v_dep and v_ind have overlapping v
    if (any(v_dep %in% v_ind)) {
      overlap <- v_dep[v_dep %in% v_ind]
      stop(
        paste0("At least one variable is in both v_dep and v_ind: ",
               paste(overlap, collapse = " ")))
    }
  }




  v_all <- c(v_ind, v_dep)
  data_k <- length(v_all)
  # Select only variables that are used
  v_data_original <- colnames(data)
  data <- data[, v_all, drop = FALSE]

  # If data have no means specified
  if (is.null(mu) && !use_sample_stats) {
    mu <- rep(0, data_k)
    names(mu) <- colnames(data)
  } else {
    # Number of values in mu
    mu_k <- length(mu)

    # If there is only 1 value in mu, make data_k copies
    if (mu_k == 1) {
      mu <- rep(mu, data_k)
      names(mu) <- v_all
    }


    # If length of mu is unequal to the number of variables in data
    if (mu_k != data_k && mu_k > 1) {
      stop(
        paste0(
          "There are ",
          mu_k,
          " means in mu, but ",
          data_k,
          ifelse(data_k == 1, " column ", " columns "),
          "in the data. Supply 1 mean for each variable."
        )
      )
    } else {
      if (is.null(names(mu))) {
        # names in original order
        names(mu) <- v_data_original[v_data_original %in% v_all]
        # sort to new order
        mu <- mu[v_all]
      }

    }
  }


  # If data have no standard deviations specified
  if (is.null(sigma) && !use_sample_stats) {
    sigma <- rep(1, data_k)
    names(sigma) <- v_all
  } else {
    # Number of values in sigma
    sigma_k <- length(sigma)

    # If there is only 1 value in sigma, make data_k copies
    if (sigma_k == 1) {
      sigma <- rep(sigma, data_k)
      names(sigma) <- v_all
    }

    # If length of sigma is unequal to the number of variables in data
    if ((sigma_k != data_k) && (sigma_k > 1)) {
      stop(
        paste0(
          "There are ",
          sigma_k,
          " means in sigma, but ",
          data_k,
          ifelse(data_k == 1, " column ", " columns "),
          "in the data. Supply 1 mean for each variable."
        )
      )
    } else {
      if (is.null(names(sigma))) {
        # names in original order
        names(sigma) <- v_data_original[v_data_original %in% v_all]
        # sort to new order
        sigma <- sigma[v_all]
      }

    }
  }



  # if use_sample_stats, estimate R, mu, and sigma from data
  if (use_sample_stats) {
    if (nrow(data) < 3)
      stop(
        paste0(
          "Sample statistics cannot be calculated with fewer ",
          "than 3 rows of data. For accurate statistics, much ",
          "more than 3 rows are needed."
        )
      )
    d_estimate <-
      data[stats::complete.cases(data), v_all, drop = FALSE]
    R <- stats::cor(d_estimate)
    mu <- colMeans(d_estimate)
    sigma <- apply(d_estimate, 2, stats::sd)
  }

  # means and sd of dependent variables

  mu <- matrix(mu,
               ncol = 1,
               dimnames = list(names(mu), NULL))[v_all, , drop = FALSE]
  sigma <- matrix(sigma,
                  ncol = 1,
                  dimnames = list(names(sigma)))[v_all, , drop = FALSE]
  mu_dep <- mu[v_dep, , drop = FALSE]
  sigma_dep <- sigma[v_dep, , drop = FALSE]





  R <- R[v_all, v_all, drop = FALSE]
  # Check if R is a correlation matrix
  # Check if R has ones on the diagonal
  if (!all(diag(R) == 1)) {
    stop("R has values on its diagonal that are not ones.")
  }
  # Check if R is symmetric
  if (!isSymmetric(R))
    stop("R is not symmetric")
  # Check if all values in R are between -1 and 1
  if (!all(R <= 1 & R >= -1)) {
    stop("Some values of R are outside the range of 1 and -1.")
  }

  # Make z-scores ----
  n <- nrow(data)
  ones <- matrix(rep(1, n), ncol = 1)
  colmu <- ones %*% t(mu)
  colsigma <- ones %*% t(sigma)
  d_z <- (data - colmu) / colsigma

  # Select covariance among dependent variables
  Ryy <- R[v_dep, v_dep, drop = FALSE]

  # Make sure dependent covariance is nonsingluar
  if (is_singular(Ryy)) {
    stop(
      paste0(
        "Dependent measures are collinear. ",
        "Cannot calculate the Mahalanobis Distance"
      )
    )
  }


  # Data for just dependent variables
  d_dep <- data[, v_dep, drop = FALSE]
  d_dep_z <- d_z[, v_dep, drop = FALSE]

  # Number of dependent variables
  k_dep <- length(v_dep)

  # (Unconditional) Mahalanobis distance of dependent variables
  dM_dep <- (((d_dep_z %*% solve(Ryy)) * d_dep_z) %*%
               matrix(1, nrow = k_dep)) %>%
    sqrt %>%
    as.vector
  # Probability for Mahalanobis distance of dependent variables
  dM_dep_p <- stats::pchisq(dM_dep ^ 2, df = k_dep)

  if (!is.null(v_ind)) {
    # If there are independent variables,
    # calculate conditional Mahalanobis distance.


    mu_ind <- mu[v_ind, , drop = FALSE]
    sigma_ind <- sigma[v_ind, , drop = FALSE]

    # Make matrices
    Rxx <- R[v_ind, v_ind, drop = FALSE]
    Rxy <- R[v_ind, v_dep, drop = FALSE]
    Ryx <- R[v_dep, v_ind, drop = FALSE]

    # Make sure independent covariance is nonsingluar
    if (is_singular(Rxx)) {
      stop(
        paste0(
          "Independent measures are collinear. ",
          "Cannot calculate the Mahalanobis Distance"
        )
      )
    }

    iRxx <- solve(Rxx)

    # Make regression coefficients
    reg_beta <- iRxx %*% Rxy

    # Make variance explained
    R2 <- colSums(reg_beta * Rxy)

    # Make Standard Error of Estimate
    zSEE <- sqrt(1 - R2)
    SEE <- zSEE * sigma_dep[, 1, drop = TRUE]

    # Data for just independent variables
    d_ind <- data[, v_ind, drop = FALSE]
    d_ind_z <- d_z[, v_ind, drop = FALSE]
    d_ind_p <- stats::pnorm(d_ind_z)

    # Make predicted deps
    d_predicted_z <- d_ind_z %*% reg_beta
    d_deviations_z <- d_dep_z - d_predicted_z
    d_predicted <- d_predicted_z * colsigma[, v_dep, drop = FALSE] +
      colmu[, v_dep, drop = FALSE]
    d_deviations <- d_dep - d_predicted
    variability_reduction <- 1 -  sum(d_deviations ^ 2) /
      sum((d_dep - mu_dep[, 1, drop = TRUE]) ^ 2)

    # Conditional Variance
    cov_cond <- Ryy - Ryx %*% iRxx %*% Rxy
    k_ind <- length(v_ind)
    dCM_df <- k_dep

    # Initialize predictor vectors
    if (is.null(v_ind_composites)) {
      v_ind_composites <- character(0)
    }

    v_ind_singular <- v_ind_composites
    v_ind_nonsingular <- character(0)
    v_ind_try <- setdiff(v_ind, v_ind_singular)
    dCM_df <- dCM_df - length(v_ind_singular)

    if (is_singular(cov_cond)) {
      # Function to see if adding a predictor makes the
      # conditional covariance singular
      addpredictor <- function(p, oldInd, v_dep, R) {
        vInd <- c(p, oldInd)
        Rxx <- R[vInd, vInd]
        Rxy <- R[vInd, v_dep]
        Ryx <- R[v_dep, vInd]
        iRxx <- solve(Rxx)
        cov_cond <- Ryy - Ryx %*% iRxx %*% Rxy
        !is_singular(cov_cond)
      }

      # Add predictors that do not make conditional
      # covariance singular
      for (i in v_ind_try) {
        if (addpredictor(i, v_ind_nonsingular, v_dep, R)) {
          v_ind_nonsingular <- c(v_ind_nonsingular, i)
        } else {
          v_ind_singular <- c(v_ind_singular, i)
          dCM_df <- dCM_df - 1
        }
      }

      # Make final conditional covariance
      if (length(v_ind_nonsingular) > 0) {
        Rxx <- R[v_ind_nonsingular, v_ind_nonsingular]
        Rxy <- R[v_ind_nonsingular, v_dep]
        Ryx <- R[v_dep, v_ind_nonsingular]
        iRxx <- solve(Rxx)
        cov_cond <- Ryy - Ryx %*% iRxx %*% Rxy
      } else {
        cov_cond <- Ryy
      }
    }


    # Conditional Mahalanobis Distance
    dCM <- (((d_deviations_z %*%
                solve(cov_cond)) * d_deviations_z) %*%
              matrix(1, nrow = k_dep)) %>%
      sqrt %>%
      as.vector

    # Probability for Conditional Mahalanobis Distance
    dCM_p <- stats::pchisq(dCM ^ 2, dCM_df)

    # Calculate Mahalanobis distance of independent variables
    dM_ind <-
      (((d_ind_z %*% solve(R[v_ind, v_ind, drop = FALSE])) * d_ind_z) %*%
         matrix(1, nrow = k_ind)) %>%
      sqrt %>%
      as.vector()

    # Probability for Mahalanobis distance of independent variables
    dM_ind_p <- stats::pchisq(dM_ind ^ 2, df = k_ind)


    # Dependent standardized residuals (z-scores)
    d_dep_residuals_z <- d_deviations_z / zSEE

    # Dependent cp
    d_dep_cp <- stats::pnorm(d_dep_residuals_z)

    # Dependent p
    d_dep_p <- stats::pnorm(d_dep_z)



    d_person <- tibble::tibble(
      id = seq_len(nrow(data)),
      dCM = dCM,
      dCM_df = dCM_df,
      dCM_p = dCM_p,
      dM_dep = dM_dep,
      dM_dep_df = k_dep,
      dM_dep_p = dM_dep_p,
      dM_ind = dM_ind,
      dM_ind_df = k_ind,
      dM_ind_p = dM_ind_p
    )

    d_variable <- tibble::tibble(
      Variable = v_dep,
      mu = mu_dep[, 1],
      sigma = sigma_dep[, 1, drop = TRUE],
      R2 = R2,
      zSEE = zSEE,
      SEE = SEE
    ) %>%
      dplyr::bind_rows(tibble::tibble(
        Variable = v_ind,
        mu = mu_ind[, 1],
        sigma = sigma_ind[, 1]
      ))

    d_score <- dplyr::bind_rows(
      d_dep %>%
        tibble::as_tibble() %>%
        tibble::rowid_to_column("id") %>%
        dplyr::mutate(type = "Score"),
      d_dep_p %>%
        tibble::as_tibble() %>%
        tibble::rowid_to_column("id") %>%
        dplyr::mutate(type = "p"),
      d_dep_cp %>%
        tibble::as_tibble() %>%
        tibble::rowid_to_column("id") %>%
        dplyr::mutate(type = "cp"),
      d_predicted %>%
        tibble::as_tibble() %>%
        tibble::rowid_to_column("id") %>%
        dplyr::mutate(type = "Predicted")
    ) %>%
      dplyr::mutate(Role = "Dependent") %>%
      tidyr::pivot_longer(!!v_dep,
                          names_to = "Variable",
                          values_to = "Value") %>%
      dplyr::bind_rows(
        dplyr::bind_rows(
          d_ind %>%
            tibble::as_tibble() %>%
            tibble::rowid_to_column("id") %>%
            dplyr::mutate(type = "Score"),
          d_ind_p %>%
            tibble::as_tibble() %>%
            tibble::rowid_to_column("id") %>%
            dplyr::mutate(type = "p")
        ) %>%
          dplyr::mutate(Role = "Independent") %>%
          tidyr::pivot_longer(!!v_ind,
                              names_to = "Variable",
                              values_to = "Value")
      ) %>%
      tidyr::pivot_wider(names_from = "type",
                         values_from = "Value") %>%
      dplyr::left_join(d_variable, by = "Variable") %>%
      dplyr::mutate(Variable = factor(.data$Variable, levels = v_all))

    CM <- list(
      dCM = dCM,
      dCM_df = dCM_df,
      dCM_p = dCM_p,
      dM_dep = dM_dep,
      dM_dep_df = k_dep,
      dM_dep_p = dM_dep_p,
      dM_ind = dM_ind,
      dM_ind_df = k_ind,
      dM_ind_p = dM_ind_p,
      v_dep = v_dep,
      v_ind = v_ind,
      v_ind_singular = v_ind_singular,
      v_ind_nonsingular = v_ind_nonsingular,
      data = tibble::as_tibble(data),
      d_ind = tibble::as_tibble(d_ind),
      d_ind_p = tibble::as_tibble(d_ind_p),
      d_dep = tibble::as_tibble(d_dep),
      d_predicted = tibble::as_tibble(d_predicted),
      d_deviations = tibble::as_tibble(d_deviations),
      d_dep_residuals_z = tibble::as_tibble(d_dep_residuals_z),
      d_dep_cp = tibble::as_tibble(d_dep_cp),
      d_dep_p = tibble::as_tibble(d_dep_p),
      R2 = R2,
      zSEE = zSEE,
      SEE = SEE,
      ConditionalCovariance = cov_cond,
      distance_reduction = ifelse(dM_dep == 0, 0, 1 - (dCM / dM_dep)),
      variability_reduction = variability_reduction,
      mu = mu[, 1],
      sigma = sigma[, 1],
      d_person = d_person,
      d_score = d_score,
      d_variable = d_variable,
      label = label
    )


    class(CM) <- c("cond_maha", class(CM))
    CM



} else {
  # If there are no independent variables
  M <- list(
    dM_dep = dM_dep,
    dM_dep_df = k_dep,
    dM_dep_p = dM_dep_p,
    d_dep = tibble::as_tibble(d_dep),
    v_dep = v_dep,
    data = tibble::as_tibble(data),
    mu = mu[v_dep, 1],
    sigma = sigma[v_dep, 1],
    label = label
  )
  class(M) <- c("maha", class(M))
  M
  }
}

#' Format cond_maha class
#'
#' @param x Object of cond_maha class
#' @noRd
#' @keywords internal
#' @export
format.cond_maha <- function(x, ...) {
  paste0(
    "Conditional Mahalanobis Distance = ",
    formatC(x$dCM, 4, format = "f"),
    ", df = ",
    x$dCM_df,
    ", p = ",
    formatC(x$dCM_p, 4, format = "f")
  )
}

#' Print cond_maha class
#'
#' @param x Object of cond_maha class
#' @noRd
#' @keywords internal
#' @export
print.cond_maha <- function(x, ...) {
  cat(format(x, ...), "\n")
}


#' Format maha class
#'
#' @param x Object of maha class
#' @noRd
#' @keywords internal
#' @export
format.maha <- function(x, ...) {
  paste0(
    "Mahalanobis Distance = ",
    formatC(x$dM_dep, 4, format = "f"),
    ", df = ",
    x$dM_dep_df,
    ", p = ",
    formatC(x$dM_dep_p, 4, format = "f")
  )
}

#' Print maha class
#'
#' @param x Object of maha class
#' @noRd
#' @keywords internal
#' @export
print.maha <- function(x, ...) {
  cat(format(x, ...))
}



#' Range label associated with probability
#'
#' @export
#' @param p Probability
#' @keywords internal
#' @return label string
p2label <- function(p) {
  thresholds <- purrr::map_int(p, function(x) {
    sum(x > stats::pnorm(seq(60, 90, 10), 100, 15)) +
      sum(x >= stats::pnorm(seq(110, 140, 10), 100, 15)) + 1L
  })
  vlabels <- c(
    "Extremely Low",
    "Very Low",
    "Low",
    "Low Average",
    "Average",
    "High Average",
    "High",
    "Very High",
    "Extremely High"
  )
  vlabels[thresholds]
}


#' Test if matrix is singular
#'
#' @export
#' @param x matrix
#' @keywords internal
#' @return logical
is_singular <- function(x) {
  det(x) < .Machine$double.eps
}

#' Rounds proportions to significant digits both near 0 and 1
#'
#' @param p probability
#' @param digits rounding digits
#'
#' @return numeric vector
#' @export
#'
#' @examples
#' proportion_round(0.01111)

proportion_round <- function(p, digits = 2) {
  lower_limit <- 0.95 * 10 ^ (-1 * digits)
  upper_limit <- 1 - lower_limit

  p1 <- dplyr::if_else(p < 0.5, p, 1 - p)

  r <-
    dplyr::if_else(
      p <= 0 | p >= 1 | p > lower_limit & p < upper_limit,
      true = digits,
      false = -floor(log10(abs(p1))) + (p < lower_limit)
    )
  r <- dplyr::if_else(r > 1 & p < 1 & p > 0 & !is.na(p), r, digits)

  p2 <- stringr::str_replace(
    string = purrr::map2_chr(p, r, formatC, format = "f"),
    pattern = "^0\\.",
    replacement = "."
  )
  dplyr::if_else(is.na(p), NA_character_, p2)

}

#' Rounds proportions to significant digits both near 0 and 1,
#' then converts to percentiles
#'
#' @param p probability
#' @param digits rounding digits. Defaults to 2
#' @param remove_leading_zero Remove leading zero for small percentiles,
#' Defaults to TRUE
#' @param add_percent_character Append percent character. Defaults to FALSE
#' @return character vector
#' @export
#'
#' @examples
#' proportion2percentile(0.01111)

proportion2percentile <- function(p,
                                  digits = 2,
                                  remove_leading_zero = TRUE,
                                  add_percent_character = FALSE) {
  p1 <- as.character(100 * as.numeric(proportion_round(as.numeric(p),
                                                       digits = digits)))
  if (remove_leading_zero) {
    p1 <- gsub(pattern = "^0\\.",
               replacement = ".",
               x = p1)
  }

  if (add_percent_character) {
    p1 <- paste0(p1, "%")
  }

  p1

}

#' Plot the variables from the results of the cond_maha function.
#'
#' @export
#' @param x The results of the cond_maha function.
#' @param ... Arguments passed to print function
#' @param p_tail The proportion of the tail to shade
#' @param family Font family.
#' @param score_digits Number of digits to round scores.
#' @importFrom rlang .data
#' @return A ggplot2-object
plot.cond_maha <- function(x,
                           ...,
                           p_tail = 0,
                           family = "sans",
                           score_digits = ifelse(min(x$sigma) >= 10, 0, 2)) {
  if (length(unique(x$d_score$id)) > 1)
    stop("Can only plot one case at a time")

  break_width <- max(x$sigma)
  break_min <- min(x$mu - 10 * x$sigma)
  break_max <- max(x$mu + 10 * x$sigma)
  minor_break_width <- ifelse(break_width %% 3 == 0,
                              break_width / 3,
                              break_width / 2)
  major_breaks <- seq(break_min, break_max, break_width)
  minor_breaks <- seq(break_min, break_max, minor_break_width)

  label_independent <- paste0(
    "list(Independent~italic(d[M])==\"",
    formatC(x$dM_ind,
            digits = 2,
            format = "f"),
    "\",italic(p)==\"",
    proportion_round(x$dM_ind_p),
    "\")"
  )


  label_dependent <- paste0(
    "list(Dependent~italic(d[M])==\"",
    formatC(x$dM_dep,
            digits = 2,
            format = "f"),
    "\",italic(p)==\"",
    proportion_round(x$dM_dep_p),
    "\")"
  )

  x$d_score %>%
    dplyr::mutate(
      SD = ifelse(
        test = is.na(.data$zSEE),
        yes = .data$sigma,
        no = .data$zSEE * .data$sigma
      ),
      yhat = ifelse(is.na(.data$Predicted), .data$mu, .data$Predicted),
      id = factor(.data$id),
      Role = factor(
        .data$Role,
        levels = c("Independent", "Dependent"),
        labels = c(label_independent, label_dependent)
      )
    ) %>%
    ggplot2::ggplot(ggplot2::aes(
      x = .data$Variable,
      y = .data$Score,
      fill = .data$Role
    )) +
    ggplot2::facet_grid(
      cols = ggplot2::vars(!!quote(Role)),
      scales = "free",
      space = "free",
      labeller = ggplot2::label_parsed
    ) +
    ggnormalviolin::geom_normalviolin(
      mapping = ggplot2::aes(
        mu = .data$yhat,
        sigma = .data$SD,
        face_right = .data$Role == label_dependent,
        face_left = .data$Role != label_dependent
      ),
      p_tail = p_tail,
      fill = "gray68",
      width = 0.85
    ) +
    ggnormalviolin::geom_normalviolin(
      mapping = ggplot2::aes(
        mu = .data$mu,
        sigma = .data$sigma,
        face_right = .data$Role != label_dependent
      ),
      fill = "gray88",
      width = 0.85,
      p_tail = p_tail
    ) +
    ggplot2::geom_point(mapping = ggplot2::aes(color = .data$id)) +
    ggplot2::geom_text(
      mapping = ggplot2::aes(
        label = formatC(.data$Score, score_digits, format = "f"),
        color = .data$id
      ),
      vjust = -0.5,
      family = family
    ) +
    ggplot2::geom_text(
      mapping = ggplot2::aes(
        color = .data$id,
        label = dplyr::if_else(
          .data$Role == label_independent,
          "",
          paste0("italic(c*p)=='",
                 proportion_round(.data$cp),
                 "'")
        )
      ),
      vjust = 2.3,
      size = 3,
      parse = TRUE,
      family = family
    ) +
    ggplot2::geom_text(
      mapping = ggplot2::aes(
        color = .data$id,
        label = paste0("italic(phantom(c)*p)=='",
                       proportion_round(.data$p),
                       "'")
      ),
      vjust = 1.3,
      size = 3,
      parse = TRUE,
      family = family
    ) +
    ggplot2::scale_y_continuous("Scores",
                                breaks = major_breaks,
                                minor_breaks = minor_breaks) +
    ggplot2::scale_x_discrete(NULL,
                              expand = ggplot2::expansion(add = 1)) +
    ggplot2::labs(title = bquote(list(
      Conditional ~ Mahalanobis ~ Distance ~ (italic(d[CM])) == .(formatC(
        x$dCM,
        digits = 2,
        format = "f"
      )),
      italic(p) == .(proportion_round(x$dCM_p))
    )),
    caption = expression(
      list(
        italic(p) == "Population proportion",
        italic(c * p) == "Conditional proportion",
        italic(d[M]) == "Mahalanobis Distance"
      )
    )) +
    ggplot2::theme_light(base_family = family) +
    ggplot2::theme(legend.position = "none") +
    ggplot2::scale_color_grey()
}

#' Plot objects of the maha class (i.e, the results of the cond_maha function
#' using dependent variables only).
#'
#' @export
#' @param x The results of the cond_maha function.
#' @param ... Arguments passed to print function
#' @param p_tail Proportion in violin tail (defaults to 0).
#' @param family Font family.
#' @param score_digits Number of digits to round scores.
#' @importFrom rlang .data
#' @return A ggplot2-object
plot.maha <- function(x,
                      ...,
                      p_tail = 0,
                      family = "sans",
                      score_digits = ifelse(min(x$sigma) >= 10, 0, 2)) {
  if (nrow(x$d_dep) > 1)
    stop("Can only plot one case at a time")

  break_width <- max(x$sigma)
  break_min <- min(x$mu - 10 * x$sigma)
  break_max <- max(x$mu + 10 * x$sigma)
  minor_break_width <- ifelse(break_width %% 3 == 0,
                              break_width / 3,
                              break_width / 2)
  major_breaks <- seq(break_min, break_max, break_width)
  minor_breaks <- seq(break_min, break_max, minor_break_width)

  d_stats <-
    tibble::tibble(Variable = x$v_dep,
                   mu = x$mu,
                   sigma = x$sigma)

  x$d_dep %>%
    tibble::rownames_to_column("id") %>%
    dplyr::mutate(id = factor(.data$id)) %>%
    tidyr::pivot_longer(-"id",
                        names_to = "Variable",
                        values_to = "Score") %>%
    dplyr::left_join(d_stats, by = "Variable") %>%
    dplyr::mutate(
      z = (.data$Score - .data$mu) / .data$sigma,
      z_p = stats::pnorm(.data$z),
      in_tail = .data$z_p < p_tail / 2 |
        .data$z_p > (1 - p_tail / 2)
    ) %>%
    ggplot2::ggplot(ggplot2::aes(.data$Variable, .data$Score)) +
    ggnormalviolin::geom_normalviolin(
      data = d_stats,
      ggplot2::aes(
        x = .data$Variable,
        mu = .data$mu,
        sigma = .data$sigma
      ),
      inherit.aes = FALSE,
      fill = "gray65",
      p_tail = p_tail
    ) +
    ggplot2::geom_point(mapping = ggplot2::aes(shape = .data$in_tail)) +
    ggplot2::geom_text(
      mapping = ggplot2::aes(
        label = formatC(.data$Score, score_digits, format = "f"),
        color = .data$id
      ),
      vjust = -0.5,
      family = family
    ) +
    ggplot2::geom_text(
      mapping = ggplot2::aes(label = paste0(
        "italic(p)=='",
        proportion_round(.data$z_p),
        "'"
      )),
      vjust = 1.3,
      size = 3,
      parse = TRUE,
      family = family
    ) +
    ggplot2::theme_minimal(base_family = family) +
    ggplot2::theme(legend.position = "none") +
    ggplot2::scale_color_grey() +
    ggplot2::scale_shape_manual(values = c(16, 0)) +
    ggplot2::scale_y_continuous("Scores",
                                breaks = major_breaks,
                                minor_breaks = minor_breaks) +
    ggplot2::scale_x_discrete(NULL,
                              expand = ggplot2::expansion(add = 1)) +
    ggplot2::labs(title = bquote(list(
      Mahalanobis~Distance == .(formatC(x$dM_dep, 2, format = "f")),
      italic(p) == .(proportion_round(x$dM_dep_p))
    )),
    caption = expression(list(italic(p) == "Population proportion")))
}

Try the unusualprofile package in your browser

Any scripts or data that you put into this service are public.

unusualprofile documentation built on May 29, 2024, 9:37 a.m.