R/mddc_boxplot.R
In MDDC: Modified Detecting Deviating Cells Algorithm in Pharmacovigilance

Documented in mddc_boxplot

#' Modified Detecting Deviating Cells (MDDC) algorithm for adverse event signal
#' identification with boxplot method for cutoff selection.
#'
#' @description Modified Detecting Deviating Cells (MDDC) algorithm for adverse
#' event signal identification. Boxplot method is used for cutoff selection in
#' step 2 of the algorithm.
#' @param contin_table A data matrix of an \eqn{I} x \eqn{J} contingency table
#' with row (adverse event) and column (drug or vaccine) names. Please first
#' check the input contingency table using the function
#' \code{check_and_fix_contin_table()}.
#' @param col_specific_cutoff Logical. In the second step of the algorithm,
#' whether to apply boxplot method to the standardized Pearson residuals of
#' the entire table, or within each drug or vaccine column.
#' Default is \code{TRUE}, that is within each drug or vaccine column
#' (column specific cutoff). \code{FALSE} indicates
#' applying boxplot method on residuals of the entire table.
#' @param separate Logical. In the second step of the algorithm, whether to
#' separate the standardized Pearson residuals for the zero cells and non zero
#' cells and apply boxplot method separately or together.
#' Default is \code{TRUE}.
#' @param if_col_cor Logical. In the third step of the algorithm, whether to use
#' column (drug or vaccine) correlation or row (adverse event) correlation.
#' Default is \code{FALSE}, that is using the adverse event correlation.
#' \code{TRUE} indicates using drug or vaccine correlation.
#' @param cor_lim A numeric value between (0, 1). In the third step,
#' what correlation threshold should be used to select ``connected''
#' adverse events. Default is 0.8.
#' @param coef A numeric value or a list of numeric values. If a single numeric
#' value is provided, it will be applied uniformly across all columns of the
#' contingency table. If a list is provided, its length must match the number
#' of columns in the contingency table, and each value will be used as the
#' coefficient for the corresponding column.
#' @param num_cores Number of cores used to parallelize the MDDC
#' Boxplot algorithm. Default is 2.
#'
#' @return A list with the following components:
#' \itemize{
#' \item \code{boxplot_signal} returns the signals identified in the
#' second step.
#' 1 indicates signals, 0 for non signal.
#' \item \code{corr_signal_pval} returns the p values for each cell in the
#' contingency table in the fifth step, when the \eqn{r_{ij}} values are mapped
#' back to the standard normal distribution.
#' \item \code{corr_signal_adj_pval} returns the Benjamini-Hochberg adjusted
#' p values for each cell in the fifth step. We leave here an option for the
#' user to decide whether to use \code{corr_signal_pval} or
#' \code{corr_signal_adj_pval}, and what threshold for p values should be
#' used (for example, 0.05). Please see the example below.
#' }
#' @export
#'
#' @seealso \code{\link{find_optimal_coef}} for finding an optimal value of
#' \code{coef}.
#'
#' @examples
#' # using statin49 data set as an example
#' data(statin49)
#'
#' # apply the mddc_boxplot
#' boxplot_res <- mddc_boxplot(statin49)
#'
#' # signals identified in step 2 using boxplot method
#' signal_step2 <- boxplot_res$boxplot_signal
#'
#' # signals identified in step 5 by considering AE correlations
#' # In this example, cells with p values less than 0.05 are
#' # identified as signals
#' signal_step5 <- (boxplot_res$corr_signal_pval < 0.05) * 1
#'
#' @useDynLib MDDC
#' @importFrom grDevices boxplot.stats
#' @importFrom stats cor
#' @importFrom stats weighted.mean
#' @importFrom stats sd
#' @importFrom stats pnorm
#' @importFrom stats p.adjust
#' @importFrom stats lm
#' @importFrom foreach foreach
#' @importFrom foreach %dopar%
#' @importFrom doParallel registerDoParallel
#' @importFrom doParallel stopImplicitCluster

mddc_boxplot <- function(
    contin_table,
    col_specific_cutoff = TRUE,
    separate = TRUE,
    if_col_cor = FALSE,
    cor_lim = 0.8,
    coef = 1.5,
    num_cores = 2) {
  n_row <- nrow(contin_table)
  n_col <- ncol(contin_table)

  row_names <- row.names(contin_table)
  col_names <- colnames(contin_table)

  if (col_specific_cutoff) {
    if (is.numeric(coef)) {
      if (length(coef) == 1) {
        # If coef is a single numeric value, replicate it for each column
        coef <- rep(coef, n_col)
      } else if (length(coef) != n_col) {
        # If coef is a numeric vector but its length does not match n_col,
        # throw an error
        stop("Length of 'coef' does not match the number of columns (n_col).")
      }
    } else if (is.list(coef)) {
      if (length(coef) != n_col) {
        # If coef is a list but its length does not match n_col, throw an error
        stop("Length of 'coef' does not match the number of columns (n_col).")
      }
    } else {
      # If coef is neither a numeric nor a list, throw an error
      stop("'coef' must be a numeric value, numeric vector, or list.")
    }
  } else {
    if (length(coef) != 1) {
      stop("'coef' must be a numeric value when 'col_specific_cutoff is FALSE")
    }
  }

  boxplot_coef_list <- coef
  Z_ij_mat <- getZijMat(continTable = contin_table, na = FALSE)

  res_all <- as.vector(Z_ij_mat)
  res_nonzero <- as.vector(Z_ij_mat[which(contin_table != 0)])
  res_zero <- as.vector(Z_ij_mat[which(contin_table == 0)])

  if (col_specific_cutoff == TRUE) {
    if (separate == TRUE) {
      c_univ_drug <- unlist(lapply(seq_len(n_col), function(a) {
        boxplot.stats(Z_ij_mat[which(contin_table[
          ,
          a
        ] != 0), a], coef = boxplot_coef_list[a])$stats[[5]]
      }))
      zero_drug_cutoff <- unlist(lapply(seq_len(n_col), function(a) {
        boxplot.stats(Z_ij_mat[which(contin_table[
          ,
          a
        ] == 0), a])$stats[[1]]
      }))
    } else {
      c_univ_drug <- unlist(lapply(seq_len(n_col), function(a) {
        boxplot.stats(Z_ij_mat[, a], coef = boxplot_coef_list[a])$stats[[5]]
      }))
      zero_drug_cutoff <-
        apply(Z_ij_mat, 2, function(a) boxplot.stats(a)$stats[[1]])
    }
  } else {
    if (separate == TRUE) {
      c_univ_drug <-
        rep(boxplot.stats(res_nonzero, coef = coef)$stats[5], n_col)
      zero_drug_cutoff <- rep(boxplot.stats(res_zero)$stats[1], n_col)
    } else {
      c_univ_drug <- rep(boxplot.stats(res_all, coef = coef)$stats[5], n_col)
      zero_drug_cutoff <- rep(boxplot.stats(res_all)$stats[1], n_col)
    }
  }


  high_outlier <- matrix(unlist(lapply(seq_len(n_col), function(a) {
    (Z_ij_mat[
      ,
      a
    ] > c_univ_drug[a]) * 1
  })), ncol = n_col)
  colnames(high_outlier) <- colnames(contin_table)
  row.names(high_outlier) <- row.names(contin_table)

  low_outlier <- matrix(unlist(lapply(seq_len(n_col), function(a) {
    (Z_ij_mat[
      ,
      a
    ] < -c_univ_drug[a]) * 1
  })), ncol = n_col)
  colnames(low_outlier) <- colnames(contin_table)
  row.names(low_outlier) <- row.names(contin_table)

  zero_cell_out <- matrix(unlist(lapply(seq_len(n_col), function(a) {
    (Z_ij_mat[
      ,
      a
    ] < zero_drug_cutoff[a])
  })), ncol = n_col)
  zero_cell_outlier <- ((zero_cell_out) & (contin_table == 0)) * 1

  if_outlier_mat <- ((high_outlier + low_outlier + zero_cell_outlier) !=
    0) * 1

  U_ij_mat <- ifelse(1 - if_outlier_mat, Z_ij_mat, NA)

  cor_U <- pearsonCorWithNA(U_ij_mat, if_col_cor)

  if (if_col_cor == TRUE) {
    iter_over <- n_col
  } else {
    iter_over <- n_row
  }

  num_cores <- as.numeric(num_cores)
  registerDoParallel(num_cores)

  results <- foreach(i = seq_len(iter_over), .packages = c("stats")) %dopar% {
    # nocov start
    idx <- which(abs(cor_U[i, ]) >= cor_lim)
    cor_list_i <- idx[!idx %in% i]
    weight_list_i <- abs(cor_U[i, cor_list_i])

    if (length(cor_list_i) == 0) {
      fitted_value_i <- NA
    } else {
      if (if_col_cor == TRUE) {
        mat <- matrix(NA, n_row, length(cor_list_i))
        row.names(mat) <- row_names
        colnames(mat) <- col_names[cor_list_i]
      } else {
        mat <- matrix(NA, length(cor_list_i), n_col)
        row.names(mat) <- row_names[cor_list_i]
        colnames(mat) <- col_names
      }

      coef_list <- list()
      for (j in seq_len(length(cor_list_i))) {
        if (if_col_cor == TRUE) {
          coeff <- lm(U_ij_mat[, i] ~ U_ij_mat[, cor_list_i[j]])$coefficients
          fit_values <- U_ij_mat[, cor_list_i[j]] * coeff[2] +
            coeff[1]
          mat[which(row_names %in% names(fit_values)), j] <- fit_values
        } else {
          coeff <- lm(U_ij_mat[i, ] ~ U_ij_mat[cor_list_i[j], ])$coefficients
          fit_values <- U_ij_mat[cor_list_i[j], ] * coeff[2] +
            coeff[1]
          mat[j, which(col_names %in% names(fit_values))] <- fit_values
        }
        coef_list <- append(coef_list, list(coeff))
      }

      fitted_value_i <- mat
    }

    if (length(weight_list_i) > 0) {
      if (if_col_cor == TRUE) {
        Z_ij_hat_i <- apply(fitted_value_i, 1, function(a) {
          weighted.mean(x = a, w = weight_list_i, na.rm = TRUE)
        })
      } else {
        Z_ij_hat_i <- apply(fitted_value_i, 2, function(a) {
          weighted.mean(x = a, w = weight_list_i, na.rm = TRUE)
        })
      }
    } else {
      Z_ij_hat_i <- NA
    }

    list(Z_ij_hat = Z_ij_hat_i)
  } # nocov end

  stopImplicitCluster()

  Z_ij_hat_mat <- matrix(NA, n_row, n_col)

  for (i in seq_len(iter_over)) {
    result <- results[[i]]
    if (!is.null(result)) {
      if (if_col_cor) {
        Z_ij_hat_mat[, i] <- result$Z_ij_hat
      } else {
        Z_ij_hat_mat[i, ] <- result$Z_ij_hat
      }
    }
  }

  # Step 5: standardize the residuals within each drug column and flag outliers
  R_ij_mat <- Z_ij_mat - Z_ij_hat_mat

  r_ij_mat <- apply(R_ij_mat, 2, function(a) {
    (a - mean(a, na.rm = TRUE)) / sd(a, na.rm = TRUE)
  })

  r_pval <- 1 - pnorm(r_ij_mat)
  colnames(r_pval) <- col_names
  rownames(r_pval) <- row_names

  r_adj_pval <- matrix(p.adjust(r_pval, method = "BH"),
    nrow = n_row,
    ncol = n_col
  )
  colnames(r_adj_pval) <- col_names
  rownames(r_adj_pval) <- row_names

  rslt_list <- list(high_outlier, r_pval, r_adj_pval)
  names(rslt_list) <-
    c("boxplot_signal", "corr_signal_pval", "corr_signal_adj_pval")

  return(rslt_list)
}