virlink: VirScan cross-reactivity and correlation analysis

Documented in peptide_pairwise_correlation

#' Peptide pairwise correlation or co-occurrence
#'
#' perform correlation or co-occurrence analysis across all pairs of peptides.
#'     Pairwise comparison were implemented by \code{\link{to_pairwise}}.
#'
#' @param d a data frame or tibble containing z-scores or binary indicator of
#'     peptide enrichment. Columns represent peptides and rows represent samples.
#'     Column names are peptide ids and row names are sample ids.
#' @param analysis_type type of analysis to perform.
#'     Options: "correlation" or "cooccurrence".
#'     If select "correlation", correlation will be calculated on the z-scores, and
#'     \code{cor_method} will be used.
#'     If select "cooccurrence", z-score will be converted to a binary indicator
#'     of enrichment (1/0), and \code{occ_method} and \code{occ_test} will be used.
#'     Default: \code{correlation}
#' @param perform_test a logical indicator of whether statistical tests
#'     should be performed for each comparison.
#'     The test method is specified by \code{cor_method} and \code{occ_method},
#'     depending on \code{analysis_type}.
#'     Default: \code{FALSE}
#' @param cor_method correlation coefficient and test to be computed.
#'     Pass to \code{\link[stats]{cor}} and \code{\link[stats]{cor.test}}
#'     Options: "pearson", "spearman", "kendall".
#'     Default: "pearson"
#' @param occ_method cooccurrence coefficient to be computed.
#'     Pass to \code{\link{peptide_cooccurrence}}.
#'     Options: "jaccard", "phi", "prop", or any combination of them in a vector.
#'     Default: "jaccard"
#' @param occ_test cooccurrence tests to be computed.
#'     Pass to \code{\link{peptide_cooccurrence}}.
#'     Options: "fisher" (Fisher's exact test) or "chisq" (Pearson's Chi-squared test).
#'     Default: "fisher"
#' @param p_adjust_method method to adjust p values for multiple comparison.
#'     Pass to \code{\link[stats]{p.adjust}}.
#'     options: "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none".
#'     Default: "fdr"
#' @param hit_threshold the antibody reactivity threshold above which the epitope is considered as enriched.
#'     Default: 1
#' @param temporal_samples a logical indicator of whether there are multiple
#'     temporal samples from each individual. If \code{TRUE}, an additional data frame
#'     \code{si} is needed to indicate the attribution of samples to individuals.
#'     To account for the temporal structure, z-score difference between each two consecutive
#'     temporal points will be calculated and used for the correlation analysis.
#'     Although the function tolerates cooccurrence analysis with temporal samples,
#'     it is not preferred as changes of binary status are less informative.
#'     To perform cooccurrence analysis, only the change from 0 to 1 between two time points
#'     is considered as 1 in the difference data.
#'     Default: \code{FALSE}
#' @param si a data frame or tibble indicating the temporal structure of the samples.
#'     The first column of \code{si} is the sample id, which should match the row names of \code{d}.
#'     The second column of \code{si} is the individual id.
#'     All other columns will be ignored.
#' @param output_str format of the output to return.
#'     Options: "data.table", "data.frame", and "tibble".
#'     Default: "data.table"
#' @param mc number of cores to use for multi-thread parallel analysis. Pass to
#'     \code{\link[parallel]{mclapply}}.
#'     Deafult: 1
#'
#' @return a data frame/data table/tibble that contains correlation or co-occurence
#'     analysis results of all pairs of peptides
#'
#' @author Siyang Xia \email{sxia@@hsph.harvard.edu}
#' @seealso \code{\link{peptide_coocurrence}}, \code{\link{to_pairwise}}
#' @keywords peptide correlation cooccurrence
#'
#' @examples
#' data(peptide_z)
#' sample_info <- data.frame(id = row.names(peptide_z),
#'                           individual = rep(c("ind1", "ind2"), each = 6))
#'
#' ### 1. correlation analysis with default setting
#' output1 <- peptide_pairwise_correlation(d = peptide_z)
#' head(output1)
#'
#' ### 2. correlation with statistical tests
#' output2 <- peptide_pairwise_correlation(d = peptide_z,
#'                                         si = sample_info,
#'                                         analysis_type = "correlation",
#'                                         perform_test = TRUE,
#'                                         cor_method = "pearson")
#' head(output2)
#'
#' ### 3. correlation with temporal samples
#' output3 <- peptide_pairwise_correlation(d = peptide_z,
#'                                         si = sample_info,
#'                                         analysis_type = "correlation",
#'                                         perform_test = TRUE,
#'                                         cor_method = "pearson",
#'                                         temporal_samples = TRUE)
#' head(output3)
#'
#' ### 4. cooccurrence analysis with all three coefficients
#' output4 <- peptide_pairwise_correlation(d = peptide_z,
#'                                         si = sample_info,
#'                                         analysis_type = "cooccurrence",
#'                                         hit_threshold = 5,
#'                                         perform_test = FALSE,
#'                                         occ_method = c("jaccard", "phi", "prop"),
#'                                         temporal_samples = FALSE)
#' head(output4)
#'
#' ### 5. cooccurrence analysis with two coefficients and fisher's exact test
#' output5 <- peptide_pairwise_correlation(d = peptide_z,
#'                                         si = sample_info,
#'                                         analysis_type = "cooccurrence",
#'                                         hit_threshold = 10,
#'                                         perform_test = TRUE,
#'                                         occ_method = c("jaccard", "phi"),
#'                                         occ_test = "fisher",
#'                                         temporal_samples = FALSE)
#' head(output5)
#'
#' ### 6. cooccurrence analysis with only phi coefficient and return as a tibble
#' output6 <- peptide_pairwise_correlation(d = peptide_z,
#'                                         si = sample_info,
#'                                         analysis_type = "cooccurrence",
#'                                         hit_threshold = 5,
#'                                         perform_test = FALSE,
#'                                         occ_method = "phi",
#'                                         temporal_samples = FALSE,
#'                                         output_str = "tibble")
#' head(output6)
#'
#' @export
#' @import data.table
#'
peptide_pairwise_correlation <- function(d,
                                         analysis_type    = "correlation", # option: "correlation", "cooccurrence"
                                         perform_test     = FALSE,
                                         cor_method       = "pearson",     # option: "pearson", "spearman"
                                         occ_method       = "jaccard",     # option: "jaccard", "phi", "prop", or combination of them
                                         occ_test         = "fisher",      # option: "fisher", "chisq"
                                         p_adjust_method  = "fdr",
                                         hit_threshold    = 1,
                                         temporal_samples = FALSE,
                                         si               = NULL,
                                         output_str       = "data.table",  # options: "data.table", "data.frame", "tibble"
                                         mc               = 1
){


  # 1. basic messages about the analysis to perform -------------------------

  if(analysis_type == "correlation"){

    if(!(cor_method %in% c("pearson", "spearman", "kendall"))){
      stop('Invalid correlation method. Please select from "pearson", "spearman", "kendall".')
    }

  }else if(analysis_type == "cooccurrence"){

    if(!(all(occ_method %in% c("jaccard", "phi", "prop")))){
      stop('Invalid cooccurrence coefficient. Please select from "jaccard", "phi", "prop", or their combination as a vector.')
    }

    if(perform_test & (!(occ_test %in% c("fisher", "chisq")))){
      stop('Invalid cooccurrence test method Please select from "fisher" or "chisq".')
    }

  }else{

    stop('Invalid data type. Please select from "correlation" and "cooccurrence".')

  }

  # if(analysis_type == "correlation"){
  #   message("Correlation of z-scores between each pair of epitopes.")
  #
  #   if(cor_method == "pearson"){
  #     message("Correlation method: Pearson")
  #   }else if(cor_method == "spearman"){
  #     message("Correlation method: Spearman")
  #   }else if(cor_method == "kendall"){
  #     message("Correlation method: Kendall")
  #   }else{
  #     stop('Invalid correlation method. Please select from "pearson", "spearman", "kendall".')
  #   }
  #
  #   if(perform_test) message("Perform statistical test")
  #
  # }else if(analysis_type == "cooccurrence"){
  #   message("Co-occurrence (binary) analysis between each pair of epitopes.")
  #
  #   if(all(occ_method %in% c("jaccard", "phi", "prop"))){
  #     if("jaccard" %in% occ_method){
  #       message("Cooccurrence coefficient: jaccard index")
  #     }
  #     if("phi" %in% occ_method){
  #       message("Cooccurrence coefficient: phi correlation coefficient")
  #     }
  #     if("prop" %in% occ_method){
  #       message("Cooccurrence coefficient: mean proportion of co-occurrence given one peptide is present")
  #     }
  #   }else{
  #     stop('Invalid cooccurrence coefficient. Please select from "jaccard", "phi", "prop", or their combination as a vector.')
  #   }
  #
  #   if(perform_test){
  #     if(occ_test == "fisher"){
  #       message("Perform statistical test: Fisher's exact test")
  #     }else if(occ_test == "chisq"){
  #       message("Perform statistical test: Chi-squared test")
  #     }else{
  #       stop('Invalid cooccurrence test method Please select from "fisher" or "chisq".')
  #     }
  #   }
  #
  # }else{
  #   stop('Invalid data type. Please select from "correlation" and "cooccurrence".')
  # }
  #
  # if(temporal_samples){
  #   message("Consider multiple temporal samples of each individual")
  # }



  # 2. select the function to analyze each pair of epitopes -----------------

  if(analysis_type == "correlation" & perform_test == FALSE){

    fct <- function(v1, v2){
      suppressWarnings(cor(x = v1, y = v2, method = cor_method))
    }

  }else if(analysis_type == "correlation" & perform_test == TRUE){

    fct <- function(v1, v2){
      tr <- suppressWarnings(cor.test(x = v1, y = v2, method = cor_method))
      return(c(tr$estimate, tr$statistic, p = tr$p.value))
    }

  }else if(analysis_type == "cooccurrence"){

    fct <- function(v1, v2) peptide_cooccurrence(x = v1, y = v2,
                                                 coefficient = occ_method,
                                                 perform_test = perform_test,
                                                 test_method = occ_test)
  }



  # 3. pairwise analysis ----------------------------------------------------

  ### convert the z-score data to binary occurrence data if necessary
  if(analysis_type == "cooccurrence"){
    if(!all(unlist(d) %in% c(0, 1))) d <- as.data.frame((d > hit_threshold) * 1)
  }


  ### pairwise analysis
  if(temporal_samples == FALSE){  # do not consider temporal samples

    output <- to_pairwise(d = d,
                          f = fct,
                          unique_pair = TRUE,
                          same_comparison = FALSE,
                          mc = mc)

  }else{  # use temporal samples

    # check if "si" contains the correct information
    if(length(intersect(unlist(si[, 1]), row.names(d))) < 2){
      stop("Too few samples are included in the sample information data.")
    }

    # colnames of d and si
    dt_name <- names(d)

    # convert the original data and the sample information (si) to data tables
    d_t <- data.table::setDT(data.table::copy(d), keep.rownames = "id")
    s_i <- data.table::setDT(data.table::copy(si[, 1:2]))
    data.table::setnames(s_i, old = colnames(s_i)[1:2], new = c("id", "individual"))

    # combine them
    d_t <- d_t[s_i, on = "id"]
    data.table::setcolorder(d_t, neworder = c("id", "individual"))

    # calculate changes between every two consecutive time point for each individual and each epitope
    d_t[, (dt_name) := .SD - data.table::shift(.SD, give.names = TRUE), by = .(individual), .SDcols = dt_name]
    d_t <- na.omit(d_t)

    # check if "si" contains the correct information
    if(d_t[, .N] < 2){
      stop("Too few samples left after calculating changes between time points.")
    }else if(d_t[, .N] < 10){
      warning("Fewer than 10 samples left after calculating changes between time points.")
    }

    # remove sample information and keep only the data
    d_t[, c("id", "individual") := NULL]

    # when the original data is presence/absence (binary), calculating difference will produce -1
    if(analysis_type == "cooccurrence"){
      d_t[d_t < 0] <- 0
      warning("Cooccurrence analysis is not prefered for temporal data. Correlation analysis with z-scores is more informative.")
    }

    # pairwise analysis
    output <- to_pairwise(d = d_t,
                          f = fct,
                          unique_pair = TRUE,
                          same_comparison = FALSE,
                          mc = mc)

  }


  ### convert the output to data table
  if(!data.table::is.data.table(output)) output <- data.table::setDT(output)


  ### fix column name
  if("temp" %in% colnames(output)) data.table::setnames(output, old = "temp", new = "cor")


  ### adjust p-values if performed statistical analysis
  if(perform_test){
    output[, p_adjust := p.adjust(p, method = p_adjust_method)]
  }



  # 4. Return the results -------------------------------------------------

  if(output_str == "data.table"){
    return(output)
  }else if(output_str == "data.frame"){
    return(as.data.frame(output))
  }else if(output_str == "tibble"){
    return(tibble::as_tibble(output))
  }else{
    warning('Invalid output structure. Result returned as a "data.table".')
    return(output)
  }

}