enrichr: Compute Protein Enrichment

Documented in compute_fxn enrich impute_fxn

#' Impute minimum values for AUC XICs for each accession in both test and control runs.
#'
#' @param pd_df A dataframe generated by `format_pd_file()`.
#' @return A dataframe with imputed values for AUC XICs.
impute_fxn <- function(pd_df){

  print(pd_df)

  pd_impute_df <- pd_df %>%
    dplyr::mutate(comb_ctrl = rowSums(pd_df[, grep("^ctrl\\d.*", names(pd_df))])) %>%
    dplyr::mutate(comb_exp = rowSums(pd_df[, grep("^exp\\d.*", names(pd_df))])) %>%
    dplyr::mutate(min_col = pmin(comb_ctrl, comb_exp, na.rm=TRUE))

  pd_impute_df[c("comb_ctrl","comb_exp")][is.na(pd_impute_df[c("comb_ctrl","comb_exp")])] = min(pd_impute_df$min_col, na.rm = TRUE)

  return(pd_impute_df)
}

#' Compute fold-change values for XICs and PSMs & Z-scores for PSMs for each accession in both test and control runs.
#'
#' @param pd_impute_df A dataframe generated by `impute_fxn()`.
#' @param one_sided Specify one-sided or two-sided statistical test; default = `FALSE` (two-sided).
#' @return A dataframe with fold-change values for XICs and PSMs & Z-scores for PSMs for each accession in both test and control runs.
compute_fxn <- function(pd_impute_df, one_sided = FALSE){

  pd_enriched <- pd_impute_df %>%
    # XIC fold change calcuations
    dplyr::mutate(mean_abundance = (comb_exp+comb_ctrl)/2) %>%
    dplyr::mutate(abundance_fc = case_when(comb_ctrl > comb_exp ~ -comb_ctrl/comb_exp,
                                    comb_ctrl < comb_exp ~ comb_exp/comb_ctrl)) %>%
    dplyr::mutate(abundance_log2fc = log2((comb_exp/comb_ctrl))) %>%
    # PSM fold change calculations
    dplyr::mutate(expPSMs_pseudo = exp_PSMs + 1,
           ctrlPSMs_pseudo = ctrl_PSMs + 1) %>%
    dplyr::mutate(F0ctrl_pseudo = ctrlPSMs_pseudo / sum(ctrlPSMs_pseudo)) %>%
    dplyr::mutate(F0exp_pseudo = expPSMs_pseudo / sum(expPSMs_pseudo)) %>%
    dplyr::mutate(PSM_fc = case_when(F0ctrl_pseudo > F0exp_pseudo ~ -F0ctrl_pseudo/F0exp_pseudo,
                              F0ctrl_pseudo < F0exp_pseudo ~ F0exp_pseudo/F0ctrl_pseudo)) %>%
    dplyr::mutate(PSM_log2fc = log2((F0exp_pseudo/F0ctrl_pseudo))) %>%
    # PSM z-score calculations
    dplyr::mutate(F0_ctrl = ctrl_PSMs / (sum(ctrl_PSMs))) %>%
    dplyr::mutate(F0_exp = exp_PSMs / (sum(exp_PSMs))) %>%
    dplyr::mutate(F1 = (ctrl_PSMs + exp_PSMs) / (sum(ctrl_PSMs) + sum(exp_PSMs))) %>%
    dplyr::mutate(PSM_zscore =
             (F0_exp - F0_ctrl)/
             sqrt((
               (F1 * (1-F1) / sum(exp_PSMs))) +
                 (F1 * (1-F1) / sum(exp_PSMs))))

  # calculate probabilities
  if(!one_sided){
    pd_enriched <- pd_enriched %>%
      dplyr::mutate(pval = pnorm(PSM_zscore)) %>%
      dplyr::mutate(fdr_bh = p.adjust(pval, method = "BH", n = length(pval)))
  } else {
    pd_enriched <- pd_enriched %>%
      dplyr::mutate(pval = pnorm(PSM_zscore, lower.tail = FALSE)) %>%
      dplyr::mutate(fdr_bh = p.adjust(pval, method = "BH", n = length(pval)))
  }

  # format final dataframe
  pd_enriched <- pd_enriched %>%
    dplyr::arrange(desc(PSM_zscore)) %>%
    dplyr::rename(total_PSMs = number_PSMs, ctrl_abundance = comb_ctrl,
           exp_abundance = comb_exp) %>%
    dplyr::select(accession, description, number_of_unique_peptides,
                  number_of_peptides, total_PSMs, ctrl_abundance,
                  exp_abundance, mean_abundance, abundance_fc,
                  abundance_log2fc, ctrl_PSMs, exp_PSMs,
                  PSM_fc, PSM_log2fc, PSM_zscore, pval, fdr_bh)

  print(pd_enriched)

  return(pd_enriched)
}

#' Annotate and compute fold-changes and Z-scores for differential proteomics results.
#'
#' Format PSM.txt and Proteins.txt files output by Proteome Discoverer 2.3, formatting PSM counts and AUC XIC values into one table, summing technical replicates, and computing fold-change and Z-scores for each detected protein in test and control runs. Optionally annotates results given a tab-separated annotation file. See README for file formats and example usage on the \href{https://github.com/rachaelcox/enrichr}{enrichr GitHub Repository}.
#'
#' @param exp_id An identifier for a given experiment that matches test and control files together (see meta file).
#' @param meta_file A tab-separate file with column headers `exp_name` (populated with values that correspond to exp_id variable), `type` (exp or ctrl), and `file_id` (determined by Proteome Discoverer, see README on Github).
#' @param psm_file PSM.txt tab-separated file output by Proteome Discoverer 2.3.
#' @param pd_file Proteins.txt tab-separated file output by Proteome Discoverer 2.3.
#' @param one_sided Specify one-sided or two-sided statistical test; default = `FALSE` (two-sided).
#' @param annot_file Optional tab-separated file containing a table with annotations for the queried proteome; must match accession format in FASTA given to Proteome Discoverer as a look up database.
#' @param outfile_name File name for output .csv; default = "output.csv".
#' @return A `.csv` with all computed information combined into one table.
#' @export
enrich <- function(exp_id, meta_file, psm_file, pd_file, one_sided = FALSE, annot_file, outfile_name = "output.csv"){

  if(str_sub(outfile_name, -3, -1) != "csv"){
    outfile_name = paste0(outfile_name, ".csv")
  }

  # load in and format Proteome Discoverer files, meta file
  psm_df <- format_psm_file(exp_id, meta_file, psm_file)

  pd_df <- format_pd_file(exp_id, meta_file, pd_file, psm_df)

  # impute missing values
  pd_impute_df <- impute_fxn(pd_df)

  # sum technical replicates, calculate fold change and log ratios
  pd_enriched <- compute_fxn(pd_impute_df, one_sided)

  # if no XIC values detected, remove these columns
  if(all(pd_enriched$mean_abundance == 0)){

    pd_enriched <- pd_enriched %>%
      dplyr::select(-matches(".*abundance.*"))
  }

  # join on annotations if specified
  if(!missing(annot_file)){

    annot_table <- readr::read_tsv(annot_file) %>%
      janitor::clean_names()

    pd_enriched <- pd_enriched %>%
      dplyr::left_join(annot_table, by=c("accession"))
  }

  readr::write_csv(pd_enriched, outfile_name)

  return(pd_enriched)

}