R/peptable.R

Defines functions newColnames

Documented in newColnames

#' Make new column names
#'
#' According to the Sample_ID(s) in label_scheme.
#'
#' @param TMT_Set Integer; the index of TMT experiment.
#' @param df Data frame; log2FC data.
#' @param label_scheme Experiment summary.
#' @param pattern Regex pattern for capturing intensity and ratio columns.
#' @param group_ids A character vector; for example, c("heavy", "light") that
#'   can be found from column \code{pep_group} in \code{df}; \code{group_ids} is
#'   NULL if column \code{pep_group} is not in \code{df}.
#' @import dplyr purrr forcats
#' @importFrom magrittr %>% %T>% %$% %<>% not
newColnames <- function(TMT_Set = 1L, df, label_scheme, 
                        pattern = "[RI][0-9]{3}[NC]{0,1}", group_ids = NULL) 
{
  label_scheme_sub <- label_scheme %>%
    dplyr::filter(.data$TMT_Set == .env$TMT_Set)
  
  if (is.null(group_ids)) {
    df <- hsubColnames(group_ids, df, pattern, TMT_Set, label_scheme_sub)
  }
  else {
    for (id in group_ids) {
      df <- hsubColnames(id, df, pattern, TMT_Set, label_scheme_sub)
    }
  }

  invisible(df)
}


#' Helper of \link{newColnames}.
#'
#' @param id A character string; for example, "heavy" or "light" that can be
#'   found from column \code{pep_group} in \code{df}. The value of \code{id} is
#'   NULL if column \code{pep_group} is not in \code{df}.
#' @param label_scheme_sub Experiment summary at \code{TMT_Set}.
#' @inheritParams newColnames
hsubColnames <- function (id = NULL, df, pattern = "[RI][0-9]{3}[NC]{0,1}", 
                          TMT_Set = 1L, label_scheme_sub) 
{
  nms <- names(df)
  sids <- as.character(label_scheme_sub$Sample_ID)
  backref <- paste0("(", pattern, ")")
  
  if (is.null(id)) {
    cols <- grep(paste0(pattern, "_", TMT_Set, "$"), nms)
    pat <- paste0(paste(backref, TMT_Set, sep = "_"), "$")
    bares <- gsub(pat, "\\1", nms[cols])
    names(df)[cols] <- paste0(bares, " (", sids, ")")
  }
  else {
    cols <- grep(paste0(paste(pattern, TMT_Set, id, sep = "_"), "$"), nms)
    pat <- paste0(paste(backref, TMT_Set, id, sep = "_"), "$")
    bares <- gsub(pat, "\\1", nms[cols])
    names(df)[cols] <- paste0(bares, " (", sids, " [", id, "]", ")")
  }
  
  invisible(df)
}


#' Check the single presence of MaxQuant peptides[...].txt.
#' 
#' @param dat_dir The working directory.
#' @param label_scheme_full The metadata
#' @inheritParams normPSM
use_mq_peptable <- function (dat_dir, label_scheme_full, 
                             group_psm_by = "pep_seq") 
{
  # only works for "pep_seq"
  filelist <- if (group_psm_by == "pep_seq_mod") {
    list.files(path = file.path(dat_dir), 
               pattern = "^modificationSpecificPeptides.*\\.txt$")
  }
  else if (group_psm_by == "pep_seq") {
    list.files(path = file.path(dat_dir), pattern = "^peptides.*\\.txt$")
  }
  
  if (!length(filelist)) {
    return(FALSE)
  }
  else if (length(filelist) > 1L) {
    stop("Only single MaxQuant `peptides.txt` allowed.", call. = FALSE)
  } 
  else {
    if (!file.exists(file.path(dat_dir, filelist))) {
      stop("No MaxQuant LFQ file `peptides[...].txt` udner", dat_dir)
    } 
    else {
      message("MaxQuant file `", filelist, "` found.")
      return(TRUE)
    }
  }
}


#' Check the single presence of MSFragger peptide table
#' 
#' @param dat_dir The working directory.
#' @param label_scheme_full The metadata
#' @inheritParams normPSM
use_mf_peptable <- function (dat_dir, label_scheme_full, 
                             group_psm_by = "pep_seq_mod") 
{
  filelist <- if (group_psm_by == "pep_seq_mod") {
    list.files(path = file.path(dat_dir), 
               pattern = "^combined_modified_peptide.*\\.tsv$")
  }
  else if (group_psm_by == "pep_seq") {
    list.files(path = file.path(dat_dir), 
               pattern = "^combined_peptide.*\\.tsv$")
  }

  if (!length(filelist)) {
    return(FALSE)
  } 
  else if (length(filelist) > 1L) {
    stop("Only single MSFragger `combined_modified_peptide.tsv` allowed.")
  } 
  else {
    if (!file.exists(file.path(dat_dir, filelist))) {
      stop("No MSFragger `combined_modified_peptide.tsv` udner", dat_dir)
    } 
    else {
      message("MSFragger file `", filelist, "` found.")
      return(TRUE)
    }
  }
}


#' Check the single presence of MSFragger combined_protein[...].tsv
#' 
#' @param dat_dir The working directory.
use_mf_prottable <- function (dat_dir) 
{
  filelist <- list.files(path = file.path(dat_dir), 
                         pattern = "^combined_protein.*\\.tsv$")

  if (!length(filelist)) {
    return(FALSE)
  } 
  else if (length(filelist) > 1L) {
    stop("Only single MSFragger `combined_protein.tsv` allowed.")
  } 
  else {
    if (!file.exists(file.path(dat_dir, filelist))) {
      stop("No MSFragger `combined_protein.tsv` udner", dat_dir)
    } 
    else {
      message("MSFragger file `", filelist, "` found.")
      return(TRUE)
    }
  }
}


#' Check the single presence of MaxQuant combined_protein[...].tsv
#' 
#' @param dat_dir The working directory.
use_mq_prottable <- function (dat_dir) 
{
  filelist <- list.files(path = file.path(dat_dir), 
                         pattern = "^proteinGroups*\\.txt$")
  
  if (!length(filelist)) {
    return(FALSE)
  } 
  else if (length(filelist) > 1L) {
    stop("Only single MaxQuant `proteinGroups.txt` allowed.")
  } 
  else {
    if (!file.exists(file.path(dat_dir, filelist))) {
      stop("No MaxQuant `proteinGroups.txt` udner", dat_dir)
    } 
    else {
      message("MaxQuant file `", filelist, "` found.")
      return(TRUE)
    }
  }
}


#' Adds MSFragger intensity and ratio columns
#'
#' @param dat_dir The working directory.
#' @param use_maxlfq Logical; use MaxLFQ values or not.
#' @param use_spec_counts Logical; use spectrum counts or not.
#' @param prob_co The cut-off in protein probability.
fillMFprnnums <- function (dat_dir, use_maxlfq = TRUE, use_spec_counts = FALSE, 
                           prob_co = .99) 
{
  label_scheme <- load_ls_group(dat_dir, label_scheme)
  sids <- label_scheme$Sample_ID
  refChannels <- sids[label_scheme[["Reference"]]]

  df <- readr::read_tsv(file.path(dat_dir, "combined_protein.tsv"))
  nms <- names(df)

  if ("Protein Probability" %in% nms) {
    df <- df |> dplyr::filter(`Protein Probability` >= prob_co)
  }

  if (use_maxlfq && any(grepl(" MaxLFQ Intensity", nms))) {
    df <- df[, c("Protein ID", paste0(sids, " MaxLFQ Intensity"))]
  }
  else {
    df <- df[, c("Protein ID", paste0(sids, " Intensity"))]
  }
  
  df <- df |>
    dplyr::rename(prot_acc = `Protein ID`)

  names(df)[2:ncol(df)] <- paste0("I000 (", sids, ")")
  df <- df |> calc_lfq_log2r("I000", refChannels)
  nms <- names(df)
  df <- df |>
    dplyr::mutate_at(.vars = grep("log2_R000\\s", nms), 
                     function (x) replace(x, is.nan(x), NA_real_))

  df_n <- df[, 2:ncol(df), drop = FALSE]
  names(df_n) <- paste0("N_", names(df_n))
  
  df_z <- df_n[, grepl("^N_log2_R000 \\(", names(df_n))]
  names(df_z) <- gsub("^N_log2", "Z_log2", names(df_z))
  
  df <- dplyr::bind_cols(df, df_n, df_z)
  
  nms <- names(df)
  df <- dplyr::bind_cols(
    df |> dplyr::select("prot_acc"),
    df |> dplyr::select(grep("^I000 \\(", nms)),
    df |> dplyr::select(grep("^N_I000 \\(", nms)),
    df |> dplyr::select(grep("^log2_R000 \\(", nms)),
    df |> dplyr::select(grep("^N_log2_R000 \\(", nms)),
    df |> dplyr::select(grep("^Z_log2_R000 \\(", nms)))
}


#' Adds MSFragger intensity and ratio columns
#'
#' @param dat_dir The working directory.
#' @param use_spec_counts Logical; use spectrum counts or not.
#' @param use_maxlfq Logical; use MaxLFQ values or not.
fillMQprnnums <- function (dat_dir, use_spec_counts = FALSE, use_maxlfq = TRUE) 
{
  label_scheme <- load_ls_group(dat_dir, label_scheme)
  sids <- label_scheme$Sample_ID
  refChannels <- sids[label_scheme[["Reference"]]]

  file <- file.path(dat_dir, "proteinGroups.txt")
  
  if (file.exists(file)) {
    df <- readr::read_tsv(file)
  }
  else {
    stop("File not found: ", file)
  }

  if (use_maxlfq) {
    df <- df[, c("Majority protein IDs", paste0("LFQ intensity ", sids))]
  }
  else {
    df <- df[, c("Majority protein IDs", paste0("Intensity ", sids))]
  }
  
  df <- df |>
    dplyr::rename(prot_acc = `Majority protein IDs`) |>
    dplyr::mutate(prot_acc = gsub("\\;.*", "", prot_acc)) |>
    dplyr::filter(prot_acc != "")

  names(df)[2:ncol(df)] <- paste0("I000 (", sids, ")")
  df <- df |> calc_lfq_log2r("I000", refChannels)
  nms <- names(df)
  df <- df |>
    dplyr::mutate_at(.vars = grep("log2_R000\\s", nms), 
                     function (x) replace(x, is.nan(x), NA_real_))
  
  df_n <- df[, 2:ncol(df), drop = FALSE]
  names(df_n) <- paste0("N_", names(df_n))
  
  df_z <- df_n[, grepl("^N_log2_R000 \\(", names(df_n))]
  names(df_z) <- gsub("^N_log2", "Z_log2", names(df_z))
  
  df <- dplyr::bind_cols(df, df_n, df_z)
  
  nms <- names(df)
  df <- dplyr::bind_cols(
    df |> dplyr::select("prot_acc"),
    df |> dplyr::select(grep("^I000 \\(", nms)),
    df |> dplyr::select(grep("^N_I000 \\(", nms)),
    df |> dplyr::select(grep("^log2_R000 \\(", nms)),
    df |> dplyr::select(grep("^N_log2_R000 \\(", nms)),
    df |> dplyr::select(grep("^Z_log2_R000 \\(", nms)))
}


#' Compare sample IDS between MaxQuant results and expt_smry.xlsx 
#' 
#' @param df A data frame of MaxQuant peptides.txt or proteinGroups.txt.
#' @param label_scheme Experiment summary
check_mq_df <- function (df, label_scheme) 
{
  mq_nms <- names(df) %>% 
    .[grepl("^LFQ intensity ", .)] %>% 
    gsub("^LFQ intensity ", "", .)
  
  ls_nms <- label_scheme$Sample_ID %>% 
    .[!grepl("^Empty\\.[0-9]+", .)] %>% 
    unique() 
  
  if (!all(mq_nms %in% ls_nms)) {
    missing_nms <- mq_nms %>% .[! . %in% ls_nms]
    
    warning("Sample ID(s) in MaxQuant `peptides.txt` not found in metadata:\n", 
            purrr::reduce(missing_nms, paste, sep = ", "), 
            call. = FALSE)

    # the same ID occurs in "Identification type", "Experiment", 
    # "Intensity", "LFQ intensity"
    purrr::walk(missing_nms, ~ {
      df[, grep(paste0(" ", .x, "$"), names(df))] <- NULL
      df <<- df
    }, df)
  }
  
  invisible(df)
}


#' Helper of calculating MaxQuang log2FC
#' 
#' @param df A data frame.
#' @param type of of intensity data.
#' @param refChannels Reference channels.
calc_mq_log2r <- function (df, type, refChannels) {
  type <- paste0("^", type, " ")
  col_smpls <- grep(type, names(df))
  
  if (length(refChannels) > 0L) {
    col_refs <- paste0(type, refChannels) %>% 
      purrr::map_dbl(~ grep(.x, names(df)))
  } else {
    col_refs <- grep(type, names(df)) 
  }
  
  if (type == "^Intensity ") {
    prefix <- "log2_R000"
  } else if (type == "^LFQ intensity ") {
    prefix <- "N_log2_R000"
  } else {
    stop("`type` needs to be either `Intensity` or `LFQ intensity`.")
  }
  
  sweep(df[, col_smpls, drop = FALSE], 1,
        rowMeans(df[, col_refs, drop = FALSE], na.rm = TRUE), "/") %>%
    log2(.)  %>% 
    dplyr::mutate_all(~ replace(.x, is.infinite(.), NA)) %>% 
    `colnames<-`(gsub(paste0(type, "(.*)$"), 
                      paste0(prefix, " \\(", "\\1", "\\)"), 
                      names(.))) %>%
    cbind(df, .)
}


#' Extracts MaxQuant intensity values and calculates log2FC.
#' 
#' @param df A data frame of MaxQuant results.
extract_mq_ints <- function (df) 
{
  load(file.path(dat_dir, "label_scheme.rda"))
  sids <- label_scheme$Sample_ID
  refChannels <- sids[label_scheme[["Reference"]]]

  df <- df %>% 
    calc_mq_log2r("Intensity", refChannels) %>% 
    calc_mq_log2r("LFQ intensity", refChannels) %>% 
    dplyr::mutate_at(.vars = grep("log2_R000\\s", names(.)), 
                     ~ replace(.x, is.infinite(.), NA_real_)) 
  
  log2sd <- df %>% 
    dplyr::select(grep("Intensity\\s", names(.))) %>% 
    `names<-`(gsub("^Intensity\\s(.*)", 
                   "sd_log2_R000 \\(\\1\\)", 
                   names(.))) %>% 
    dplyr::mutate_all(~ replace(.x, !is.na(.x), NA_real_))
  
  df <- dplyr::bind_cols(df, log2sd)
  
  df <- df %>% 
    `names<-`(gsub("^Intensity (.*)$", 
                   paste0("I000 \\(", "\\1", "\\)"), 
                   names(.))) %>% 
    `names<-`(gsub("^LFQ intensity (.*)$", 
                   paste0("N_I000 \\(", "\\1", "\\)"), 
                   names(.)))
  
  if (!length(grep("log2_R000", names(df)))) {
    stop("No `log2_R000...` columns available.\n",
         "Probably inconsistent sample IDs between metadata and MaxQuant `peptides.txt`.", 
         call. = FALSE)
  }
  
  df <- dplyr::bind_cols(
    df %>% dplyr::select(-grep("[IR]{1}000 \\(", names(.))),
    df %>% dplyr::select(grep("^I000 \\(", names(.))),
    df %>% dplyr::select(grep("^N_I000 \\(", names(.))),
    df %>% dplyr::select(grep("^sd_log2_R000 \\(", names(.))),
    df %>% dplyr::select(grep("^log2_R000 \\(", names(.))),
    df %>% dplyr::select(grep("^N_log2_R000 \\(", names(.))),  
  )
}


#' Handling of MaxQuant peptide.txt
#'
#' Fill back intensity values that are not filled in LFQ msms.txt.
#' 
#' @param label_scheme Experiment summary.
#' @param omit_single_lfq Omit entries with single LFQ value or not.
#' @param use_spec_counts Not yet used. Logical; use spectrum counts or not.
#' @param use_maxlfq Logical; use MaxLFQ values or not.
pep_mq_lfq <- function(label_scheme, omit_single_lfq = FALSE, 
                       use_spec_counts = FALSE, use_maxlfq = TRUE) 
{
  dat_dir <- get_gl_dat_dir()
  group_psm_by <- match_call_arg(normPSM, group_psm_by)
  group_pep_by <- match_call_arg(normPSM, group_pep_by)

  filelist <- 
    list.files(path = file.path(dat_dir), pattern = "^peptides.*\\.txt$")
  
  if (!length(filelist)) {
    stop(paste("No MaxQuant LFQ file of `peptides[...].txt` under", 
               file.path(dat_dir), ".\n",
               "Make sure that the name of file starts with `peptides`."))
  }
  
  df <- read.csv(file.path(dat_dir, filelist), check.names = FALSE, 
                 header = TRUE, sep = "\t", comment.char = "#") |>
    dplyr::mutate(Proteins = gsub("\\.[0-9]*", "", Proteins), 
                  `Leading razor protein` = 
                    gsub("\\.[0-9]*", "", `Leading razor protein`))
  
  # Leading razor protein
  df <- local({
    if (!length(grep("^LFQ intensity |^Intensity ", names(df)))) {
      nas <- data.frame(rep(NA_real_, nrow(df)))
      sample_ids <- as.character(label_scheme$Sample_ID)
      
      df_int <- purrr::map(sample_ids, ~ {
        nas %>% `colnames<-`(paste("Intensity", .x))
      }) %>% 
        dplyr::bind_cols()
      
      df_int2 <- purrr::map(sample_ids, ~ {
        nas %>% `colnames<-`(paste("LFQ intensity", .x))
      }) %>% 
        dplyr::bind_cols()
      
      df <- dplyr::bind_cols(df, df_int, df_int2)
    }
    else if (!length(grep("^LFQ intensity ", names(df)))) {
      warning("Columns `LFQ intensity` not found in `", filelist, "`; ", 
              "columns `Intensity` used instead.")
      
      df_int <- df %>% 
        dplyr::select(grep("^Intensity ", names(.))) %>% 
        `names<-`(gsub("^Intensity ", "LFQ intensity ", names(.)))
      
      df <- dplyr::bind_cols(df, df_int)
    }
    else if (!length(grep("^Intensity ", names(df)))) {
      warning("Columns `Intensity` not found in `", filelist, "`; ", 
              "columns `LFQ intensity` used instead.")
      
      df_int <- df %>% 
        dplyr::select(grep("^LFQ intensity ", names(.))) %>% 
        `names<-`(gsub("^LFQ intensity ", "Intensity ", names(.)))
      
      df <- dplyr::bind_cols(df, df_int)
    }
    
    # MaxQuant peptide.txt may contains extra_sample_ids not in label_scheme
    # e.g. one may delete a Sample_ID from label_scheme, 
    #   the corresponding Sample_ID will be removed from compiled PSM tables.
    # However, the same needs to be done 
    #  when backfilling intensity data from peptide.txt.
    # Otherwise would cause columns of `pep_razor_int (extra_sample_ids)` etc. 
    #   in peptide table.
    # This will cause an error with Pep2Prn(method_pep_prn = lfq_...), 
    # which involves columns of `pep_razor_int (extra_sample_ids)` etc. 
    # for the identification of top_n.
    df <- local({
      extra_sample_ids <- names(df) %>% 
        .[grep("^Intensity\\s.*", .)] %>% 
        gsub("^Intensity\\s(.*)$", "\\1", .) %>% 
        .[! . %in% label_scheme$Sample_ID]
      
      if (length(extra_sample_ids)) {
        warning("\nSample IDs in `peptide.txt` not found", 
                " in `label_scheme.xlsx` and removed: \n", 
                purrr::reduce(extra_sample_ids, paste, sep = ", "), 
                "\n\n=======================================================================", 
                "\nWith the temporary data backfilling of `msms.txt` <- `peptide.txt`, ", 
                "\nit is currently not possible to tell the above mismatches is by either \n",
                "(a) intended sample removals in `label_scheme.xlsx` or \n",
                "(b) inadvertence in naming samples differently between `label_scheme.xlsx` ", 
                "and MaXquant searches.\n\n",
                
                "Fow now, identical sample IDs between `label_scheme.xlsx` ", 
                "and `peptide.txt` are required ", 
                "(only with the backfilling procedures).\n", 
                "With the temporary requirement of identical sample IDs being fulfilled, ", 
                "users may then perform sample exclusions via `label_scheme.xlsx`.",
                "\n=======================================================================\n", 
                call. = FALSE)
        
        purrr::walk(extra_sample_ids, ~ {
          smpl <- paste0(" ", .x, "$")
          df <<- df %>% dplyr::select(-grep(smpl, names(.)))
          
          message("Mismatched sample ID removed from `peptide.txt`: ", .x)
        })
        
        if (purrr::is_empty(grep("^LFQ intensity |^Intensity ", names(df)))) {
          stop("No samples matched to the IDs in `label_scheme.xlsx`.", 
               call. = FALSE)
        }
      }
      
      invisible(df)
    })
    
    df <- df %>% 
      dplyr::filter(not_allzero_rows(.[grep("^LFQ intensity ", names(.))])) 
  })

  if (omit_single_lfq) {
    df <- df %>% na_single_lfq("^LFQ intensity ")
  }

  ## handle inconsistency in MaxQuant column keys
  # (1) "Gene names" vs "Gene Names" etc.
  # (2) presence or absence of "Gene Names", "Protein Names" etc.
  if (!("Gene Names" %in% names(df) && "Protein Names" %in% names(df))) {
    stopifnot("Proteins" %in% names(df))
    
    df$"Gene names" <- df$"Gene Names" <- df$"Protein names" <- 
      df$"Protein Names" <- NULL
    
    fasta <- match_call_arg(normPSM, fasta)
    entrez <- match_call_arg(normPSM, entrez)
    
    df <- local({
      df <- df %>% 
        dplyr::mutate(prot_acc = gsub(";.*$", "", Proteins))
      
      tempdata <- df %>% 
        dplyr::select(prot_acc) %>% 
        dplyr::filter(!duplicated(prot_acc)) %>% 
        annotPrn(fasta, entrez) %>% 
        dplyr::select(prot_acc, gene, prot_desc) %>% 
        dplyr::rename(`Gene Names` = "gene", 
                      "Protein Names" = "prot_desc")
      
      df <- df %>% 
        dplyr::left_join(tempdata, by = "prot_acc") %>% 
        dplyr::select(-prot_acc) 
      
      col <- which(names(df) == "Proteins")
      
      dplyr::bind_cols(
        df[, 1:col],
        df[, c("Gene Names", "Protein Names")],
        df %>% 
          dplyr::select(-c("Gene Names", "Protein Names")) %>% 
          dplyr::select((col+1):ncol(.))
      )
    })
  }
  
  df <- df %>% check_mq_df(label_scheme) %>% 
    dplyr::rename(
      pep_seq = Sequence, 
      prot_acc = Proteins, 
    ) %>% 
    dplyr::mutate(gene = gsub("\\;.*", "", `Gene Names`)) %>% 
    dplyr::mutate(prot_acc = gsub("\\;.*", "", prot_acc))

  # `Modified sequence` not available in `peptides.txt`
  # group_psm_by = "pep_seq" only in `normPSM()`
  if (group_psm_by == "pep_seq_mod") {
    if ("Modified sequence" %in% names(df)) {
      use_lowercase_aa <- match_call_arg(normPSM, use_lowercase_aa)
      
      df <- df %>% 
        dplyr::rename(pep_seq_mod = `Modified sequence`) %>% 
        dplyr::select(which(names(.) == group_psm_by),
                      which(names(.) != group_psm_by)) %>% 
        add_maxquant_pepseqmod(use_lowercase_aa = FALSE)
    }
    else {
      stop("Column `Modified sequence` not found in MaxQuant `peptides.txt`.\n", 
           "Rerun `normPSM(group_psm_by = pep_seq, ...)`.\n")
    }
  }
  else {
    # df <- df %>% 
    #   dplyr::mutate(pep_seq = paste(`Amino acid before`, pep_seq, `Amino acid after`, 
    #                                 sep = ".")) 
  }

  df_vals <- df %>% 
    dplyr::select(group_psm_by, 
                  grep("^Intensity\\s|^LFQ\\sintensity\\s", names(.)))
  
  if (use_maxlfq) {
    df_vals[, grepl("^Intensity ", names(df_vals))] <- 
      df_vals[, grepl("^LFQ intensity ", names(df_vals)), drop = FALSE]
  }
  
  df_vals <- df_vals %>% 
    extract_mq_ints() %>%
    dplyr::select(-grep("sd_log2_R000", names(.)))
  
  df_sds <- df %>% 
    dplyr::left_join(df_vals, by = group_psm_by) %>% 
    calcSD_Splex(group_pep_by) %>% 
    `names<-`(gsub("^log2_R", "sd_log2_R", names(.)))
  
  df <- df %>% 
    dplyr::left_join(df_vals, by = group_psm_by) %>% 
    dplyr::left_join(df_sds, by = group_pep_by) %>% 
    na_zeroIntensity() 

  df %>% 
    dplyr::select(
      group_psm_by, 
      grep("^sd_log2_R000", names(.)), 
      grep("^log2_R000", names(.)), 
      grep("^N_log2_R000", names(.)), 
      grep("^I000", names(.)), 
      grep("^N_I000", names(.)), 
    )
}


#' Handling of MSFragger peptide table
#'
#' Fill back intensity values.
#' 
#' @param label_scheme Experiment summary.
#' @param use_spec_counts Not yet used. Logical; use spectrum counts or not.
#' @param use_maxlfq Logical; use MaxLFQ values or not.
#' @param omit_single_lfq Logical; omit entries of single LFQ measures or not.
#' @inheritParams normPSM
pep_mf_lfq <- function(label_scheme, omit_single_lfq = FALSE, use_maxlfq = TRUE, 
                       group_psm_by = "pep_seq_mod", use_spec_counts = FALSE, 
                       use_lowercase_aa = FALSE) 
{
  dat_dir <- get_gl_dat_dir()
  group_psm_by <- match_call_arg(normPSM, group_psm_by)
  group_pep_by <- match_call_arg(normPSM, group_pep_by)
  
  filelist <- if (group_psm_by == "pep_seq_mod") {
    list.files(path = file.path(dat_dir), 
               pattern = "^combined_modified_peptide.*\\.tsv$")
  }
  else if (group_psm_by == "pep_seq") {
    list.files(path = file.path(dat_dir), 
               pattern = "^combined_peptide.*\\.tsv$")
  }
  else {
    stop("Unhandled condition.")
  }

  n_files <- length(filelist)
  if (!n_files) {
    stop(paste("No MSFragger LFQ file of `combined_peptide[...].tsv` under", 
               file.path(dat_dir), ".\n",
               "Make sure that the name of file starts with `combined_peptide`."))
  }
  
  if (n_files > 1L) {
    stop(paste("Only allowed one MSFragger LFQ file of `combined_peptide[...].tsv` under", 
               file.path(dat_dir), ".\n"))
  }
  
  df <- readr::read_tsv(file.path(dat_dir, filelist))
  pat  <- " Intensity"
  pat2 <- " MaxLFQ Intensity"
  col_nms <- names(df)
  col_ints  <- grep(pat, col_nms)
  col_ints2 <- grep(pat2, col_nms)
  col_ints  <- col_ints[!col_ints %in% col_ints2]

  # columns of " MaxLFQ Intensity" can be missing
  if (!length(col_ints2)) {
    df[, gsub(" Intensity$", pat2, col_nms[col_ints])] <- df[, col_ints]
    col_nms <- names(df)
    col_ints2 <- grep(pat2, col_nms)
  }

  sids <- gsub(pat2, "", col_nms[col_ints2])
  names(df)[col_ints2] <- paste0("N_I000 (", sids, ")")
  names(df)[col_ints]  <- paste0("I000 (", sids, ")")
  
  if (use_maxlfq) {
    df[, col_ints] <- df[col_ints2]
  }

  df <- df |>
    dplyr::select(which(col_nms %in% c("Peptide Sequence", "Modified Sequence", 
                                       "Prev AA", "Next AA", "Start", "End")), 
                  col_ints, col_ints2) |>
    dplyr::rename(pep_seq = `Peptide Sequence`, 
                  pep_seq_mod = `Modified Sequence`, 
                  pep_res_before = `Prev AA`, 
                  pep_res_after = `Next AA`, 
                  pep_start = Start, 
                  pep_end = End, )

  if (group_psm_by == "pep_seq_mod") {
    df <- df |> 
      dplyr::select(-"pep_seq") |>
      reloc_col_before(group_psm_by, "pep_res_after") |>
      reloc_col_before("pep_res_before", group_psm_by) |>
      add_msfragger_pepseqmod(use_lowercase_aa)
  }
  else if (group_psm_by == "pep_seq") {
    df <- df |> dplyr::select(-"pep_seq_mod")
  }

  if (omit_single_lfq) {
    df <- na_single_lfq(df, "^N_I000 ")
  }

  df <- df |>
    dplyr::select(-c("pep_res_before", "pep_res_after", "pep_start", "pep_end"))
  
  df_empty <- matrix(ncol = length(sids), nrow = nrow(df)) |>
    data.frame() |>
    tibble::tibble()
  
  # special handling
  if (FALSE && !n_ints2) {
    df_n_ints <- df[, grepl("^I000 ", names(df))]
    names(df_n_ints) <- paste0("N_I000 (", sids, ")")
    df <- dplyr::bind_cols(df, df_n_ints)
  }
  
  df_n_log2r <- df_log2r <- df_sd <- df_empty
  names(df_sd) <- paste0("sd_log2_R000 (", sids, ")")
  names(df_log2r) <- paste0("log2_R000 (", sids, ")")
  names(df_n_log2r) <- paste0("N_log2_R000 (", sids, ")")
  
  df <- dplyr::bind_cols(df, df_sd, df_log2r, df_n_log2r)
  
  ## log2 ratios
  refChannels <- label_scheme$Sample_ID[label_scheme[["Reference"]]]

  df <- df |>
    calc_lfq_log2r("I000", refChannels) |>
    calc_lfq_log2r("N_I000", refChannels) |>
    # NaN for log2 if zero-intensity under reference channels: log(0/0)
    dplyr::mutate_if(is.numeric, function (x) replace(x, is.nan(x), NA_real_))
  
  df <- df |>
    dplyr::mutate_at(grep("log2_R000\\s", names(df)), 
                     function (x) replace(x, is.infinite(x), NA_real_)) 
  
  df
}


#' Helper: calculates LFQ log2FC.
#'
#' For rows with single intensity values: original intensities kept but
#' \code{log2_R000} coerced from 0 to NA. This will keep single-value rows away
#' from data alignment that are based on \code{log2_R000}. Otherwise, the
#' alignment may be trapped to the spike at \code{log2_R000 = 0}.
#'
#' @param df A data frame.
#' @param type The type of intensity data.
#' @param refChannels The reference channels.
calc_lfq_log2r <- function (df, type, refChannels) 
{
  if (!type %in% c("I000", "N_I000")) {
    stop("The value of `type` need to be `I000` or `N_I000`.")
  }

  new_type <- paste0("^", type, " ")
  prefix <- gsub("I000", "log2_R000", new_type)
  no_hat <- gsub("\\^", "", prefix)
  
  df <- df[, !grepl(prefix, names(df))]
  col_smpls <- grep(new_type, names(df))
  
  if (!length(col_smpls)) {
    stop("No sample columns start with ", type, ".")
  }

  col_refs <- if (length(refChannels)) 
    which(names(df) %in% paste0(type, " (", refChannels, ")"))
  else 
    col_smpls
  
  dfx <- df[, col_smpls, drop = FALSE]
  rows_sv <- (rowSums(!is.na(dfx)) == 1L)
  
  ans <- sweep(dfx, 1,
        rowMeans(df[, col_refs, drop = FALSE], na.rm = TRUE), "/") %>%
    log2(.)  %>% 
    dplyr::mutate_all(~ replace(.x, is.infinite(.), NA_real_)) %>% 
    `colnames<-`(gsub(paste0(new_type, "(.*)$"), 
                      paste0(no_hat, "\\1"), 
                      names(.)))
  
  ans[rows_sv, ] <- NA_real_
  
  cbind(df, ans)
}


#' Trivializes data rows with single LFQ intensity
#' 
#' @param df A data frame.
#' @param pattern The pattern of intensity fields.
na_single_lfq <- function (df, pattern = "^I000 ") 
{
  cols <- grep(pattern, names(df))
  df_lfq <- df[, cols, drop = FALSE]

  # slightly more flexible than (rowSums(!is.na(df_lfq)) > 1L)
  # in case that upstream NA's are replaced 0's
  # (e.g. MaxQuant's timsTOF PSMs are all NA values)
  not_single_zero <- (rowSums(df_lfq > 0, na.rm = TRUE) > 1L) 
  not_single_zero[!not_single_zero] <- NA # NA logical
  
  df_lfq[] <- lapply(df_lfq, `*`, not_single_zero)
  df[, cols] <- df_lfq
  
  invisible(df)
}


#' Calculates log2FC of peptides based on LFQ intensity.
#'
#' With LFQ, values of \code{log2_R000 (...)} and \code{N_log2_R000 (...)} in
#' \code{df_num} are not yet filled after \link{spreadPepNums}. This utility
#' calculates the ratio using the values of LFQ intensity.
#'
#' For SILAC, intensity of heave, light etc. have been spread into multiple
#' columns and thus the log2Ratios can be calculated like regular LFQ.
#'
#' @param df A data frame.
#' @param label_scheme Meta data.
#' @param tmt_plex The multiplexicty of TMT.
#' @param min_int Minimal Y intensity to replace 0 values.
#' @param imp_refs Logical; impute missing references or not.
#' @param imp_vals Logical; impute missing values or not.
#' @param sp_centers A vector of log2FC centers for each species.
calcLFQPepNums <- function (df, label_scheme, tmt_plex = 0L, min_int = 1E3, 
                            imp_refs = FALSE, imp_vals = FALSE, 
                            sp_centers = NULL) 
{
  rows        <- label_scheme[["Reference"]]
  refChannels <- label_scheme$Sample_ID[rows]
  nms         <- names(df)
  cols_both   <- nms[grepl("^I000 \\(", nms)]

  if (length(refChannels)) {
    cols_ref  <- paste0("I000 (", refChannels, ")")
    cols_smpl <- cols_both[!cols_both %in% cols_ref]
  }
  else {
    cols_smpl <- cols_ref <- cols_both
  }

  # (0) no possible to have all NA rows for both references and samples
  if (FALSE) {
    row_sums <- df[, c(cols_ref, cols_smpl), drop = FALSE] |>
      is.na() |>
      rowSums()
    oks <- row_sums < length(c(cols_ref, cols_smpl))
    df <- df[oks, ]
    rm(list = c("row_sums", "oks"))
  }
  
  # (1) handle missing reference and obtain species centers
  df <- impRefNA(df = df, cols_ref = cols_ref, cols_smpl = cols_smpl, 
                 refChannels = refChannels, group_psm_by = "pep_seq_mod", 
                 imp_refs = imp_refs, sp_centers = sp_centers)
  
  if (is.null(sp_centers)) {
    sp_centers <- attr(df, "sp_centers", exact = TRUE)
  }

  # 2. With references
  if (imp_vals && length(cols_smpl) > 1L) {
    df <- local({
      srs <- rowSums(is.na(df[, cols_smpl, drop = FALSE]))
      empties <- srs == length(cols_smpl)
      df0 <- df[empties, ]
      df1 <- df[!empties, ]
      
      nms  <- names(df)
      cols <- which(nms %in% cols_smpl)
      class(df1) <- "data.frame" # remove `tbl_df` and `tbl`
      df1[, cols] <- impPepNA(df1[, cols, drop = FALSE])
      
      df1[, paste0("N_", cols_smpl)] <- df1[, cols_smpl, drop = FALSE]
      df <- dplyr::bind_rows(df0, df1)
    })
  }

  df <- df |>
    # dplyr::mutate_at(grep("I[0-9]{3}[NC]{0,1}", names(df)), 
    #                  function (x) { x[x < min_int] <- min_int; x }) |>
    calc_lfq_log2r("I000", refChannels) |>
    calc_lfq_log2r("N_I000", refChannels) |>
    # NaN for log2 if zero-intensity under reference channels: log(0/0)
    dplyr::mutate_if(is.numeric, function (x) replace(x, is.nan(x), NA_real_))

  df <- df |>
    dplyr::mutate_at(grep("log2_R000\\s", names(df)), 
                     function (x) replace(x, is.infinite(x), NA_real_)) 

  if (!length(grep("log2_R000", names(df)))) {
    stop("No `log2_R000...` columns available.\n",
         "Probably inconsistent sample IDs between metadata and ", 
         "MaxQuant `peptides.txt`.")
  }
  
  nms <- names(df)
  df <- dplyr::bind_cols(
    df |> dplyr::select(-grep("[IRC]{1}000 \\(", nms)),
    df |> dplyr::select(grep("^I000 \\(", nms)),
    df |> dplyr::select(grep("^N_I000 \\(", nms)),
    df |> dplyr::select(grep("^C000 \\(", nms)),
    df |> dplyr::select(grep("^sd_log2_R000 \\(", nms)),
    df |> dplyr::select(grep("^log2_R000 \\(", nms)),
    df |> dplyr::select(grep("^N_log2_R000 \\(", nms)))
  
  attr(df, "sp_centers") <- sp_centers
  df
}


#' Impute NA for references
#' 
#' @param df A data frame.
#' @param cols_ref The colume keys of reference IDs.
#' @param cols_smpl The column keys of sample IDs.
#' @param refChannels The reference channels.
#' @param group_psm_by A key for PSM grouping.
#' @param imp_refs Logical; impute missing references or not.
#' @param add_errs Logical; add normal errors or not.
#' @param sp_centers A vector of log2FC centers for each species.
impRefNA <- function (df, cols_ref, cols_smpl, refChannels, 
                      group_psm_by = "pep_seq_mod", sp_centers = NULL,
                      imp_refs = FALSE, add_errs = FALSE)
{
  if (!"species" %in% names(df)) {
    stop("Column `species` not found.")
  }
  
  # df$species[is.na(df$species)] <- ".other"
  
  empties <- rowSums(is.na(df[, cols_ref, drop = FALSE])) == length(cols_ref)
  if (!length(empties)) { return(df) }
  df0 <- df[empties, ]
  df1 <- df[!empties, ]

  # finds species centers for each sample (no NA species)
  nms_smpl <- gsub("^I([0-9]{3})", paste0("log2_R", "\\1"), cols_smpl)
  log2rs   <- calc_lfq_log2r(df = df1, type = "I000", refChannels = refChannels)
  log2rs   <- split(log2rs[, nms_smpl, drop = FALSE], log2rs$species)
  mvs      <- find_species_centers(df1, refChannels, cols_smpl)
  
  if (!imp_refs) {
    attr(df, "sp_centers") <- mvs
    return(df)
  }
  
  for (i in seq_along(sps <- names(mvs))) {
    spi  <- sps[[i]] # cannot be NA
    dfi  <- df0[rows <- df0$species == spi, ]
    rms  <- rowMeans(dfi[, cols_smpl, drop = FALSE], na.rm = TRUE)
    
    ysi <- if (add_errs) {
      rms / (2^rnorm(length(rms), mvs[[i]], .01))
    }
    else {
      rms / (2^mvs[[i]])
    }
    
    ysi[is.nan(ysi)] <- NA_real_
    
    for (col in cols_ref) {
      dfi[[col]] <- ysi
    }
    
    df0[rows, ] <- dfi
  }

  df0[, paste0("N_", cols_ref)] <- df0[, cols_ref, drop = FALSE]
  
  df <- dplyr::bind_rows(df0, df1)
  attr(df, "sp_centers") <- mvs
  
  df
}


#' Finds the data center of log2FC by species and samples
#' 
#' @param df A data frame contains intensity values, peptides and species.
#' @param refChannels Reference channels.
#' @param cols_smpl The columns of samples for finding the data centers.
#' @return Lists of log2FC centers by species. Each list entry contains values
#'   corresponding to each sample.
find_species_centers <- function (df, refChannels, cols_smpl)
{
  if (!nrow(df)) {
    return(0.0)
  }
  
  nms_smpl <- gsub("^I([0-9]{3})", paste0("log2_R", "\\1"), cols_smpl)
  log2rs   <- calc_lfq_log2r(df = df, type = "I000", refChannels = refChannels)

  if (all(is.na(sps <- log2rs$species))) {
    mvs <- mean(sapply(log2rs[, nms_smpl, drop = FALSE], median, na.rm = TRUE), 
                na.rm = TRUE)
  }
  else {
    log2rs <- split(log2rs[, nms_smpl, drop = FALSE], sps)
    
    mvs <- sapply(log2rs, function (xs) {
      mean(sapply(xs, median, na.rm = TRUE), na.rm = TRUE)
    })
  }
  
  mvs
}


#' Finds the data center of log2FC by species and samples
#'
#' Input matrix: columns, peptides; rows, samples.
#'
#' @param ymat An intensity matrix.
#' @param are_refs A logical vector indicating reference status.
#' @param are_smpls A logical vector indicating reference status.
#' @param sps The species values corresponding the column peptides.
#' @param new_na_species A replace value for NA species.
#' @param na_species_by_pri Logical; if TRUE, use the center of the primary
#'   species for NA species.
find_species_centers2 <- function (ymat, are_refs = NULL, are_smpls = NULL, sps, 
                                   new_na_species = ".other", 
                                   na_species_by_pri = FALSE)
{
  if (!length(are_smpls)) {
    if (all(are_refs)) {
      are_smpls <- are_refs
    }
    else if (any(are_refs)) {
      are_smpls <- !are_refs
    } 
    else {
      are_refs  <- rep_len(TRUE, length(are_refs))
      are_smpls <- are_refs
    }
  }

  rmat <- sweep(
    ymat[are_smpls, , drop = FALSE], 2, 
    colMeans(ymat[are_refs, , drop = FALSE], na.rm = TRUE), "/")
  rmat[is.nan(rmat)] <- NA_real_
  rmat <- log2(rmat)
  
  sps[is.na(sps)] <- new_na_species
  
  mvs <- sapply(split(as.data.frame(t(rmat)), sps), function (mat) {
    mean(sapply(mat, median, na.rm = TRUE), na.rm = TRUE)
  })
  
  if (na_species_by_pri) {
    tbl  <- table(sps)
    imax <- which.max(tbl)
    mvs[[new_na_species]] <- mvs[[imax]]
    # tbl  <- table(sps, useNA = "always")
    # names(tbl)[which(is.na(names(tbl)))] <- ""
    # imax <- which.max(tbl)
    # pri  <- names(tbl)[imax] # can be "", and not subset by mvs[pri]
    # mvs[names(mvs) == ""] <- unname(mvs[names(mvs) == pri])
  }

  # names(mvs)[names(mvs) == ".other"] <- ""
  
  return(mvs)
  
  
  if (FALSE) {
    tbl  <- table(sps)
    imax <- which.max(tbl)
    vmax <- mvs[[imax]]
    
    # assume NA species replaced by ".other"
    ymats <- split(as.data.frame(t(ymat)), sps)
    rmati <- rmats[[imax]]
    ymati <- log2(ymats[[imax]])
    
    
    
    
    
    ymat0 <- ymat[are_smpls, nas, drop = FALSE]
    ymat1 <- ymat[are_smpls, oks, drop = FALSE]
    ymats <- split(as.data.frame(t(ymat1)), sps[oks])
    
    # or use the contour from the primary species and translate the results 
    #  to other species with center offsets
    
    

    for (i in seq_along(ymats)) { # by species
      rmati <- rmats[[i]]
      ymati <- log2(ymats[[i]])
      
      mvs[[i]] - vmax # distance of translating 
      
      for (j in 1:ncol(ymati)) { # by samples
        xvs <- rmati[, j]
        yvs <- ymati[, j]
        okx <- !(is.na(xvs) | is.na(yvs))
        xvs <- xvs[okx]
        yvs <- yvs[okx]

        if ((nys <- length(yvs)) <= 1000L) {
          lev <- .05
        }
        else if (nys <= 5000L) {
          lev <- .02
        }
        else if (nys <= 10000L) {
          lev <- .02
        }
        else {
          lev <- .10
        }
        
        conts <- 
          grDevices::contourLines(MASS::kde2d(xvs, yvs, n = 50), levels = lev)
        cont1 <- conts[[which.max(lengths(lapply(conts, `[[`, "x")))]]
        
        # outliers
        okps <- point_in_polygon(xvs, yvs, cont1$x, cont1$y)
        xos  <- xvs[!okps]
        yos  <- yvs[!okps]
        
        if (FALSE) {
          ggplot() + 
            geom_point(data = data.frame(x = xvs, y = yvs), 
                       mapping = aes(y = y, x = x), size = .1) + 
            geom_polygon(data = data.frame(x = cont1$x, y = cont1$y), 
                         mapping = aes(y = y, x = x), 
                         size = .1, fill = "red", alpha = .5)
          
          ggplot() +
            geom_histogram(data = data.frame(value = xvs), aes(x = value, y = ..count..),
                           color = "white", alpha = .5, binwidth = .05, size = .1)
        }
      }
    }
  }
  
  mvs <- c(mvs1, mv0)
}


#' Find points within a contour
#' 
#' @param x A vector of X values.
#' @param y A vector of Y values.
#' @param xp A vector of polygon X values.
#' @param yp A vector of polygon Y values.
point_in_polygon <- function(x, y, xp, yp)
{
  inside <- logical(length(x))
  
  for (i in seq_along(x)) {
    angle_sum <- 0
    
    for (j in seq_along(xp)) {
      j_next <- if (j == length(xp)) 1L else j + 1L
      
      angle_sum <- angle_sum + atan2(
        (yp[j] - y[i]) * (xp[j_next] - xp[j]) - (xp[j] - x[i]) * (yp[j_next] - yp[j]),
        (xp[j] - x[i]) * (xp[j_next] - x[i]) + (yp[j] - y[i]) * (yp[j_next] - y[i])
      )
    }
    
    inside[i] <- abs(angle_sum) > pi
  }
  
  inside
}


#' Calculates \code{pep_tot_int}, \code{pep_razor_int} and
#' \code{pep_unique_int} in LFQ
#' 
#' @param df A data frame.
#' @param filelist A list of individual peptide tables.
#' @inheritParams normPSM 
calcLFQPepInts <- function (df, filelist, group_psm_by) 
{
  dat_dir <- get_gl_dat_dir()
  label_scheme <- load_ls_group(dat_dir, label_scheme, prefer_group = FALSE)
  
  cols_grp  <- c(group_psm_by, "TMT_Set")
  cols_grp2 <- c("TMT_Set")
  nms <- names(df)
  
  lapply(cols_grp, 
         function (x) if (!x %in% nms) stop("Column `", x, "` not found."))
  
  group_ids <- if ("pep_group" %in% nms) unique(df$pep_group) else NULL
  is_mulgrps <- length(group_ids) >= 2L
  
  if (is_mulgrps) {
    cols_grp <- c(cols_grp, "pep_group")
    cols_grp2 <- c(cols_grp2, "pep_group")
  }

  df_num <- 
    df[, c(cols_grp, "pep_tot_int", "pep_unique_int", "pep_razor_int")] |>
    dplyr::group_by_at(cols_grp) |>
    dplyr::arrange(TMT_Set)
  nms <- names(df_num)
  df_num <- df_num |>
    tidyr::gather(grep("^pep_.*_int$", nms), key = ID, value = value) |>
    tidyr::unite(ID, ID, cols_grp2, sep = "_")
  
  ## pep_tot_int_1_light, pep_unique_int_1_light, pep_razor_int_1_light
  type_levels <- c("pep_tot_int", "pep_unique_int", "pep_razor_int")
  tmt_levels <- NULL
  pat <- "pep_.*_int"
  fct_cols <- c("type", "set", "group")
  
  lapply(c("type_levels"), function (x) {
    if (is.factor(df_num[[x]])) stop("`", x, "` cannot be factor.")
  })
  
  df_num <- df_num %>% 
    dplyr::mutate(type = gsub(paste0("^(", pat, ")", "_.*"), "\\1", ID), 
                  set = gsub(paste0("^", pat, "_([0-9]+).*"), "\\1", ID), ) |>
    dplyr::mutate(type = factor(type, levels = type_levels), 
                  set = as.integer(set), )
  
  lapply(c("type", "set"), function (x) {
    if (any(is.na(df_num[[x]]))) stop("Unexpected NA under column `", x, "`.")
  })

  df_num <- df_num |>
    dplyr::mutate(group = gsub(paste0("^", pat, "_[0-9]+_(.*)$"), "\\1", ID), 
                  group = factor(group, levels = group_ids)) |>
    dplyr::arrange_at(fct_cols)
  df_num <- df_num[, !names(df_num) %in% fct_cols]
  id_levels <- unique(df_num$ID)
  
  df_num <- df_num %>%
    dplyr::group_by(pep_seq_mod, ID) |>
    dplyr::summarise_at("value", sum, na.rm = TRUE) |>
    dplyr::mutate(ID = factor(ID, levels = id_levels)) |>
    tidyr::pivot_wider(names_from = ID, values_from = value)
  
  set_indexes <- gsub("^.*TMTset(\\d+).*", "\\1", filelist) %>% 
    unique() %>% 
    as.integer() %>% 
    sort()
  
  for (set_idx in set_indexes) {
    df_num <- newColnames(TMT_Set = set_idx, 
                          df = df_num, 
                          label_scheme = label_scheme, 
                          pattern = "pep_.*_int", 
                          group_ids = group_ids)
  }
  
  if (is_mulgrps) {
    label_scheme_group <- 
      load_ls_group(dat_dir, label_scheme, prefer_group = TRUE)
    
    df_num <- pad_grp_samples(
      df = df_num, 
      sids = as.character(label_scheme_group$Sample_ID), 
      tmt_levels = tmt_levels, 
      type_levels = type_levels)
  }
  
  df_num %>% 
    dplyr::select(!!rlang::sym(group_psm_by), 
                  grep("^pep_.*_int ", names(.))) %>% 
    dplyr::ungroup() %>% 
    dplyr::arrange(!!rlang::sym(group_psm_by))
}


#' Spreads peptide numbers.
#'
#' Spreads fields of numeric values: sd_log2_R, log2_R, log2_R, I, N_I by TMT
#' sets.
#'
#' Also works for LFQ as each sample corresponds to a TMT set.
#'
#' For single SILAC sample, the values of log2Ratios spreads into
#' \emph{MULTIPLE} columns of heavy, light etc. Despite, log2Ratios remains NA,
#' just like regular single-sample LFQ. The log2Ratios will be later calculated
#' with \link{calcLFQPepNums} that are based on intensity values.
#'
#' @param df A data frame of peptide table, ordered by ascending \code{TMT_Set}
#'   and \code{LCMS_Injection}.
#' @param dat_dir The working directory.
#' @param basenames Names of peptide table files.
#' @param ok_lfq Logical Capable of LFQ or not.
#' @param tmt_plex The multiplicity of TMT; zero for LFQ.
#' @param engine The search engine.
#' @param group_psm_by Group PSMs by.
#' @param group_pep_by Groups peptides by.
spreadPepNums <- function (df, dat_dir = NULL, basenames, tmt_plex = 0L, 
                           ok_lfq = FALSE, group_psm_by = "pep_seq_mod", 
                           group_pep_by = "gene", engine = "mz") 
{
  if (is.null(dat_dir)) {
    dat_dir <- get_gl_dat_dir()
  }

  label_scheme <- 
    load_ls_group(dat_dir, label_scheme, prefer_group = FALSE)
  label_scheme_full <- 
    load_ls_group(dat_dir, label_scheme_full, prefer_group = FALSE)
  
  tmt_levels <- TMT_levels(tmt_plex)
  tmt_levels <- if (tmt_plex) gsub("^TMT-", "", tmt_levels) else "000"
  cols_grp   <- c(group_psm_by, "TMT_Set")
  cols_grp2  <- cols_grp[cols_grp != group_psm_by]

  nms <- names(df)
  lapply(cols_grp, function (x) if (!x %in% nms) stop("Column missing: ", x))
  group_ids  <- if ("pep_group" %in% nms) unique(df$pep_group) else NULL
  is_mulgrps <- length(group_ids) >= 2L
  
  if (is_mulgrps) {
    cols_grp  <- c(cols_grp, "pep_group")
    cols_grp2 <- c(cols_grp2, "pep_group")
    
    label_scheme_group <- rep_ls_groups(group_ids)
    label_scheme_full_group <- rep_ls_groups(group_ids)
    
    save(label_scheme_group, 
         file = file.path(dat_dir, "label_scheme_group.rda"))
    save(label_scheme_full_group, 
         file = file.path(dat_dir, "label_scheme_full_group.rda"))
    
    write_excel_wb(label_scheme_full_group, "Setup", dat_dir, 
                   "label_scheme_full_group.xlsx")
    write_excel_wb(label_scheme_group, "Setup", dat_dir, 
                   "label_scheme_group.xlsx")
  }
  else {
    # save(label_scheme, file = file.path(dat_dir, "label_scheme_group.rda"))
    # save(label_scheme_full, file = file.path(dat_dir, "label_scheme_group.rda"))
  }
  
  # Numeric fields
  pat_sd_log2_R <- "^sd_log2_R[0-9]{3}[NC]{0,1}"
  pat_log2_R    <- "^log2_R[0-9]{3}[NC]{0,1}"
  pat_N_log2_R  <- "^N_log2_R[0-9]{3}[NC]{0,1}"
  pat_I         <- "^I[0-9]{3}[NC]{0,1}"
  pat_N_I       <- "^N_I[0-9]{3}[NC]{0,1}"
  type_levels   <- c("I", "N_I", "sd_log2_R", "log2_R", "N_log2_R")
  
  ## (1) aggregation different LCMS injections under the same TMT_Set
  nms <- names(df)
  if (tmt_plex) {
    df_num <- df |>
      dplyr::select(cols_grp, 
                    grep(pat_sd_log2_R, nms), 
                    grep(pat_log2_R,    nms), 
                    grep(pat_N_log2_R,  nms), 
                    grep(pat_I,         nms), 
                    grep(pat_N_I,       nms)) |>
      dplyr::group_by_at(cols_grp) |>
      aggrNumLCMS(group_psm_by, label_scheme_full) |>
      dplyr::mutate(TMT_Set = as.integer(TMT_Set)) |>
      dplyr::arrange(TMT_Set)
  }
  else {
    if (ok_lfq) { # Mzion-LFQ
      df_mbr <- lapply(basenames, function (file) {
        fi <- paste0("MBRpeps_", file)
        df <- readr::read_tsv(file.path(dat_dir, "Peptide/cache", fi))
        
        if (!"TMT_Set" %in% names(df)) {
          df$TMT_Set <- as.integer(gsub("^.*TMTset(\\d+).*", "\\1", file))
        }
        # df$LCMS_Inj <- as.integer(gsub("^.*TMTset\\d+_LCMSinj(\\d+)_.*$", "\\1", file))
        df
      }) |>
        dplyr::bind_rows() |>
        dplyr::rename(I000 = pep_tot_int)
    }
    else {
      df_mbr <- df
    }
    
    ## LFQ: MaxQuant, MSFragger, Mzion; or spec counts: all engines
    df_num <- df_mbr |> # already ordered by TMT_Set and LCMS_Injection
      dplyr::select(c(cols_grp, "I000")) |>
      dplyr::group_by_at(cols_grp) |>
      aggrNumLCMS(group_psm_by, label_scheme_full) |>
      dplyr::ungroup() |>
      dplyr::mutate(TMT_Set = as.integer(TMT_Set), 
                    N_I000 = I000, sd_log2_R000 = NA_real_, 
                    log2_R000 = NA_real_, N_log2_R000 = NA_real_) |>
      dplyr::arrange(TMT_Set)
  }
  
  ## (2) add spectrum counts at protein levels to the peptide table
  if (ok_lfq) {
    # add column `gene` or `prot_acc`
    rows   <- match(df_num[[group_psm_by]], df[[group_psm_by]])
    df_num <- dplyr::bind_cols(
      !!group_pep_by := df[rows, ][[group_pep_by]], df_num)
    if (is.factor(df_num[[group_pep_by]])) {
      df_num[[group_pep_by]] <- as.character(df_num[[group_pep_by]])
    }

    df_num <- add_spec_counts(
      dat_dir = dat_dir, df_num = df_num, group_psm_by = group_psm_by, 
      group_pep_by = group_pep_by, type = "protein")
    df_num[[group_pep_by]] <- NULL
    
    type_levels <- c(type_levels, "C")
  }
  
  # if (anyDuplicated(df_num$pep_seq_mod)) {
  #   stop("Developer: duplicated pep_seq_mod detected.")
  # }
  
  ## (3) outputs: Format: ..._log2_R126_1_[base], ..._I126_1_[base]
  nms <- names(df_num)
  # set_indexes <- sort(unique(df_num$TMT_Set))
  df_num <- df_num |>
    tidyr::gather(grep("[RIC]{1}[0-9]{3}[NC]{0,1}", nms), 
                  key = ID, value = value) |>
    dplyr::arrange_at(cols_grp2) |>
    tidyr::unite(ID, c(ID, cols_grp2))

  # define the levels of TMT channels;
  # otherwise, the order of channels will flip between N(itrogen) and C(arbon)
  lapply(c("type_levels", "tmt_levels"), function (x) {
    if (is.factor(df_num[[x]])) stop("`", x, "` cannot be factor.")
  })
  
  df_num <- df_num |>
    dplyr::mutate(
      type    = gsub("^(.*[RIC]{1})[0-9]{3}[NC]{0,1}_.*", "\\1", ID), 
      set     = gsub("^.*[RIC]{1}[0-9]{3}[NC]{0,1}_([0-9]+).*", "\\1", ID), 
      channel = gsub("^.*[RIC]{1}([0-9]{3}[NC]{0,1})_.*", "\\1", ID)) |>
    dplyr::mutate(
      type    = factor(type, levels = type_levels), set = as.integer(set), 
      channel = factor(channel, levels = tmt_levels))
  
  lapply(c("type", "channel", "set"), function (x) {
    if (any(is.na(df_num[[x]]))) stop("Unexpected NA under column `", x, "`.")
  })
  
  fct_cols <- c("type", "set", "channel", "group")
  
  df_num  <- df_num |>
    dplyr::mutate(
      group = gsub("^.*[RIC]{1}[0-9]{3}[NC]{0,1}_[0-9]+_(.*)$", "\\1", ID), 
      group = factor(group, levels = group_ids)) |>
    dplyr::arrange_at(fct_cols) 
  df_num <- df_num[, !names(df_num) %in% fct_cols]
  
  id_levels <- unique(df_num$ID)
  
  df_num <- df_num |>
    dplyr::mutate(ID = factor(ID, levels = id_levels)) |>
    tidyr::pivot_wider(names_from = ID, values_from = value)
  
  # remove values at different LCMS_Injections
  set_indexes <- gsub("^.*TMTset(\\d+).*", "\\1", basenames) |>
    unique() |>
    as.integer() |>
    sort()

  if (is.factor(group_ids)) {
    stop("`group_ids` cannot be factor.")
  }
  
  for (set_idx in set_indexes) {
    df_num <- newColnames(TMT_Set = set_idx, 
                          df = df_num, 
                          label_scheme = label_scheme, 
                          pattern = "[RIC]{1}[0-9]{3}[NC]{0,1}", 
                          group_ids = group_ids)
  }
  
  nms <- names(df_num)
  df_num <- df_num %>% 
    dplyr::select(!!rlang::sym(group_psm_by), 
                  grep("[RIC][0-9]{3}[NC]{0,1}", nms)) |>
    dplyr::ungroup() |>
    dplyr::arrange(!!rlang::sym(group_psm_by))
  
  if (is_mulgrps) {
    df_num <- pad_grp_samples(
      df = df_num, 
      sids = as.character(label_scheme_group$Sample_ID), 
      tmt_levels = tmt_levels, 
      type_levels = type_levels)
  }
  
  df_num
}


#' Spectrum counts at peptide levels
#'
#' @param dat_dir The working directory.
#' @param type The data type.
#' @param label_scheme_full The full label scheme.
#' @param filelist A list of file names of TMTset1_LCMSinj1_Peptide_N.txt etc
#'   with prepending path.
#' @param basenames The base names of file names in \code{filelist}.
#' @param set_idxes The indexes of \code{TMT_Set}'s corresponding to
#'   \code{basenames}.
#' @param injn_idxes The indexes of \code{LCMS_Inj}'s corresponding to
#'   \code{basenames}.
#' @param pep_col The output column name for peptide summary statistics.
#' @param prot_col The output column name for protein summary statistics.
#' @inheritParams normPSM
countSpecs <- function (dat_dir = NULL, label_scheme_full = NULL, type = "PSM",
                        filelist = NULL, basenames = NULL, 
                        set_idxes = 1L, injn_idxes = 1L, 
                        group_psm_by = "pep_seq_mod", group_pep_by = "prot_acc", 
                        pep_col = "pep_n_specs", prot_col = "prot_n_specs")
{
  message("Summarizing spectrum count data.")
  
  if (is.null(dat_dir)) {
    dat_dir <- get_gl_dat_dir()
  }
  
  if (is.null(label_scheme_full)) {
    load(file = file.path(dat_dir, "label_scheme_full.rda"))
  }
  
  if (!dir.exists(out_path <- file.path(dat_dir, "PSM", "cache"))) {
    dir.create(out_path)
  }
  
  if (is.null(filelist)) {
    filelist <- list.files(
      path = file.path(dat_dir, type), 
      pattern = (pat <- paste0("TMTset[0-9]+_LCMSinj[0-9]+_", type, "_N\\.txt$")), 
      full.names = TRUE)
    
    if (!(n_files <- length(filelist))) {
      stop("No individual PSM tables available.")
    }
    
    basenames  <- basename(filelist)
    set_idxes  <- 
      as.integer(gsub("TMTset(\\d+)_.*", "\\1", basenames))
    injn_idxes <- 
      as.integer(gsub("^TMTset\\d+_LCMSinj(\\d+)_.*\\.txt$", "\\1", basenames))
    
    ord <- order(set_idxes, injn_idxes)
    filelist   <- filelist[ord]
    basenames  <- basenames[ord]
    set_idxes  <- set_idxes[ord]
    injn_idxes <- injn_idxes[ord]
  }
  
  df <- mapply(function (file, set_idx, injn_idx) {
    df <- readr::read_tsv(file, col_types = get_col_types(), 
                    show_col_types = FALSE) |>
      suppressWarnings() 
    
    nms <- names(df)
    if (!"TMT_Set" %in% nms) { df$TMT_Set <- set_idx }
    if (!"LCMS_Injection" %in% nms) { df$LCMS_Injection <- injn_idx }
    df
  }, filelist, set_idxes, injn_idxes, USE.NAMES = FALSE, SIMPLIFY = FALSE)
  
  df <- do.call(rbind, df)
    # |> dplyr::bind_rows()
    
  
  prots     <- unique(df[, c(group_pep_by, group_psm_by)])
  cols_grp  <- c(group_psm_by, "TMT_Set")
  cols_grp2 <- c(cols_grp, "LCMS_Injection")
  
  ## (1) sum by each TMTSet[i]_LCMS_Injection[j]
  ans_peps <- df[, cols_grp2, drop = FALSE] |>
    dplyr::group_by_at(cols_grp2) |>
    dplyr::summarise(!!pep_col := dplyr::n()) |>
    dplyr::ungroup()
  
  rows     <- match(ans_peps[[group_psm_by]], prots[[group_psm_by]])
  ans_peps <- dplyr::bind_cols(
    !!group_pep_by := prots[rows, ][[group_pep_by]], ans_peps)
  readr::write_tsv(ans_peps, file.path(out_path, "pep_spec_counts_indiv.tsv"))
  
  ans_prots <- ans_peps[, -which(names(ans_peps) == group_psm_by)] |>
    dplyr::group_by_at(c(group_pep_by, "TMT_Set", "LCMS_Injection")) |>
    dplyr::summarise_at(pep_col, sum, na.rm = TRUE) |>
    dplyr::rename(!!prot_col := !!pep_col)
  readr::write_tsv(ans_prots, file.path(out_path, "prot_spec_counts_indiv.tsv"))
  ans_prots_ind <- ans_prots
  
  if (!any(n_LCMS(label_scheme_full)$n_LCMS > 1L)) {
    ans_peps <- ans_peps[, -which(names(ans_peps) == "LCMS_Injection")]
    readr::write_tsv(ans_peps, file.path(out_path, "pep_spec_counts_byset.tsv"))
    
    ans_prots <- ans_prots[, -which(names(ans_prots) == "LCMS_Injection")]
    readr::write_tsv(ans_prots, file.path(out_path, "prot_spec_counts_byset.tsv"))
    
    return(ans_prots_ind)
  }
  
  ## (2) average different LCMS_Injection's at the same TMTSet[i]
  ans_peps <- 
    ans_peps[, -which(names(ans_peps) %in% c("LCMS_Injection", group_pep_by))] |>
    dplyr::group_by_at(cols_grp) |>
    aggrNumLCMS(key = group_psm_by, label_scheme_full = label_scheme_full, 
                cols = pep_col)
  rows     <- match(ans_peps[[group_psm_by]], prots[[group_psm_by]])
  ans_peps <- dplyr::bind_cols(
    !!group_pep_by := prots[rows, ][[group_pep_by]], ans_peps)
  readr::write_tsv(ans_peps, file.path(out_path, "pep_spec_counts_byset.tsv"))
  
  # better than the `sum` from aggregated `ans_peps`: partially overlapped 
  # peptide IDs -> median(c(1, NA), na.rm = TRUE) -> 1, not 0.5.
  ans_prots <- ans_prots[, -which(names(ans_prots) == "LCMS_Injection")] |>
    dplyr::group_by_at(c(group_pep_by, "TMT_Set")) |>
    aggrNumLCMS(key = group_pep_by, label_scheme_full = label_scheme_full, 
                cols = prot_col)
  readr::write_tsv(ans_prots, file.path(out_path, "prot_spec_counts_byset.tsv"))
  
  ans_prots_ind
}


#' Add spectrum counts
#'
#' DO not aggregate from peptides to proteins. Missing values in peptide tables
#' can inflate protein spectrum counts: median(c(1, NA)) -> 1 not 0.5.
#' 
#' @param dat_dir A working directory.
#' @param df_num A data frame.
#' @param group_psm_by Group PSM data by.
#' @param group_pep_by Group peptide data by.
#' @param type The type of data.
add_spec_counts <- function (dat_dir, df_num, group_psm_by = "pep_seq_mod", 
                             group_pep_by = "gene", type = "protein")
{
  ty <- switch(type, 
         protein = "prot",
         peptide = "pep", 
         "prot")
  
  file_sc <- 
    file.path(dat_dir, "PSM", "cache", paste0(ty, "_spec_counts_byset.tsv"))
  df_sc   <- readr::read_tsv(file_sc)
  nms_sc  <- names(df_sc)
  
  if (type == "protein") {
    key_sc <- group_pep_by
    col_sc <- "prot_n_specs"
  }
  else if (type == "peptide") {
    if ("pep_seq_mod" %in% nms_sc) { # MaxQuant no pep_seq_mod
      key_sc <- "pep_seq_mod"
    }
    else if ("pep_seq" %in% nms_sc) {
      key_sc <- "pep_seq"
    }
    else {
      stop("Peptide key not found in ", file_sc)
    }
    
    col_sc <- "pep_n_specs"
  }
  
  if (!key_sc %in% (nms <- names(df_num))) {
    if (key_sc == "pep_seq_mod" && group_psm_by == "pep_seq_modz") { # Mzion
      df_num[[key_sc]] <- gsub("@[0-9]+", "", df_num[[group_psm_by]])
    }
    else {
      stop("Column not found: ", key_sc)
    }
  }
  
  # need "@"; otherwise: ADK1 at set 1 -> ADK11
  df_sc$uid <- paste0(df_sc[[key_sc]], "@", df_sc[["TMT_Set"]])
  df_sc[[key_sc]] <- df_sc[["TMT_Set"]] <- NULL
  df_num$uid <- paste0(df_num[[key_sc]], "@", df_num[["TMT_Set"]])
  df_num <- df_num |> dplyr::left_join(df_sc, by = "uid")

  nms <- names(df_num)
  nms[nms == col_sc] <- "C000"
  names(df_num) <- nms
  df_num$uid <- NULL
  
  df_num
}


#' Aggregates data from the same TMT_Set at different LCMS_Injection.
#' 
#' @param df A data frame.
#' @param key A column key.
#' @param label_scheme_full Metadata.
#' @param cols Columns for data aggregation.
aggrNumLCMS <- function (df, key = "pep_seq_mod", label_scheme_full, 
                         cols = NULL)
{
  tbl_lcms <- n_LCMS(label_scheme_full)

  if (uni_lcms <- !any(tbl_lcms$n_LCMS > 1L)) {
    attr(df, "uni_lcms") <- uni_lcms
    return(df)
  }
  
  tb_n <- dplyr::filter(tbl_lcms, n_LCMS > 1L)
  rows <- df$TMT_Set %in% tb_n$TMT_Set
  df_n <- df[rows, ]
  df_1 <- df[!rows, ]
  
  nms_n <- names(df_n)
  if (is.null(cols)) {
    icols <- grep("log2_R[0-9]{3}[NC]{0,1}|I[0-9]{3}[NC]{0,1}", nms_n)
    len   <- length(icols)
  }
  else {
    icols <- match(cols, nms_n)
    icols <- icols[!is.na(icols)]
    len   <- length(icols)
    
    if (!len) {
      stop("Not all column found for data summary.")
    }
  }
  
  col1 <- which(nms_n == key)
  col2 <- which(nms_n == "TMT_Set")
  col3 <- which(nms_n == "pep_group")
  cols123 <- c(col1, col2, col3)
  
  df_n <- df_n |> dplyr::group_by_at(cols123)
  
  # one column of data for aggregation
  if (len == 1L) {
    df <- df_n[, unique(c(cols123, icols))] |> 
      dplyr::summarise_all(median, na.rm = TRUE) |>
      dplyr::ungroup() |>
      dplyr::bind_rows(df_1) |>
      dplyr::arrange(TMT_Set)
    attr(df, "uni_lcms") <- uni_lcms
    
    return(df)
  }
  
  # slower; later to summary only over the original intensity columns
  if (FALSE) {
    df_n <- df_n |> 
      dplyr::group_by_at(cols123) |>
      dplyr::group_split() # 40094
    
    nrows <- unlist(lapply(df_n, nrow))
    df_n0 <- df_n[nrows == 1L]
    df_n0 <- dplyr::bind_rows(df_n0)
    df_n1 <- df_n[nrows > 1L]
    
    df_n1_keys <- lapply(df_n1, function (x) x[1, -icols]) |>
      dplyr::bind_rows()
    
    data  <- lapply(df_n1, function (x) 
      dplyr::summarise_at(x, .vars = icols, median, na.rm = TRUE))
    data  <- dplyr::bind_rows(data)
    df_n1 <- dplyr::bind_cols(df_n1_keys, data)
    
    if (FALSE) {
      n_cores <- max(min(parallel::detectCores() - 1L), 4L)
      cl <- parallel::makeCluster(getOption("cl.cores", n_cores))
      
      data <- parallel::clusterApply(
        cl, parallel::clusterSplit(cl, df_n1), 
        function (dfs) {
          lapply(dfs, function (df) 
            dplyr::summarise_at(df, .vars = icols, median, na.rm = TRUE)) |>
            dplyr::bind_rows()
        }
      )
      parallel::stopCluster(cl)
      
      data <- dplyr::bind_rows(data)
      df_n1 <- dplyr::bind_cols(df_n1_keys, data)
      df_n <- dplyr::bind_rows(df_n0, df_n1)
      df <- dplyr::bind_rows(df_1, df_n)
    }
  }
  
  n_cores <- min(parallel::detectCores() - 1L, len)
  cl <- parallel::makeCluster(getOption("cl.cores", n_cores))
  df <- suppressWarnings(
    parallel::clusterApply(
      cl, icols, function(icol) {
        df_n[, c(cols123, icol)] |> 
          dplyr::summarise_all(median, na.rm = TRUE) |>
          dplyr::ungroup()
      }
    )
  )
  parallel::stopCluster(cl)
  
  dx <- df[[1]][, cols123, drop = FALSE]
  xs <- lapply(df, function (x) x[, -cols123, drop = FALSE])
  
  df <- dplyr::bind_cols(dx, xs) |>
    dplyr::bind_rows(df_1) |>
    dplyr::arrange(TMT_Set)
  attr(df, "uni_lcms") <- uni_lcms
  
  df
}


#' Pads sample groups.
#'
#' For uses with group searches (with non-trivial values under
#' \code{pep_group}). For example, heavy may be missing in complete under one
#' sample and light missing in another.
#'
#' @param df A data frame.
#' @param sids The Sample_IDs from label_scheme.
#' @param tmt_levels The levels of TMT: 000, 126, 127 etc. without the
#'   \code{TMT-} prefix.
#' @param type_levels The levels of five types.
#' 
#' @examples 
#' \donttest{
#' # 10-plex
#' tmt_levels <- TMT_levels(10)
#' tmt_levels <- gsub("^TMT-", "", tmt_levels)
#' 
#' group_ids <- c("light", "heavy")
#' 
#' sids <- LETTERS[1:10]
#' len <- length(sids)
#' sids <- rep(sids, each = length(group_ids))
#' 
#' group_ids <- rep(group_ids, len)
#' sids <- paste0(sids, " [", group_ids, "]")
#' 
#' rm(list = c("len", "group_ids"))
#' 
#' # 1-plex
#' #' tmt_levels <- TMT_levels(1)
#' tmt_levels <- gsub("^TMT-", "", tmt_levels)
#' 
#' group_ids <- c("light", "heavy")
#' 
#' sids <- LETTERS[1]
#' len <- length(sids)
#' sids <- rep(sids, each = length(group_ids))
#' 
#' group_ids <- rep(group_ids, len)
#' sids <- paste0(sids, " [", group_ids, "]")
#' 
#' rm(list = c("len", "group_ids"))
#' }
pad_grp_samples <- function (df, sids, tmt_levels = NULL, 
                             type_levels = c("I", "N_I", "sd_log2_R", "log2_R", "N_log2_R")) 
{
  len_tp <- length(type_levels)
  len_si <- length(sids)
  len_tmt <- length(tmt_levels)

  univ <- unlist(lapply(type_levels, function (x) paste0(x, tmt_levels)))
  
  # pept_tot_int etc.: len_tmt == 0L and is.null(tmt_levels)
  if (len_tmt <= 1L) {
    univ <- unlist(lapply(univ, function (x) paste0(x, " (", sids, ")")))
  }
  else {
    n_grps <- len_si/len_tmt
    univ <- rep(univ, each = n_grps)
    univ <- paste0(univ, " (", rep(sids, len_tp), ")")
  }
  
  more_nms <- univ[!univ %in% names(df)]
  len <- length(more_nms)
  
  if (len) {
    for (i in 1:len) {
      more_nms_i <- more_nms[[i]]
      df[[more_nms_i]] <- NA_real_
    }
    
    nms <- names(df)
    
    # stopifnot(all(univ %in% nms))
    
    df <- dplyr::bind_cols(
      df[, setdiff(nms, univ), drop = FALSE], 
      df[, univ, drop = FALSE])
  }
  
  invisible(df)
}


#' Replicates new label_scheme with new Sample_IDs by group_ids.
#' 
#' The new Sample_ID: "Sample [heavy]", "Sample [light]" etc.
#' 
#' @inheritParams newColnames
rep_ls_groups <- function (group_ids) 
{
  label_scheme <- load_ls_group(dat_dir, label_scheme, prefer_group = FALSE)

  nrow <- nrow(label_scheme)
  sids <- label_scheme$Sample_ID
  n_grps <- length(group_ids)
  row_ids <- rep(1:nrow(label_scheme), each = n_grps)
  sids_grps <- paste0(rep(sids, each = n_grps), " [", group_ids, "]")
  
  label_scheme$Sample_ID_Orig <- label_scheme$Sample_ID
  label_scheme <- label_scheme[row_ids, ]
  label_scheme$Sample_ID <- sids_grps
  label_scheme$Sample_ID_Group <- rep(group_ids, nrow)

  invisible(label_scheme)
}


#' Combines peptide reports across multiple experiments.
#'
#' Median summary of data from the same TMT or LFQ experiment at different LCMS
#' injections summed \code{pep_n_psm}, \code{prot_n_psm}, and \code{prot_n_pep}
#' after data merging no Z_log2_R yet available use \code{col_select =
#' expr(Sample_ID)} not \code{col_select} to get all Z_log2_R why: users may
#' specify \code{col_select} only partial to Sample_ID entries.
#'
#' @param engine The name of search engine.
#' @param use_mq_pep Logical; if TRUE, uses the peptides.txt from MaxQuant.
#' @param use_mf_pep Logical; if TRUE, uses the peptides.txt from MSFragger.
#' @param rt_tol Error tolerance in retention times.
#' @param rt_step The step size in binning retention times.
#' @param imp_refs Logical; impute missing references or not (yet to be tested
#'   more).
#' @param new_na_species A replace value for NA species.
#' @inheritParams info_anal
#' @inheritParams normPSM
#' @inheritParams splitPSM_ma
#' @inheritParams mergePep
normPep <- function (dat_dir = NULL, group_psm_by = "pep_seq_mod", 
                     group_pep_by = "prot_acc", 
                     engine = "mz", lfq_mbr = TRUE, 
                     use_duppeps = TRUE, duppeps_repair = "denovo", 
                     cut_points = Inf, omit_single_lfq = FALSE, 
                     use_mq_pep = FALSE, use_mf_pep = FALSE, 
                     rm_allna = FALSE, rt_tol = 30, rt_step = 5E-3, 
                     mbr_ret_tol = 30, 
                     ret_sd_tol = Inf, rm_ret_outliers = FALSE, 
                     imp_refs = FALSE, use_spec_counts = FALSE, 
                     new_na_species = ".other", ...) 
{
  load(file = file.path(dat_dir, "label_scheme_full.rda"))
  load(file = file.path(dat_dir, "label_scheme.rda"))
  tmt_plex <- TMT_plex(label_scheme_full)
  
  if (!dir.exists(temp_dir <- file.path(dat_dir, "Peptide/cache"))) {
    dir.create(temp_dir, recursive = TRUE)
  }
  
  dots <- rlang::enexprs(...)
  filter_dots <- dots[unlist(lapply(dots, is.language))]
  filter_dots <- filter_dots[grepl("^filter_", names(filter_dots))]
  
  filelist  <- list.files(
    path = file.path(dat_dir, "Peptide"), 
    pattern = (pat <- paste0("TMTset[0-9]+_LCMSinj[0-9]+_Peptide_N\\.txt$")), 
    full.names = TRUE)

  if (!(n_files <- length(filelist))) {
    stop("No individual peptide tables available; run `PSM2Pep()` first.")
  }
  
  filelist   <- reorder_files2(filelist)
  basenames  <- attr(filelist, "basenames")
  set_idxes  <- attr(filelist, "set_idxes")
  injn_idxes <- attr(filelist, "injn_idxes")
  
  ids <- paste0("TMTset", set_idxes, "_LCMSinj", injn_idxes, "_Peptide_N.txt")
  if (all(are_refs <- basenames %in% ids[label_scheme_full[["Reference"]]])) {
    are_smpls <- are_refs
  }
  else if (any(are_refs)) {
    are_smpls <- !are_refs
  } 
  else {
    are_refs  <- rep_len(TRUE, length(are_refs))
    are_smpls <- are_refs
  }
  rm(list = c("ids"))

  if (tmt_plex && use_spec_counts) {
    warning("No spectrum counting for TMT workflows.")
    use_spec_counts <- FALSE
  }
  
  if (engine == "mascot" && !tmt_plex) {
    message("Use spectrum counts for Mascot LFQ.")
    use_spec_counts <- TRUE
  }
  
  if (use_spec_counts) {
    lfq_mbr <- FALSE
    mz_lfq  <- FALSE
    ok_lfq  <- FALSE
    ok_mbr  <- FALSE
  }
  else {
    mz_lfq <- if (engine == "mz" && !tmt_plex) { TRUE } else { FALSE }
    ok_lfq <- if (mz_lfq && n_files > 1L) { TRUE } else { FALSE }
    ok_mbr <- if (ok_lfq && lfq_mbr) { TRUE } else { FALSE }
  }

  if (ok_lfq) {
    ans_trs  <- find_ms1filepath(dat_dir, pat = "calibms1full", type = 1L)
    path_ms1 <- ans_trs$path_ms1
    ms1files <- ans_trs$ms1files
    ok_lfq   <- ok_lfq && ans_trs$ok_lfq
    ok_mbr   <- ok_lfq && ok_mbr
  }
  
  if (tmt_plex) {
    prot_spec_counts <- NULL
  }
  else {
    prot_spec_counts <- countSpecs(
      dat_dir = dat_dir, label_scheme_full = label_scheme_full, type = "PSM", 
      filelist = filelist, basenames = basenames, 
      set_idxes = set_idxes, injn_idxes = injn_idxes, 
      group_psm_by = if (engine != "mq") "pep_seq_mod" else "pep_seq", 
      group_pep_by = group_pep_by)
    # prot_spec_counts <- readr::read_tsv(file.path(dat_dir, "PSM/cache", "prot_spec_counts_indiv.tsv"))
  }

  dfs <- lapply(filelist, function (x) {
    readr::read_tsv(
      x, col_types = get_col_types(), show_col_types = FALSE)
  }) |>
    suppressWarnings()
  
  nms <- names(dfs[[1]])
  dfs <- mapply(function (df, set_idx, injn_idx) {
    if (!"TMT_Set " %in% nms) { df$TMT_Set <- set_idx }
    if (!"LCMS_Injection" %in% nms) { df$LCMS_Injection <- injn_idx }
    if ("gene" %in% nms) { df$gene <- forcats::fct_na_value_to_level(df$gene) }
    # handling data splitting by `species`
    if ("species" %in% nms) { df$species[is.na(df$species)] <- new_na_species }

    df
  }, dfs, set_idxes, injn_idxes, SIMPLIFY = FALSE, USE.NAMES = FALSE)
  
  if (engine == "mq" && "Protein Group Ids" %in% (nms)) {
    # all groups contain only single ID and become integer
    dfs <- lapply(dfs, function (df) {
      df$`Protein Group Ids` <- as.character(df$`Protein Group Ids`)
      df
    })
  }
  
  if (use_spec_counts) {
    df_counts <- readr::read_tsv(
      file.path(dat_dir, "PSM/cache", "pep_spec_counts_indiv.tsv"))
    df_counts <- 
      split(df_counts, interaction(df_counts$TMT_Set, df_counts$LCMS_Injection))
    tmt_lcms <- strsplit(names(df_counts), "\\.")
    ord <- order(as.integer(unlist(lapply(tmt_lcms, `[[`, 1L))), 
                 as.integer(unlist(lapply(tmt_lcms, `[[`, 2L))))
    df_counts   <- df_counts[ord]

    dfs <- mapply(function (df, dx) {
      dfx <- unique(dx[, c(group_psm_by, "pep_n_specs")])
      df  <- dplyr::left_join(df, dfx, by = group_psm_by)
      mts <- match(df[[group_psm_by]], dx[[group_psm_by]])
      df  <- df[mts, ]
      df$pep_tot_int <- df$I000 <- df$N_I000 <- df$pep_n_specs
      df$pep_n_specs <- NULL
      df
    }, dfs, df_counts, SIMPLIFY = FALSE, USE.NAMES = TRUE)
    
    rm(list = c("df_counts", "tmt_lcms", "ord"))
  }

  df <- do.call(rbind, dfs) # df <- dplyr::bind_rows(dfs) # may fail...
  df$pep_tot_int <- df$dat_file <- NULL
  
  # already ordered by TMT_Set + LCMS_Injection
  df <- df |>
    filters_in_call(!!!filter_dots) |>
    assign_duppeps(group_psm_by, group_pep_by, use_duppeps, duppeps_repair)

  if (ok_lfq) {
    dfs <- hpepLFQ(
      filelist = filelist, basenames = basenames, set_idxes = set_idxes, 
      injn_idxes = injn_idxes, are_refs = are_refs, are_smpls = are_smpls, 
      dfs = dfs, 
      # a few assignments of peptide `species` can be wrong for shared peptides 
      # across species, but sufficient for finding species centers
      df_sps = unique(df[, c(group_pep_by, "pep_seq_modz", "species")]), 
      prot_spec_counts = prot_spec_counts, use_spec_counts = use_spec_counts, 
      dat_dir = dat_dir, path_ms1 = path_ms1, ms1files = ms1files, 
      temp_dir = temp_dir, rt_tol = rt_tol, rt_step = rt_step, 
      mbr_ret_tol = mbr_ret_tol, 
      group_psm_by = "pep_seq_modz", group_pep_by = group_pep_by, 
      imp_refs = imp_refs, lfq_mbr = lfq_mbr, 
      new_na_species = new_na_species)
    sp_centers <- attr(dfs, "sp_centers", exact = TRUE)
    df <- dplyr::bind_rows(dfs) # after updated species info

    for (i in seq_along(filelist)) {
      addMBRpeps(file = filelist[[i]], pep_tbl = dfs[[i]], dat_dir = dat_dir, 
                 group_psm_by = group_psm_by)
    }

    rm(list = "ans_lfq")
  }
  else {
    sp_centers <- NULL
  }

  # make `Ixxx (SID_1)`, `Ixxx (SID_2)`, ...
  # treat spectrum counts as TMT; force Mzion-LFQ to go this route
  if (tmt_plex) {
    df_num <- spreadPepNums(
      df = df, dat_dir = dat_dir, basenames = basenames, tmt_plex = tmt_plex, 
      ok_lfq = FALSE, group_psm_by = group_psm_by, group_pep_by = group_pep_by, 
      engine = engine)
  }
  else if (engine == "mz") {
    df_num <- spreadPepNums(
      df = df, dat_dir = dat_dir, basenames = basenames, tmt_plex = tmt_plex, 
      ok_lfq = ok_lfq, group_psm_by = "pep_seq_modz", group_pep_by = group_pep_by, 
      engine = engine)

    df_num <- groupMZPepZ2(df_num)
    
    if (!"species" %in% names(df_num)) {
      df_num <- dplyr::left_join(
        df_num, unique(df[, c("pep_seq_mod", "species")]), by = "pep_seq_mod")
    }
    
    df_num <- calcLFQPepNums(
      df_num, label_scheme, imp_refs = imp_refs, sp_centers = sp_centers)
    # sp_centers <- attr(df_num, "sp_centers", exact = TRUE)
    df_num[["species"]] <- NULL
  }
  else if (engine %in% c("mq", "mf")) {
    # back-fill from a MaxQuant/MSFragger LFQ peptide table
    use_lowercase_aa <- match_call_arg("normPSM", "use_lowercase_aa")
    
    if (use_mq_pep) {
      df_num <- pep_mq_lfq(label_scheme, omit_single_lfq)
    }
    else if (use_mf_pep) {
      df_num <- pep_mf_lfq(label_scheme, omit_single_lfq, 
                           group_psm_by = group_psm_by, 
                           use_lowercase_aa = use_lowercase_aa)
    }
    else { # MaxQuant or MSFragger without the engine-generated peptide table.
      warning("Peptide tables not found for engine: `", engine, 
              "`. Use spectrum counting instead.")
      df_num <- spreadPepNums(
        df = df, dat_dir = dat_dir, basenames = basenames, tmt_plex = tmt_plex, 
        ok_lfq = FALSE, group_psm_by = group_psm_by, group_pep_by = group_pep_by, 
        engine = engine)
    }
  }
  else { # no MS1-based LFQ by Mascot and use spectrum counts
    warning("No unsupported MS1-based LFQ; proceed with spectrum countings.")
    use_spec_counts <- TRUE
    df_num <- spreadPepNums(
      df = df, dat_dir = dat_dir, basenames = basenames, tmt_plex = tmt_plex, 
      ok_lfq = FALSE, group_psm_by = group_psm_by, group_pep_by = group_pep_by, 
      engine = engine)
  }

  df_num <- df_num %>% 
    dplyr::mutate(
      mean_lint = log10(rowMeans(.[, grepl("^N_I[0-9]{3}[NC]{0,1}", names(.)), 
                                   drop = FALSE], 
                                   na.rm = TRUE)), 
      mean_lint = ifelse(is.infinite(mean_lint), NA_real_, mean_lint), 
      mean_lint = round(mean_lint, digits = 2))
  
  count_nna <- df_num %>% 
    dplyr::select(grep("N_log2_R[0-9]{3}[NC]{0,1}", 
                       names(.))) %>% 
    dplyr::select(-grep("^N_log2_R[0-9]{3}[NC]{0,1}\\s\\(Ref\\.[0-9]+\\)$", 
                        names(.))) %>% 
    dplyr::select(-grep("^N_log2_R[0-9]{3}[NC]{0,1}\\s\\(Empty\\.[0-9]+\\)$", 
                        names(.))) %>% 
    is.na() %>% 
    magrittr::not() %>% 
    rowSums()
  
  df_num <- dplyr::bind_cols(count_nna = count_nna, df_num) |>
    reloc_col_before("mean_lint", "count_nna")
  
  df$pep_seq_modz <- NULL
  
  if ("pep_ret_range" %in% names(df) && engine != "mz" && 
      !all(is.na(df$pep_ret_range))) {
    if (rm_ret_outliers) {
      df <- df |> dplyr::mutate(id. = row_number())
      
      oks <- local({
        df <- df[, c(group_psm_by, "pep_ret_range", "id.")]
        col <- which(names(df) == "pep_ret_range")
        df <- df %>% split(.[[group_psm_by]], drop = TRUE)
        
        rows <- unlist(lapply(df, function (x) nrow(x) <= 2L))
        df0 <- dplyr::bind_rows(df[rows])
        df1 <- df[!rows] 
        
        if (length(df1)) {
          df1 <- lapply(df1, locate_outliers, col) %>% 
            dplyr::bind_rows() %>% 
            dplyr::filter(!is.na(pep_ret_range))
        }
        
        c(df0$id., df1$id.)
      })
      
      if (length(oks)) 
        df <- df %>% dplyr::filter(id. %in% oks)
      
      df <- df %>% dplyr::select(-id.)
      rm(list = "oks")
    }
    
    pep_ret_sd <- df %>% 
      calc_pep_retsd(group_psm_by, use_unique = FALSE) %>% 
      dplyr::arrange(!!rlang::sym(group_psm_by))
    
    if (!is.infinite(ret_sd_tol)) {
      message("Removal of `", group_psm_by, "` entries at ", 
              "retention time SD >= ", ret_sd_tol, ".")
      
      # (all entries under the same peptide will be removed)
      df <- local({
        peps <- pep_ret_sd %>% 
          dplyr::filter(pep_ret_sd <= ret_sd_tol) %>% 
          .[[group_psm_by]]
        
        df %>% dplyr::filter(!!rlang::sym(group_psm_by) %in% peps)
      })
    }
  }
  else {
    if (rm_ret_outliers) {
      message("Peptide retention times not available for data filtration ", 
              "by `rm_ret_outliers`.")
    }
    
    if (!is.infinite(ret_sd_tol)) {
      message("Peptide retention times not available for data filtration ", 
              "by `ret_sd_tol`.")
    }
    
    pep_ret_sd <- df %>%
      dplyr::select(group_psm_by) %>% 
      unique() %>% 
      dplyr::mutate(pep_ret_sd = NA_real_) %>% 
      dplyr::arrange(!!rlang::sym(group_psm_by))
  }
  
  # up to this point pep_tot_int == I000;
  # following the summary statistics, pep_tot_int is the sum across all samples
  df_first <- df %>% 
    dplyr::select(-grep("log2_R[0-9]{3}|I[0-9]{3}", names(.))) %>% 
    dplyr::select(-which(names(.) %in% c("prot_matches_sig", "prot_sequences_sig", 
                                         "pep_ret_sd", "pep_exp_mz", "pep_exp_z", 
                                         "TMT_Set"))) %>% 
    # `pep_tot_int` will be summed over samples; 
    # the field is different to `pep_tot_int (SID)`
    med_summarise_keys(group_psm_by) %>% 
    dplyr::arrange(!!rlang::sym(group_psm_by)) %>% 
    reloc_col_after("pep_expect", "pep_score") %>% 
    reloc_col_after("pep_phospho_locprob", "pep_locdiff") %>% 
    reloc_col_after("pep_phospho_locdiff", "pep_phospho_locprob") %>% 
    reloc_col_after("pep_n_nl", "pep_rank_nl")
  
  # colnm_before <- find_preceding_colnm(df, "pep_tot_int")
  
  df <- list(df_first, pep_ret_sd, df_num) |>
    purrr::reduce(dplyr::left_join, by = group_psm_by) |>
    reloc_col_before(group_psm_by, "pep_res_after") |>
    reloc_col_after("pep_ret_sd", "pep_ret_range")
  
  df$pep_apex_ps <- df$pep_apex_ts <- df$pep_apex_xs <- df$pep_apex_ys <- 
    df$pep_apex_fwhms <- df$pep_apex_ns <- NULL
  
  if (separate_lfq_cols <- TRUE) {
    df <- df[, !grepl("^pep_.*_int \\(", names(df))]
  }
  
  # --- update sd_log2R000 (...) ---
  if (!(tmt_plex || use_mq_pep || use_mf_pep)) {
    df <- local({
      df <- df[, !grepl("^sd_log2_R000 ", names(df))]
      
      df_sds <- df %>% 
        calcSD_Splex(group_pep_by) %>% 
        `names<-`(gsub("^log2_R", "sd_log2_R", names(.)))
      
      dplyr::right_join(df_sds, df, by = group_pep_by) |>
        na_zeroIntensity()
    })
  }
  
  # --- add varmod columns ---
  if (("pep_seq_mod" %in% names(df)) && 
      (match_call_arg(normPSM, use_lowercase_aa))) {
    df <- df |>
      dplyr::mutate(pep_mod_protnt = ifelse(grepl("^~", pep_seq_mod), 
                                            TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_protntac = ifelse(grepl("^_", pep_seq_mod), 
                                              TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_pepnt = ifelse(grepl("^[_~]?\\^", pep_seq_mod), 
                                           TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_m = ifelse(grepl("m", pep_seq_mod), 
                                       TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_n = ifelse(grepl("n", pep_seq_mod), 
                                       TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_sty = ifelse(grepl("[sty]", pep_seq_mod), 
                                         TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_pepct = ifelse(grepl("[\\^]{1}[_~]?$",
                                                 pep_seq_mod), 
                                           TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_protctam = ifelse(grepl("_{1}$", pep_seq_mod), 
                                              TRUE, FALSE)) |>
      dplyr::mutate(pep_mod_protct = ifelse(grepl("~{1}$", pep_seq_mod), 
                                            TRUE, FALSE))
  } 
  else {
    df <- df %>% 
      dplyr::mutate(pep_mod_protnt = NA, 
                    pep_mod_protntac = NA, 
                    pep_mod_pepnt = NA, 
                    pep_mod_m = NA, 
                    pep_mod_n = NA, 
                    pep_mod_sty = NA, 
                    pep_mod_pepct = NA, 
                    pep_mod_protctam = NA, 
                    pep_mod_protct = NA)
  }
  
  # --- tidy up ---
  nms <- names(df)
  df <- dplyr::bind_cols(
    df |> dplyr::select(grep("^prot_", nms)),
    df |> dplyr::select(grep("^pep_", nms)), 
    df |> dplyr::select(-grep("^prot_|^pep_", nms)), 
  )
  
  nms <- names(df)
  df <- dplyr::bind_cols(
    df |> dplyr::select(-grep("[RI]{1}[0-9]{3}[NC]*", nms)), 
    df |> dplyr::select(grep("^I[0-9]{3}[NC]*", nms)), 
    df |> dplyr::select(grep("^N_I[0-9]{3}[NC]*", nms)), 
    df |> dplyr::select(grep("^sd_log2_R[0-9]{3}[NC]*", nms)), 
    df |> dplyr::select(grep("^log2_R[0-9]{3}[NC]*", nms)), 
    df |> dplyr::select(grep("^N_log2_R[0-9]{3}[NC]*", nms)), 
    df |> dplyr::select(grep("^Z_log2_R[0-9]{3}[NC]*", nms)), 
  )
  
  df <- df %>%
    dplyr::filter(!duplicated(.[[group_psm_by]])) %>% 
    { if (tmt_plex && rm_allna) 
      .[rowSums(!is.na(.[grepl("^N_log2_R[0-9]{3}[NC]{0,1}", names(.))])) > 0, ] 
      else . } %>% 
    dplyr::arrange(!!rlang::sym(group_psm_by))
  
  # a placeholder so no need to handle the exception of 
  # no `Z_log2_R` columns before the first `normMulGau`
  if (!length(grep("^Z_log2_R[0-9]{3}[NC]{0,1}", names(df)))) {
    df <- df %>% 
      dplyr::select(grep("^N_log2_R[0-9]{3}[NC]{0,1}", names(.))) %>% 
      `names<-`(gsub("^N_log2_R", "Z_log2_R", names(.))) %>% 
      dplyr::bind_cols(df, .)
  }
  
  df <- normMulGau(
    df = df,
    method_align = "MC",
    n_comp = 1L,
    range_log2r = c(0, 100),
    range_int = c(0, 100),
    filepath = file.path(dat_dir, "Peptide/Histogram"),
    col_select = rlang::expr(Sample_ID), 
    cut_points = cut_points, 
    is_prot_lfq = FALSE,
  ) %>% 
    fmt_num_cols() %T>% 
    write.table(file.path(dat_dir, "Peptide/Peptide.txt"), 
                sep = "\t", col.names = TRUE, row.names = FALSE)
}


#' Finds the filepath of MS1 traces
#' 
#' @param dat_dir The current working directory.
#' @param pat A pattern for recognizing full MS1 files.
#' @param type The exit type: 0, warning; 1, stop.
find_ms1filepath <- function (dat_dir, pat = "ms1full", type = 1L)
{
  load(file.path(dat_dir, "fraction_scheme.rda"))
  
  msg1 <- paste0(
    "MS1 peak lists of `^ms1full_[...].rds` not found for LFQ.\n", 
    "Please copy them from the Mzion folder ", 
    "to your working directory.")
  msg2 <- paste0(
    "The number of MS1 `^ms1full_[...].rds` files is different ", 
    "to the number of RAW files.")
  
  ok_lfq   <- TRUE
  pat2     <- paste0("^", pat, "_.*\\.rds$")
  n_raws   <- length(unique(fraction_scheme$RAW_File))
  ms1files <- list.files(dat_dir, pattern = pat2)
  n_ms1fis <- length(ms1files)
  
  if (n_ms1fis) {
    message("MS1 trace files found under ", dat_dir)
    path_ms1 <- dat_dir
  }
  else {
    path_ms1 <- file.path(dat_dir, "ms1data")
    ms1files <- list.files(path_ms1, pattern = pat2)
    n_ms1fis <- length(ms1files)
    
    if (!n_ms1fis) {
      # stop: otherwise `df` from `PSM2Pep` is unique by pep_seq_modz and 
      # need to be collapsed to pep_seq_mod
      if (type) stop(msg1) else warning(msg1)
      ok_lfq <- FALSE
    }
  }
  
  if (n_ms1fis != n_raws) {
    if (type) stop(msg2) else warning(msg2)
    ok_lfq <- FALSE
  }
  
  if (!(ok_lfq)) {
    path_ms1 <- NULL
    ms1files <- NULL
  }
  
  list(path_ms1 = path_ms1, ms1files = ms1files, ok_lfq = ok_lfq)
}


#' Separate MBR file names by samples
#'
#' @param b_nms The basename of peptide files.
#' @param dat_dir The working directory.
#' @param ms1files The file names of MS1 data.
#' @param type MS1 data type. Either full MS1 or processed, sectional traces.
#' @param by Separate the output by. Only have an effect at \code{type =
#'   ms1apexes}.
sep_mbrfiles <- function (b_nms, dat_dir, ms1files = NULL, 
                          type = "ms1full", by = c("raw", "set"))
{
  if (!length(ms1files)) {
    return(NULL)
  }
  
  by <- match.arg(by)
  load(file.path(dat_dir, "fraction_scheme.rda"))
  
  fraction_scheme <- fraction_scheme |>
    tidyr::unite(tmt_lcms, c("TMT_Set", "LCMS_Injection"), remove = FALSE)

  meta <- data.frame(
    basename = b_nms, 
    TMT_Set = gsub("^TMTset(\\d+).*", "\\1", b_nms), 
    LCMS_Injection = 
      gsub("^TMTset\\d+_LCMSinj(\\d+)_(Peptide|PSM)_N.txt$", "\\1", b_nms)) |>
    tidyr::unite(tmt_lcms, c("TMT_Set", "LCMS_Injection"), remove = TRUE) |>
    dplyr::left_join(fraction_scheme, by = "tmt_lcms")
  readr::write_tsv(meta, file.path(dat_dir, "Peptide/cache/mbr_meta.txt"))
  
  if (type %in% c("ms1full", "calibms1full")) {
    if (grepl("\\.raw\\.rds", ms1file1 <- ms1files[[1]])) {
      suffix <- ".raw.rds"
    }
    else if (grepl("\\.d\\.rds", ms1file1)) {
      suffix <- ".d.rds"
    }
  }
  else {
    suffix <- ".rds" # for `ms1apexes`
  }
  
  # raws under each peptide table
  raws <- lapply(b_nms, function (b_nm) {
    rows <- meta$basename == b_nm
    raws <- unique(meta$RAW_File[rows])
    raws <- gsub("\\.raw$", "", raws)
    raws <- gsub("\\.d$", "", raws)
  })
  names(raws) <- b_nms
  
  # separate ms1files by raws
  if (type %in% c("ms1full", "calibms1full")) {
    ms1raws <- gsub(paste0("^", type, "_(.*)\\.rds$"), "\\1", ms1files)
    ms1raws <- gsub("(\\.raw$|\\.d$)", "", ms1raws)
    ms1raws <- lapply(raws, function (x) ms1raws[ms1raws %in% x])
    out <- lapply(ms1raws, function (x) paste0(type, "_", x, suffix))
  }
  else if (type == "ms1apexes") {
    ms1files <- local({
      files <- gsub("^ms1apexes_(.*)\\.(raw|d)_{0,1}\\d*\\.rds$", "\\1", ms1files)
      fracs <- gsub(".*_{0,1}(\\d*)\\.rds$", "\\1", ms1files) |> as.integer()
      ord   <- order(files, fracs)
      ms1files <- ms1files[ord]
    })

    ms1raws <- gsub(paste0("^", type, "_(.*)\\.rds$"), "\\1", ms1files)
    ms1raws <- gsub("(\\.raw_{0,1}\\d*$|\\.d_{0,1}\\d*$)", "", ms1raws)
    
    # for adding apex information
    if (by == "raw") {
      out <- split(ms1files, ms1raws)
    }
    # for avoiding MBR among fractions under the same TMT_Set/Sample_ID
    else if (by == "set") {
      out <- lapply(raws, function (rs) {
        ms <- ms1files[ms1raws %in% rs]
        attr(ms, "raws") <- rs
        ms
      })
    }
    else {
      stop("The value of `by` need to be either `raw` or `set`.")
    }

    # backward compatible
    if (all(lengths(out)) == 0L) {
      warning("Need the latest Mzion for MBR.")
      return(NULL)
    }
  }
  else {
    stop("Unknown MS1 data type.")
  }

  out
}


#' Extract MBR intensities
#' 
#' To handle the case of multiple matched Y values in a scan.
#' 
#' @param xs Vectors of X-value vectors over a range of retention times.
#' @param ys Vectors of Y-value vectors over a range of retention times.
#' @param mbr_mz The Value of an m-over-z for MBR.
#' @param step The mass error in \code{ppm / 1e6}.
extract_mbry <- function (xs, ys, mbr_mz, step = 1e-5)
{
  if (!(len <- length(xs))) {
    return(NA_real_)
  }

  ysub <- xsub <- rep_len(NA_real_, len)

  for (i in 1:len) {
    xi  <- xs[[i]]
    yi  <- ys[[i]]
    oks <- .Internal(which(abs(xi / mbr_mz - 1) <= step))
    ni  <- length(oks)
    
    if (ni == 1L) {
      xsub[[i]] <- xi[[oks]]
      ysub[[i]] <- yi[[oks]]
    }
    else if (ni > 1L) {
      xvs <- xi[oks]
      yvs <- yi[oks]
      p   <- which.min(abs(xvs - mbr_mz))
      xsub[[i]] <- xvs[[p]]
      ysub[[i]] <- yvs[[p]]
    }
  }
  
  list(x = xsub, y = ysub)
}


#' Group Mzion peptides by charge states
#' 
#' @param df A Mzion peptide table.
#' @param sdco A standard deviaiton cut-off.
#' @inheritParams normPSM
groupMZPepZ <- function (df, group_psm_by = "pep_seq_mod", sdco = sqrt(9))
{
  df <- df[, c("pep_seq_mod", "pep_seq", "pep_tot_int", 
               "pep_apex_ret", "pep_apex_scan")] |>
    # use the first scan and hopefully the same z along LC across samples
    dplyr::arrange(pep_apex_scan)
  
  dfs <- split(df, df$pep_seq_mod)
  
  for (i in seq_along(dfs)) {
    dx <- dfs[[i]]
    
    if (nrow(dx) > 1L) {
      ys <- sum(dx$pep_tot_int, na.rm = TRUE)
      dx <- dx[1, ]
      dx$pep_tot_int <- ys
    }
    
    dfs[[i]] <- dx
  }
  
  df <- dplyr::bind_rows(dfs)
  
  if (group_psm_by == "pep_seq") {
    df  <- df |> dplyr::arrange(pep_apex_scan)
    dfs <- split(df, df$pep_seq)
    ys  <- lapply(dfs, function (dx) sum(dx$pep_tot_int))
    dfs <- lapply(dfs, function (dx) dx[1, ])
    df  <- dplyr::bind_rows(dfs)
    df$pep_tot_int <- unlist(ys, recursive = FALSE, use.names = FALSE)
  }
  
  df
}


#' Group Mzion peptides by charge states
#' 
#' @param df_num A Mzion peptide table.
groupMZPepZ2 <- function (df_num)
{
  df_num$pep_seq_mod <- gsub("@[0-9]+", "", df_num$pep_seq_modz)
  df_num$pep_seq_modz <- NULL
  df_nums <- split(df_num, df_num$pep_seq_mod)
  nrows <- sapply(df_nums, nrow)
  oks <- nrows == 1L
  df_nums0 <- dplyr::bind_rows(df_nums[oks])
  df_nums1 <- df_nums[!oks]
  
  nms <- names(df_num)
  cols_y1 <- grep("^I000 \\(", nms)

  for (i in seq_along(df_nums1)) {
    dfi <- df_nums1[[i]]
    nna <- rowSums(is.na(dfi[, cols_y1, drop = FALSE]))
    row <- which.min(nna)
    df_nums1[[i]] <- dfi[row, ]
  }
  
  out <- dplyr::bind_rows(df_nums0, dplyr::bind_rows(df_nums1))
}


#' Summary of peptide keys by mean or geomean
#'
#' @param df A data frame.
#' @param ids A vector of column keys being the identifier of the level of
#'   uniqueness. For example, it may be "pep_seq_mod" for non-SILAC or
#'   c("pep_seq_mod", "pep_group") for SILAC with the conversion from PSMs to
#'   peptides.
#' @import dplyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
med_summarise_keys <- function(df, ids) 
{
  ## --- Mascot ---
  mascot_median_keys <- c("pep_score", "pep_rank", "pep_isbold", 
                          "pep_exp_mr", "pep_delta", 
                          "pep_exp_mz", "pep_exp_z", 
                          "pep_locprob", "pep_locdiff", 
                          "pep_ret_range", "pep_n_nl", "pep_n_exp_z", 
                          # "pep_score_co", 
                          "pep_n_nl")
  mascot_geomean_keys <- c("pep_expect")
  mascot_sum_keys <- c("pep_tot_int")
  
  df_mascot_med <- df %>% 
    dplyr::select(ids, which(names(.) %in% mascot_median_keys)) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  
  df_mascot_geomean <- df %>% 
    dplyr::select(ids, which(names(.) %in% mascot_geomean_keys)) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ my_geomean(.x, na.rm = TRUE))
  
  df_all_sum <- df %>% 
    dplyr::select(ids, which(names(.) %in% mascot_sum_keys)) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ sum(.x, na.rm = TRUE))
  
  df <- df %>% 
    dplyr::select(-which(names(.) %in% c(mascot_median_keys, 
                                         mascot_geomean_keys, 
                                         mascot_sum_keys)))
  
  ## --- MaxQuant ---
  df_mq_rptr_mass_dev <- df %>% 
    dplyr::select(ids, grep(str_to_title("^Reporter mass deviation"), names(.))) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  
  df <- df %>% 
    dplyr::select(-grep(str_to_title("^Reporter mass deviation"), names(.)))
  
  mq_median_keys <- str_to_title(c(
    "Score", 
    "Charge", "Mass", "PIF", "Fraction of total spectrum", "Mass error [ppm]", 
    "Mass error [Da]", "Base peak fraction", # "Precursor Intensity", 
    "Precursor Apex Fraction", "Intensity coverage", "Peak coverage", 
    "Combinatorics"
  ))
  mq_geomean_keys <- c("PEP")
  
  df_mq_med <- df %>% 
    dplyr::select(ids, which(names(.) %in% mq_median_keys)) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  
  df_mq_geomean <- df %>% 
    dplyr::select(ids, which(names(.) %in% mq_geomean_keys)) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ my_geomean(.x, na.rm = TRUE))
  
  df <- df %>% 
    dplyr::select(-which(names(.) %in% c(mq_median_keys, mq_geomean_keys)))
  
  if ("Length" %in% names(df)) {
    df <- df %>% dplyr::mutate(pep_len = Length)
  }
  if ("Missed cleavages" %in% names(df)) {
    df <- df %>% dplyr::mutate(pep_miss = `Missed cleavages`)
  }
  if ("Missed Cleavages" %in% names(df)) {
    df <- df %>% dplyr::mutate(pep_miss = `Missed Cleavages`)
  }
  
  df <- df %>% 
    dplyr::select(-which(names(.) %in% str_to_title(
      c("Scan number", "Scan index", "Length", "Missed Cleavages")
    )))
  
  if (all(c("Modifications", "Retention Time") %in% names(df))) {
    df <- df %>% dplyr::select(!`Modifications`:`Retention Time`)
  }
  
  df <- df %>% 
    dplyr::select(-which(names(.) %in% str_to_title(
      c("Delta score", "Score diff", "Localization prob", 
        "Precursor full scan number", "Precursor apex fraction",
        "Precursor apex offset", "Precursor apex offset time", 
        "Matches", 
        "Mass deviations [ppm]", "Masses", "Number of matches", 
        "Neutral loss level", "Intensities", 
        "Mass deviations [Da]", 
        "ETD identification type", "Reverse", "All scores", 
        "All sequences", 
        "All modified sequences", 
        "Reporter PIF", "Reporter fraction", 
        "ID", # "Protein group IDs", 
        "Peptide ID", "Mod. peptide ID", 
        "Evidence ID", "Diagnostic Peak Phospho (Sty) Y")
    ))) %>% 
    dplyr::select(-grep("Site IDs$", names(.)))
  
  ## --- Spectrum Mill ---
  sm_median_keys <- c(
    "score", "parent_charge", 
    "deltaForwardReverseScore", "percent_scored_peak_intensity", "totalIntensity", 
    "precursorAveragineChiSquared", "precursorIsolationPurityPercent", 
    "precursorIsolationIntensity", "ratioReporterIonToPrecursor", 
    "delta_parent_mass", "delta_parent_mass_ppm")
  sm_geomean_keys <- NA
  
  df_sm_med <- df %>% 
    dplyr::select(ids, which(names(.) %in% sm_median_keys)) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  
  df <- df %>% 
    dplyr::select(-which(names(.) %in% sm_median_keys))
  
  ## --- MSFragger ---
  mf_median_keys <- c("Nextscore", "PeptideProphet Probability")
  mf_geomean_keys <- NA
  
  df_mf_med <- df %>% 
    dplyr::select(ids, which(names(.) %in% mf_median_keys)) %>% 
    dplyr::group_by_at(ids) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  
  df <- df %>% 
    dplyr::select(-which(names(.) %in% mf_median_keys)) 
  
  ## --- put together ---
  df_first <- df %>% 
    tidyr::unite(ids., ids, sep = ".", remove = FALSE) %>% 
    dplyr::filter(!duplicated(ids.)) %>% 
    dplyr::select(-ids.)
  
  df <- list(df_first, 
             df_mascot_med, df_mascot_geomean, df_all_sum, 
             df_mq_rptr_mass_dev, df_mq_med, df_mq_geomean, 
             df_sm_med, df_mf_med) %>%
    purrr::reduce(dplyr::left_join, by = ids) %>%
    data.frame(check.names = FALSE)
  
  df <- dplyr::bind_cols(
    df %>% dplyr::select(grep("^pep_", names(.))), 
    df %>% dplyr::select(-grep("^pep_", names(.))), 
  ) 
}


#' Loads prior Peptide.txt
#' 
#' @inheritParams info_anal
load_prior <- function(filename, id) 
{
  dat_dir <- get_gl_dat_dir()
  
  stopifnot(file.exists(filename))
  
  df <- read.csv(filename, check.names = FALSE, 
                 header = TRUE, sep = "\t", comment.char = "#") 

  if (! id %in% names(df)) {
    try(unlink(file.path(dat_dir, "Peptide/Peptide.txt")))
    try(unlink(file.path(dat_dir, "Protein/Protein.txt")))
    stop("`Peptide.txt` deleted as column `", id, "` not available.", 
         call. = FALSE)
  }
  
  df <- df %>% dplyr::arrange(!!rlang::sym(id))
}


#' Formats numeric columns
#' 
#' @inheritParams info_anal
fmt_num_cols <- function (df) 
{
  df %>% 
    dplyr::mutate_at(vars(grep("[IR][0-9]{3}[NC]{0,1}", names(.))), 
                     as.numeric) %>% 
    dplyr::mutate_at(vars(grep("I[0-9]{3}[NC]{0,1}", names(.))), 
                     ~ round(.x, digits = 0)) %>% 
    dplyr::mutate_at(vars(grep("R[0-9]{3}[NC]{0,1}", names(.))), 
                     ~ round(.x, digits = 3)) 
}


#' Merge peptide table(s) into one
#'
#' \code{mergePep} merges individual peptide table(s),
#' \code{TMTset1_LCMSinj1_Peptide_N.txt, TMTset1_LCMSinj2_Peptide_N.txt} etc.,
#' into one interim \code{Peptide.txt}. The \code{log2FC} values in the interim
#' result are centered with the medians at zero (median centering). The utility
#' is typically applied after the conversion of PSMs to peptides via
#' \code{\link{PSM2Pep}} and is required even with a experiment at one multiplex
#' TMT and one LC/MS series.
#'
#' In the interim output file, "\code{Peptide.txt}", values under columns
#' \code{log2_R...} are logarithmic ratios at base 2 in relative to the
#' \code{reference(s)} within each multiplex TMT set, or to the row means within
#' each plex if no \code{reference(s)} are present. Values under columns
#' \code{N_log2_R...} are median-centered \code{log2_R...} without scaling
#' normalization. Values under columns \code{Z_log2_R...} are \code{N_log2_R...}
#' with additional scaling normalization. Values under columns \code{I...} are
#' reporter-ion or LFQ intensity before normalization. Values under columns
#' \code{N_I...} are normalized \code{I...}. Values under columns
#' \code{sd_log2_R...} are the standard deviation of the \code{log2FC} of
#' proteins from ascribing peptides.
#'
#' Description of the column keys in the output: \cr \code{system.file("extdata",
#' "peptide_keys.txt", package = "proteoQ")}
#'
#' The peptide counts in individual peptide tables,
#' \code{TMTset1_LCMSinj1_Peptide_N.txt} etc., may be fewer than the entries
#' indicated under the \code{prot_n_pep} column after the peptide
#' removals/cleanups using \code{purgePSM}.
#'
#' @param mbr_ret_tol The tolerance in MBR retention time in seconds. The
#'   default is to match the setting in \link{normPSM}.
#' @param duppeps_repair Not currently used (or only with \code{majority}).
#'   Character string; the method of reparing double-dipping peptide sequences
#'   upon data pooling.
#'
#'   For instance, the same sequence of PEPTIDE may be assigned to protein
#'   accession PROT_ACC1 in data set 1 and PROT_ACC2 in data set 2. At the
#'   \code{denovo} default, the peptide to protein association will be
#'   re-established freshly. At the \code{majority} alternative, a majority rule
#'   will be applied for the re-assignments.
#' @param cut_points A named, numeric vector defines the cut points (knots) for
#'   the median-centering of \code{log2FC} by sections. For example, at
#'   \code{cut_points = c(mean_lint = seq(4, 7, .5))}, \code{log2FC} will be
#'   binned according to the intervals of \eqn{-Inf, 4, 4.5, ..., 7, Inf} under
#'   column \code{mean_lint} (mean log10 intensity) in the input data. The
#'   default is \code{cut_points = Inf}, or equivalently \code{-Inf}, where the
#'   \code{log2FC} under each sample will be median-centered as one piece. See
#'   also \code{\link{prnHist}} for data binning in histogram visualization.
#' @param use_duppeps Logical; if TRUE, re-assigns double/multiple dipping
#'   peptide sequences to the most likely proteins by majority votes.
#' @param imp_refs Logical; impute missing references or not. 
#' @param ret_sd_tol Depreciated. Numeric; the tolerance in the variance of
#'   retention time (w.r.t. measures in seconds). The thresholding applies to
#'   TMT data. The default is \code{Inf}. Depends on the setting of LCMS
#'   gradients, a setting of, e.g., 150 might be suitable.
#' @param rm_ret_outliers Depreciated. Logical; if TRUE, removes peptide entries
#'   with outlying retention times across samples and/or LCMS series.
#' @param omit_single_lfq Depreciated. Logical; if TRUE, omits LFQ entries with
#'   single measured values across all samples. The default is FALSE.
#' @param ... \code{filter_}: Variable argument statements for the row
#'   filtration of data against the column keys in individual peptide tables of
#'   \code{TMTset1_LCMSinj1_Peptide_N.txt, TMTset1_LCMSinj2_Peptide_N.txt}, etc.
#'   \cr \cr The variable argument statements should be in the following format:
#'   each statement contains to a list of logical expression(s). The \code{lhs}
#'   needs to start with \code{filter_}. The logical condition(s) at the
#'   \code{rhs} needs to be enclosed in \code{exprs} with round parenthesis. For
#'   example, \code{pep_len} is a column key present in \code{Mascot} peptide
#'   tables of \code{TMTset1_LCMSinj1_Peptide_N.txt},
#'   \code{TMTset1_LCMSinj2_Peptide_N.txt} etc. The statement
#'   \code{filter_peps_at = exprs(pep_len <= 50)} will remove peptide entries
#'   with \code{pep_len > 50}. See also \code{\link{normPSM}}.
#' @inheritParams normPSM
#' @inheritParams splitPSM_ma
#'
#' @seealso \emph{Metadata} \cr \code{\link{load_expts}} for metadata
#'   preparation and a reduced working example in data normalization \cr
#'
#'   \emph{Data normalization} \cr \code{\link{normPSM}} for extended examples
#'   in PSM data normalization \cr \code{\link{PSM2Pep}} for extended examples
#'   in PSM to peptide summarization \cr \code{\link{mergePep}} for extended
#'   examples in peptide data merging \cr \code{\link{standPep}} for extended
#'   examples in peptide data normalization \cr \code{\link{Pep2Prn}} for
#'   extended examples in peptide to protein summarization \cr
#'   \code{\link{standPrn}} for extended examples in protein data normalization.
#'   \cr \code{\link{purgePSM}} and \code{\link{purgePep}} for extended examples
#'   in data purging \cr \code{\link{pepHist}} and \code{\link{prnHist}} for
#'   extended examples in histogram visualization. \cr
#'   \code{\link{extract_raws}} and \code{\link{extract_psm_raws}} for
#'   extracting MS file names \cr
#'
#'   \emph{Variable arguments of `filter_...`} \cr \code{\link{contain_str}},
#'   \code{\link{contain_chars_in}}, \code{\link{not_contain_str}},
#'   \code{\link{not_contain_chars_in}}, \code{\link{start_with_str}},
#'   \code{\link{end_with_str}}, \code{\link{start_with_chars_in}} and
#'   \code{\link{ends_with_chars_in}} for data subsetting by character strings
#'   \cr
#'
#'   \emph{Missing values} \cr \code{\link{pepImp}} and \code{\link{prnImp}} for
#'   missing value imputation \cr
#'
#'   \emph{Informatics} \cr \code{\link{pepSig}} and \code{\link{prnSig}} for
#'   significance tests \cr \code{\link{pepVol}} and \code{\link{prnVol}} for
#'   volcano plot visualization \cr \code{\link{prnGSPA}} for gene set
#'   enrichment analysis by protein significance pVals \cr \code{\link{gspaMap}}
#'   for mapping GSPA to volcano plot visualization \cr \code{\link{prnGSPAHM}}
#'   for heat map and network visualization of GSPA results \cr
#'   \code{\link{prnGSVA}} for gene set variance analysis \cr
#'   \code{\link{prnGSEA}} for data preparation for online GSEA. \cr
#'   \code{\link{pepMDS}} and \code{\link{prnMDS}} for MDS visualization \cr
#'   \code{\link{pepPCA}} and \code{\link{prnPCA}} for PCA visualization \cr
#'   \code{\link{pepLDA}} and \code{\link{prnLDA}} for LDA visualization \cr
#'   \code{\link{pepHM}} and \code{\link{prnHM}} for heat map visualization \cr
#'   \code{\link{pepCorr_logFC}}, \code{\link{prnCorr_logFC}},
#'   \code{\link{pepCorr_logInt}} and \code{\link{prnCorr_logInt}}  for
#'   correlation plots \cr \code{\link{anal_prnTrend}} and
#'   \code{\link{plot_prnTrend}} for trend analysis and visualization \cr
#'   \code{\link{anal_pepNMF}}, \code{\link{anal_prnNMF}},
#'   \code{\link{plot_pepNMFCon}}, \code{\link{plot_prnNMFCon}},
#'   \code{\link{plot_pepNMFCoef}}, \code{\link{plot_prnNMFCoef}} and
#'   \code{\link{plot_metaNMF}} for NMF analysis and visualization \cr
#'
#'   \emph{Custom databases} \cr \code{\link{Uni2Entrez}} for lookups between
#'   UniProt accessions and Entrez IDs \cr \code{\link{Ref2Entrez}} for lookups
#'   among RefSeq accessions, gene names and Entrez IDs \cr \code{\link{prepGO}}
#'   for
#'  \code{\href{http://current.geneontology.org/products/pages/downloads.html}{gene
#'   ontology}} \cr \code{\link{prepMSig}} for
#'  \href{https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/}{molecular
#'   signatures} \cr \code{\link{prepString}} and \code{\link{anal_prnString}}
#'   for STRING-DB \cr
#'
#'   \emph{Column keys in PSM, peptide and protein outputs} \cr
#'   system.file("extdata", "psm_keys.txt", package = "proteoQ") \cr
#'   system.file("extdata", "peptide_keys.txt", package = "proteoQ") \cr
#'   system.file("extdata", "protein_keys.txt", package = "proteoQ") \cr
#'
#' @return The primary output is in \code{.../Peptide/Peptide.txt}.
#'
#' @example inst/extdata/examples/mergePep_.R
#' @import stringr dplyr tidyr purrr
#' @importFrom magrittr %>% %T>% %$% %<>%
#' @export
mergePep <- function (
    use_duppeps = TRUE, mbr_ret_tol = NULL, 
    duppeps_repair = c("majority", "denovo"), plot_log2FC_cv = TRUE, 
    cut_points = Inf, rm_allna = FALSE, imp_refs = FALSE,  
    omit_single_lfq = FALSE, ret_sd_tol = Inf, 
    rm_ret_outliers = FALSE, ...) 
{
  dat_dir <- get_gl_dat_dir()
  engine  <- find_search_engine(dat_dir)
  
  old_opts <- options()
  options(warn = 1L)
  on.exit(options(old_opts), add = TRUE)
  
  on.exit(
    if (exists(".savecall", envir = environment())) {
      if (.savecall) {
        mget(names(formals()), envir = environment(), inherits = FALSE) |>
          c(dots) |>
          save_call("mergePep")
      }
    }, add = TRUE)

  # ---
  duppeps_repair <- "majority"
  duppeps_repair <- rlang::enexpr(duppeps_repair)
  oks <- eval(formals()[["duppeps_repair"]])
  
  duppeps_repair <- if (length(duppeps_repair) > 1L) {
    oks[[1]]
  }
  else {
    rlang::as_string(duppeps_repair)
  }
  
  stopifnot(duppeps_repair %in% oks, length(duppeps_repair) == 1L)
  # ---
  
  stopifnot(cut_points >= 0, ret_sd_tol > 0, 
            vapply(c(plot_log2FC_cv, use_duppeps, rm_allna, omit_single_lfq, 
                     rm_ret_outliers), rlang::is_logical, logical(1)))

  reload_expts()

  load(file = file.path(dat_dir, "label_scheme_full.rda"))
  load(file = file.path(dat_dir, "label_scheme.rda"))

  dir.create(file.path(dat_dir, "Peptide/cache"), 
             recursive = TRUE, showWarnings = FALSE)
  dir.create(file.path(dat_dir, "Peptide/Histogram"), 
             recursive = TRUE, showWarnings = FALSE)
  dir.create(file.path(dat_dir, "Peptide/log2FC_cv/raw"), 
             recursive = TRUE, showWarnings = FALSE)
  dir.create(file.path(dat_dir, "Peptide/log2FC_cv/purged"), 
             recursive = TRUE, showWarnings = FALSE)
  dir.create(file.path(dat_dir, "Peptide/log"), 
             recursive = TRUE, showWarnings = FALSE)
  
  group_psm_by <- match_call_arg(normPSM, group_psm_by)
  group_pep_by <- match_call_arg(normPSM, group_pep_by)
  
  if (is.null(mbr_ret_tol)) {
    mbr_ret_tol  <- match_call_arg(normPSM, mbr_ret_tol)
  }
  else {
    if (!is.numeric(mbr_ret_tol)) {
      stop("Argument `mbr_ret_tol` needs to be numeric.")
    }
    
    if (mbr_ret_tol <= 0) {
      stop("Argument `mbr_ret_tol` needs to be greater than 0.")
    }
  }

  dots <- rlang::enexprs(...)
  
  filter_dots <- dots %>% 
    .[purrr::map_lgl(., is.language)] %>% 
    .[grepl("^filter_", names(.))]
  
  # later apply also the back-filling with MSFragger data...
  message("Primary column keys in `Peptide/TMTset1_LCMSinj1_Peptide_N.txt` etc. ", 
          "for `filter_` varargs.")
  use_mq_pep <- use_mq_peptable(dat_dir, label_scheme_full)
  use_mf_pep <- use_mf_peptable(dat_dir, label_scheme_full, 
                                group_psm_by = group_psm_by)
  
  if (use_mq_pep && use_mf_pep) {
    stop("Both MaxQuant and MSFragger peptide tables found.", 
         "Use either, not both.")
  }

  use_spec_counts <- match_call_arg(normPSM, use_spec_counts)
  lfq_mbr <- match_call_arg(normPSM, lfq_mbr)

  df <- normPep(dat_dir = dat_dir, 
                group_psm_by = group_psm_by, 
                group_pep_by = group_pep_by, 
                engine = engine, 
                lfq_mbr = lfq_mbr,
                use_duppeps = use_duppeps, 
                duppeps_repair = duppeps_repair, 
                cut_points = cut_points, 
                omit_single_lfq = omit_single_lfq,
                use_mq_pep = use_mq_pep, 
                use_mf_pep = use_mf_pep, 
                rm_allna = rm_allna, 
                mbr_ret_tol = mbr_ret_tol, 
                rt_tol = 30, # user accessible later
                rt_step = 5E-3, # was 2E-3
                ret_sd_tol = ret_sd_tol, 
                rm_ret_outliers = rm_ret_outliers, 
                use_spec_counts = use_spec_counts, 
                imp_refs = imp_refs, 
                new_na_species = ".other", 
                !!!filter_dots)
  
  if (plot_log2FC_cv) {
    quiet_out <- purrr::quietly(sd_violin)(
      df = df, 
      id = !!group_pep_by, 
      filepath = file.path(dat_dir, "Peptide/log2FC_cv/raw/Peptide_sd.png"), 
      width = 8 * n_TMT_sets(label_scheme), 
      height = 8, 
      type = "log2_R", 
      adjSD = FALSE, 
      is_psm = FALSE, 
      ...
    )
  }
  
  .saveCall <- TRUE
  
  invisible(NULL)
}


#'Standardize peptide results
#'
#'\code{standPep} standardizes peptide results from \code{\link{mergePep}} with
#'additional, stand-alone choices in data alignment. The utility is typically
#'applied after the assembly of peptide data via \code{\link{mergePep}}. It
#'further supports iterative normalization against data under selected sample
#'columns, data rows or both.
#'
#'
#'In the primary output file, "\code{Peptide.txt}", values under columns
#'\code{log2_R...} are logarithmic ratios at base 2 in relative to the
#'\code{reference(s)} within each multiplex TMT set, or to the row means within
#'each plex if no \code{reference(s)} are present. Values under columns
#'\code{N_log2_R...} are aligned \code{log2_R...} according to
#'\code{method_align} without scaling normalization. Values under columns
#'\code{Z_log2_R...} are \code{N_log2_R...} with additional scaling
#'normalization. Values under columns \code{I...} are reporter-ion or LFQ
#'intensity before normalization. Values under columns \code{N_I...} are
#'normalized \code{I...}. Values under columns \code{sd_log2_R...} are the
#'standard deviation of the \code{log2FC} of proteins from ascribing peptides.
#'
#'In general, median statistics is applied when summarizing numeric peptide data
#'from different LCMS series. One exception is \code{pep_expect} where geometric
#'mean is used.
#'
#'Description of the column keys in the inputs and outputs: \cr
#'\code{system.file("extdata", "peptide_keys.txt", package = "proteoQ")}
#'
#'@param method_align Character string indicating the method in aligning
#'  \code{log2FC} across samples. \code{MC}: median-centering; \code{MGKernel}:
#'  the kernel density defined by multiple Gaussian functions
#'  (\code{\link[mixtools]{normalmixEM}}). At the \code{MC} default, the ratio
#'  profiles of each sample will be aligned in that the medians of the
#'  \code{log2FC} are zero. At \code{MGKernel}, the ratio profiles of each
#'  sample will be aligned in that the \code{log2FC} at the maximums of kernel
#'  density are zero.
#'@param col_select Character string to a column key in \code{expt_smry.xlsx}.
#'  At the \code{NULL} default, the column key of \code{Select} in
#'  \code{expt_smry.xlsx} will be used. In the case of no samples being
#'  specified under \code{Select}, the column key of \code{Sample_ID} will be
#'  used. The non-empty entries under the ascribing column will be used in
#'  indicated analysis.
#'@param range_log2r Numeric vector at length two. The argument specifies the
#'  range of the \code{log2FC} for use in the scaling normalization of standard
#'  deviation across samples. The default is between the 10th and the 90th
#'  quantiles.
#'@param range_int Numeric vector at length two. The argument specifies the
#'  range of the \code{intensity} of reporter ions (including \code{I000}) for
#'  use in the scaling normalization of standard deviation across samples. The
#'  default is between the 5th and the 95th quantiles.
#'@param n_comp Integer; the number of Gaussian components to be used with
#'  \code{method_align = MGKernel}. A typical value is 2 or 3. The variable
#'  \code{n_comp} overwrites the argument \code{k} in
#'  \code{\link[mixtools]{normalmixEM}}.
#'@param seed Integer; a seed for reproducible fitting at \code{method_align =
#'  MGKernel}.
#'@param ... \code{slice_}: variable argument statements for the identification
#'  of row subsets. The partial data will be taken for parameterizing the
#'  alignment of \code{log2FC} across samples. The full data set will be updated
#'  subsequently with the newly derived parameters. Note that there is no data
#'  entry removals from the complete data set with the \code{slice_} procedure.
#'  \cr \cr The variable argument statements should be in the following format:
#'  each of the statement contains a list of logical expression(s). The
#'  \code{lhs} needs to start with \code{slice_}. The logical condition(s) at
#'  the \code{rhs} needs to be enclosed in \code{exprs} with round parenthesis.
#'  For example, \code{pep_len} is a column key present in \code{Peptide.txt}.
#'  The \code{slice_peps_at = exprs(pep_len >= 10, pep_len <= 50)} will extract
#'  peptide entries with the number of amino acid residues betwen 10 and 50 for
#'  \code{log2FC} alignment. Shorter or longer peptide sequences will remain in
#'  \code{Peptide.txt} but not used in the parameterization. See also
#'  \code{\link{normPSM}} for the variable arguments of \code{filter_}. \cr \cr
#'  Additional parameters from \code{\link[mixtools]{normalmixEM}}, i.e., \cr
#'  \code{maxit}, the maximum number of iterations allowed; \cr \code{epsilon},
#'  tolerance limit for declaring algorithm convergence.
#'@inheritParams normPSM
#'@seealso \emph{Metadata} \cr \code{\link{load_expts}} for metadata preparation
#'and a reduced working example in data normalization \cr
#'
#'\emph{Data normalization} \cr \code{\link{normPSM}} for extended examples in
#'PSM data normalization \cr \code{\link{PSM2Pep}} for extended examples in PSM
#'to peptide summarization \cr \code{\link{mergePep}} for extended examples in
#'peptide data merging \cr \code{\link{standPep}} for extended examples in
#'peptide data normalization \cr \code{\link{Pep2Prn}} for extended examples in
#'peptide to protein summarization \cr \code{\link{standPrn}} for extended
#'examples in protein data normalization. \cr \code{\link{purgePSM}} and
#'\code{\link{purgePep}} for extended examples in data purging \cr
#'\code{\link{pepHist}} and \code{\link{prnHist}} for extended examples in
#'histogram visualization. \cr \code{\link{extract_raws}} and
#'\code{\link{extract_psm_raws}} for extracting MS file names \cr
#'
#'\emph{Variable arguments of `filter_...`} \cr \code{\link{contain_str}},
#'\code{\link{contain_chars_in}}, \code{\link{not_contain_str}},
#'\code{\link{not_contain_chars_in}}, \code{\link{start_with_str}},
#'\code{\link{end_with_str}}, \code{\link{start_with_chars_in}} and
#'\code{\link{ends_with_chars_in}} for data subsetting by character strings \cr
#'
#'\emph{Missing values} \cr \code{\link{pepImp}} and \code{\link{prnImp}} for
#'missing value imputation \cr
#'
#'\emph{Informatics} \cr \code{\link{pepSig}} and \code{\link{prnSig}} for
#'significance tests \cr \code{\link{pepVol}} and \code{\link{prnVol}} for
#'volcano plot visualization \cr \code{\link{prnGSPA}} for gene set enrichment
#'analysis by protein significance pVals \cr \code{\link{gspaMap}} for mapping
#'GSPA to volcano plot visualization \cr \code{\link{prnGSPAHM}} for heat map
#'and network visualization of GSPA results \cr \code{\link{prnGSVA}} for gene
#'set variance analysis \cr \code{\link{prnGSEA}} for data preparation for
#'online GSEA. \cr \code{\link{pepMDS}} and \code{\link{prnMDS}} for MDS
#'visualization \cr \code{\link{pepPCA}} and \code{\link{prnPCA}} for PCA
#'visualization \cr \code{\link{pepLDA}} and \code{\link{prnLDA}} for LDA
#'visualization \cr \code{\link{pepHM}} and \code{\link{prnHM}} for heat map
#'visualization \cr \code{\link{pepCorr_logFC}}, \code{\link{prnCorr_logFC}},
#'\code{\link{pepCorr_logInt}} and \code{\link{prnCorr_logInt}}  for correlation
#'plots \cr \code{\link{anal_prnTrend}} and \code{\link{plot_prnTrend}} for
#'trend analysis and visualization \cr \code{\link{anal_pepNMF}},
#'\code{\link{anal_prnNMF}}, \code{\link{plot_pepNMFCon}},
#'\code{\link{plot_prnNMFCon}}, \code{\link{plot_pepNMFCoef}},
#'\code{\link{plot_prnNMFCoef}} and \code{\link{plot_metaNMF}} for NMF analysis
#'and visualization \cr
#'
#'\emph{Custom databases} \cr \code{\link{Uni2Entrez}} for lookups between
#'UniProt accessions and Entrez IDs \cr \code{\link{Ref2Entrez}} for lookups
#'among RefSeq accessions, gene names and Entrez IDs \cr \code{\link{prepGO}}
#'for
#'\code{\href{http://current.geneontology.org/products/pages/downloads.html}{gene
#'ontology}} \cr \code{\link{prepMSig}} for
#'\href{https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/}{molecular
#'signatures} \cr \code{\link{prepString}} and \code{\link{anal_prnString}} for
#'STRING-DB \cr
#'
#'\emph{Column keys in PSM, peptide and protein outputs} \cr 
#'system.file("extdata", "psm_keys.txt", package = "proteoQ") \cr
#'system.file("extdata", "peptide_keys.txt", package = "proteoQ") \cr
#'system.file("extdata", "protein_keys.txt", package = "proteoQ") \cr
#'
#'@return The primary output is in \code{.../Peptide/Peptide.txt}.
#'
#'@example inst/extdata/examples/normPep_.R
#'
#'@import stringr dplyr tidyr purrr
#'@importFrom magrittr %>% %T>% %$% %<>%
#'@export
standPep <- function (method_align = c("MC", "MGKernel"), col_select = NULL, 
                      range_log2r = c(10, 90), 
                      range_int = c(5, 95), n_comp = NULL, seed = NULL, 
                      plot_log2FC_cv = FALSE, ...) 
{
  dat_dir <- get_gl_dat_dir()
  
  old_opts <- options()
  options(warn = 1)
  on.exit(options(old_opts), add = TRUE)
  
  on.exit(
    if (exists(".savecall", envir = rlang::current_env())) {
      if (.savecall) {
        mget(names(formals()), envir = rlang::current_env(), 
             inherits = FALSE) %>% 
          c(dots) %>% 
          save_call("standPep")
      }
    }, 
    add = TRUE
  )
  
  stopifnot(vapply(c(plot_log2FC_cv), rlang::is_logical, logical(1)))
  
  reload_expts()
  group_psm_by <- match_call_arg(normPSM, group_psm_by)
  group_pep_by <- match_call_arg(normPSM, group_pep_by)
  
  method_align <- rlang::enexpr(method_align)
  if (method_align == rlang::expr(c("MC", "MGKernel"))) {
    method_align <- "MC"
  } 
  else {
    method_align <- rlang::as_string(method_align)
    stopifnot(method_align %in% c("MC", "MGKernel"), 
              length(method_align) == 1L)
  }
  
  range_log2r <- prep_range(range_log2r)
  range_int <- prep_range(range_int)
  
  label_scheme <- load_ls_group(dat_dir, label_scheme)
  label_scheme_full <- load_ls_group(dat_dir, label_scheme_full)
  ok_existing_params(file.path(dat_dir, "Peptide/Histogram/MGKernel_params_N.txt"))

  col_select <- rlang::enexpr(col_select)
  col_select <- if (is.null(col_select)) 
    rlang::expr(Sample_ID) 
  else 
    rlang::sym(col_select)
  col_select <- parse_col_select(rlang::as_string(col_select), label_scheme)

  dots <- rlang::enexprs(...)
  
  filename <- file.path(dat_dir, "Peptide/Peptide.txt")
  
  if (!file.exists(filename)) {
    stop(filename, " not found; run `mergePep(...)` first.")
  }

  message("Primary column keys in `Peptide/Peptide.txt` for `slice_` varargs.")

  df <- load_prior(filename, group_psm_by) 
  
  if (sum(grepl("^log2_R[0-9]{3}[NC]{0,1}", names(df))) <= 1) {
    stop("Need more than one sample for `standPep` or `standPrn`.\n", 
         "Skip this module for qualitative analysis.")
  }
  
  df <- normMulGau(
    df = df,
    method_align = method_align,
    n_comp = n_comp,
    seed = seed,
    range_log2r = range_log2r,
    range_int = range_int,
    filepath = file.path(dat_dir, "Peptide/Histogram"),
    col_select = col_select, 
    cut_points = Inf, 
    is_prot_lfq = FALSE, 
    !!!dots, 
  ) %>% 
    fmt_num_cols() %T>% 
    write.table(file.path(dat_dir, "Peptide", "Peptide.txt"), 
                sep = "\t", col.names = TRUE, row.names = FALSE)

  if (plot_log2FC_cv & TMT_plex(label_scheme)) {
    sd_violin(df = df, id = !!group_pep_by, 
              filepath = file.path(dat_dir, "Peptide/log2FC_cv/raw", "Peptide_sd.png"), 
              width = 8 * n_TMT_sets(label_scheme), height = 8, 
              type = "log2_R", adjSD = FALSE, is_psm = FALSE)
  }
  
  .saveCall <- TRUE
  
  invisible(NULL)
}



#'Interim protein data
#'
#'\code{Pep2Prn} summarizes \code{Peptide.txt} to an interim protein report in
#'\code{Protein.txt}.
#'
#'Fields other than \code{log2FC} and \code{intensity} are summarized with
#'median statistics.
#'
#'@param use_unique_pep Logical. If TRUE, only entries that are \code{TRUE} or
#'  equal to \code{1} under the column \code{pep_isunique} in \code{Peptide.txt}
#'  will be used, for summarizing the \code{log2FC} and the \code{intensity} of
#'  peptides into protein values. The default is to use unique peptides only.
#'  For \code{MaxQuant} data, the levels of uniqueness are according to the
#'  \code{pep_unique_by} in \code{\link{normPSM}}. The argument currently do
#'  nothing to \code{Spectrum Mill} data where both unique and shared peptides
#'  will be kept.
#' @param impute_prot_na Logical; impute NA values of protein log2FC or not.
#' @param mc Logical. At the TRUE default, performs median-centering of
#'  \code{log2FC} after the peptide-to-protein aggregation. Otherwise, the
#'  summarized \code{log2FC} values will be left as they are.
#'@param ... \code{filter_}: Variable argument statements for the filtration of
#'  data rows. Each statement contains a list of logical expression(s). The
#'  \code{lhs} needs to start with \code{filter_}. The logical condition(s) at
#'  the \code{rhs} needs to be enclosed in \code{exprs} with round parenthesis.
#'  For example, \code{pep_len} is a column key in \code{Peptide.txt}. The
#'  statement of \code{filter_peps_at = exprs(pep_len <= 50)} will remove
#'  peptide entries with \code{pep_len > 50}.
#'@inheritParams mergePep
#'@inheritParams normPSM
#'@seealso \emph{Metadata} \cr \code{\link{load_expts}} for metadata preparation
#'  and a reduced working example in data normalization \cr
#'
#'  \emph{Data normalization} \cr \code{\link{normPSM}} for extended examples in
#'  PSM data normalization \cr \code{\link{PSM2Pep}} for extended examples in
#'  PSM to peptide summarization \cr \code{\link{mergePep}} for extended
#'  examples in peptide data merging \cr \code{\link{standPep}} for extended
#'  examples in peptide data normalization \cr \code{\link{Pep2Prn}} for
#'  extended examples in peptide to protein summarization \cr
#'  \code{\link{standPrn}} for extended examples in protein data normalization.
#'  \cr \code{\link{purgePSM}} and \code{\link{purgePep}} for extended examples
#'  in data purging \cr \code{\link{pepHist}} and \code{\link{prnHist}} for
#'  extended examples in histogram visualization. \cr \code{\link{extract_raws}}
#'  and \code{\link{extract_psm_raws}} for extracting MS file names \cr
#'
#'  \emph{Variable arguments of `filter_...`} \cr \code{\link{contain_str}},
#'  \code{\link{contain_chars_in}}, \code{\link{not_contain_str}},
#'  \code{\link{not_contain_chars_in}}, \code{\link{start_with_str}},
#'  \code{\link{end_with_str}}, \code{\link{start_with_chars_in}} and
#'  \code{\link{ends_with_chars_in}} for data subsetting by character strings
#'  \cr
#'
#'  \emph{Missing values} \cr \code{\link{pepImp}} and \code{\link{prnImp}} for
#'  missing value imputation \cr
#'
#'  \emph{Informatics} \cr \code{\link{pepSig}} and \code{\link{prnSig}} for
#'  significance tests \cr \code{\link{pepVol}} and \code{\link{prnVol}} for
#'  volcano plot visualization \cr \code{\link{prnGSPA}} for gene set enrichment
#'  analysis by protein significance pVals \cr \code{\link{gspaMap}} for mapping
#'  GSPA to volcano plot visualization \cr \code{\link{prnGSPAHM}} for heat map
#'  and network visualization of GSPA results \cr \code{\link{prnGSVA}} for gene
#'  set variance analysis \cr \code{\link{prnGSEA}} for data preparation for
#'  online GSEA. \cr \code{\link{pepMDS}} and \code{\link{prnMDS}} for MDS
#'  visualization \cr \code{\link{pepPCA}} and \code{\link{prnPCA}} for PCA
#'  visualization \cr \code{\link{pepLDA}} and \code{\link{prnLDA}} for LDA
#'  visualization \cr \code{\link{pepHM}} and \code{\link{prnHM}} for heat map
#'  visualization \cr \code{\link{pepCorr_logFC}}, \code{\link{prnCorr_logFC}},
#'  \code{\link{pepCorr_logInt}} and \code{\link{prnCorr_logInt}}  for
#'  correlation plots \cr \code{\link{anal_prnTrend}} and
#'  \code{\link{plot_prnTrend}} for trend analysis and visualization \cr
#'  \code{\link{anal_pepNMF}}, \code{\link{anal_prnNMF}},
#'  \code{\link{plot_pepNMFCon}}, \code{\link{plot_prnNMFCon}},
#'  \code{\link{plot_pepNMFCoef}}, \code{\link{plot_prnNMFCoef}} and
#'  \code{\link{plot_metaNMF}} for NMF analysis and visualization \cr
#'
#'  \emph{Custom databases} \cr \code{\link{Uni2Entrez}} for lookups between
#'  UniProt accessions and Entrez IDs \cr \code{\link{Ref2Entrez}} for lookups
#'  among RefSeq accessions, gene names and Entrez IDs \cr \code{\link{prepGO}}
#'  for
#'  \code{\href{http://current.geneontology.org/products/pages/downloads.html}{gene
#'   ontology}} \cr \code{\link{prepMSig}} for
#'  \href{https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/}{molecular
#'   signatures} \cr \code{\link{prepString}} and \code{\link{anal_prnString}}
#'  for STRING-DB \cr
#'
#'  \emph{Column keys in PSM, peptide and protein outputs} \cr
#'  system.file("extdata", "psm_keys.txt", package = "proteoQ") \cr
#'  system.file("extdata", "peptide_keys.txt", package = "proteoQ") \cr
#'  system.file("extdata", "protein_keys.txt", package = "proteoQ") \cr
#'
#'@return The primary output in "\code{.../Protein/Protein.txt}".
#'
#'@example inst/extdata/examples/Pep2Prn_.R
#'@import stringr dplyr purrr
#'@importFrom magrittr %>% %T>% %$% %<>%
#'@export
Pep2Prn <- function (use_unique_pep = TRUE, impute_prot_na = FALSE, 
                     cut_points = Inf, rm_outliers = FALSE, rm_allna = FALSE, 
                     mc = TRUE, ...) 
{
  dat_dir <- get_gl_dat_dir()
  
  dir.create(file.path(dat_dir, "Protein/Histogram"), 
             recursive = TRUE, showWarnings = FALSE)
  dir.create(file.path(dat_dir, "Protein/cache"), 
             recursive = TRUE, showWarnings = FALSE)
  dir.create(file.path(dat_dir, "Protein/log"), 
             recursive = TRUE, showWarnings = FALSE)
  
  old_opts <- options()
  options(warn = 1L)
  on.exit(options(old_opts), add = TRUE)
  
  on.exit(
    if (exists(".savecall", envir = rlang::current_env())) {
      if (.savecall) {
        mget(names(formals()), envir = rlang::current_env(), inherits = FALSE) |>
          c(dots) |>
          save_call("Pep2Prn")
      }
    }, add = TRUE)

  reload_expts()
  load(file = file.path(dat_dir, "label_scheme_full.rda"))
  tmt_plex <- TMT_plex(label_scheme_full)
  
  group_pep_by <- match_call_arg(normPSM, group_pep_by)
  use_spec_counts  <- match_call_arg(normPSM, use_spec_counts)
  
  if (use_spec_counts) {
    method_pep_prn <- "lfq_all" # sum of peptide spec counts
  }
  else if (tmt_plex) {
    method_pep_prn <- "median"
  }
  else {
    # method_pep_prn <- "lfq_top_3_sum" # LFQ intensity for histo color-coding
    method_pep_prn <- "median"
  }

  stopifnot(group_pep_by %in% c("prot_acc", "gene"), length(group_pep_by) == 1L)
  stopifnot(vapply(c(use_unique_pep, mc), rlang::is_logical, logical(1L)))
  
  gn_rollup <- if (group_pep_by == "gene") { TRUE } else { FALSE }

  message("Column keys in `Peptide/Peptide.txt` ", "for \"filter_\" varargs.")

  dots <- rlang::enexprs(...)
  
  filter_dots <- dots %>% 
    .[purrr::map_lgl(., is.language)] %>% 
    .[grepl("^filter_", names(.))]
  
  engine <- find_search_engine(dat_dir)
  use_mq_prot <- use_mq_prottable(dat_dir) && engine == "mq"
  use_mf_prot <- use_mf_prottable(dat_dir) && engine == "mf"

  if (use_mq_prot && use_mf_prot) {
    stop("Both MaxQuant and MSFragger protein tables found.", 
         "Use either, not both.")
  }
  
  if (use_spec_counts) {
    if (use_mq_prot) {
      use_mq_prot <- FALSE
    }
    
    if (use_mf_prot) {
      use_mf_prot <- FALSE
    }
  }

  is_prot_lfq <- tmt_plex == 0L
  
  df <- pep_to_prn(id = prot_acc, 
                   method_pep_prn = method_pep_prn, 
                   use_unique_pep = use_unique_pep, 
                   gn_rollup = gn_rollup, 
                   rm_outliers = rm_outliers, 
                   rm_allna = rm_allna, 
                   engine = engine, 
                   impute_prot_na = impute_prot_na, 
                   use_mq_prot = use_mq_prot, 
                   use_mf_prot = use_mf_prot, 
                   use_spec_counts = use_spec_counts, 
                   !!!filter_dots) 
  
  if (mc) {
    df <- normMulGau(
      df = df,
      method_align = "MC",
      n_comp = 1L,
      range_log2r = c(0, 100),
      range_int = c(0, 100),
      filepath = file.path(dat_dir, "Protein/Histogram"),
      col_select = rlang::expr(Sample_ID), 
      is_prot_lfq = is_prot_lfq, 
      cut_points = cut_points) 
  }
  else {
    warning("No data centering performed.", call. = FALSE)
  }

  df <- df %>% 
    dplyr::filter(!nchar(as.character(.[["prot_acc"]])) == 0) %>% 
    dplyr::mutate_at(vars(grep("I[0-9]{3}[NC]*", names(.))), 
                     as.numeric) %>% 
    dplyr::mutate_at(vars(grep("I[0-9]{3}[NC]*", names(.))), 
                     ~ round(.x, digits = 0)) %>% 
    dplyr::mutate_at(vars(grep("log2_R[0-9]{3}[NC]*", names(.))), 
                     as.numeric) %>% 
    dplyr::mutate_at(vars(grep("log2_R[0-9]{3}[NC]*", names(.))), 
                     ~ round(.x, digits = 3)) %T>% 
    write.table(file.path(dat_dir, "Protein/Protein.txt"), 
                sep = "\t", col.names = TRUE, row.names = FALSE)
  
  .saveCall <- TRUE
  
  invisible(df)
}


#' Helper: maps peptides to possible proteins
#' 
#' @param df A data frame.
#' @inheritParams normPSM
map_peps_prots <- function (df, group_psm_by = "pep_seq", 
                            group_pep_by = "prot_acc") 
{
  dat_dir <- get_gl_dat_dir()
  
  df <- df |> dplyr::filter(!duplicated(group_psm_by))

  if (group_pep_by == "gene") {
    pep_prot_map <- df |> dplyr::select(shared_genes)
  }
  else if (group_pep_by == "prot_acc") {
    pep_prot_map <- df |> dplyr::select(shared_prot_accs)
  }
  else {
    stop("`group_pep_by` needs to be either `prot_acc` or `gene`.")
  }

  pep_prot_map <- unlist(pep_prot_map, recursive = FALSE, use.names = FALSE) |>
    stringr::str_split(", ") |>
    setNames(df[[group_psm_by]])
  
  save(pep_prot_map, file = file.path(dat_dir, "pep_prot_map.rda"))
  
  invisible(pep_prot_map)
}


#' Helper: finds the row indexes of a protein within a family
#' 
#' Not currently used. The rows include primary (razor) and secondary (mapped)
#' proteins.
#' 
#' @param df A data frame.
#' @inheritParams normPSM
find_prot_family_rows <- function (df, group_psm_by, group_pep_by) 
{
  dat_dir <- get_gl_dat_dir()
  
  pep_prot_map <- map_peps_prots(df, group_psm_by, group_pep_by) 
  
  prots <- unique(df[[group_pep_by]])
  
  message("Find peptides unique to or shared by proteins; please wait...")
  
  prot_family_rows <- purrr::map(prots, ~ {
    prot <- .x
    prot_rows <- purrr::map_lgl(pep_prot_map, ~ prot %in% .x) %>% 
      which()
  }) %>% 
    `names<-`(prots)
  
  save(prot_family_rows, file = file.path(dat_dir, "prot_family_rows.rda"))
  invisible(prot_family_rows)
}


#' Sums top-n.
#' 
#' @param x A numeric vector.
#' @param n An positive integer.
#' @param ... Additional arguments for \code{sum}.
my_sum_n <- function (x, n = 3, ...) 
{
  if (n < 1L) 
    stop("\"n\" need to be a positive integer.")
  
  if (is.infinite(n)) 
    n <- length(x)
  
  # need to convert x to integers if to partial sort
  sum(sort(x, decreasing = TRUE)[1:n], ...)
}


#' Adds the \code{total}, \code{razor} and \code{unique} intensities of
#' proteins.
#'
#' @param df A data frame.
#' @param label_scheme Meta data.
#' @param impute_prot_na Logical; impute NA values of protein log2FC or not.
#' @inheritParams normPSM
#' @inheritParams Pep2Prn
calcLFQprnnums <- function (df, label_scheme, use_unique_pep = TRUE, 
                            group_psm_by = "pep_seq", group_pep_by = "gene", 
                            method_pep_prn = "lfq_top_3_sum", 
                            impute_prot_na = TRUE, use_spec_counts = FALSE)
{
  refChannels  <- label_scheme$Sample_ID[label_scheme[["Reference"]]]

  if (use_unique_pep) {
    pep_unique_by <- match_call_arg(normPSM, pep_unique_by)
    
    if (pep_unique_by == "group") {
      df <- df |> dplyr::filter(pep_razor_unique)
    }
    else if (pep_unique_by == "protein") {
      df <- df |> dplyr::filter(pep_literal_unique)
    }
    else {
      stop("`pep_unique_by` need to be `group` or `protein`.")
    }
  }
  
  if (use_spec_counts && method_pep_prn != "lfq_all") {
    stop("Require `method_pep_prn = lfq_all` for spectrum counting.")
  }

  prot_ints <- df |>
    dplyr::select(group_pep_by, grep("^I[0-9]{3}[NC]{0,1}", names(df)))
  prot_ints <- switch(
    method_pep_prn, 
    median = aggrNums(median)(prot_ints, !!rlang::sym(group_pep_by), na.rm = TRUE),
    mean = aggrNums(mean)(prot_ints, !!rlang::sym(group_pep_by), na.rm = TRUE), 
    lfq_all = aggrNums(sum)(prot_ints, !!rlang::sym(group_pep_by), na.rm = TRUE),
    lfq_max = aggrTopn(sum)(prot_ints, 1, na.rm = TRUE), 
    lfq_top_2_sum = aggrTopn(sum)(prot_ints, !!rlang::sym(group_pep_by), 2, na.rm = TRUE), 
    # with LFQ, e.g., for histogram color coding
    lfq_top_3_sum = aggrTopn(sum)(prot_ints, !!rlang::sym(group_pep_by), 3, na.rm = TRUE),
    top_3_mean = TMT_top_n(prot_ints, !!rlang::sym(group_pep_by), na.rm = TRUE), 
    weighted_mean = tmt_wtmean(prot_ints, !!rlang::sym(group_pep_by), na.rm = TRUE), 
    aggrNums(median)(prot_ints, !!rlang::sym(group_pep_by), na.rm = TRUE))
  
  if (use_spec_counts) { # derive log2FCs from summed spec counts
    prot_log2rs <- prot_ints |>
      calc_lfq_log2r("I000", refChannels) |>
      dplyr::mutate_if(is.numeric, function (x) replace(x, is.nan(x), NA_real_))
  }
  else { # use log2FCs (previously from MS1 peak areas)
    nms <- names(df)
    prot_log2rs <- df |>
      dplyr::select(group_pep_by, grep("^log2_R[0-9]{3}[NC]{0,1}", nms))
    prot_log2rs <- 
      aggrNums(median)(prot_log2rs, !!rlang::sym(group_pep_by), na.rm = TRUE)
  }

  if (impute_prot_na) {
    cols <- grep("^log2_R[0-9]{3}[NC]{0,1}", names(prot_log2rs))
    
    if (length(cols) > 1L) {
      class(prot_log2rs) <- "data.frame"
      prot_log2rs[, cols] <- 
        impPepNA(prot_log2rs[, cols, drop = FALSE], is_intensity = FALSE)
    }
  }
  
  # --- median centering ---
  if (use_spec_counts) {
    dfw <- prot_log2rs
  }
  else {
    dfw <- prot_ints |> dplyr::left_join(prot_log2rs, by = group_pep_by)
  }

  cols_log2Ratio <- grepl("^log2_R[0-9]{3}[NC]{0,1}", names(dfw))
  
  # cfs <- apply(dfw[, cols_log2Ratio, drop = FALSE], 2, median, na.rm = TRUE)
  cfs <- lapply(dfw[, cols_log2Ratio, drop = FALSE], function (x) {
    median(x[x >= -2 & x <= 2], na.rm = TRUE)
  }) |>
    unlist()
  
  dfw <- sweep(dfw[, cols_log2Ratio, drop = FALSE], 2, cfs, "-") %>%
    `colnames<-`(paste("N", names(.), sep="_"))	%>%
    dplyr::bind_cols(dfw, .)
  
  dfw <- sweep(dfw[, grepl("^I[0-9]{3}", names(dfw)), drop = FALSE], 
               2, 2^cfs, "/") %>%
    `colnames<-`(paste("N", names(.), sep="_"))	%>%
    dplyr::bind_cols(dfw, .)
  
  if (!length(grep("log2_R000", names(dfw)))) {
    stop("No \"log2_R000...\" columns available.\n")
  }
  
  dfw <- dfw %>% 
    dplyr::mutate_at(.vars = grep("log2_R000\\s", names(.)), 
                     function (x) replace(x, is.infinite(x) | is.nan(x), 
                                          NA_real_))
  
  if (!length(grep("^Z_log2_R[0-9]{3}[NC]{0,1}", names(dfw)))) {
    dfw <- dfw %>% 
      dplyr::select(grep("^N_log2_R[0-9]{3}[NC]{0,1}", names(.))) %>% 
      `names<-`(gsub("^N_log2_R", "Z_log2_R", names(.))) %>% 
      dplyr::bind_cols(dfw, .)
  }
  
  nms <- names(dfw)
  dfw <- dplyr::bind_cols(
    dfw |> dplyr::select(group_pep_by),
    dfw |> dplyr::select(grep("^I000 \\(", nms)),
    dfw |> dplyr::select(grep("^N_I000 \\(", nms)),
    dfw |> dplyr::select(grep("^log2_R000 \\(", nms)),
    dfw |> dplyr::select(grep("^N_log2_R000 \\(", nms)),
    dfw |> dplyr::select(grep("^Z_log2_R000 \\(", nms)), )
}


#' Helper: calculates the TMT log2FC and reporter-ion intensity of proteins
#' 
#' @param id Always "prot_acc".
#' @inheritParams calcLFQprnnums
#' @inheritParams Pep2Prn
#' @inheritParams normPSM
calc_tmt_prnnums <- function (df, use_unique_pep = TRUE, id = "prot_acc", 
                              method_pep_prn = "median", use_spec_counts = FALSE) 
{
  if (use_unique_pep && "pep_isunique" %in% names(df)) {
    df <- df %>% dplyr::filter(pep_isunique)
  }
  
  df_num <- df %>% 
    dplyr::select(id, grep("log2_R[0-9]{3}|I[0-9]{3}", names(.))) %>% 
    dplyr::select(-grep("^sd_log2_R[0-9]{3}", names(.))) %>% 
    dplyr::group_by(!!rlang::sym(id))

  df_num <- switch(
    method_pep_prn, 
    median = aggrNums(median)(df_num, !!rlang::sym(id), na.rm = TRUE),
    mean = aggrNums(mean)(df_num, !!rlang::sym(id), na.rm = TRUE), 
    top_3_mean = TMT_top_n(df_num, !!rlang::sym(id), na.rm = TRUE), 
    weighted_mean = tmt_wtmean(df_num, !!rlang::sym(id), na.rm = TRUE), 
    aggrNums(median)(df_num, !!rlang::sym(id), na.rm = TRUE))
  
  invisible(df_num)
}


#' Helper of Pep2Prn
#' 
#' @param gn_rollup Logical; if TRUE, rolls up protein accessions to gene names.
#' @param engine The name of search engine.
#' @param use_mq_prot Logical; use MQ protein table or not.
#' @param use_mf_prot Logical; use MSFragger protein table or not.
#' @inheritParams info_anal
#' @inheritParams Pep2Prn
#' @inheritParams normPSM
pep_to_prn <- function(id = "prot_acc", method_pep_prn = "median", 
                       use_unique_pep = TRUE, gn_rollup = TRUE, 
                       rm_outliers = FALSE, rm_allna = FALSE, 
                       engine = "mz", impute_prot_na = TRUE, 
                       use_mq_prot = FALSE, use_mf_prot = FALSE, 
                       use_spec_counts = FALSE, ...)
{
  dat_dir <- get_gl_dat_dir()
  label_scheme <- load_ls_group(dat_dir, label_scheme)
  tmt_plex <- TMT_plex(label_scheme)

  # `id` is always "prot_acc"; `group_pep_by` could be "gene"
  id <- rlang::as_string(rlang::enexpr(id))
  
  filter_dots <- rlang::enexprs(...) %>% 
    .[purrr::map_lgl(., is.language)] %>% 
    .[grepl("^filter_", names(.))]
  
  df <- readr::read_tsv(file.path(dat_dir, "Peptide/Peptide.txt"), 
                        col_types = get_col_types(), 
                        show_col_types = FALSE) |> 
    suppressWarnings()
  
  df <- df %>% 
    filters_in_call(!!!filter_dots) %>% 
    { if (tmt_plex && rm_allna) 
      .[rowSums(!is.na(.[grepl("^log2_R[0-9]{3}[NC]{0,1}", names(.))])) > 0, ] 
      else . } 
  
  if (rm_outliers) {
    df <- local({
      df$pep_index <- seq_along(1:nrow(df))
      
      dfw_split <- df %>% 
        dplyr::select(!!rlang::sym(id), 
                      pep_index, 
                      grep("^log2_R[0-9]{3}[NC]{0,1}", names(.))) %>% 
        dplyr::group_by(!!rlang::sym(id)) %>%
        `colnames<-`(gsub("^log2_R", "X", names(.))) %>%
        dplyr::mutate_at(.vars = grep("^X[0-9]{3}", names(.)), 
                         ~ replace(.x, is.infinite(.x), NA)) %>% 
        data.frame(check.names = FALSE) %>% 
        split(.[[id]], drop = TRUE)
      
      range_colRatios <- grep("^X[0-9]{3}", names(dfw_split[[1]]))
      
      dfw_split <- dfw_split %>% 
        purrr::map(locate_outliers, range_colRatios) %>% 
        dplyr::bind_rows() %>%
        dplyr::mutate_at(.vars = grep("^X[0-9]{3}", names(.)), 
                         function (x) replace(x, is.infinite(x), NA_real_)) %>% 
        tidyr::unite(prot_acc_i, !!rlang::sym(id), pep_index, sep = ":") %>%
        dplyr::mutate_at(.vars = grep("^X[0-9]{3}", names(.)), 
                         function (x) replace(x, !is.na(x), 1))
      
      df <- df %>% 
        tidyr::unite(prot_acc_i, !!rlang::sym(id), pep_index, sep = ":") %>%
        dplyr::left_join(dfw_split, by = "prot_acc_i") %>%
        tidyr::separate(prot_acc_i, into = c(id, "pep_index"), 
                        sep = ":", remove = TRUE) %>%
        dplyr::select(-pep_index)
    })
    
    df[, grepl("^I[0-9]{3}", names(df))] <-
      purrr::map2(as.list(df[, grepl("^I[0-9]{3}", names(df))]),
                  as.list(df[, grepl("^X[0-9]{3}", names(df))]), `*`) %>%
      dplyr::bind_rows()
    
    df[, grepl("^N_I[0-9]{3}", names(df))] <-
      purrr::map2(as.list(df[, grepl("^N_I[0-9]{3}", names(df))]),
                  as.list(df[, grepl("^X[0-9]{3}", names(df))]), `*`) %>%
      dplyr::bind_rows()
    
    df[, grepl("^log2_R[0-9]{3}", names(df))] <-
      purrr::map2(as.list(df[, grepl("^log2_R[0-9]{3}", names(df))]),
                  as.list(df[, grepl("^X[0-9]{3}", names(df))]), `*`) %>%
      dplyr::bind_rows()
    
    df[, grepl("^N_log2_R[0-9]{3}", names(df))] <-
      purrr::map2(as.list(df[, grepl("^N_log2_R[0-9]{3}", names(df))]),
                  as.list(df[, grepl("^X[0-9]{3}", names(df))]), `*`) %>%
      dplyr::bind_rows()
    
    df[, grepl("^Z_log2_R[0-9]{3}", names(df))] <-
      purrr::map2(as.list(df[, grepl("^Z_log2_R[0-9]{3}", names(df))]),
                  as.list(df[, grepl("^X[0-9]{3}", names(df))]), `*`) %>%
      dplyr::bind_rows()
    
    df <- df %>% 
      { if (rm_allna) 
        .[rowSums(!is.na(.[grepl("^log2_R[0-9]{3}[NC]{0,1}", names(.))])) > 0, ] 
        else . } %>% 
      dplyr::select(-grep("^X[0-9]{3}[NC]{0,1}", names(.)))
  } 

  if (!"pep_isunique" %in% names(df)) {
    df$pep_isunique <- TRUE
    warning("Column \"pep_isunique\" created and TRUE values assumed.")
  }
  else if (all(is.na(df$pep_isunique))) {
    df$pep_isunique <- TRUE
    warning("Values of \"pep_isunique\" are all NA and coerced to TRUE.")
  }
  
  group_psm_by <- match_call_arg(normPSM, group_psm_by)
  group_pep_by <- match_call_arg(normPSM, group_pep_by)
  
  # add `prot_n_uniqpep` and `prot_n_uniqpsm`
  df <- local({
    df_shared <- df |>
      dplyr::select(!!rlang::sym(group_psm_by), !!rlang::sym(group_pep_by), 
                    pep_n_psm, prot_n_psm, prot_n_pep, pep_isunique) |>
      dplyr::filter(!pep_isunique)
    
    # may need to take species into account... same gene different species
    
    prot_n_sharepeps <- df_shared %>% 
      dplyr::select(!!rlang::sym(group_psm_by), !!rlang::sym(group_pep_by)) |>
      dplyr::group_by(!!rlang::sym(group_pep_by)) |>
      dplyr::summarise(prot_n_sharepeps = dplyr::n())
    
    prot_n_sharepsms <- df_shared |>
      dplyr::select(!!rlang::sym(group_pep_by), pep_n_psm) |>
      dplyr::group_by(!!rlang::sym(group_pep_by)) |>
      dplyr::summarise(prot_n_sharepsms = sum(pep_n_psm))

    df <- list(df, prot_n_sharepeps, prot_n_sharepsms) |>
      purrr::reduce(dplyr::left_join, by = group_pep_by) |>
      dplyr::mutate(prot_n_sharepeps = replace(prot_n_sharepeps, 
                                               is.na(prot_n_sharepeps), 0), 
                    prot_n_sharepsms = replace(prot_n_sharepsms, 
                                               is.na(prot_n_sharepsms), 0)) |>
      dplyr::mutate(prot_n_uniqpep = prot_n_pep - prot_n_sharepeps, 
                    prot_n_uniqpsm = prot_n_psm - prot_n_sharepsms) |>
      dplyr::select(-prot_n_sharepeps, -prot_n_sharepsms)
    
    df <- df %>% 
      ins_cols_after(which(names(.) == "prot_n_psm"), 
                     which(names(.) == "prot_n_uniqpsm")) %>% 
      ins_cols_after(which(names(.) == "prot_n_pep"), 
                     which(names(.) == "prot_n_uniqpep"))
  })
  
  # first by `id = prot_acc` and later optional roll-up to `gene`
  if (tmt_plex) {
    df_num <- calc_tmt_prnnums(
      df = df, use_unique_pep = use_unique_pep, id = id, 
      method_pep_prn = method_pep_prn, use_spec_counts = use_spec_counts)
  }
  else {
    if (engine == "mz") {
      df_num <- calcLFQprnnums(
        df, label_scheme = label_scheme, use_unique_pep = use_unique_pep, 
        group_psm_by = group_psm_by, group_pep_by = id, 
        method_pep_prn = method_pep_prn, impute_prot_na = impute_prot_na, 
        use_spec_counts = use_spec_counts)
    }
    else if (engine == "mf") {
      if (use_mf_prot) {
        df_num <- 
          fillMFprnnums(dat_dir, use_spec_counts = use_spec_counts,prob_co = .99)
      }
      else {
        df_num <- calcLFQprnnums(
          df, label_scheme = label_scheme, use_unique_pep = use_unique_pep, 
          group_psm_by = group_psm_by, group_pep_by = id, 
          method_pep_prn = method_pep_prn, impute_prot_na = impute_prot_na, 
          use_spec_counts = use_spec_counts)
      }
    }
    else if (engine == "mq") {
      if (use_mq_prot) {
        df_num <- fillMQprnnums(dat_dir, use_spec_counts = use_spec_counts)
      }
      else {
        df_num <- calcLFQprnnums(
          df, label_scheme = label_scheme, use_unique_pep = use_unique_pep, 
          group_psm_by = group_psm_by, group_pep_by = id, 
          method_pep_prn = method_pep_prn, impute_prot_na = impute_prot_na, 
          use_spec_counts = use_spec_counts)
      }
    }
    
    df_num <- df_num |> dplyr::filter(!is.na(!!rlang::sym(id)))
  }
  
  df_num <- df_num %>% 
    dplyr::mutate(mean_lint = 
                    log10(rowMeans(.[, grepl("^N_I[0-9]{3}[NC]{0,1}", names(.)), 
                                     drop = FALSE], na.rm = TRUE)), 
                  mean_lint = round(mean_lint, digits = 2L))
  
  # nms <- names(df_num)
  count_nna <- df_num %>%
    dplyr::select(grep("N_log2_R[0-9]{3}[NC]{0,1}", names(.))) %>%
    dplyr::select(-grep("^N_log2_R[0-9]{3}[NC]{0,1}\\s\\(Ref\\.[0-9]+\\)$", 
                        names(.))) %>%
    dplyr::select(-grep("^N_log2_R[0-9]{3}[NC]{0,1}\\s\\(Empty\\.[0-9]+\\)$", 
                        names(.))) %>%
    is.na() |>
    magrittr::not() |>
    rowSums()
  
  df_num <- dplyr::bind_cols(count_nna = count_nna, df_num) |>
    reloc_col_before("mean_lint", "count_nna")
  
  df <- df %>% 
    dplyr::select(-grep("log2_R[0-9]{3}|I[0-9]{3}", names(.))) %>% 
    dplyr::select(-which(names(.) %in% c("pep_istryptic", "pep_semitryptic", 
                                         "mean_lint", "count_nna", 
                                         "shared_prot_accs", "shared_genes"))) %>% 
    dplyr::select(-grep("^Reporter mass deviation", names(.))) %>% 
    dplyr::select(-which(names(.) %in% c("m/z", "PIF", "PEP"))) %>% 
    dplyr::select(-which(names(.) %in% stringr::str_to_title(
      c("Charge", "Mass", "Mass error [ppm]", 
        "Mass error [Da]", "Score", "Combinatorics", 
        "Fraction of total spectrum", 
        "Base peak fraction", 
        "Precursor Intensity", "Precursor intensity", 
        "Precursor Apex Fraction", 
        "Intensity coverage", "Intensity Coverage", 
        "Peak coverage", "Peak Coverage",  
        "Proteins", "Protein group IDs")
    ))) %>% 
    dplyr::select(-which(names(.) %in% c("Is Unique", "Protein", "Gene", 
                                         "Mapped Genes", 
                                         "Mapped Proteins", "Nextscore", 
                                         "PeptideProphet Probability")))

  mq_median_keys <- NULL
  df_mq_med <- df %>% 
    dplyr::select(!!rlang::sym(id), which(names(.) %in% mq_median_keys)) %>% 
    dplyr::group_by(!!rlang::sym(id)) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  df <- df %>% dplyr::select(-which(names(.) %in% mq_median_keys))
  rm(list = "mq_median_keys")
  
  sm_median_keys <- c(
    "deltaForwardReverseScore", "percent_scored_peak_intensity", "totalIntensity", 
    "precursorAveragineChiSquared", "precursorIsolationPurityPercent", 
    "precursorIsolationIntensity", "ratioReporterIonToPrecursor", 
    "matched_parent_mass", "delta_parent_mass", "delta_parent_mass_ppm")
  df_sm_med <- df %>% 
    dplyr::select(!!rlang::sym(id), which(names(.) %in% sm_median_keys)) %>% 
    dplyr::group_by(!!rlang::sym(id)) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  df <- df %>% dplyr::select(-which(names(.) %in% sm_median_keys))
  rm(list = "sm_median_keys")
  
  mf_median_keys <- NULL
  df_mq_med <- df %>% 
    dplyr::select(!!rlang::sym(id), which(names(.) %in% mf_median_keys)) %>% 
    dplyr::group_by(!!rlang::sym(id)) %>% 
    dplyr::summarise_all(~ median(.x, na.rm = TRUE))
  df <- df %>% dplyr::select(-which(names(.) %in% mf_median_keys))
  rm(list = "mf_median_keys")

  df_first <- df %>% 
    dplyr::filter(!duplicated(!!rlang::sym(id))) %>% 
    dplyr::select(-grep("^pep_", names(.)))
  
  df <- list(df_first, 
             df_mq_med, 
             df_sm_med, 
             df_num) %>% 
    purrr::reduce(left_join, by = id) %>% 
    data.frame(check.names = FALSE)
  
  nms <- names(df)
  cols <- grepl("log2_R[0-9]{3}", nms) & !sapply(df, is.logical)
  df[, cols] <- df[, cols] |>
    dplyr::mutate_if(is.integer, as.numeric) |>
    round(digits = 3L)

  cols <- grepl("I[0-9]{3}", nms) & !sapply(df, is.logical)
  df[, cols] <- df[, cols] |>
    dplyr::mutate_if(is.integer, as.numeric) |>
    round(digits = 0L)
  
  df <- dplyr::bind_cols(
    df[, !grepl("I[0-9]{3}|log2_R[0-9]{3}", nms), drop = FALSE], 
    df[, grep("^I[0-9]{3}", nms), drop = FALSE], 
    df[, grep("^N_I[0-9]{3}", nms), drop = FALSE], 
    df[, grep("^log2_R[0-9]{3}", nms), drop = FALSE], 
    df[, grep("^N_log2_R[0-9]{3}", nms), drop = FALSE], 
    df[, grep("^Z_log2_R[0-9]{3}", nms), drop = FALSE])
  
  if (rm_allna) {
    df <- df %>% 
      .[rowSums(!is.na(.[grepl("^N_log2_R[0-9]{3}[NC]{0,1}", names(.))])) > 0, ]
  }

  if (gn_rollup) {
    # don't move: for keeping track of the original column names
    nms <- names(df)
    
    df$mean_lint <- df$count_nna <- NULL

    dfa <- local({
      dfa <- df %>% 
        dplyr::select(gene, grep("I[0-9]{3}|log2_R[0-9]{3}", names(.))) %>% 
        dplyr::filter(!is.na(gene)) %>% 
        dplyr::group_by(gene) %>% 
        dplyr::summarise_all(list(function (x) median(x, na.rm = TRUE)))
      
      dfa <- dfa %>% 
        dplyr::mutate(
          mean_lint = log10(rowMeans(.[, grepl("^N_I[0-9]{3}[NC]{0,1}", names(.)), 
                                       drop = FALSE], na.rm = TRUE)), 
          mean_lint = round(mean_lint, digits = 2L))

      count_nna <- dfa %>% 
        dplyr::select(grep("N_log2_R[0-9]{3}[NC]{0,1}", 
                           names(.)))%>% 
        dplyr::select(-grep("^N_log2_R[0-9]{3}[NC]{0,1}\\s\\(Ref\\.[0-9]+\\)$", 
                            names(.))) %>% 
        dplyr::select(-grep("^N_log2_R[0-9]{3}[NC]{0,1}\\s\\(Empty\\.[0-9]+\\)$", 
                            names(.))) %>% 
        is.na() %>% 
        magrittr::not() %>% 
        rowSums() 
      
      dfa <- dplyr::bind_cols(count_nna = count_nna, dfa) %>% 
        reloc_col_before("mean_lint", "count_nna")
    })

    dfb <- df %>% 
      dplyr::select(-prot_cover, 
                    -grep("I[0-9]{3}|log2_R[0-9]{3}", names(.))) %>% 
      dplyr::filter(!is.na(gene)) %>% 
      dplyr::filter(!duplicated(gene))
    
    dfc <- suppressWarnings(
      df %>% 
        dplyr::select(gene, prot_cover) %>% 
        dplyr::filter(!is.na(gene)) %>% 
        dplyr::group_by(gene) %>% 
        dplyr::summarise_all(~ max(.x, na.rm = TRUE)))

    df <- list(dfc, dfb, dfa) %>% 
      purrr::reduce(right_join, by = "gene") %>% 
      dplyr::filter(!is.na(gene), !duplicated(gene)) %>% 
      dplyr::select(nms)
  }
  
  df <- dplyr::bind_cols(
    df %>% dplyr::select(grep("^prot_", names(.))), 
    df %>% dplyr::select(-grep("^prot_", names(.))))
}


#' Assign duplicated peptides to a leading protein
#' @param df A PSM data frame
#' @inheritParams mergePep
#' @inheritParams annotPSM
assign_duppeps <- function(df, group_psm_by = "pep_seq", 
                           group_pep_by = "pep_seq_mod", use_duppeps = TRUE, 
                           duppeps_repair = "denovo") 
{
  # Scenario: 
  # In `dat_file_1`, peptide_x assigned to Prn_MOUSE against "human + mouse" databases.
  # In `dat_file_2` the same peptide_x assigned to PRN_HUMAN against "human only" database.
  # When combining, `dat_file_1` and `dat_file_2`, all the peptide entries will be 
  #   re-assigned to the protein id with the greater `prot_n_pep`.
  
  message("Assigning multiple-dipped peptide sequences.\n")

  dat_dir <- get_gl_dat_dir()
  
  dup_peps <- df |>
    dplyr::select(!!rlang::sym(group_psm_by), !!rlang::sym(group_pep_by)) |>
    dplyr::group_by(!!rlang::sym(group_psm_by)) |>
    dplyr::summarise(N = dplyr::n_distinct(!!rlang::sym(group_pep_by))) |>
    dplyr::filter(N > 1L)
  
  if (nrow(dup_peps)) {
    if (use_duppeps) {
      if (duppeps_repair == "denovo") {
        df <- local({
          # grps <- readRDS(file.path(dat_dir, "grps.rds"))
          grps <- mzion::groupProts(unique(df[, c("prot_acc", "pep_seq")]), 
                                     out_path = dat_dir)
          sets <- readRDS(file.path(dat_dir, "prot_pep_setcover.rds"))
          ids <- with(sets, paste0(prot_acc, ".", pep_seq))
          
          # e.g. "prot_hit_num", "prot_family_member" may be not in df
          col_nms <- names(df)

          grps <- grps %>% 
            .[, names(.) %in% col_nms] %>% 
            tidyr::unite(prot_pep, prot_acc, pep_seq, sep = ".", remove = FALSE) %>% 
            dplyr::filter(prot_pep %in% ids) %>% 
            dplyr::select(-prot_pep)

          updated_nms <- names(grps) %>% .[! . == "pep_seq"]

          ans <- df %>% 
            dplyr::select(-which(names(.) %in% updated_nms)) %>% 
            dplyr::right_join(grps, by = "pep_seq") %>% 
            dplyr::select(col_nms)

          # keep the original pep_n_psm
          # update prot_n_psm, prot_n_pep and pep_n_psm later...
          # update other ^prot_ fields
          # and more...
          
          ans
        })
      }
      else if (duppeps_repair == "majority") {
        df <- local({
          # to ensure the same order in column names during replacement
          col_nms <- suppressWarnings(
            df %>% 
              dplyr::select(grep("^prot_", names(.)), 
                            one_of(c("gene", "acc_type", "entrez", "species", 
                                     "kin_attr", "kin_class", "kin_order"))) %>% 
              dplyr::select(-one_of(c("prot_n_psm", "prot_n_pep", "prot_cover", 
                                      "prot_matches_sig", "prot_sequences_sig"))) %>% 
              names()
          )
          
          rows <- df[[group_psm_by]] %in% dup_peps[[group_psm_by]]
          
          dups <- df[rows, ]
          unis <- df[!rows, ]
          
          ans <- lapply(split(dups, dups[[group_psm_by]]), 
                        replace_by_rowone, col_nms) %>% 
            dplyr::bind_rows() %>% 
            dplyr::select(names(df))
          
          dplyr::bind_rows(unis, ans)
        })
      }
      else {
        stop("Invalide choice of `duppeps_repair`." )
      }

      if (FALSE) {
        # update `dup_peps`; should be empty
        dup_peps_af <- df %>% 
          dplyr::filter(!!rlang::sym(group_psm_by) %in% dup_peps[[group_psm_by]]) %>%
          dplyr::select(!!rlang::sym(group_psm_by), !!rlang::sym(group_pep_by)) %>%
          dplyr::group_by(!!rlang::sym(group_psm_by)) %>%
          dplyr::summarise(N = n_distinct(!!rlang::sym(group_pep_by))) %>%
          dplyr::filter(N > 1)
        
        if (nrow(dup_peps_af)) {
          write.csv(dup_peps_af, file.path(dat_dir, "Peptide/dbl_dipping_peptides.csv"), 
                    row.names = FALSE)
          
          df <- df %>% 
            dplyr::filter(! (!!rlang::sym(group_psm_by) %in% dup_peps_af[[group_psm_by]]))
        }
      }
      
    } 
    else {
      write.csv(dup_peps, file.path(dat_dir, "Peptide/dbl_dipping_peptides.csv"), 
                row.names = FALSE)
      
      df <- df %>% 
        dplyr::filter(! (!!rlang::sym(group_psm_by) %in% dup_peps[[group_psm_by]]))
    }
  }
  else {
    # save the pep-prot map...
  }
  
  invisible(df)
}


#' Replaces values by the first row.
#'
#' @param df A data frame.
#' @param col_nms The column names under which the values in \code{df} will be
#'   replaced with those in the first row.
replace_by_rowone <- function (df, col_nms) 
{
  stopifnot(all(c("prot_n_pep", "prot_n_psm", "prot_mass") %in% names(df)))
  
  df <- df %>% 
    dplyr::arrange(-prot_n_pep, -prot_n_psm, -prot_mass)
  
  # if (group_pep_by == "prot_acc") use the first prot_acc and also the first gene
  # if (group_pep_by == "gene") use the first gene and also the first prot_acc
  
  cols_replace <- df[1, col_nms]
  
  df2 <- df[-1, ]
  df2[, col_nms] <- cols_replace
  
  dplyr::bind_rows(df[1, ], df2)
}


#' Imputes NA values in a peptide table
#'
#' @param df A intensity data frame (not matrix) of peptides.
#' @param fold The fold difference to mean.
#' @param is_intensity Logical; is intensity data or not (log2FC).
impPepNA <- function (df, fold = 50, is_intensity = TRUE)
{
  # if (!is.data.frame(df)) stop("Input `df` needs to be a data frame.")
  set.seed(1422)
  
  vmax <- max(df, na.rm = TRUE)
  vmin <- min(df, na.rm = TRUE)
  fold <- ceiling(max(abs(vmax), abs(vmin)) * 10)
  
  nas <- is.na(df)
  
  if (sum(nas) == 0) {
    return(df)
  }
  
  if (is_intensity) {
    mv   <- mean(df[!nas]) / fold
    errs <- 2^rnorm(sum(nas), 0, .5)
  }
  else {
    mv   <- mean(df[!nas])
    errs <- 2^rnorm(sum(nas), -2, .5)
  }

  nvs  <- mv * errs
  df[nas] <- nvs

  df
}


#' Find MBR MS1 files
#'
#' @param dat_dir The working directory
#' @param n_files The number of files.
#' @param abort Logical; to abort the run or not. TRUE at \link{mergePep};
#'   otherwise, \code{df} is unique by pep_seq_modz and need to be collapsed to
#'   pep_seq_mod.
find_mbr_ms1files <- function(dat_dir, n_files, abort = FALSE)
{
  ms1files <- list.files(dat_dir, pattern = "^ms1full_.*\\.rds$", 
                         full.names = TRUE, recursive = TRUE)
  n_ms1fis <- length(ms1files)
  
  if (n_ms1fis) {
    path_ms1 <- unique(dirname(ms1files))
    
    if (length(path_ms1) > 1L) {
      path_ms1 <- path_ms1[[1]]
      # need to check duplicated files
      ms1files <- list.files(path_ms1, pattern = "^ms1full_.*\\.rds$")
      n_ms1fis <- length(ms1files)
    }
    else {
      ms1files <- basename(ms1files)
    }
  }
  
  if (!n_ms1fis) {
    if (abort) {
      stop("MS1 peak lists of `^ms1full_[...].rds` not found for MBR.\n", 
           "Please copy them from the Mzion folder ", 
           "to your working directory.")
    }
    
    ok_mbr <- FALSE
  }
  else if (n_ms1fis != n_files) {
    if (abort) {
      stop("The number of MS1 `^ms1full_[...].rds` files is different ", 
           "to the number of peptide ", pat, " files.")
    }
    
    ok_mbr <- FALSE
  }
}
qzhang503/proteoQ documentation built on April 13, 2025, 8:31 a.m.