proteomixr: Label-free quantitative proteomics & D3-Leu turnover

Documented in proteome

#' Proteome
#'
#' Creates a data frame with an average abundance of the top ionizers for each protein
#' @param file_name file containing all peptides for all samples with the columns named Annotated.Sequence, Modifications, Master.Protein.Accessions.
#' @param organism "mouse", "human" or "rat"
#' @param wt_samps "all" if you want to use all samples to weight, otherwise use a pattern that is present in the column names of the samples you wish to use for the weighting
#' @param group if you want to group samples then \code{TRUE}
#' @param group_names indicate names of the groups (must be a string pattern present in the samples)
#' @param top_pep number of top ionizers used
#' @param min_pep minumum number of peptides found in a protein to be incuded in the analysis
#' @param norm set to TRUE if normalization is wanted
#' @param norm_method indicate normalization method: "protein" or "sum_total"
#' @param norm_pro accession of protein used for
#' @param csv if result file should be written in working directory
#' @keywords whole proteome analysis
#' @export
#' @examples
#' proteome("WT vs KO all pep.csv", group_names = c("WT", "KO"),
#'          organism = "mouse", group = TRUE, norm = TRUE,
#'          norm_method = "protein", norm_pro = "B2RTM0", csv = FALSE)

proteome <- function(file_name, organism,
                     wt_samps = "all", group = FALSE, group_names = NULL,
                     top_pep = 3, min_pep = 3,
                     norm = FALSE, norm_method = NULL, norm_pro = NULL,
                     csv = TRUE){

  # import data frame
  dat <- read.csv(file_name)

  # data frame format check
  if (colnames(dat)[1] != "Annotated.Sequence" &
      colnames(dat)[2] != "Modifications" &
      colnames(dat)[3] != "Master.Protein.Accessions")
    stop ("invalid data frame: must have Annotated.Sequence, Modifications,
          Master.Protein.Accessions")

  # import gene accession
  if (organism == "mouse") {
    gene_df <- read.csv(
      "C:/Users/PrevBeast/Documents/Fasta Files/Mouse/mouse_fasta_accession_gene.csv")
  }
  if (organism == "human"){
    gene_df <- read.csv(
      "C:/Users/PrevBeast/Documents/Fasta Files/Human/human_fasta_accession_gene.csv")
  }
  if (organism == "rat"){
    gene_df <- read.csv(
      "C:/Users/PrevBeast/Documents/Fasta Files/Rat/rat_fasta_accession_gene.csv")
  }


  if (organism != "mouse" & organism != "human" & organism != "rat")
    stop ("invalid organism name: only human, mouse or rat currently supported")


  # determine the top ionizers and sort by them taking only the top_pep
  if (wt_samps == "all"){
    wt_grp <- 4:ncol(dat)
  } else {
    if (sum(grepl(wt_samps, colnames(dat))) < 1)
      stop ("wt_samps must be a pattern that is present in the sample
            (column) names that are being used to weight the top ionizers")
    wt_grp <- grep(wt_samps, colnames(dat))
  }

  dat$wt_grp <- by_group(dat, wt_grp)

  df <- tbl_df(dat) %>%
    group_by(Master.Protein.Accessions) %>%
    top_n(n = top_pep, wt = wt_grp)


  # do normalization if indicated --------
  if (norm == TRUE){

    if (norm_method == "sum_total") {
      norm_value <- colSums(df[, wt_grp], na.rm = TRUE)

      raw_abun_mat <- as.matrix(df[, wt_grp])

      norm_abun <- t(t(raw_abun_mat)/norm_value)
      colnames(norm_abun) <- paste(colnames(norm_abun), sep = "_", "norm")
      norm_test <- as.data.frame(norm_abun)
      df <- tibble::as_tibble(df)
      df <- cbind(df, norm_test)

    }

    if (norm_method == "protein"){

      norm_pep <- subset(df, df$Master.Protein.Accessions == norm_pro)
      numeric_cols <- which(sapply(norm_pep, is.numeric) == TRUE)
      raw_abun <- numeric_cols[-length(numeric_cols)]
      norm_value <- sapply(norm_pep[, raw_abun], mean, na.rm = TRUE)
      raw_abun_mat <- as.matrix(df[, raw_abun])
      norm_abun <- t(t(raw_abun_mat)/norm_value)
      colnames(norm_abun) <- paste(colnames(norm_abun), sep = "_", "norm")
      norm_test <- as.data.frame(norm_abun)
      df <- tibble::as_tibble(df)
      df <- cbind(df, norm_test)
    }
  }

  df <- as.data.frame(df)

  if (group == TRUE){
    for (i in 1:length(group_names)){

      if (norm == TRUE){
        pat <- paste0(group_names[i], "_norm")
      } else {
        pat <- group_names[i]
      }
      samp_col_group <- grep(pat, colnames(df))
      df[, paste0(group_names[i], "_med")] <- by_group(df, samp_col_group)
    }

    pro_df <- by_protein(df, colnames(df)[grep("_med", colnames(df))]) %>%
      as.data.frame %>%
      rownames_to_column("Master.Protein.Accessions")

    colnames(pro_df) <- sub("_med", "", colnames(pro_df))

  }


  if (group == FALSE) {
    if (norm == FALSE) {
      all <- colnames(dat)[-c(1,2,3,length(dat))]

      pro_df <- by_protein(df, all) %>%
        as.data.frame %>%
        rownames_to_column("Master.Protein.Accessions")
    }
    if (norm == TRUE){
      pro_df <- by_protein(df, colnames(df)[grep("_norm", colnames(df))]) %>%
        as.data.frame %>%
        rownames_to_column("Master.Protein.Accessions")

      colnames(pro_df) <- sub("_norm", "", colnames(pro_df))
    }

  }


  pro_df$gene <- mpa_to_gene(pro_df, gene_df)

  pro_df$peptides <- table(dat$Master.Protein.Accessions)
  pro_df <- dplyr::filter(pro_df, pro_df$peptides >= min_pep)

  col_order <- c("gene", "Master.Protein.Accessions", "peptides",
                 colnames(pro_df)[2:(length(colnames(pro_df))-2)])

  pro_df <- pro_df[, col_order]

  colnames(pro_df)[1] <- "Gene"
  colnames(pro_df)[2] <- "Accession"
  colnames(pro_df)[3] <- "# of peptides"

  pro_df <- dplyr::filter(pro_df, Accession != "")

  file_output <- paste0(gsub(".csv", "", file_name), "_r_",
                        format(Sys.time(), "%m%d%y%H%M%S"),
                        ".csv")

  if (csv == TRUE){
    write.csv(pro_df, file_output, row.names = FALSE)
    print(paste("result file", file_output ,"is located in", getwd()))
  }
  pro_df <- tibble::as_tibble(pro_df)
  return(pro_df)
}