R/imprints_cleaved_peptides.R

Defines functions curve find_cutoff nb_pepval inflex imprints_cleaved_peptides

Documented in imprints_cleaved_peptides

#' imprints_cleaved_peptides
#'
#' Function to find proteins that are potentially cleaved and their cleaved sites.
#' For more information, see Details section.
#'
#' @details
#' The idea of this function is to compute the sum from all fold change for each peptide and then compute the cumulative sum of
#' these summed fold change of every peptides for each protein.
#' If the protein is not cleaved, this cumulative sum should be constantly increasing/decreasing, i.e linear.
#' If the protein is cleaved, then this cumulative sum will either be convex or concave.
#' If convex, the last peptides are more abundant; if concave the first peptides are more abundant.
#' In the end a volcano plot will be plotted where the p-value corresponds to the combined p-value from
#' the six temperatures comparison between peptides located towards the N-terminal and the ones located
#' towards the C-terminal position.
#' You can then use the function \code{imprints_sequence_peptides} to plot the peptides before and after
#' the potential cleavage site.
#'
#' @param data The normalized peptides data set, i.e. the outpout from \code{imprints_normalize_peptides}.
#' @param data_diff The log2 fold-changes peptides data set, i.e. the outpout from \code{imprints_sequence_peptides}.
#'                  If NULL, it will be computed.
#' @param control The control treatment from your dataset.
#' @param min_ValidValue The minimum proportion of non-missing values per peptides.
#'                       Default is 0.4; so if 6 temperatures need at least 3 non missing values.
#' @param min_peptide The minimum number of peptides per protein to be considered a RESP candidate.
#'   Default is 4.
#' @param FDR The FDR used to obtained the final p-value cutoff. Default is 0.01
#' @param comb_pv_method The method used to combine p-values. Either george, fisher or edgington.
#'   Default is george. See Details.
#' @param RESP_score The RESP score cutoff. Default is 0.5
#' @param fixed_score_cutoff Logical to tell if you want to use a fixed cutoff for the RESP score.
#'   Default is FALSE. See Details.
#' @param categorize Logical to categorize or not the obtained hits using the function \code{\link{imprints_categorize_peptides}}.
#'   Default is TRUE.
#' @param curvature The curvature used for the curve on the volcano plot
#' @param folder_name The name of the folder in which you want to save the results.
#'
#' @return The potential cleaved sites from the proteins considered as cleaved.
#'   A folder will also be saved where you'll find a volcano plots and results data.
#'
#' @details
#' George's method correspond to the sum of the logit of the p-values, Fisher's to the sum of the
#' log of the p-values and Edgington's is the sum of the p-values.
#' Edgington's method is the most stringent and is particularly sensitive with higher p-values
#' whereas Fisher's method is the less stringent as it is mostly sensitive to low p-values.
#' George's method is a compromise between the two methods.
#' For more details read \link{https://doi.org/10.48550/arXiv.1707.06897}.
#'
#' @details
#' About the fixed_score_cutoff,if TRUE, the value RESP_score will directly be used as the cutoff and
#' for all treatments. If FALSE, the RESP score cutoff will be calculated as the value selected for
#' RESP_score plus the median of the scores of the proteins which have a p-value lower than the median
#' of all p-values for a given treatment.
#'
#' @export
#'

imprints_cleaved_peptides <- function(data, data_diff = NULL, control = "Vehicle",
                                      min_ValidValue = 0.4, min_peptide = 4,
                                      FDR = 0.01, comb_pv_method = c("george", "fisher", "edgington"),
                                      RESP_score = 0.5, fixed_score_cutoff = FALSE,
                                      categorize = TRUE,
                                      curvature = 0.05, folder_name = ""){
  if(!any(grepl(paste0("_", control, "$"), colnames(data)))){
    message(paste("Error:", control, "is wasn't found in your data !"))
    return()
  }

  comb_pv_method <- tolower(comb_pv_method)
  comb_pv_method <- match.arg(comb_pv_method)

  wd <- getwd()
  outdir <- paste0(wd, "/", format(Sys.time(), "%y%m%d_%H%M"), "_RESP_analysis",
                   ifelse(nchar(folder_name) > 0, "_", ""), folder_name)
  if (dir.exists(outdir) == FALSE) {
    dir.create(outdir)
  }

  require(limma)

  # if any, removing QP columns
  if(any(grepl("^36C_", colnames(data)))){
    data <- data[,-grep("^36C_", colnames(data))]
  }

  if(is.null(data_diff)){
    message("No fold-change dataset specified, computing...")
    data_diff <- imprints_sequence_peptides(data, control = control,
                                            dataset_name = deparse(substitute(data))
                                            )
  }

  if(any(grepl("^36C_", colnames(data_diff)))){
    data_diff <- data_diff[,-grep("^36C_", colnames(data_diff))]
  }

  if("countNum" %in% colnames(data)){
    data$countNum <- NULL
  }
  if("countNum" %in% colnames(data_diff)){
    data_diff$countNum <- NULL
  }

  data_cleaved <- data_diff
  if(any(grepl(paste0("_", control, "$"), colnames(data_cleaved)))){
      data_cleaved <- data_cleaved[,-grep(paste0("_", control, "$"), colnames(data_cleaved))]
  }

  # data_cleaved is output from 'imprints_sequence_peptides' --> remove unnecessary informations
  data_cleaved$Master.Protein.Accessions <- data_cleaved$Positions.in.Master.Proteins
  data_cleaved$Positions.in.Master.Proteins <- NULL
  data_cleaved$Annotated.Sequence <- NULL
  data_cleaved$Modifications <- NULL
  colnames(data_cleaved)[1] <- "id" # easier handle


  message("Averaging value, summing profiles...")
  # get summed value from all temperature for every peptide + # keeping track of number of non-missing values
  data_cleaved <- data_cleaved %>%
    tidyr::gather("treatment", "value", -id, -description) %>%
    tidyr::separate(treatment, into = c("temp", "biorep", "treatment"), sep = "_") %>%
    dplyr::group_by(id, description, temp, treatment, biorep) %>% # if peptide modified, mean values from same peptide sequence
    dplyr::summarise(value = mean(value, na.rm = TRUE)) %>%
    dplyr::mutate(N = paste(biorep, as.numeric(!is.na(value)), sep = "_")) %>%  # keeping track of number of non-missing values
    dplyr::ungroup() %>% dplyr::group_by(id, description, temp, treatment) %>%
    dplyr::summarise(mean_value = mean(value, na.rm = TRUE),
                     Nvalue = paste(N, collapse = ";")) %>%  # keeping track of number of non-missing values
    dplyr::mutate(Nvalue = paste(temp, Nvalue, sep = "-")) %>%
    dplyr::ungroup() %>% dplyr::group_by(id, description, treatment) %>%
    dplyr::filter(length(na.omit(mean_value))/length(mean_value) >= min_ValidValue) %>%  # keeping peptides with more than 40% of valid values
    dplyr::reframe(sum_profile = sum(mean_value, na.rm = TRUE),  # get sum value for each peptides --> sum all temperatures values
                   Nvalue = paste(Nvalue, collapse = "|"))   # keeping track of number of non-missing values


  message("Filtering and ordering peptides...")
  # separate id in protein and sequence (need to take into account protein groups)
  data_cleaved$sequence <- gsub("(?<= |^)(.{5,6}|A0.{6,8})(?= \\[)", "", data_cleaved$id, perl = TRUE)
  data_cleaved$sequence <- gsub(" ", "", data_cleaved$sequence)
  data_cleaved$sequence <- gsub(";", "; ", data_cleaved$sequence)

  data_cleaved$id <- gsub("(?<=\\[)\\d{1,4}-\\d{1,4}(?=\\])", "", data_cleaved$id, perl = TRUE)
  data_cleaved$id <- gsub(" \\[(\\]|\\];)", "", data_cleaved$id)
  data_cleaved$id <- gsub("(?<!;) ", "; ", data_cleaved$id, perl = TRUE)

  data_cleaved <- data_cleaved %>%
    dplyr::mutate(sequence = gsub("\\[|\\]", "", sequence)) %>%
    dplyr::ungroup() %>% dplyr::group_by(id, description, treatment) %>%
    dplyr::filter(length(sum_profile) >= min_peptide)  # only keeping proteins with more than min_peptide peptides

  # order data according proteins and sequence
  # ordering according sequence is capital
  data_cleaved$factor <- data_cleaved$id
  data_cleaved <- data_cleaved[order(data_cleaved$id),]
  ord <- sapply(strsplit(gsub(";.*", "", data_cleaved$sequence), "-|~"), # only taking first protein if proteinn group
                function(x) {sum(as.numeric(x))}
                )
  ord <- data.frame(factor = data_cleaved$factor, idx = 1:nrow(data_cleaved), sum.pos = ord) %>%
    dplyr::group_by(factor) %>%
    dplyr::mutate(ord.sum.pos = order(sum.pos),
                  nb.pep = length(ord.sum.pos),
                  final.order = idx[ord.sum.pos])

  data_cleaved <- data_cleaved[ord$final.order,]
  data_cleaved$Master.Protein.Accessions <- data_cleaved$factor
  data_cleaved$factor <- NULL

  message("Computing cumulative sum to find potential cleaved sites...")
  # compute cumulative sum for each protein
  data_cleaved <- data_cleaved %>%
    dplyr::group_by(id, description, treatment) %>%
    dplyr::mutate(cumsum_profile = cumsum(tidyr::replace_na(sum_profile, 0)))

  # get cleaved site
  # if cumulative abundance concave --> first derivative is decreasing globally --> inflexion point is where second derivative is minimum
  # if cumulative abundance convex --> first derivative is increasing globally --> inflexion point is where second derivative is maximum
  data_cleaved <- data_cleaved %>% dplyr::ungroup() %>%
    dplyr::group_by(id, description, treatment) %>%
    dplyr::group_modify(~ {
      cleaved_sites = .x$sequence[inflex(.x$cumsum_profile)]
      df <- sapply(cleaved_sites,
                   function(cleaved_site){
                     df <- .x
                     df$cleaved_site <- cleaved_site;
                     df
                   }, simplify = FALSE)
      df <- as.data.frame(Reduce(rbind, df))
      return(df)
    })# get back potential cleaved sites


  message("Checking cleaved sites, formating...")
  # set peptide position --> before cleaved site = N-terminal side / after cleaved site = C-terminal side
  # if protein group, only taking first protein
  data_cleaved$position_global <- unlist(lapply(strsplit(gsub(";.*", "", data_cleaved$sequence),
                                                 "~|-"),
                                        function(s) sum(as.numeric(s)))
                                 )
  data_cleaved$cleaved.global <- unlist(lapply(strsplit(gsub(";.*", "", data_cleaved$cleaved_site),
                                                "~|-"),
                                       function(s) sum(as.numeric(s)))
                                )

  data_cleaved <- data_cleaved %>%
    dplyr::mutate(position_global = ifelse(position_global < cleaved.global,
                                           "N", ifelse(sequence == cleaved_site,
                                                       "cleaved", "C"))
           ) %>%
    dplyr::select(-cleaved.global)

  # checking cleaved site, if not assign right position
  data_cleaved <- data_cleaved %>% dplyr::group_by(id, description, treatment, cleaved_site) %>%
    dplyr::mutate(cleaved_site_update = cleaved_site) %>%
    dplyr::group_modify(~ {
      val_N <- .x$sum_profile[which(.x$position_global == "N")]
      val_C <- .x$sum_profile[which(.x$position_global == "C")]
      cle <- .x$sum_profile[which(.x$position_global == "cleaved")]
      non_cleaved <- .x$cleaved_site_update[1]

      bef_cleaved <- .x$sequence[which(.x$position_global == "cleaved") -1]
      aft_cleaved <- .x$sequence[which(.x$position_global == "cleaved") +1]

      bef_cleaved <- sapply(strsplit(bef_cleaved, "; ")[[1]],
                            function(x){
                              x <- strsplit(x, "-|~")[[1]]
                              as.numeric(x[2]) + 1
                            }, USE.NAMES = FALSE)
      aft_cleaved <- sapply(strsplit(aft_cleaved, "; ")[[1]],
                            function(x){
                              x <- strsplit(x, "-|~")[[1]]
                              as.numeric(x[1]) - 1
                            }, USE.NAMES = FALSE)

      # proportion augmentation/diminution of sd
      pN <- (sd(c(val_N, cle), na.rm = TRUE)/sd(val_N, na.rm = TRUE))
      pC <- (sd(c(val_C, cle), na.rm = TRUE)/sd(val_C, na.rm = TRUE))

      if(any(is.na(c(pN, pC)))){ # if only one peptide in N or C --> sd is NA; only one of them can be NA (at least 4 peptides in total)
        # if only one, cleaved site is assign to this group
        .x$position_global[which(.x$position_global == "cleaved")] <- ifelse(is.na(pN), "N", "C")
        non_cleaved <- sapply(strsplit(non_cleaved, "; ")[[1]],
                              function(x){
                                x <- strsplit(x, "-|~")[[1]]
                                as.numeric(x)
                              }, simplify = FALSE)
        if(is.na(pN)){
          non_cleaved <- sapply(non_cleaved, "[[", 2)
          if(any(non_cleaved != aft_cleaved, na.rm = TRUE)){  # if position doesn't follow
            non_cleaved[which(non_cleaved != aft_cleaved)] <- non_cleaved[which(non_cleaved != aft_cleaved)] + 1
            if(any(non_cleaved > aft_cleaved, na.rm = TRUE)){ # cleaved position and after have common AA position
              non_cleaved[which(non_cleaved > aft_cleaved)] <- aft_cleaved
              aft_cleaved[which(aft_cleaved == non_cleaved)] <- aft_cleaved[which(aft_cleaved == non_cleaved)] + 1
            }
          }
          non_cleaved <- paste(non_cleaved, aft_cleaved, sep = "~")
        }
        else{
          non_cleaved <- sapply(non_cleaved, "[[", 1)
          if(any(non_cleaved != bef_cleaved, na.rm = TRUE)){  # if position doesn't follow
            non_cleaved[which(non_cleaved != bef_cleaved)] <- non_cleaved[which(non_cleaved != bef_cleaved)] - 1
            if(any(non_cleaved < bef_cleaved, na.rm = TRUE)){ # cleaved position and before have common AA position
              bef_cleaved[which(bef_cleaved > non_cleaved)] <- non_cleaved - 1
            }
          }
          non_cleaved <- paste(bef_cleaved, non_cleaved, sep = "~")
        }
        non_cleaved <- paste(non_cleaved, collapse = "; ")
      }
      else if(cle >= 0){ # to be cleaved, peptide should be less abundant, i.e. negative fold changes
        .x$position_global[which(.x$position_global == "cleaved")] <- ifelse(pN <= pC, "N", "C")
        non_cleaved <- sapply(strsplit(non_cleaved, "; ")[[1]],
                              function(x){
                                x <- strsplit(x, "-|~")[[1]]
                                as.numeric(x)
                              }, simplify = FALSE)
        if(pN <= pC){
          non_cleaved <- sapply(non_cleaved, "[[", 2)
          if(any(non_cleaved != aft_cleaved, na.rm = TRUE)){  # if position doesn't follow
            non_cleaved[which(non_cleaved != aft_cleaved)] <- non_cleaved[which(non_cleaved != aft_cleaved)] + 1
            if(any(non_cleaved > aft_cleaved, na.rm = TRUE)){ # cleaved position and after have common AA position
              non_cleaved[which(non_cleaved > aft_cleaved)] <- aft_cleaved
              aft_cleaved[which(aft_cleaved == non_cleaved)] <- aft_cleaved[which(aft_cleaved == non_cleaved)] + 1
            }
          }
          non_cleaved <- paste(non_cleaved, aft_cleaved, sep = "~")
        }
        else{
          non_cleaved <- sapply(non_cleaved, "[[", 1)
          if(any(non_cleaved != bef_cleaved, na.rm = TRUE)){  # if position doesn't follow
            non_cleaved[which(non_cleaved != bef_cleaved)] <- non_cleaved[which(non_cleaved != bef_cleaved)] - 1
            if(any(non_cleaved < bef_cleaved, na.rm = TRUE)){ # cleaved position and before have common AA position
              bef_cleaved[which(bef_cleaved > non_cleaved)] <- non_cleaved - 1
            }
          }
          non_cleaved <- paste(bef_cleaved, non_cleaved, sep = "~")
        }
        non_cleaved <- paste(non_cleaved, collapse = "; ")
      }
      else{
        if(any(c(pN, pC) < 1)){ # arbitrary, sd augmentation should be more than 100%
          .x$position_global[which(.x$position_global == "cleaved")] <- ifelse(pN <= pC, "N", "C")
          non_cleaved <- sapply(strsplit(non_cleaved, "; ")[[1]],
                                function(x){
                                  x <- strsplit(x, "-|~")[[1]]
                                  as.numeric(x)
                                }, simplify = FALSE)

          if(pN <= pC){
            non_cleaved <- sapply(non_cleaved, "[[", 2)
            if(any(non_cleaved != aft_cleaved, na.rm = TRUE)){  # if position doesn't follow
              non_cleaved[which(non_cleaved != aft_cleaved)] <- non_cleaved[which(non_cleaved != aft_cleaved)] + 1
              if(any(non_cleaved > aft_cleaved, na.rm = TRUE)){ # cleaved position and after have common AA position
                non_cleaved[which(non_cleaved > aft_cleaved)] <- aft_cleaved
                aft_cleaved[which(aft_cleaved == non_cleaved)] <- aft_cleaved[which(aft_cleaved == non_cleaved)] + 1
              }
            }
            non_cleaved <- paste(non_cleaved, aft_cleaved, sep = "~")
          }
          else{
            non_cleaved <- sapply(non_cleaved, "[[", 1)
            if(any(non_cleaved != bef_cleaved, na.rm = TRUE)){  # if position doesn't follow
              non_cleaved[which(non_cleaved != bef_cleaved)] <- non_cleaved[which(non_cleaved != bef_cleaved)] - 1
              if(any(non_cleaved < bef_cleaved, na.rm = TRUE)){ # cleaved position and before have common AA position
                bef_cleaved[which(bef_cleaved > non_cleaved)] <- non_cleaved - 1
              }
            }
            non_cleaved <- paste(bef_cleaved, non_cleaved, sep = "~")
          }
          non_cleaved <- paste(non_cleaved, collapse = "; ")
        }
      }
      .x$cleaved_site_update <- non_cleaved

      return(.x)
    }) %>%
    dplyr::filter(position_global != "cleaved") %>%
    dplyr::mutate(cleaved_site = cleaved_site_update) %>%
    dplyr::select(-cleaved_site_update) %>% unique()

  ### reducing cleavage site candidate
  # if a protein has 2 or 3 potential cleavage sites and that some are measured peptides,
  # meaning they were decreasing in level and considered as measured cleavage site;
  # then we only keep these as this mean that the two methods used to obtain the inflection point
  # overlapped well and that the measured peptides is very likely to be the actual cleavage site
  data_cleaved <- data_cleaved %>% ungroup() %>%
    dplyr::select(-sum_profile, -cumsum_profile, -Master.Protein.Accessions) %>%
    dplyr::group_by(id, description, treatment)  %>%
    dplyr::filter(length(unique(cleaved_site)) == 1 | length(unique(cleaved_site)) > 3 |
                    (length(unique(cleaved_site)) == 2 | length(unique(cleaved_site)) == 3) & (grepl("-", cleaved_site) | all(grepl("~", cleaved_site))))

  ### sum number of values per peptide N and peptide C
  data_cleaved <- data_cleaved %>% dplyr::ungroup() %>%
    dplyr::group_by(id, description, treatment, cleaved_site, position_global) %>%
    dplyr::summarise(Nvalue = nb_pepval(Nvalue),
                     Npep = length(sequence)
                     ) %>%
    dplyr::ungroup() %>% dplyr::group_by(id, description, treatment) %>%
    dplyr::mutate(id_cleaved = paste0(id, "_", as.numeric(factor(cleaved_site))))

  message("Summing peptides profiles from N-terminal side and C-terminal side from each treatment\n")
  treat <- unique(data_cleaved$treatment)
  new_data_diff <- list()
  res <- list()
  for(t in treat){
    treat_data <- unique(data_cleaved[data_cleaved$treatment == t, c("id_cleaved", "cleaved_site")])
    treat_data_diff <- data
    treat_torm <- treat[!(treat %in% t)]
    if(length(treat_torm)){
      treat_torm <- paste0("_", treat_torm, "$", collapse = "|")
      treat_data_diff <- treat_data_diff[,-grep(treat_torm, colnames(treat_data_diff))]
    }

    ## taking into accounts for each potential cleaved sites
    colnames(treat_data_diff)[1] <- "id"
    treat_data_diff <- right_join(treat_data_diff, unique(data_cleaved[data_cleaved$treatment == t,
                                                                       c("id", "id_cleaved")]),
                                  by = "id", relationship = "many-to-many")
    treat_data_diff$id <- treat_data_diff$id_cleaved
    treat_data_diff$id_cleaved <- NULL
    colnames(treat_data_diff)[1] <- "Master.Protein.Accessions"
    ## ids are now updated

    directory_toremove <- list.files()
    treat_data_diff <- imprints_sequence_peptides(treat_data_diff,
                                                  proteins = treat_data$id_cleaved,
                                                  sequence = sub("~", "-", treat_data$cleaved_site),
                                                  control = control, dataset_name = t
                                                  )
    directory_toremove <- list.files()[!(list.files() %in% directory_toremove)]
    directory_toremove <- grep(paste0(t, "\\.txt$"), directory_toremove, value = TRUE)
    if(length(directory_toremove) == 1){
      unlink(directory_toremove, recursive = TRUE)
    }


    res[[t]] <- treat_data
    ### removing peptides that could be cleaved site
    treat_data_diff <- imprints_remove_peptides(treat_data_diff,
                                                treat_data$id_cleaved,
                                                treat_data$cleaved_site
                                                )
    # if only one peptide, id on position becomes different than in master, need to modify:
    prid_pos <- gsub(" \\[(\\]|\\])", "",
                     gsub("(?<=\\[)\\d{1,4}(-|~)\\d{1,4}(?=\\])", "",
                          treat_data_diff$Positions.in.Master.Proteins, perl = TRUE)
                     )
    if(any(treat_data_diff$Master.Protein.Accessions != prid_pos)){
      prid_new <- treat_data_diff$Master.Protein.Accessions[which(treat_data_diff$Master.Protein.Accessions != prid_pos)]
      prid_new <- gsub(".*; ", "", prid_new)
      prid_pos_tochange <- treat_data_diff$Positions.in.Master.Proteins[which(treat_data_diff$Master.Protein.Accessions != prid_pos)]

      treat_data_diff$Positions.in.Master.Proteins[which(treat_data_diff$Master.Protein.Accessions != prid_pos)] <- unlist(
                                                                mapply(function(x, n){
                                                                          if(grepl("; ", x)){
                                                                            x <- gsub(paste0("; ", sub("_.*", "", n), ".*"),
                                                                                      "", x)
                                                                            x <- paste0(x, "; ", n)
                                                                          }
                                                                          else{
                                                                            x <- gsub(sub("_.*", "", n),
                                                                                      n, x)
                                                                          }
                                                                        },
                                                                        prid_pos_tochange, prid_new,
                                                                       SIMPLIFY = FALSE, USE.NAMES = FALSE)
                                                              )
    }

    treat_data_diff <- treat_data_diff[,-grep(paste0("_", control), colnames(treat_data_diff))]
    treat_data_diff$Master.Protein.Accessions <- treat_data_diff$Positions.in.Master.Proteins
    treat_data_diff$Modifications <- NULL
    treat_data_diff$Annotated.Sequence <- NULL
    treat_data_diff$Positions.in.Master.Proteins <- NULL
    colnames(treat_data_diff)[1] <- "id"

    treat_data_diff <- treat_data_diff %>%
      tidyr::gather("key", "value", -id, -description) %>%
      tidyr::separate(key, into = c("temperature", "biorep", "treatment"), sep = "_") %>%
      dplyr::mutate(protein = gsub("(?<!;) ", "; ", gsub(" \\[(\\]|\\];)", "", gsub("(?<=\\[)\\d{1,4}(-|~)\\d{1,4}(?=\\])", "", id, perl = TRUE)), perl = TRUE),
             position = gsub(";", "; ", gsub(" ", "", gsub("(?<=; |^)(.{5,6}|A0.{6,8})(|_\\d{1,3})(?= \\[)", "", id, perl = TRUE)))
             ) %>%
      dplyr::select(-id)
    treat_data_diff <- treat_data_diff[,c("protein", "position", "description", "temperature", "biorep", "treatment", "value")]
    new_data_diff[[t]] <- treat_data_diff
  }
  new_data_diff <- as.data.frame(Reduce(rbind, new_data_diff))
  # updating ids
  data_cleaved$id <- data_cleaved$id_cleaved
  data_cleaved$id_cleaved <- NULL

  ##### computing p-values for each protein --> H0 = protein is not cleaved
  message("\nReshaping data and computing experimental design")
  new_data_diff$position_global <- unlist(lapply(strsplit(gsub("\\[|\\]", "", gsub(";.*", "", new_data_diff$position)),
                                                          "~|-"),
                                                 function(s) sum(as.numeric(s)))
                                          )
  new_data_diff <- new_data_diff %>%
    dplyr::group_by(protein, description, treatment) %>%
    dplyr::mutate(position_global = ifelse(position_global == min(position_global),
                                           "N", "C")) %>%
    dplyr::select(-position)

  new_data_diff$temperature <- factor(new_data_diff$temperature)
  new_data_diff$biorep <- factor(new_data_diff$biorep)
  new_data_diff$position_global <- factor(new_data_diff$position_global,
                                          levels = c("N", "C"))

  ntemp <- nlevels(new_data_diff$temperature)
  nrep <- nlevels(new_data_diff$biorep)
  # npos = 2 ; N or C

  ### limma design
  # design
  position <- rep(levels(new_data_diff$position_global), ntemp*nrep)
  temperature <- rep(levels(new_data_diff$temperature), each=nrep*2)
  replicate <- rep(rep(levels(new_data_diff$biorep), each = 2), ntemp)
  subj <- paste0(position, replicate)
  # g features points of interest to build contrasts
  g <- factor(paste0(position, temperature),
              levels = c(paste0(levels(new_data_diff$position_global)[1],
                                levels(new_data_diff$temperature)
                                ),
                         paste0(levels(new_data_diff$position_global)[2],
                                levels(new_data_diff$temperature)
                                )
                         )
              )

  design <- model.matrix(~ 0 + g + replicate)
  colnames(design)[1:nlevels(g)] <- levels(g)

  # reshape data and order columns
  new_data_diff_matrix <- new_data_diff %>%
    tidyr::unite("key", position_global, biorep, temperature, sep = "_") %>%
    tidyr::spread(key, value) %>%
    dplyr::mutate(description = unname(sapply(description, IMPRINTS.CETSA.app:::getGeneName))) %>%
    tidyr::unite("id", protein, description, sep = "_")
  new_data_diff_matrix <- new_data_diff_matrix[,c("id", "treatment", paste(position, replicate, temperature, sep = "_"))]

  ##### Get weight matrix for lmFit
  message("Computing weight matrix from number of measurements")
  weights <- data_cleaved
  weights$description <- unname(sapply(weights$description, IMPRINTS.CETSA.app:::getGeneName))
  weights$cleaved_site <- NULL
  weights$Npep <- NULL

  ## extract number of measurements
  weights <- weights %>%
    dplyr::group_by(id, description, treatment, position_global) %>%
    dplyr::group_modify(~ {
      x <- lapply(strsplit(.x$Nvalue, "\\|")[[1]], function(n) strsplit(n, "-")[[1]])

      temp <- unlist(lapply(x, "[[", 1))
      biorep <- strsplit(unlist(lapply(x, "[[", 2)), ";")

      x <- mapply(function(b, t){
        b <- strsplit(b, "_")
        b <- t(as.data.frame(b))
        rownames(b) <- 1:nrow(b)
        colnames(b) <- c("biorep", "Nv")
        b <- as.data.frame(b)
        b$Nv <- as.numeric(b$Nv)
        b$temperature <- t;
        b
      },
      biorep, temp, SIMPLIFY = FALSE)

      x <- as.data.frame(Reduce(rbind, x))
      return(x)
    })

  weights <- weights %>%
    tidyr::unite("id", id, description, sep = "_") %>%
    tidyr::unite("key", position_global, biorep, temperature, sep = "_") %>%
    tidyr::spread(key, Nv)
  weights <- weights[,colnames(new_data_diff_matrix)]

  message("Computing p-values for each temperatures for each treatment...")
  res <- list()
  for(tr in treat){
    X <- new_data_diff_matrix[new_data_diff_matrix$treatment == tr,] %>%
      dplyr::ungroup() %>% dplyr::select(-treatment) %>%
      tibble::column_to_rownames("id") %>%
      as.matrix()
    Xw <- weights[weights$treatment == tr,] %>%
      dplyr::ungroup() %>% dplyr::select(-treatment) %>%
      tibble::column_to_rownames("id") %>%
      as.matrix()

    ### Add blocking on subject in duplicateCorrelation().
    corfit <- duplicateCorrelation(X, design, block=subj)
    ### Fit our model with weigths (not normalized)
    fitW <- lmFit(X, design, weights = Xw,
                  block=subj, correlation=corfit$consensus)

    temperature <- levels(new_data_diff$temperature)
    res[[tr]] <- list()
    for(tmp in temperature){
      # How do the two profiles differ in their response over temperature ?
      con.1 <- makeContrasts(contrasts = paste0("C", tmp, " - N", tmp),
                             levels = design)
      fitcon.1 <- contrasts.fit(fitW, con.1)
      fitcon.1 <- eBayes(fitcon.1)
      res[[tr]][[tmp]] <- topTable(fitcon.1, number=nrow(X), adjust.method="BH")
    }

    res[[tr]] <- as.data.frame(Reduce(rbind,
                                     mapply(function(x, n){
                                       x$temperature <- n
                                       x <- x %>%
                                         tibble::rownames_to_column("id") %>%
                                         tidyr::separate(id, into = c("id", "Gene"), sep = "(?<=_\\d{1,3})_");
                                       x
                                     },
                                     res[[tr]], names(res[[tr]]),
                                     SIMPLIFY = FALSE)
                                     )
                               )

    res[[tr]]$treatment <- tr
  }
  res <- as.data.frame(Reduce(rbind, res))

  message("Finding most likely proteins with RESP effect")
  res <- res %>%
    dplyr::mutate(adj.P.Val = tidyr::replace_na(adj.P.Val, 0.999999))

  if(comb_pv_method == "george"){
    res <- res %>%
      dplyr::group_by(id, Gene, treatment) %>%
      dplyr::summarise(combined_pvalue = ifelse(length(na.omit(logFC)),
                                                metap::logitp(adj.P.Val)$p, # George's method
                                                NA),
                       maxFC = ifelse(length(na.omit(logFC)),
                                      temperature[which(abs(logFC) == max(abs(logFC), na.rm = T))],
                                      NA)
      )
  }
  else if(comb_pv_method == "fisher"){
    res <- res %>%
      dplyr::group_by(id, Gene, treatment) %>%
      dplyr::summarise(combined_pvalue = ifelse(length(na.omit(logFC)),
                                                metap::sumlog(adj.P.Val)$p, # Fisher's method
                                                NA),
                       maxFC = ifelse(length(na.omit(logFC)),
                                      temperature[which(abs(logFC) == max(abs(logFC), na.rm = T))],
                                      NA)
      )
  }
  else if(comb_pv_method == "edgington"){
    res <- res %>%
      dplyr::group_by(id, Gene, treatment) %>%
      dplyr::summarise(combined_pvalue = ifelse(length(na.omit(logFC)),
                                                metap::sump(adj.P.Val)$p, # Edgington’s method
                                                NA),
                       maxFC = ifelse(length(na.omit(logFC)),
                                      temperature[which(abs(logFC) == max(abs(logFC), na.rm = T))],
                                      NA)
      )
  }

  # selecting cleaved site with smallest p-value
  res <- res %>% dplyr::ungroup() %>%
    dplyr::mutate(id2 = sub("_.*", "", id)) %>%
    dplyr::group_by(id2, Gene, treatment) %>%
    dplyr::summarise(id = id[which.min(combined_pvalue)],
              combined_pvalue = min(combined_pvalue, na.rm = TRUE),
              maxFC = maxFC[which.min(combined_pvalue)]) %>%
    dplyr::ungroup() %>% dplyr::select(-id2)

  res <- res %>%
    dplyr::group_by(id, Gene, treatment) %>%
    dplyr::group_modify(~ {
      if(!is.na(.x$maxFC)){
        # getting FC
        score <- new_data_diff_matrix[which(new_data_diff_matrix$id == paste0(.x$id, "_", .x$Gene) & new_data_diff_matrix$treatment == .x$treatment),
                      grep(paste0("_", .x$maxFC, "$"), colnames(new_data_diff_matrix))]

        # computing score
        score <- c(mean(as.numeric(score[1,grep("^N_", colnames(score))]), na.rm = TRUE),
                   mean(as.numeric(score[1,grep("^C_", colnames(score))]), na.rm = TRUE))
        sign_score <- unique(sign(score))
        diff_score <- diff(score)
        if(length(unique(sign_score)) == 1){ # same sign
          zscore <- diff_score/max(c(0.1, min(abs(score))))
          # ratio give the score where 0.1 is a constant that prevent exploding score when min approach 0
        }
        else{ # one is neg and one is pos
          zscore <- max(abs(score))*diff_score/0.1  # also use the same constant
          # multiplying by the maximum fold change prevent keepin low profile that would have opposite sign
        }
      }
      else{
        zscore <- NA
      }

      score <- .x
      score <- score[,-grep("^id$|^Gene$|^treatment$", colnames(score))]
      score$zscore <- zscore
      return(score)
    }, .keep = TRUE) %>%
    dplyr::ungroup() %>% dplyr::group_by(treatment) %>%
    dplyr::mutate(zscore = (zscore - mean(zscore, na.rm = TRUE))/sd(zscore, na.rm = TRUE)) %>%
    dplyr::select(-maxFC)

  # compute cutoffs
  if(fixed_score_cutoff){
    cutoff <- res %>% dplyr::ungroup() %>% dplyr::group_by(treatment) %>%
      dplyr::mutate(BH = (order(order(combined_pvalue))/length(combined_pvalue))*FDR) %>%
      dplyr::summarise(pval = find_cutoff(combined_pvalue, BH),
                       zscore_pos = RESP_score,
                       zscore_neg = -RESP_score)
  }
  else{
    cutoff <- res %>% dplyr::ungroup() %>% dplyr::group_by(treatment) %>%
      dplyr::mutate(BH = (order(order(combined_pvalue))/length(combined_pvalue))*FDR) %>%
      dplyr::summarise(pval = find_cutoff(combined_pvalue, BH),
                       zscore_pos = RESP_score + median(abs(zscore)[which(combined_pvalue < quantile(combined_pvalue, 0.5, na.rm = TRUE))], na.rm = TRUE),
                       zscore_neg = -RESP_score - median(abs(zscore)[which(combined_pvalue < quantile(combined_pvalue, 0.5, na.rm = TRUE))], na.rm = TRUE))
  }

  # saving obtained cutoffs
  cutoff_file <- paste0(outdir, "/", format(Sys.time(), "%y%m%d_%H%M"), "_", "cutoff.txt")
  readr::write_tsv(cutoff, cutoff_file)
  extra_info <- paste0("\nParameters: \nRESP z-score cutoff=", RESP_score, ", FDR=", FDR*100, "%, method to combine p-values=", comb_pv_method, ", curvature=", curvature,
                       " \nMinimum number of peptides=", min_peptide, ", Minimum proportion of non-missing values=", min_ValidValue)
  write(extra_info, cutoff_file, sep = "\n", append = TRUE)

  res <- res %>% dplyr::group_by(id, Gene, treatment) %>%
    dplyr::mutate(curve = curve(zscore,
                                cutoff$zscore_neg[which(cutoff$treatment == treatment)],
                                cutoff$zscore_pos[which(cutoff$treatment == treatment)],
                                cutoff$pval[which(cutoff$treatment == treatment)],
                                curvature = curvature),
                  criteria_curve = -log10(combined_pvalue) >= curve
                  )
  res$criteria_curve <- tidyr::replace_na(res$criteria_curve, FALSE)

  n_treat <- length(treat)
  df_curve <- data.frame(zscore = rep(seq(-max(abs(res$zscore), na.rm = TRUE),
                                          max(abs(res$zscore), na.rm = TRUE), 0.005),
                                     n_treat)
                         )
  df_curve$treatment <- rep(treat, each = nrow(df_curve)/n_treat)
  df_curve <- df_curve %>% dplyr::group_by(treatment, rownames(df_curve)) %>%
    dplyr::mutate(curve = curve(zscore,
                                cutoff$zscore_neg[which(cutoff$treatment == treatment)],
                                cutoff$zscore_pos[which(cutoff$treatment == treatment)],
                                cutoff$pval[which(cutoff$treatment == treatment)]
                                )
                  )

  message("Creating and saving plot")
  g_h <- ggplot(res, aes(zscore, -log10(combined_pvalue), color = criteria_curve)) +
    geom_point() +
    geom_line(data = df_curve, aes(x = zscore, y = curve), linetype = "dashed", color = "black") +
    ylim(c(0, max(-log10(res$combined_pvalue), na.rm = TRUE))) +
    xlim(c(-max(abs(res$zscore), na.rm = TRUE),
           max(abs(res$zscore), na.rm = TRUE))
         ) +
    labs(title = "RESP volcano plot",
         y = "-log10(combined p-value)",
         x = "RESP z-score") +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "grey70"))  +
    facet_wrap(~treatment) +
    ggrepel::geom_label_repel(data = res[res$criteria_curve,],
                              aes(zscore, -log10(combined_pvalue), label = Gene),
                              show.legend = FALSE, min.segment.length = 0) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5),
          legend.position = "none",
          strip.background = element_rect(fill = "white"),
          strip.text = element_text(face = "bold"))

  ggsave(paste0(outdir, "/", format(Sys.time(), "%y%m%d_%H%M"), "_", "RESP_hits_plot.png"),
         plot = g_h, device = "png",
         width = 16, height = 9)

  message("Preparing and saving results")
  data_cleaved$description <- unname(sapply(data_cleaved$description, IMPRINTS.CETSA.app:::getProteinName))
  data_cleaved$position_global <- paste0(data_cleaved$position_global, "-term")
  data_cleaved <- data_cleaved %>%
    tidyr::pivot_wider(id_cols = c("id", "description", "treatment", "cleaved_site"),
                       names_from = position_global,
                       values_from = c("Nvalue", "Npep"))

  # saving whole results
  resp <- res[,c("id", "Gene", "treatment", "combined_pvalue", "zscore", "criteria_curve")]
  colnames(resp)[c(ncol(resp)-1, ncol(resp))] <- c("RESP_score", "RESP_hit")
  resp <- dplyr::left_join(resp, data_cleaved, by = c("id", "treatment"),
                           relationship = "one-to-many")
  resp <- resp[order(resp$combined_pvalue),]
  resp$id <- sub("_.*", "", resp$id)
  openxlsx::write.xlsx(resp, paste0(outdir, "/", format(Sys.time(), "%y%m%d_%H%M"), "_", "RESP_analysis_full.xlsx"))

  # saving summary
  resp_summary <- res[res$criteria_curve,c("id", "Gene", "treatment", "combined_pvalue", "zscore")]
  colnames(resp_summary)[ncol(resp_summary)] <- "RESP_score"
  resp_summary <- dplyr::left_join(resp_summary, data_cleaved, by = c("id", "treatment"),
                          relationship = "one-to-many")
  resp_summary <- resp_summary[order(resp_summary$combined_pvalue),]
  resp_summary$id <- sub("_.*", "", resp_summary$id)

  if(categorize & nrow(resp_summary)){
    message("Categorizing...")
    resp_summary <- imprints_categorize_peptides(data, resp_summary, control, save_xlsx = FALSE)
  }

  openxlsx::write.xlsx(resp_summary, paste0(outdir, "/", format(Sys.time(), "%y%m%d_%H%M"), "_", "RESP_hits_summary.xlsx"))
  message("Done !")
  return(resp_summary)
}


### function to compute discrete inflexion point
inflex <- function(x){
  ### compute 'exact' theoritical inflexion point using second derivative
  x_df1 <- diff(x) # get first 'first derivative'
  # compute 'tendency' from 'first derivative', i.e. if increasing/decreasing
  df <- data.frame(x = 1:length(x_df1),
                   y = x_df1)
  res <- lm(y ~ x, data = df)
  res <- res$coeff[2]
  res <- sign(res)

  x_df2 <- diff(x_df1) # get second 'first derivative'
  if(res == 1){
    inflex_point_exac <- which.max(x_df2) + 2 # double diff, so add 2 to index point
  }
  else{
    inflex_point_exac <- which.min(x_df2) + 2 # double diff, so add 2 to index point
  }

  if(inflex_point_exac - 2 == length(x_df2)){  # if inflex_point is last one, can't conclude that is cleaved
    inflex_point_exac <- NA
  }

  ### compute inflexion point using fitting line
  inflex_point_fit <- sapply(2:(length(x)-1),
                             function(z){
                               slope_1 <- (x[z] - x[1])/(z - 1)
                               intercept_1 <- x[z] - z*slope_1
                               slope_2 <- (x[length(x)] - x[z])/(length(x) - z)
                               intercept_2 <- x[z] - z*slope_2

                               RSS_1 = sum(((1:z * slope_1 + intercept_1) - x[1:z])**2)
                               RSS_2 = sum(((z:length(x) * slope_2 + intercept_2) - x[z:length(x)])**2)

                               res <- RSS_1 + RSS_2

                               return(res)
                             })

  inflex_point_fit <- which.min(inflex_point_fit) + 1

  if(is.na(inflex_point_exac)){
    inflex_point <- inflex_point_fit
  }
  else if(inflex_point_fit == inflex_point_exac){
    inflex_point <- inflex_point_fit
  }
  else{
    inflex_point <- c(inflex_point_fit, inflex_point_exac)
    inflex_point <- inflex_point[order(inflex_point)]

    Ninflex <- abs(inflex_point_fit - inflex_point_exac) + 1
    Npep <- length(x)
    if(Ninflex/Npep > 0.5){ # 50% of the peptides are considered potential cleavage site --> most likely no cleavage
      if(Npep - inflex_point[2] > inflex_point[1] - 1){
        inflex_point <- inflex_point[2]
      }
      else if(Npep - inflex_point[2] == inflex_point[1] - 1){
        inflex_point <- inflex_point_exac
      }
      else{
        inflex_point <- inflex_point[1]
      }
    }
    else{
      inflex_point <- inflex_point[1]:inflex_point[2]
    }
  }

  return(inflex_point)
}

### function to sum number of values per peptide N and peptide C
nb_pepval <- function(x){
  x <- lapply(strsplit(x, "\\|"), strsplit, split = "-")

  x <- lapply(x,
              function(t){
                temp <- unlist(lapply(t, "[[", 1))
                biorep <- strsplit(unlist(lapply(t, "[[", 2)), ";")
                names(biorep) <- temp
                biorep <- lapply(biorep,
                                 function(b){
                                   b <- strsplit(b, "_")
                                   b_names <- unlist(lapply(b, "[[", 1))
                                   nv <- lapply(b, "[[", 2)
                                   nv <- lapply(nv, as.numeric)
                                   names(nv) <- b_names;
                                   nv
                                 });
                biorep
              })

  x <- as.data.frame(x, check.names = FALSE)
  x <- sapply(unique(names(x)), function(nv) sum(x[,names(x) == nv]))

  temp <- unique(sub("\\..*", "", names(x)))
  x <- sapply(temp,
              function(nv){
                b <- x[grep(nv, names(x))]
                names(b) <- sub(".*\\.", "", names(b))
                b <- paste(paste(names(b), b, sep = "_"), collapse = ";")
                b <- paste(nv, b, sep = "-");
                b
              })

  x <- paste(x, collapse = "|")
  return(x)
}

### function to find p-value cutoff
find_cutoff <- function(x,y){
  id <- order(x)
  x <- x[id]
  y <- y[id]

  x <- x[which(x < y)]
  x <- ifelse(length(x), x[length(x)], NA)
  return(x)
}

### function to compute cutoff curve
curve <- function(x, cut_neg, cut_pos, cut_p, curvature = 0.1){
  y <- rep(NA, length(x))
  neg <- which(x <= cut_neg)
  pos <- which(x >= cut_pos)
  y[neg] <- curvature/abs(x[neg] - cut_neg) + -log10(cut_p)
  y[pos] <- curvature/abs(x[pos] - cut_pos) + -log10(cut_p)

  return(y)
}
mgerault/mineCETSAapp documentation built on April 17, 2025, 7:24 p.m.