R/select_peptide_data.R

Defines functions select_peptide_data

Documented in select_peptide_data

#' select_peptide_data
#'
#' Select correlating peptides and subset them into clusters.
#' @param X the input data - see details.
#' @param data_matrix_name the name of the data matrix with the (normalized) peak intensities
#' @param score_limit currently unused
#' @param p_val the p-value associated with the significance of the correlation between 2 peptides
#' @param cut_point the inconsistency cut-off for clustering peptides by correlation distance.
#'
#'

select_peptide_data <- function(X, data_matrix_name, score_limit, p_val, peptide_confidence_limit,
  cut_point = 1) {

  ################################################
  #  R implementation of PQPQ
  #  by Jenny Forshed 2011-05-16
  #  port to R by Melissa Key
  ################################################

  result <- plyr::llply(X, function(protein) {
    if(!(data_matrix_name %in% names(protein))) stop("'data_matrix_name' does not point to a valid matrix")

    confidence <- protein$confidence

    Nk <- ncol(protein[[data_matrix_name]])

    # create empty list to put warningings in
    protein_warnings <- vector("list", 5)

    # Do this once here, then pass the relevant results instead of recalculating it a
    # million times calculate correlation and associated p-values (p-value
    # calculation requires Hmisc package)
    r_cor <- Hmisc::rcorr(protein[[data_matrix_name]])


    # proteins supported by > 300 peptides select the 300 most confident peptides for
    # the actual protein
    if (Nk > 300) {
      index <- order(confidence,decreasing = TRUE)[1:300]
      protein_warnings[[4]] <- "protein supported by > 300 peptides"
    } else {
      index <- 1:Nk
    }
    if(Nk == 1) return(NULL)


    ##### select correlating peptides #####
    correlation_data <- plyr::ldply(seq(1, length(index) - 1), function(k) {
      index_pairs <- data.frame(k = index[k], l = index[-(1:k)])  # skipping protein/peptide IDs

      within(index_pairs, {
        p <- r_cor$P[k[1], l]
        r2 <- r_cor$r[k[1], l]
        conf_l <- confidence[l]
        conf_k <- confidence[k]
        conf <- (conf_k >= peptide_confidence_limit) & (conf_l >= peptide_confidence_limit)
        pep_l <- colnames(protein[[data_matrix_name]])[l]
        pep_k <- colnames(protein[[data_matrix_name]])[k]
      })
    })

    # Now we use subsetting of the above data to pick which pairs we actually want to use
    # first try: p-value < p_val and r2 > 0 and both confidences > peptide_confidence_limit
    index_selecting_proteins <- subset(correlation_data, p < p_val & r2 > 0 &
        conf, c(k, l))
    # if that didn't pick any up, relax the criteria on the p_value (not other criteria)
    if (nrow(index_selecting_proteins) == 0) {
      index_selecting_proteins <- subset(correlation_data, r2 > 0 & conf, c(k,
        l))
    }
    # if that doesn't work, throw up our hands and move on to the next protein.
    skip_rest <- FALSE
    if (nrow(index_selecting_proteins) == 0) {
      protein_warnings[[2]] <- "peptide conf too low to support protein ID or peptides don't correlate"
      skip_rest <- TRUE
    }

    if (!skip_rest){

      best_peptides <- unique(unlist(index_selecting_proteins))

      ##### Hierarchical clustering #####
      if (length(best_peptides) > 3) {
        # create dendrogram
        hclust_obj <- hclust(d = as.dist(1 - r_cor$r[best_peptides, best_peptides]), method = "single")

        # cut dendorgram into clusters
        class_assignment <- inconsistent(hclust_obj, cut_point = cut_point)$clusters
      } else {
        # otherwise, just put everything in one class.
        class_assignment <- data.frame(label = colnames(protein[[data_matrix_name]])[best_peptides], cluster = 1)
      }

      # sort classes by size.  this actually is important later!
      num_class_members <- with(class_assignment, tapply(cluster, cluster, length))



      # select the model peptide with the highest intensity (from each class)
      model_peptides <- plyr::daply(class_assignment, plyr::.(cluster), function(class, data) {
        names(which.max(colMeans(data[, as.character(class$label), drop = FALSE])))
      },data = protein[[data_matrix_name]])

      indices_for_peptides_correlating_with_model_peptides <- plyr::llply(model_peptides,
        function(peptide) {
          subset(correlation_data, (pep_k == peptide | pep_l == peptide) & p <
              p_val & r2 > 0)
        })
      correlating_peptides <- plyr::llply(indices_for_peptides_correlating_with_model_peptides,
        with, {
          unique(c(pep_k, pep_l))
        })

      # order by the number of high-confidence peptides initially added to the cluster
      correlating_peptides <- correlating_peptides[order(num_class_members, decreasing = TRUE)]

      # loop through each pair of clusters.  If the clusters share over 50% of peptides
      # using the # of peptides in the 1st cluster as a reference, then remove the
      # second cluster.
      if (length(correlating_peptides) > 1) {
        end_clusters <- plyr::llply(seq(2, length(correlating_peptides)), function(k) {
          overlap_degree <- plyr::laply(seq(1, k - 1), function(l) {
            n_joint_peptides <- length(intersect(correlating_peptides[[k]], correlating_peptides[[l]]))
            n_peptides_in_k <- length(correlating_peptides[[k]])
            n_joint_peptides/n_peptides_in_k * 100
          })

          if (mean(overlap_degree) > 50)
            NULL else k
        })
        end_clusters <- c(1, unlist(end_clusters))
      } else {
        end_clusters <- 1
      }
      # create warning if more than one cluster is in the output.
      if(length(end_clusters) > 1) protein_warnings[[1]] <- 'non-overlapping peptide clusters'

      # create separate warning if a peptide is assigned to multiple clusters.
      # this warning is combined with warning #1 in original code.
      peptide_cluster_count <- table(unlist(correlating_peptides[end_clusters]))
      if(max(peptide_cluster_count) > 1) protein_warnings[[5]] <- 'peptides assigned to more than one cluster (overlapping clusters)'

      # attach the results to the original data contents.
      result <- c(protein,
        list(
          correlating_peptides = correlating_peptides[end_clusters],
          model_peptides = model_peptides,
          warnings = protein_warnings
        )
      )
    }
    else{
      result <- c(protein,
        list(
          warnings = protein_warnings
        )
      )
    }
    class(result) <- 'pqpq'
    result
  })



}
melissakey/PQPQ documentation built on May 4, 2019, 7:42 p.m.