R/imputePartialPeakgroupCompleteness.R

Defines functions calcNCorrelatingTraces imputePartialPeakgroupCompletness produceFinalManualAnnotations

Documented in imputePartialPeakgroupCompletness

calcNCorrelatingTraces <- function(trace.mat, rt, window.size=14, corr.cutoff=0.7) {
    # Extract the submatrix from the full trace matrix
    window.size.half <- floor(window.size / 2)
    left.border <- max(1, rt - window.size.half)
    right.border <- min(ncol(trace.mat), rt + window.size.half)
    trace.mat.win <- trace.mat[, left.border:right.border]

    n.subunits.tot <- nrow(trace.mat)  # the number of theoretical subunits

    if (n.subunits.tot < 2) {
        return(NA)
    } else if (n.subunits.tot == 2) {
        correl <- cor(t(trace.mat.win))[1, 2]
        if (correl > corr.cutoff) {
            return(1)
        } else {
            return(NA)
        }
    }

    for (i in 1:nrow(trace.mat.win)) {
        all.zero <- all(trace.mat.win[i, ] == 0)
        if (all.zero) {
            trace.mat.win[i, ] <- rnorm(ncol(trace.mat.win))
        }
    }

    corrmat <- cor(t(trace.mat.win))
    # Compute distance between elements as measured by the pearson correlation,
    # i.e., dist = 2 - abs(pearson corr)
    distance <- proxy::dist(corrmat, method='correlation')

    # Cluster correlation vectors hierarchically s.t. proteins that correlate
    # well with a similar group of other proteins cluster together.
    cl <- hclust(distance)
    plot(cl)

    abline(h=1 - corr.cutoff, col='red')
    # Cut the dendrogram at distance 0.3, i.e. pearson corr == 0.7,
    # this will give a vector of group labels.
    group.assignments <- cutree(cl, h=1 - corr.cutoff)
    # Count how often each group label occurs.
    group.sizes <- table(group.assignments)
    max.group.size <- max(group.sizes)
    has.large.enough.subgroup <- any(group.sizes >= 0.5 * n.subunits.tot)

    return(max.group.size)
}


#' Estimate the completness of a peak group that was annotated as being
#' partially complete.
#' @param manual.annotations.partial A manual annotation data.table
#' @param protein.traces Wide format protein traces with columns:
#'   \itemize{
#'     \item complex_id
#'     \item protein_id
#'     \item complex_name
#'     \item additional columns for each sec rt
#'   }
#' @param window.size The window in SEC that is considered around each manually
#'        annotated RT.
#' @param corr.cutoff Above which value a group of traces is considered to be well
#'        correlated.
#' @examples
#' peptraces <- widePepTracesToLong(e4.peptide.traces.wide.filtered)
#' prottraces <- produceProteinTraces(peptraces)
#' prottraces.wc <- annotateProteinTraces(prottraces, corum.complex.protein.assoc)
#' imputePartialPeakgroupCompletness(manual.annotations.partial, prottraces.wc)
#' @export
imputePartialPeakgroupCompletness <- function(manual.annotations.partial,
                                              protein.traces,
                                              window.size=14,
                                              corr.cutoff=0.7) {
    manual.annotations.partial <- manual.annotations.partial  # copy
    manual.annotations.partial$complex_id <-
        as.character(manual.annotations.partial$complex_id)
    complex.ids <- unique(manual.annotations.partial[, complex_id])

    for (complex.id in complex.ids) {
        cat('Complex id:\t', complex.id, '\n')
        rts <- manual.annotations.partial[complex_id == complex.id, rt]
        traces <- protein.traces[complex_id == complex.id, ]
        traces.wide <- longProtTracesToWide(traces)
        is.sec.col <- grepl('\\d+', colnames(traces.wide))
        trace.mat.full <- as.matrix(subset(traces.wide, select=is.sec.col))
        manual.annotations.partial[
            complex_id == complex.id,
            n_proteins_in_complete_complex := nrow(trace.mat.full)
        ]

        for (rt in rts) {
            cat('RT:\t', rt, '\n')
            max.group.size <-
                calcNCorrelatingTraces(trace.mat.full, rt,
                                       window.size=window.size,
                                       corr.cutoff=corr.cutoff)
            manual.annotations.partial[
                complex_id == complex.id & rt == rt,
                n_proteins_in_complex := max.group.size
            ]
        }
    }
    manual.annotations.partial
}

produceFinalManualAnnotations <- function() {
    completness.info <- manual.annotations.raw.rhafen[,
        list(complex_id, n_proteins_in_complex,
             n_proteins_in_complete_complex)]

    manual.annotations.full.w.completness <-
        merge(manual.annotations.full, completness.info, all.x=TRUE, by='complex_id')
    manual.annotations.full.w.completness[, completeness := 'full']

    manual.annotations.partial <- manual.annotations.partial.imputed.win14.corr0.7
    manual.annotations.partial[, completeness := 'partial']

    manual.annotations.final <- rbind(
        manual.annotations.full.w.completness,
        manual.annotations.partial)

     manual.annotations.final[, min_2_proteins := n_proteins_in_complex >= 2]
     manual.annotations.final[ ,
        min_50_percent := n_proteins_in_complex >= 0.5 * n_proteins_in_complete_complex]
     manual.annotations.final <- manual.annotations.final[min_50_percent & min_2_proteins]
     subset(manual.annotations.final,
            select=!(colnames(manual.annotations.final) %in% c('min_2_proteins', 'min_50_percent')))
}
hafenr/MACode documentation built on May 17, 2019, 2:24 p.m.