upbm: Tools for the analysis of universal protein binding microarrays

Documented in cy3FitEmpirical

#' @title Compute Cy3 scaling factors using emprical reference
#'
#' @description
#' PBM arrays are scanned twice, once for Cy3-tagged dUTPs to quantify
#' dsDNA abundance at each probe, and again for the Alexa488-tagged 
#' protein. The Cy3 scans can be used to first filter out probes which
#' appear to have poor dsDNA enrichment, and second, to scale Alexa488
#' intensities to account for differences in dsDNA abundance between
#' probes.
#'
#' Since probe intensities in the Cy3 scans are independent of the
#' assayed Alexa488-tagged protein, Cy3 scans should be comparable across
#' experiments. Based on this observation, a global Cy3 reference can be
#' constructed by taking the empirical average over a large number of Cy3
#' scans. Filtering and scaling can then be performed based on deviations
#' from this reference. Given a PBMExperiment of Cy3 scans and an
#' empirical reference generated by \code{cy3GenerateRef}, this function
#' returns the residuals from the reference, and the corresponding
#' observed-to-expected ratios for each probe as new assays added to the
#' original Cy3 PBMExperiment object.
#'
#' The returned PBMExperiment object can be passed to \code{cy3Normalize}
#' with a PBMExperiment of Alexa488 scan intensities 
#' to filter low quality probes and/or normalize the Alexa488 intensities by the
#' computed ratios.
#' 
#' @param pe a PBMExperiment object containing Cy3 intensity data.
#' @param refpe a PBMExperiment object containing Cy3 reference
#'        intensities as a single assay named \code{"ref"}, most likely
#'        generated by a call to \code{cy3GenerateRef}.
#' @param assay a numeric index or string specifying the Cy3 intensity assay.
#'        (default = \code{SummarizedExperiment::assayNames(pe)[1]})
#' @param useMean a logical value whether to use the probe-level mean, rather than
#'        the probe-level median, from the Cy3 reference (default = TRUE)
#' @param standardize a logical value whether to standardize residuals using
#'        MAD (median absolute deviation about the median) intensity
#'        computed for PBM Cy3 reference data for probe filtering. (default = TRUE)
#' @param threshold a numeric threshold on the absolute value of the log2 ratio between
#'        observed and expected Cy3 intensities. If \code{standardize} is
#'        specified, the log2 ratios are first scaled before comparing against
#'        the threshold. In this case, the threshold is also scaled by the
#'        median of the probe-level MAD intensities across all probes. (default = \code{1/2})
#' @param verbose a logical value whether to print verbose output during
#'        analysis. (default = FALSE)
#' 
#' @return
#' Original Cy3 PBMExperiment object with additional assays corresponding
#' to ratio of observed to expected probe intensities, and whether probes were
#' flagged as low-quality based on \code{abs(log2(ratio)) > threshold}.
#'
#' @seealso \code{\link{cy3GenerateRef}}, \code{\link{cy3Normalize}}
#' @importFrom dplyr as_tibble select bind_cols group_by ungroup left_join mutate one_of
#' @importFrom tidyr pivot_longer pivot_wider
#' @export
#' @author Patrick Kimes
cy3FitEmpirical <- function(pe, refpe, assay = SummarizedExperiment::assayNames(pe)[1],
                            useMean = TRUE, standardize = TRUE, threshold = 1/2,
                            verbose = FALSE) {
    stopifnot(is(pe, "PBMExperiment")) 
    stopifnot(is(refpe, "PBMExperiment")) 

    ## check validity of Cy3 empirical reference
    stopifnot("ref" %in% SummarizedExperiment::assayNames(refpe))
    stopifnot("sfactor" %in% names(metadata(refpe)))
    stopifnot("params" %in% names(metadata(refpe)))
    stopifnot("probe_mad" %in% colnames(refpe))
    if (useMean) {
        stopifnot("probe_mean" %in% colnames(refpe))
    } else {
        stopifnot("probe_median" %in% colnames(refpe))
    }

    if (verbose) {
        cat("|| upbm::cy3FitEmpirical \n")
        cat("|| - Starting calculation of Cy3 deviations from empirical reference",
            "for", ncol(pe), "Cy3 PBM scans.\n")
    }

    if (verbose) {
        cat("|| - Filtering probes according to", length(pe@probeFilter),
            "probeFilter rule(s).\n")
        ntotal <- nrow(pe)
    }

    ## filter using rules
    npe <- pbmFilterProbes(pe)
    
    if (verbose) {
        cat("|| - Data filtered from", ntotal, "probes to", nrow(npe), "probes.\n")
    }

    ## determine overlap in probe columns
    ovnames <- intersect(npe@probeCols, refpe@probeCols)

    ## check that merging shouldn't be a problem
    if (length(ovnames) == 0L) {
        stop("rowData of 'pe' and 'refpe' do not have any matching probe design columns.\n",
             "Check that rowData contains sufficient probe design columns.\n",
             "'pe' rowData probeCols: ", paste0(pe@probeCols, collapse = ", "), "\n",
             "'refpe' rowData probeCols: ", paste0(refpe@probeCols, collapse = ", "))
    }

    ## determine row/probe metadata for Cy3 scans
    rowdat1 <- as.data.frame(rowData(npe), optional = TRUE)
    rowdat1 <- dplyr::as_tibble(rowdat1)
    rowdat1 <- dplyr::select(rowdat1, one_of(ovnames))

    ## check 'pe' rowData alone is alright
    if (nrow(rowdat1) > nrow(dplyr::distinct(rowdat1))) {
        stop("rowData of 'pe' cannot be used to uniquely identify probes.\n",
             "Check that rowData contain probe design columns ",
             "which are unique to each probe (after filtering).")
    }
    
    ## determine row/probe metadata for Cy3 reference
    rowdat2 <- as.data.frame(rowData(refpe), optional = TRUE)
    rowdat2 <- dplyr::as_tibble(rowdat2)
    rowdat2 <- dplyr::select(rowdat2, one_of(ovnames))

    ## check 'refpe' rowData alone is alright
    if (nrow(rowdat2) > nrow(dplyr::distinct(rowdat2))) {
        stop("rowData of 'refpe' cannot be used to uniquely identify probes.\n",
             "Check that rowData contain probe design columns ",
             "which are unique to each probe (after filtering).")
    }

    ## extract intensities for Cy3 scans
    pdat1 <- SummarizedExperiment::assay(npe, assay)
    pdat1 <- as.data.frame(pdat1, optional = TRUE)
    pdat1 <- dplyr::as_tibble(pdat1)
    pdat1 <- dplyr::bind_cols(pdat1, rowdat1)
    pdat1 <- tidyr::pivot_longer(pdat1, names_to = "condition",
                                 values_to = "intensity", colnames(npe))

    ## extract intensities for Cy3 reference
    pdat2 <- SummarizedExperiment::assay(refpe, "ref")
    pdat2 <- as.data.frame(pdat2, optional = TRUE)
    pdat2 <- dplyr::as_tibble(pdat2)
    pdat2 <- dplyr::bind_cols(pdat2, rowdat2)

    ## join observed and reference tables
    pdat <- dplyr::left_join(pdat1, pdat2, by = ovnames)

    if (nrow(pdat) < nrow(pdat1)) {
        stop("rowData of 'pe' and 'refpe' could not be used to fully match all probes in 'pe'.\n",
             "Check that rowData contains sufficient probe design columns.\n",
             "'pe' rowData probeCols: ", paste0(pe@probeCols, collapse = ", "), "\n",
             "'refpe' rowData probeCols: ", paste0(refpe@probeCols, collapse = ", "))
    }
    
    if (verbose) {
        cat("|| - Using reference mean intensity for calculation.\n")
    }

    ## set the probe reference 
    if (useMean) {
        pdat$probe_ref <- pdat$probe_mean
    } else {
        pdat$probe_ref <- pdat$probe_median
    }

    ## verify that most values aren't NAs
    pna <- mean(is.na(pdat$probe_ref))
    if (pna > 0.2) {
        stop("After merging observed and reference Cy3 intensities, > 20% of",
             "probes have NA reference intensities.\n",
             "Percent NAs = ", 100*round(pna, 4), "%.\n",
             "Please check 'refpe' again before proceeding.")
    } else if (pna > 0.01) {
        warning("After merging observed and reference Cy3 intensities, > 1% of",
                "probes have NA reference intensities.\n",
                "Percent NAs = ", 100*round(pna, 4), "%.\n")
    }

    ## register samples if Cy3 ref was computed with registered scans
    if (metadata(refpe)$params$register) {
        if (verbose) {
            cat("|| - Using registered Cy3 intensities for calculation.\n")
        }
        pdat <- dplyr::group_by(pdat, condition)
        pdat <- dplyr::mutate(pdat, sintensity = intensity / median(intensity, na.rm = TRUE))
        pdat <- dplyr::ungroup(pdat)
        pdat <- dplyr::mutate(pdat, sintensity = sintensity * metadata(refpe)$sfactor)
    } else {
        if (verbose) {
            cat("|| - Using unregistered Cy3 intensities for calculation.\n")
        }
        pdat <- dplyr::mutate(pdat, sintensity = intensity)
    }

    ## log2 tranform and compute ratios
    pdat <- dplyr::mutate(pdat, sintensity = log2(sintensity + metadata(refpe)$params$offset))
    pdat <- dplyr::mutate(pdat, pscores = sintensity - probe_ref)
    pdat <- dplyr::mutate(pdat, pratios = 2^pscores)

    ## scale threshold cutoff if standardize specified
    if (standardize) {
        if (verbose) {
            cat("|| - Using studentized/standardized deviations for filtering.\n")
        }
        median_mad <- median(SummarizedExperiment::assay(refpe, "ref")$probe_mad, na.rm = TRUE)
        pdat <- dplyr::mutate(pdat, pscores = pscores / probe_mad * median_mad)
    } else if (verbose) { 
        cat("|| - Using non-studentized/standardized deviations for filtering.\n")
    }

    ## identify outlier probes
    pdat <- dplyr::mutate(pdat, pdrop = abs(pscores) > threshold)
    
    ## create assays
    pexps <- dplyr::select(pdat, one_of(ovnames), condition, probe_ref)
    pexps <- dplyr::mutate(pexps, probe_ref = 2^probe_ref - metadata(refpe)$params$offset)
    pexps <- tidyr::pivot_wider(pexps, names_from = condition, values_from = probe_ref)
    pratios <- dplyr::select(pdat, one_of(ovnames), condition, pratios)
    pratios <- tidyr::pivot_wider(pratios, names_from = condition, values_from = pratios)
    pscores <- dplyr::select(pdat, one_of(ovnames), condition, pscores)
    pscores <- tidyr::pivot_wider(pscores, names_from = condition, values_from = pscores)
    plowq <- dplyr::select(pdat, one_of(ovnames), condition, pdrop)
    plowq <- tidyr::pivot_wider(plowq, names_from = condition, values_from = pdrop)

    ## left join to original rowData to get full set
    full_rowdat <- as.data.frame(rowData(pe), optional = TRUE)
    full_rowdat <- dplyr::as_tibble(full_rowdat)
    full_rowdat <- dplyr::select(full_rowdat, one_of(ovnames))

    pexps <- dplyr::left_join(dplyr::select(full_rowdat, one_of(ovnames)), pexps, by = ovnames)
    pratios <- dplyr::left_join(dplyr::select(full_rowdat, one_of(ovnames)), pratios, by = ovnames)
    pscores <- dplyr::left_join(dplyr::select(full_rowdat, one_of(ovnames)), pscores, by = ovnames)
    plowq <- dplyr::left_join(dplyr::select(full_rowdat, one_of(ovnames)), plowq, by = ovnames)

    pexps <- DataFrame(pexps, check.names = FALSE)
    pexps <- pexps[, rownames(colData(pe)), drop = FALSE]
    pratios <- DataFrame(pratios, check.names = FALSE)
    pratios <- pratios[, rownames(colData(pe)), drop = FALSE]
    pscores <- DataFrame(pscores, check.names = FALSE)
    pscores <- pscores[, rownames(colData(pe)), drop = FALSE]
    plowq <- DataFrame(plowq, check.names = FALSE)
    plowq <- plowq[, rownames(colData(pe)), drop = FALSE]

    ## add to PBMExperiment object
    SummarizedExperiment::assay(pe, "expected") <- pexps
    SummarizedExperiment::assay(pe, "ratio") <- pratios
    SummarizedExperiment::assay(pe, "scores") <- pscores
    SummarizedExperiment::assay(pe, "lowq") <- plowq

    if (verbose) {
        cat("|| - Finished calculation of Cy3 deviations.\n")
        cat("|| - Returning PBMExperiment with", nrow(pe), "rows and", ncol(pe), "columns.\n")
    }
    return(pe)
}