R/functions_normalization.R
In seqsetvis: Set Based Visualizations for Next-Gen Sequencing Data

Documented in append_ynorm calc_norm_factors

#' calc_norm_factors
#'
#' Calculate normalization factors in a two step process:
#'
#' 1) summarize every region for each sample (default summary function is max)
#'
#' 2) caclulate a value to cap each sample to based on regions (default is 95th
#' quantile).
#'
#' The uderlying assumption here is that meaningful enrichment is present at the
#' majority of regions provided.  If prevalence varies by a specific factor, say
#' ChIP-seq targets with different characteristics - ie. when analyzing TSSes
#' for H3K4me3 and an infrequent transcription factor it is more appropriate to
#' specify appropriate quantile cutoffs per factor.
#'
#' @param full_dt a data.table, as returned by ssvFetch*(..., return_data.table.
#'   = TRUE)
#' @param value_ character, attribute in full_dt to normalzie.
#' @param cap_value_ character, new attribute name specifying values to cap to.
#' @param by1 character vector, specifies attributes relevant to step 1.
#' @param by2 character vector, specifies attributes relevant to step 1 and 2.
#' @param aggFUN1 function called on value_ with by = c(by1, by2) in step 1.
#' @param aggFUN2 function called on result of aggFUN1 with by = by2 in step 2.
#'
#' @return data.table mapping by2 to cap_value_.
#' @export
#'
#' @examples
#' calc_norm_factors(CTCF_in_10a_profiles_dt)
#' calc_norm_factors(CTCF_in_10a_profiles_dt,
#'   aggFUN1 = mean, aggFUN2 = function(x)quantile(x, .5))
calc_norm_factors = function(full_dt,
                             value_ = "y",
                             cap_value_ = "y_cap_value",
                             by1 = "id",
                             by2 = "sample",
                             aggFUN1 = max,
                             aggFUN2 = function(x)quantile(x, .95)
){
    value_summary_ = NULL #reserve binding for data.table
    stopifnot(data.table::is.data.table(full_dt))
    stopifnot(value_ %in% colnames(full_dt))
    stopifnot(by1 %in% colnames(full_dt))
    stopifnot(by2 %in% colnames(full_dt))
    cap_dt = full_dt[, list(value_summary_ = aggFUN1(get(value_))), c(by1, by2)][, list(value_capped_ = aggFUN2(value_summary_)), c(by2)]
    data.table::setnames(cap_dt, "value_capped_", cap_value_)
    cap_dt[]
}


#' append_ynorm
#'
#' see \code{\link{calc_norm_factors}} for normalization details.
#'
#' @param full_dt a data.table, as returned by ssvFetch*(..., return_data.table
#'   = TRUE).
#' @param value_ character, attribute in full_dt to normalzie.
#' @param cap_value_ character, new attribute name specifying values to cap to.
#' @param norm_value_ character, new attribute name specifying normalized values.
#' @param by1 character vector, specifies attributes relevant to step 1.
#' @param by2 character vector, specifies attributes relevant to step 1 and 2.
#' @param aggFUN1 function called on value_ with by = c(by1, by2) in step 1.
#' @param aggFUN2 function called on result of aggFUN1 with by = by2 in step 2.
#' @param cap_dt optionally, provide user generated by2 to cap_value_ mapping
#' @param do_not_cap if TRUE, normalized values are not capped to 1. Default is FALSE.
#' @param force_append if TRUE, any previous cap_value or norm_value is overridden. Default is FALSE.
#'
#' @return data.table, full_dt with cap_value_ and norm_value_ values appended.
#' @export
#'
#' @examples
#' append_ynorm(CTCF_in_10a_profiles_dt)
#' append_ynorm(CTCF_in_10a_profiles_dt,
#'   aggFUN1 = mean, aggFUN2 = function(x)quantile(x, .5))
append_ynorm = function(full_dt,
                           value_ = "y",
                           cap_value_ = "y_cap_value",
                           norm_value_ = "y_norm",
                           by1 = "id",
                           by2 = "sample",
                           aggFUN1 = max,
                           aggFUN2 = function(x)quantile(x, .95),
                           cap_dt = NULL,
                           do_not_cap = FALSE,
                        force_append = FALSE
){
    stopifnot(data.table::is.data.table(full_dt))
    stopifnot(value_ %in% colnames(full_dt))
    stopifnot(by1 %in% colnames(full_dt))
    stopifnot(by2 %in% colnames(full_dt))
    if(force_append){
        full_dt[[cap_value_]] = NULL
        full_dt[[norm_value_]] = NULL
    }
    stopifnot(!cap_value_ %in% colnames(full_dt))
    stopifnot(!norm_value_ %in% colnames(full_dt))
    if(is.null(cap_dt)){
        cap_dt = calc_norm_factors(full_dt, value_, cap_value_, by1, by2, aggFUN1, aggFUN2)
    }else{
        stopifnot(cap_value_ %in% colnames(cap_dt))
        stopifnot(by2 %in% colnames(cap_dt))
    }

    full_dt = merge(full_dt, cap_dt, by = by2)
    data.table::set(full_dt, NULL, norm_value_, full_dt[[value_]])
    if(!do_not_cap){
        full_dt[get(norm_value_) > get(cap_value_), (norm_value_) := get(cap_value_)]
    }
    full_dt[, (norm_value_) := get((norm_value_)) / get(cap_value_)]
    full_dt[]
}