R/tnK.norm.R

##' Bin count targeted normalization using K-means clustering to group bins with a similar coverage profile across the samples. For outlier bins, i.e. bins showing a unique profile, the set of most similar bins is computed for each bin. Optionally, the importance of each sample in the definition of the coverage profile can be weighted using principal components. This normalization is still in development. It is supposed to be faster than the full targeted normalization ('tn.norm') and to better normalize outlier samples (e.g. when the data is not homogeneous).
##' @title Weighted targeted normalization using K-means optimization.
##' @param bc.ref a data.frame with the coverage in the reference samples.
##' @param samp the name of the sample to normalize.
##' @param bc.to.norm if non-null and 'samp' is not in 'bc.ref', this data.frame is used. It should be a coverage data.frame for a single sample, i.e. with exactly these columns: chr, start, end and bc.
##' @param cont.sample the name of the control sample to normalize the coverage to. 
##' @param pca.weights should the samples be weighted using principal components? Default is FALSE.
##' @param max.size the maximum size of a cluster of bins.
##' @param plot should some graphs be displayed? Default is FALSE.
##' @return a data.frame with the normalized bin counts, with columns: chr, start, end and bc.
##' @author Jean Monlong
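##' @examples
##' \dontrun{
##' ## Hypothetical usage sketch: 'bc.df' is assumed to be a bin count
##' ## data.frame with columns chr, start, end, and one column of raw
##' ## counts per sample (here 'samp1', 'samp2', ...).
##' bc.norm = tnK.norm(bc.df, samp = "samp1", cont.sample = "samp2")
##' ## With PCA-based sample weighting:
##' bc.norm.w = tnK.norm(bc.df, samp = "samp1", cont.sample = "samp2",
##'     pca.weights = TRUE)
##' }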
##' @export
tnK.norm <- function(bc.ref, samp, bc.to.norm = NULL, cont.sample, pca.weights = FALSE, 
    max.size = 1000, plot = FALSE) {
    nb.pcs = 2  ## number of principal components used for the sample weights
    k = 2  ## number of clusters at each K-means split
    
    if (all(cont.sample != colnames(bc.ref))) {
        stop("'cont.sample' is not in 'bc.ref'")
    }
    
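    ## norm.mat: rescale the counts of column 'col.id' over the bins in
    ## 'row.ids' so that their median matches the median of the reference
    ## column 'ref.col.id' over the same bins.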
    norm.mat <- function(bc, row.ids, col.id, ref.col.id = 1) {
        col.bc = bc[row.ids, col.id]
        bc.out = col.bc * stats::median(bc[row.ids, ref.col.id])/stats::median(col.bc)
        names(bc.out) = rownames(bc)[row.ids]
        return(bc.out)
    }
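    ## norm.outlier: for each outlier bin, find the 'nb.support.bins' bins
    ## with the closest (weighted) coverage profile and use them as the
    ## support set to normalize the bin.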
    norm.outlier <- function(bc, id.bins, bc.n, bins, samp, cont.sample, nb.support.bins = 500) {
        res.v = sapply(bins, function(bin) {
            d.b = rowSums((bc.n - matrix(bc.n[bin, ], byrow = TRUE, nrow(bc.n), ncol(bc.n)))^2)
            bins.support = names(d.b)[order(d.b)[1:nb.support.bins]]
            samp.bc.n = norm.mat(bc, id.bins[bins.support], colnames(bc) == samp, 
                colnames(bc) == cont.sample)
            return(samp.bc.n[bin])
        })
        names(res.v) = bins
        return(res.v)
    }
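    ## rec.kmeans: recursively split the bins with K-means until every
    ## cluster contains at most 'max.size' bins; returns a list of vectors
    ## of bin names.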
    rec.kmeans <- function(mat, k = 2, max.size = 100) {
        km.o = stats::kmeans(mat, k)
        res.l = list()
        for (kid in 1:k) {
            if (km.o$size[kid] > max.size) {
                res.l = c(res.l, rec.kmeans(mat[km.o$cluster == kid, ], k = k, max.size = max.size))
            } else {
                res.l = c(res.l, list(rownames(mat)[km.o$cluster == kid]))
            }
        }
        res.l
    }
    
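    ## Build unique bin identifiers; if 'samp' is not in 'bc.ref', scale the
    ## external sample in 'bc.to.norm' to the reference coverage level and
    ## append it as the first column.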
    bc.rn = with(bc.ref, paste(chr, as.integer(start), as.integer(end), sep = "-"))
    samples = setdiff(colnames(bc.ref), c("chr", "start", "end"))
    bc.out = bc.ref[, c("chr", "start", "end")]
    if (all(samp != colnames(bc.ref)) && !is.null(bc.to.norm)) {
        id.bins = 1:nrow(bc.to.norm)
        names(id.bins) = with(bc.to.norm, paste(chr, as.integer(start), as.integer(end), 
            sep = "-"))
        med.bc = stats::median(apply(bc.ref[, samples], 2, stats::median, na.rm = TRUE))
        med.samp = stats::median(bc.to.norm[, 4], na.rm = TRUE)
        bc.ref = cbind(bc.to.norm[id.bins[bc.rn], 4] * med.bc/med.samp, bc.ref[, 
            samples])
        colnames(bc.ref)[1] = samp
    } else {
        bc.ref = bc.ref[, samples]
    }
    
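    ## Work on a numeric matrix with the bin identifiers as row names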
    bc.ref = as.matrix(bc.ref)
    id.bins = 1:nrow(bc.ref)
    names(id.bins) = row.names(bc.ref) = bc.rn
    samples = colnames(bc.ref)
    
    ## PCA: weight each reference sample by its proximity to 'samp' in
    ## principal component space (closer samples get a higher weight)
    if (pca.weights) {
        bc.rand = bc.ref[sample.int(nrow(bc.ref), min(10000, nrow(bc.ref))), ]
        pca.o = stats::prcomp(t(bc.rand[apply(bc.rand, 1, function(ee) all(!is.na(ee))), 
            ]))
        pca.d = stats::dist(pca.o$x[samples, 1:nb.pcs])
        w.i = as.matrix(pca.d)[samples == samp, ]
        w.i = 1 - w.i/max(w.i)
    } else {
        w.i = rep(1, length(samples))
    }
    w.i[samples == samp] = 0
    
    ## Transform bin counts for geometric distance computation: scale each
    ## bin by its 75th percentile across samples and weight the samples
    quant.c = apply(bc.ref, 1, stats::quantile, probs = 0.75)
    bc.n = (bc.ref/quant.c) %*% diag(w.i)
    colnames(bc.n) = samples
    quant.c.non.null = which(quant.c > 0)
    bc.n = bc.n[quant.c.non.null, ]
    
    ## Recursive K-means to group bins with similar coverage profiles
    bins.km = rec.kmeans(bc.n, k = k, max.size = max.size)
    if (plot) {
        km.size = unlist(lapply(bins.km, length))
        print(ggplot2::qplot(km.size) + ggplot2::ylab("number of clusters") + 
            ggplot2::xlab("cluster size") + ggplot2::theme_bw())
    }
    
    ## Normalize each cluster; very small clusters (< 10% of 'max.size')
    ## are treated as outlier bins and normalized with a bin-specific support set
    bc.samp.n = unlist(lapply(bins.km, function(bins) {
        if (length(bins) < max.size * 0.1) {
            return(norm.outlier(bc.ref, id.bins, bc.n, bins, samp, cont.sample))
        } else {
            return(norm.mat(bc.ref, id.bins[bins], which(samples == samp), ref.col.id = which(samples == 
                cont.sample)))
        }
    }))
    
    bc.out$bc = rep(NA, nrow(bc.ref))
    bc.out$bc[id.bins[names(bc.samp.n)]] = bc.samp.n
    return(bc.out)
} 