##' Join bin counts of the samples to analyze and compute some QC metrics to help
##' defining the set of reference samples.
##' @title Join and QC the reference samples
##' @param files.df a data.frame with the information about the files to
##' use. Columns 'sample' and 'bc.gc.bg' are required and should be
##' present after running 'initFileNames' function. Files should exist if
##' 'correct.GC' was run.
##' @param bin.df a data.frame with the information about the bins. Columns 'chr', 'start'
##' and 'end' are required.
##' @param outfile.prefix the prefix of the output file name. The suffix '.bgz' will
##' be appended if compressed ('appendIndex.outfile=TRUE').
##' @param ref.samples a vector with the names of the samples to use as reference.
##' @param nb.ref.samples the number of reference samples desired. If NULL, the size of 'ref.samples'.
##' @param plot should PCA graphs be outputed ? Default is TRUE.
##' @param appendIndex.outfile if TRUE (default), the results will be appended regularly on
##' the output file which will be ultimately compressed and indexed. This is recommend when a large number
##' of bins are analyzed. If FALSE, a data.frame with the bin counts will be returned and no
##' file are created.
##' @param chunk.size the number of bins to analyze at a time (for memory optimization).
##' Default is 100 000. Reduce this number if memory problems arise.
##' @param col.bc the column from 'files.df' defining the bin count file names.
##' @param nb.cores number of cores to use. If higher than 1, \code{parallel}
##' package is used to parallelize the counting.
##' @param median.norm Should the merged bin counts be median-normalized. Default is TRUE.
##' @return a list with
##' \item{bc}{the name of the file with the joined bin counts OR a data.frame with
##' these bin counts.}
##' \item{ref.samples}{a vector with the reference samples names.}
##' \item{cont.sample}{the name of the sample to use as control among the reference samples (for normalization).}
##' \item{pc.all.df}{a data.frame with the first 3 principal components for all input reference samples.}
##' \item{pc.ref.df}{a data.frame with the first 3 principal components for the final reference samples.}
##' @author Jean Monlong
##' @export
qc.samples <- function(files.df, bin.df, outfile.prefix, ref.samples = NULL, nb.ref.samples = NULL, plot = TRUE, appendIndex.outfile = TRUE, chunk.size = 1e+05, col.bc = "bc.gc.gz", nb.cores = 1, median.norm=TRUE) {
## Checks and arguments completion
if(!all(c("chr","start","end") %in% colnames(bin.df))){
stop("Missing column in 'bin.df'. 'chr', 'start' and 'end' are required.")
}
if(!all(c("sample", col.bc) %in% colnames(files.df))){
stop("Missing column in 'files.df'. 'sample', '",col.bc,"' are required.")
}
if (is.null(ref.samples)) {
ref.samples = as.character(files.df$sample)
}
if (is.null(nb.ref.samples)) {
nb.ref.samples = length(ref.samples)
}
## Flexible function to read bin counts on specific bins, several files, ...
read.bc.samples <- function(df, files.df, nb.cores = 1, med.med = NULL, med.samp = NULL,
file = NULL, append.f = FALSE, sub.bc = NULL) {
if (nb.cores > 1) {
bc.l = parallel::mclapply(as.character(files.df[, col.bc]), function(fi) {
res = read.bedix(fi, df,exact.match=TRUE)
}, mc.cores = nb.cores)
} else {
bc.l = lapply(as.character(files.df[, col.bc]), function(fi) {
res = read.bedix(fi, df,exact.match=TRUE)
})
}
if (is.null(med.med) & is.null(med.samp)) {
med.samp = lapply(bc.l, function(df) stats::median(df[,ncol(df)], na.rm = TRUE)) ## Normalization of the median coverage
med.med = stats::median(unlist(med.samp), na.rm = TRUE)
}
bc.l = lapply(1:nrow(files.df), function(samp.i) {
df = bc.l[[samp.i]]
df[,ncol(df)] = round(df[,ncol(df)] * med.med/med.samp[[samp.i]], 2)
colnames(df)[ncol(df)] = "bc"
df$sample = as.character(files.df$sample[samp.i])
df
})
bc.df = as.data.frame(data.table::rbindlist(bc.l))
bc.df = tidyr::spread(bc.df, "sample", "bc", fill=0)
bc.df = bc.df[order(as.character(bc.df$chr), bc.df$start, bc.df$end),]
if (!is.null(file)) {
utils::write.table(bc.df, file = file, quote = FALSE, row.names = FALSE, sep = "\t",
append = append.f, col.names = !append.f)
}
if (!is.null(sub.bc)) {
bc.df = bc.df[sample(1:nrow(bc.df), min(c(nrow(bc.df), sub.bc))), ]
}
return(bc.df)
}
center.pt <- function(m) {
dm = as.matrix(stats::dist(m))
diag(dm) = NA
rownames(m)[which.min(apply(dm, 2, mean, na.rm = TRUE))]
}
files.df = files.df[which(files.df$sample %in% ref.samples), ]
## Too many ref.samples
pc.all.df = bc.rand = NULL
if (length(ref.samples) > nb.ref.samples) {
sparse.pts <- function(m, nb.pts) {
dm = as.matrix(stats::dist(m))
diag(dm) = NA
pts.jj = 1:nrow(dm)
pts.ii = which.max(apply(dm, 1, max, na.rm = TRUE))
pts.jj = pts.jj[-pts.ii]
while (length(pts.ii) < nb.pts) {
m.ii = which.max(apply(dm[pts.ii, pts.jj, drop = FALSE], 2, min,
na.rm = TRUE))
pts.ii = c(pts.ii, pts.jj[m.ii])
pts.jj = pts.jj[-m.ii]
}
m[pts.ii, ]
}
## Get subset of bins
bc.rand = read.bc.samples(bin.df[sample.int(nrow(bin.df), min(c(nrow(bin.df)/2,1000))), ], files.df, med.med = 1, med.samp = rep(list(1), nrow(files.df)))
## PCA
bc.mv = medvar.norm.internal(bc.rand[, ref.samples])
pc.all = stats::prcomp(t(stats::na.exclude(bc.mv)))
pc.all.df = data.frame(pc.all$x[, 1:3])
pc.all.df$sample = ref.samples
sp.o = sparse.pts(pc.all$x[, 1:2], nb.ref.samples)
pc.all.df$reference = pc.all.df$sample %in% rownames(sp.o)
if (plot) {
PC1 = PC2 = reference = NULL ## Uglily appease R checks
print(ggplot2::ggplot(pc.all.df, ggplot2::aes(x = PC1, y = PC2, size = reference)) +
ggplot2::geom_point(alpha = 0.8) + ggplot2::theme_bw() + ggplot2::scale_size_manual(values = 2:3))
}
ref.samples = rownames(sp.o)
bc.rand = bc.rand[, c("chr", "start", "end", ref.samples)]
}
if(!median.norm){
med.med = 1
med.samp = rep(list(1), nrow(files.df))
} else {
med.med = med.samp = NULL
}
files.df = files.df[which(files.df$sample %in% ref.samples), ]
bin.df = bin.df[order(as.character(bin.df$chr), bin.df$start, bin.df$end),]
if (nrow(bin.df) < 1.3 * chunk.size) {
bc.df = read.bc.samples(bin.df, files.df, nb.cores, med.med, med.samp)
utils::write.table(bc.df, file = outfile.prefix, quote = FALSE, row.names = FALSE,
sep = "\t")
} else {
nb.chunks = ceiling(nrow(bin.df)/chunk.size)
bin.df$chunk = rep(1:nb.chunks, each = chunk.size)[1:nrow(bin.df)]
analyze.chunk <- function(df) {
ch.nb = as.numeric(df$chunk[1])
df.o = read.bedix(as.character(files.df[1, col.bc]), df,exact.match=TRUE)
read.bc.samples(df.o, files.df, nb.cores, med.med, med.samp, file = outfile.prefix,
append.f = ch.nb > 1, sub.bc = chunk.size/nb.chunks)
}
if (is.null(bc.rand)) {
bc.rand = read.bc.samples(bin.df[sample.int(nrow(bin.df), min(c(nrow(bin.df)/2,1000))), ], files.df, med.med = 1, med.samp = rep(list(1), nrow(files.df)))
}
if(median.norm){
med.samp = lapply(as.character(files.df$sample), function(samp.i) stats::median(bc.rand[,
samp.i], na.rm = TRUE)) ## Normalization of the median coverage
med.med = stats::median(unlist(med.samp), na.rm = TRUE)
}
bc.res = lapply(unique(bin.df$chunk), function(chunk.i) {
analyze.chunk(bin.df[which(bin.df$chunk == chunk.i), ])
})
bc.df = plyr::ldply(bc.res, identity)
}
## Compress and index
if (appendIndex.outfile) {
outfile.prefix = comp.index.files(outfile.prefix, rm.input=TRUE, overwrite.out=TRUE, reorder=any(order(bin.df$chr, bin.df$start, bin.df$end)!=1:nrow(bin.df)))
}
##
## QC
bc.mv = medvar.norm.internal(bc.df[, ref.samples])
if(any(is.infinite(bc.mv))){
bc.mv[which(is.infinite(bc.mv))] = NA
}
pc.ref = stats::prcomp(t(stats::na.exclude(bc.mv)))
pc.ref.df = data.frame(pc.ref$x[, 1:3])
pc.ref.df$sample = ref.samples
if (plot) {
PC1 = PC2 = NULL ## Uglily appease R checks
print(ggplot2::ggplot(pc.ref.df, ggplot2::aes(x = PC1, y = PC2)) + ggplot2::geom_point(alpha = 0.8) +
ggplot2::theme_bw())
}
cont.sample = center.pt(pc.ref$x[, 1:2])
bc.df = outfile.prefix
return(list(bc = bc.df, ref.samples = ref.samples, cont.sample = cont.sample,
pca.all.df = pc.all.df, pca.ref.df = pc.ref.df))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.