Nothing
#' Process coverageBed reports
#'
#' @description
#' Process the coverage reports generated by bedtools coverage tool.
#'
#' @inheritParams get.coverage.by.sample.statistics
#'
#' @return final.statistics data frame of coverage statistics generated by parsing through coverage reports
process.coverage.reports <- function(project.directory) {
# TO DO:
# - add tests for reports having expected format
# - ask Ros what the "cumulative" coverage numbers are supposed to mean
coverage.report.paths <- system.ls(pattern = "*/*all.coverage.report", directory = project.directory, error = TRUE);
sample.ids <- extract.sample.ids(coverage.report.paths, from.filename = TRUE);
single.sample <- !(length(sample.ids) > 1)
# store mean and median coverage per patient
mean.median.by.sample <- list();
# initialize data frame to store coverage data for all samples
merged.coverage.data <- data.frame();
### PROCESS EACH SAMPLE
for(i in seq_along(coverage.report.paths)) {
path <- coverage.report.paths[i];
sample.id <- sample.ids[i];
# generated from coverageBed, based on all target regions
# 1)
# 2) depth
# 3) no. of bases at depth
# 4) size of A
# 5) % of A at depth
coverage.data <- utils::read.delim(
path,
header = FALSE,
stringsAsFactors = FALSE
);
depth.values <- coverage.data[, 2];
depth.frequencies <- coverage.data[, 3];
# data frame for merging with all other patients
patient.coverage <- data.frame(depth.values, depth.frequencies);
names(patient.coverage) <- c('depth', sample.id);
# get the median coverage for the sample
median.coverage <- tabular.median(
values = depth.values,
frequencies = depth.frequencies
);
# get the mean coverage for each sample
mean.coverage <- tabular.mean(
values = depth.values,
frequencies = depth.frequencies
);
mean.median.by.sample[[ sample.id ]] <- data.frame(
"sample.id" = sample.id,
"mean.coverage" = mean.coverage,
"median.coverage" = median.coverage
);
# merge with full data frame
if( 0 == nrow(merged.coverage.data) ) {
merged.coverage.data <- patient.coverage;
} else {
merged.coverage.data <- merge(
merged.coverage.data,
patient.coverage,
by.x = "depth",
by.y = "depth",
all = TRUE
);
}
}
### POST-PROCESSING
mean.median.by.sample <- do.call(rbind, mean.median.by.sample);
merged.coverage.data$bin <- cut(
merged.coverage.data$depth,
breaks = c(-Inf, 1, 10, 20, 30, 40, Inf),
labels = c("0", "1-10", "10-20", "20-30", "30-40", "40+")
# labels = c(0, 1, 10, 20, 30, 40)
);
# for each patient, get proportion of frequencies falling within each depth category
if (!single.sample){
coverage.statistics <- apply(
merged.coverage.data[, 2:(ncol(merged.coverage.data) - 1)],
2,
FUN = function(x, coverage.bin) {
tapply(x, coverage.bin, sum, na.rm = TRUE )/ sum(x, na.rm = TRUE);
},
coverage.bin = merged.coverage.data$bin
);
} else {
coverage.statistics <- data.frame(sample.ids=tapply(merged.coverage.data[, 2], merged.coverage.data$bin, sum, na.rm = TRUE) / sum(merged.coverage.data[ ,2], na.rm = TRUE))
}
# for all categories except the first one, get the proportion of frequencies falling into that category or higher
# - first category (coverage zero) is still just the proportion falling in that category
for( i in 2:(nrow(coverage.statistics)-1) ) {
if (!single.sample) {
coverage.statistics[i,] <- apply(coverage.statistics[i:nrow(coverage.statistics),], 2, sum, na.rm = TRUE)
} else {
coverage.statistics[i,] <- sum(coverage.statistics[i:nrow(coverage.statistics),], na.rm = TRUE)
}
}
# transpose
#if (!single.sample) {
names(coverage.statistics) <- sample.ids;
coverage.statistics <- t(coverage.statistics);
#} else {
# rownames(coverage.statistics) <- sample.ids
#}
# add mean/ median per sample
# make sure they're ordered the same way – anything else will lead to disappointment
if (!single.sample) {
mean.median.by.sample <- mean.median.by.sample[rownames(coverage.statistics), ];
}
print(mean.median.by.sample);
print(coverage.statistics);
# sanity check
if( !identical(rownames(coverage.statistics), rownames(mean.median.by.sample) ) ) {
stop("coverage.statistics and mean.median.by.sample do not appear to be in the same order. Please investigate.");
}
# assemble final data frame
final.statistics <- cbind(
coverage.statistics,
mean.median.by.sample
);
return(final.statistics);
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.