Nothing
#' @include classes.r
#' @include RCNA.r
#' @description This generic function is used to run to calculate and correct GC-content-based coverage bias
#'
#' @title correct_gc_bias: Estimate and correct GC bias in coverage
#' @param obj A \linkS4class{RCNA_object} class object, or a data.frame object that contains all the necessary parameters.
#' @param ... Other arguments
#' @details This function can be run as a stand-alone or as part of \link{run_RCNA}
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @seealso \linkS4class{RCNA_object}, \linkS4class{RCNA_analysis}, \link{run_RCNA}
#' @rdname correct_gc_bias.default
#' @export correct_gc_bias
correct_gc_bias <- function(obj, ...) UseMethod("correct_gc_bias")
#' Estimate and correct GC bias in the coverage data
#'
#' This function optionally estimates and then corrects the GC bias based on a GC-content factor file that is either generated or provided by the user using a sliding window approach. It creates a GC factor file and a corrected coverage file, both of which are placed in the output directory under `/gc`.
#'
#' @param df Path to the config file, or a `data.frame` object containing the valid parameters. Valid column names are `file.raw.coverage`, `file.gc.factor`, `file.corrected.coverage`, and `sample.names`. Additional columns will be ignored.
#' @param sample.names Character vector of sample names. Alternatively can be specified in `df`.
#' @param ano.file Location of the annotation file. This file must be in CSV format and contain the following information (with column headers as specified): "feature,chromosome,start,end".
#' @param out.dir Output directory for results. A subdirectory for results will be created under this + `/nkr/`.
#' @param ncpus Integer number of CPUs to use. Specifying more than one allows this function to be parallelized by feature.
#' @param file.raw.coverage Character vector listing the raw input coverage files. Must be the same length as `sample.names`. Alternatively can be specified in `df`.
#' @param file.corrected.coverage Character vector listing the corrected input coverage files. If not specified new names will be generated based on the raw coverage files.
#' @param file.gc.factor Character vector listing the GC factor files used to correct coverage. If `estimate_gc=FALSE` then this must be provided. Otherwise it is ignored.
#' @param win.size Size in base pairs of the sliding window used to estimate and correct the GC bias.
#' @param gc.step Bin size for GC bias in the GC factor file. If the GC factor file is provided then the file must have corresponding bin sizes.
#' @param estimate_gc Logical determining if GC content estimation should be performed. If set to `FALSE` then a factor file must be provided via `file.gc.factor` or in `df`.
#' @param verbose When TRUE increases level of detail for command output
#' @param obj An `RCNA_object` type created by \link{create_RCNA_object}. If specified this takes precedent over all over args.
#' @param ... Additional arguments
#' @examples
#' ## Run GC-bias estimation and correction on example object
#' # See \link{example_obj} for more information on example
#' example_obj@ano.file <- system.file("examples" ,"annotations-example.csv",
#' package = "RCNA")
#' raw.cov <- system.file("examples", "coverage",
#' paste0(example_obj@sample.names, ".txt.gz"), package = "RCNA")
#' example_obj@gcParams$file.raw.coverage <- raw.cov
#' \donttest{example_obj}
#' # Create output directory
#' dir.create(file.path("output", "gc"), recursive = TRUE)
#' # Estimate and correct GC bias, append results
#' correct_gc_analysisObj <- correct_gc_bias(example_obj)
#' example_obj@commands <- c(example_obj@commands, correct_gc_analysisObj)
#' \dontshow{system("rm -rf output")}
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @rdname correct_gc_bias.default
#' @import utils parallel doParallel
#' @details
#'
#' The `df` argument corresponds to the `gcParams` matrix on \linkS4class{RCNA_object}. Valid column names are `sample.names`, `file.raw.coverage`, `file.corrected.coverage`, and `file.gc.factor`. The `file.gc.factor` column is not required if `estimate_gc=TRUE`. Additional columns will be ignored.
#' @export
correct_gc_bias.default = function(obj = NULL, df = NULL, sample.names = NULL, ano.file, out.dir = NULL, ncpus = 1,
file.raw.coverage = NULL, file.corrected.coverage = NULL, file.gc.factor = NULL, win.size = 75, gc.step = 0.01, estimate_gc = TRUE,
verbose = FALSE, ...){
if(is.data.frame(obj) | is.character(obj)){
df = obj
} else if(!missing(obj)){
return(correct_gc_bias.RCNA_object(obj, verbose))
} else if(missing(df)){
if(verbose) message("No data.frame detected for correct_gc_bias. Constructing run from parameters instead")
if(!missing(sample.names) & !missing(ano.file)){
tryCatch({ # Attempt to create an RCNA object
new_RCNA_obj = create_RCNA_object(sample.names = sample.names, ano.file = ano.file, ncpu = ncpus, ...)
},
error = function(cond){
message("RCNA Object could not be created due to error:")
},
warning = function(cond){
message("Creating an RCNA object caused a warning:")
message(conditionMessage(cond))
})
} else if(missing(sample.names)){
stop("Missing both `df` argument and `sample.names` argument - please specify at least one.")
}
} else {
if(missing(sample.names)){
tryCatch({
if(verbose) message("Pulling sample.names from data.frame")
sample.names = df[["sample.names"]]
},
error = function(cond){
message("Error while populating sample names from data.frame:")
},
warning = function(cond){
message("Populating sample names produced a warning:")
message(conditionMessage(cond))
})
}
tryCatch({ # Attempt to create an RCNA object
new_RCNA_obj = create_RCNA_object(sample.names = sample.names, ano.file = ano.file, ncpu = ncpus, gcParams = df, ...)
},
error = function(cond){
message("RCNA Object could not be created due to error:")
},
warning = function(cond){
message("Creating an RCNA object caused a warning:")
message(conditionMessage(cond))
})
}
analysis.obj = correct_gc_bias.RCNA_object(obj = new_RCNA_obj,
verbose = verbose, estimate_gc = estimate_gc)
new_RCNA_obj@commands = analysis.obj
return(new_RCNA_obj)
}
#' Estimate and correct GC bias in the coverage data
#'
#' This function optionally estimates and then corrects the GC bias based on a GC-content factor file that is either generated or provided by the user using a sliding window approach. It creates a GC factor file and a corrected coverage file, both of which are placed in the output directory under `/gc`.
#'
#' @param obj A RCNA_object type object - parameters will be pulled from the object instead, specifically from the `gcParams` slot.
#' @param verbose If set to TRUE will display more detail
#' @param ... Additional arguments (unused)
#' @method correct_gc_bias RCNA_object
#' @details For more parameter information, see \link{estimate_nkr.default}.
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @import foreach parallel
#' @rdname correct_gc_bias.default
#' @export
correct_gc_bias.RCNA_object = function(obj, verbose = FALSE, ...){
config.df = obj@gcParams
gcdir_out = file.path(obj@out.dir, "gc")
win.size = obj@win.size
gc.step = obj@gc.step
estimate_gc = obj@estimate_gc
ncpus = obj@ncpu
if(!dir.exists(gcdir_out)){
if(verbose) message("Creating GC factor output directory at ", gcdir_out)
dir.create(gcdir_out, recursive = TRUE)
}
# Check if output GC content factor file names exist
if(estimate_gc){
if(!("file.gc.factor" %in% colnames(config.df))){
if(verbose) message("No output file column labeled `file.gc.factor` - defaulting to sample names. See `?correct_gc_bias()` for more information.")
config.df$file.gc.factor = file.path(gcdir_out, paste0(config.df$sample.names, ".gc.factor.txt"))
}
}else{
if("file.gc.factor" %in% colnames(config.df)){
if(!all(file.exists(config.df$file.gc.factor))) stop("Invalid GC factor file specified. The following files were specified but not found:", paste0(config.df[!file.exists(config.df$file.gc.factor), "file.gc.factor"], "\n"))
} else {
warning("No column labeled `file.gc.factor` yet estimate_gc = FALSE. Performing GC content estimation. See `?correct_gc_bias` for more information.")
config.df$file.gc.factor = file.path(gcdir_out, paste0(config.df$sample.names, ".gc.factor.txt"))
estimate_gc = TRUE
}
}
output_analysis_obj = c()
# Perform GC bias estimation if estimate_gc = TRUE
if(verbose) message("Beginning GC estimation")
if(estimate_gc){
estimate.params = config.df[,c("file.raw.coverage", "file.gc.factor")]
cl=parallel::makeCluster(ncpus)
registerDoParallel(cl)
foreach (x=1:nrow(estimate.params)) %dopar% {
do_estimate_gc_bias(win.size = win.size, gc.step = gc.step, infile = estimate.params[x,1], gc.factor.outfile = estimate.params[x,2], verbose = verbose)
}
parallel::stopCluster(cl)
output_analysis_obj = c(output_analysis_obj, new("RCNA_analysis", call = "estimate_gc_bias", params = estimate.params, res.files = list("file.gc.factor" = estimate.params[,2])))
}
# Perform GC bias correction
config.df = config.df[,c("file.raw.coverage", "file.corrected.coverage", "file.gc.factor", "sample.names")]
if(verbose) message("Calculating GC bias correction")
x <- 0
cl=parallel::makeCluster(ncpus)
registerDoParallel(cl)
foreach (x=1:nrow(config.df)) %dopar% {
do_correct_gc_bias(infile = config.df[x,1], outfile = config.df[x,2], gcfile = config.df[x,3], sample.name = config.df[x,4], win.size = win.size, verbose = verbose)
}
parallel::stopCluster(cl)
if(verbose) message("Feature score estimation succeeded!")
output_analysis_obj = c(output_analysis_obj, new("RCNA_analysis", call = "correct_gc_bias", params = list("win.size" = win.size, "gcParams" = config.df, "gc.step" = gc.step), res.files = list("file.corrected.coverage" = config.df[,c("file.gc.factor", "file.corrected.coverage")])))
return(output_analysis_obj)
}
do_correct_gc_bias = function(win.size = 75, gcfile, infile, outfile, sample.name, verbose = FALSE){
if(verbose) message(paste0("Correcting ", sample.name, "..."))
# Coerce arguments
win.size = as.numeric(win.size)
# Read data
all = read.table(infile)
# Search boundary of each region
reg.start.idx = 1
reg.end.idx = c()
for (i in 2:nrow(all)) {
if (all[i, 1] != all[i - 1, 1] || all[i, 2] - all[i - 1, 2] != 1) {
reg.end.idx = c(reg.end.idx, i - 1)
reg.start.idx = c(reg.start.idx, i)
}
}
reg.end.idx = c(reg.end.idx, nrow(all))
# Read GC bins
gc.bin = read.table(gcfile)
gc.bin = gc.bin[gc.bin[,2] > 0,]
# Calculate GC content for each position in each region
res = data.frame()
for (i in 1:length(reg.start.idx)) {
start.idx = reg.start.idx[i]
end.idx = reg.end.idx[i]
for (idx in start.idx : end.idx) {
win.start.idx = ifelse(idx - win.size > start.idx, idx - win.size, start.idx)
win.end.idx = ifelse(idx + win.size < end.idx, idx + win.size, end.idx)
# Calculate GC
ngc = 0
nseq = 0
for (s in all[win.start.idx : win.end.idx, 5]) {
nseq = nseq + 1
if (s == 'G' || s == 'C') {
ngc = ngc + 1
}
}
# Find nearest GC in bins
gc.idx = which.min(abs(gc.bin[,1] - ngc / nseq))
gc.factor = gc.bin[gc.idx,2]
# Correct based on GC factor
all[idx, 4] = all[idx, 4] / gc.factor
}
}
all[, 4] = round(all[, 4])
colnames(all) = c('chr', 'pos', 'target', 'coverage', 'seq')
# Output
write.table(all, outfile, sep = '\t', quote =FALSE, row.names =FALSE)
}
do_estimate_gc_bias = function(win.size = 75, infile, gc.factor.outfile, gc.step = 0.01, verbose = FALSE) {
if(verbose) message("Estimating GC bias for ", infile)
# Read data
all = read.table(infile)
# Coerce arguments
win.size = as.numeric(win.size)
gc.step = as.numeric(gc.step)
# Search boundary of each region
reg.start.idx = 1
reg.end.idx = c()
for (i in 2:nrow(all)) {
if (all[i, 1] != all[i - 1, 1] || all[i, 2] - all[i - 1, 2] != 1) {
reg.end.idx = c(reg.end.idx, i - 1)
reg.start.idx = c(reg.start.idx, i)
}
}
reg.end.idx = c(reg.end.idx, nrow(all))
# Create GC bins
gc = seq(0, 1, gc.step)
gc.bin = data.frame(gc, len = rep(0, length(gc)), dep = rep(0, length(gc)))
# Calculate GC content for each position in each region
for (i in 1:length(reg.start.idx)) {
start.idx = reg.start.idx[i]
end.idx = reg.end.idx[i]
for (idx in start.idx : end.idx) {
win.start.idx = ifelse(idx - win.size > start.idx, idx - win.size, start.idx)
win.end.idx = ifelse(idx + win.size < end.idx, idx + win.size, end.idx)
# Calculate GC
ngc = 0
nseq = 0
for (s in all[win.start.idx : win.end.idx, 5]) {
nseq = nseq + 1
if (s == 'G' || s == 'C') {
ngc = ngc + 1
}
}
# Record it in GC bins
gc.idx = which.min(abs(gc.bin$gc - ngc / nseq))
gc.bin$dep[gc.idx] = gc.bin$dep[gc.idx] + all[idx, 4]
gc.bin$len[gc.idx] = gc.bin$len[gc.idx] + 1
}
}
# Normalize to mean value
gc.bin$meandep = ifelse(gc.bin$len > 0, gc.bin$dep / gc.bin$len, 0)
gc.bin$r = gc.bin$meandep / (sum(gc.bin$dep) / sum(gc.bin$len))
gc.bin$gc = formatC(gc.bin$gc, format = 'f', digits = 2)
gc.bin$r = formatC(gc.bin$r, format = 'f', digits = 5)
write.table(gc.bin[,c('gc', 'r')], gc.factor.outfile, sep = '\t', quote =FALSE, col.names =FALSE, row.names =FALSE)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.