R/correct_gc_bias.r

Defines functions do_estimate_gc_bias do_correct_gc_bias correct_gc_bias.RCNA_object correct_gc_bias.default correct_gc_bias

Documented in correct_gc_bias correct_gc_bias.default correct_gc_bias.RCNA_object

#' @include classes.r
#' @include RCNA.r

#' @description This generic function is used to run to calculate and correct GC-content-based coverage bias
#'
#' @title correct_gc_bias: Estimate and correct GC bias in coverage
#' @param obj A \linkS4class{RCNA_object} class object, or a data.frame object that contains all the necessary parameters.
#' @param ... Other arguments
#' @details This function can be run as a stand-alone or as part of \link{run_RCNA}
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @seealso \linkS4class{RCNA_object}, \linkS4class{RCNA_analysis}, \link{run_RCNA}
#' @rdname correct_gc_bias.default
#' @export correct_gc_bias

correct_gc_bias <- function(obj, ...) UseMethod("correct_gc_bias")

#' Estimate and correct GC bias in the coverage data
#'
#' This function optionally estimates and then corrects the GC bias based on a GC-content factor file that is either generated or provided by the user using a sliding window approach. It creates a GC factor file and a corrected coverage file, both of which are placed in the output directory under `/gc`.
#'
#' @param df Path to the config file, or a `data.frame` object containing the valid parameters. Valid column names are `file.raw.coverage`, `file.gc.factor`, `file.corrected.coverage`, and `sample.names`. Additional columns will be ignored.
#' @param sample.names Character vector of sample names. Alternatively can be specified in `df`.
#' @param ano.file Location of the annotation file. This file must be in CSV format and contain the following information (with column headers as specified): "feature,chromosome,start,end".
#' @param out.dir Output directory for results. A subdirectory for results will be created under this + `/nkr/`.
#' @param ncpus Integer number of CPUs to use. Specifying more than one allows this function to be parallelized by feature.
#' @param file.raw.coverage Character vector listing the raw input coverage files. Must be the same length as `sample.names`. Alternatively can be specified in `df`.
#' @param file.corrected.coverage Character vector listing the corrected input coverage files. If not specified new names will be generated based on the raw coverage files.
#' @param file.gc.factor Character vector listing the GC factor files used to correct coverage. If `estimate_gc=FALSE` then this must be provided. Otherwise it is ignored.
#' @param win.size Size in base pairs of the sliding window used to estimate and correct the GC bias.
#' @param gc.step Bin size for GC bias in the GC factor file. If the GC factor file is provided then the file must have corresponding bin sizes.
#' @param estimate_gc Logical determining if GC content estimation should be performed. If set to `FALSE` then a factor file must be provided via `file.gc.factor` or in `df`.
#' @param verbose When TRUE increases level of detail for command output
#' @param obj An `RCNA_object` type created by \link{create_RCNA_object}. If specified this takes precedent over all over args.
#' @param ... Additional arguments
#' @examples
#' ## Run GC-bias estimation and correction on example object
#' # See \link{example_obj} for more information on example
#' example_obj@ano.file <- system.file("examples" ,"annotations-example.csv",
#'  package = "RCNA")
#' raw.cov <- system.file("examples", "coverage",
#'                        paste0(example_obj@sample.names, ".txt.gz"), package = "RCNA")
#' example_obj@gcParams$file.raw.coverage <- raw.cov
#' \donttest{example_obj}
#' # Create output directory
#' dir.create(file.path("output", "gc"), recursive = TRUE)
#' # Estimate and correct GC bias, append results
#' correct_gc_analysisObj <- correct_gc_bias(example_obj)
#' example_obj@commands <- c(example_obj@commands, correct_gc_analysisObj)
#' \dontshow{system("rm -rf output")}
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @rdname correct_gc_bias.default
#' @import utils parallel doParallel
#' @details
#'
#' The `df` argument corresponds to the `gcParams` matrix on \linkS4class{RCNA_object}. Valid column names are `sample.names`, `file.raw.coverage`, `file.corrected.coverage`, and `file.gc.factor`. The `file.gc.factor` column is not required if `estimate_gc=TRUE`. Additional columns will be ignored.
#' @export

correct_gc_bias.default = function(obj = NULL, df = NULL, sample.names = NULL, ano.file, out.dir = NULL, ncpus = 1,
		file.raw.coverage = NULL, file.corrected.coverage = NULL, file.gc.factor = NULL,  win.size = 75, gc.step = 0.01, estimate_gc = TRUE,
		verbose = FALSE, ...){

  if(is.data.frame(obj) | is.character(obj)){
    df = obj
  } else if(!missing(obj)){
    return(correct_gc_bias.RCNA_object(obj, verbose))
	} else if(missing(df)){
		if(verbose) message("No data.frame detected for correct_gc_bias. Constructing run from parameters instead")
		if(!missing(sample.names) & !missing(ano.file)){
			tryCatch({ # Attempt to create an RCNA object
					new_RCNA_obj = create_RCNA_object(sample.names = sample.names, ano.file = ano.file, ncpu = ncpus, ...)
					},
					error = function(cond){
						message("RCNA Object could not be created due to error:")
					},
					warning = function(cond){
						message("Creating an RCNA object caused a warning:")
						message(conditionMessage(cond))
					})
		} else if(missing(sample.names)){
			stop("Missing both `df` argument and `sample.names` argument - please specify at least one.")
		}
	} else {
		if(missing(sample.names)){
			tryCatch({
						if(verbose) message("Pulling sample.names from data.frame")
						sample.names = df[["sample.names"]]
					},
					error = function(cond){
						message("Error while populating sample names from data.frame:")
					},
					warning = function(cond){
						message("Populating sample names produced a warning:")
						message(conditionMessage(cond))
					})
		}

		tryCatch({ # Attempt to create an RCNA object
					new_RCNA_obj = create_RCNA_object(sample.names = sample.names, ano.file = ano.file, ncpu = ncpus, gcParams = df, ...)
				},
				error = function(cond){
					message("RCNA Object could not be created due to error:")
				},
				warning = function(cond){
					message("Creating an RCNA object caused a warning:")
					message(conditionMessage(cond))
		})
	}

	analysis.obj = correct_gc_bias.RCNA_object(obj = new_RCNA_obj,
			verbose = verbose, estimate_gc = estimate_gc)

	new_RCNA_obj@commands = analysis.obj
	return(new_RCNA_obj)
}
#' Estimate and correct GC bias in the coverage data
#'
#' This function optionally estimates and then corrects the GC bias based on a GC-content factor file that is either generated or provided by the user using a sliding window approach. It creates a GC factor file and a corrected coverage file, both of which are placed in the output directory under `/gc`.
#'
#' @param obj A RCNA_object type object - parameters will be pulled from the object instead, specifically from the `gcParams` slot.
#' @param verbose If set to TRUE will display more detail
#' @param ... Additional arguments (unused)
#' @method correct_gc_bias RCNA_object
#' @details For more parameter information, see \link{estimate_nkr.default}.
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @import foreach parallel
#' @rdname correct_gc_bias.default
#' @export

correct_gc_bias.RCNA_object = function(obj, verbose = FALSE, ...){
	config.df = obj@gcParams
	gcdir_out = file.path(obj@out.dir, "gc")
	win.size = obj@win.size
	gc.step = obj@gc.step
	estimate_gc = obj@estimate_gc
	ncpus = obj@ncpu

	if(!dir.exists(gcdir_out)){
		if(verbose) message("Creating GC factor output directory at ", gcdir_out)
		dir.create(gcdir_out, recursive = TRUE)
	}

	# Check if output GC content factor file names exist
	if(estimate_gc){
		if(!("file.gc.factor" %in% colnames(config.df))){
			if(verbose) message("No output file column labeled `file.gc.factor` - defaulting to sample names. See `?correct_gc_bias()` for more information.")
			config.df$file.gc.factor = file.path(gcdir_out, paste0(config.df$sample.names, ".gc.factor.txt"))
		}
	}else{
		if("file.gc.factor" %in% colnames(config.df)){
			if(!all(file.exists(config.df$file.gc.factor))) stop("Invalid GC factor file specified. The following files were specified but not found:", paste0(config.df[!file.exists(config.df$file.gc.factor), "file.gc.factor"], "\n"))
		} else {
			warning("No column labeled `file.gc.factor` yet estimate_gc = FALSE. Performing GC content estimation. See `?correct_gc_bias` for more information.")
			config.df$file.gc.factor = file.path(gcdir_out, paste0(config.df$sample.names, ".gc.factor.txt"))
			estimate_gc = TRUE
		}
	}

	output_analysis_obj = c()
	# Perform GC bias estimation if estimate_gc = TRUE
	if(verbose) message("Beginning GC estimation")
	if(estimate_gc){
		estimate.params = config.df[,c("file.raw.coverage", "file.gc.factor")]
		cl=parallel::makeCluster(ncpus)
		registerDoParallel(cl)
		foreach (x=1:nrow(estimate.params)) %dopar% {
			do_estimate_gc_bias(win.size = win.size, gc.step = gc.step, infile = estimate.params[x,1], gc.factor.outfile = estimate.params[x,2], verbose = verbose)
		}
		parallel::stopCluster(cl)
		output_analysis_obj = c(output_analysis_obj, new("RCNA_analysis", call = "estimate_gc_bias", params = estimate.params, res.files = list("file.gc.factor" = estimate.params[,2])))
	}

	# Perform GC bias correction
	config.df = config.df[,c("file.raw.coverage", "file.corrected.coverage", "file.gc.factor", "sample.names")]
	if(verbose) message("Calculating GC bias correction")
	x <- 0
	cl=parallel::makeCluster(ncpus)
	registerDoParallel(cl)
	foreach (x=1:nrow(config.df)) %dopar% {
		do_correct_gc_bias(infile = config.df[x,1], outfile = config.df[x,2], gcfile = config.df[x,3], sample.name = config.df[x,4], win.size = win.size, verbose = verbose)
	}
	parallel::stopCluster(cl)

	if(verbose) message("Feature score estimation succeeded!")
	output_analysis_obj = c(output_analysis_obj, new("RCNA_analysis", call = "correct_gc_bias", params = list("win.size" = win.size, "gcParams" = config.df, "gc.step" = gc.step), res.files = list("file.corrected.coverage" = config.df[,c("file.gc.factor", "file.corrected.coverage")])))
	return(output_analysis_obj)
}

do_correct_gc_bias = function(win.size = 75, gcfile, infile, outfile, sample.name, verbose = FALSE){

  if(verbose) message(paste0("Correcting ", sample.name, "..."))

	# Coerce arguments
	win.size = as.numeric(win.size)

	# Read data
	all = read.table(infile)

	# Search boundary of each region
	reg.start.idx = 1
	reg.end.idx = c()
	for (i in 2:nrow(all)) {
		if (all[i, 1] != all[i - 1, 1] || all[i, 2] - all[i - 1, 2] != 1) {
			reg.end.idx = c(reg.end.idx, i - 1)
			reg.start.idx = c(reg.start.idx, i)
		}
	}
	reg.end.idx = c(reg.end.idx, nrow(all))

	# Read GC bins
	gc.bin = read.table(gcfile)
	gc.bin = gc.bin[gc.bin[,2] > 0,]

	# Calculate GC content for each position in each region
	res = data.frame()
	for (i in 1:length(reg.start.idx)) {
		start.idx = reg.start.idx[i]
		end.idx = reg.end.idx[i]
		for (idx in start.idx : end.idx) {
			win.start.idx = ifelse(idx - win.size > start.idx, idx - win.size, start.idx)
			win.end.idx = ifelse(idx + win.size < end.idx, idx + win.size, end.idx)
			# Calculate GC
			ngc = 0
			nseq = 0
			for (s in all[win.start.idx : win.end.idx, 5]) {
				nseq = nseq + 1
				if (s == 'G' || s == 'C') {
					ngc = ngc + 1
				}
			}

			# Find nearest GC in bins
			gc.idx = which.min(abs(gc.bin[,1] - ngc / nseq))
			gc.factor = gc.bin[gc.idx,2]

			# Correct based on GC factor
			all[idx, 4] = all[idx, 4] / gc.factor
		}
	}

	all[, 4] = round(all[, 4])
	colnames(all) = c('chr', 'pos', 'target', 'coverage', 'seq')

	# Output
	write.table(all, outfile, sep = '\t', quote =FALSE, row.names =FALSE)
}

do_estimate_gc_bias = function(win.size = 75, infile, gc.factor.outfile, gc.step = 0.01, verbose = FALSE) {

  if(verbose) message("Estimating GC bias for ", infile)
	# Read data
	all = read.table(infile)

	# Coerce arguments
	win.size = as.numeric(win.size)
	gc.step = as.numeric(gc.step)

	# Search boundary of each region
	reg.start.idx = 1
	reg.end.idx = c()
	for (i in 2:nrow(all)) {
		if (all[i, 1] != all[i - 1, 1] || all[i, 2] - all[i - 1, 2] != 1) {
			reg.end.idx = c(reg.end.idx, i - 1)
			reg.start.idx = c(reg.start.idx, i)
		}
	}
	reg.end.idx = c(reg.end.idx, nrow(all))

	# Create GC bins
	gc = seq(0, 1, gc.step)
	gc.bin = data.frame(gc, len = rep(0, length(gc)), dep = rep(0, length(gc)))

	# Calculate GC content for each position in each region
	for (i in 1:length(reg.start.idx)) {
		start.idx = reg.start.idx[i]
		end.idx = reg.end.idx[i]
		for (idx in start.idx : end.idx) {
			win.start.idx = ifelse(idx - win.size > start.idx, idx - win.size, start.idx)
			win.end.idx = ifelse(idx + win.size < end.idx, idx + win.size, end.idx)

			# Calculate GC
			ngc = 0
			nseq = 0
			for (s in all[win.start.idx : win.end.idx, 5]) {
				nseq = nseq + 1
				if (s == 'G' || s == 'C') {
					ngc = ngc + 1
				}
			}

			# Record it in GC bins
			gc.idx = which.min(abs(gc.bin$gc - ngc / nseq))
			gc.bin$dep[gc.idx] = gc.bin$dep[gc.idx] + all[idx, 4]
			gc.bin$len[gc.idx] = gc.bin$len[gc.idx] + 1
		}
	}

	# Normalize to mean value
	gc.bin$meandep = ifelse(gc.bin$len > 0, gc.bin$dep / gc.bin$len, 0)
	gc.bin$r = gc.bin$meandep / (sum(gc.bin$dep) / sum(gc.bin$len))

	gc.bin$gc = formatC(gc.bin$gc, format = 'f', digits = 2)
	gc.bin$r = formatC(gc.bin$r, format = 'f', digits = 5)

	write.table(gc.bin[,c('gc', 'r')], gc.factor.outfile, sep = '\t', quote =FALSE, col.names =FALSE, row.names =FALSE)
}

Try the RCNA package in your browser

Any scripts or data that you put into this service are public.

RCNA documentation built on April 3, 2025, 6:03 p.m.