# rBiasCorrection: Correct Bias in Quantitative DNA Methylation Analyses.
# Copyright (C) 2019-2025 Lorenz Kapsner
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#' @title Correct PCR-Bias in Quantitative DNA Methylation Analyses.
#'
#' @description This function implements the algorithms described by
#' Moskalev et. al in their article 'Correction of PCR-bias in quantitative
#' DNA methylation studies by means of cubic polynomial regression',
#' published 2011 in Nucleic acids research, Oxford University Press
#' (\doi{10.1093/nar/gkr213}).
#'
#' @param experimental A character string. Path to the file containing the raw
#' methylation values of the samples under investigation.
#' @param calibration A character string. In type 1 data (one locus in many
#' samples, e.g. pyrosequencing data):
#' Path to the file containing the raw methylation values of the calibration
#' samples.
#' In type 2 data (many loci in one sample, e.g. next-generationsequencing
#' data or microarray data):
#' Path to the folder that contains at least 4 calibration files (one file
#' per calibration step). Please refer to the FAQ for more detailed
#' information on the specific file requirements (\url{
#' https://raw.githubusercontent.com/kapsner/PCRBiasCorrection/master/FAQ.md
#' }).
#' @param samplelocusname A character string. In type 1 data: locus name -
#' name of the gene locus under investigation. In type 2 data: sample name -
#' name of the sample under investigation.
#' @param minmax A logical, indicating which equations are used for
#' BiasCorrection (default: FALSE). If TRUE, equations are used that include
#' the respective minima and maxima of the provided data.
#' @param correct_method A character string. Method used to correct the PCR-
#' bias of the samples under investigation. One of "best" (default),
#' "hyperbolic" or "cubic". If the method is set to "best" (short: "b"),
#' the algorithm will automatically determine the best fitting type of
#' regression for each CpG site based on \emph{selection_method} (by
#' default: sum of squared errors, SSE,
#' \url{https://en.wikipedia.org/wiki/Residual_sum_of_squares}). If the
#' method is set to "hyperbolic" (short: "h") or "cubic" (short: "c"), the
#' PCR-bias correction of all samples under investigation will be performed
#' with the hyperbolic or the cubic regression respectively.
#' @param selection_method A character string. The method used to select the
#' regression algorithm to correct the respective CpG site. This is by
#' default the sum of squared errors ("SSE"). The second option is
#' "RelError", which selects the regression method based on the theoretical
#' relative error after correction. This metric is calculated by correcting
#' the calibration data with both the hyperbolic regression and the cubic
#' regression and using them again as input data to calculate the 'goodness
#' of fit'-metrics.
#' @param type A single integer. Type of data to be corrected: either `1L` (one
#' locus in many samples, e.g. pyrosequencing data) or `2L` (many loci in one
#' sample, e.g. next-generation sequencing data or microarray data).
#' @param csvdir A character string. Directory to store the resulting tables.
#' (default = paste0(tempdir(), "/plotdir/")). CAUTION: This directory will
#' be newly created on every call of the function - any preexisting files will
#' be deleted without a warning.
#' @param plotdir A character string. Directory to store the resulting plots
#' (default = paste0(tempdir(), "/plotdir/")). CAUTION: This directory will
#' be newly created on every call of the function - any preexisting files will
#' be deleted without a warning.
#' @param logfilename A character string. Path to a file to save the log
#' messages (default = paste0(tempdir(), "/log.txt")).
#' @param seed A integer value. The seed used when solving the unknowns in the
#' hyperbolic regression equation and the cubic regression equation.
#' Important for reproducibility (default: 1234).
#' @param parallel A boolean. If TRUE (the default value), initializing
#' `future::plan("multicore")` (on unix systems) or
#' `future::plan("multisession")` (on non-unix systems) before running the
#' code.
#'
#' @inheritParams createbarerrorplots
#'
#' @return This function is a wrapper around all of `rBiasCorrection`'s
#' included functions. When executing it, it performs the whole workflow of
#' bias correction and writes resulting csv-files and plots, as well as a
#' log file to the local file system (the respective directories can be
#' specified with the function arguments). The return-value is TRUE, if the
#' correction of PCR measurement biases succeeds. If the correction fails,
#' an error message is returned.
#'
#' @examples
#' \donttest{
#' data.table::fwrite(
#' rBiasCorrection::example.data_experimental$dat,
#' paste0(tempdir(), "/experimental_data.csv")
#' )
#' data.table::fwrite(
#' rBiasCorrection::example.data_calibration$dat,
#' paste0(tempdir(), "/calibration_data.csv")
#' )
#' experimental <- paste0(tempdir(), "/experimental_data.csv")
#' calibration <- paste0(tempdir(), "/calibration_data.csv")
#'
#' results <- biascorrection(
#' experimental = experimental,
#' calibration = calibration,
#' samplelocusname = "BRAF",
#' parallel = FALSE
#' )
#' }
#'
#' @export
biascorrection <- function(experimental,
calibration,
samplelocusname,
minmax = FALSE,
correct_method = c("best", "hyperbolic", "cubic"),
selection_method = c("SSE", "RelError"),
type = 1L,
csvdir = paste0(tempdir(), "/csvdir/"),
plotdir = paste0(tempdir(), "/plotdir/"),
logfilename = paste0(tempdir(), "/log.txt"),
plot_height = 5,
plot_width = 7.5,
plot_textsize = 16,
seed = 1234,
parallel = TRUE) {
correct_method <- match.arg(correct_method)
selection_method <- match.arg(selection_method)
type <- as.integer(type)
stopifnot(
"`experimental` must be of type character" = is.character(experimental),
"`calibration` must be of type character" = is.character(calibration),
"`samplelocusname` must be of type character" =
is.character(samplelocusname),
"`type` must be an integer of `1L` or `2L`" = is.integer(type) &&
type == 1L || type == 2L,
"`minmax` must be a boolean" = is.logical(minmax),
"`csvdir` must be of type character" = is.character(csvdir),
"`plotdir` must be of type character" = is.character(plotdir),
"`logfilename` must be of type character" = is.character(logfilename),
"`plot_height` must be of type numeric" = is.numeric(plot_height),
"`plot_width` must be of type numeric" = is.numeric(plot_width),
"`plot_textsize` must be of type numeric" = is.numeric(plot_textsize),
"`seed` must be of type numeric" = is.numeric(seed),
"`parallel` must be a boolean" = is.logical(parallel)
)
# fix directories to work with all functions
# therefore we need a "/" at the end of the dir-string
plotdir <- gsub("([[:alnum:]])$", "\\1/", plotdir)
csvdir <- gsub("([[:alnum:]])$", "\\1/", csvdir)
# initialize some stuff
on_start(plotdir, csvdir, logfilename, parallel)
# initialize our list for reactive values
rv <- list()
# min-max hard coded to minmax = FALSE
# (minmax = TRUE is experimental and under development)
rv$minmax <- minmax
rv$selection_method <- selection_method
# save locusname
rv$sample_locus_name <- handle_text_input(samplelocusname)
# load data
if (type == 1) {
# experimental data
rv$fileimport_experimental <- clean_dt(
data.table::fread(experimental, header = TRUE),
"experimental",
type = 1,
logfilename = logfilename
)[["dat"]]
# write raw data to file
write_csv(table = rv$fileimport_experimental,
filename = paste0(csvdir, "raw_experimental_data.csv"))
# calibration data
cal_type_1 <- clean_dt(data.table::fread(calibration, header = TRUE),
"calibration", type = 1,
logfilename = logfilename)
rv$fileimport_calibration <- cal_type_1[["dat"]]
# write raw data to file
write_csv(table = rv$fileimport_calibration,
filename = paste0(csvdir, "raw_calibration_data.csv"))
# write names of columns to rv
rv$vec_cal <- cal_type_1[["vec_cal"]]
# calculate aggregated inputs
rv$aggregated_experimental <- aggregated_input(
datatable = rv$fileimport_experimental,
description = "experimental",
vec_cal = rv$vec_cal,
type = 1
)
# write aggregated data to file
write_csv(table = rv$aggregated_experimental,
filename = paste0(csvdir, "aggregated_experimental_data.csv"))
rv$aggregated_calibration <- aggregated_input(
datatable = rv$fileimport_calibration,
description = "calibration",
vec_cal = rv$vec_cal
)
# write aggregated data to file
write_csv(table = rv$aggregated_calibration,
filename = paste0(csvdir, "aggregated_calibration_data.csv"))
} else if (type == 2) {
return(paste0("The correction of PCR measurement ",
"biases of this type of data is not implemented yet."))
} else {
return("ERROR. Please specify a valid type of data to correct (1 or 2).")
}
if (type == 1) {
# calculate calibration curves
# reconstruct parts from app_plottingUtility.R
regression_results <- regression_utility(
data = rv$fileimport_calibration,
samplelocusname = rv$sample_locus_name,
locus_id = NULL,
rv = rv,
mode = NULL,
logfilename = logfilename,
minmax = rv$minmax,
seed = seed
)
plotlist_reg <- regression_results[["plot_list"]]
rv$result_list <- regression_results[["result_list"]]
# create calibration plots
plotting_utility(
data = rv$fileimport_calibration,
plotlist_reg = plotlist_reg,
type = 1,
samplelocusname = rv$sample_locus_name,
locus_id = NULL,
rv = rv,
mode = NULL,
plotdir = plotdir,
logfilename = logfilename,
minmax = rv$minmax,
plot_height = plot_height,
plot_width = plot_width,
plot_textsize = plot_textsize
)
# save regression statistics to reactive value
rv$reg_stats <- statistics_list(rv$result_list, minmax = rv$minmax)
# write regression statistics to file
write_csv(table = rv$reg_stats,
filename = paste0(csvdir,
rv$sample_locus_name,
"_regression_stats_",
gsub("\\-", "", substr(Sys.time(), 1, 10)),
"_",
gsub("\\:", "", substr(Sys.time(), 12, 16)),
".csv")
)
# correct here calibration-data with hyperbolic and cubic regression
# to get relative error of corrected data
# hyperbolic correction
rv$choices_list <- rv$reg_stats[, c("Name"), with = FALSE
][
, ("better_model") := 0
]
# correct calibration data (to show corrected calibration curves)
solved_eq_h <- solving_equations(datatable = rv$fileimport_calibration,
regmethod = rv$choices_list,
type = 1,
rv = rv,
mode = "corrected",
logfilename = logfilename,
minmax = rv$minmax)
rv$fileimport_cal_corrected_h <- solved_eq_h[["results"]]
colnames(rv$fileimport_cal_corrected_h) <- colnames(
rv$fileimport_calibration
)
# write corrected calibration data to file
write_csv(table = rv$fileimport_cal_corrected_h,
filename = paste0(csvdir,
rv$sample_locus_name,
"_corrected_calibrations_h_",
get_timestamp(),
".csv"))
# substitutions
rv$substitutions_corrected_h <- solved_eq_h[["substitutions"]]
# write substitutions to csv (if existing)
if (nrow(rv$substitutions_corrected_h) > 0) {
write_csv(table = rv$substitutions_corrected_h,
filename = paste0(csvdir,
rv$sample_locus_name,
"_substituted_corrected_h_",
get_timestamp(),
".csv"))
}
# calculate new calibration curves from corrected calibration data
regression_results <- regression_utility(
data = rv$fileimport_cal_corrected_h,
samplelocusname = rv$sample_locus_name,
rv = rv,
mode = "corrected",
logfilename = logfilename,
minmax = rv$minmax,
seed = seed
)
plotlist_reg <- regression_results[["plot_list"]]
rv$result_list_hyperbolic <- regression_results[["result_list"]]
plotting_utility(
data = rv$fileimport_cal_corrected_h,
plotlist_reg = plotlist_reg,
type = 1,
samplelocusname = rv$sample_locus_name,
locus_id = NULL,
rv = rv,
mode = "corrected_h",
plotdir = plotdir,
logfilename = logfilename,
minmax = rv$minmax,
plot_height = plot_height,
plot_width = plot_width,
plot_textsize = plot_textsize
)
# save regression statistics to reactive value
rv$reg_stats_corrected_h <- statistics_list(
resultlist = rv$result_list_hyperbolic,
minmax = rv$minmax
)
# write regression statistics to file
write_csv(table = rv$reg_stats_corrected_h,
filename = paste0(csvdir,
rv$sample_locus_name,
"_corrected_regression_stats_h_",
gsub("\\-", "", substr(Sys.time(), 1, 10)),
"_",
gsub("\\:", "", substr(Sys.time(), 12, 16)),
".csv")
)
createbarerrorplots(
statstable_pre = rv$reg_stats,
statstable_post = rv$reg_stats_corrected_h,
rv = rv,
type = 1,
locus_id = NULL,
plotdir = plotdir,
logfilename = logfilename,
mode = "corrected_h",
plot_height = plot_height,
plot_width = plot_width,
plot_textsize = plot_textsize
)
# cubic correction
rv$choices_list <- rv$reg_stats[, c("Name"), with = FALSE
][
, ("better_model") := 1
]
# correct calibration data (to show corrected calibration curves)
solved_eq_c <- solving_equations(datatable = rv$fileimport_calibration,
regmethod = rv$choices_list,
type = 1,
rv = rv,
mode = "corrected",
logfilename = logfilename,
minmax = rv$minmax)
rv$fileimport_cal_corrected_c <- solved_eq_c[["results"]]
colnames(rv$fileimport_cal_corrected_c) <- colnames(
rv$fileimport_calibration
)
# write corrected calibration data to file
write_csv(table = rv$fileimport_cal_corrected_c,
filename = paste0(csvdir,
rv$sample_locus_name,
"_corrected_calibrations_c_",
get_timestamp(),
".csv"))
# substitutions
rv$substitutions_corrected_c <- solved_eq_c[["substitutions"]]
# write substitutions to csv (if existing)
if (nrow(rv$substitutions_corrected_c) > 0) {
write_csv(table = rv$substitutions_corrected_c,
filename = paste0(csvdir,
rv$sample_locus_name,
"_substituted_corrected_c_",
get_timestamp(),
".csv")
)
}
# calculate new calibration curves from corrected calibration data
regression_results <- regression_utility(
data = rv$fileimport_cal_corrected_c,
samplelocusname = rv$sample_locus_name,
rv = rv,
mode = "corrected",
logfilename = logfilename,
minmax = rv$minmax,
seed = seed
)
plotlist_reg <- regression_results[["plot_list"]]
rv$result_list_cubic <- regression_results[["result_list"]]
plotting_utility(
data = rv$fileimport_experimental_corrected_c,
plotlist_reg = plotlist_reg,
type = 1,
samplelocusname = rv$sample_locus_name,
locus_id = NULL,
rv = rv,
mode = "corrected_c",
plotdir = plotdir,
logfilename = logfilename,
minmax = rv$minmax,
plot_height = plot_height,
plot_width = plot_width,
plot_textsize = plot_textsize
)
# save regression statistics to reactive value
rv$reg_stats_corrected_c <- statistics_list(
resultlist = rv$result_list_cubic,
minmax = rv$minmax
)
# write regression statistics to file
write_csv(table = rv$reg_stats_corrected_c,
filename = paste0(csvdir,
rv$sample_locus_name,
"_corrected_regression_stats_c_",
gsub("\\-", "", substr(Sys.time(), 1, 10)),
"_",
gsub("\\:", "", substr(Sys.time(), 12, 16)),
".csv")
)
createbarerrorplots(
statstable_pre = rv$reg_stats,
statstable_post = rv$reg_stats_corrected_c,
rv = rv,
locus_id = NULL,
type = 1,
plotdir = plotdir,
logfilename = logfilename,
mode = "corrected_c",
plot_height = plot_height,
plot_width = plot_width,
plot_textsize = plot_textsize
)
# now correct the real experimental data with the method chosen:
# BiasCorrect experimental data with derived calibration curves
if (correct_method %in% c("best", "b")) {
# default selection of the model with the lower sse:
rv$choices_list <- better_model(
statstable_pre = rv$reg_stats,
statstable_post_hyperbolic = rv$reg_stats_corrected_h,
statstable_post_cubic = rv$reg_stats_corrected_c,
selection_method = rv$selection_method
)
} else if (correct_method %in% c("hyperbolic", "h")) {
rv$choices_list <- rv$reg_stats[, c("Name"), with = FALSE
][
, ("better_model") := 0
]
} else if (correct_method %in% c("cubic", "c")) {
rv$choices_list <- rv$reg_stats[, c("Name"), with = FALSE
][
, ("better_model") := 1
]
}
solved_eq <- solving_equations(
datatable = rv$fileimport_experimental,
regmethod = rv$choices_list,
mode = NULL,
type = 1,
rv = rv,
logfilename = logfilename,
minmax = rv$minmax
)
rv$final_results <- solved_eq[["results"]]
# write final results to csv
write_csv(table = rv$final_results,
filename = paste0(csvdir,
rv$sample_locus_name,
"_corrected_values_",
get_timestamp(),
".csv"))
rv$substitutions <- solved_eq[["substitutions"]]
# write substitutions to csv (if existing)
if (nrow(rv$substitutions) > 0) {
write_csv(table = rv$substitutions,
filename = paste0(csvdir,
rv$sample_locus_name,
"_substituted_values_",
get_timestamp(),
".csv"))
}
}
return(rv)
}
# document datasets
#' @title example.data_experimental
#'
#' @description A list containing the experimental data ($dat) and the
#' colnames ($vec_cal) needed by the algorithms to perform the bias
#' correction.
#'
#' @name example.data_experimental
#' @docType data
#' @keywords data
NULL
#' @title example.data_calibration
#'
#' @description A list containing the calibration data ($dat) and the
#' colnames ($vec_cal) needed by the algorithms to perform the bias
#' correction.
#'
#' @name example.data_calibration
#' @docType data
#' @keywords data
NULL
#' @title example._plot.df_agg
#'
#' @description A data.table containing the aggregated
#' calibration data for CpG site 1
#' to create an example plot.
#'
#' @name example._plot.df_agg
#' @docType data
#' @keywords data
NULL
#' @title example._plot_coef_c
#'
#' @description A list containing exemplary coefficients
#' of the cubic regression equation for CpG site 1
#' to create an example plot.
#'
#' @name example._plot_coef_c
#' @docType data
#' @keywords data
NULL
#' @title example._plot_coef_h
#'
#' @description A list containing exemplary coefficients
#' of the hyperbolic regression equation for CpG site 1
#' to create an example plot.
#'
#' @name example._plot_coef_h
#' @docType data
#' @keywords data
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.