Nothing
# NOTE: Column names used for calculations with data.table is declared
# in globals.R to avoid NOTES in R CMD CHECK.
################################################################################
# CHANGE LOG (last 20 changes)
# 09.07.2022: Fixed "...URLs which should use \doi (with the DOI name only)".
# 24.08.2018: Removed unused variables.
# 06.08.2017: Added audit trail.
# 23.05.2016: Changed name on some result columns.
# 22.05.2016: Added masked data to result for manual investigation.
# 20.05.2016: 'Blocked' changed to 'masked' throughout.
# 20.05.2016: Implemented method 7 (assumes log-normal distribution).
# 28.02.2016: Added check for ILS dye.
# 09.01.2016: Added more attributes to result.
# 21.10.2015: Added attributes.
# 06.10.2015: Added importFrom for data.table
# 28.08.2015: Added importFrom
# 26.06.2015: Added global AT per dye.
# 26.06.2015: Fixed hard-coded kit/dye set.
# 03.05.2015: First version.
#' @title Calculate Analytical Threshold
#'
#' @description
#' Calculate analytical thresholds estimates.
#'
#' @details
#' Calculate the analytical threshold (AT) according to method 1, 2, and 4 as
#' recommended in the reference by analyzing the background signal (noise).
#' In addition method 7, a log-normal version of method 1 has been implemented.
#' Method 1: The average signal + 'k' * the standard deviation.
#' Method 2: The percentile rank method. The percentage of noise peaks below 'rank.t'.
#' Method 4: Utilize the mean and standard deviation and the critical value obtained
#' from the t-distribution for confidence interval 'alpha' (one-sided) and observed
#' peaks analyzed (i.e. not masked) minus one as degrees of freedom, and the number
#' of samples.
#' Method 7: The average natural logarithm of the signal + k * the standard deviation.
#'
#' If samples containing DNA are used, a range around the allelic peaks can be masked
#' from the analysis to discard peaks higher than the noise. Masking can be within
#' each dye or across all dye channels.
#' Similarly a range around the peaks of the internal lane standard (ILS) can be
#' masked across all dye channels. Which can bleed-through in week samples
#' (i.e. negative controls)
#' The mean, standard deviation, and number of peaks are calculated per dye per sample,
#' per sample, globally across all samples, and globally across all samples per dye,
#' for each method to estimate AT. Also the complete percentile rank list is calculated.
#'
#' @param data a data frame containing at least 'Dye.Sample.Peak',
#' 'Sample.File.Name', 'Marker', 'Allele', 'Height', and 'Data.Point'.
#' @param ref a data frame containing at least
#' 'Sample.Name', 'Marker', 'Allele'.
#' @param mask.height logical to indicate if high peaks should be masked.
#' @param height integer for global lower peak height threshold for peaks
#' to be excluded from the analysis. Active if 'mask.peak=TRUE.
#' @param mask.sample logical to indicate if sample allelic peaks should be masked.
#' @param per.dye logical TRUE if sample peaks should be masked per dye channel.
#' FALSE if sample peaks should be masked globally across dye channels.
#' @param range.sample integer to specify the masking range in (+/-) data points.
#' Active if mask.sample=TRUE.
#' @param mask.ils logical to indicate if internal lane standard peaks should be masked.
#' @param range.ils integer to specify the masking range in (+/-) data points.
#' Active if mask.ils=TRUE.
#' @param k numeric factor for the desired confidence level (method AT1).
#' @param alpha numeric one-sided confidence interval to obtain the
#' critical value from the t-distribution (method AT4).
#' @param rank.t numeric percentile rank threshold (method AT2).
#' @param ignore.case logical to indicate if sample matching should ignore case.
#' @param word logical to indicate if word boundaries should be added before sample matching.
#' @param debug logical to indicate if debug information should be printed.
#'
#' @return list of three data frames. The first with result per dye per sample,
#' per sample, globally across all samples, and globally across all samples
#' per dye, for each method. The second is the complete percentile rank list.
#' The third is the masked raw data used for calculation to enable manual
#' check of the result.
#'
#' @export
#'
#' @importFrom stats sd qt
#' @importFrom utils str head tail
#' @importFrom data.table data.table setnames
#'
#' @seealso \code{\link{maskAT}}, \code{\link{checkSubset}}
#'
#' @references
#' J. Bregu et.al.,
#' Analytical thresholds and sensitivity: establishing RFU thresholds for
#' forensic DNA analysis, J. Forensic Sci. 58 (1) (2013) 120-129,
#' ISSN 1556-4029, DOI: 10.1111/1556-4029.12008.
#' \doi{10.1111/1556-4029.12008}
#'
#'
calculateAT <- function(data, ref = NULL, mask.height = TRUE, height = 500,
mask.sample = TRUE, per.dye = TRUE, range.sample = 20,
mask.ils = TRUE, range.ils = 10,
k = 3, rank.t = 0.99, alpha = 0.01,
ignore.case = TRUE, word = FALSE, debug = FALSE) {
if (debug) {
print(paste("IN:", match.call()[[1]]))
print("Parameters:")
print("data")
print(str(data))
print("ref")
print(str(ref))
print("mask.height")
print(mask.height)
print("height")
print(height)
print("mask.sample")
print(mask.sample)
print("per.dye")
print(per.dye)
print("range.sample")
print(range.sample)
print("mask.ils")
print(mask.ils)
print("range.ils")
print(range.ils)
print("k")
print(k)
print("rank.t")
print(rank.t)
print("alpha")
print(alpha)
print("ignore.case")
print(ignore.case)
print("word")
print(word)
}
# Check data ----------------------------------------------------------------
if (is.null(data$Dye.Sample.Peak)) {
stop("'data' must contain a column 'Dye.Sample.Peak'")
}
if (is.null(data$Sample.File.Name)) {
stop("'data' must contain a column 'Sample.File.Name'")
}
if (is.null(data$Marker)) {
stop("'data' must contain a column 'Marker'")
}
if (is.null(data$Allele)) {
stop("'data' must contain a column 'Allele'")
}
if (is.null(data$Height)) {
stop("'data' must contain a column 'Height'")
}
if (is.null(data$Data.Point)) {
stop("'data' must contain a column 'Data.Point'")
}
# Check if slim format.
if (sum(grepl("Allele", names(data))) > 1) {
stop("'data' must be in 'slim' format",
call. = TRUE
)
}
if (sum(grepl("Height", names(data))) > 1) {
stop("'data' must be in 'slim' format",
call. = TRUE
)
}
if (sum(grepl("Data.Point", names(data))) > 1) {
stop("'data' must be in 'slim' format",
call. = TRUE
)
}
if (!is.null(ref)) {
if (is.null(ref$Sample.Name)) {
stop("'ref' must contain a column 'Sample.Name'")
}
if (is.null(ref$Marker)) {
stop("'ref' must contain a column 'Marker'")
}
if (is.null(ref$Allele)) {
stop("'ref' must contain a column 'Allele'")
}
# Check if slim format.
if (sum(grepl("Allele", names(ref))) > 1) {
stop("'ref' must be in 'slim' format",
call. = TRUE
)
}
}
# Check parameters.
if (!is.logical(mask.height)) {
stop("'mask.height' must be logical",
call. = TRUE
)
}
if (!is.numeric(height)) {
stop("'height' must be numeric",
call. = TRUE
)
}
if (!is.logical(mask.sample)) {
stop("'mask.sample' must be logical",
call. = TRUE
)
}
if (!is.logical(per.dye)) {
stop("'per.dye' must be logical",
call. = TRUE
)
}
if (!is.numeric(range.sample)) {
stop("'range.sample' must be numeric",
call. = TRUE
)
}
if (!is.logical(mask.ils)) {
stop("'mask.ils' must be logical",
call. = TRUE
)
}
if (!is.numeric(range.ils)) {
stop("'range.ils' must be numeric",
call. = TRUE
)
}
if (!is.numeric(k)) {
stop("'k' must be numeric",
call. = TRUE
)
}
if (!is.numeric(rank.t)) {
stop("'rank.t' must be numeric",
call. = TRUE
)
}
if (!is.numeric(alpha)) {
stop("'alpha' must be numeric",
call. = TRUE
)
}
if (!is.logical(ignore.case)) {
stop("'ignore.case' must be logical",
call. = TRUE
)
}
if (!is.logical(word)) {
stop("'word' must be logical",
call. = TRUE
)
}
if (!is.logical(debug)) {
stop("'debug' must be logical",
call. = TRUE
)
}
# Prepare -------------------------------------------------------------------
if (!all(c("Masked", "Dye") %in% names(data))) {
# Mask data for AT calculation
# (need to be separate function to enable control plots in GUI).
data <- maskAT(
data = data, ref = ref, mask.height = mask.height, height = height,
mask.sample = mask.sample, per.dye = per.dye, range.sample = range.sample,
mask.ils = mask.ils, range.ils = range.ils,
ignore.case = ignore.case, word = word, debug = debug
)
}
# Get all dyes.
dyes <- as.character(unique(data$Dye))
dyeILS <- unique(data$Dye[data$ILS])
dyesKit <- setdiff(dyes, dyeILS)
# Get number of samples.
nSamples <- length(unique(data$Sample.File.Name))
# Internal functions --------------------------------------------------------
# Function to calculate the percentile rank .
percentileRank <- function(x) trunc(rank(x)) / length(x)
# Function to get height above a percentile.
rankThreshold <- function(x, t) min(x[percentileRank(x) > t])
# Convert -------------------------------------------------------------------
# Check ILS dye.
if (length(dyeILS) == 0) {
message("No ILS dye found!")
message("Identified dyes: ", paste(dyes, collapse = ", "))
# Strip masked data.
dt <- data[data$Masked == FALSE, ]
} else {
# Strip masked data, and ILS channel.
dt <- data[data$Masked == FALSE & data$Dye != dyeILS, ]
}
# Convert to data.table for performance.
dt <- data.table::data.table(dt)
# Analyse1 ------------------------------------------------------------------
# Calculate for sample per dye.
at.sample.dye <- dt[, list(
Dye.Mean = mean(Height, na.rm = TRUE),
Dye.Sd = sd(Height, na.rm = TRUE),
Dye.Mean.ln = mean(log(Height), na.rm = TRUE),
Dye.Sd.ln = sd(log(Height), na.rm = TRUE),
Dye.Peaks = sum(Masked == FALSE),
Dye.AT2 = rankThreshold(Height, rank.t)
),
by = list(Sample.File.Name, Dye)
]
# Extract AT2 and remove from dataset to get final row order correct.
at.sample.dye.AT2 <- at.sample.dye$Dye.AT2
at.sample.dye$Dye.AT2 <- NULL
# Calculate globally for each dye.
at.dye <- dt[, list(
Mean = mean(Height, na.rm = TRUE),
Sd = sd(Height, na.rm = TRUE),
Mean.ln = mean(log(Height), na.rm = TRUE),
Sd.ln = sd(log(Height), na.rm = TRUE),
Peaks = sum(Masked == FALSE),
AT2 = rankThreshold(Height, rank.t)
),
by = list(Dye)
]
# Calculate for sample.
at.sample <- dt[, list(
Mean = mean(Height, na.rm = TRUE),
Sd = sd(Height, na.rm = TRUE),
Mean.ln = mean(log(Height), na.rm = TRUE),
Sd.ln = sd(log(Height), na.rm = TRUE),
Peaks = sum(Masked == FALSE),
AT2 = rankThreshold(Height, rank.t)
),
by = list(Sample.File.Name)
]
# Extract AT2 and remove from dataset to get final row order correct.
at.sample.AT2 <- at.sample$AT2
at.sample$AT2 <- NULL
# Join the result.
at.sample.dye$Sample.Mean <- rep(at.sample$Mean, each = length(dyesKit))
at.sample.dye$Sample.Sd <- rep(at.sample$Sd, each = length(dyesKit))
at.sample.dye$Sample.Mean.ln <- rep(at.sample$Mean.ln, each = length(dyesKit))
at.sample.dye$Sample.Sd.ln <- rep(at.sample$Sd.ln, each = length(dyesKit))
at.sample.dye$Sample.Peaks <- rep(at.sample$Peaks, each = length(dyesKit))
# Calculate globally for all data.
at.global <- dt[, list(
Mean = mean(Height, na.rm = TRUE),
Sd = sd(Height, na.rm = TRUE),
Peaks = sum(Masked == FALSE),
Mean.ln = mean(log(Height), na.rm = TRUE),
Sd.ln = sd(log(Height), na.rm = TRUE),
AT2 = rankThreshold(Height, rank.t)
)]
# Join the result.
at.sample.dye$Global.Mean <- rep(at.global$Mean, nrow(at.sample.dye))
at.sample.dye$Global.Sd <- rep(at.global$Sd, nrow(at.sample.dye))
at.sample.dye$Global.Mean.ln <- rep(at.global$Mean.ln, nrow(at.sample.dye))
at.sample.dye$Global.Sd.ln <- rep(at.global$Sd.ln, nrow(at.sample.dye))
at.sample.dye$Global.Peaks <- rep(at.global$Peaks, nrow(at.sample.dye))
# Calculate AT1.
at.sample.dye$Dye.AT1 <- at.sample.dye$Dye.Mean + k * at.sample.dye$Dye.Sd
at.sample.dye$Sample.AT1 <- at.sample.dye$Sample.Mean + k * at.sample.dye$Sample.Sd
at.sample.dye$Global.AT1 <- at.sample.dye$Global.Mean + k * at.sample.dye$Global.Sd
# Calculate AT1 per dye.
at.dye$AT1 <- at.dye$Mean + k * at.dye$Sd
for (d in seq(along = dyesKit)) {
colName <- paste("Global", dyesKit[d], "AT1", sep = ".")
colVal <- rep(at.dye$AT1[d], nrow(at.sample.dye))
colCnt <- length(colVal)
dtNew <- data.table::data.table(col = colVal)
data.table::setnames(dtNew, colName)
at.sample.dye <- data.table::data.table(at.sample.dye, dtNew)
}
# Add AT2 results.
at.sample.dye$Dye.AT2 <- at.sample.dye.AT2
at.sample.dye$Sample.AT2 <- rep(at.sample.AT2, each = length(dyesKit))
at.sample.dye$Global.AT2 <- rep(at.global$AT2, nrow(at.sample.dye))
# Add AT2 per dye.
for (d in seq(along = dyesKit)) {
colName <- paste("Global", dyesKit[d], "AT2", sep = ".")
colVal <- rep(at.dye$AT2[d], nrow(at.sample.dye))
colCnt <- length(colVal)
dtNew <- data.table::data.table(col = colVal)
data.table::setnames(dtNew, colName)
at.sample.dye <- data.table::data.table(at.sample.dye, dtNew)
}
# Calculate AT4.
# Note: Actually no point using t-distribution since degrees of freedom
# (number of observations - 1) are large (>100).
at.sample.dye$Dye.AT4 <- at.sample.dye$Dye.Mean + abs(qt(alpha, at.sample.dye$Dye.Peaks - 1)) * (1 + 1 / 1)^0.5 * at.sample.dye$Dye.Sd
at.sample.dye$Sample.AT4 <- at.sample.dye$Sample.Mean + abs(qt(alpha, at.sample.dye$Sample.Peaks - 1)) * (1 + 1 / 1)^0.5 * at.sample.dye$Sample.Sd
at.sample.dye$Global.AT4 <- at.sample.dye$Global.Mean + abs(qt(alpha, at.sample.dye$Global.Peaks - 1)) * (1 + 1 / nSamples)^0.5 * at.sample.dye$Global.Sd
# Calculate AT4 per dye.
at.dye$AT4 <- at.dye$Mean + abs(qt(alpha, at.dye$Peaks - 1)) * (1 + 1 / 1)^0.5 * at.dye$Sd
for (d in seq(along = dyesKit)) {
colName <- paste("Global", dyesKit[d], "AT4", sep = ".")
colVal <- rep(at.dye$AT4[d], nrow(at.sample.dye))
colCnt <- length(colVal)
dtNew <- data.table::data.table(col = colVal)
data.table::setnames(dtNew, colName)
at.sample.dye <- data.table::data.table(at.sample.dye, dtNew)
}
# Calculate AT7.
at.sample.dye$Dye.AT7 <- exp(at.sample.dye$Dye.Mean.ln + k * at.sample.dye$Dye.Sd.ln)
at.sample.dye$Sample.AT7 <- exp(at.sample.dye$Sample.Mean.ln + k * at.sample.dye$Sample.Sd.ln)
at.sample.dye$Global.AT7 <- exp(at.sample.dye$Global.Mean.ln + k * at.sample.dye$Global.Sd.ln)
# Calculate AT7 per dye.
at.dye$AT7 <- exp(at.dye$Mean.ln + k * at.dye$Sd.ln)
for (d in seq(along = dyesKit)) {
colName <- paste("Global", dyesKit[d], "AT7", sep = ".")
colVal <- rep(at.dye$AT7[d], nrow(at.sample.dye))
colCnt <- length(colVal)
dtNew <- data.table::data.table(col = colVal)
data.table::setnames(dtNew, colName)
at.sample.dye <- data.table::data.table(at.sample.dye, dtNew)
}
# Finally -------------------------------------------------------------------
# Add number of samples.
at.sample.dye$Total.Samples <- nSamples
# Update audit trail.
at.sample.dye <- auditTrail(
obj = at.sample.dye, f.call = match.call(),
package = "strvalidator"
)
# Convert back to data.frame.
res1 <- data.frame(at.sample.dye)
# Analyse2 ------------------------------------------------------------------
# Calculate complete percentile rank list.
at.rank <- data.frame(
Height = unique(sort(dt$Height)),
Rank = unique(percentileRank(sort(dt$Height))),
Observations = as.numeric(table(dt$Height))
)
# Update audit trail.
at.rank <- auditTrail(
obj = at.rank, f.call = match.call(),
package = "strvalidator"
)
# Convert back to data frame.
res2 <- data.frame(at.rank)
# Masked data ---------------------------------------------------------------
# Update audit trail.
data <- auditTrail(
obj = data, f.call = match.call(),
package = "strvalidator"
)
# Convert back to data.frame.
res3 <- data.frame(data)
if (debug) {
print("str(res1)")
print(str(res1))
print("head(res1)")
print(head(res1))
print("tail(res1)")
print(tail(res1))
print("str(res2)")
print(str(res2))
print("head(res2)")
print(head(res2))
print("tail(res2)")
print(tail(res2))
print("str(res3)")
print(str(res3))
print("head(res3)")
print(head(res3))
print("tail(res3)")
print(tail(res3))
}
if (debug) {
print(paste("EXIT:", match.call()[[1]]))
}
# Return list of the two dataframes.
res <- list(res1, res2, res3)
# Return result.
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.