################################################################################
# CHANGE LOG (last 20 changes)
# 11.09.2022: Added information about limitations when dropout is present.
# 07.07.2022: Fixed "...URLs which should use \doi (with the DOI name only)".
# 24.08.2018: Removed unused variables.
# 06.08.2017: Added audit trail.
# 18.09.2016: Fixed attribute word not saved.
# 16.09.2016: Change keep.na=FALSE to keep.na=TRUE for filterProfile.
# 14.09.2016: Fixed error when homozygotes are given with double notation in ref.
# 06.09.2016: Implemented word boundaries (filterProfile).
# 06.09.2016: Fixed implementation of updated filterProfile function.
# 29.08.2016: Implemented updated filterProfile function.
# 19.08.2016: Fixed bug resulting in wrong number of observed allele copies.
# 15.08.2016: Rewritten to use data.table for efficiency and new metrics.
# 29.06.2016: Added option to remove sex markers and quality sensor.
# 25.01.2016: Fixed save attribute saves dataset.
# 09.01.2016: Added more attributes to result.
# 06.01.2016: Added attributes to result.
# 12.10.2014: Fixed bug when NA in Allele column.
# 26.09.2014: Accept vector for 'exclude'.
# 12.09.2014: Included 'exclude' parameter.
# 10.09.2014: Included total peak height in result.
#' @title Calculate Peak Height.
#'
#' @description
#' Calculate peak height metrics for samples.
#'
#' @details
#' Calculates the total peak height (TPH), and number of observed peaks (Peaks),
#' for each sample by default. If a reference dataset is provided average peak
#' height (H), and profile proportion (Proportion) are calculated.
#'
#' H is calculated according to the formula (references [1][2]):
#' \eqn{H = sum(peak heights)/(n[het] + 2n[hom]}
#' Where:
#' n[het] = number of observed heterozygous alleles
#' n[hom] = number of observed homozygous alleles
#'
#' Important: The above formula has a drawback that when many alleles have
#' dropped out, i.e. when only few alleles are detected, H can be overestimated.
#' For example, if there are only 1 (homozygote) peak observed in the profile,
#' with a height of 100 RFU, then H=100 RFU. This means that the value of H will
#' always be between half the analytical threshold (AT/2) and the peak height of
#' the observed allele (if only one). For this reason Tvedebrink et al. actually
#' modified the estimate to take the number of expected alleles into account
#' when estimating the expected peak height (reference [3]). Basically, they adjust
#' the estimated peak height for the fact that they know how many alleles that
#' fall below the AT, such that the expected peak height could be estimated lower
#' than AT. In addition, they account for degradation using a log-linear
#' relationship on peak heights and fragment length.
#'
#' Tip: If it is known that all expected peaks are observed and no unexpected
#' peaks are present, the dataset can be used as a reference for itself.
#'
#' Note: If a reference dataset is provided the known alleles will be extracted
#' from the dataset.
#'
#' @param data data.frame with at least columns 'Sample.Name' and 'Height'.
#' @param ref data.frame with at least columns 'Sample.Name' and 'Allele'.
#' @param na.replace replaces NA values in the final result.
#' @param exclude character vector (case sensitive) e.g. "OL" excludes rows with
#' "OL" in the 'Allele' column (not necessary when a reference dataset is provided).
#' @param add logical default is TRUE which will add or overwrite columns
#' 'TPH', 'Peaks', 'H', and 'Proportion' in the provided 'data'.
#' @param sex.rm logical, default FALSE to include sex markers in the analysis.
#' @param qs.rm logical, default TRUE to exclude quality sensors from the analysis.
#' @param kit character, required if sex.rm=TRUE or qs.rm=TRUE to define the kit.
#' @param ignore.case logical TRUE ignores case in sample name matching.
#' @param exact logical TRUE for exact sample name matching.
#' @param word logical TRUE to add word boundaries to sample name matching.
#' @param debug logical indicating printing debug information.
#'
#' @return data.frame with with at least columns 'Sample.Name', 'TPH', and 'Peaks'.
#'
#' @export
#'
#' @references
#' [1] Torben Tvedebrink, Poul Svante Eriksen, Helle Smidt Mogensen, Niels Morling,
#' Evaluating the weight of evidence by using quantitative short tandem repeat data in DNA mixtures
#' Journal of the Royal Statistical Society: Series C (Applied Statistics),
#' Volume 59, Issue 5, 2010,
#' Pages 855-874, 10.1111/j.1467-9876.2010.00722.x.
#' \doi{10.1111/j.1467-9876.2010.00722.x}
#'
#' @references
#' [2] Torben Tvedebrink, Helle Smidt Mogensen, Maria Charlotte Stene, Niels Morling,
#' Performance of two 17 locus forensic identification STR kits-Applied Biosystems's AmpFlSTR NGMSElect and Promega's PowerPlex ESI17 kits
#' Forensic Science International: Genetics,
#' Volume 6, Issue 5, 2012,
#' Pages 523-531, 10.1016/j.fsigen.2011.12.006.
#' \doi{10.1016/j.fsigen.2011.12.006}
#'
#' @references
#' [3] Torben Tvedebrink, Maria Asplund, Poul Svante Eriksen, Helle Smidt Mogensen, Niels Morling,
#' Estimating drop-out probabilities of STR alleles accounting for stutters, detection threshold truncation and degradation
#' Forensic Science International: Genetics Supplement Series,
#' Volume 4, Issue 1, 2013,
#' Pages e51-e52, 10.1016/j.fsigss.2013.10.026.
#' \doi{10.1016/j.fsigss.2013.10.026}
#'
#' @importFrom utils str
#' @importFrom data.table data.table :=
calculateHeight <- function(data, ref = NULL, na.replace = NULL, add = TRUE, exclude = NULL,
sex.rm = FALSE, qs.rm = FALSE, kit = NULL, ignore.case = TRUE,
exact = FALSE, word = FALSE, debug = FALSE) {
if (debug) {
print(paste("IN:", match.call()[[1]]))
}
# CHECK DATA ----------------------------------------------------------------
# Check dataset.
if (!any(grepl("Sample.Name", names(data)))) {
stop("'data' must contain a column 'Sample.Name'.", call. = TRUE)
}
if (!any(grepl("Height", names(data)))) {
stop("'data' must contain a column 'Height'.", call. = TRUE)
}
# Check if slim format.
if (sum(grepl("Height", names(data)) > 1)) {
stop("'data' must be in 'slim' format.", call. = TRUE)
}
# Check ref.
if (!is.null(ref)) {
# Check dataset.
if (!any(grepl("Sample.Name", names(ref)))) {
stop("'ref' must contain a column 'Sample.Name'.", call. = TRUE)
}
if (!any(grepl("Allele", names(ref)))) {
stop("'ref' must contain a column 'Allele'.", call. = TRUE)
}
# Check if slim format.
if (sum(grepl("Allele", names(ref)) > 1)) {
stop("'ref' must be in 'slim' format.", call. = TRUE)
}
}
# Check na.
if (length(na.replace) > 1) {
stop("'na.replace' must be of length 1.", call. = TRUE)
}
# Check logical arguments.
if (!is.logical(add)) {
stop("'add' must be logical.", call. = TRUE)
}
if (!is.logical(sex.rm)) {
stop("'sex.rm' must logical.", call. = TRUE)
}
if (!is.logical(qs.rm)) {
stop("'qs.rm' must be logical.", call. = TRUE)
}
# Check dependencies.
if (sex.rm) {
if (is.null(kit)) {
stop("'kit' can't be NULL if sex.rm=TRUE.")
}
}
if (qs.rm) {
if (is.null(kit)) {
stop("'kit' can't be NULL if qs.rm=TRUE.")
}
}
# PREPARE -----------------------------------------------------------------
# Check if numeric data.
if (!is.numeric(data$Height)) {
# Convert to numeric.
data$Height <- as.numeric(data$Height)
message("The column 'Height' was converted to numeric.")
}
if (!is.null(exclude)) {
message("Removing excluded alleles from dataset:")
for (e in seq(along = exclude)) {
# Remove excluded alleles, accept NA values (or will result in all NA for that row).
tmp1 <- nrow(data)
data <- data[data$Allele != exclude[e] | is.na(data$Allele), ]
tmp2 <- nrow(data)
message("Removed ", tmp1 - tmp2, " rows with Allele=", exclude[e], ".")
}
}
# Check if reference dataset was provided.
if (!is.null(ref)) {
# Filter dataset.
message("Extracting known alleles from dataset...")
data <- filterProfile(
data = data, ref = ref,
add.missing.loci = FALSE, keep.na = TRUE, invert = FALSE,
ignore.case = ignore.case, exact = exact, word = word,
sex.rm = sex.rm, qs.rm = qs.rm, kit = kit, debug = debug
)
# Remove sex markers and quality sensors from reference dataset.
if (sex.rm || qs.rm) {
message("Removing gender markers and/or quality sensors from reference dataset...")
ref <- filterProfile(
data = ref, filter.allele = FALSE,
sex.rm = sex.rm, qs.rm = qs.rm, kit = kit,
debug = debug
)
}
# Check if missing alleles (Y markers in female profiles.)
if (any(is.na(ref$Allele))) {
# Remove any row with Allele=NA.
tmp1 <- nrow(ref)
ref <- ref[!is.na(ref$Allele), ]
tmp2 <- nrow(ref)
message("Removed ", tmp1 - tmp2, " rows with Allele=NA in reference dataset.")
}
if (!"Copies" %in% names(ref)) {
# Add
ref <- calculateCopies(data = ref)
message("Number of allele copies added to reference dataset.")
}
# Convert to data.table and calculate number of allele copies and expected peaks.
DTref <- data.table(ref)
# This code is required to handle homozygotes with double notation.
DTref <- DTref[, list(Copies = unique(Copies)),
by = list(Sample.Name, Marker, Allele)
]
# Calculate then number of expected peaks.
DTref[, Expected := .N, by = list(Sample.Name)]
# Add to dataset.
data <- addData(
data = data, new.data = DTref,
by.col = "Sample.Name", then.by.col = "Marker",
what = c("Copies", "Expected"), exact = exact,
debug = debug
)
message("Expected number of alleles added to dataset.")
} else {
message("Reference dataset not provided.")
# Filter quality sensors and sex markers.
data <- filterProfile(
data = data, ref = NULL,
add.missing.loci = FALSE, keep.na = TRUE, invert = FALSE,
ignore.case = ignore.case, exact = exact, word = word,
sex.rm = sex.rm, qs.rm = qs.rm, kit = kit,
filter.allele = FALSE, debug = debug
)
}
# CALCULATE -----------------------------------------------------------------
# Convert to data.table and calculate metrics.
DT <- data.table(data)
if (add & nrow(DT) > 0) {
# Calculate and add to dataset (repeat over all rows in sample).
# Calculate total peak height for each sample.
DT[, TPH := sum(Height, na.rm = TRUE), by = list(Sample.Name)]
# Calculate number of observed peaks for each sample.
DT[, Peaks := sum(!is.na(Height)), by = list(Sample.Name)]
if ("Copies" %in% names(DT)) {
# Calculate number of observed allele copies for each sample.
DT[, N.Alleles := sum(Copies[!is.na(Height)]), by = list(Sample.Name)]
} else {
message("A column 'Copies' was not found in 'data'.")
message("Number of observed allele copies cannot be calculated")
message("Provide a reference dataset to enable calculation of 'N.Alleles'.")
}
if ("N.Alleles" %in% names(DT)) {
# Calculate average peak height for each sample.
DT[, H := TPH / N.Alleles, by = list(Sample.Name)]
} else {
message("A column 'N.Alleles' was not found in 'data'.")
message("Average peak height cannot be calculated.")
message("Provide a reference dataset to enable calculation of 'H'.")
}
if ("Expected" %in% names(DT)) {
# Calculate proportion observed profile for each sample.
DT[, Proportion := Peaks / Expected, by = list(Sample.Name)]
} else {
message("A column 'Expected' was not found in 'data'.")
message("Profile proportion cannot be calculated.")
message("Provide a reference dataset to enable calculation of 'Expected'.")
}
# Assign to result.
res <- DT
} else if (!add & nrow(DT) > 0) {
# Calculate per sample in a new dataset.
if (!is.null(ref)) {
# Calculate total peak height for each sample.
DT[, TPH := sum(Height, na.rm = TRUE), by = list(Sample.Name)]
# Calculate number of observed peaks for each sample.
DT[, Peaks := sum(!is.na(Height)), by = list(Sample.Name)]
# Calculate number of observed peaks for each sample.
DT[, N.Alleles := sum(Copies[!is.na(Height)]), by = list(Sample.Name)]
# Calculate total and average peak height, number of peaks,
# and profile proportion for each sample.
res <- DT[, list(
TPH = unique(TPH),
H = unique(TPH) / unique(N.Alleles),
Peaks = unique(Peaks),
Expected = unique(Expected),
Proportion = unique(Peaks) / unique(Expected)
),
by = list(Sample.Name)
]
} else {
# Calculate total peak height and number of peaks for each sample.
res <- DT[, list(
TPH = sum(Height, na.rm = TRUE),
Peaks = sum(!is.na(Height))
),
by = list(Sample.Name)
]
message("Average peak height and profile proportion will not be calculated.")
message("Provide a reference dataset to enable calculation.")
}
} else if (!nrow(DT) > 0) {
message("Dataset is empty. Returning NULL")
res <- NULL
} else {
# This should not happen.
message("There was an unexpected error")
}
# Convert to data.frame to avoid unexpected results in other functions.
res <- as.data.frame(res)
# FINALIZE ------------------------------------------------------------------
# Replace NA:s
if (!is.null(na.replace)) {
# Check if NA:s and change to 'na.replace'.
if ("TPH" %in% names(res)) {
if (any(is.na(res$TPH))) {
n <- sum(is.na(res$TPH))
res[is.na(res$TPH), ]$TPH <- na.replace
message("Replaced ", n, " NA's in 'TPH' with '", na.replace, "'.")
}
}
if ("Peaks" %in% names(res)) {
if (any(is.na(res$Peaks))) {
n <- sum(is.na(res$Peaks))
res[is.na(res$Peaks), ]$Peaks <- na.replace
message("Replaced ", n, " NA's in 'Peaks' with '", na.replace, "'.")
}
}
if ("H" %in% names(res)) {
if (any(is.na(res$H))) {
n <- sum(is.na(res$H))
res[is.na(res$H), ]$H <- na.replace
message("Replaced ", n, " NA's in 'H' with '", na.replace, "'.")
}
}
if ("Proportion" %in% names(res)) {
if (any(is.na(res$Proportion))) {
n <- sum(is.na(res$Proportion))
res[is.na(res$Proportion), ]$Proportion <- na.replace
message("Replaced ", n, " NA's in 'Proportion' with '", na.replace, "'.")
}
}
}
# Add attributes to result.
attr(res, which = "kit") <- kit
# Update audit trail.
res <- auditTrail(obj = res, f.call = match.call(), package = "strvalidator")
# Return result.
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.