Nothing
#' Detect outliers for single observations
#'
#' Detect outlying observations in a time series by modeling each plotId using
#' a local regression.
#'
#' See locfit() help function from the locfit R library. The user can act on:
#' \describe{
#' \item{nnLocfit}{the constant of the smoothing parameter. Increase nnLocfit
#' to have a very smooth curve}
#' \item{confIntSize}{the level to calculate the confidence interval. Increase
#' confIntSize to exclude less outliers}
#' }
#'
#' @param TP An object of class \code{TP}.
#' @param trait A character vector indicating the trait to model in \code{TP}.
#' @param plotIds A character vector of plotIds for which the outliers should be
#' detected. If \code{NULL}, all plotIds in \code{TP} are used.
#' @param checkEdges Before fitting the local regression should a check be done
#' if the first and last time point for a plot are outlying observations?
#' @param confIntSize A numeric value defining the confidence interval (see
#' Details).
#' @param nnLocfit A numeric value defining the constant component of the
#' smoothing parameter nn (see Details).
#'
#' @return An object of class singleOut, a \code{data.frame} with the following
#' columns.
#' \describe{
#' \item{plotId}{plotId}
#' \item{timePoint}{time point}
#' \item{trait}{modeled trait}
#' \item{yPred}{prediction from the local regression}
#' \item{sd_yPred}{standard deviation of the prediction}
#' \item{lwr}{lower bound of the confidence interval}
#' \item{upr}{upper bound of the confidence interval}
#' \item{outlier}{flag for detected outlier (a value of 1 indicates the
#' observation is an outlier)}
#' }
#'
#' @examples
#' ## Create a TP object containing the data from the Phenovator.
#' PhenovatorDat1 <- PhenovatorDat1[!PhenovatorDat1$pos %in%
#' c("c24r41", "c7r18", "c7r49"), ]
#' phenoTP <- createTimePoints(dat = PhenovatorDat1,
#' experimentName = "Phenovator",
#' genotype = "Genotype",
#' timePoint = "timepoints",
#' repId = "Replicate",
#' plotId = "pos",
#' rowNum = "y", colNum = "x",
#' addCheck = TRUE,
#' checkGenotypes = c("check1", "check2",
#' "check3", "check4"))
#'
#' ## First select a subset of plants, for example here 9 plants
#' plantSel <- phenoTP[[1]]$plotId[1:9]
#' # Then run on the subset
#' resuVatorHTP <- detectSingleOut(TP = phenoTP,
#' trait = "EffpsII",
#' plotIds = plantSel,
#' confIntSize = 3,
#' nnLocfit = 0.1)
#'
#' @family functions for detecting outliers for single observations
#'
#' @export
detectSingleOut <- function(TP,
trait,
plotIds = NULL,
checkEdges = TRUE,
confIntSize = 5,
nnLocfit = 0.5) {
## Checks.
if (!inherits(TP, "TP")) {
stop("TP should be an object of class TP.\n")
}
TPTot <- do.call(rbind, TP)
if (!is.character(trait) || length(trait) > 1) {
stop("trait should be a character string of length 1.\n")
}
if (!hasName(x = TPTot, name = trait)) {
stop("TP should contain a column ", trait, ".\n")
}
if (!is.null(plotIds)) {
if (!all(plotIds %in% TPTot[["plotId"]])) {
stop("All plotIds should be in TP.\n")
}
## Restrict TPTot to selected plotIds
TPTot <- TPTot[TPTot[["plotId"]] %in% plotIds, ]
}
## Remove missing values for trait.
TPTotRest <- na.omit(TPTot[c("plotId", "timePoint", trait)])
## Split data into a list of data.frames per plot.
TPPlot <- split(x = TPTotRest, f = TPTotRest[["plotId"]], drop = TRUE)
## Compute outliers per plot.
plotPreds <- lapply(X = TPPlot, FUN = function(plotDat) {
## Only makes sense for at least 6 time points.
if (nrow(plotDat) <= 6) {
warning("Not enough data points (at least 7) to fit a model for: ",
plotDat[1, "plotId"], ".\n", call. = FALSE)
return(NULL)
}
## Fit a model using locfit.
y <- plotDat[[trait]]
x <- plotDat[["timePoint"]]
## Fit model with all time points included.
fitMod <- locfit::locfit(y ~ locfit::lp(x, nn = nnLocfit, deg = 2))
yPred <- predict(fitMod, newdata = x, se.fit = TRUE)
if (checkEdges) {
## Check if first timepoint is an outlier.
## Fit model excluding first timepoint.
fitMod0 <- locfit::locfit(y[-1] ~ locfit::lp(x[-1], nn = nnLocfit, deg = 2))
## Get predictions for new models.
yPred0 <- predict(fitMod0, newdata = x, se.fit = TRUE)
## Compute mean and standard deviation for first 5 time points.
m0 <- mean(y[1:5])
s0 <- sqrt(sum((y[1:5] - m0) ^ 2) / 5)
## Remove first time point if it is outside mean +/- 1.5 sd.
if (y[1] < m0 - 1.5 * s0 || y[1] > m0 + 1.5 * s0) {
y <- y[-1]
x <- x[-1]
yPred <- yPred0
}
## Check if last timepoint is an outlier.
## Get position of last timepoint.
posL <- length(x)
## Fit model excluding last timepoint.
fitMod1 <- locfit::locfit(y[-posL] ~ locfit::lp(x[-posL],
nn = nnLocfit, deg = 2))
## Get predictions for the model.
yPred1 <- predict(fitMod1, newdata = plotDat[["timePoint"]], se.fit = TRUE)
## Compute mean and standard deviation for last 5 time points.
m1 <- mean(y[(posL - 4):posL])
s1 <- sqrt(sum((y[(posL - 4):posL] - m1) ^ 2) / 5)
## Remove last time point if it is outside mean +/- 1.5 sd.
if (y[posL] < m1 - 1.5 * s1 || y[posL] > m1 + 1.5 * s1) {
yPred <- yPred1
}
}
## Compute upper and lower boundaries.
lwr <- yPred$fit - confIntSize * yPred$se.fit
upr <- yPred$fit + confIntSize * yPred$se.fit
## Add results to plotDat.
plotDat[["yPred"]] <- yPred$fit
plotDat[["sd_yPred"]] <- yPred$se.fit
plotDat[["lwr"]] <- lwr
plotDat[["upr"]] <- upr
## All values outside the [lwr, upr] range are marked as outlier.
plotDat[["outlier"]] <- ifelse(plotDat[[trait]] < plotDat[["upr"]] &
plotDat[[trait]] > plotDat[["lwr"]], 0, 1)
return(plotDat)
})
## Bind everything together in a single data.frame.
plotPred <- do.call(rbind, plotPreds)
if (is.null(plotPred)) {
stop("Not enough data points (<= 6) for any of the plots.\n")
}
## Rownames are confusing and redundant.
rownames(plotPred) <- NULL
## Add class and trait as attribute.
class(plotPred) <- c("singleOut", class(plotPred))
attr(plotPred, which = "trait") <- trait
return(plotPred)
}
#' Plot outliers for single observations
#'
#' Plot the fitted local regression, confidence intervals and detected outliers
#' for each plotId.
#'
#' @inheritParams detectSingleOut
#' @inheritParams plot.TP
#'
#' @param x An object of class singleOut.
#' @param ... Ignored.
#' @param outOnly Should only plots containing outliers be plotted?
#'
#' @return A list of ggplot objects is invisibly returned.
#'
#' @examples
#' ## Create a TP object containing the data from the Phenovator.
#' PhenovatorDat1 <- PhenovatorDat1[!PhenovatorDat1$pos %in%
#' c("c24r41", "c7r18", "c7r49"), ]
#' phenoTP <- createTimePoints(dat = PhenovatorDat1,
#' experimentName = "Phenovator",
#' genotype = "Genotype",
#' timePoint = "timepoints",
#' repId = "Replicate",
#' plotId = "pos",
#' rowNum = "y", colNum = "x",
#' addCheck = TRUE,
#' checkGenotypes = c("check1", "check2",
#' "check3", "check4"))
#'
#' ## Select a subset of plants, for example here 9 plants.
#' plantSel <- phenoTP[[1]]$plotId[1:9]
#' # Then run on the subset.
#' resuVatorHTP <- detectSingleOut(TP = phenoTP,
#' trait = "EffpsII",
#' plotIds = plantSel,
#' confIntSize = 3,
#' nnLocfit = 0.1)
#'
#' ## Visualize the prediction by choosing a single plant...
#' plot(resuVatorHTP, plotIds = "c21r24", outOnly = FALSE)
#' ## ...or a subset of plants.
#' plot(resuVatorHTP, plotIds = plantSel, outOnly = FALSE)
#'
#' @family functions for detecting outliers for single observations
#'
#' @export
plot.singleOut <- function(x,
...,
plotIds = NULL,
outOnly = TRUE,
output = TRUE) {
plotDat <- x
if (!is.null(plotIds)) {
if (!all(plotIds %in% plotDat[["plotId"]])) {
stop("All plotIds should be in x\n")
}
plotDat <- plotDat[plotDat[["plotId"]] %in% plotIds, ]
}
## Select outliers.
outliers <- plotDat[plotDat[["outlier"]] == 1, ]
if (outOnly) {
plotDat <- plotDat[plotDat[["plotId"]] %in% outliers[["plotId"]], ]
if (nrow(plotDat) == 0) {
stop("No outliers present for selected plotIds.\n")
}
}
plotDat <- droplevels(plotDat)
trait <- attr(x = x, which = "trait")
p <- ggplot2::ggplot(plotDat,
ggplot2::aes(x = .data[["timePoint"]],
y = .data[[trait]])) +
ggplot2::geom_point(na.rm = TRUE) +
ggplot2::geom_line(mapping = ggplot2::aes(y = .data[["lwr"]]),
col = "green", linewidth = .8) +
ggplot2::geom_line(mapping = ggplot2::aes(y = .data[["upr"]]),
col = "green", linewidth = .8) +
ggplot2::geom_line(mapping = ggplot2::aes(y = .data[["yPred"]]),
col = "red", linewidth = .8) +
ggplot2::geom_point(data = outliers, col = "blue", size = 2) +
ggplot2::theme(legend.position = "none")
nTp <- length(unique(plotDat[["timePoint"]]))
if (nTp < 5) {
p <- p + ggplot2::scale_x_datetime(breaks = unique(plotDat[["timePoint"]]),
labels = scales::date_format("%B %d"))
} else {
## Format the time scale to Month + day.
p <- p + ggplot2::scale_x_datetime(breaks = prettier(n = 3),
labels = scales::date_format("%B %d"))
}
## Calculate the total number of plots.
nPlots <- length(unique(plotDat[["plotId"]]))
## 25 Plots per page.
nPag <- ceiling(nPlots / 25)
if (nPlots >= 25) {
## More than 25 plots.
## For identical layout on all pages use 5 x 5 plots throughout.
#rowPag <- colPag <- rep(x = 5, times = nPag)
# 28-7-2020. ggforce has a bug that prevents this identical layout
# https://github.com/thomasp85/ggforce/issues/201
# When fixed the code above can be reactivated and the three lines below
# removed.
plotsLastPag <- nPlots %% 25
rowPag <- c(rep(x = 5, times = nPag - 1), min(plotsLastPag %/% 5 + 1, 5))
colPag <- c(rep(x = 5, times = nPag - 1),
ifelse(plotsLastPag >= 5, 5, plotsLastPag))
} else {
## Less than 25 plots.
## Fill page by row of 5 plots.
plotsPag <- nPlots %% 25
rowPag <- min(ceiling(plotsPag / 5), 5)
colPag <- ifelse(plotsPag >= 5, 5, plotsPag)
}
## Build pages of plots.
pPag <- vector(mode = "list", length = nPag)
for (i in 1:nPag) {
pPag[[i]] <- p +
ggforce::facet_wrap_paginate(facets = "plotId", nrow = rowPag[i],
ncol = colPag[i], page = i)
if (output) {
suppressMessages(plot(pPag[[i]]))
}
}
invisible(pPag)
}
#' Replace outliers for single observations by NA
#'
#' Function for replacing outliers for single observations by NA.
#'
#' @param TP An object of class TP.
#' @param singleOut A data.frame with at least the columns plotId and
#' timePoint with values corresponding to those in TP. If a column outlier is
#' present, as in the output of \code{detectSingleOut}, only plotId x
#' timePoint combinations for which outlier = 1 will be set to NA. If no
#' column outlier is present, all observations in singleOut will be set to NA.
#' @param trait The trait that should be set to NA. Can be ignored when using
#' the output of \code{detectSingleOut} as input.
#'
#' @return An object of class TP, the input with the outlier replaced by NA.
#'
#' @examples
#' ## Create a TP object containing the data from the Phenovator.
#' PhenovatorDat1 <- PhenovatorDat1[!PhenovatorDat1$pos %in%
#' c("c24r41", "c7r18", "c7r49"), ]
#' phenoTP <- createTimePoints(dat = PhenovatorDat1,
#' experimentName = "Phenovator",
#' genotype = "Genotype",
#' timePoint = "timepoints",
#' repId = "Replicate",
#' plotId = "pos",
#' rowNum = "y", colNum = "x",
#' addCheck = TRUE,
#' checkGenotypes = c("check1", "check2",
#' "check3", "check4"))
#'
#' ## First select a subset of plants, for example here 9 plants.
#' plantSel <- phenoTP[[1]]$plotId[1:9]
#' # Then run on the subset
#' resuVatorHTP <- detectSingleOut(TP = phenoTP,
#' trait = "EffpsII",
#' plotIds = plantSel,
#' confIntSize = 3,
#' nnLocfit = 0.1)
#'
#' ## Replace the studied trait by NA for the plants marked as outliers.
#' phenoTPOut <- removeSingleOut(phenoTP, resuVatorHTP)
#'
#' @family functions for detecting outliers for single observations
#'
#' @export
removeSingleOut <- function(TP,
singleOut,
trait = attr(x = singleOut,
which = "trait")) {
if (!inherits(TP, "TP")) {
stop("TP should be an object of class TP.\n")
}
if (!inherits(singleOut, "data.frame")) {
stop("singleOut should be a data.frame.\n")
}
if (!all(hasName(singleOut, c("plotId", "timePoint")))) {
stop("singleOut should at least contain the columns plotId ",
"and timePoint.\n")
}
if (!all(as.character(singleOut[["timePoint"]]) %in% names(TP))) {
stop("All time points in singleOut should be in TP.\n")
}
if (!any(singleOut[["outlier"]] == 1)) {
stop("There are no outlying points in TP.\n")
}
if (hasName(x = singleOut, "outlier")) {
## Remove observations that are not actually outliers.
singleOut <- singleOut[singleOut[["outlier"]] == 1, ]
}
for (i in seq_len(nrow(singleOut))) {
plotI <- singleOut[i, "plotId"]
timeI <- as.character(singleOut[i, "timePoint"])
if (!plotI %in% TP[[timeI]][["plotId"]]) {
warning(plotI, " not present in timePoint ", timeI, ".\n", call. = FALSE)
}
TP[[timeI]][TP[[timeI]][["plotId"]] == plotI, trait] <- NA
}
return(TP)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.