#' Derive the proportion of similarities in the shape of event traces
#'
#' @description
#' Derive the proportion of similarities in the shape of event traces between sensors within two sessions.
#'
#' @param x An "FemFit" object.
#' @param sensorToMatch A numeric vector which denotes sensors to use in the comparison from \code{sessionOrder[1]}.
#' @param sensorToMatchAgainst A numeric vector which denotes sensors to use in the comparison from \code{sessionOrder[2]}.
#' @param whichJSONLabel A character argument specifying part of the protocol to extract the event traces from.
#' @param sessionOrder A character vector of length two specifying the sessions to compare. Defaults to \code{unique(x$df$sessionID)[1:2]}.
#' @param B A numeric argument specifying the number of bootstraps to run to estimate the variability of the propotion of similarities.
#' @param toPlot A logical argument as to whether \code{repeatAnalysis()} produces ggplot2 objects visualising where the Hidden Markov Model detected dissimilarities in the shape of the pressure trace.
#'
#' @details
#' \code{repeatAnalysis()} first standardises the event traces against themselves, procuring the raw shape of the event trace.
#' Then, it uses loess regression to get a smooth approximation of the shape for each event trace.
#' A total of (n_1 + n_2)/2 observations are drawn sequentially over time from each loess regression to construct the smooth approximation, where n_x is the number of observations that make up the xth event trace.
#' A vector of differences is constructed and then squared. A Hidden Markov Model is fitted to the vector of differences assuming two states which adhere to a Normal distribution.
#' Finally, the proportion of similarity is derived by using the output of the Viterbi algorithm, and the assessment of the variability is done with a parametric bootstrap based on the Viterbi algorithm's delta probabilities.
#'
#' @examples
#' AS005 = read_FemFit(c(
#' "Datasets_AukRepeat/61aa0782289af385_283_csv.zip",
#' "Datasets_AukRepeat/61aa0782289af385_284_csv.zip"
#' ),
#' remove.NAs = TRUE
#' ) %>%
#' # Segment the FemFit data
#' segment(
#' cp. = 0.001,
#' numOfNodesToLabel = list(c(3, 1, 3, 4), c(4, 1, 5, 3))
#' ) %>%
#' # Calculate the zeroed out pressures
#' deriveZero(method = "lmm")
#'
#' # Produce the proportions of similarities for Sensor four against Sensor four with the ggplot2 objects.
#' AS005_RepeatOutput = repeatAnalysis(AS005, 4, 4, "pfmc3x5s_rest30s", toPlot = TRUE)
#'
#' # View the proportion of similarities for the three events, with the bootstrapped standard errors and CIs
#' AS005_RepeatOutput %>%
#' select(eventID, classProb, classProb_stderr, classProb_lwrbnd, classProb_uprbnd)
#'
#' # View the ggplot2 objects generated by `repeatAnalysis()`
#' AS005_RepeatOutput$plotObj[1]
#' AS005_RepeatOutput$plotObj[2]
#' AS005_RepeatOutput$plotObj[3]
#'
#' @export
repeatAnalysis = function(x, sensorToMatch = 4, sensorToMatchAgainst = 3:5, whichJSONLabel = "pfmc3x5s_rest30s", sessionOrder = NULL, B = 1000, toPlot = FALSE) {
# Throw an error if the x argument is not an FemFit object or missing
if (!inherits(x, "FemFit") || is.na(x)) {
stop("The x argument is not an FemFit object.", call. = FALSE)
}
# Throw an error if zeroPrssr does not exists in x$df
if (x$df %>% colnames %>% {any(grepl("^zeroPrssr[0-9]$", .))} %>% !.) {
stop("zeroPrssr does not exist in the FemFit object.", call. = FALSE)
}
# Throw an error if sensorToMatch is not all numeric
if (!all(is.numeric(sensorToMatch))) {
stop("Some of the specified sensors in sensorToMatch are not numerics.", call. = FALSE)
}
# Throw an error if sensorToMatch is not between 1 and 8 (inclusive)
if (!all(dplyr::between(sensorToMatch, 1, 8))) {
stop("Some of the specified sensors in sensorToMatch are not numbered between 1 and 8 (inclusive).", call. = FALSE)
}
# Throw an error if sensorToMatchAgainst is not all numeric
if (!all(is.numeric(sensorToMatchAgainst))) {
stop("Some of the specified sensors in sensorToMatchAgainst are not numerics.", call. = FALSE)
}
# Throw an error if sensorToMatchAgainst is not between 1 and 8 (inclusive)
if (!all(dplyr::between(sensorToMatchAgainst, 1, 8))) {
stop("Some of the specified sensors in sensorToMatchAgainst are not numbered between 1 and 8 (inclusive).", call. = FALSE)
}
# Throw an error if whichJSONLabel is not a character
if (!is.character(whichJSONLabel[1])) {
stop("The whichJSONLabel argument is not a character.", call. = FALSE)
}
whichJSONLabel = whichJSONLabel[1]
# Throw an error if whichJSONLabel does not exist in the FemFit object
if (!all(whichJSONLabel %in% unique(x$df$JSONLabel))) {
stop("The specified JSONLabel do not exist in the FemFit object.", call. = FALSE)
}
# Throw an error for sessionOrder if it is not specified to be a character or they do not exist in the FemFit object
if (!is.null(sessionOrder[1:2])) {
if (all(is.character(sessionOrder[1:2]))) {
if (!all(sessionOrder[1:2] %in% unique(x$df$sessionID))) {
stop("The specified pair of sessionIDs do not exist in the FemFit object.", call. = FALSE)
}
sessionOrder = sessionOrder[1:2]
} else {
stop("The specified sessionIDs are not both characters.", call. = FALSE)
}
}
# Throw an error if B is not a numeric
if (!is.numeric(B[1])) {
stop("The B argument is not a numeric.", call. = FALSE)
}
B = B[1]
# Throw a warning if B is not greater than 0
if (B < 1) {
B = 1000
warning("The B argument is less 1, it has been set to 1000.", call. = FALSE)
}
# Throw a warning if B is non-integer
if (B < 1) {
B = round(B, digits = 0)
warning("The B argument is non-integer, it has been rounded to an integer.", call. = FALSE)
}
# Throw an error if toPlot is not a logical
if (!is.logical(toPlot[1])) {
stop("The toPlot argument is not a logical.", call. = FALSE)
}
toPlot = toPlot[1]
# Determine the the order of the comparison
if (is.null(sessionOrder)) {
sessionOrder = unique(x$df$sessionID)
}
# Create sequential event region labels if they do not exist
if (x$df %>% colnames %>% {any(grepl("^trLabelSeq$", .))} %>% !.) {
x$df = FemFit:::deriveZero_seqLabels(x$df)
}
# Construct the reference `tibble` to store what is needed to be extracted from x$df
vlookup = sapply(sessionOrder, function (sessionIDs_Child) {
x$df %>%
dplyr::filter(trLabel == "event", sessionID == sessionIDs_Child, JSONLabel == whichJSONLabel) %>%
dplyr::pull(trLabelSeq) %>%
unique()
})
# Throw an error if there is an unequal number of events
if (length(vlookup[[1]]) != length(vlookup[[2]])) {
stop("Observed protocol information is not the same between sessions.", call. = FALSE)
}
# Finish constructing the reference `tibble`
vlookup = dplyr::as_tibble(vlookup) %>%
# Give the event traces identifiers some `prettier` labels
dplyr::mutate(newLabel = paste("Event", 1:n()))
# Manipulate x$df to:
x_Work = x$df %>%
# Extract the event pressure traces within the JSONLabel
dplyr::filter(trLabel == "event", JSONLabel == whichJSONLabel) %>%
split(.$sessionID) %>%
lapply(function(df_Child) {
sessionIDs_Child = df_Child$sessionID[1]
vlookup_Child = vlookup %>%
dplyr::select(sessionIDs_Child, newLabel)
# Append the `prettier` labels onto the filtered x$df
df_Child %>%
dplyr::left_join(vlookup_Child, by = c("trLabelSeq" = sessionIDs_Child))
}) %>%
dplyr::bind_rows()
# Construct the reference `tibble` so that `sensorToMatch` and `sensorToMatchAgainst` are drawn from the right session
toGen = dplyr::tibble(sensor = paste0("zeroPrssr", sensorToMatch), sessionID = sessionOrder[1]) %>%
dplyr::bind_rows(expand.grid(sensor = paste0("zeroPrssr", sensorToMatchAgainst), sessionID = sessionOrder[2], stringsAsFactors = FALSE))
# Derive the loess regression functinos based on the standardised `zeroPrssr`s
x_Hat = apply(toGen, 1, function (toGen_Row) {
FemFit:::repeatAnalysis_generateAbsoluteErrs(toGen_Row[1], x_df = x_Work, toGen_Row[2])
}) %>%
dplyr::bind_rows()
# Construct the comparison matrix and filter to keep only to compare event traces conducted at the same point of the protocol between sessions
comparisonMatrix = expand.grid(sessionOrder, unique(x_Hat$eventID), stringsAsFactors = FALSE) %>%
{paste0(.$Var1, "*", .$Var2)} %>%
combn(2) %>%
t %>%
dplyr::as_tibble() %>%
dplyr::mutate(session_A = gsub("(.*)\\*(.*)", "\\1", V1),
session_B = gsub("(.*)\\*(.*)", "\\1", V2),
event_A = gsub("(.*)\\*(.*)", "\\2", V1),
event_B = gsub("(.*)\\*(.*)", "\\2", V2),
order = as.numeric(gsub("(.*)\\*(.*)[[:space:]]([0-9])", "\\3", V1))) %>%
dplyr::select(-V1, -V2) %>%
dplyr::filter(dplyr::if_else(event_A == event_B, TRUE, dplyr::if_else(session_A == session_B, TRUE, FALSE))) %>%
dplyr::mutate(between = dplyr::if_else(event_A == event_B, TRUE, FALSE)) %>%
dplyr::filter(between) %>%
dplyr::arrange(order) %>%
dplyr::select(-order)
# For each row of the comparison matrix...
output = apply(comparisonMatrix, 1, function (input) {
# Extract the loess regression functions of interest to do the comparison between elements of `sensorToMatch` and `sensorToMatchAgainst`
x_WChild = dplyr::filter(x_Hat, sessionID == input[1], eventID == input[3]) %>%
dplyr::full_join(dplyr::filter(x_Hat, sessionID == input[2], eventID == input[4]), by = c("eventID", "JSONLabel"), suffix = c("_A", "_B"))
# For each comparison of `sensorToMatch` and `sensorToMatchAgainst`
results = sapply(1:nrow(x_WChild), function (rowIndex) {
x_WGChild = x_WChild[rowIndex, ]
# Calculate the average number of observations to draw from the loess regression functions over time
nobs = round((x_WGChild$nob_A + x_WGChild$nob_B)/2)
# Calculate the squared differences between the loess regression functions
diff_df = dplyr::tibble(rescaleTime = seq(0, 1, length.out = nobs)) %>%
{dplyr::mutate(.,
yHat_A = predict(x_WGChild$loess_fit_A[[1]], newdata = .),
yHat_B = predict(x_WGChild$loess_fit_B[[1]], newdata = .),
diffHat = (yHat_A - yHat_B)^2,
sensor_A = x_WGChild$sensor_A,
sensor_B = x_WGChild$sensor_B,
sessionID_A = x_WGChild$sessionID_A,
sessionID_B = x_WGChild$sessionID_B,
JSONLabel = x_WGChild$JSONLabel,
eventID = x_WGChild$eventID
)}
# Use the default histogram breakpoints to setup the starting values of the two states with Normal distributions
midPoints = hist(diff_df$diffHat, plot = FALSE)$mids
initMod = depmix(diffHat ~ 1, data = diff_df, nstates = 2, instart = c(0.5, 0.5), respstart = c(midPoints[1], 0.05, midPoints[length(midPoints)], 0.05))
# Fit the HMM model with EM-Algorithm
emMod = fit(initMod, verbose = FALSE)
emPars = getpars(emMod)
# Extract the fitted parameters of the HMM
tMat.hat = matrix(emPars[3:6], byrow = TRUE, nrow = 2)
muHat = emPars[c(7, 9)]
sigmaHat = emPars[c(8, 10)]
# Format the fitted parameters of the HMM to return to the user
hmmTheta = list(transition_Matrix = tMat.hat, state_Means = as.numeric(muHat), state_SDs = as.numeric(sigmaHat))
# Identify the state with the lowest mean
lowLabel.hat = dplyr::if_else(muHat[1] < muHat[2], 1, 2)
# Calculate the proportion of similarity
classProb = length(which(posterior(emMod)[, 1] == lowLabel.hat))/nobs
# Bootstrap the delta probabilities of the Viterbi algorithm to get a sense of the proportion's variability
classProb_bootstraps = apply(posterior(emMod)[-1, ], 1, function (row) {
sample(1:2, size = B, replace = TRUE, prob = row[2:3])
}) %>%
rbind() %>%
{cbind(rep(posterior(emMod)[1, 1], times = B), .)} %>%
apply(1, function (row) {
length(which(row == lowLabel.hat))/nobs
})
# Standard Bootstrap CI calculations to return to the user
classProb_stderr = sd(classProb_bootstraps)
classProb_lwrbnd = quantile(classProb_bootstraps, probs = 0.025)
classProb_uprbnd = quantile(classProb_bootstraps, probs = 0.975)
# If the a plot is required to visualise where the HMM detected dissimilarities with the squared differences
plotObj = NULL
if (toPlot) {
# Reconstruct the data.frame to be ggplot2 friendly
diff_gather = diff_df %>%
dplyr::select(-rescaleTime, -diffHat) %>%
tidyr::gather(sensor, yHat, -sensor_A, -sensor_B, -eventID, -sessionID_A, -sessionID_B, -JSONLabel) %>%
dplyr::mutate(rescaleTime = rep(dplyr::pull(diff_df, rescaleTime), 2),
sensor = paste0(dplyr::if_else(sensor == "yHat_A", sessionID_A, sessionID_B), ": ", gsub(".*([0-9])", "Sensor \\1", dplyr::if_else(sensor == "yHat_A", sensor_A, sensor_B))),
toPolygon = !rep(posterior(emMod)[, 1] != lowLabel.hat, 2)) %>%
dplyr::select(-sensor_A, -sensor_B)
diff_gather$sensor2 = factor(x = diff_gather$sensor, levels = unique(diff_gather$sensor)[c(grep(x_WGChild$sessionID_A, unique(diff_gather$sensor)), grep(x_WGChild$sessionID_B, unique(diff_gather$sensor)))])
diff_gather$lineBreaks = c(TRUE, diff_gather$toPolygon[2:nrow(diff_gather)] == diff_gather$toPolygon[1:(nrow(diff_gather)-1)]) %>%
{c(1, which(. != 1), nrow(diff_gather)+1)} %>%
{rep(1:length(diff(.)), diff(.))}
# Produce the plot
plotObj = ggplot2::ggplot(data = diff_gather, aes(x = rescaleTime, y = yHat)) +
ggplot2::facet_wrap(~ sensor2) +
ggplot2::theme_bw() +
ggplot2::geom_area(aes(y = 1, fill = toPolygon, group = lineBreaks), show.legend = FALSE, alpha = 0.4) +
ggplot2::geom_line() +
ggplot2::scale_fill_manual(values = c("tomato", NA), labels = c("Disparity", "Similarity"), guide = ggplot2::guide_legend(title = NULL, title.position = "bottom")) +
ggplot2::labs(title = paste0(x_WGChild$JSONLabel, ": ", gsub("event\\.([0-9]*)", "Event \\1", x_WGChild$eventID)), x = "Time (Rescaled)", y = "Zeroed Pressure (Rescaled)")
}
# Return the required output for the current set of comparisons
return (list(classProb, classProb_stderr, classProb_lwrbnd, classProb_uprbnd, hmmTheta, plotObj))
}) %>%
t() %>%
dplyr::as_tibble() %>%
setNames(c("classProb", "classProb_stderr", "classProb_lwrbnd", "classProb_uprbnd", "hmmTheta", "plotObj"))
return (dplyr::bind_cols(x_WChild, results))
}) %>%
# And format the object to return to the user
dplyr::bind_rows() %>%
dplyr::select(-loess_fit_A, -nob_A, -loess_fit_B, -nob_B) %>%
dplyr::mutate(classProb = as.numeric(classProb),
classProb_stderr = as.numeric(classProb_stderr),
classProb_lwrbnd = as.numeric(classProb_lwrbnd),
classProb_uprbnd = as.numeric(classProb_uprbnd))
return (output)
}
#' @title
#' Internal functions of \code{repeatAnalysis}
#'
#' @description
#' Accessible for end-user use with \code{FemFit:::} as a prefix to each function. Note that these child functions have no in-built error detection.
#'
#' @name repeatAnalysis_Child
NULL
#> NULL
#' @describeIn repeatAnalysis_Child
#' Fits a loess regression with the optimal bandwidth parameter determined with AICc.
#'
#' @keywords internal
repeatAnalysis_loess_AICc = function (...) {
# Fit the initial model specified with ...
loess.fit.0 = loess(...)
# Store the number of observations and the value the cell parameter is set to
N = loess.fit.0$n
Cell = loess.fit.0$pars$cell
# Helper function to evaluate the AICc of a loess at a given span parameter
loess_Optim = function(span) {
# Note that the cell parameter is adjusted if and only if floor(N * span * Cell) evaluates to be less than or equal to one
loess.fit.i = update(loess.fit.0, span = span, cell = dplyr::if_else(floor(N * span * Cell) <= 1, 3 / (N * span), Cell))
sigma2.hat = sum((loess.fit.i$residuals)^2)/loess.fit.i$n
trace.hat = loess.fit.i$trace.hat
return (log(sigma2.hat) + 1 + 2 * (trace.hat + 1) / (loess.fit.i$n - trace.hat - 2))
}
# Set the updated span and cell parameters
span. = optimise(loess_Optim, c(0.05, 0.95))$minimum
cell. = dplyr::if_else(floor(N * span. * Cell) <= 1, 3 / (N * span.), Cell)
# Return a loess fit with the updated span and cell parameters
return (loess(..., span = span., cell = cell.))
}
#' @describeIn repeatAnalysis_Child
#' Generates the loess regression functions for a given event trace indexed by sensor and session.
#'
#' @keywords internal
repeatAnalysis_generateAbsoluteErrs = function (whichSensor, x_df, whichSession) {
x_Errs = x_df %>%
# Extract the raw pressure measurements for the given sensor and session
dplyr::filter(trLabel == "event", sessionID == whichSession) %>%
dplyr::select(zeroPrssr = dplyr::matches(whichSensor), time, JSONLabel, newLabel) %>%
split(.$newLabel) %>%
lapply(function (eventRegion_df) {
# Standardise within the partition
eventRegionChild_df = eventRegion_df %>%
dplyr::mutate(rescaleTime = (time - min(time))/(max(time) - min(time)),
rescaleZero = (zeroPrssr - min(zeroPrssr))/(max(zeroPrssr) - min(zeroPrssr)))
# Obtain the smooth approximation of the raw pressure measurements via a loess function
loess.aicc.fit = FemFit:::repeatAnalysis_loess_AICc(rescaleZero ~ rescaleTime, data = eventRegionChild_df, surface = "direct")
# Index the loess functions
dplyr::tibble(loess_fit = list(loess.aicc.fit), sessionID = whichSession, nob = nrow(eventRegion_df), JSONLabel = eventRegionChild_df$JSONLabel[1], eventID = eventRegionChild_df$newLabel[1])
}) %>%
dplyr::bind_rows() %>%
dplyr::mutate(sensor = whichSensor)
# Return the loess functions
return (x_Errs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.