#' Segment an FemFit dataset
#'
#' @description
#' Conducts principal component analysis then fits a regression tree to the first principal component. Event terminal nodes within a JSONLabel are identified by "large" mean deviations.
#'
#' @param x An "FemFit" object.
#' @param cp. The complexity parameter provided to the regression tree.
#' @param numOfNodesToLabel The number of nodes to label as events within a JSONLabel.
#' @param JSONLabel_Baseline The JSONLabels corresponding to the definite baselines.
#' @param JSONLabel_Sequence The JSONLabel sequence corresponding to \code{numOfNodesToLabel}. Defaults to \code{unique(x$df$JSONLabel)}.
#' @param xval. The number of cross-validations parameter provided to the regression tree.
#' @param ... Arguments passed to \code{\link{rpart}}.
#'
#' @details
#' The elements of the list, \code{numOfNodesToLabel}, maps to each unique \code{sessionID}. The numeric vector corresponds to the number of terminal nodes to label as events within each level of \code{JSONLabel}, where the numeric vector maps to \code{JSONLabel_Sequence}.
#'
#' \code{cp.}, \code{numOfNodesToLabel}, \code{JSONLabel_Baseline}, \code{JSONLabel_Sequence}, and \code{xval.} are recylced to the length of \code{unique(x$df$sessionID)}. \code{...} arguments are not vectorised.
#'
#' The length of the numeric vector depends on the number of non-\code{JSONLabel_Baseline} device labels.
#'
#' @return
#' Updates the \code{errorSummary} element found in the "FemFit" object. This contains a list of terminal nodes with their assigned label and timestamps.
#'
#' @seealso
#' \code{\link{prcomp}} and \code{\link{rpart}} for how the main statistical learning components work.
#'
#' \code{\link{segmentRefine_pcurve}} and \code{\link{segmentRefine_protocol}} for functions to identifiy events in noisy pressure traces post-segmentation.
#'
#' @examples
#' # Read in the FemFit data
#' AS005 = read_FemFit(c(
#' "Datasets_AukRepeat/61aa0782289af385_283_csv.zip",
#' "Datasets_AukRepeat/61aa0782289af385_284_csv.zip"
#' ),
#' remove.NAs = TRUE
#' )
#'
#' # Segment the FemFit data
#' AS005 = segment(
#' x = AS005
#' # Use the same control parameter for both sessions
#' cp. = 0.001,
#' # Define a different set in the number of terminal nodes to edit for each session
#' numOfNodesToLabel = list(c(3, 1, 3, 4), c(4, 1, 5, 3)),
#' # Set the baseline JSONLabels. segment() will look for both "relax30s_A" and "relax30s_B" in each session
#' JSONLabel_Baseline = list(c("relax30s_A", "relax30s_B")),
#' # Works in conjunction with the numOfNodesToLabel argument.
#' # segment() will assume that the order of protocol is unique(AS005$df$JSONLabel), excluding the baseline JSONLabels
#' JSONLabel_Sequence = list("")
#' )
#'
#' @export
segment = function(x, cp. = 0.005, numOfNodesToLabel = list(c(3, 1, 6, 3)), JSONLabel_Baseline = list(c("relax30s_A", "relax30s_B")), JSONLabel_Sequence = list(""), xval. = 100, ...) {
# Throw an error if the x argument is not an FemFit object or missing
if (!inherits(x, "FemFit") || is.na(x)) {
stop("The x argument is not an FemFit object.", call. = FALSE)
}
# Throw an error if cp. is not a numeric or it has any NAs
if (!is.numeric(cp.) || anyNA(cp.)) {
stop("The provided cp. value is not a numeric.", call. = FALSE)
}
# Throw an error if cp. is less than 0
if (any(cp. < 0)) {
stop("The provided cp. value is less than zero. See ?rpart for details.", call. = FALSE)
}
# Throw a warning if cp. is less than 0.001
if (any(cp. < 0.001)) {
warning("The provided cp. value is less than 0.001. The naive labelling may not be easily reproducible without a preceding set.seed() call.", call. = FALSE)
}
# Throw an error if numOfNodesToLabel is not a numeric or it has any NAs
if (lapply(numOfNodesToLabel, is.numeric) %>% unlist %>% all %>% !. || lapply(numOfNodesToLabel, anyNA) %>% unlist %>% all) {
stop("A number of provided numOfNodesToLabel values are not numerics.", call. = FALSE)
}
# Throw an error if JSONLabel_Baseline is not a character or it has any NAs
if (lapply(JSONLabel_Baseline, is.character) %>% unlist %>% all %>% !. || lapply(JSONLabel_Baseline, anyNA) %>% unlist %>% all) {
stop("A number of provided JSONLabel_Baseline value(s) are not characters.", call. = FALSE)
}
# Throw an error if JSONLabel_Seq is not a character or it has any NAs
if (lapply(JSONLabel_Sequence, is.character) %>% unlist %>% all %>% !. || lapply(JSONLabel_Sequence, anyNA) %>% unlist %>% all) {
stop("A number of provided JSONLabel_Sequence value(s) are not characters.", call. = FALSE)
}
# Throw an error if xval. is not a numeric or it has any NAs
if (!is.numeric(cp.) || anyNA(cp.)) {
stop("The provided xval. value is not a numeric.", call. = FALSE)
}
# Throw a warning if xval. is not a whole number
if (any(xval. %% 1 != 0)) {
warning("The provided xval. value is not a whole number. segment() will coerce it into a whole number.", call. = FALSE)
xval. = round(xval., 0)
}
# Throw an error if xval. is less than 0.001
if (any(xval. < 1)) {
stop("The provided xval. value is less than 1. Must be a value greater than 1.", call. = FALSE)
}
# Recycle elements of cp, numOfNodesToLabel, JSONLabel_Baseline, JSONLabel_Sequence, and xval. if necessary
# Also, name the elements within each vector with sessionID
sessionIDs = unique(x$df$sessionID)
sessionID.len = length(unique(x$df$sessionID))
cp. = rep(cp., length.out = sessionID.len)
names(cp.) = sessionIDs
numOfNodesToLabel = rep(numOfNodesToLabel, length.out = sessionID.len)
names(numOfNodesToLabel) = sessionIDs
JSONLabel_Baseline = rep(JSONLabel_Baseline, length.out = sessionID.len)
names(JSONLabel_Baseline) = sessionIDs
JSONLabel_Sequence = rep(JSONLabel_Sequence, length.out = sessionID.len)
names(JSONLabel_Sequence) = sessionIDs
xval. = rep(xval., length.out = sessionID.len)
names(xval.) = sessionIDs
# Construct the regular expression with the JSONLabel_Baseline components
regexp.JSONLabels = sapply(JSONLabel_Baseline, function (child) paste(child, collapse = "|"))
# Check whether the length of numOfNodesToLabel is equal to the number of non-baseline JSONLabels
numOfNodesToLabel = lapply(sessionIDs, function (child) {
if (all(JSONLabel_Sequence[[child]] == "")) {
# obs.JSONLabels defaults to unique(x$df$JSONLabel) if JSONLabel_Sequence[[child]] is missing
obs.JSONLabels = unique(x$df %>% dplyr::filter(sessionID == child) %>% dplyr::pull(JSONLabel))
} else {
obs.JSONLabels = JSONLabel_Sequence[[child]] %>% unique()
}
if (length(numOfNodesToLabel[[child]]) != length(obs.JSONLabels[!grepl(regexp.JSONLabels[child], obs.JSONLabels)])) {
stop(paste0("The number of non-exercise JSONLabels found in Session ", child, " does not much the length of the corresponding numOfNodesToLabel vector argument"), call. = FALSE)
}
names(numOfNodesToLabel[[child]]) = obs.JSONLabels[!grepl(regexp.JSONLabels[child], obs.JSONLabels)]
numOfNodesToLabel[[child]]
})
names(numOfNodesToLabel) = sessionIDs
# Throw a warning if trLabel exists in x$df and remove it from x$df
if (x$df %>% colnames %>% {any(grepl("^trLabel$", .))}) {
warning("trLabel already exists in the FemFit object. segment() will overwrite the previous trLabel.", call. = FALSE)
x$df = x$df %>%
dplyr::select(-trLabel)
}
x_Work = by(x$df, x$df$sessionID, function (x_Child) {
sessionID_Child = x_Child$sessionID[1]
# Generate the first principal component of the eight pressure traces
x_Child$pc.1 = x_Child %>%
dplyr::select(dplyr::starts_with("prssr_sensor")) %>%
dplyr::select_if(function(x) (var(x) != 0)) %>%
prcomp(., center = TRUE, scale. = TRUE) %>%
.$x %>%
.[, 1]
# Fit a regression tree to split the first principal component with the explanatory variables time and JSONLabel
rgrssnTree = rpart::rpart(formula = pc.1 ~ time + JSONLabel, data = x_Child, method = "anova", cp = cp.[sessionID_Child], xval = xval.[sessionID_Child], ...) %>%
# Prune the regression tree with the one std. error rule
prune(., cp = with(dplyr::as_tibble(.$cptable), max(CP[which(xerror <= min(xerror) + xstd[which(xerror == min(xerror))[1]])[1]])))
# Define the temporal order
curOrder = unique(rgrssnTree$where)
newOrder = 1:length(curOrder)
# Load in the regression splits into the data.frame
x_Child$trmnlNode =
# Recode the nodes to labelled in temporal order
sapply(rgrssnTree$where, function (x) {newOrder[which(x == curOrder)]}) %>%
{
# Create a data.frame object which only has the recoded nodes and their JSONLabels
df = data.frame(trmnlNode = ., JSONLabel = dplyr::pull(x_Child, JSONLabel))
# Identify where a node spans over two or more JSONLabels
vlookup = unique(df) %>%
dplyr::mutate(trmnlNode_New = row_number())
# Create the updated nodes
df = df %>%
dplyr::left_join(y = vlookup, by = c("trmnlNode", "JSONLabel")) %>%
dplyr::pull(trmnlNode_New)
}
# Extract out the (principal component one) means for the device recorded baselines
fitted.Mus = x_Child %>%
dplyr::filter(grepl(regexp.JSONLabels[sessionID_Child], JSONLabel)) %>%
dplyr::group_by(trmnlNode) %>%
dplyr::summarise(fitted = mean(pc.1)) %>% {
setNames(dplyr::pull(., fitted), paste0("bslineNode_", unique(.$trmnlNode)))
}
# Create a new set of labels to store the node labels into
trLabels = gsub("bslineNode_([0-9]*)", "trLabel_\\1", names(fitted.Mus))
names(trLabels) = names(fitted.Mus)
trLabels.len = length(trLabels)
# Load in the right numOfNodesToLabel argument...
numOfNodesToLabel_Child = numOfNodesToLabel[[sessionID_Child]]
# Create a data.frame object which contains each node's prediction error for each element in fitted.Mus
predErrs = vapply(fitted.Mus, FUN.VALUE = numeric(nrow(x_Child)), FUN = function (mu) {
(x_Child$pc.1 - mu)^2
}) %>%
dplyr::as_tibble() %>%
dplyr::bind_cols(x_Child) %>%
dplyr::group_by(sessionID, JSONLabel, trmnlNode) %>%
dplyr::summarise_at(names(fitted.Mus), funs(sqrt(mean(., na.rm = TRUE)))) %>%
# Create naive labels of baselines and events
by(., .$JSONLabel, function (predErrs_Child) {
# Create the naive baseline and event labels for the exercises
if (!grepl(regexp.JSONLabels[sessionID_Child], predErrs_Child$JSONLabel[1])) {
predErrs_Child = lapply(trLabels, function (current_Label) {
predErrs_GrandChild = predErrs_Child %>%
dplyr::arrange_(paste0("desc(", names(trLabels[which(trLabels == current_Label)]), ")"))
predErrs_GrandChild[current_Label] = "baseline"
predErrs_GrandChild[1:numOfNodesToLabel_Child[predErrs_Child$JSONLabel[1]], current_Label] = "event"
predErrs_GrandChild[c("trmnlNode", current_Label)]
}) %>%
Reduce(function(tdf1, tdf2) dplyr::left_join(tdf1, tdf2, by = "trmnlNode"), .) %>%
{dplyr::left_join(predErrs_Child, ., by = "trmnlNode")}
# Else, initialise all node labels as baselines
} else {
predErrs_Child[trLabels] = "baseline"
}
return (predErrs_Child)
}) %>%
{Reduce(function(dtf1, dtf2) dplyr::bind_rows(dtf1, dtf2), .)} %>%
dplyr::mutate(trLabel = NA_real_)
# Majority vote to get the overall naive label for a node
predErrs$trLabel = apply(predErrs, 1, function (row_Child) {
row_Child[trLabels] %>%
.[grepl("event", .)] %>%
{(length(.) / trLabels.len >= 0.5)} %>%
dplyr::if_else(., "event", "baseline")
})
# Final safety check to ensure the number of event nodes returned is the specified number of nodes
unwrapped_Nodes = unlist(numOfNodesToLabel[sessionID_Child])
names(unwrapped_Nodes) = gsub(paste0(sessionID_Child, "\\.(.*)"), "\\1", names(unwrapped_Nodes))
finalSafety = predErrs %>%
dplyr::filter(trLabel == "event") %>%
dplyr::group_by(JSONLabel) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(expectedN = unwrapped_Nodes[JSONLabel],
isExpected = n == expectedN)
if (any(!finalSafety$isExpected)) {
# Initialise an empty tibble object
index_Base = dplyr::tibble()
# Utilising a for-loop here as the index_Base object has to be updated
for (index in which(finalSafety$isExpected == FALSE)) {
# Isolate the event terminal nodes
finalSafety_Child = predErrs %>%
dplyr::filter(trLabel == "event") %>%
dplyr::group_by(JSONLabel) %>%
dplyr::filter(JSONLabel == finalSafety$JSONLabel[index])
# Update the labels based on which nodes have the highest mean princpal component one score
index_Child = x_Child %>%
dplyr::filter(JSONLabel == finalSafety$JSONLabel[index],
trmnlNode %in% finalSafety_Child$trmnlNode) %>%
dplyr::group_by(trmnlNode) %>%
dplyr::summarise(mu = mean(pc.1)) %>%
dplyr::arrange(desc(mu)) %>%
dplyr::mutate(trLabel_New = "baseline")
index_Child$trLabel_New[1:finalSafety$expectedN[index]] = "event"
# Update the index_Base object
index_Base = dplyr::bind_rows(index_Base, dplyr::select(index_Child, trLabel_New, trmnlNode))
}
# Update the predErrs object with the right number of event classifications
predErrs = dplyr::left_join(predErrs, index_Base, by = "trmnlNode") %>%
dplyr::mutate(trLabel = dplyr::if_else(!is.na(trLabel_New), trLabel_New, trLabel)) %>%
dplyr::select(-trLabel_New)
}
# Append the trLabel to x_Child
x_Child = x_Child %>%
dplyr::left_join(y = dplyr::select(predErrs, sessionID, JSONLabel, trmnlNode, trLabel), by = c("sessionID", "JSONLabel", "trmnlNode")) %>%
dplyr::mutate(trLabel = dplyr::if_else(is.na(trLabel), "baseline", trLabel))
# Construct errorSummary
errorSummary = predErrs %>%
dplyr::select(sessionID, JSONLabel, trmnlNode, trLabel) %>%
dplyr::ungroup() %>%
dplyr::left_join(x_Child %>% group_by(trmnlNode) %>% summarise(start = min(time), stop = max(time)), by = "trmnlNode")
return (list(
df = x_Child %>% dplyr::select(-pc.1),
errorSummary = errorSummary
))
})
# Setup the object to return to the end-user
if (length(x_Work) == 1) {
x$df = x_Work[[1]]$df
x$errorSummary = x_Work[[1]]$errorSummary
} else {
x$df = x_Work %>% Reduce(function(lst1, lst2) dplyr::bind_rows(lst1$df, lst2$df), .)
x$errorSummary = x_Work %>% Reduce(function(lst1, lst2) dplyr::bind_rows(lst1$errorSummary, lst2$errorSummary), .)
}
return (x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.