FemFit: Functions to Analyse FemFit Datasets

Documented in repeatAnalysis

#' Derive the proportion of similarities in the shape of event traces
#'
#' @description
#' Derive the proportion of similarities in the shape of event traces between sensors within two sessions.
#'
#' @param x An "FemFit" object.
#' @param sensorToMatch A numeric vector which denotes sensors to use in the comparison from \code{sessionOrder[1]}.
#' @param sensorToMatchAgainst A numeric vector which denotes sensors to use in the comparison from \code{sessionOrder[2]}.
#' @param whichJSONLabel A character argument specifying part of the protocol to extract the event traces from.
#' @param sessionOrder A character vector of length two specifying the sessions to compare. Defaults to \code{unique(x$df$sessionID)[1:2]}.
#' @param B A numeric argument specifying the number of bootstraps to run to estimate the variability of the propotion of similarities.
#' @param toPlot A logical argument as to whether \code{repeatAnalysis()} produces ggplot2 objects visualising where the Hidden Markov Model detected dissimilarities in the shape of the pressure trace.
#'
#' @details
#' \code{repeatAnalysis()} first standardises the event traces against themselves, procuring the raw shape of the event trace.
#' Then, it uses loess regression to get a smooth approximation of the shape for each event trace.
#' A total of (n_1 + n_2)/2 observations are drawn sequentially over time from each loess regression to construct the smooth approximation, where n_x is the number of observations that make up the xth event trace.
#' A vector of differences is constructed and then squared. A Hidden Markov Model is fitted to the vector of differences assuming two states which adhere to a Normal distribution.
#' Finally, the proportion of similarity is derived by using the output of the Viterbi algorithm, and the assessment of the variability is done with a parametric bootstrap based on the Viterbi algorithm's delta probabilities.
#'
#' @examples
#' AS005 = read_FemFit(c(
#'         "Datasets_AukRepeat/61aa0782289af385_283_csv.zip",
#'         "Datasets_AukRepeat/61aa0782289af385_284_csv.zip"
#'     ),
#'     remove.NAs = TRUE
#'   ) %>%
#'   # Segment the FemFit data
#'   segment(
#'     cp. = 0.001,
#'     numOfNodesToLabel = list(c(3, 1, 3, 4), c(4, 1, 5, 3))
#'   ) %>%
#'   # Calculate the zeroed out pressures
#'   deriveZero(method = "lmm")
#'
#' # Produce the proportions of similarities for Sensor four against Sensor four with the ggplot2 objects.
#' AS005_RepeatOutput = repeatAnalysis(AS005, 4, 4, "pfmc3x5s_rest30s", toPlot = TRUE)
#'
#' # View the proportion of similarities for the three events, with the bootstrapped standard errors and CIs
#' AS005_RepeatOutput %>%
#'    select(eventID, classProb, classProb_stderr, classProb_lwrbnd, classProb_uprbnd)
#'
#' # View the ggplot2 objects generated by `repeatAnalysis()`
#' AS005_RepeatOutput$plotObj[1]
#' AS005_RepeatOutput$plotObj[2]
#' AS005_RepeatOutput$plotObj[3]
#'
#' @export
repeatAnalysis = function(x, sensorToMatch = 4, sensorToMatchAgainst = 3:5, whichJSONLabel = "pfmc3x5s_rest30s", sessionOrder = NULL, B = 1000, toPlot = FALSE) {
  # Throw an error if the x argument is not an FemFit object or missing
  if (!inherits(x, "FemFit") || is.na(x)) {
    stop("The x argument is not an FemFit object.", call. = FALSE)
  }

  # Throw an error if zeroPrssr does not exists in x$df
  if (x$df %>% colnames %>% {any(grepl("^zeroPrssr[0-9]$", .))} %>% !.) {
    stop("zeroPrssr does not exist in the FemFit object.", call. = FALSE)
  }

  # Throw an error if sensorToMatch is not all numeric
  if (!all(is.numeric(sensorToMatch))) {
    stop("Some of the specified sensors in sensorToMatch are not numerics.", call. = FALSE)
  }

  # Throw an error if sensorToMatch is not between 1 and 8 (inclusive)
  if (!all(dplyr::between(sensorToMatch, 1, 8))) {
    stop("Some of the specified sensors in sensorToMatch are not numbered between 1 and 8 (inclusive).", call. = FALSE)
  }

  # Throw an error if sensorToMatchAgainst is not all numeric
  if (!all(is.numeric(sensorToMatchAgainst))) {
    stop("Some of the specified sensors in sensorToMatchAgainst are not numerics.", call. = FALSE)
  }

  # Throw an error if sensorToMatchAgainst is not between 1 and 8 (inclusive)
  if (!all(dplyr::between(sensorToMatchAgainst, 1, 8))) {
    stop("Some of the specified sensors in sensorToMatchAgainst are not numbered between 1 and 8 (inclusive).", call. = FALSE)
  }

  # Throw an error if whichJSONLabel is not a character
  if (!is.character(whichJSONLabel[1])) {
    stop("The whichJSONLabel argument is not a character.", call. = FALSE)
  }
  whichJSONLabel = whichJSONLabel[1]

  # Throw an error if whichJSONLabel does not exist in the FemFit object
  if (!all(whichJSONLabel %in% unique(x$df$JSONLabel))) {
    stop("The specified JSONLabel do not exist in the FemFit object.", call. = FALSE)
  }

  # Throw an error for sessionOrder if it is not specified to be a character or they do not exist in the FemFit object
  if (!is.null(sessionOrder[1:2])) {
    if (all(is.character(sessionOrder[1:2]))) {
      if (!all(sessionOrder[1:2] %in% unique(x$df$sessionID))) {
        stop("The specified pair of sessionIDs do not exist in the FemFit object.", call. = FALSE)
      }
      sessionOrder = sessionOrder[1:2]
    } else {
      stop("The specified sessionIDs are not both characters.", call. = FALSE)
    }
  }

  # Throw an error if B is not a numeric
  if (!is.numeric(B[1])) {
    stop("The B argument is not a numeric.", call. = FALSE)
  }
  B = B[1]

  # Throw a warning if B is not greater than 0
  if (B < 1) {
    B = 1000
    warning("The B argument is less 1, it has been set to 1000.", call. = FALSE)
  }

  # Throw a warning if B is non-integer
  if (B < 1) {
    B = round(B, digits = 0)
    warning("The B argument is non-integer, it has been rounded to an integer.", call. = FALSE)
  }

  # Throw an error if toPlot is not a logical
  if (!is.logical(toPlot[1])) {
    stop("The toPlot argument is not a logical.", call. = FALSE)
  }
  toPlot = toPlot[1]

  # Determine the the order of the comparison
  if (is.null(sessionOrder)) {
    sessionOrder = unique(x$df$sessionID)
  }

  # Create sequential event region labels if they do not exist
  if (x$df %>% colnames %>% {any(grepl("^trLabelSeq$", .))} %>% !.) {
    x$df = FemFit:::deriveZero_seqLabels(x$df)
  }

  # Construct the reference `tibble` to store what is needed to be extracted from x$df
  vlookup = sapply(sessionOrder, function (sessionIDs_Child) {
    x$df %>%
      dplyr::filter(trLabel == "event", sessionID == sessionIDs_Child, JSONLabel == whichJSONLabel) %>%
      dplyr::pull(trLabelSeq) %>%
      unique()
  })

  # Throw an error if there is an unequal number of events
  if (length(vlookup[[1]]) != length(vlookup[[2]])) {
    stop("Observed protocol information is not the same between sessions.", call. = FALSE)
  }

  # Finish constructing the reference `tibble`
  vlookup = dplyr::as_tibble(vlookup) %>%
    # Give the event traces identifiers some `prettier` labels
    dplyr::mutate(newLabel = paste("Event", 1:n()))

  # Manipulate x$df to:
  x_Work = x$df %>%
    # Extract the event pressure traces within the JSONLabel
    dplyr::filter(trLabel == "event", JSONLabel == whichJSONLabel) %>%
    split(.$sessionID) %>%
    lapply(function(df_Child) {
      sessionIDs_Child = df_Child$sessionID[1]
      vlookup_Child = vlookup %>%
        dplyr::select(sessionIDs_Child, newLabel)

      # Append the `prettier` labels onto the filtered x$df
      df_Child %>%
        dplyr::left_join(vlookup_Child, by = c("trLabelSeq" = sessionIDs_Child))
    }) %>%
    dplyr::bind_rows()

  # Construct the reference `tibble` so that `sensorToMatch` and `sensorToMatchAgainst` are drawn from the right session
  toGen = dplyr::tibble(sensor = paste0("zeroPrssr", sensorToMatch), sessionID = sessionOrder[1]) %>%
    dplyr::bind_rows(expand.grid(sensor = paste0("zeroPrssr", sensorToMatchAgainst), sessionID = sessionOrder[2], stringsAsFactors = FALSE))

  # Derive the loess regression functinos based on the standardised `zeroPrssr`s
  x_Hat = apply(toGen, 1, function (toGen_Row) {
    FemFit:::repeatAnalysis_generateAbsoluteErrs(toGen_Row[1], x_df = x_Work, toGen_Row[2])
  }) %>%
    dplyr::bind_rows()

  # Construct the comparison matrix and filter to keep only to compare event traces conducted at the same point of the protocol between sessions
  comparisonMatrix = expand.grid(sessionOrder, unique(x_Hat$eventID), stringsAsFactors = FALSE) %>%
  {paste0(.$Var1, "*", .$Var2)} %>%
    combn(2) %>%
    t %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(session_A = gsub("(.*)\\*(.*)", "\\1", V1),
                  session_B = gsub("(.*)\\*(.*)", "\\1", V2),
                  event_A = gsub("(.*)\\*(.*)", "\\2", V1),
                  event_B = gsub("(.*)\\*(.*)", "\\2", V2),
                  order = as.numeric(gsub("(.*)\\*(.*)[[:space:]]([0-9])", "\\3", V1))) %>%
    dplyr::select(-V1, -V2) %>%
    dplyr::filter(dplyr::if_else(event_A == event_B, TRUE, dplyr::if_else(session_A == session_B, TRUE, FALSE))) %>%
    dplyr::mutate(between = dplyr::if_else(event_A == event_B, TRUE, FALSE)) %>%
    dplyr::filter(between) %>%
    dplyr::arrange(order) %>%
    dplyr::select(-order)

  # For each row of the comparison matrix...
  output = apply(comparisonMatrix, 1, function (input) {
    # Extract the loess regression functions of interest to do the comparison between elements of `sensorToMatch` and `sensorToMatchAgainst`
    x_WChild = dplyr::filter(x_Hat, sessionID == input[1], eventID == input[3]) %>%
      dplyr::full_join(dplyr::filter(x_Hat, sessionID == input[2], eventID == input[4]), by = c("eventID", "JSONLabel"), suffix = c("_A", "_B"))

    # For each comparison of `sensorToMatch` and `sensorToMatchAgainst`
    results = sapply(1:nrow(x_WChild), function (rowIndex) {
      x_WGChild = x_WChild[rowIndex, ]

      # Calculate the average number of observations to draw from the loess regression functions over time
      nobs = round((x_WGChild$nob_A + x_WGChild$nob_B)/2)

      # Calculate the squared differences between the loess regression functions
      diff_df = dplyr::tibble(rescaleTime = seq(0, 1, length.out = nobs)) %>%
      {dplyr::mutate(.,
                     yHat_A = predict(x_WGChild$loess_fit_A[[1]], newdata = .),
                     yHat_B = predict(x_WGChild$loess_fit_B[[1]], newdata = .),
                     diffHat = (yHat_A - yHat_B)^2,
                     sensor_A = x_WGChild$sensor_A,
                     sensor_B = x_WGChild$sensor_B,
                     sessionID_A = x_WGChild$sessionID_A,
                     sessionID_B = x_WGChild$sessionID_B,
                     JSONLabel = x_WGChild$JSONLabel,
                     eventID = x_WGChild$eventID
      )}

      # Use the default histogram breakpoints to setup the starting values of the two states with Normal distributions
      midPoints = hist(diff_df$diffHat, plot = FALSE)$mids
      initMod = depmix(diffHat ~ 1, data = diff_df, nstates = 2, instart = c(0.5, 0.5), respstart = c(midPoints[1], 0.05, midPoints[length(midPoints)], 0.05))

      # Fit the HMM model with EM-Algorithm
      emMod = fit(initMod, verbose = FALSE)
      emPars = getpars(emMod)
      # Extract the fitted parameters of the HMM
      tMat.hat = matrix(emPars[3:6], byrow = TRUE, nrow = 2)
      muHat = emPars[c(7, 9)]
      sigmaHat = emPars[c(8, 10)]

      # Format the fitted parameters of the HMM to return to the user
      hmmTheta = list(transition_Matrix = tMat.hat, state_Means = as.numeric(muHat), state_SDs = as.numeric(sigmaHat))

      # Identify the state with the lowest mean
      lowLabel.hat = dplyr::if_else(muHat[1] < muHat[2], 1, 2)
      # Calculate the proportion of similarity
      classProb = length(which(posterior(emMod)[, 1] == lowLabel.hat))/nobs

      # Bootstrap the delta probabilities of the Viterbi algorithm to get a sense of the proportion's variability
      classProb_bootstraps = apply(posterior(emMod)[-1, ], 1, function (row) {
        sample(1:2, size = B, replace = TRUE, prob = row[2:3])
      }) %>%
        rbind() %>%
        {cbind(rep(posterior(emMod)[1, 1], times = B), .)} %>%
        apply(1, function (row) {
          length(which(row == lowLabel.hat))/nobs
        })

      # Standard Bootstrap CI calculations to return to the user
      classProb_stderr = sd(classProb_bootstraps)
      classProb_lwrbnd = quantile(classProb_bootstraps, probs = 0.025)
      classProb_uprbnd = quantile(classProb_bootstraps, probs = 0.975)

      # If the a plot is required to visualise where the HMM detected dissimilarities with the squared differences
      plotObj = NULL
      if (toPlot) {
        # Reconstruct the data.frame to be ggplot2 friendly
        diff_gather = diff_df %>%
          dplyr::select(-rescaleTime, -diffHat) %>%
          tidyr::gather(sensor, yHat, -sensor_A, -sensor_B, -eventID, -sessionID_A, -sessionID_B, -JSONLabel) %>%
          dplyr::mutate(rescaleTime = rep(dplyr::pull(diff_df, rescaleTime), 2),
                        sensor = paste0(dplyr::if_else(sensor == "yHat_A", sessionID_A, sessionID_B), ": ", gsub(".*([0-9])", "Sensor \\1", dplyr::if_else(sensor == "yHat_A", sensor_A, sensor_B))),
                        toPolygon = !rep(posterior(emMod)[, 1] != lowLabel.hat, 2)) %>%
          dplyr::select(-sensor_A, -sensor_B)

        diff_gather$sensor2 = factor(x = diff_gather$sensor, levels = unique(diff_gather$sensor)[c(grep(x_WGChild$sessionID_A, unique(diff_gather$sensor)), grep(x_WGChild$sessionID_B, unique(diff_gather$sensor)))])

        diff_gather$lineBreaks = c(TRUE, diff_gather$toPolygon[2:nrow(diff_gather)] == diff_gather$toPolygon[1:(nrow(diff_gather)-1)]) %>%
        {c(1, which(. != 1), nrow(diff_gather)+1)} %>%
        {rep(1:length(diff(.)), diff(.))}

        # Produce the plot
        plotObj = ggplot2::ggplot(data = diff_gather, aes(x = rescaleTime, y = yHat)) +
          ggplot2::facet_wrap(~ sensor2) +
          ggplot2::theme_bw() +
          ggplot2::geom_area(aes(y = 1, fill = toPolygon, group = lineBreaks), show.legend = FALSE, alpha = 0.4) +
          ggplot2::geom_line() +
          ggplot2::scale_fill_manual(values = c("tomato", NA), labels = c("Disparity", "Similarity"), guide = ggplot2::guide_legend(title = NULL, title.position = "bottom")) +
          ggplot2::labs(title = paste0(x_WGChild$JSONLabel, ": ", gsub("event\\.([0-9]*)", "Event \\1", x_WGChild$eventID)), x = "Time (Rescaled)", y = "Zeroed Pressure (Rescaled)")
      }

      # Return the required output for the current set of comparisons
      return (list(classProb, classProb_stderr, classProb_lwrbnd, classProb_uprbnd, hmmTheta, plotObj))
    }) %>%
      t() %>%
      dplyr::as_tibble() %>%
      setNames(c("classProb",  "classProb_stderr", "classProb_lwrbnd", "classProb_uprbnd", "hmmTheta", "plotObj"))

    return (dplyr::bind_cols(x_WChild, results))
  }) %>%
    # And format the object to return to the user
    dplyr::bind_rows() %>%
    dplyr::select(-loess_fit_A, -nob_A, -loess_fit_B, -nob_B) %>%
    dplyr::mutate(classProb = as.numeric(classProb),
                  classProb_stderr = as.numeric(classProb_stderr),
                  classProb_lwrbnd = as.numeric(classProb_lwrbnd),
                  classProb_uprbnd = as.numeric(classProb_uprbnd))

  return (output)
}


#' @title
#' Internal functions of \code{repeatAnalysis}
#'
#' @description
#' Accessible for end-user use with \code{FemFit:::} as a prefix to each function. Note that these child functions have no in-built error detection.
#'
#' @name repeatAnalysis_Child
NULL
#> NULL

#' @describeIn repeatAnalysis_Child
#' Fits a loess regression with the optimal bandwidth parameter determined with AICc.
#'
#' @keywords internal
repeatAnalysis_loess_AICc = function (...) {
  # Fit the initial model specified with ...
  loess.fit.0 = loess(...)

  # Store the number of observations and the value the cell parameter is set to
  N = loess.fit.0$n
  Cell = loess.fit.0$pars$cell

  # Helper function to evaluate the AICc of a loess at a given span parameter
  loess_Optim = function(span) {
    # Note that the cell parameter is adjusted if and only if floor(N * span * Cell) evaluates to be less than or equal to one
    loess.fit.i = update(loess.fit.0, span = span, cell = dplyr::if_else(floor(N * span * Cell) <= 1, 3 / (N * span), Cell))

    sigma2.hat = sum((loess.fit.i$residuals)^2)/loess.fit.i$n
    trace.hat = loess.fit.i$trace.hat

    return (log(sigma2.hat) + 1 + 2 * (trace.hat + 1) / (loess.fit.i$n - trace.hat - 2))
  }

  # Set the updated span and cell parameters
  span. = optimise(loess_Optim, c(0.05, 0.95))$minimum
  cell. = dplyr::if_else(floor(N * span. * Cell) <= 1, 3 / (N * span.), Cell)

  # Return a loess fit with the updated span and cell parameters
  return (loess(..., span = span., cell = cell.))
}

#' @describeIn repeatAnalysis_Child
#' Generates the loess regression functions for a given event trace indexed by sensor and session.
#'
#' @keywords internal
repeatAnalysis_generateAbsoluteErrs = function (whichSensor, x_df, whichSession) {
  x_Errs = x_df %>%
    # Extract the raw pressure measurements for the given sensor and session
    dplyr::filter(trLabel == "event", sessionID == whichSession) %>%
    dplyr::select(zeroPrssr = dplyr::matches(whichSensor), time, JSONLabel, newLabel) %>%
    split(.$newLabel) %>%
    lapply(function (eventRegion_df) {
      # Standardise within the partition
      eventRegionChild_df = eventRegion_df %>%
        dplyr::mutate(rescaleTime = (time - min(time))/(max(time) - min(time)),
                      rescaleZero = (zeroPrssr - min(zeroPrssr))/(max(zeroPrssr) - min(zeroPrssr)))

      # Obtain the smooth approximation of the raw pressure measurements via a loess function
      loess.aicc.fit = FemFit:::repeatAnalysis_loess_AICc(rescaleZero ~ rescaleTime, data = eventRegionChild_df, surface = "direct")

      # Index the loess functions
      dplyr::tibble(loess_fit = list(loess.aicc.fit), sessionID = whichSession, nob = nrow(eventRegion_df), JSONLabel = eventRegionChild_df$JSONLabel[1], eventID = eventRegionChild_df$newLabel[1])
    }) %>%
    dplyr::bind_rows() %>%
    dplyr::mutate(sensor = whichSensor)

  # Return the loess functions
  return (x_Errs)
}