FemFit: Functions to Analyse FemFit Datasets

Documented in read_FemFit

#' Reading an FemFit dataset
#'
#' @description
#' Reads in a compressed file found at \url{https://pfmeasure.com} and returns it as an "FemFit" object.
#'
#' @param zipPath The filepath of compressed file.
#' @param remove.NAs If set to \code{TRUE}, remove unlabelled observations. Defaults to \code{FALSE}.
#' @param remove.spikes If set to \code{TRUE}, remove observations conditional on \code{spikes.threshold}. Defaults to \code{FALSE}.
#' @param spikes.threshold The numeric threshold to remove pressure fluctuations associated with the change in temperature.
#' @param merge.csvs If set to \code{TRUE}, merges comma-separated files. Defaults to \code{FALSE}.
#'
#' @details
#' \code{zipPath} can take a vector of filepaths and if it is, the remaining arguments are recycled to the length of \code{zipPath}.
#'
#' \code{remove.NAs} removes observations that were not labelled with any protocol information found in the "session" JavaScript Object Notation file.
#'
#' \code{remove.spikes} removes observations where the sum of the lagged temperature differences across the eight sensors is greater than or equal to \code{spikes.threshold}.
#'
#' \code{merge.csvs} merges all of the valid comma separated value file(s). The resulting session identifier is determined by alphabetical order.
#'
#' A valid comma separated file is determined with the device timestamps found in the "session" JavaScript Object Notation file.
#'
#' @return
#' The "FemFit" object contains an element called \code{df} which contains the data found in the compressed file. It also contains one NULL element \code{errorSummary}, which is updated by \code{\link{segment}}.
#'
#' The data contains 19 columns, time, pressure recordings for sensors 1 to 8, temperature recordings for sensors 1 to 8, patient identifier, session identifier, and the assigned protocol labels for the session.
#'
#' @seealso
#' \code{\link{FemFit_removeNAs}} \code{\link{FemFit_removeSpikes}} \code{\link{FemFit_mergeCSVs}}
#'
#' @examples
#' # Basic example
#' session283 = read_FemFit("Datasets_AukRepeat/61aa0782289af385_283_csv.zip", remove.NAs = TRUE)
#'
#' # Specify that the second session should have it's NAs removed
#' AS027 = read_FemFit(c(
#'         "Datasets_AukRepeat/dee8fc3fdcfccb27_744_csv.zip",
#'         "Datasets_AukRepeat/dee8fc3fdcfccb27_746_csv.zip"
#'     ),
#'     remove.NAs = c(
#'         FALSE,
#'         TRUE
#'     ))
#'
#' # Specify that the sessions should remove pressure fluctuations associated with the change in temperature
#' AS011 = read_FemFit(c(
#'         "Datasets_AukRepeat/dee8fc3fdcfccb27_642_csv.zip",
#'         "Datasets_AukRepeat/dee8fc3fdcfccb27_643_csv.zip"
#'     ),
#'     spikes.threshold = 2,
#'     remove.spikes = c(
#'         FALSE,
#'         TRUE
#'     ))
#'
#' # Specify that the CSVs found in the second compressed file should be merged together
#' AS013 = read_FemFit(c(
#'         "Datasets_AukRepeat/dee8fc3fdcfccb27_640_csv.zip",
#'         "Datasets_AukRepeat/dee8fc3fdcfccb27_641_csv.zip"
#'     ),
#'     merge.csvs = c(
#'         FALSE,
#'         TRUE
#'     ))
#'
#' # An example of using more than one argument
#' AS008 = read_FemFit(c(
#'         "Datasets_AukRepeat/61aa0782289af385_417_csv.zip",
#'         "Datasets_AukRepeat/61aa0782289af385_418_csv.zip"
#'     ),
#'     remove.NAs = TRUE,
#'     remove.spikes = c(
#'         TRUE,
#'         FALSE
#'     ))
#'
#' @export
read_FemFit = function(zipPath, remove.NAs = FALSE, remove.spikes = FALSE, spikes.threshold = 2, merge.csvs = FALSE) {
  # Throw an error if the zipPath argument has any NAs
  if (anyNA(zipPath)) {
    stop("The zipPath argument has not been provided.", call. = FALSE)
  }

  # Throw an error if remove.NAs is not a logical or it has any NAs
  if (!is.logical(remove.NAs) || anyNA(remove.NAs)) {
    stop("The provided remove.NAs value is not a logical.", call. = FALSE)
  }

  # Throw an error if remove.spikes is not a logical or it has any NAs
  if (!is.logical(remove.spikes) || anyNA(remove.spikes)) {
    stop("The provided remove.spikes value is not a logical.", call. = FALSE)
  }

  # Throw an error if spikes.threshold is not a numeric or it has any NAs
  if (!is.numeric(spikes.threshold) || anyNA(spikes.threshold)) {
    stop("The provided spikes.threshold value is not a numeric.", call. = FALSE)
  }

  # Throw an error if merge.csv is not a logical or it has any NAs
  if (!is.logical(merge.csvs) || anyNA(merge.csvs)) {
    stop("The provided merge.csvs value is not a logical.", call. = FALSE)
  }

  # Recycle elements of remove.NAs, remove.spikes, spikes.threshold, and merge.csvs if necessary
    # Also, name the elements within each vector with zipPath
  zipPath.len = length(zipPath)
  remove.NAs = rep(remove.NAs, length.out = zipPath.len)
  names(remove.NAs) = zipPath

  remove.spikes = rep(remove.spikes, length.out = zipPath.len)
  names(remove.spikes) = zipPath

  spikes.threshold = rep(spikes.threshold, length.out = zipPath.len)
  names(spikes.threshold) = zipPath

  merge.csvs = rep(merge.csvs, length.out = zipPath.len)
  names(merge.csvs) = zipPath

  # Prepare the processing information for the FemFit files
  toProcess = lapply(zipPath, function (zipPath_Child) {
    # Extract the contents of zip file
    contents = unzip(zipPath_Child, list = TRUE) %>%
      dplyr::filter(Length != 0)

    # Extract the filepaths of the csv files
    csvPaths = contents %>%
      dplyr::filter(grepl(".csv", x = Name)) %>%
      dplyr::pull(Name)

    # Parse csvPaths for the csv filenames
    csvFileNames = gsub(".*/(.*.csv)", "\\1", csvPaths)

    # Extract the JSON as a list of R objects
    jsonInfo = jsonlite::fromJSON(
      unz(zipPath_Child, contents$Name[grep(".*/session.json$", contents$Name)]),
      simplifyDataFrame = TRUE
    )

    # Create the patient ID
    patientID = jsonInfo$patient_name

    # The start time and stop time for each csv file
    trialTimes <- jsonInfo$recordings %>%
      dplyr::mutate(filename = gsub(pattern = ".data", replacement = ".csv", x = filepath)) %>%
      dplyr::select(filename, start, stop) %>%
      dplyr::mutate(exists = sapply(.$filename, function(x) {
        any(grepl(x, contents$Name))
      })) %>%
      dplyr::filter(exists) %>%
      dplyr::select(-exists) %>%
      # Create the session ID
      dplyr::mutate(sessionID = paste(
        jsonInfo$session_id,
        format(as.POSIXct(start/1000, tz = jsonInfo$session_timezone, origin = "1970-01-01"), "%H:%M")
      ))

    # Compile the patientID, sessionID, zipPath_Child, csvPaths, and exerciseInfo into a tibble object
    toProcess_Child = dplyr::tibble(patientID, zipPath = zipPath_Child, csvPath = csvPaths, exerciseInfo = dplyr::if_else(class(jsonInfo$exercises) == "data.frame", list(jsonInfo$exercises), NULL), filename = csvFileNames) %>%
      dplyr::inner_join(y = trialTimes, by = "filename") %>%
      dplyr::select(-filename)

    return (toProcess_Child)
  }) %>% dplyr::bind_rows()

  # Create a data.frame for each toProcess row
  toReturnDf = apply(toProcess, 1, function (toProcess_Child) {
    # Extract the csv file
    toReturnDf_Child = read.csv(unz(toProcess_Child$zipPath, toProcess_Child$csvPath), header = FALSE) %>%
      # Rename the time column
      dplyr::rename(time = V1) %>%
      # Appropriately rename the other columns
      dplyr::rename_at(2:9, funs(paste0("prssr_sensor", 1:8))) %>%
      dplyr::rename_at(10:17, funs(paste0("tmprtr_sensor", 1:8))) %>%
      # Append on additional information
      dplyr::mutate(patientID = toProcess_Child$patientID, sessionID = toProcess_Child$sessionID, zipPath = toProcess_Child$zipPath, JSONLabel = NA)

    # Check if the exercise information is not null
    if (!is.null(toProcess_Child$exerciseInfo)) {
      # Extract and adjust the exercise timestamps associated with the csv file
      exerciseTimes <- toProcess_Child$exerciseInfo %>%
        # Coerce the start and stop variables to numerics and remove rows that have missing values in either column.
        dplyr::mutate(start = as.numeric(start), stop = as.numeric(stop)) %>%
        dplyr::filter(!is.na(start) & !is.na(stop)) %>%
        # Filtering as so allows us to capture exercise input before the device has formerly begun the trial.
        dplyr::filter(stop <= toProcess_Child$stop) %>%
        # Zero out the exercise timestamps with the device recorded trial start time.
        dplyr::mutate(start = (start - toProcess_Child$start), stop = (stop - toProcess_Child$start)) %>%
        # Remove exercises in the data.frame which have negative entries for both start time and stop time.
        dplyr::filter(!(start < 0 & stop < 0)) %>%
        # Create a new column which keeps track of repeated device labelled exercises.
        dplyr::mutate(repeatCount = 0)

      if (nrow(exerciseTimes) != 0) {
        # Identify non-unique exercise labels in 'exerciseTimes'
        duplicateLabels <- data.frame(
            label = unique(exerciseTimes$exercise),
            repeats = vapply(unique(exerciseTimes$exercise),
            function (input) {
              length(grep(input, exerciseTimes$exercise))
            }, numeric(1)),
            row.names = NULL
          ) %>%
          dplyr::filter(repeats > 1)

        # Determine the number of times the non-unique exercise labels occur in 'exerciseTimes'
        exerciseTimes$repeatCount[which(exerciseTimes$exercise %in% duplicateLabels$label)] <- exerciseTimes %>%
          .[which(.$exercise %in% duplicateLabels$label), ] %>%
          dplyr::group_by(exercise) %>%
          dplyr::arrange(start) %>%
          dplyr::mutate(repeatCount = row_number()) %>%
          dplyr::pull(repeatCount)

        # Create unique exercise labels for the non-unique exercise labels in 'exerciseTimes', then coerce the data.frame object to a list object.
        exerciseTimes = exerciseTimes %>%
          dplyr::mutate(exercise = dplyr::if_else(repeatCount > 0, paste(exercise, LETTERS[repeatCount], sep = "_"), exercise)) %>%
          split(., seq(nrow(.))) %>%
          unname(.)

        # Append the exercise labels to the CSV's data.frame object
        toReturnDf_Child = lapply(exerciseTimes, function(rowInput) {
            # For each exercise
            toReturnDf_Child %>%
            # Get the time period that the exercise was recorded for...
            dplyr::filter(time >= rowInput$start & time <= rowInput$stop) %>%
            dplyr::select(time) %>%
            # And append the exercise label to that time period
            dplyr::mutate(JSONLabel_Update = rowInput$exercise)
          }) %>%
          # Bind the partitions into a singular data.frame
          bind_rows() %>%
          # left_join() merges the labelled data.frame to the full data.frame
          dplyr::left_join(toReturnDf_Child, ., by = "time") %>%
          dplyr::mutate(JSONLabel = JSONLabel_Update) %>%
          dplyr::select(-JSONLabel_Update)
      }
    }

    # If remove.NAs is equal to TRUE then remove observations with NAs
    if (remove.NAs[toProcess_Child$zipPath]) {
      toReturnDf_Child = toReturnDf_Child %>%
        dplyr::filter(!is.na(JSONLabel)) %>%
        dplyr::mutate(time = trunc(row_number()*10 - 10))
    } else {
      toReturnDf_Child = toReturnDf_Child %>%
        dplyr::mutate(time = trunc(row_number()*10 - 10))
    }

    # If remove.spikes is set to TRUE then remove observations where the sum of the temperature differences is greater than or equal to spikes.threshold
    if (remove.spikes[toProcess_Child$zipPath]) {
      toReturnDf_Child = toReturnDf_Child %>%
        # Calculate the absolute difference within a sensor's temperature measurements
        dplyr::mutate_at(dplyr::vars(dplyr::starts_with("tmprtr_sensor")), dplyr::funs(
          tmpDiff = dplyr::if_else(is.na(abs(. - lag(.))), 0, abs(. - lag(.)))
        )) %>%
        # Calculate the row sum of the absolute temperature differences
        dplyr::mutate(tmpDiffSum = rowSums(
          dplyr::select(., dplyr::ends_with("tmpDiff"))
        )) %>%
        # Exclude any observations where the row sum of the absolute temperature differences is greater than spikes.threshold
        dplyr::filter(!tmpDiffSum >= spikes.threshold[toProcess_Child$zipPath]) %>%
        # Remove the derived variables from the data.frame object
        dplyr::select(-dplyr::contains("tmpDiff")) %>%
        dplyr::mutate(time = trunc(row_number()*10 - 10))
    }

    return (toReturnDf_Child)
  }) %>% dplyr::bind_rows()

  # If merge.csvs is set to TRUE, merge the csvs within a zipPath and treat them as a singular session.
  if (any(merge.csvs)) {
    toReturnDf = toReturnDf %>%
      # Set up the dplyr commands to manipulate the data.frame object by zipPath
      dplyr::group_by(zipPath) %>%
      # Set sessionID to the first sessionID of the zipPath group
      dplyr::mutate(sessionID = dplyr::if_else(merge.csvs[zipPath], unique(sessionID)[1], sessionID),
                    time = trunc(row_number()*10 - 10)) %>%
      dplyr::ungroup()
  }

  return (structure(list(df = toReturnDf %>% dplyr::select(-zipPath), errorSummary = NULL), class = "FemFit"))
}