#' Reading an FemFit dataset
#'
#' @description
#' Reads in a compressed file found at \url{https://pfmeasure.com} and returns it as an "FemFit" object.
#'
#' @param zipPath The filepath of compressed file.
#' @param remove.NAs If set to \code{TRUE}, remove unlabelled observations. Defaults to \code{FALSE}.
#' @param remove.spikes If set to \code{TRUE}, remove observations conditional on \code{spikes.threshold}. Defaults to \code{FALSE}.
#' @param spikes.threshold The numeric threshold to remove pressure fluctuations associated with the change in temperature.
#' @param merge.csvs If set to \code{TRUE}, merges comma-separated files. Defaults to \code{FALSE}.
#'
#' @details
#' \code{zipPath} can take a vector of filepaths and if it is, the remaining arguments are recycled to the length of \code{zipPath}.
#'
#' \code{remove.NAs} removes observations that were not labelled with any protocol information found in the "session" JavaScript Object Notation file.
#'
#' \code{remove.spikes} removes observations where the sum of the lagged temperature differences across the eight sensors is greater than or equal to \code{spikes.threshold}.
#'
#' \code{merge.csvs} merges all of the valid comma separated value file(s). The resulting session identifier is determined by alphabetical order.
#'
#' A valid comma separated file is determined with the device timestamps found in the "session" JavaScript Object Notation file.
#'
#' @return
#' The "FemFit" object contains an element called \code{df} which contains the data found in the compressed file. It also contains one NULL element \code{errorSummary}, which is updated by \code{\link{segment}}.
#'
#' The data contains 19 columns, time, pressure recordings for sensors 1 to 8, temperature recordings for sensors 1 to 8, patient identifier, session identifier, and the assigned protocol labels for the session.
#'
#' @seealso
#' \code{\link{FemFit_removeNAs}} \code{\link{FemFit_removeSpikes}} \code{\link{FemFit_mergeCSVs}}
#'
#' @examples
#' # Basic example
#' session283 = read_FemFit("Datasets_AukRepeat/61aa0782289af385_283_csv.zip", remove.NAs = TRUE)
#'
#' # Specify that the second session should have it's NAs removed
#' AS027 = read_FemFit(c(
#' "Datasets_AukRepeat/dee8fc3fdcfccb27_744_csv.zip",
#' "Datasets_AukRepeat/dee8fc3fdcfccb27_746_csv.zip"
#' ),
#' remove.NAs = c(
#' FALSE,
#' TRUE
#' ))
#'
#' # Specify that the sessions should remove pressure fluctuations associated with the change in temperature
#' AS011 = read_FemFit(c(
#' "Datasets_AukRepeat/dee8fc3fdcfccb27_642_csv.zip",
#' "Datasets_AukRepeat/dee8fc3fdcfccb27_643_csv.zip"
#' ),
#' spikes.threshold = 2,
#' remove.spikes = c(
#' FALSE,
#' TRUE
#' ))
#'
#' # Specify that the CSVs found in the second compressed file should be merged together
#' AS013 = read_FemFit(c(
#' "Datasets_AukRepeat/dee8fc3fdcfccb27_640_csv.zip",
#' "Datasets_AukRepeat/dee8fc3fdcfccb27_641_csv.zip"
#' ),
#' merge.csvs = c(
#' FALSE,
#' TRUE
#' ))
#'
#' # An example of using more than one argument
#' AS008 = read_FemFit(c(
#' "Datasets_AukRepeat/61aa0782289af385_417_csv.zip",
#' "Datasets_AukRepeat/61aa0782289af385_418_csv.zip"
#' ),
#' remove.NAs = TRUE,
#' remove.spikes = c(
#' TRUE,
#' FALSE
#' ))
#'
#' @export
read_FemFit = function(zipPath, remove.NAs = FALSE, remove.spikes = FALSE, spikes.threshold = 2, merge.csvs = FALSE) {
# Throw an error if the zipPath argument has any NAs
if (anyNA(zipPath)) {
stop("The zipPath argument has not been provided.", call. = FALSE)
}
# Throw an error if remove.NAs is not a logical or it has any NAs
if (!is.logical(remove.NAs) || anyNA(remove.NAs)) {
stop("The provided remove.NAs value is not a logical.", call. = FALSE)
}
# Throw an error if remove.spikes is not a logical or it has any NAs
if (!is.logical(remove.spikes) || anyNA(remove.spikes)) {
stop("The provided remove.spikes value is not a logical.", call. = FALSE)
}
# Throw an error if spikes.threshold is not a numeric or it has any NAs
if (!is.numeric(spikes.threshold) || anyNA(spikes.threshold)) {
stop("The provided spikes.threshold value is not a numeric.", call. = FALSE)
}
# Throw an error if merge.csv is not a logical or it has any NAs
if (!is.logical(merge.csvs) || anyNA(merge.csvs)) {
stop("The provided merge.csvs value is not a logical.", call. = FALSE)
}
# Recycle elements of remove.NAs, remove.spikes, spikes.threshold, and merge.csvs if necessary
# Also, name the elements within each vector with zipPath
zipPath.len = length(zipPath)
remove.NAs = rep(remove.NAs, length.out = zipPath.len)
names(remove.NAs) = zipPath
remove.spikes = rep(remove.spikes, length.out = zipPath.len)
names(remove.spikes) = zipPath
spikes.threshold = rep(spikes.threshold, length.out = zipPath.len)
names(spikes.threshold) = zipPath
merge.csvs = rep(merge.csvs, length.out = zipPath.len)
names(merge.csvs) = zipPath
# Prepare the processing information for the FemFit files
toProcess = lapply(zipPath, function (zipPath_Child) {
# Extract the contents of zip file
contents = unzip(zipPath_Child, list = TRUE) %>%
dplyr::filter(Length != 0)
# Extract the filepaths of the csv files
csvPaths = contents %>%
dplyr::filter(grepl(".csv", x = Name)) %>%
dplyr::pull(Name)
# Parse csvPaths for the csv filenames
csvFileNames = gsub(".*/(.*.csv)", "\\1", csvPaths)
# Extract the JSON as a list of R objects
jsonInfo = jsonlite::fromJSON(
unz(zipPath_Child, contents$Name[grep(".*/session.json$", contents$Name)]),
simplifyDataFrame = TRUE
)
# Create the patient ID
patientID = jsonInfo$patient_name
# The start time and stop time for each csv file
trialTimes <- jsonInfo$recordings %>%
dplyr::mutate(filename = gsub(pattern = ".data", replacement = ".csv", x = filepath)) %>%
dplyr::select(filename, start, stop) %>%
dplyr::mutate(exists = sapply(.$filename, function(x) {
any(grepl(x, contents$Name))
})) %>%
dplyr::filter(exists) %>%
dplyr::select(-exists) %>%
# Create the session ID
dplyr::mutate(sessionID = paste(
jsonInfo$session_id,
format(as.POSIXct(start/1000, tz = jsonInfo$session_timezone, origin = "1970-01-01"), "%H:%M")
))
# Compile the patientID, sessionID, zipPath_Child, csvPaths, and exerciseInfo into a tibble object
toProcess_Child = dplyr::tibble(patientID, zipPath = zipPath_Child, csvPath = csvPaths, exerciseInfo = dplyr::if_else(class(jsonInfo$exercises) == "data.frame", list(jsonInfo$exercises), NULL), filename = csvFileNames) %>%
dplyr::inner_join(y = trialTimes, by = "filename") %>%
dplyr::select(-filename)
return (toProcess_Child)
}) %>% dplyr::bind_rows()
# Create a data.frame for each toProcess row
toReturnDf = apply(toProcess, 1, function (toProcess_Child) {
# Extract the csv file
toReturnDf_Child = read.csv(unz(toProcess_Child$zipPath, toProcess_Child$csvPath), header = FALSE) %>%
# Rename the time column
dplyr::rename(time = V1) %>%
# Appropriately rename the other columns
dplyr::rename_at(2:9, funs(paste0("prssr_sensor", 1:8))) %>%
dplyr::rename_at(10:17, funs(paste0("tmprtr_sensor", 1:8))) %>%
# Append on additional information
dplyr::mutate(patientID = toProcess_Child$patientID, sessionID = toProcess_Child$sessionID, zipPath = toProcess_Child$zipPath, JSONLabel = NA)
# Check if the exercise information is not null
if (!is.null(toProcess_Child$exerciseInfo)) {
# Extract and adjust the exercise timestamps associated with the csv file
exerciseTimes <- toProcess_Child$exerciseInfo %>%
# Coerce the start and stop variables to numerics and remove rows that have missing values in either column.
dplyr::mutate(start = as.numeric(start), stop = as.numeric(stop)) %>%
dplyr::filter(!is.na(start) & !is.na(stop)) %>%
# Filtering as so allows us to capture exercise input before the device has formerly begun the trial.
dplyr::filter(stop <= toProcess_Child$stop) %>%
# Zero out the exercise timestamps with the device recorded trial start time.
dplyr::mutate(start = (start - toProcess_Child$start), stop = (stop - toProcess_Child$start)) %>%
# Remove exercises in the data.frame which have negative entries for both start time and stop time.
dplyr::filter(!(start < 0 & stop < 0)) %>%
# Create a new column which keeps track of repeated device labelled exercises.
dplyr::mutate(repeatCount = 0)
if (nrow(exerciseTimes) != 0) {
# Identify non-unique exercise labels in 'exerciseTimes'
duplicateLabels <- data.frame(
label = unique(exerciseTimes$exercise),
repeats = vapply(unique(exerciseTimes$exercise),
function (input) {
length(grep(input, exerciseTimes$exercise))
}, numeric(1)),
row.names = NULL
) %>%
dplyr::filter(repeats > 1)
# Determine the number of times the non-unique exercise labels occur in 'exerciseTimes'
exerciseTimes$repeatCount[which(exerciseTimes$exercise %in% duplicateLabels$label)] <- exerciseTimes %>%
.[which(.$exercise %in% duplicateLabels$label), ] %>%
dplyr::group_by(exercise) %>%
dplyr::arrange(start) %>%
dplyr::mutate(repeatCount = row_number()) %>%
dplyr::pull(repeatCount)
# Create unique exercise labels for the non-unique exercise labels in 'exerciseTimes', then coerce the data.frame object to a list object.
exerciseTimes = exerciseTimes %>%
dplyr::mutate(exercise = dplyr::if_else(repeatCount > 0, paste(exercise, LETTERS[repeatCount], sep = "_"), exercise)) %>%
split(., seq(nrow(.))) %>%
unname(.)
# Append the exercise labels to the CSV's data.frame object
toReturnDf_Child = lapply(exerciseTimes, function(rowInput) {
# For each exercise
toReturnDf_Child %>%
# Get the time period that the exercise was recorded for...
dplyr::filter(time >= rowInput$start & time <= rowInput$stop) %>%
dplyr::select(time) %>%
# And append the exercise label to that time period
dplyr::mutate(JSONLabel_Update = rowInput$exercise)
}) %>%
# Bind the partitions into a singular data.frame
bind_rows() %>%
# left_join() merges the labelled data.frame to the full data.frame
dplyr::left_join(toReturnDf_Child, ., by = "time") %>%
dplyr::mutate(JSONLabel = JSONLabel_Update) %>%
dplyr::select(-JSONLabel_Update)
}
}
# If remove.NAs is equal to TRUE then remove observations with NAs
if (remove.NAs[toProcess_Child$zipPath]) {
toReturnDf_Child = toReturnDf_Child %>%
dplyr::filter(!is.na(JSONLabel)) %>%
dplyr::mutate(time = trunc(row_number()*10 - 10))
} else {
toReturnDf_Child = toReturnDf_Child %>%
dplyr::mutate(time = trunc(row_number()*10 - 10))
}
# If remove.spikes is set to TRUE then remove observations where the sum of the temperature differences is greater than or equal to spikes.threshold
if (remove.spikes[toProcess_Child$zipPath]) {
toReturnDf_Child = toReturnDf_Child %>%
# Calculate the absolute difference within a sensor's temperature measurements
dplyr::mutate_at(dplyr::vars(dplyr::starts_with("tmprtr_sensor")), dplyr::funs(
tmpDiff = dplyr::if_else(is.na(abs(. - lag(.))), 0, abs(. - lag(.)))
)) %>%
# Calculate the row sum of the absolute temperature differences
dplyr::mutate(tmpDiffSum = rowSums(
dplyr::select(., dplyr::ends_with("tmpDiff"))
)) %>%
# Exclude any observations where the row sum of the absolute temperature differences is greater than spikes.threshold
dplyr::filter(!tmpDiffSum >= spikes.threshold[toProcess_Child$zipPath]) %>%
# Remove the derived variables from the data.frame object
dplyr::select(-dplyr::contains("tmpDiff")) %>%
dplyr::mutate(time = trunc(row_number()*10 - 10))
}
return (toReturnDf_Child)
}) %>% dplyr::bind_rows()
# If merge.csvs is set to TRUE, merge the csvs within a zipPath and treat them as a singular session.
if (any(merge.csvs)) {
toReturnDf = toReturnDf %>%
# Set up the dplyr commands to manipulate the data.frame object by zipPath
dplyr::group_by(zipPath) %>%
# Set sessionID to the first sessionID of the zipPath group
dplyr::mutate(sessionID = dplyr::if_else(merge.csvs[zipPath], unique(sessionID)[1], sessionID),
time = trunc(row_number()*10 - 10)) %>%
dplyr::ungroup()
}
return (structure(list(df = toReturnDf %>% dplyr::select(-zipPath), errorSummary = NULL), class = "FemFit"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.