R/process_data.R
In iglu: Interpreting Glucose Data from Continuous Glucose Monitors

Documented in process_data

#' Data Pre-Processor
#'
#' @description A helper function to assist in pre-processing the user-supplied
#' input data for use with other functions.
#' Typically, this function will process the data and return another DataFrame
#' whose first three columns are named "id", "time" and "gl", which is the
#' layout used by the other functions of the \code{iglu} package.
#' Rows containing \code{NA} in any column are removed before processing.
#' See Vignette for further details.
#'
#' @usage process_data(data, id, timestamp, glu, time_parser = as.POSIXct)
#'
#' @param data User-supplied dataset containing continuous glucose monitor data. Must
#' contain data for time and glucose readings at a minimum. Accepted formats are dataframe and tibble.
#' If a plain vector is passed instead, it is returned with its \code{NA}s removed and
#' no further processing is applied.
#'
#' @param id Optional column name (character string) corresponding to subject id column.
#' If no value is passed, an id of "1" will be assigned to every row of the data. In that case
#' the data must not already contain a column named "id" (pass \code{id = "id"} to use it).
#'
#' @param timestamp Required column name (character string) corresponding to time values in data. The dates can be
#' in any format parsable by as.POSIXct, or any format accepted by the parser passed to time_parser. See time_parser param for an explanation
#' on how to handle arbitrary formats.
#'
#' @param glu Required column name (character string) corresponding to glucose values, mg/dL
#'
#' @param time_parser Optional function used to convert datetime strings to time objects. Defaults to as.POSIXct.
#' If your times are in a format not parsable by as.POSIXct, you can parse a custom format by passing
#' function(time_string) \{strptime(time_string, format = <format string>)\} as the time_parser parameter.
#'
#' @details A dataframe whose first three columns are "id", "time", and "gl" will be returned;
#' any other columns of the input follow in their original order. Rows containing \code{NA}
#' in any column of the input are removed before processing. Glucose entries that cannot be
#' converted to numbers become \code{NA} during conversion and are kept.
#'
#' Column names are matched exactly but without regard to letter case. An error is raised
#' if a requested column is missing, if a name matches more than one column, or if
#' \code{id}, \code{timestamp} and \code{glu} do not refer to three different columns.
#'
#' If "mmol/l" appears in the glucose column name (in any letter case), the glucose values
#' will be multiplied by 18 to convert to mg/dL.
#'
#' A warning is issued if the smallest glucose value is below 20 or the largest is above 500.
#'
#' Based on John Schwenck's \code{data_process} for his bp package "https://github.com/johnschwenck/bp".
#'
#' @return A processed DataFrame object that cooperates with every other
#' function within the \code{iglu} package - all column names and formats comply.
#'
#' @export
#'
#' @author David Buchanan, John Schwenck
#'
#' @examples
#' data("example_data_1_subject")
#'
#' # Process example data
#' processed <- process_data(example_data_1_subject, id = "id", timestamp = "time", glu = "gl")
#'
#' processed
#'
#' data("example_data_5_subject")
#'
#' # Process example data
#' processed_5subj <- process_data(example_data_5_subject, id = "id", timestamp = "time", glu = "gl")
#'
#' processed_5subj
#'
process_data <- function(data,
                         id = NULL,
                         timestamp = NULL,
                         glu = NULL,
                         time_parser = as.POSIXct) {
  # Tibbles inherit from data.frame, so a single check covers both accepted
  # tabular formats.
  is_tabular <- is.data.frame(data)
  if (!is_tabular && !is.vector(data)) {
    stop("Invalid data type, please use dataframe, tibble, or if passing just glucose readings, vector.",
         call. = FALSE)
  }

  data <- na.omit(data)
  if (!is_tabular) {
    # Vector input: NA removal is the only processing that applies.
    return(data)
  }

  colnames(data) <- tolower(colnames(data))

  if (is.null(id)) {
    # Refuse to shadow or overwrite an id column the user probably meant to use.
    if ("id" %in% colnames(data)) {
      stop('No id argument was passed, but the dataset contains a column named "id". \nIf this is the subject id column, indicate as such in function argument. \ni.e. id = "id"',
           call. = FALSE)
    }
    message("No 'id' parameter passed, defaulting id to 1")
    # rep() rather than a scalar so that zero-row input is handled as well.
    data[["id"]] <- rep("1", nrow(data))
    id <- "id"
  }

  cols <- colnames(data)

  # Position of the column called `name` (exact, case-insensitive match).
  # Stops when the argument is not a single string, or when the column is
  # missing or ambiguous.
  resolve_column <- function(name, arg, type_error, missing_warning,
                             fallback, fallback_error) {
    if (!is.character(name)) {
      stop(type_error, call. = FALSE)
    }
    if (length(name) != 1L || is.na(name)) {
      stop(sprintf("Argument `%s` must be a single column name.", arg),
           call. = FALSE)
    }
    hits <- which(cols == tolower(name))
    if (length(hits) == 0L) {
      warning(missing_warning, call. = FALSE)
      if (sum(cols == fallback) == 1L) {
        stop(fallback_error, call. = FALSE)
      }
      # Continuing without the column would return a frame that silently
      # breaks the id/time/gl layout, so this is an error.
      stop(sprintf('Column "%s" (argument `%s`) was not found in the dataset.',
                   name, arg),
           call. = FALSE)
    }
    if (length(hits) > 1L) {
      stop(sprintf('Column name "%s" (argument `%s`) matches more than one column once letter case is ignored.',
                   name, arg),
           call. = FALSE)
    }
    hits
  }

  id_idx <- resolve_column(
    id, "id",
    'User-defined id name must be character.\n',
    "Could not find user-defined id argument name in dataset.\nFor example, if the user defines id=\"Subjects\" but no column is named \"Subjects\"\nThen there will be no matches for \"Subjects\"\nCheck spelling of id argument.",
    "id",
    'Fix user-defined argument name for id. \nNote: A column in the dataset DOES match the name "id": \nIf this is the correct column, indicate as such in function argument. \ni.e. id = "id"'
  )
  time_idx <- resolve_column(
    timestamp, "timestamp",
    'User-defined timestamp name must be character.\n',
    "Could not find user-defined timestamp argument name in dataset.\nFor example, if the user defines timestamp=\"time\" but no column is named \"time\"\nThen there will be no matches for \"time\"\nCheck spelling of timestamp argument.",
    "time",
    'Fix user-defined argument name for timestamp. \nNote: A column in the dataset DOES match the name "time": \nIf this is the correct column, indicate as such in function argument. \ni.e. timestamp = "time"'
  )
  gl_idx <- resolve_column(
    glu, "glu",
    'User-defined glucose name must be character.\n',
    "Could not find user-defined glucose argument name in dataset.\nFor example, if the user defines glu=\"glucose\" but no column is named \"glucose\"\nThen there will be no matches for \"glucose\"\nCheck spelling of glu argument.",
    "gl",
    'Fix user-defined argument name for glucose. \nNote: A column in the dataset DOES match the name "gl": \nIf this is the correct column, indicate as such in function argument. \ni.e. glu = "gl"'
  )

  lead <- c(id_idx, time_idx, gl_idx)
  if (anyDuplicated(lead) > 0L) {
    stop("Arguments id, timestamp and glu must refer to three different columns.",
         call. = FALSE)
  }

  # Historical behaviour: ids are coerced to character only when the id column
  # had to be renamed; a column already called "id" keeps its type.
  if (cols[id_idx] != "id") {
    data[[id_idx]] <- as.character(data[[id_idx]])
  }

  # List-style indexing never collapses to a vector, unlike data[, idx].
  data <- data[c(lead, setdiff(seq_along(data), lead))]
  colnames(data)[1:3] <- c("id", "time", "gl")

  data$time <- tryCatch(
    time_parser(data$time),
    error = function(cond) {
      message("Failed to parse times, ensure times are in parsable format and possible.\nSee docs for explanation on how to handle arbitary formats.\nOriginal error message:")
      # Print the text only; passing the condition object to message() would
      # re-signal the parser's error to any enclosing error handler.
      message(conditionMessage(cond))
      stop("Error in time conversion.", call. = FALSE)
    }
  )

  readings <- data$gl
  if (is.factor(readings)) {
    # as.numeric() on a factor yields level codes, not the recorded values.
    readings <- as.character(readings)
  }
  readings <- as.numeric(readings)
  if (grepl("mmol/l", tolower(glu), fixed = TRUE)) {
    # Column name declares mmol/L: convert to mg/dL.
    readings <- 18 * readings
  }
  data$gl <- readings

  observed <- readings[!is.na(readings)]
  if (length(observed) > 0L) {
    if (min(observed) < 20) {
      warning("Minimum glucose reading below 20. Data may not be cleaned.",
              call. = FALSE)
    }
    if (max(observed) > 500) {
      warning("Maximum glucose reading above 500. Data may not be cleaned.",
              call. = FALSE)
    }
  }

  data
}