R/reading.R

#' Read and save, or load simulation data
#'
#' \code{read_or_load}() is an "all-in-one" function that handles everything
#' related to the reading, saving, and loading of data generated by the CODES
#' adaptability simulation. Reading data (see \code{\link{read_logs}}()) takes a
#' long time. This function integrates a saving and loading component that,
#' after reading, saves a properly formatted .rds file which can be reloaded. On
#' future uses of this function, the .rds file can be loaded instead of
#' re-reading all of the simulation data.
#'
#' @param rds (Optional) Character of path for a .RDS file. Will be used to load
#'   an existing file or, if none exists, to save a file after reading data. If
#'   left blank, interactive guidance will be provided.
#' @param raw_dir (Optional) Character of path to the top-level directory of the
#'   raw data. Only required when data will be read for the first time. If left
#'   blank (and no .RDS file exists), interactive guidance will be provided.
#'
#' @export
read_or_load <- function(rds, raw_dir) {

  # Try to load existing .RDS ---------------

  message("\nChecking for previously saved RDS file...\n")
  Sys.sleep(1.5)

  if (missing(rds)) {
    rds_exists <- menu(c("Yes", "No"), title = "You have not specified a saved RDS. Does one exist?")
    if (rds_exists == 1) {
      invisible(readline(prompt = "When ready, please press [enter] and locate your saved RDS file."))
      rds <- file.choose()
    }
  }

  if (!missing(rds) && file.exists(rds)) {
    message("RDS found. Loading...")
    return (readr::read_rds(rds))
  }

  # Read raw data instead ---------------

  message("\nNo RDS found. Locating raw data instead...\n")
  Sys.sleep(1.5)

  # Read data
  if (missing(raw_dir)) {
    readline(prompt="\nYou haven't specified a raw data directory.\nWhen ready, press [enter] and specify a directory from which to read data.")
    raw_dir <- as.character(tcltk::tkchooseDirectory())
  }

  if (!missing(raw_dir) && dir.exists(raw_dir)) {
    message("Directory found. Data reading about to begin.\nThis will take time. Reading...\n")
    Sys.sleep(1.5)
    dat <- read_all(raw_dir)
  } else {
    stop("`raw_dir` specifies a directory that does not exist.")
  }

  # Save read raw data ---------------

  message("\nReading of raw data completed. Saving data object as RDS for future use...\n")
  Sys.sleep(1.5)

  if (missing(rds)) {
    readline(prompt="\nYou haven't specified a file path for the RDS.\nWhen ready, press [enter] and specify a directory in which to save the data.")
    rds_dir <- as.character(tcltk::tkchooseDirectory())
    rds_name <- paste0(readline("Please enter a name for the RDS file (e.g. saved_data): "), ".rds")
    rds <- file.path(rds_dir, rds_name)
  }

  if (!missing(rds)) {
    dir.create(dirname(rds), recursive = TRUE, showWarnings = FALSE)
    readr::write_rds(dat, rds)
  }

  message("\nRDS saved: ", rds, "\n")

  return (dat)
}

#' Read user's log files
#'
#' These functions help to read a user's session, events, or stream log files.
#' read_logs() is a wrapper function that uses the default values of all other
#' functions to read all log files into a single list. If this fails, the log
#' files can be read separately using the other functions and by adjusting the
#' variables appropriately.
#'
#' All files that can be read are assumed to be tab-separated values without
#' variable headers. Uniquely, they are assumed to:
#'
#' \describe{
#'  \item{session}{Appear in top-level of user's log file directory; contain
#'  variable names and values}
#'  \item{events}{Appear in top-level of user's log file directory; contain a
#'  timestamp and an event name (tab-separated details about the event can
#'  follow in some cases).}
#'  \item{stream}{Appear in streams/ directory of user's log file directory;
#'  contain a timestamp and a value.}
#' }
#'
#' Most functions are helper functions to read in a single file.
#' read_all_streams, however, reads all the stream files in a directory and
#' merges them into a single tibble. Also, read_logs will make use of all the
#' functions to read all log files into a list.
#'
#' @param user_dir Character string defining the user's log-file directory
#' @param file_name Character string of the file name. Must include extension such as .tsv
#' @param stream_dir Character string defining the directory in which stream log
#'   files exist.
#' @param col_names Vector of column names to be used.
#' @param is_numeric Indicate whether a stream variable is numeric and,
#'   therefore, should be convereted to numeric. Can be a boolean value
#'   (TRUE/FALSE) or a character vector of variable names to convert if present.
#' @param is_vec3 Indicate whether a stream variable is a Vector3 value in Unity
#'   and, therefore, should be convereted to a vector of 3 values. Can be a
#'   boolean value (TRUE/FALSE) or a character vector of variable names to
#'   convert if present.
#' @inheritParams base::list.files
#' @return read_logs with return a list with a "user" S3 class, with three
#'   tibbles (session, events and streams). All others will return a single
#'   \code{\link[tibble]{tibble}}.
#' @name read_logs
#' @export
read_logs <- function(user_dir) {
  session <- read_session(user_dir = user_dir)
  events  <- read_events(user_dir = user_dir)
  streams <- read_all_streams(user_dir = user_dir)

  l <- list(session = session, events = events, streams = streams)
  class(l) <- c("user", class(l))
  l
}

#' @rdname read_logs
#' @inheritParams lubridate::dmy_hms
#' @export
read_session <- function(user_dir,
                         file_name = "session.tsv",
                         col_names = c("var", "info"),
                         tz        = "Australia/Sydney") {

  # Append "/" to directory if required
  user_dir <- end_with_slash(user_dir)

  # Check file exists
  if(!check_file_exists(user_dir, file_name)) {
    return (NA)
  }

  # Read file
  session <- readr::read_tsv(stringr::str_c(user_dir, file_name), col_names = col_names)

  # Convert to wide format so each variable is its own colum
  session <- session %>% tidyr::spread(var, info)

  # Convert date variable to date (if it exists)
  if ("local_start_time" %in% names(session))
    session <- dplyr::mutate_at(session, "local_start_time", lubridate::mdy_hms, tz = tz)

  session
}

#' @rdname read_logs
#' @export
read_events <- function(user_dir,
                        file_name = "events.tsv",
                        col_names = c("time", "event", "detail")) {

  # Append "/" to directory if required
  user_dir <- end_with_slash(user_dir)

  # Check file exists
  if(!check_file_exists(user_dir, file_name)) {
    return (NA)
  }

  events <- readr::read_lines(stringr::str_c(user_dir, file_name)) %>%
    stringr::str_split_fixed("\t", 3) %>%
    tibble::as_tibble()
  names(events) <- c("time", "event", "detail")  # Give variables names
  events <- events %>% dplyr::mutate(detail = stringr::str_split(detail, "\t"),
                                     time   = as.numeric(time))  # Split details into a list (where appropriate)
  events
}

#' @rdname read_logs
#' @export
read_stream <- function(user_dir,
                        file_name,
                        stream_dir = "streams/",
                        is_numeric = TRUE,
                        is_vec3    = FALSE) {

  # Append "/" to directorys if required
  user_dir <- end_with_slash(user_dir)
  stream_dir <- end_with_slash(stream_dir)

  # Check file exists
  if(!check_file_exists(user_dir, stream_dir, file_name)) {
    return (NA)
  }

  # Get variable name
  stream_var <- stringr::str_replace(file_name, "\\..*$", "")

  stream <- stringr::str_c(user_dir, stream_dir, file_name) %>%
              readr::read_tsv(col_names = c("time", stream_var), col_types = "dc")  # Note need to import all values as characters

  # Handle vec3 OR numeric, but not both (if boolean values used)
  if(is.logical(is_vec3) && is_vec3) {
    is_numeric <- FALSE
    warning("is_numeric and is_vec3 both set to TRUE; is_vec3 will be used.")
  }

  # Handle numeric variables
  if (is.character(is_numeric)) {
    is_numeric <- stream_var %in% is_numeric
  }

  if (is.logical(is_numeric)) {
    if (is_numeric) {
      stream[[stream_var]] <- as.numeric(stream[[stream_var]])
    }
  } else {
    stop("is_numeric is not a valid boolean or character string format.")
  }

  # Handle vector 3 variables
  if (is.character(is_vec3)) {
    is_vec3 <- stream_var %in% is_vec3
  }

  if (is.logical(is_vec3)) {
    if (is_vec3) {
      stream[[stream_var]] <- vec3_to_vec(stream[[stream_var]])
    }
  } else {
    stop("is_vec3 is not a valid boolean or character string format.")
  }

  stream
}

#' @rdname read_logs
#' @export
read_all_streams <- function(user_dir,
                             pattern = "tsv$",
                             stream_dir = "streams/",
                             is_numeric = c("input_brake", "input_horizontal", "input_vertical"),
                             is_vec3    = c("position", "rotation", "velocity")) {

  # Append "/" to directorys if required
  user_dir <- end_with_slash(user_dir)
  stream_dir <- end_with_slash(stream_dir)

  # Check streams directory exists
  if(!check_file_exists(user_dir, stream_dir)) {
    return (NA)
  }

  # Find all stream files
  stream_files <- list.files(stringr::str_c(user_dir, stream_dir), pattern = "tsv$")

  # Read in all stream files and merge into single tibble
  streams <- purrr::map(stream_files,
                        ~ read_stream(user_dir = user_dir,
                                      file_name = .,
                                      stream_dir = stream_dir,
                                      is_numeric = is_numeric,
                                      # Do vec_3 conversions later so nest_duplicated will work
                                      is_vec3    = FALSE)) %>%
              # Merge all streams into a single data frame
              purrr::map(~ tibble::rownames_to_column(., var = "i")) %>%
              purrr::reduce(dplyr::full_join, by = "i") %>%
              # Compute time as the mean from each stream
              nest_duplicated() %>%
              dplyr::mutate_if(is.list, dplyr::funs(map_dbl(., na_mean))) %>%
              # Select/order/arrange columns
              dplyr::select(time, dplyr::everything(), -i) %>%
              dplyr::arrange(time)

  # Convert Vector 3 variables to lists
  is_vec3 <- names(streams)[names(streams) %in% is_vec3]  # include variables only if they exist
  streams <- streams %>% dplyr::mutate_at(is_vec3, vec3_to_vec)

  streams
}


#' Read all users logs from a data directory
#'
#' @param data_dir Directory containing all group data  directories. Must be
#'   organised in particular way.
#' @param clean Should user list be cleaned as well?
#' @param calculate Should derived variables be calculated?
#'
#' @return List of user log-file data lists
#'
#' @export
read_all  <- function(data_dir, clean = TRUE, calculate = TRUE) {
  # Group directories
  group_dirs <- file.path(data_dir, list.files(data_dir), "logs")

  # User directories
  user_dirs <- purrr::map(group_dirs, ~ file.path(., list.files(.))) %>% unlist()

  # User logs
  users <- purrr::map(user_dirs, read_logs)

  # Get user ids
  user_ids <- purrr::map_chr(users, ~ .$session$user_id)

  # Find any duplicated ids
  dup_users <- names(table(user_ids))[table(user_ids) > 1]

  # Remove any users with duplicate ids *FOR NOW
  if (length(dup_users) > 0) {
    warning("Duplicate user ids found. Removing data for the following:", dup_users)
    users <- users[user_ids != dup_users]
    user_ids <- user_ids[user_ids != dup_users]
  }

  # Name user list
  names(users) <- user_ids

  # Add role and group information ------------------------------------------

  # Create a tibble of user info
  user_info <- tibble::tibble(id = user_ids)

  # Split id into relevant info
  user_info <- tidyr::separate(user_info, id, into = c("session_time", "group", "seat"), remove = FALSE)

  # Convert session_time into a date object
  user_info <- user_info %>% dplyr::mutate(session_time = lubridate::ymd_h(session_time, tz = "Australia/Sydney"))

  # Compute role by searching for "2" in the seat
  # becase driver were seated in g2 and d2, drone operators in g1 and d1
  user_info <- user_info %>% dplyr::mutate(role = ifelse(grepl("2", seat), "driver", "drone"))

  # Check that the number of drivers/drones per group makes sense
  ns <- user_info %>%
    dplyr::group_by(session_time, group) %>%
    dplyr::summarise(driver = sum(role == "driver"),
                     drone = sum(role == "drone"))

  if (all(ns$driver == 1)) {
    cat("Check for one driver in each group...\t\tOK\n")
  } else {
    cat("Check for one driver in each group...\t\tERROR\n")
  }

  if (all(ns$drone == 0 | ns$drone == 1)) {
    cat("Check for zero or one drone operators in each group...\t\tOK\n")
  } else {
    cat("Check for zero or one drone operators in each group...\t\tERROR\n")
  }

  # Determine teammates and other session information
  user_teams <- user_info %>%
    dplyr::group_by(session_time, group) %>%
    dplyr::summarise(driver = id[role == "driver"],
                     drone  = ifelse("drone" %in% role, id[role == "drone"], "NA")) %>%
    dplyr::mutate(drone = dplyr::na_if(drone, "NA")) %>%
    tidyr::unite(both, driver, drone, sep = " ", remove = F) %>%
    tidyr::gather(role, id, driver, drone) %>%
    tidyr::drop_na() %>%
    tidyr::separate(both, into = c("team_driver", "team_drone"), sep = " ") %>%
    dplyr::mutate(teammate_id = ifelse(role == "driver", team_drone, team_driver)) %>%
    dplyr::select(-team_driver, -team_drone) %>%
    dplyr::mutate(teammate_id = dplyr::na_if(teammate_id, "NA")) %>%
    dplyr::ungroup()

  # Append information to user and add role class. Also  convert stream data to
  # numeric for drones (as they come out as lists)
  for(i in user_ids) {
    users[[i]]$session <- user_teams %>%
      dplyr::filter(id == i) %>%
      dplyr::bind_cols(users[[i]]$session)

    class(users[[i]]) <- c(users[[i]]$session$role, class(users[[i]]))

    if (is(users[[i]], "drone")) {
      users[[i]]$streams <- users[[i]]$streams %>% purrr::map_df(as.numeric)
    }
  }

  # Add user_list class to final object
  class(users) <- c("user_list", class(users))

  # If asked, clean logs in user list
  if (clean) {
    users <- clean_laps(users)
    users <- clean_events(users)
  }

  # If asked, calculate new variables
  if (calculate) {
    users <- calc_variables(users)
  }

  users
}
drsimonj/adapter documentation built on May 15, 2019, 2:51 p.m.