R/fetch_clean_functions.R

#' Return a list of data frames generated from the servosphere
#'
#' Import all the .csv files in a single directory and convert
#' them to a list of data frames
#'
#' When using the servosphere, each trial produces a .csv file that contains the
#' data from that trial. This data should include a column for time (dT),
#' position (dX, dY), and the stimulus. All the data files the user wishes to
#' analyze should be in the same directory. They should also have a common
#' naming convention to facilitate the identification of these files (i.e.
#' "x_servosphere.csv", where x might be the date the trial was run or the
#' unique trial ID).
#'
#' Other functions in this package require that all the .csv files from the
#' servosphere that the user wishes to analyze are contained in a list, where
#' each item in the list is a data.table created from each .csv file.
#'
#' This function makes use of the data.table package to read in the .csv files,
#' as it is currently the fastest way to bring in such files. This function is
#' noticeably slower when alternative csv reading functions are used and when
#' the number of .csv files is large.
#'
#' @param path A string for the file path.
#' @param pattern A string with a unique pattern to look for in file names.
#' @param full.names Return the full file path when TRUE or the file name when
#'   FALSE.
#' @return A list where each item is a data.table
#' @examples
#' servosphere <- getFiles("./extdata/", pattern = "_servosphere.csv")
#' @export

getFiles <- function(path, pattern, full.names = TRUE) {
   if (!is.character(path) | !is.character(pattern)){
      warning("Both path and pattern arguments must be strings.")
   }
   fullpath <- list.files(path = path, pattern = pattern, full.names = TRUE)
   dat <- purrr::map(fullpath, data.table::fread)
   dat <- purrr::map(dat, as.data.frame)
   dat <- purrr::map(dat, function(.x) {
      .x %>% select(-(contains("encode")))
   })
}

#' Set column names for list of data frames
#'
#' Take a list of data frames and change the column names
#'
#' Once a list of data frames is generated with the getFiles function, use this
#' function to set up the column names for all the data frames in the list.
#'
#' Other functions in this package require that the change in time and position
#' columns are named "dT", "dx", and "dy", respectively.
#'
#' @param list A list of data frame objects.
#' @param colnames A vector of strings holding the names for the columns.
#' @return Returns the list of data frames provided with the column names
#'   modified based on the provided vector \code{colnames}.
#' @examples
#' colnames <- c("stimulus", "dT", "dx", "dy")
#' servosphere <- cleanNames(servosphere, colnames)
#' @export

cleanNames <- function(list, colnames) {
   purrr::map_if(list, is.data.frame, function(.x) {
     .x <- .x %>%
         select(-(contains("enco")))
     .x <- stats::setNames(.x, colnames)
   })
}

#' Aggregate data frames
#'
#' Aggregate the data frames in a list to reduce noise in movement recordings
#'
#' This function requires that the change in time and change in position column
#' names are "dT", "dx", and "dy", respectively.
#'
#' Movement recording data generated by the servosphere software is quite noisy
#' due to extraneous movements by the insect or side-to-side motion. Aggregating
#' the data by aggregating rows reduces this noise. Aggregating also reduces the
#' size of the data files, making future computing tasks faster.
#'
#' To aggregate the data properly, we recommend watching recordings of the
#' target organism moving on the servosphere and determine the length of time it
#' takes the insect to move at least 50 percent of its body length. Use the time
#' it takes the insect to move at least 50 percent of its body length as the
#' minimum aggregation time. Longer periods may be necessary depending on the
#' size and movement of the insect.
#'
#' The resulting data frames will have an additional column named "length". The
#' values in length should be approximately equal to the sum of dT from the n
#' rows aggregated over, i.e. if each observation from the un-aggregated data is 10
#' ms and the user aggregates these observations to 1 second, the value of
#' length should be approximately 1. Note that in aggregating the data frame, the
#' stimulus column will also be aggregated The stimulus status value at the first
#' row to be aggregated will become the value of the stimulus for the aggregated
#' row.
#'
#' @param list A list of data frame objects.
#' @param n The number of consecutive rows to aggregate over.
#' @return A list of aggregated data frames with an additional column,
#'   \code{length}, to check that the function worked.
#' @examples
#' # Aggregates every 5 rows in each data frame. Must be used after
#' # cleanNames()
#'  servosphere <- list(data.frame(id = rep(1, 200),
#'                                 stimulus = rep(c(0, 1), each = 100),
#'                                 dT = sample(8:12, 200, replace = TRUE),
#'                                 dx = runif(200, 0, 5),
#'                                 dy = runif(200, 0, 5),
#'                                 treatment = rep("a", 200),
#'                                 date = rep("2032018", 200)),
#'                      data.frame(id = rep(2, 200),
#'                                 stimulus = rep(c(0, 1), each = 100),
#'                                 dT = sample(8:12, 200, replace = TRUE),
#'                                 dx = runif(200, 0, 5),
#'                                 dy = runif(200, 0, 5),
#'                                 treatment = rep("b", 200),
#'                                 date = rep("2032018", 200)))
#'  servosphere <- aggregateData(servosphere, n = 5)
#' @export
#' @import dplyr
#' @importFrom stats aggregate
#' @importFrom magrittr %>%

aggregateData <- function(list, n){
   # Done to appease the R CMD check gods
   . <- NULL
   dx <- NULL
   dy <- NULL

   purrr::map_if(list, is.data.frame, function(.x) {
      x <- cbind(aggregate(.x[, c("dT", "dx", "dy")],
                           list(rep(
                              1:(nrow(.x) %/% n + 1), each = n, len = nrow(.x)
                           )),
                           sum)[,-1],
                 aggregate(.x[, c("dT")],
                           list(rep(
                              1:(nrow(.x) %/% n + 1), each = n, len = nrow(.x)
                           )),
                           length)[,-1])
      colnames(x)[4] <- "length" # add length column
      x <- .x %>% # Add other identification columns corresponding to aggregated
         select(names(.x)[!(names(.x) %in% names(x))]) %>% # rows
         .[seq(from = 1,
                to = nrow(.x),
                length.out = nrow(x)), ] %>%
         bind_cols(x) %>%
         select(id, dx, dy, everything()) %>%
         select(everything())

      return(x)
   })
}

#' Merge trial id information
#'
#' This function merges trial id information with the servosphere data.
#'
#' Users of the servosphere will need a separate data frame with trial id
#' information in a column titled \code{id}. This should contain a unique
#' identifier and any other relevant experimental information, such as
#' treatments applied, date, time of day, etc. Make sure the rows in your trial
#' id data frame are ordered in the same order as the list of data frames of
#' your servosphere output.
#'
#' Researchers may wish to compare data before and after some stimulus has been
#' applied and this function allows the user to split their data into separate
#' data frames based on different values of the stimulus variable to facilitate
#' these comparison. If the data frames should be split by stimulus, the trial
#' record data frame should contain a column \code{id_stim} that lists the id
#' number of the trial, followed by an underscore, followed by each value of the
#' stimulus variable retained. If the data should be split by stimulus, the rows
#' of the trial id data frame should be ordered first by \code{id} in the same
#' order as their data is stored within the list. Once ordered by \code{id} the
#' trial data data frame should be further ordered within an \code{id} by
#' \code{stimulus} (i.e. id_stim 1_1 should come before id_stim 1_2).
#'
#' Data recorded during a particular stimuli may also be discarded if it is not
#' required for analysis. For example, recordings may begin with a five minute
#' adjustment period and the data associated with that period may not be used
#' for analysis. The stimulus recorded by the software during that five minute
#' adjustment period can be discarded by omitting that stimulus number from the
#' \code{stimulus.keep} argument.
#'
#' This function will also append an item to your list of data frames that
#' contains the relevant column names to be retained in future manipulations of
#' the data.
#'
#' @param list The list of servosphere output data.
#' @param trial.data The data frame containing your trial id information. This
#'   must contain an identifier column titled `id` and if the data are to be
#'   split by stimuli, an additional identifier column `id_stim`. See the
#'   description for more details.
#' @param col.names A string vector containing the names of the columns you want
#'   to transfer to your servosphere output data. The trial.data data frame may
#'   have columns not needed for the analysis, so the function asks the user to
#'   be explicit about which columns to retain.
#' @param stimulus.split A logical value indicating whether the data frames
#'   should be split by stimulus. Defaults to `FALSE`. If `TRUE`, be sure to
#'   include a `id_stim` column to give each trial/stimulus combination a unique
#'   ID.
#' @param stimulus.keep An integer vector containing the stimuli numbers to
#'   retained in the data and split the data frames by. Omitted stimuli values
#'   will be discarded.
#' @return Returns the list of data frames provided which have been merged with
#'   additional relevant trial information.
#' @examples
#'  servosphere <- list(data.frame(id = rep(1, 200),
#'                                 stimulus = rep(c(0, 1), each = 100),
#'                                 dT = sample(8:12, 200, replace = TRUE),
#'                                 dx = runif(200, 0, 5),
#'                                 dy = runif(200, 0, 5),
#'                                 treatment = rep("a", 200),
#'                                 date = rep("2032018", 200)),
#'                      data.frame(id = rep(2, 200),
#'                                 stimulus = rep(c(0, 1), each = 100),
#'                                 dT = sample(8:12, 200, replace = TRUE),
#'                                 dx = runif(200, 0, 5),
#'                                 dy = runif(200, 0, 5),
#'                                 treatment = rep("b", 200),
#'                                 date = rep("2032018", 200)))
#'  trial_record <- data.frame(id = c(1, 2),
#'                             treatment = c("a", "b"),
#'                             date = c("2032018", "2042018"),
#'                             time = c("13:30", "07:30"))
#'  trial_id_split <- data.frame(id = c(1, 2, 1, 2),
#'                               stimulus = c(1, 1, 2, 2),
#'                               treatment = c("a", "b", "a", "b"),
#'                               date = rep(c("2032018", "2042018"), times = 2),
#'                               time = rep(c("13:30", "07:30"), times = 2),
#'                               id_stim = c("1_1", "2_1", "1_2", "2_2"))
#' # Merge the columns id, treatment, and date from the trial_record data frame
#' # with all the data frames in our list servosphere.
#'
#'  merged_servosphere <- mergeTrialInfo(servosphere,
#'    trial_record,
#'    col.names = c("id", "treatment"),
#'    stimulus.keep = c(0, 1))
#'
#' # Repeat of the merger above without retaining the id column while
#' # also splitting the data provided into separate data frames based on the
#' # different stimuli recorded, keeping only data associated with stimuli 1 and
#' # 2. Splitting based on stimulus requires a column in the trial.data data
#' # frame titled id_stim.
#'
#'  merged_servosphere <- mergeTrialInfo(servosphere,
#'      trial_id_split,
#'      col.names = c("id", "treatment"),
#'      stimulus.split = TRUE,
#'      stimulus.keep = c(0, 1))
#' @export
#' @import dplyr purrr
#' @importFrom rlang .data
#' @importFrom magrittr %>%

mergeTrialInfo <- function(list,
                           trial.data,
                           col.names,
                           stimulus.keep,
                           stimulus.split = FALSE) {
   # Done to appease the R CMD check gods
   stimulus <- NULL
   . <- NULL

   if (stimulus.split == TRUE) {
      id <- unique(trial.data$id)
      list.id <- lapply(as.list(1:length(id)),
                                function(x) x[[1]])
      list <- purrr::map2(list, list.id, function(.x, .y) {
         .y <- rep(.y, each = nrow(.x))
         .x <- .x %>% mutate(id = .y)
         return(.x)
      })
      out <- purrr::map_if(list, is.data.frame, function(.x) {
         .x %>%
            filter(stimulus %in% stimulus.keep) %>%
            mutate(id_stim = as.character(paste0(id, "_", stimulus))) %>%
            split(.$stimulus)
      })
      out <- flatten(out)
      list.names <- map_chr(out, function(.x) {
         .x$id_stim[1]
      })
      names(out) <- list.names
      trial.data <- dplyr::select(trial.data, col.names)
      list.trial.data <- lapply(as.list(1:dim(trial.data)[1]),
                                function(x) trial.data[x[1], ])
      list <- purrr::map2(out, list.trial.data, function(.x, .y) {
         .y <- .y[rep(1, each = nrow(.x)), ]
         .x <- bind_cols(.x, .y)
         .x <- .x %>%
            select(-(num_range("id_stim", 1)))
         return(.x)
      })
      list[["col.names"]] <- c("id_stim", col.names)

   } else {
   trial.data <- dplyr::select(trial.data, col.names)
   list.trial.data <- lapply(as.list(1:dim(trial.data)[1]),
                                     function(x) trial.data[x[1], ])
   list <- purrr::map2(list, list.trial.data, function(.x, .y) {
      .y <- .y[rep(1, each = nrow(.x)), ]
      .x <- bind_cols(.x, .y)
      return(.x)
      })
   list <- purrr::map_if(list, is.data.frame, function(.x) {
      .x <- .x %>% filter(stimulus %in% stimulus.keep)
   })
   list[["col.names"]] <- c("id", col.names)
   }

   return(list)
}

Try the servosphereR package in your browser

Any scripts or data that you put into this service are public.

servosphereR documentation built on May 15, 2019, 1:05 a.m.