R/data_storage.R

Defines functions: get_data_from_meta, save_meta, get_meta, clean_store_batch, clean_batch_df, store_batch_list

# suppress R CMD check notes about undefined global objects (globalVariables() was added in R 2.15.1)
if(getRversion() >= '2.15.1') {
  utils::globalVariables(c('data_dir', 'read.csv'))
}

#'
#'Generates basic metadata about a directory of animal data files and stores the files as a list of data frames along with the metadata
#'
#'@param data_dir uploaded .zip of animal data files, as a list with name and datapath elements (e.g. a Shiny fileInput value)
#'@param max_rate maximum rate of travel (meters/minute) between consecutive points
#'@param max_course maximum change in course (degrees) between consecutive points
#'@param max_dist maximum geographic distance (meters) between consecutive points
#'@param max_clean_time maximum time (minutes) between consecutive points
#'@return a list of animal data frames with information about the data
#'@noRd
#'
store_batch_list <- function(data_dir, max_rate = 84, max_dist = 840, max_course = 100, max_clean_time = 3600) {
  # unpack the .zip file to a temporary folder
  dir_name <- gsub(".zip", "", data_dir$name)
  
  # clear any leftover temp folder from a previous run
  unlink(file.path("temp"), recursive = TRUE)
  
  # extract the archive, then collect the .csv/.txt files it contains
  utils::unzip(data_dir$datapath, exdir = "temp")
  data_files <- list.files("temp", pattern = "\\.(csv|txt|TXT)$", recursive = TRUE, full.names = TRUE)

  rds_name <- paste0(dir_name, ".rds")
  
  data_sets <- lapply(data_files, read_gps)
    
  # strip the "temp/" prefix from the extracted file paths
  file_names <- gsub("(temp)(\\/)", "", data_files)
  
  # parse GPS unit and animal ID from file names of the form <site>_20YY_<gps unit>_<animal id>.<ext>
  gps_units <- gsub("(.*)(20)([0-9]{2}\\_)(.*)(\\_{1}.*)(\\.(csv|txt|TXT))","\\4", file_names)
  ani_ids <- gsub("(.*)(20)([0-9]{2}\\_)(.*\\_)(.*)(\\.(csv|txt|TXT))","\\5", file_names)
  
  site_names <- c()
  # function to compute max/min lat/long from a dirty dataset
  maxminlatlong <- function(data, dtype){
    if(dtype == "igotu") {
      # discard any rows with text in the first column (duplicate header rows)
      suppressWarnings( df <- data[!is.na(as.numeric(data$Index)), ] )
    }
    else {
      df <- data
    }
    df <- utils::type.convert(df) %>% 
      dplyr::select(Latitude, Longitude) %>%
      dplyr::filter(!is.na(Latitude), Latitude != 0, !is.na(Longitude), Longitude != 0)
    
    # returns c(max lat, min lat, max long, min long)
    return(c(max(df$Latitude), 
             min(df$Latitude),
             max(df$Longitude),
             min(df$Longitude)))
  }

  # function to update a global max/min lat/long with a new dataset
  update_maxminlatlong <- function(mmll, newdata, newdtype){
    new_mmll <- maxminlatlong(newdata, newdtype)
    c(max(mmll[1], new_mmll[1]),
      min(mmll[2], new_mmll[2]),
      max(mmll[3], new_mmll[3]),
      min(mmll[4], new_mmll[4]))
    
  }
  maxminsll <- maxminlatlong(data_sets[[1]]$df, data_sets[[1]]$dtype)
  
  meta_df <- data.frame()

  for(i in 1:length(file_names)) {
    # fall back to "Unknown_<file name>" when the site, animal ID, or GPS unit cannot be parsed from the file name
    site_names[i] <- ifelse(grepl("\\_", file_names[i]), 
                            tolower(sub("\\_.*", "", file_names[i])), 
                            paste0("Unknown_", gsub("(.*).(csv|txt|TXT)", "\\1", file_names[i])))
    ani_ids[i] <- ifelse(ani_ids[i] == file_names[i], 
                         paste0("Unknown_", gsub("(.*).(csv|txt|TXT)", "\\1", file_names[i])), 
                         ani_ids[i])
    gps_units[i] <- ifelse(gps_units[i] == file_names[i], 
                           paste0("Unknown_", gsub("(.*).(csv|txt|TXT)", "\\1", file_names[i])), 
                           gps_units[i])
    df <- data_sets[[i]]$df
    dtype <- data_sets[[i]]$dtype
    if(i > 1 ){
      maxminsll <- update_maxminlatlong(maxminsll, df, dtype)
    }
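    # store the unfiltered data for each file; metadata is computed from a filtered copy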
    df <- clean_location_data(df, dtype, filters = FALSE, aniid = ani_ids[i], gpsid = gps_units[i],
                              maxrate = max_rate, maxdist = max_dist, maxcourse = max_course,
                              maxtime = max_clean_time)
    df_clean <- clean_location_data(df, dtype, filters = TRUE, aniid = ani_ids[i], gpsid = gps_units[i],
                                    maxrate = max_rate, maxdist = max_dist, maxcourse = max_course,
                                    maxtime = max_clean_time)
    current_meta <- get_meta(df_clean, i, dtype, file_names[i], site_names[i], ani_ids[i], "temp.rds")
    meta_df <- save_meta(meta_df, current_meta) 
    data_sets[[i]] <- df
  }
  
  ani_ids <- make.unique(ani_ids, sep="_")
  
  unlink("temp", recursive = TRUE)
  
  saveRDS(data_sets, "temp.rds")
  
  return(list(data = data_sets, file = file_names, 
              ani = ani_ids, gps = gps_units, 
              site = site_names, rds_name = rds_name,
              min_lat = maxminsll[2], max_lat = maxminsll[1],
              min_long = maxminsll[4], max_long = maxminsll[3],
              meta = meta_df))
}
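
# Example (sketch): `data_dir` is expected to look like the value of a Shiny fileInput
# upload, i.e. a list with `name` and `datapath` elements pointing at a .zip of GPS files.
# The file name and path below are hypothetical and for illustration only.
#
#   batch <- store_batch_list(list(name = "demo.zip", datapath = "/tmp/demo.zip"))
#   batch$meta      # one row of metadata per data file
#   batch$rds_name  # name of the .rds file the data sets are saved under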

#'
#'Cleans a directory of animal data files
#'
#'@param data_info list of animal data frames with information about the data, generated by store_batch_list
#'@param filters filter bad data points, defaults to TRUE
#'@param tz_in input time zone, defaults to UTC
#'@param tz_out output time zone, defaults to UTC
#'@param max_rate maximum rate of travel (meters/minute) between consecutive points
#'@param max_course maximum change in course (degrees) between consecutive points
#'@param max_dist maximum geographic distance (meters) between consecutive points
#'@param max_clean_time maximum time (minutes) between consecutive points
#'@return clean df with all animal data files from the directory
#'@noRd
#'
clean_batch_df <- function(data_info, filters = TRUE, tz_in = "UTC", tz_out = "UTC", max_rate, max_course,
                            max_dist, max_clean_time) {
  data_sets <- list()
  withProgress(message = paste0("Preparing raw data", ifelse(filters, " (filtered)", " (unfiltered)")), detail = paste0("0/",length(data_info$data), " files prepped"), value = 0, {
    
  for(i in 1:length(data_info$data)) {
    df <- data_info$data[[i]]
    dtype <- data_info$meta$dtype[i]
  
    aniid <- data_info$ani[i]
    gpsid <- data_info$gps[i]
    
    # clean df
    if(filters) {
      df_out <- clean_location_data(df, dtype, filters,
                                    aniid = aniid, 
                                    gpsid = gpsid, 
                                    maxrate = max_rate, maxcourse = max_course, 
                                    maxdist = max_dist, maxtime = max_clean_time, tz_in = tz_in, tz_out = tz_out)
    }
    else {
      df_out <- df
    }
    # add cleaned df to the list of data
    data_sets[[paste0("ani",aniid)]] <- df_out
    incProgress(1/(length(data_info$data)), detail = paste0(i,"/",length(data_info$data), " files prepped"))
  } #cleaning for loop
  }) #progress bar
  
  return(do.call(rbind, data_sets))
}
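
# Example (sketch, hypothetical values): combine a `store_batch_list()` result into a single
# cleaned data frame, reusing the thresholds from the batch step. Note that this function is
# meant to run inside the Shiny app, since it reports progress via withProgress().
#
#   combined <- clean_batch_df(batch, filters = TRUE, max_rate = 84, max_course = 100,
#                              max_dist = 840, max_clean_time = 3600)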

#'
#'Cleans a directory of animal data files and stores them locally in rds format
#'
#'@param data_info list of animal data frames with information about the data, generated by store_batch_list
#'@param filters filter bad data points, defaults to TRUE
#'@param zoom level of zoom for the elevation lookup, defaults to 11
#'@param get_elev logical, whether to append elevation data, defaults to FALSE
#'@param get_slope logical, whether to compute slope (in degrees), defaults to FALSE
#'@param get_aspect logical, whether to compute aspect (in degrees), defaults to FALSE
#'@param weather_vars weather variables to append
#'@param selected_station weather station for data lookup, can be NULL if weather_vars is NULL
#'@param min_lat minimum latitude for filtering, defaults to min in data_info
#'@param max_lat maximum latitude for filtering, defaults to max in data_info
#'@param min_long minimum longitude for filtering, defaults to min in data_info
#'@param max_long maximum longitude for filtering, defaults to max in data_info
#'@param tz_in input time zone, defaults to UTC
#'@param tz_out output time zone, defaults to UTC
#'@param dbscan_enable logical, whether to enable DBSCAN filtering in clean_location_data, defaults to FALSE
#'@param kalman logical, whether to apply Kalman filtering in clean_location_data, defaults to FALSE
#'@param kalman_max_timestep maximum time step allowed when applying the Kalman filter, defaults to 300
#'@param max_rate maximum rate of travel (meters/minute) between consecutive points
#'@param max_course maximum change in course (degrees) between consecutive points
#'@param max_dist maximum geographic distance (meters) between consecutive points
#'@param max_clean_time maximum time (minutes) between consecutive points
#'@param dbscan_knn_eps eps parameter for DBSCAN kNN filtering, defaults to 0.001
#'@param dbscan_knn_k k parameter for DBSCAN kNN filtering, defaults to 5
#'@return df of metadata for animal file directory
#'@noRd
#'
clean_store_batch <- function(data_info, filters = TRUE, zoom = 11, get_elev = FALSE, get_slope = FALSE, get_aspect = FALSE, 
                              weather_vars = NULL, selected_station = NULL,
                              min_lat = data_info$min_lat, max_lat = data_info$max_lat, min_long = data_info$min_long, max_long = data_info$max_long, 
                              tz_in = "UTC", tz_out = "UTC", dbscan_enable=FALSE, kalman = FALSE, kalman_max_timestep=300, max_rate, max_course, max_dist, max_clean_time,
                              dbscan_knn_eps = 0.001, dbscan_knn_k = 5) {
  #initialize empty meta (one column per field returned by get_meta)
  meta_df <- data.frame(matrix(ncol = 12, nrow = 0))
  meta_cols <- c("file_id", "dtype", "file_name", "site", "ani_id", "min_date", "max_date", "min_lat", "max_lat", "min_long", "max_long", "storage")
  colnames(meta_df) <- meta_cols
 
  num_saved_rds <- 0
  
  withProgress(message = "Processing data", detail = paste0("0/",length(data_info$data), " files processed"), value = 0, {

  data_sets <- list()
  
  all_data_sets <- data.frame()
  
  for(i in 1:length(data_info$data)) {
   
    df <- data_info$data[[i]]
    dtype <- data_info$meta$dtype[i]
    
    if(dtype == "igotu" && data_info$meta$storage[1] != "demo_nov19.rds") {
      df <- df[!duplicated(as.list(df))] # discard any columns that duplicate an earlier column (e.g. a repeated index)
      colnames(df)[1] <- "Index"
      suppressWarnings( df <- df[!is.na(as.numeric(df$Index)), ] ) # discard any rows with text in the first column (duplicate header rows)
    }
    
    df <- utils::type.convert(df)
    
    aniid <- data_info$ani[i]
    gpsid <- data_info$gps[i]
    
    # clean df
    
    df_out<- clean_location_data(df, dtype, filters,
                             aniid = aniid,
                             gpsid = gpsid,
                             maxrate = max_rate, maxcourse = max_course, maxdist = max_dist, maxtime = max_clean_time,
                             tz_in = tz_in, tz_out = tz_out,
                             dbscan_enable=dbscan_enable, kalman=kalman, kalman_min_lat=min_lat, kalman_max_lat=max_lat, kalman_min_lon=min_long, kalman_max_lon=max_long, kalman_max_timestep=kalman_max_timestep)
    
    # add cleaned df to the list of data
    data_sets[[paste0("ani",aniid)]] <- df_out
    all_data_sets <- all_data_sets %>% rbind(df_out)
  } #cleaning for loop
    
  df_processed <- all_data_sets
    # Elevation lookup
    if(get_elev) {
      df_processed <- all_data_sets %>% dplyr::filter(Latitude <= max_lat,
                                                        Latitude >= min_lat,
                                                        Longitude <= max_long,
                                                        Longitude >= min_long)
      
      status_message <- modalDialog(
        pre(id = "console"),
        title = "Please Wait...",
        easyClose = TRUE,
        footer = NULL
      )
      
      showModal(status_message)
      
      if(nrow(df_processed) == 0) {
        incProgress(0, detail = paste0("Appending elevation at zoom = ", zoom, " for invalid bounds. Defaulting to all data."))
        withCallingHandlers({
          shinyjs::html("console", "")
          df_processed <- lookup_elevation_aws(all_data_sets, zoom = zoom, get_slope = get_slope, get_aspect = get_aspect)
        },
        message = function(m) {
          shinyjs::html(id = "console", html = m$message)
        })
      }
      else {
        incProgress(0, detail = paste0("Appending elevation for lat. bounds (", min_lat, ",", max_lat, 
                                       ") and long. bounds (", min_long, ",", max_long, ") at zoom = ", zoom, "..." ))
        withCallingHandlers({
          shinyjs::html("console", "")
          df_processed <- lookup_elevation_aws(df_processed, zoom = zoom, get_slope = get_slope, get_aspect = get_aspect)
        },
        message = function(m) {
          shinyjs::html(id = "console", html = m$message)
        })
      }
    }
    # Weather lookup
    if(length(weather_vars) != 0) {
      incProgress(0, detail = paste0("Appending weather data..."))
      # map the weather variable labels selected in the UI to weather data column names
      weather_map <- c("wind direction" = "wind_direction",
                       "wind speed" = "wind_speed",
                       "ceiling height" = "ceiling_height",
                       "visibility distance" = "visibility_distance",
                       "temperature" = "temperature",
                       "dewpoint temperature" = "temperature_dewpoint",
                       "air pressure" = "air_pressure",
                       "precipitation depth" = "AA1_depth")
      selected_vars <- unname(weather_map[intersect(names(weather_map), weather_vars)])
      withCallingHandlers({
        shinyjs::html("console", "")
        df_processed <- df_processed %>% 
          lookup_weather(selected_vars, search = FALSE, station = selected_station, is_shiny = TRUE)
      },
      message = function(m) {
        shinyjs::html(id = "console", html = m$message)
      })
    }

    removeModal()
    
    for(i in 1:length(data_info$data)) {
      
      aniid <- data_info$ani[i]
      
      # every 50th data file: save the accumulated data, start a new .rds file name, and wipe data_sets
      if(i %% 50 == 0 & i > 49) {
        saveRDS(data_sets, data_info$rds_name)
        num_saved_rds <- num_saved_rds + 1
        data_info$rds_name <- paste0(data_dir, num_saved_rds, ".rds") # note: `data_dir` is not defined in this function and must exist in the calling/global environment
        data_sets <- list()
      }
      
      df_out <- df_processed %>% dplyr::filter(Animal == aniid)

      # get meta from df
      file_meta <- get_meta(df_out, i, data_info$meta$dtype[i], data_info$file[i], data_info$site[i], aniid, data_info$rds_name)
      # save meta to the designated meta df
      meta_df <- save_meta(meta_df, file_meta)
      # replace df with elevation df
      data_sets[[paste0("ani",aniid)]] <- df_out
    }
    
  }) #progress bar
  #save remaining data files
  saveRDS(data_sets, data_info$rds_name)
  return(meta_df)
}
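
# Example (sketch, hypothetical values): process an uploaded batch with elevation appended
# and no weather lookup. Like clean_batch_df(), this is meant to run inside the Shiny app,
# since it reports progress and opens modal dialogs.
#
#   meta <- clean_store_batch(batch, filters = TRUE, zoom = 11, get_elev = TRUE,
#                             weather_vars = NULL, selected_station = NULL,
#                             max_rate = 84, max_course = 100, max_dist = 840,
#                             max_clean_time = 3600)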


#'
#'Generate metadata for an animal data frame:
#'file name, data type, site, animal ID, min/max date, min/max latitude/longitude, and storage location
#'
#'@param df clean animal data frame 
#'@param file_id ID number of source of animal data frame
#'@param dtype igotu or columbus
#'@param file_name .csv source of animal data frame
#'@param site physical source of animal data
#'@param ani_id ID of animal found in data frame
#'@param storage_loc .rds storage location of animal data frame
#'@return df of metadata for animal data frame 
#'@noRd
get_meta <- function(df, file_id, dtype, file_name, site, ani_id, storage_loc) {
   return(data.frame(file_id = file_id,
            dtype = dtype,
            file_name = file_name, 
            site = site, 
            ani_id = ani_id, 
            min_date = min(df$DateTime, na.rm = TRUE), 
            max_date = max(df$DateTime, na.rm = TRUE), 
            min_lat = min(df$Latitude), 
            max_lat = max(df$Latitude),
            min_long = min(df$Longitude), 
            max_long = max(df$Longitude), 
            storage = storage_loc))
}

#'
#'Save metadata to a data frame and return it
#'
#'@param meta_df the data frame to store metadata in
#'@param file_meta meta for a .csv file generated by get_meta
#'@return df of metadata
#'@noRd
#'
save_meta <- function(meta_df, file_meta) {
  meta_df <- rbind(meta_df, file_meta)
  return(meta_df)
}

#'
#'Get animal data set from specified meta. 
#'If date range is invalid, automatically returns all animal data specified by meta_df.
#'
#'@param meta_df data frame of specified meta
#'@param min_date minimum date specified by user
#'@param max_date maximum date specified by user
#'@return df of animal data from specified meta
#'@noRd
#'
get_data_from_meta <- function(meta_df, min_date = min(meta_df$min_date), max_date = max(meta_df$max_date)) {
  
  meta_df$storage <- as.character(meta_df$storage)
  rds_files <- unique(meta_df$storage)
  current_df <- data.frame()
  for(file_name in rds_files) {
    current_rds <- readRDS(file_name)
    for(df in current_rds) {
      current_df <- rbind(current_df, df)
    }
  }
  
  # keep a copy of the combined data for the fallback below
  all_df <- current_df
  
  current_df <- current_df %>%
    dplyr::filter(Animal %in% meta_df$ani_id,
                  DateTime <= as.Date(max_date),
                  DateTime >= as.Date(min_date))
  
  # if the date range excluded everything, fall back to all data for the selected animals
  if(nrow(current_df) == 0) {
    current_df <- all_df %>% 
      dplyr::filter(Animal %in% meta_df$ani_id)
  }
  
  return(current_df)
}
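
# Example (sketch, hypothetical values): reload the points for one animal over a chosen date
# window from the .rds files referenced in the metadata. "cow1" is an illustrative animal ID.
#
#   ani_meta <- meta[meta$ani_id == "cow1", ]
#   pts <- get_data_from_meta(ani_meta, min_date = "2019-01-01", max_date = "2019-01-31")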