# R/data_loading.R

# Load every object stored in <working directory>/data/quakes.rda into the
# environment in which this file is evaluated.
# NOTE(review): presumably this defines `quakes`, the default input used by
# eq_clean_data() -- confirm against the contents of the .rda file.
# NOTE(review): the path is built from getwd(), so it only resolves when the
# file is evaluated with the project root as working directory.
load(file.path(getwd(), "data", "quakes.rda"))

#' Convert a date element vector into a "[0-9]{2}" string
#'
#' Values containing a decimal point are floored, values shorter than two
#' characters are left-padded with "0", and missing values are replaced by
#' \code{replaceNA}. The result is always an unnamed character vector with
#' the same length as \code{vect} (including for empty input).
#'
#' @param vect input vector as character or numeric
#' @param replaceNA pattern used to replace NAs
#'
#' @return A character vector of "[0-9]{2}" strings
#'
#' @examples
#' CapstoneProject:::std_time(c(12, 4, 2.9))
std_time <- function(vect, replaceNA = "00") {
  # Work on a character copy so the return type never depends on the input
  out <- as.character(vect)
  is_missing <- is.na(vect)

  # If there's a dot inside a time (e.g. fractional seconds), floor the time
  has_dot <- !is_missing & grepl("\\.", out)
  out[has_dot] <- as.character(floor(as.numeric(out[has_dot])))

  # Left-pad anything shorter than two characters; values that could not be
  # converted to a number above are NA here and are left untouched
  too_short <- !is_missing & !is.na(out) & nchar(out) < 2
  out[too_short] <- paste0("0", out[too_short])

  # If time isn't specified, replace it with replaceNA
  out[is_missing] <- as.character(replaceNA)

  unname(out)
}

#' Returns a cleaned NOAA Significant Earthquakes data.frame
#'
#' Coerces \code{LATITUDE} and \code{LONGITUDE} to numeric, converts
#' \code{LOCATION_NAME} to title case and removes everything up to and
#' including its last colon (plus any spaces that follow it), rewrites
#' \code{MONTH}, \code{DAY}, \code{HOUR}, \code{MINUTE} and \code{SECOND}
#' with \code{std_time()}, and adds a \code{date} column.
#'
#' @param data an already loaded data.frame from the NOAA dataset
#' @param filename the path to a NOAA tab separated datafile. When given,
#' the file is read and used instead of \code{data}.
#' @param replaceDateNA_by a five element vector. Will
#' be used to replace NA elements in dates
#' c("MONTH", "DAY", "HOUR", "MINUTE", "SECOND"). Allow
#' computation of partial dates.
#'
#' @import dplyr
#' @importFrom readr read_tsv
#' @importFrom stringr str_to_title
#' @importFrom lubridate ymd_hms
#' @importFrom stats setNames
#'
#' @return a dataframe: the input with the columns above cleaned and an
#' additional \code{date} column of class Date. Rows whose date elements
#' cannot be parsed get an NA \code{date}.
#' @export
#'
#' @note
#' If no input are defined, function will use "quakes"
#' dataset as an input (see ?quakes for more information)
#'
#' @examples
#' df <- eq_clean_data()
#' head(df, 5)
#'
eq_clean_data <- function(data = NULL, filename = NULL,
                          replaceDateNA_by = c(NA, NA, "00", "00", "00")) {

  # If no data and no filename are given, fall back to the quakes dataset
  if (is.null(data) && is.null(filename)) {
    data <- quakes
  }

  # If a filename is given it takes precedence over `data`
  if (!is.null(filename)) {
    data <- readr::read_tsv(filename)
  }

  # mutate() evaluates its arguments in order, so `date` below is built from
  # the already standardised MONTH/DAY/HOUR/MINUTE/SECOND columns.
  # The `.data` pronoun keeps column references explicit without relying on
  # the deprecated mutate_() / formula interface.
  data %>%
    dplyr::mutate(
      # LATITUDE and LONGITUDE are numeric
      LATITUDE = as.numeric(.data$LATITUDE),
      LONGITUDE = as.numeric(.data$LONGITUDE),

      # LOCATION_NAME is title case
      LOCATION_NAME = stringr::str_to_title(.data$LOCATION_NAME),

      # All date element columns are set to a two character "[0-9]{2}"
      # format, or to the matching replacement pattern if NA
      MONTH = std_time(.data$MONTH, replaceDateNA_by[1]),
      DAY = std_time(.data$DAY, replaceDateNA_by[2]),
      HOUR = std_time(.data$HOUR, replaceDateNA_by[3]),
      MINUTE = std_time(.data$MINUTE, replaceDateNA_by[4]),
      SECOND = std_time(.data$SECOND, replaceDateNA_by[5]),

      # Add a date; quiet = TRUE silences parse-failure messages
      # NOTE(review): YEAR is pasted as-is (not zero-padded), so years with
      # fewer than four digits or negative years may not parse as intended
      # -- confirm against the dataset.
      date = as.Date(lubridate::ymd_hms(paste0(.data$YEAR,
                                               .data$MONTH,
                                               .data$DAY,
                                               .data$HOUR,
                                               .data$MINUTE,
                                               .data$SECOND), quiet = TRUE)),

      # Remove the leading "<country>:" prefix; the greedy match strips
      # everything up to the last colon and the spaces that follow it
      LOCATION_NAME = gsub(pattern = "^(.+):( )*", replacement = "",
                           x = .data$LOCATION_NAME)
    )
}
# KDallaporta/CapstoneProject documentation built on May 12, 2019, 1:09 p.m.