# R/process_data.R
#
# Defines functions: eq_location_clean, eq_clean_data
#
# Documented in: eq_clean_data, eq_location_clean

#' Clean Location Name
#'
#' This function takes a character vector as input: if a colon ":" is present
#'   in an element, it trims the beginning of the string up to and including
#'   the first ":" together with any white space that follows it.
#'   If no colon ":" is present, the element is left untouched.
#'   The result is then lower-cased and converted to Title Case.
#'   The function is designed to work on a character vector, so that it can be
#'   called within a "pipe".
#'
#' @param string a character vector with the earthquake location
#'
#' @return a character vector with the cleaned location names
#'
#' @importFrom tools toTitleCase
#'
#' @examples
#' \dontrun{
#' eq_location_clean("JORDAN: BAB-A-DARAA,AL-KARAK")
#' #[1] "Bab-a-Daraa,al-Karak"
#' }
#'
#' @export
eq_location_clean <- function(string) {
  # Pattern, piece by piece:
  #   ^    beginning of the string
  #   .*?  any characters, matched lazily: stop at the first colon
  #   :    the colon itself
  #   \\s* white space repeated any number of times (including 0)
  # The white space after the colon is optional, so "A:B", "A: B" and
  # "A:  B" are all stripped the same way. Elements without a colon do not
  # match and are returned unchanged by sub(), so no ifelse() guard is needed.
  name_clean <- sub("^.*?:\\s*", "", string)

  # convert to Title Case
  tools::toTitleCase(tolower(name_clean))
}


#' Clean Earthquakes NOAA dataset
#'
#' This function cleans the input data by converting it into a data frame,
#'   dropping the rows whose \code{YEAR} is not positive, building a new
#'   variable \code{DATE} representing the date of the earthquake (missing
#'   \code{MONTH} or \code{DAY} values are replaced with 1), making sure the
#'   relevant variables are numeric (\code{LATITUDE}, \code{LONGITUDE},
#'   \code{EQ_PRIMARY}, \code{DEATHS}), and finally cleaning the
#'   \code{LOCATION_NAME} by calling the \code{eq_location_clean} function.
#'   If any of those variables is not present, the function stops and returns
#'   an error message.
#'
#' @param raw a data frame or matrix containing the information about earthquakes.
#'   Must contain the following columns: YEAR, MONTH, DAY, LATITUDE, LONGITUDE,
#'   EQ_PRIMARY, DEATHS, LOCATION_NAME.
#'
#' @return a data frame with cleaned data
#'
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom dplyr '%>%'
#' @importFrom tidyr replace_na
#' @importFrom lubridate as_date
#'
#' @examples
#' \dontrun{
#' data <- eq_clean_data(raw_data)
#' }
#'
#' @export
eq_clean_data <- function(raw) {
  # Bind the column names used through non-standard evaluation to local
  # variables, so that R CMD check does not report them as undefined globals.
  # (utils::globalVariables() is a package-level declaration and must not be
  # called at run time from inside a function.)
  YEAR <- MONTH <- DAY <- NULL
  LATITUDE <- LONGITUDE <- NULL
  EQ_PRIMARY <- DEATHS <- LOCATION_NAME <- NULL

  required <- c("YEAR", "MONTH", "DAY", "LATITUDE", "LONGITUDE",
                "EQ_PRIMARY", "DEATHS", "LOCATION_NAME")

  tryCatch(
    {
      df <- as.data.frame(raw)

      # fail early, naming exactly which required columns are absent
      absent <- setdiff(required, names(df))
      if (length(absent) > 0) {
        stop("missing column(s): ", paste(absent, collapse = ", "))
      }

      df %>%
        # filter out years BCE
        dplyr::filter(YEAR > 0) %>%
        dplyr::mutate(
          # replace NAs in MONTH and DAY with 1
          DAY = ifelse(is.na(DAY), 1, DAY),
          MONTH = ifelse(is.na(MONTH), 1, MONTH),
          # add DATE column with class "Date"; the year is zero-padded to four
          # digits so that years below 1000 still form a yyyy-mm-dd string
          DATE = lubridate::as_date(
            paste(sprintf("%04d", as.integer(as.character(YEAR))),
                  MONTH, DAY, sep = "-")
          ),
          # force data to numeric
          LATITUDE = as.numeric(LATITUDE),
          LONGITUDE = as.numeric(LONGITUDE),
          EQ_PRIMARY = as.numeric(EQ_PRIMARY),
          DEATHS = as.numeric(DEATHS),
          # clean the LOCATION_NAME
          LOCATION_NAME = eq_location_clean(LOCATION_NAME)
        )
    },
    error = function(e) {
      # keep the original hint, but do not hide the underlying cause
      stop(
        "ups! something went wrong: check that the data set contains all the colnames specified in the help",
        " (", conditionMessage(e), ")",
        call. = FALSE
      )
    }
  )
}
# frenkg/courseraeq documentation built on May 22, 2019, 12:42 p.m.