R/eq_cleaning.R

Defines functions eq_clean_data eq_location_clean eq_select_data eq_filter_data eq_count_events

Documented in eq_clean_data eq_count_events eq_filter_data eq_location_clean eq_select_data

#' Clean Earthquake Data
#'
#' \code{eq_clean_data} performs a series of edits to clean the earthquake
#' data.  It converts the SECOND variable to a numeric type and rounds the
#' decimal to the nearest whole number, replaces missing values in the MONTH and
#' DAY variables with '1' and missing values in the HOUR, MINUTE and SECOND
#' variables with '0'.  It uses the YEAR, MONTH, DAY, HOUR, MINUTE, and SECOND
#' variables to create a new DATE variable that contains the date of an event.
#' It converts the LATITUDE and LONGITUDE variables to a numeric type, renames the
#' I_D variable to ID, changes the FLAG_TSUNAMI variable to a logical value
#' (TRUE when the original value is not missing, FALSE otherwise), and changes
#' the EQ_PRIMARY variable from a character to a numeric type.
#' Finally, it filters the dataset to remove observations with missing values in
#' the DATE, EQ_PRIMARY and TOTAL_DEATHS variables.
#'
#' @details The DATE variable is assembled and converted to a date in UTC, so
#' the resulting calendar date does not depend on the time zone of the machine
#' running the code.
#'
#' @param df A data frame containing the earthquake data.
#'
#' @return A data frame containing the cleaned earthquake data.  If an error
#' or warning occurs, a message will be printed to the console and the function
#' will return NULL.
#'
#' @importFrom dplyr filter mutate rename
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' \dontrun{
#' earthquakes <- eq_clean_data(earthquakes)
#' }
#'
#' @export

eq_clean_data <- function(df) {

      # Bind variables to object to avoid global variable warning on
      # R CMD CHECK.

      MONTH <- DAY <- HOUR <- MINUTE <- SECOND <- LATITUDE <- NULL
      LONGITUDE <- FLAG_TSUNAMI <- EQ_PRIMARY <- ID <- I_D <- NULL
      DATE <- TOTAL_DEATHS <- YEAR <- NULL

      # The value of tryCatch() is the value of the function, so the NULL
      # produced by a handler is what the caller receives (rather than the
      # untouched input data frame).

      tryCatch({

            # Clean data frame.

            df %>%
                  dplyr::mutate(MONTH = ifelse(is.na(MONTH), 1, MONTH),
                                DAY = ifelse(is.na(DAY), 1, DAY),
                                HOUR = ifelse(is.na(HOUR), 0, HOUR),
                                MINUTE = ifelse(is.na(MINUTE), 0, MINUTE),
                                SECOND = ifelse(is.na(SECOND), 0, SECOND),
                                LATITUDE = as.numeric(LATITUDE),
                                LONGITUDE = as.numeric(LONGITUDE),
                                FLAG_TSUNAMI = !is.na(FLAG_TSUNAMI),
                                EQ_PRIMARY = as.numeric(EQ_PRIMARY)) %>%
                  dplyr::rename(ID = I_D) %>%
                  # Build the timestamp in UTC and convert it to a date in
                  # UTC as well; mixing a local-time timestamp with a UTC
                  # date conversion can move an event to the adjacent day.
                  dplyr::mutate(SECOND = round(as.numeric(SECOND), 0),
                                DATE = as.Date(ISOdate(year = YEAR,
                                                       month = MONTH,
                                                       day = DAY,
                                                       hour = HOUR,
                                                       min = MINUTE,
                                                       sec = SECOND,
                                                       tz = "UTC"),
                                               tz = "UTC")) %>%
                  dplyr::filter(!is.na(DATE),
                                !is.na(EQ_PRIMARY),
                                !is.na(TOTAL_DEATHS))

      }, warning = function(w) {

            print(paste0("eq_clean_data: ", w))

            NULL

      }, error = function(e) {

            print(paste0("eq_clean_data: ", e))

            NULL

      })

}

#' Clean Location Values
#'
#' \code{eq_location_clean} formats the LOCATION_NAME variable by stripping
#' out the country from the name and converting the text from uppercase to
#' title case. For consistency, it also removes extra spaces from the text and
#' formats the COUNTRY variable in the same way as the LOCATION_NAME variable.
#'
#' @param df A data frame containing the earthquake data.
#'
#' @return A data frame containing the earthquake data with reformatted COUNTRY
#' and LOCATION_NAME variables.  If an error or warning occurs, a message will
#' be printed to the console and the function will return NULL.
#'
#' @importFrom dplyr mutate
#'
#' @importFrom magrittr %>%
#'
#' @importFrom stringr str_squish str_to_title
#'
#' @details \code{eq_location_clean} assumes the country appears in the
#' LOCATION_NAME variable at the beginning of the text and is separated from the
#' location by a colon.  Given this pattern, the function uses a regular
#' expression to find and remove the country from the beginning of the text.
#' The expression is greedy: when the text contains more than one
#' colon-plus-space separator, everything up to and including the last one is
#' removed.
#'
#' @examples
#' \dontrun{
#' df <- eq_location_clean(df)
#' }
#'
#' @export

eq_location_clean <- function(df) {

      # Bind variables to object to avoid global variable warning on
      # R CMD CHECK.

      LOCATION_NAME <- COUNTRY <- NULL

      # The value of tryCatch() is the value of the function, so the NULL
      # produced by a handler is what the caller receives (rather than the
      # untouched input data frame).

      tryCatch({

            # Clean data frame.

            df %>%
                  dplyr::mutate(LOCATION_NAME = gsub("^.*: ", "", LOCATION_NAME),
                                LOCATION_NAME = stringr::str_to_title(LOCATION_NAME),
                                LOCATION_NAME = stringr::str_squish(LOCATION_NAME),
                                COUNTRY = stringr::str_to_title(COUNTRY),
                                COUNTRY = stringr::str_squish(COUNTRY))

      }, warning = function(w) {

            print(paste0("eq_location_clean: ", w))

            NULL

      }, error = function(e) {

            print(paste0("eq_location_clean: ", e))

            NULL

      })

}

#' Select Data
#'
#' \code{eq_select_data} selects a subset of the variables in the earthquakes
#' dataset needed for analysis and visualization.  It selects the ID, DATE,
#' COUNTRY, LOCATION_NAME, LONGITUDE, LATITUDE, EQ_PRIMARY and TOTAL_DEATHS
#' variables, in that order.
#'
#' @param df A data frame containing the earthquake data.
#'
#' @return A data frame containing the selected subset of earthquake data.
#' If an error or warning occurs, a message will be printed to the console and the
#' function will return NULL.
#'
#' @importFrom dplyr select
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' \dontrun{
#' df <- eq_select_data(df)
#' }
#'
#' @export

eq_select_data <- function(df) {

   # Bind variables to object to avoid global variable warning on
   # R CMD CHECK.

   ID <- DATE <- COUNTRY <- LOCATION_NAME <- LONGITUDE <- NULL
   LATITUDE <- EQ_PRIMARY <- TOTAL_DEATHS <- NULL

   # The value of tryCatch() is the value of the function, so the NULL
   # produced by a handler is what the caller receives (rather than the
   # untouched input data frame).

   tryCatch({

      # Select data.

      df %>%
         dplyr::select(ID,
                       DATE,
                       COUNTRY,
                       LOCATION_NAME,
                       LONGITUDE,
                       LATITUDE,
                       EQ_PRIMARY,
                       TOTAL_DEATHS)

   }, warning = function(w) {

      print(paste0("eq_select_data: ", w))

      NULL

   }, error = function(e) {

      print(paste0("eq_select_data: ", e))

      NULL

   })

}

#' Filter Data
#'
#' \code{eq_filter_data} filters the earthquakes data to the specified
#' COUNTRY and DATE values.
#'
#' @param df A data frame containing the earthquake data.
#'
#' @param countries A character vector of countries to be used to filter the
#' observations.
#'
#' @param minimum_date A date value representing the minimum date used to filter
#' observations.  The bound is inclusive.
#'
#' @param maximum_date A date value representing the maximum date used to filter
#' observations.  The bound is inclusive.
#'
#' @return A data frame containing the filtered earthquake data.  If an error
#' or warning occurs, a message will be printed to the console and the
#' function will return NULL.  If the function finds no matching observations,
#' it will return an empty data frame.
#'
#' @importFrom dplyr filter
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' \dontrun{
#' df <- eq_filter_data(df,
#'                      countries = c("Usa", "China"),
#'                      minimum_date = "2000-01-01",
#'                      maximum_date = "2017-12-31")
#' }
#'
#' @export

eq_filter_data <- function(df, countries, minimum_date, maximum_date) {

   # Bind variables to object to avoid global variable warning on
   # R CMD CHECK.

   COUNTRY <- DATE <- NULL

   # The value of tryCatch() is the value of the function, so the NULL
   # produced by a handler is what the caller receives (rather than the
   # untouched input data frame).

   tryCatch({

      # Filter data.

      df %>%
         dplyr::filter(COUNTRY %in% countries) %>%
         dplyr::filter(DATE >= minimum_date & DATE <= maximum_date)

   }, warning = function(w) {

      print(paste0("eq_filter_data: ", w))

      NULL

   }, error = function(e) {

      print(paste0("eq_filter_data: ", e))

      NULL

   })

}

#' Events By Country and Date Range
#'
#' \code{eq_count_events} assembles a summary data frame showing a count of
#' earthquakes for each country for a given date range.
#'
#' @param df A data frame containing the earthquake data.
#'
#' @param minimum_date A date value representing the minimum date used to filter
#' observations.  The bound is inclusive.
#'
#' @param maximum_date A date value representing the maximum date used to filter
#' observations.  The bound is inclusive.
#'
#' @return A data frame containing the count of earthquakes by country (in the
#' EVENTS variable) in descending order by count.  If an error or warning
#' occurs, a message will be printed to the console and the function will
#' return NULL.
#'
#' @importFrom dplyr desc filter group_by n summarise arrange
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' \dontrun{
#' df <- eq_count_events(df,
#'                       minimum_date = "2000-01-01",
#'                       maximum_date = "2017-12-31")
#' }
#'
#' @export

eq_count_events <- function(df, minimum_date, maximum_date) {

   # Bind variables to object to avoid global variable warning on
   # R CMD CHECK.

   DATE <- COUNTRY <- EVENTS <- NULL

   # The value of tryCatch() is the value of the function, so the NULL
   # produced by a handler is what the caller receives (rather than the
   # untouched input data frame).

   tryCatch({

      # Count events.

      df %>%
         dplyr::filter(DATE >= minimum_date & DATE <= maximum_date) %>%
         dplyr::group_by(COUNTRY) %>%
         dplyr::summarise(EVENTS = dplyr::n()) %>%
         dplyr::arrange(dplyr::desc(EVENTS))

   }, warning = function(w) {

      print(paste0("eq_count_events: ", w))

      NULL

   }, error = function(e) {

      print(paste0("eq_count_events: ", e))

      NULL

   })

}
dtminnick/earthquake documentation built on Nov. 4, 2019, 11:04 a.m.