R/data_processing.R

#######################################################################
## Unauthorized copying of this file, via any medium is strictly     ##
## prohibited. The code/project can not be copied and/or distributed ##
## without the express permission of Ganna Androsova.                ##
#######################################################################

#' Polish LOCATION_NAME column in the significant earthquake data
#'
#' This function takes as input a data.frame of the entire significant
#' earthquake data \code{data}, cleans the LOCATION_NAME column by
#' stripping out the country name (including the colon) and converts
#' names to title case (as opposed to all caps).
#'
#' @param data A tibble data.frame with LOCATION_NAME column that has to be cleaned
#'
#' @return This function returns a tibble (data.frame)
#'
#' @import dplyr
#'
#' @examples
#' NOAA_url = "https://www.ngdc.noaa.gov/nndc/struts/results?type_0=Exact&query_0=$ID&t=101650&s=13&d=189&dfn=signif.txt"
#' data <- suppressMessages({readr::read_delim(NOAA_url, delim = "\t", progress = FALSE)})
#' eq_location_clean(data)
#'
#' @export
eq_location_clean = function(data){
  data %>%
    dplyr::mutate(LOCATION_NAME = sapply(LOCATION_NAME, function(x){
      x = gsub("\\]|\\)", "", gsub("^.+:[ | ]+", "", x))
      x = gsub(" \\(| \\[", ", ", x)
      splited = strsplit(x, " ")[[1]]
      splited = paste(substring(splited, 1, 1), tolower(substring(splited, 2)),
                      sep="", collapse=" ")
      if(grepl("\\[|\\(|,|;", splited)){
        splited = strsplit(splited, "\\[|\\(|,|;( )*")[[1]]
        splited = paste(toupper(substring(splited, 1, 1)), substring(splited, 2),
                  sep="", collapse=", ")
      }
      gsub(", , ", ", ", splited)
    }))
}

#' Load NOAA significant earthquake data into a tibble
#'
#' This function reads tab-delimited file of the entire significant earthquake
#' data available at \code{NOAA_url} from U.S. National Oceanographic
#' and Atmospheric Administration (NOAA) database and returns a tibble (data.frame).
#' After reading the columns YEAR, MONTH and DAY are merged into column named
#' DATE, which contains Date class entries.
#'
#' @param NOAA_url A character string giving the URL path to the significant
#' earthquake data file
#'
#' @return This function returns a tibble (data.frame) of the input file
#'
#' @note This function will give error if the file path is incorrect or
#' file does not exist
#'
#' @importFrom readr read_delim
#' @importFrom tidyr unite
#' @import dplyr
#'
#' @examples
#' NOAA_clean_data = eq_clean_data("https://www.ngdc.noaa.gov/nndc/struts/results?type_0=Exact&query_0=$ID&t=101650&s=13&d=189&dfn=signif.txt")
#'
#' @export
eq_clean_data <- function(NOAA_url) {
  raw_data <- suppressMessages({
    readr::read_delim(NOAA_url, delim = "\t", progress = FALSE)
  })
  processed_data = raw_data %>%
    dplyr::filter(YEAR >= 1900) %>%
    tidyr::unite(DATE, YEAR, MONTH, DAY, sep = "-") %>%
    dplyr::mutate(DATE = as.Date(DATE, format = "%Y-%m-%d", origin = '1900-1-1')) %>%
    dplyr::filter(!is.na(DATE) & LATITUDE != "       ") %>%
    dplyr::mutate(LATITUDE = as.numeric(LATITUDE),
                  LONGITUDE = as.numeric(LONGITUDE),
                  COUNTRY = as.factor(COUNTRY),
                  EQ_MAG_MW = as.numeric(EQ_MAG_MW),
                  DEATHS = ifelse(!is.na(DEATHS), as.numeric(DEATHS), 0))

  eq_location_clean(processed_data)
}

#' Earthquake data filtering
#'
#' This function filters the earthquake data \code{data} -
#' generated by U.S. National Oceanographic and Atmospheric Administration
#' (NOAA) database - based on specified minimum date \code{xmindate},
#' maximum date \code{xmaxdate} and country \code{country}.
#'
#' @param data A preproccessed tibble (data.frame) of NOAA earthquakes
#' dataset
#' @param xmindate a character vector indicating the mininum date specified for
#' earthquakes time line plotting. Should follow the format of YYYY-MM-DD.
#' @param xmaxdate a character vector indicating the maximum date specified
#' for earthquakes time line plotting. Should follow the format of YYYY-MM-DD.
#' @param country a vector of character strings for the countries of interest.
#' By default it shows earthquake data related to USA.
#'
#' @return This function returns a tibble (data.frame)
#'
#' @import dplyr
#'
#' @examples
#' NOAA_clean_data = eq_clean_data("https://www.ngdc.noaa.gov/nndc/struts/results?type_0=Exact&query_0=$ID&t=101650&s=13&d=189&dfn=signif.txt")
#' NOAA_filtered = filter_date_and_country(NOAA_clean_data, xmindate = "1999-01-01", xmaxdate = "2017-12-31", country = c("CHINA", "USA"))
#' head(NOAA_filtered)
#'
#' @export
filter_date_and_country = function(data, xmindate, xmaxdate, country){
  xmindate = as.Date(xmindate)
  xmaxdate = as.Date(xmaxdate)

  NOAA_filtered = data %>%
    dplyr::filter(DATE > xmindate, DATE < xmaxdate) %>%
    dplyr::filter(COUNTRY %in% country) %>%
    dplyr::filter(!is.na(EQ_MAG_MW))
}

#' Earthquake popup text
#'
#' This function takes the earthquake dataset \code{data} -
#' generated by U.S. National Oceanographic and Atmospheric Administration
#' (NOAA) database - as an argument and creates an HTML label that can
#' be used as the annotation text in the leaflet map.
#'
#' @param data A preproccessed tibble (data.frame) of NOAA earthquakes
#' dataset
#'
#' @return This function returns a character string for each earthquake
#' that will show the cleaned location (LOCATION_NAME), the magnitude (EQ_PRIMARY),
#' and the total number of deaths (TOTAL_DEATHS).
#'
#' @import dplyr
#' @importFrom lubridate year
#'
#' @examples
#' library(dplyr)
#' NOAA_clean_data = eq_clean_data("https://www.ngdc.noaa.gov/nndc/struts/results?type_0=Exact&query_0=$ID&t=101650&s=13&d=189&dfn=signif.txt")
#' NOAA_clean_data %>% dplyr::filter(COUNTRY == "MEXICO" & lubridate::year(DATE) >= 2000) %>% dplyr::mutate(popup_text = eq_create_label(.)) %>% eq_map(annot_col = "popup_text")
#'
#' @export
eq_create_label = function(data) {
  paste(ifelse(is.na(data$LOCATION_NAME), "", paste("<b>Location:</b>", data$LOCATION_NAME)),
        ifelse(is.na(data$EQ_PRIMARY), "", paste("<b>Magnitude:</b>", data$EQ_PRIMARY)),
        ifelse(is.na(data$TOTAL_DEATHS), "", paste("<b>Total deaths:</b>", data$TOTAL_DEATHS)),
        sep = "<br/>")
}
androsova/NOAA documentation built on May 10, 2019, 11:44 a.m.