R/module1.R

Defines functions eq_location_clean eq_clean_data

Documented in eq_clean_data eq_location_clean

#Andrew Spewak
#Mastering Software Development in R Capstone

# ##########
# # set up #
# ##########
# #clear R environment
# rm(list = ls())
#
# #set working directory
# setwd("C:/Users/aespe/Documents/Work/R Training/Capstone")
#
# #load packages
# library(tidyr)
# library(dplyr)
# library(lubridate)
# library(stringr)
# library(ggplot2)





#' eq_clean_data
#'
#' Function to clean the earthquakes dataset: combines the YEAR, MONTH and DAY
#' columns into a single \code{date} column of class Date, converts LATITUDE
#' and LONGITUDE to numeric, and cleans LOCATION_NAME with
#' \code{eq_location_clean}.
#'
#' Missing MONTH or DAY values are replaced with 1 before the date is built.
#'
#' @param datafile path to the raw earthquakes data file, a tab-delimited .txt file
#' @return a dataframe containing the cleaned earthquake data
#' @importFrom readr read_delim
#' @importFrom tidyr unite
#' @importFrom dplyr mutate
#' @importFrom lubridate ymd
#' @examples
#' \dontrun{eq_clean_data("signif.txt")}
#'
#' @export
eq_clean_data <- function(datafile) {

  # ** import data **
  raw_data <- readr::read_delim(datafile, delim = '\t', progress = FALSE)

  # ** create date variable **
  # for months and days that are NA, replace with 1
  raw_data$MONTH[is.na(raw_data$MONTH)] <- 1
  raw_data$DAY[is.na(raw_data$DAY)] <- 1

  # combine year, month, and day into one "date" column (joined with "_")
  united <- tidyr::unite(raw_data, date, YEAR, MONTH, DAY)

  # convert the combined string to Date class, overwriting the "date" column
  # (the assignment must be named, otherwise mutate() adds a new column and
  # leaves "date" as text)
  # NOTE(review): values that lubridate::ymd() cannot parse become NA --
  # confirm this is acceptable for every YEAR value in the raw file.
  intermediate_data <- dplyr::mutate(united, date = lubridate::ymd(date))

  # ** convert latitude and longitude to numeric **
  intermediate_data$LATITUDE <- as.numeric(intermediate_data$LATITUDE)
  intermediate_data$LONGITUDE <- as.numeric(intermediate_data$LONGITUDE)

  # ** clean location name **
  # everything is kept local: nothing is written to the global environment
  eq_location_clean(intermediate_data)
}

 

#' eq_location_clean
#'
#' Function to clean the LOCATION_NAME variable of the earthquakes dataset;
#' strips out the country name (taken from the COUNTRY column of the same row)
#' and converts the remainder to title case.
#'
#' @param dataframe the dataframe with earthquake data; must contain the
#'   columns LOCATION_NAME and COUNTRY
#' @return the input dataframe with LOCATION_NAME replaced by its cleaned version
#' @importFrom stringr str_replace str_to_title
#' @examples
#' \dontrun{eq_location_clean(intermediate_data)}
#'
#' @export
eq_location_clean <- function(dataframe) {

  # fail early with a clear message instead of a cryptic error further down
  required_cols <- c("LOCATION_NAME", "COUNTRY")
  missing_cols <- setdiff(required_cols, names(dataframe))
  if (length(missing_cols) > 0) {
    stop(
      "dataframe is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # work only on the argument that was passed in -- no global variables are
  # read or written
  country <- dataframe$COUNTRY
  location <- dataframe$LOCATION_NAME

  # ** strip out country name **
  # many different patterns of country names with colons, so one pattern per
  # layout; they are applied in order, most specific first, and each removes
  # only the first match in a given location string
  patterns <- list(
    paste0("-", country, ":  "), # preceded by - sign, followed by colon and two spaces
    paste0("-", country, ": "),  # preceded by - sign, followed by colon and one space
    paste0("-", country),        # preceded by - sign
    paste0(country, ":  "),      # followed by colon and two spaces
    paste0(country, ": "),       # followed by colon and one space
    country                      # country name alone
  )
  for (pattern in patterns) {
    location <- stringr::str_replace(location, pattern, "")
  }

  # ** convert text to title case **
  dataframe$LOCATION_NAME <- stringr::str_to_title(location)

  dataframe
}
AESpe/R_capstone_final documentation built on Oct. 16, 2020, 12:37 a.m.