globalVariables(c("admin", "province_state","country_region", "last_update", "confirmed", "deaths", "recovered"))
#' Transform COVID-19 Data to be Uploaded into PostgreSQL Database
#' @description \code{etl_transform} takes files saved in the \emph{raw} folder
#' from \code{etl_extract} and transforms them into usable datasets to be uploaded
#' into a SQL database. After transformation, files are saved in the \emph{load}
#' folder within the specified directory or the temp directory if the user did
#' not specify a directory. Under the hood, \code{etl_transform} cleans datasets
#' by standardizing colummn names and removing unnecessary punctuation. Most
#' importantly the function only selects relevant columns: admin, province/state,
#' country/region, last_update, confirmed, deaths, and recovered. Arguments,
#' such as month, day, and year let users transform data for a specified
#' time period. When no arguments are specified then all data saved
#' in the \emph{raw} folder will be transformed and saved in the
#' \emph{load} folder.
#' @rdname etl_transform.etl_covid
#' @method etl_transform etl_covid
#' @import etl
#' @import dplyr
#' @importFrom purrr map
#' @importFrom lubridate month day year
#' @importFrom stringr str_extract str_replace_all
#' @inheritParams etl::etl_transform
#' @param month
#' numeric vector specifying month(s)
#' @param day
#' numeric vector specifying day(s)
#' @param year
#' numeric vector specifying year(s)
#' @export
#' @examples
#'
#' covid_dat <- etl::etl("covid")
#'\dontrun{
#' covid_dat %>%
#' etl_extract() %>%
#' etl_transform(month = 4, day = c(1:15), year = 2020)
#'}
#' @export
etl_transform.etl_covid <- function(obj, month, day, year, ...){
# Reading in
# Read in raw files from raw_dir, raw_dir is either specified directory in initial etl call
# or from temp directory in initial etl call
files <- list.files(attr(obj, "raw_dir"), "\\.csv", full.names = T)
covid_dfs <- purrr::map(files, readr::read_csv) %>% set_names(map_chr(files, ~str_extract(.x, "\\d{4}\\-\\d{2}\\-\\d{2}")))
# To select dfs to transform
nms_covid_dfs <- names(covid_dfs)
if(missing(month) & missing(day) & missing(year)){
covid_dfs <- covid_dfs
}else if (!missing(month) & !missing(day) & !missing(year)){
covid_dfs <- covid_dfs[lubridate::month(nms_covid_dfs) %in% month &
lubridate::day(nms_covid_dfs) %in% day &
lubridate::year(nms_covid_dfs) %in% year]
} else if (missing(day) & missing(year)) {
covid_dfs <- covid_dfs[lubridate::month(nms_covid_dfs) %in% month]
} else if(missing(month) & missing(day)) {
covid_dfs <- covid_dfs[lubridate::year(nms_covid_dfs) %in% year]
} else if(missing(month) & missing(year)){
covid_dfs <- covid_dfs[lubridate::day(nms_covid_dfs) %in% day]
} else if(missing(day)){
covid_dfs <- covid_dfs[lubridate::month(nms_covid_dfs) %in% month &
lubridate::year(nms_covid_dfs) %in% year]
} else if(missing(year)){
covid_dfs <- covid_dfs[lubridate::month(nms_covid_dfs) %in% month &
lubridate::day(nms_covid_dfs) %in% day]
} else {
covid_dfs <- covid_dfs[lubridate::day(nms_covid_dfs) %in% day &
lubridate::year(nms_covid_dfs) %in% year]
}
# Cleaning
# Make all column names in dfs match
covid_dfs <- purrr::map(covid_dfs,standardize_names)
# Cleaning dataframes for easy loading to SQL db
covid_dfs <- purrr::map(covid_dfs, ~purrr::map_df(.x, ~stringr::str_replace_all(., "'", "")))
covid_dfs <- purrr::map(covid_dfs, cleaning_covid_datasets)
# Keeping only relevant columns (admin, province_state, country_region, last_update, confirmed, recovered, deaths)
covid_dfs <- purrr::map(covid_dfs, select_covid_cols)
# Saving transformed files to load folder in directory or temp directory
covid_dfs %>% names(.) %>% purrr::map(~readr::write_csv(covid_dfs[[.]], paste0(attr(obj, "load_dir"),"/",.,".csv")))
Sys.sleep(5)
print(covid_dfs)
invisible(obj)
}
# Internal function 1
#' All columns in every dataframe in the list should have the same column names
#' @param df takes a df and makes column names standardized
#' @noRd
standardize_names <- function(df){
# Take out column names and apply operations to them
names_df <- names(df)
names_df <- tolower(names_df)
names_df <- stringr::str_replace(names_df,"\\/", "_")
names_df <- stringr::str_replace(names_df,"_$", "")
names_df <- stringr::str_remove(names_df, "[[:digit:]]$")
names_df <- trimws(names_df)
names_df <- stringr::str_replace(names_df, "\\s+", "_")
names_df <- trimws(names_df)
names_df[grepl("^lat", names_df)] <- "lat"
names_df[grepl("^long", names_df)] <- "long"
names(df) <- names_df
return(df)
}
# Internal function 2
#' Cleaning all covid dfs to make them acceptable for loading to a SQL database
#' @param df takes a df and cleans it
#' @noRd
cleaning_covid_datasets <- function(df){
df <- df %>%
dplyr::mutate_at(vars(starts_with("last")), list(~stringr::str_replace_all(., "T", " "))) %>%
dplyr::mutate_at(vars(starts_with("last")),
list(~as.character(lubridate::parse_date_time(.,orders = c('mdy_hm', 'ymd_hms'))))) %>%
dplyr::mutate_at(vars(matches("^conf|^reco|^deat|^acti")), as.numeric)
return(df)
}
# Internal function 3
#' Selecting Relevant columns
#' @param df takes df and selects columns
#' @noRd
select_covid_cols <- function(df){
nms <- names(df)
if("admin" %in% nms){
return_df <- dplyr::select(df, admin,province_state,
country_region, last_update,
confirmed, deaths, recovered)
} else {
return_df <- dplyr::select(df, province_state, country_region,
last_update, confirmed, deaths, recovered)
}
return(return_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.