#' smooth
#'
#' Function to smooth data across different time windows length and position (trailing or surrounding)
#'
#' @param data Default = NULL. Input data as R dataframe or CSV
#' @param window_length Default = 5. Length of smoothing window to use in units of data time period.
#' @param window_type Default = "trail". Can be "trail" or "surround" where "trail" uses
#' time periods up to the value being smoothed and "surround" uses time period before and after each value being smoothed.
#' @param save Default = TRUE. Whether data should be printed as a csv or not.
#' @param diagnostics Default = FALSE. Whether diagnostic figures should be produced.
#' @param diagnostics_n Default = 20. Number of sub-plots in each diagnostic figure.
#' @param diagnostics_col Default = NULL. Which column to plot diagnostics for.
#' @param filename Default = NULL
#' @param folder Default = NULL
#' @param output_type Default = "wide". Output dimensions long or wide.
#' @importFrom magrittr %>%
#' @return Returns a dataframe (and csv file if save = TRUE)
#' @keywords smoothing
#' @export
#' @examples
#' library(rpackageutils)
smooth <- function(data=NULL,
window_length=5,
window_type="trail",
save = TRUE,
diagnostics = FALSE,
diagnostics_n = 20,
diagnostics_col = NULL,
filename = NULL,
folder = NULL,
output_type = "wide") {
#For testing:
# data = model_smooth_total
# window_length=5
# window_type="trail"
# save = FALSE
# diagnostics = FALSE
# diagnostics_n = 20
# diagnostics_col = NULL
# filename = NULL
# folder = NULL
# output_type = "wide"
#...............
# Initialize
#...............
NULL -> data_i_smoothed -> YEAR -> Year -> mean_val_complete -> mean_value ->
value -> x -> year
data_smoothed <- list()
if (is.null(data)){stop("Data is NULL. Please provide valid data.")}
# Set Folder
if(!is.null(folder)){
if(!dir.exists(folder)){
dir.create(folder)
dirx <- paste0(getwd(),"/",folder)
} else {
dirx <- folder
}
} else {
dirx <- getwd()
}
#...............
# Check input data
#...............
# Convert to list if not already a list
if(any(!class(data) %in% "list")){
if(any(class(data) == "character") & length(data) > 0){
data <- as.list(data)
} else {
data <- list(data=data)
}
}
# For each element in list
# For testing set i = 1
for(i in 1:length(data)){
data_i = data[[i]]
#...............
# Check input data
#...............
if(any(class(data_i) %in% c("tbl_df","tbl","data.frame"))){
data_i_raw = data_i
}
# check if dataframe
if(any(class(data_i) %in% c("character"))){
# check if class data_i is a character
if(file.exists(data_i)){
data_i_raw <- data.table::fread(paste(data_i),encoding="Latin-1")%>%
tibble::as_tibble()
} else {stop(paste0(data_i," does not exist"))}
}
# check if character file
#...............
# Check Data format
#...............
# Check if data is in wide format adn ocnvert to long
if(!any(grepl(c("x|year"),names(data_i_raw), ignore.case = T))){
# Check data is in long format with a x column
if(any(!is.na(as.numeric(names(data_i_raw))))){
non_numeric_cols <- names(data_i_raw)[is.na(as.numeric(names(data_i_raw)))]
# Gather into long format
data_i_raw <- data_i_raw %>%
tidyr::gather(key="x",value="value",-non_numeric_cols)
} else {
stop("None of the columns in the data are years")
}
} else {
if(any(grepl("year",names(data_i_raw),ignore.case = T))){
data_i_raw <- data_i_raw %>%
dplyr::rename_all(tolower) %>%
dplyr::rename("x"="year")
}
non_numeric_cols <- names(data_i_raw)[!names(data_i_raw) %in% c("x","value")]
}
#...............
# Smooth Data
#...............
data_i_raw <- data_i_raw %>%
# Raw data from /pic/projects/GCAM/gcam_hydrology/runs/xanthos/pm_abcd_mrtm have incomplete basn names.
# Code below adjusts those
dplyr::mutate(name = gsub("^Adriatic Sea - Greece - Black Se$","Adriatic Sea - Greece - Black Sea Coast",name),
name = gsub("^Africa Red Sea - Gulf of Aden Co$","Africa Red Sea - Gulf of Aden Coast",name),
name = gsub("^Northeast South America South At$","Northeast South America South Atlantic Coast",name),
name = gsub("^North Brazil South Atlantic Coas$","North Brazil South Atlantic Coast",name),
name = gsub("^Uruguay - Brazil South Atlantic$","Uruguay - Brazil South Atlantic Coast",name),
name = gsub("^North Argentina South Atlantic C$","North Argentina South Atlantic Coast",name),
name = gsub("^South Argentina South Atlantic C$","South Argentina South Atlantic Coast",name))
data_i_smoothed_raw <- data_i_raw
# print(paste0("window_type = ",window_type))
# if trail:
if (window_type == "trail"){
data_i_smoothed_raw <- data_i_raw %>%
dplyr::group_by_at(non_numeric_cols)%>%
dplyr::mutate(mean_value = zoo::rollmean(x = value, k = window_length, fill = NA, align = 'right')) %>%
dplyr::ungroup()
}
# adjust
# if surrounding
if(window_type == "surround"){
data_i_smoothed_raw <- data_i_raw %>%
dplyr::group_by_at(non_numeric_cols)%>%
dplyr::mutate(mean_value=zoo::rollmean(x = value,k=window_length,fill=NA, align = 'center')) %>%
dplyr::ungroup()
}
# Replace NA values with initial and final data
data_i_smoothed_all <- data_i_smoothed_raw %>%
dplyr::group_by_at(non_numeric_cols) %>%
dplyr::mutate(mean_val_complete = zoo::na.approx(mean_value,na.rm=F,rule=2)) %>%
dplyr::ungroup(); data_i_smoothed_all
# Remove intermediate columns
data_i_smoothed <- data_i_smoothed_all %>%
dplyr::select(-value,-mean_value)%>%
dplyr::rename(value=mean_val_complete); data_i_smoothed
data_i_smoothed_long = data_i_smoothed
#...............
# Turn long format back into wide format
#...............
if(grepl("wide",output_type,ignore.case = T)){
data_i_smoothed = data_i_smoothed %>%
tidyr::spread(key="x", value ="value")}
#...............
# Save outputs
#...............
if (save == TRUE){
# if data_i is a character, then we can save it this way:
if (class(data_i) == "character"){
if(is.null(filename)){
fname_raw_i = paste0(dirx,"/",gsub(".csv","",basename(data_i)))
} else {
fname_raw_i = paste0(dirx,"/",basename(filename))
}
}
else {
fname_raw_i = paste0(dirx,"/smoothed_data")
}
fname_i = paste0(gsub(".csv","",fname_raw_i),"_window",window_length,window_type,".csv")
# file name for new .csv file
data.table::fwrite(x=data_i_smoothed,file=fname_i)
print(paste0("File saved as ",fname_i))
}
data_smoothed[[i]] = data_i_smoothed
}
#...............
# Produce Diagnostics
#...............
if (diagnostics == TRUE) {
print("Starting diagnostics...")
if(!dir.exists("diagnostics_smooth")){dir.create("diagnostics_smooth")}
data_diagnostic <- data_i_raw %>%
dplyr::mutate(data="raw") %>%
dplyr::bind_rows(data_i_smoothed_long %>%
dplyr::mutate(data="smoothed")); data_diagnostic
# Diagnostics in Groups of 50
if(is.null(diagnostics_col)){
col_name <- non_numeric_cols[1]
} else {
if(any(diagnostics_col %in% non_numeric_cols)){
col_name <- diagnostics_col
} else {
col_name <- non_numeric_cols[1]
}
}; col_name
diagnostics_n = min(diagnostics_n, length(unique(data_diagnostic[[col_name]]))); diagnostics_n
groups_n = ceiling(length(unique(data_diagnostic[[col_name]]))/diagnostics_n); groups_n
for(i in 1:groups_n){
fname_diagnostics_i = paste0(dirname(fname_raw_i),
"/diagnostics_smooth/",
basename(fname_raw_i),"_window",
window_length,window_type,"_",i,".png");fname_diagnostics_i
lower_n = ((i-1)*diagnostics_n)+1; lower_n
upper_n = (i*diagnostics_n); upper_n
p1 <- ggplot2::ggplot(data = data_diagnostic %>%
dplyr::filter(!!as.symbol(col_name) %in% unique(data_diagnostic[[col_name]])[lower_n:upper_n]),
ggplot2::aes(x = x, y = value, group = data)) +
ggplot2::geom_line(ggplot2::aes(color=data)) +
ggplot2::facet_wrap(as.formula(paste0(". ~ ",col_name)),scales="free_y") +
ggplot2::ggtitle(paste0(col_name," window length = ", window_length, " ", window_type)) +
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90,vjust=0.5)) +
ggplot2::scale_x_discrete(limits=unique(data_diagnostic$x),
breaks=pretty(unique(data_diagnostic$x),n=10)); p1
ggplot2::ggsave(filename = fname_diagnostics_i,
plot = p1,
width = 13*min(3,max(1,diagnostics_n/20)),
height = 10*min(3,max(1,diagnostics_n/20))) # save plot
print(paste0("Diagnostic figure saved as ",fname_diagnostics_i))
}
print("Diagnostics complete.")
} # Close diagnostics
#...............
# Return Results
#.............
if(length(data_smoothed)==1){
data_smoothed = data_smoothed[[1]]
}
print("Smoothing complete.")
return(data_smoothed)
} # Close smooth function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.