### Code for conversion to seasonal data:
#' Auxiliary function for decoding season-strings
#' @param seasons A character vector of season strings, see [convert_monthly_to_seasonal()] for details
season_strings_to_int = function(seasons = c("MAM", "JJAS", "OND")){
full_string = "JFMAMJJASONDJFMAMJJASOND" # need it twice to cover wrapping over the end of the year
indices = c(1:12,1:12)
locations = stringr::str_locate(full_string,seasons)
month_matrix = matrix(indices[locations],ncol = 2, byrow = FALSE)
which_na = which(is.na(month_matrix[,1]))
if(length(which_na) > 0) stop(paste0("The following strings are not recognized seasons: ",paste(seasons[which_na], collapse = ", ")))
return(month_matrix)
}
#' Convert a data table from monthly to seasonal format
#'
#' Converts monthly to seasonal data. The function default values are set for precipitation. In particular, default behavior is to *sum*
#' values over all months contained in a season. If you want to average instead (for temperature, for example), you can change the aggregation function `FUN`.
#'
#' Note that it is impossible to derive seasonal tercile categories from monthly ones (and similar for seasonal tercile forecasts). For getting these, you should convert to seasonal
#' *before* deriving the tercile categories or forecasts, e.g. by using [add_tercile_cat()] or [tfc_from_efc()].
#'
#' @details
#' Seasons are provided as sequences of capitalized initial characters of the months they contain, e.g. `'MAM'` for March-April-May.
#' They can have any length from 1 to 12 months and are allowed to overlap and wrap over the end of the year
#' (for example, you can provide `seasons = c('OND', 'NDJ')` to derive values for October-December and November-January).
#' If a season includes months from 2 years, it gets assigned the year of its starting month. For example, `season = 'NDJ'` and `year = 2000` refers to values for the season November 2000 to January 2001.
#'
#' Factor- or Character-valued columns cannot be aggregated to seasonal values. If `vars` contains columns that are factor- or character-valued, it checks whether they take a unique value for each grouping level
#' provided in `by`. If yes, they are kept, else they are discarded. A typical case where this is useful is when your data table contains country names (see [add_country()]).
#' The grouping levels usually include `'lon'`, `'lat'`, so there is only one country name per grouping level and the name is kept.
#'
#' @param dt A data table containing the values for conversion.
#' @param vars Character vector of names of the columns containing the values for conversion. Default is to try converting everything that is not contained in `by` and that is not recognized as tercile category (see [tc_cols()]) or
#' tercile forecast (`'below'`, `'normal'`, `'above'`).
#' @param by Character vector of column names to group by. Separate values are derived for each unique combination of values in `by`.
#' Defaults to all [dimvars()] that are not `'month'`, which is usually what you want.
#' @param FUN function for aggregation.
#' @param ... arguments passed to `FUN`, for example `na.rm`.
#' @param seasons Vector of character strings specifying seasons. See details. Defaults to `c('MAM', 'JJAS', 'OND')`, which are the seasons considered in the COFs for the Greater Horn of Africa.
#' @param only_complete_seasons Logical. If `TRUE`, only values are kept for which we have data for all months. For example, no `OND` values would be derived if the data for December is missing.
#'
#' @export
#'
#' @examples
#' # returns empty data table, because the example data does not contain data for a full season:
#' dt = convert_monthly_to_seasonal(chirps_monthly)
#'
#' # remove terc_cat first to avoid the warning, and set season to the months we actually have data for:
#' dt2 = convert_monthly_to_seasonal(chirps_monthly[,terc_cat := NULL], seasons = c('ND'))
#' print(dt2)
#'
#' # season OND, get monthly averages rather than sums, and force the function to derive values
#' # even though we do not have October-data:
#' dt3 = convert_monthly_to_seasonal(chirps_monthly,
#' seasons = c('OND'),
#' FUN = mean,
#' only_complete_seasons = FALSE)
#' print(dt3)
convert_monthly_to_seasonal = function(dt,
vars = NULL,
by = NULL,
FUN = sum,
...,
seasons = c("MAM", "JJAS", "OND"),
only_complete_seasons = TRUE)
{
season_present = 'season' %in% names(dt)
if(!('month' %in% names(dt)) & ('season' %in% names(dt))){
message('The data table seems to be in seasonal format already.')
return(dt)
}
if(!('month' %in% names(dt)) & !('season' %in% names(dt))) stop('Your data needs to contain a column named "month".')
if(is.null(by)) by = setdiff(dimvars(dt), c('month',lt_cols()))
if(is.null(vars)) vars = setdiff(names(dt),c(by,'month',lt_cols()))
# checks:
if('above' %in% vars)
{
vars = setdiff(vars, c('below','normal','above'))
warning(paste('The data contains tercile probabilities.',
'I cannot derive seasonal tercile probabilities from monthly ones.',
'If the tercile probabilities were derived from an ensemble forecast,',
'please convert the forecasts to seasonal BEFORE extracting tercile probabilities.',
'I remove "below", "normal", and "above" from the columns that are converted.',sep = '\n'))
}
if(any(tc_cols() %in% vars))
{
vars = setdiff(vars, tc_cols())
warning(paste('The data contains tercile categories.',
'I cannot derive seasonal tercile categories from monthly ones.',
'Please derive tercile categories AFTER converting to seasonal values.',
sep = '\n'))
}
if(any(!vars %in% names(dt))){
warning('Not all provided vars are columns in dt.')
vars = intersect(vars, names(dt))
}
# Subset vars to columns that contain numeric values.
is_numLike = c()
for(i in seq_along(vars))
{
is_numLike = c(is_numLike, (is.numeric(dt[,get(vars[i])])| is.logical(dt[,get(vars[i])])))
}
if(sum(is_numLike) < length(is_numLike)) # at some point, we might want to get
{
nnl_cols = vars[!is_numLike]
# check whether these columns contain the same value for each level in by, then we can just carry them along.
# Technically we could even take them along if they have the same value for each level in by and each season (which is weaker)
# but that's more work to implement and likely not required.
check = identical(unique(dt[,.SD,.SDcols = by]),
unique(dt[,.SD,.SDcols = c(by,nnl_cols)])[,.SD,.SDcols = by])
if(check) by = c(by, nnl_cols)
if(!check){
warning(paste0("The columns ",paste(nnl_cols,collapse = ", "),' are non-numeric and cannot be converted to seasonal format.'))
}
}
vars = vars[is_numLike]
if(length(vars) == 0) stop("No variables found that could be converted.")
### Warn if predictions are detected ###
# Predictions are more messy because of the leadtime-dimension. This is currently not handled by the function
# Scenario 1: dt contains forecasts but not at several leadtimes
cns_indicating_forecasts = c('fc','forecast','member','pred','prediction')
if(any(cns_indicating_forecasts %in% names(dt)) | (any(lt_cols() %in% names(dt)))){
warning(paste(paste0('Your data table contains columns named ',
paste(c(intersect(cns_indicating_forecasts, names(dt)),
intersect(lt_cols(), names(dt))), collapse = ', '),'.'),
'This seems to indicate that it contains forecasts, and I assume they are valid for the month given by the columns "year" and "month".',
'Note that for seasonal forecasts we want different lead times for different months:',
'For example, a prediction initialized in February for the March-April-May-season requires forecasts with leadtime 1 month for March,',
'2 months for April, and 3 months for May. ' ,sep = '\n'))
}
if(any(lt_cols() %in% names(dt))){
ltcol = which(lt_cols() %in% names(dt))
test = unique(dt[,.SD,.SDcols = c('month',ltcol)])
multiple_leadtimes_per_month = !identical(sort(test[, month]), sort(unique(dt[,month])))
if(multiple_leadtimes_per_month){
stop(paste('Your data table contains forecasts for the same month at several lead times.',
'This is not supported by the convert_monthly_to_seasonal function, which currently does not find the right leadtimes for you.', sep = '\n'))
}
}
season_ints = season_strings_to_int(seasons)
return_data = list()
for(i in seq_along(seasons)){
lims = season_ints[i,]
# get months of the season:
if(lims[1] < lims[2]) {
months = lims[1]:lims[2]
which_months_shift_year = integer(0)
}
# the case where the season wraps around the end of the year:
if(lims[2] < lims[1]) {
lims[2] = lims[2] + 12
months = lims[1]:lims[2]
which_months_shift_year = months[months > 12] - 12 # year is always the year of the first month contained in the season
months[months > 12] = months[months > 12] - 12
}
if(only_complete_seasons)
{
dt_temp = dt[month %in% months][, N := .N, by = by]
dt_temp[month %in% which_months_shift_year, year := year - 1]
dt_new = dt_temp[N == length(months), lapply(.SD,FUN,...), .SDcols = vars, by = by] # this is weirdly ordered but whatever... Why is this taking so freaking long?
dt_new[,season := seasons[i]]
return_data[[i]] = dt_new
} else {
dt_temp = dt[month %in% months]
dt_temp[month %in% which_months_shift_year, year := year - 1]
dt_new = dt_temp[,lapply(.SD,FUN,...), .SDcols = vars, by = by] # this is weirdly ordered but whatever... Why is this taking so freaking long?
dt_new[,season := seasons[i]]
return_data[[i]] = dt_new
}
}
return_data = rbindlist(return_data)
message(paste0('The data in the following columns is aggregated:\n',
paste(vars,collapse = ', ')))
return(return_data)
}
#' Adjust Lead Time for Seasonal Forecasts
#'
#' This function adjusts the lead time values from monthly data for the seasonal format.
#'
#' @param lead_time A numeric vector of lead time values.
#' @param month A numeric vector of month values.
#'
#' @details
#' The function adjusts the lead time values by subtracting the difference between the month value and the corresponding starting month of the season.
#' The adjustment is performed for the months 3-5 (March to May), 6-9 (June to September), and 10-12 (October to December).
#' For example, if the month is March (3), the lead time is adjusted by subtracting (3 - 3) = 0.
#' This adjustment aligns the lead time values with the seasonal forecasts.
#'
#'
#' @return A numeric vector of lead time values adjusted for seasonal forecasts.
#' @export
lead_time_conversion = function(lead_time,month)
{
message('lead times are adjusted for seasonal forecasts.')
lead_time[month %in% 3:5] = lead_time[month %in% 3:5] - (month[month %in% 3:5] - 3)
lead_time[month %in% 6:9] = lead_time[month %in% 6:9] - (month[month %in% 6:9] - 6)
lead_time[month %in% 10:12] = lead_time[month %in% 10:12] - (month[month %in% 10:12] - 10)
return(lead_time)
}
# convert to seasonal:
# takes a vector of months and returns a vector of corresponding seasons
#' Getting seasons from season
#'
#' @description takes a vector of months and returns a same-length vector of seasons
#'
#' @param mm vector of months
#'
#' @export
season = function(mm)
{
ret = rep(NA_character_,length(mm))
ret[mm %in% 3:5] = 'MAM'
ret[mm %in% 6:9] = 'JJAS'
ret[mm %in% 10:12] = 'OND'
ret = factor(ret,levels = c('MAM','JJAS','OND')) # for correct ordering
return(ret)
}
#' Getting months from season
#'
#' @description takes a vector of up to three seasons and returns the months in them
#'
#' @param ss vector containing one or multiple season names ('MAM', 'JJAS', 'OND')
#'
#' @export
season_months = function(ss)
{
ss = unique(as.character(ss))
ret = NULL
for (s in ss)
{
if(s == 'MAM') ret = c(ret,3:5)
if(s == 'JJAS') ret = c(ret,6:9)
if(s == 'OND') ret = c(ret,10:12)
}
return(ret)
}
#' Convert Data to Seasonal Format
#'
#' This function converts data to seasonal format by aggregating monthly values.
#'
#' @param dt The input data table.
#' @param vars A character vector specifying the columns to be converted (optional).
#' @param by A character vector specifying the grouping variables (optional).
#' @param FUN The function to be used for aggregation (default is \code{sum}).
#' @param subset logical. If TRUE (default), then we expect three entries per level of by in MAM and OND, and 4 levels in JJAS.
#' The data is subset to all levels of by satisfying this condition. This is important if your data may be incomplete for some
#' levels. For example, if you have monthly precip data and it ends in October 2023, then the OND-2023 precip entries would just be the October values,
#' because the function simply sums over all OND 2023 values it finds. This is usually not what you want, you'd prefer OND 2023 to be suppressed, which
#' is the functions behavior when subset = TRUE.
#'
#' @details
#' The function checks if the input data table is already in seasonal format.
#' If not, it requires the data table to contain a column named "month" for conversion.
#' The function then aggregates the data by season based on the month column and the specified grouping variables.
#' The specified columns are reduced to those that are numeric-like (numeric or logical).
#' The function returns a new data table with the aggregated values.
#'
#'
#' @return A data table in seasonal format with aggregated values.
#' @export
seasonal = function(dt,vars = NULL,by = NULL,FUN = sum, subset = TRUE)
{
season_present = 'season' %in% names(dt)
if(!('month' %in% names(dt)) & season_present){
message('The data table seems to be in seasonal format already.')
return(dt)
}
if(!('month' %in% names(dt)) & !season_present) stop('For converting to seasonal format, your data needs to contain the column "month"')
if(!season_present) dt[,season := season(month)]
if(is.null(by)) by = setdiff(dimvars(dt),'month')
if(is.null(vars)) vars = setdiff(names(dt),c(by,'month'))
# checks:
if('above' %in% vars)
{
stop(paste0('The data contains tercile probabilities.\n',
'I cannot derive seasonal tercile probabilities from monthly ones.\n',
'Please convert your forecasts to seasonal format BEFORE getting the tercile probabilities.'))
}
# reduce to vars that are numLike:
is_numLike = c()
for(i in seq_along(vars))
{
is_numLike = c(is_numLike, (is.numeric(dt[,get(vars[i])])| is.logical(dt[,get(vars[i])])))
}
if(sum(is_numLike) < length(is_numLike)) # at some point, we might want to get
{
message(paste0('The following columns could not be converted:',
paste(vars[which(!is_numLike)]),collapse = ', '))
vars = vars[-which(!is_numLike)]
}
if(length(vars) == 0) stop('Nothing to convert.')
# fix lead times for seasonal data:
lt = ('lead_time' %in% names(dt))
if(lt) {
dt[,lead_time_new := lead_time_conversion(lead_time,month)]
if('lead_time' %in% by)
{
by = c(setdiff(by,'lead_time'),'lead_time_new')
}
}
if(subset)
{
dt[,N := .N, by = by]
check_n = function(season) return(3 + 1*(season == 'JJAS'))
dt_new = dt[!is.na(season)][N == check_n(season),lapply(.SD,FUN,na.rm = TRUE),.SDcols = vars,by = by] # this is weirdly ordered but whatever... Why is this taking so freaking long?
dt[,N:= NULL]
} else {
dt_new = dt[!is.na(season)][,lapply(.SD,FUN,na.rm = TRUE),.SDcols = vars,by = by] # this is weirdly ordered but whatever... Why is this taking so freaking long?
}
if(!season_present) dt[,season := NULL]
if(lt)
{
setnames(dt_new,'lead_time_new','lead_time')
dt[,lead_time_new := NULL]
}
message(paste0('The data in the following columns is aggregated:\n',
paste(vars,collapse = ', ')))
return(dt_new)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.