#' Aggregates or disaggregates a data frame
#'
#' Aggregates or disaggregates the values of a data frame according to a mapping
#'
#' By default `"weights"` is set to `NULL`. For aggregations, this means that
#' values will be summed as they are. For disaggregations, each component of the
#' larger category will take the same value as the larger category, or
#' `"scaleWeights"` is `TRUE`, each component will be given an even weight. For
#' aggregations, `"weights"` can also be the name of a variable contained in
#' `"data"`. `"weights"` may also be a data frame.
#'
#'
#' @param data a data frame.
#' @param mapping a data frame connecting the resolution in `"data"` and the
#' wished resolution
#' @param by (named) vector giving the correspondence between the column name
#' of `"data"` and of the `"mapping"`
#' @param subset2agg subset of variables for which the (dis)aggregation is
#' applied. If `NULL` (the default), the (dis)aggregation is applied to all
#' variables.
#' @param only.new If `FALSE` (the default), add the (dis)aggregated data frame
#' to existing ones. If `TRUE`, return only the (dis)aggregated data frame.
#' @param na.rm If `TRUE` (the default), remove items calculated as `NA`.
#' @param weights a data frame, a variable name as a character vector, or
#' `NULL`. See details.
#' @param forceAggregation binary. If `TRUE`, (dis)aggregation will be applied
#' even though the items contained contained in the data and in the data do
#' not fully match. The data is reduced to the items covered both by the
#' mapping and the data.
#' @param autodetect this parameter takes the values `'auto'`, `'aggregate'` or
#' `'disaggregate'`. If `'auto'` (the default) the function tries to
#' auto-detect whether this is an aggregation or a disaggregation. If
#' `'aggregate'`, it will aggregate, if `'disaggregate'`, it will
#' disaggregate.
#' @param scaleWeights logical. If `TRUE`, weights are scaled so that the sum of
#' the components equals the value of the larger category.
#' @param variable Column name of variables. Defaults to `"variable"`.
#' @param value Column name of values. Defaults to `"value"`.
#' @param unit Column name of units. Defaults to `"unit"`.
#' @param weight_val_col name of the value column in the `"weigths"` data frame,
#' if `"weigths"` is a data frame
#' @param weight_item_col name of the item column in the `"weigths"` data frame,
#' if `"weigths"` is a data frame. The item column is the column corresponding
#' to the mapping
#' @param fun aggregation function to use. Defaults to `sum`.
#'
#' @return A data frame.
#'
#' @examples
#' library(tidyr)
#' library(dplyr)
#' data <- inline.data.frame(c(
#' "model; scenario; region; variable; unit; period; value",
#' "REMIND; Baseline; USA; GDP per Capita|MER; US$2005/yr; 2010; 40000",
#' "REMIND; Baseline; USA; GDP per Capita|MER; US$2005/yr; 2020; 50000",
#' "REMIND; Baseline; USA; Population; million; 2010; 300",
#' "REMIND; Baseline; USA; Population; million; 2020; 350",
#' "REMIND; Baseline; CHN; GDP per Capita|MER; US$2005/yr; 2010; 7000",
#' "REMIND; Baseline; CHN; GDP per Capita|MER; US$2005/yr; 2020; 8000",
#' "REMIND; Baseline; CHN; Population; million; 2010; 1300",
#' "REMIND; Baseline; CHN; Population; million; 2020; 1400"))
#'
#' mapping = inline.data.frame(c(
#' "region; New_region",
#' "USA; GLO",
#' "CHN; GLO"
#' ))
#'
#' mapping2 = inline.data.frame(c(
#' "Item ; Item_new",
#' "Population; Urban Population ",
#' "Population; Rural Population"
#' ))
#'
#' weights = inline.data.frame(c(
#' "region; itemI ; weight",
#' "USA ; Urban Population; 0.5",
#' "USA ; Rural Population; 0.2",
#' "CHN ; Urban Population; 2",
#' "CHN ; Rural Population; 1"
#' ))
#'
#' #Regional Aggregation
#' aggregate_map(data,mapping, by = "region", subset2agg = c("Population"))
#'
#' #Regional Weighted Aggregation
#' aggregate_map(data,mapping, by = "region", subset2agg = "GDP per Capita|MER",
#' weights = "Population")
#'
#' #Variable Weigthed Disaggregation
#' aggregate_map(data,mapping2, by = c("variable" = "Item"),
#' subset2agg = c("Population"),weights = weights,
#' weight_val_col = "weight", weight_item_col = "itemI")
#'
#'
#' @author Antoine Levesque
#'
#' @importFrom stats setNames
#' @export
aggregate_map <- function(data,
mapping,
by,
subset2agg = NULL,
only.new = TRUE,
na.rm = TRUE,
weights = NULL,
forceAggregation = FALSE,
autodetect = "auto",
scaleWeights = TRUE,
variable = "variable",
value = "value",
unit = "unit",
weight_val_col = "weight_val_col",
weight_item_col = NULL,
fun = sum){
.colnames = colnames(data)
.mapnames = colnames(mapping)
.bynames = names(by)
if (is.null(.bynames)) .bynamesleft = by else .bynamesleft = unname(c(.bynames[.bynames != ""], by[.bynames == ""]))
.mapnamesright = setdiff(.mapnames, by)
#------- Internal Function -----------------------
scale_weights = function(weight_data,mapping_df, DetailedColumn,.by , value_col,na.rm_val){
if(identical(DetailedColumn,.by)){
DetailedColumn_map = DetailedColumn
DetailedColumn_df = DetailedColumn
} else{
DetailedColumn_map = unname(.by[DetailedColumn])
DetailedColumn_df = DetailedColumn
}
.mapCoarseCol = setdiff(colnames(mapping_df), DetailedColumn_map)
.cols = setdiff(c(colnames(weight_data),.mapCoarseCol), c(value_col,DetailedColumn_df) )
res = weight_data %>%
left_join(mapping_df, by = .by) %>%
group_by(!!!syms(.cols)) %>%
mutate(!!sym(value_col) := !!sym(value_col)
/ sum(!!sym(value_col), na.rm = na.rm_val)) %>%
ungroup()
res[.mapCoarseCol] = NULL
return(res)
}
#------- Guardians -----------------------
if (!variable %in% .colnames) stop("No column '", variable, "' found'")
if (!value %in% .colnames) stop("No column '", value, "' found'")
if (!unit %in% .colnames) stop("No column '", unit, "' found'")
if (weight_val_col %in% .colnames) stop("'",weight_val_col, "' in the columns of the data, please chose another name for 'weight_val_col'")
if (!all(.bynamesleft %in% .colnames)) stop(paste(.bynamesleft, collapse = " "), " are not all in colnames(data)")
if (length(by) > 1) stop("currently, the function only disaggregates one column at a time")
if (length(setdiff(.mapnames, by)) > length(by) ) stop("there are more remaining columns in the mapping than the length of 'by'")
if (length(setdiff(.mapnames, by)) < length(by) ) stop("there are less remaining columns in the mapping than the length of 'by'")
if (!all(by %in% .mapnames) ) stop(paste(by, collapse = " "), " are not all in colnames(mapping)")
if (is.data.frame(weights)){
if(!(weight_val_col %in% colnames(weights))) stop("No column '", weight_val_col, "' found in the weights'")
if(!(weight_item_col %in% colnames(weights))) stop("No column '", weight_item_col, "' found in the weights'")
weight_item_col
}
if (.mapnamesright %in% .bynamesleft) stop("The mapping column name '",.mapnamesright,"' equals the data colum name '",.bynamesleft,"'. Please remove that ambiguity")
if (! autodetect %in% c("auto","disaggregate","aggregate")) stop("autodetect must take 'auto', 'disaggregate', 'aggregate")
#-------- Determine whether there is a "unit" column------------------
if (unit %in% .colnames){
UnitInData <- T
}else{
stop("No column '", unit, "' found.")
}
#-------- Determine whether this is an aggregation or a disaggregation -------
if(autodetect == "auto"){
aggregation = length(getColValues(mapping,by)) > length(getColValues(mapping,.mapnamesright))
} else if (autodetect == "aggregate"){
aggregation = T
} else if (autodetect == "disaggregate"){
aggregation = F
}
.nameDetailedColumn = ifelse(aggregation,.bynamesleft,.mapnamesright)
weights_character = is.vector(weights) & length(weights) == 1
#-------- Filter for variables used on rhs ------------------------------
#Ajouter weights si c'est character.
if (!is.null(subset2agg)){
.data <- data %>%
filter(!!sym(variable) %in% subset2agg)
} else{
.data <- data
}
#-------- Are the regions/variables in the data correctly covered by the mapping? --------------
items_map = getColValues(mapping,by)
items_data = getColValues(.data, .bynamesleft)
diff_map_data = setdiff(items_map, items_data)
diff_data_map = setdiff(items_data, items_map)
if (length(diff_map_data) > 0 || length(diff_data_map) > 0) {
message_mismatch = paste0("the number of regions/variables does not correspond: \n",
"these regions/variables are in the mapping but not in data : ",paste(diff_map_data, collapse = " "), "\n",
"these regions/variables are in data but not in the mapping : ", paste(diff_data_map, collapse = " "),"\n",
" Reduce to the same dimension")
if (!forceAggregation) {
stop(message_mismatch)}
else {
warning(message_mismatch)
inter_data_map = intersect(items_map, items_data)
.data = .data %>% filter(!!sym(.bynamesleft) %in% inter_data_map)
}
}
#---- Computation of the weights according to each case -----
if (aggregation){
if( is.null(weights)){
.weights_df = data.frame(getColValues(mapping,by), 1)
colnames(.weights_df) <- c(.nameDetailedColumn, weight_val_col)
} else if ( weights_character){
#considers the character vector points to a variable in the data frame
.weights_df <- data %>%
filter(!!sym(variable) %in% weights)
names(.weights_df)[names(.weights_df) == value] = weight_val_col
.weights_df = .weights_df %>% select(-!!sym(variable))
if (UnitInData) .weights_df = .weights_df %>% select(-!!sym(unit))
if (scaleWeights) .weights_df <- scale_weights(.weights_df,mapping,.nameDetailedColumn, by,weight_val_col,na.rm)
} else if (is.data.frame(weights)){
#does not scale the weights if this is a df
.weights_df <- weights
if (scaleWeights) .weights_df <- scale_weights(.weights_df,mapping,.nameDetailedColumn, by,weight_val_col,na.rm)
} else {
stop("class of weights is not supported for aggregation")
}
}else{
if (is.null(weights)){
.weights_df = data.frame(getColValues(mapping,.nameDetailedColumn), 1)
colnames(.weights_df) <- c(.nameDetailedColumn, weight_val_col)
if (scaleWeights) .weights_df <- scale_weights(.weights_df,mapping, .nameDetailedColumn,.nameDetailedColumn,weight_val_col,na.rm)
} else if (is.data.frame(weights)){
.weights_df <- weights
names(.weights_df)[names(.weights_df) == weight_item_col] = .nameDetailedColumn
if (scaleWeights) .weights_df <- scale_weights(.weights_df,mapping, .nameDetailedColumn,.nameDetailedColumn,weight_val_col,na.rm)
} else { stop("For disaggregation, only NULL or a data.frame is supported for weights")}
}
if (any(.weights_df[[weight_val_col]] == "Inf", na.rm = TRUE))
stop("Some elements of the weighting matrix are Inf. Infinite weights are not allowed!")
#---- Multiply the values by the weights, even if the weights = 1 ----
.colGroups_weight = intersect(c(.colnames,.mapnames), colnames(.weights_df))
.data = .data %>% left_join(mapping, by = by, relationship = "many-to-many") %>%
left_join(.weights_df, by = .colGroups_weight ) %>% #.nameDetailedColumn
mutate(!!sym(value) := !!sym(value) * !!sym(weight_val_col))
.data[weight_val_col] = NULL
#---- For aggregations, sum over the more detailed column
#---- For disaggregations, delete the less detailed column and rename the more detailed one.
if(aggregation){
.colGroups = setdiff(colnames(.data), c(.bynamesleft,value,weight_val_col))
.data = .data %>%
group_by(!!!syms(.colGroups)) %>%
summarise(!!sym(value) := fun(!!sym(value), na.rm = na.rm),
.groups = 'drop')
} else {
.data[.bynamesleft] = NULL
}
names(.data)[names(.data) == .mapnamesright] = .bynamesleft
if(!only.new ) .data = rbind(data, .data)
return(.data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.