R/macro_helpers_aggregation.R

#' @title Generate baseline aggregations
#' 
#' @description Generates baseline aggregations to compare with the
#'   relevance-adjusted ones.
#' 
#' @details
#' Generates the following baseline aggregations: 
#' * Equal weight: gives equal weight to all models.
#' * Gewisano: weights the models using optimal linear prediction pools (Geweke & Amisano).
#' 
#' @param atomic_df A data frame consisting of atomic predictions.
#' @param start_agg Which time point to start creating aggregate
#'   predictions for. Note that this is based on the variable t in the
#'   supplied data frame, and not the row number.
#' @import data.table
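#' @examples
#' \dontrun{
#' # A minimal sketch with made-up atomic predictions from two
#' # hypothetical models ("m1" and "m2"); in practice the atomic data
#' # frame is produced by the prediction-generating functions in this
#' # package.
#' atomic <- data.frame(
#'     t = rep(1:20, times = 2),
#'     pmean = rnorm(40),
#'     lpdens = log(runif(40, 0.05, 1)),
#'     method = rep(c("m1", "m2"), each = 20)
#' )
#' gen_baseline(atomic, start_agg = 10)
#' }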

gen_baseline <- function(atomic_df, start_agg) {

    lpdens <- pmean <- NULL

    baseline_df <- gen_atomic_df()

    # Giving all models equal weight
    df_all <- data.table::data.table(atomic_df)
    df_equal_wt <- df_all[
        t >= start_agg,
        list(
            pmean = mean(pmean),
            lpdens = log(mean(exp(lpdens))),
            method = "equal_wt",
            t
        ),
        by = list(t)][, 2:5]
    baseline_df <- rbind(baseline_df, df_equal_wt)
    ytrue <- rep(NA, nrow(baseline_df))
    baseline_df <- cbind(baseline_df, ytrue)

    # Optimal prediction pools (Geweke & Amisano)
    df_gewisano <- gen_gewisano(data = atomic_df, start_agg)
    baseline_df <- rbind(baseline_df, df_gewisano)

    return(baseline_df)
}

#' @title Generate relevance-adjusted aggregations
#' 
#' @description
#' Generates relevance-adjusted aggregations given a data set with
#' relevance-adjusted log scores. This is a dispatcher that passes the
#' data on to either the propto or the select-best weighting function
#' (the latter is currently not in active use).
#' 
#' @param RAL_data Data set containing relevance-adjusted log scores.
#' @param agg_meth Which method to use to aggregate the agents based on
#'   their relevance-adjusted log scores. Currently available are
#'   propto, which uses a softmax transformation, and select_best,
#'   which picks the model with the best RAL at each time point.
#' @param sim_measure Which similarity measure was used to calculate
#'   the weights (caliper or mahala, i.e. Mahalanobis).
#' 
#' @import data.table
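#' @examples
#' \dontrun{
#' # Sketch: `ral_df` is assumed to be the output of RAL_calculator(),
#' # i.e. atomic predictions joined with a RAL column.
#' gen_RAA(ral_df, agg_meth = "propto", sim_measure = "caliper")
#' gen_RAA(ral_df, agg_meth = "select_best", sim_measure = "mahala")
#' }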

gen_RAA <- function(RAL_data, agg_meth, sim_measure) {
    df_RAL <- switch(
        agg_meth,
        "propto" = propto_weighting(RAL_data, sim_measure),
        "select_best" = selbest_weighting(RAL_data, sim_measure),
        stop("Unknown aggregation method.")
    )
    return(df_RAL)
}


#' @title Weighting proportional to LPA (local predictive ability)
#' 
#' @param data Data set containing atomic predictions together with
#'   their relevance-adjusted log scores (column RAL).
#' @param sim_measure Similarity measure used to calculate the weights
#'   (caliper or mahala); only used to label the resulting method.
#' 
#' @import data.table
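#' @examples
#' \dontrun{
#' # Sketch with a tiny hand-made table of relevance-adjusted log
#' # scores (all values are made up):
#' d <- data.table::data.table(
#'     t = c(10, 10, 11, 11),
#'     method = c("m1", "m2", "m1", "m2"),
#'     pmean = c(0.1, 0.3, 0.2, 0.4),
#'     lpdens = c(-1.2, -0.8, -1.5, -0.9),
#'     RAL = c(-3, -2, -4, -1)
#' )
#' propto_weighting(d, sim_measure = "caliper")
#' }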

propto_weighting <- function(data, sim_measure) {

    pmean <- RAL <- lpdens <- NULL
        
    method_name <- sprintf("%s_propto", sim_measure)

    # Softmax weighting: each model's weight is proportional to
    # exp(RAL); pmean is the weighted mean and lpdens the log of the
    # weighted predictive density.
    df_RAL <- data[,
        list(
            pmean = sum(pmean * exp(RAL)) / sum(exp(RAL)),
            lpdens = log(
                        sum(
                            exp(lpdens) * exp(RAL) / sum(exp(RAL))
                        )
                    ),
            method = method_name,
            t,
            ytrue = NA
        ),
        by = list(t)
    ][, -1]
        
    return(df_RAL)
}


#' @title Weighting by picking the best model
#' 
#' @param data Data set containing atomic predictions together with
#'   their relevance-adjusted log scores (column RAL).
#' @param sim_measure Similarity measure used to calculate the weights
#'   (caliper or mahala); only used to label the resulting method.
#' 
#' @import data.table
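#' @examples
#' \dontrun{
#' # Sketch: pick, at each time point, the row of the model with the
#' # highest relevance-adjusted log score (values are made up):
#' d <- data.table::data.table(
#'     t = c(10, 10, 11, 11),
#'     method = c("m1", "m2", "m1", "m2"),
#'     pmean = c(0.1, 0.3, 0.2, 0.4),
#'     lpdens = c(-1.2, -0.8, -1.5, -0.9),
#'     RAL = c(-3, -2, -4, -1)
#' )
#' selbest_weighting(d, sim_measure = "mahala")
#' }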

selbest_weighting <- function(data, sim_measure) {

    RAL <- NULL

    method_name <- sprintf("%s_selbest", sim_measure)

    df_RAL <- data[data[, .I[RAL == max(RAL)], by = t]$V1]
    df_RAL$method <- method_name
    df_RAL[, RAL := NULL]

    return(df_RAL)
}

#' @title Local log score calculator
#' 
#' @description
#' Calculates the local (relevance-adjusted) log scores given a weight
#' data frame and a data frame of atomic predictions.
#' 
#' @details 
#' Starts with a weight data frame of the form
#' \tabular{rrr}{
#'   \strong{t} \tab \strong{t2} \tab \strong{similarity} \cr
#'   730 \tab 729 \tab 1 \cr
#'   730 \tab 728 \tab 0 \cr
#'   730 \tab 727 \tab 1 \cr
#' }
#' where, for each time point t, the column similarity gives the
#' relevance weight of every earlier time point t2.
#' 
#' @param weight_df Data frame containing relevance-adjusted weights,
#'   generated by a relevance function (currently caliper or
#'   Mahalanobis).
#' @param atomic_df Data frame with atomic predictions from any number
#'   of models.
#' @param pratig If TRUE, the function runs in a debug mode where extra
#'   information is printed and returned.
#' 
#' @import data.table
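#' @examples
#' \dontrun{
#' # Sketch: 0/1 caliper-style relevance weights for t = 5 and t = 6,
#' # together with made-up atomic predictions from two hypothetical
#' # models.
#' wdf <- data.table::data.table(
#'     t = rep(c(5, 6), each = 4),
#'     t2 = c(1:4, 2:5),
#'     similarity = c(1, 0, 1, 1, 0, 1, 1, 1)
#' )
#' adf <- data.frame(
#'     t = rep(1:6, times = 2),
#'     pmean = rnorm(12),
#'     lpdens = log(runif(12, 0.05, 1)),
#'     method = rep(c("m1", "m2"), each = 6)
#' )
#' RAL_calculator(wdf, adf)
#' }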

RAL_calculator <- function(weight_df, atomic_df, pratig = FALSE) {
    
    t2 <- similarity <- lpdens <- method <- adj_lpdens <- NULL

    start_agg <- min(weight_df$t)
    stop_agg <- max(weight_df$t)

    atomic_df <- data.table::data.table(atomic_df)
    
    # Empty, typed table to collect the relevance-adjusted log scores (RAL)
    dfdf <- data.table::data.table(
        method = character(), RAL = numeric(), t = numeric()
    )
    
    for (i in start_agg:stop_agg) {
        sim_df <- weight_df[t == i, list(t = t2, similarity)]
        atomic_sub <- atomic_df[t < i, .(lpdens, method, t)]
        #data.table::setkey(sim_df, t)
        #data.table::setkey(atomic_sub, t)
        #df_all <- atomic_sub[sim_df]
        df_all <- merge(atomic_sub, sim_df, by.x = "t", by.y = "t")
        df_all[, `:=`(adj_lpdens = lpdens * similarity)]
        collapsed <- df_all[, list(RAL = sum(adj_lpdens)), by = list(method)]
        collapsed[, `:=`(t = i)]
        dfdf <- rbind(dfdf, collapsed)
    }

    keycols <- c("method", "t")
    data.table::setkeyv(dfdf, keycols)
    data.table::setkeyv(atomic_df, keycols)
    joined_df <- atomic_df[dfdf]
    return(joined_df)
}

#' @title Generate GewiSano Weights
#' 
#' @description
#' Generates weights according to Geweke & Amisano (2011/2012), also
#' known as linear prediction pools. The data must contain one row per
#' model and time point, with the columns t, method, pmean and lpdens.
#' 
#' @param data Data set of atomic predictions.
#' @param start_t Which time point to start generating weights for.
#'   Note that this is based on the variable t in the data, not the row
#'   number.
#' @param pratig If TRUE, the function prints the weights at each step
#'   and also returns them: the return value changes from a single data
#'   frame to a list with the aggregation data frame as the first
#'   element and a matrix of weights as the second. For this reason,
#'   pratig has to be FALSE when the function is called from one of the
#'   meta-functions that generate predictions.
#' @import data.table
#' @importFrom pracma fmincon
#' @export
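#' @examples
#' \dontrun{
#' # Sketch: pool the predictive densities of two hypothetical models,
#' # estimating the pooling weights recursively from t = 30 onwards
#' # (all values are made up).
#' atomic <- data.frame(
#'     t = rep(1:40, times = 2),
#'     pmean = rnorm(80),
#'     lpdens = log(runif(80, 0.05, 1)),
#'     method = rep(c("m1", "m2"), each = 40)
#' )
#' gen_gewisano(atomic, start_t = 30, pratig = TRUE)
#' }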

gen_gewisano <- function(data, start_t, pratig = FALSE) {

    # Negative log predictive score of the linear pool with weights x,
    # where dataopt holds the log predictive densities of each model
    # (columns) at each time point (rows). Minimised by fmincon below.
    fun_to_opt <- function(x, dataopt) {
        -sum(log(exp(dataopt) %*% x))
    }

    data <- data.table::data.table(data)
    df_fat <- data.table::dcast(data, t ~ method, value.var = "lpdens")
    # print(colnames(df_fat)) # Debugging to check the order
    k <- ncol(df_fat) - 1
    start_val <- rep((1 / k), k)
    tt <- seq(start_t, max(data$t))
    wts <- matrix(ncol = k, nrow = length(tt))

    j <- 0
    for (i in start_t:(max(data$t))) {
        j <- j + 1
        vecop <- as.matrix(df_fat[df_fat$t <= i, 2:(k + 1)])
        opt_sol <- pracma::fmincon(start_val, fun_to_opt, Aeq = matrix(1, 1, k),
                beq = 1, lb = rep(0.0001, k), ub = rep(1, k), dataopt = vecop)
        wts[j, ] <- opt_sol$par
        if (pratig) print(wts[j, ])
    }

    lpdens <- log(rowSums(exp(df_fat[df_fat$t >= start_t, 2:(k + 1)]) * wts))
    df_pmean <- data.table::dcast(data, t ~ method, value.var = "pmean")
    pmean <- rowSums(df_pmean[df_pmean$t >= start_t, 2:(k + 1)] * wts)
    
    method <- rep("gewisano", length(tt))
    ytrue <- rep(NA, length(tt))
    df_gew_res <- data.frame(pmean, lpdens, method, tt, ytrue)
    colnames(df_gew_res) <- c("pmean", "lpdens", "method", "t", "ytrue")
    df_gew <- gen_atomic_df()
    df_gew <- rbind(df_gew, df_gew_res)
    wts <- data.frame(tt, wts)

    if (pratig) {
        return(list(df_gew, wts))
    } else {
        return(df_gew)
    }
}