Nothing
################################################################################
# R functions of the stodom R package #
# -----------------------------------------------------------------------------#
# 2022, Agroscope #
# developed by Sergei Schaub #
# e-mail: sergei.schaub@agroscope.admin.ch #
# first version: August, 15, 2022 #
# last update: August, 15, 2022 #
################################################################################
#===============================================================================
# setting
#===============================================================================
#' @import dplyr
#' @import tibble
#' @import ggplot2
#' @import pracma
#' @import tidyr
utils::globalVariables(c("x_axis", "n_var_a", "n_var_b", "pro_var_a", "pro_var_b",
"cum_var_a", "cum_var_b", "txtProgressBar", "setTxtProgressBar",
"statistic_observed", "larger", "dif_cum", "int_dif_cum",
"cum", "group", "cum_var_a_non_random"))
#===============================================================================
# first-order stochastic dominance
#===============================================================================
#' @title first-order stochastic dominance test
#'
#' @description This function tests for first-order stochastic dominance.
#' @usage fo_stodom(data_1, data_2, bins_size, n_draws, useed, variable_1, variable_2, type)
#' @param data_1 data 1.
#' @param data_2 data 2.
#' @param bins_size bin size.
#' @param n_draws number of draws to compute p values (default = 500).
#' @param useed user defined seed
#' @param variable_1 name of a (as a string); only for the output table (default = "a").
#' @param variable_2 name of b (as a string); only for the output table (default = "b").
#' @param type type of bootstrapped test, bootstrapping 1 and 2 of Barrett and Donald (2003) are available (default = "boot2").
#' @details This function computes the consistent test of first-order stochastic dominance following Barrett and Donald (2003). In detail, this function estimate their Kolmogorov-Smirnov type tests based on bootstrapping 2. The function was implemented as part of Schaub xxx
#' @return The function returns a list object containing the p-values of two dominance tests (i.e., variable 1 vs. variable 1 and variable 2 vs. variable 1).
#' @references Barrett, G. F., & Donald, S. G. (2003). Consistent tests for stochastic dominance. Econometrica, 71(1), 71-104.
#' @references Schaub, S. & El Benni, N. (2024). How do price (risk) changes influence farmers’ preference to reduce fertilizer application?
#' @examples
#'
#' # load stodom
#' require(stodom)
#'
#' data_a <- rnorm(500, 3, 2)
#' data_b <- rnorm(500, 1, 2)
#'
#' # estimate first-order stochastic dominance
#' fo_stodom(data_1 = data_a, data_2 = data_b, n_draws = 100, useed = 1, bins_size = 1)
#' @export
fo_stodom <- function(data_1, data_2, bins_size = 1, n_draws = 500, useed, variable_1 = "a", variable_2 = "b", type = "boot2") {
bin_scaling = 1/bins_size # define scaling parameter
object <- tibble(
"FO" = "",
pvalue = rep(NA, 2))
object[1,1] <- paste0("FO_", variable_1, "_vs_", variable_2)
object[2,1] <- paste0("FO_", variable_2, "_vs_", variable_1)
# data_aux <- data_1_2
n_draws <- n_draws
for (z in 1:2) {
# adjust data
if (z == 1) {
sample_aux_1 <- data_1
sample_aux_2 <- data_2}
if (z == 2) {
sample_aux_1 <- data_2
sample_aux_2 <- data_1}
#-------------------------------------------------------------------------------
# computing test statistic of observed distributions
#-------------------------------------------------------------------------------
min_ref <- floor(min(c(sample_aux_1, sample_aux_2))) - bins_size
max_ref <- ceiling(max(c(sample_aux_1, sample_aux_2))) + bins_size
data_dif <- data.frame(x_axis = seq(min_ref, max_ref, bins_size)) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
distinct() %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)) %>%
left_join(
sample_aux_1 %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_a = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_a = n_var_a/length(sample_aux_1),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
left_join(
sample_aux_2 %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_b = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_b = n_var_b/length(sample_aux_2),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
mutate(pro_var_a = ifelse(!is.na(pro_var_a), pro_var_a, 0),
pro_var_b = ifelse(!is.na(pro_var_b), pro_var_b, 0),
cum_var_a = cumsum(pro_var_a),
cum_var_b = cumsum(pro_var_b),
x_axis = as.numeric(x_axis)) %>%
dplyr::select(x_axis, cum_var_a, cum_var_b) %>%
mutate(dif_cum = cum_var_a - cum_var_b)
# get maximum difference between the two ecdfs
sub_dif <- max(data_dif$dif_cum)
# compute test statistic
N <- length(sample_aux_1)
M <- length(sample_aux_2)
statistic <- (N * M / (N + M))^0.5 * sub_dif
#-------------------------------------------------------------------------------
# computing test statistic of randomly sampled data
#-------------------------------------------------------------------------------
#////////
# bootstrapping 2
#////////
if (type == "boot2") {
stats_boots <- data.frame(
draw = rep(NA, n_draws),
statistic_bootstrapped = NA)
dat_combo <- c(sample_aux_1, sample_aux_2) # combine samples
# progress bar
pb = txtProgressBar(min = 0, max = n_draws, initial = 0)
for (i in 1:n_draws) {
if (!is.na(useed)) {
set.seed(useed + i) # set seed to make results reproducible
}
# randomly draw samples
sample_a_boot <- sample(dat_combo, size = N, replace = T)
sample_b_boot <- sample(dat_combo, size = M, replace = T)
data_dif_boot <- data.frame(x_axis = seq(min_ref, max_ref, bins_size)) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
distinct() %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)) %>%
left_join(
sample_a_boot %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_a = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_a = n_var_a/length(sample_a_boot),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
left_join(
sample_b_boot %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_b = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_b = n_var_b/length(sample_b_boot),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
mutate(pro_var_a = ifelse(!is.na(pro_var_a), pro_var_a, 0),
pro_var_b = ifelse(!is.na(pro_var_b), pro_var_b, 0),
cum_var_a = cumsum(pro_var_a),
cum_var_b = cumsum(pro_var_b),
x_axis = as.numeric(x_axis)) %>%
dplyr::select(x_axis, cum_var_a, cum_var_b) %>%
mutate(dif_cum = cum_var_a - cum_var_b)
sub_dif_sample <- max(data_dif_boot$dif_cum)
statistic_bootstrapped <- (N * M / (N + M))^0.5 * sub_dif_sample # compute test statistic
# store results
stats_boots[i,1] <- i
stats_boots[i,2] <- statistic_bootstrapped
setTxtProgressBar(pb, i)
close(pb)
}
stats_boots$statistic_observed <- statistic
stats_boots <- stats_boots %>%
mutate(larger = ifelse(statistic_bootstrapped > statistic_observed, 1, 0))
sum_larger <- stats_boots %>% summarize(sum(larger)) %>% as.numeric()
p_value <- 1/n_draws*sum_larger
p_value
}
#////////
# bootstrapping 1
#////////
if (type == "boot1") {
stats_boots <- data.frame(
draw = rep(NA, n_draws),
statistic_bootstrapped = NA)
dat_solo <- c(sample_aux_1) # pass dataset on
# progress bar
pb = txtProgressBar(min = 0, max = n_draws, initial = 0)
for (i in 1:n_draws) {
if (!is.na(useed)) {
set.seed(useed + i) # set seed to make results reproducible
}
# randomly draw samples
sample_a_boot <- sample(dat_solo, size = N, replace = T)
data_dif_boot <- data.frame(x_axis = seq(min_ref, max_ref, bins_size/bin_scaling)) %>%
distinct() %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)) %>%
left_join(
sample_a_boot %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis, digits = 0)) %>%
group_by(x_axis) %>%
summarize(n_var_a = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_a = n_var_a/length(sample_a_boot),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
left_join(
data_dif %>% dplyr::select(x_axis, cum_var_a) %>%
rename(cum_var_a_non_random = cum_var_a) %>%
mutate(x_axis = round(x_axis, digits = 0)) %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)),
by = "x_axis") %>%
mutate(pro_var_a = ifelse(!is.na(pro_var_a), pro_var_a, 0),
cum_var_a = cumsum(pro_var_a),
x_axis = as.numeric(x_axis)) %>%
dplyr::select(x_axis, cum_var_a, cum_var_a_non_random) %>%
mutate(dif_cum = cum_var_a - cum_var_a_non_random)
sub_dif_sample <- max(data_dif_boot$dif_cum)
statistic_bootstrapped <- N^0.5 * sub_dif_sample # compute test statistic
# store results
stats_boots[i,1] <- i
stats_boots[i,2] <- statistic_bootstrapped
setTxtProgressBar(pb, i)
close(pb)
}
stats_boots$statistic_observed <- statistic
stats_boots <- stats_boots %>%
mutate(larger = ifelse(statistic_bootstrapped > statistic_observed, 1, 0))
sum_larger <- stats_boots %>% summarize(sum(larger)) %>% as.numeric()
p_value <- 1/n_draws*sum_larger
p_value
}
#-------------------------------------------------------------------------------
# store result
#-------------------------------------------------------------------------------
object[z,2] <- p_value
}
oject_list <- list(object, "H0: cannot reject dominance. pvalue < a, we reject that variable_1 can dominant variable_2 at the level a. See section 6 of Barrett and Donald (2003) for interpretation")
return(oject_list)
}
#===============================================================================
# second-order stochastic dominance
#===============================================================================
#' @title second-order stochastic dominance test
#'
#' @description This function tests for second-order stochastic dominance.
#' @usage so_stodom(data_1, data_2, bins_size, n_draws, useed, variable_1, variable_2, type)
#' @param data_1 data 1.
#' @param data_2 data 2.
#' @param bins_size bin size.
#' @param n_draws number of draws to compute p values (default = 500).
#' @param useed user defined seed
#' @param variable_1 name of a (as a string); only for the output table (default = "a").
#' @param variable_2 name of b (as a string); only for the output table (default = "b").
#' @param type type of bootstrapped test, bootstrapping 1 and 2 of Barrett and Donald (2003) are available (default = "boot2").
#' @details This function computes the consistent test of second-order stochastic dominance following Barrett and Donald (2003). In detail, this function estimate their Kolmogorov-Smirnov type tests based on bootstrapping 2. The function was implemented as part of Schaub xxx
#' @return The function returns a list object containing the p-values of two dominance tests (i.e., variable 1 vs. variable 1 and variable 2 vs. variable 1).
#' @references Barrett, G. F., & Donald, S. G. (2003). Consistent tests for stochastic dominance. Econometrica, 71(1), 71-104.
#' @references Schaub, S. & El Benni, N. (2024). How do price (risk) changes influence farmers’ preference to reduce fertilizer application?
#' @examples
#'
#' # load stodom
#' require(stodom)
#'
#' data_a <- rnorm(500, 3, 2)
#' data_b <- rnorm(500, 1, 2)
#'
#' # estimate second-order stochastic dominance
#' so_stodom(data_1 = data_a, data_2 = data_b, n_draws = 100, useed = 1, bins_size = 1)
#' @export
so_stodom <- function(data_1, data_2 , bins_size = 1, n_draws = 500, useed, variable_1 = "a", variable_2 = "b", type = "boot2") {
bin_scaling = 1/bins_size # define scaling parameter
object <- tibble(
"SO" = "",
pvalue = rep(NA, 2))
object[1,1] <- paste0("SO_", variable_1, "_vs_", variable_2)
object[2,1] <- paste0("SO_", variable_2, "_vs_", variable_1)
# data_aux <- data_1_2
n_draws <- n_draws
for (z in 1:2) {
# adjust data
if (z == 1) {
sample_aux_1 <- data_1
sample_aux_2 <- data_2}
if (z == 2) {
sample_aux_1 <- data_2
sample_aux_2 <- data_1}
#-------------------------------------------------------------------------------
# computing test statistic of observed distributions
#-------------------------------------------------------------------------------
min_ref <- floor(min(c(sample_aux_1, sample_aux_2))) - bins_size
max_ref <- ceiling(max(c(sample_aux_1, sample_aux_2))) + bins_size
data_dif <- data.frame(x_axis = seq(min_ref, max_ref, bins_size)) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
distinct() %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)) %>%
left_join(
sample_aux_1 %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_a = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_a = n_var_a/length(sample_aux_1),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
left_join(
sample_aux_2 %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_b = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_b = n_var_b/length(sample_aux_2),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
mutate(pro_var_a = ifelse(!is.na(pro_var_a), pro_var_a, 0),
pro_var_b = ifelse(!is.na(pro_var_b), pro_var_b, 0),
cum_var_a = cumsum(pro_var_a),
cum_var_b = cumsum(pro_var_b),
x_axis = as.numeric(x_axis)) %>%
dplyr::select(x_axis, cum_var_a, cum_var_b) %>%
mutate(dif_cum = cum_var_a - cum_var_b)
data_dif <- data_dif %>%
mutate(int_dif_cum = cumsum(dif_cum))
# get maximum difference between the two ecdfs
sub_dif <- max(data_dif$int_dif_cum)
# compute test statistic
N <- length(sample_aux_1)
M <- length(sample_aux_2)
statistic <- (N * M / (N + M))^0.5 * sub_dif
statistic
#-------------------------------------------------------------------------------
# computing test statistic of randomly sampled data
#-------------------------------------------------------------------------------
n_draws = n_draws
#////////
# bootstrapping 2
#////////
if (type == "boot2") {
stats_boots <- data.frame(
draw = rep(NA, n_draws),
statistic_bootstrapped = NA)
dat_combo <- c(sample_aux_1, sample_aux_2) # combine samples
# progress bar
pb = txtProgressBar(min = 0, max = n_draws, initial = 0)
for (i in 1:n_draws) {
set.seed(1 + i) # set seed to make results reproducible
# randomly draw samples
sample_a_boot <- sample(dat_combo, size = N, replace = T)
sample_b_boot <- sample(dat_combo, size = M, replace = T)
data_dif_boot <- data.frame(x_axis = seq(min_ref, max_ref, bins_size)) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
distinct() %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)) %>%
left_join(
sample_a_boot %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_a = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_a = n_var_a/length(sample_a_boot),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
left_join(
sample_b_boot %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_b = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_b = n_var_b/length(sample_b_boot),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
mutate(pro_var_a = ifelse(!is.na(pro_var_a), pro_var_a, 0),
pro_var_b = ifelse(!is.na(pro_var_b), pro_var_b, 0),
cum_var_a = cumsum(pro_var_a),
cum_var_b = cumsum(pro_var_b),
x_axis = as.numeric(x_axis)) %>%
dplyr::select(x_axis, cum_var_a, cum_var_b) %>%
mutate(dif_cum = cum_var_a - cum_var_b)
data_dif_boot <- data_dif_boot %>%
mutate(int_dif_cum = cumsum(dif_cum))
# get maximum difference between the two ecdfs
sub_dif_sample <- max(data_dif_boot$int_dif_cum)
statistic_bootstrapped <- (N * M / (N + M))^0.5 * sub_dif_sample # compute test statistic
# store results
stats_boots[i,1] <- i
stats_boots[i,2] <- statistic_bootstrapped
setTxtProgressBar(pb, i)
if (z == 1) {close(pb)}
}
stats_boots$statistic_observed <- statistic
stats_boots <- stats_boots %>%
mutate(larger = ifelse(statistic_bootstrapped > statistic_observed, 1, 0))
sum_larger <- stats_boots %>% summarize(sum(larger)) %>% as.numeric()
p_value <- 1/n_draws*sum_larger
p_value
}
#////////
# bootstrapping 1
#////////
if (type == "boot1") {
stats_boots <- data.frame(
draw = rep(NA, n_draws),
statistic_bootstrapped = NA)
dat_solo <- c(sample_aux_1) # pass dataset on
# progress bar
pb = txtProgressBar(min = 0, max = n_draws, initial = 0)
for (i in 1:n_draws) {
if (!is.na(useed)) {
set.seed(useed + i) # set seed to make results reproducible
}
# randomly draw samples
sample_a_boot <- sample(dat_solo, size = N, replace = T)
data_dif_boot <- data.frame(x_axis = seq(min_ref, max_ref, bins_size/bin_scaling)) %>%
distinct() %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)) %>%
left_join(
sample_a_boot %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis, digits = 0)) %>%
group_by(x_axis) %>%
summarize(n_var_a = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_a = n_var_a/length(sample_a_boot),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
left_join(
data_dif %>% dplyr::select(x_axis, cum_var_a) %>%
rename(cum_var_a_non_random = cum_var_a) %>%
mutate(x_axis = round(x_axis, digits = 0)) %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)),
by = "x_axis") %>%
mutate(pro_var_a = ifelse(!is.na(pro_var_a), pro_var_a, 0),
cum_var_a = cumsum(pro_var_a),
x_axis = as.numeric(x_axis)) %>%
dplyr::select(x_axis, cum_var_a, cum_var_a_non_random) %>%
mutate(dif_cum = cum_var_a - cum_var_a_non_random)
data_dif_boot <- data_dif_boot %>%
mutate(int_dif_cum = cumsum(dif_cum))
# get maximum difference between the two ecdfs
sub_dif_sample <- max(data_dif_boot$int_dif_cum)
statistic_bootstrapped <- N^0.5 * sub_dif_sample # compute test statistic
# store results
stats_boots[i,1] <- i
stats_boots[i,2] <- statistic_bootstrapped
setTxtProgressBar(pb, i)
close(pb)
}
stats_boots$statistic_observed <- statistic
stats_boots <- stats_boots %>%
mutate(larger = ifelse(statistic_bootstrapped > statistic_observed, 1, 0))
sum_larger <- stats_boots %>% summarize(sum(larger)) %>% as.numeric()
p_value <- 1/n_draws*sum_larger
p_value
}
#-------------------------------------------------------------------------------
# store result
#-------------------------------------------------------------------------------
object[z,2] <- p_value
}
oject_list <- list(object, "H0: cannot reject dominance. pvalue < a, we reject that variable_1 can dominant variable_2 at the level a. See section 6 of Barrett and Donald (2003) for interpretation")
return(oject_list)
}
#===============================================================================
# get ecdf values of two variables
#===============================================================================
#' @title values of two ecdf and their cumulative difference
#'
#' @description This function computes the values of two empirical cumulative distribution function as well as their cumulative differences.
#' @usage ecdf_dat_g(data_1, data_2, bins_size)
#' @param data_1 data 1.
#' @param data_2 data 2.
#' @param bins_size bin size.
#' @details This function computes the values of two empirical cumulative distribution function as well as their cumulative differences.
#' @return The function returns a data table.
#' @examples
#'
#' # load stodom
#' require(stodom)
#'
#' data_a <- rnorm(500, 3, 2)
#' data_b <- rnorm(500, 1, 2)
#'
#' # compute the values of two ecdfs and their cumulative differences.
#' ecdf_dat_g(data_1 = data_a, data_2 = data_b, bins_size = 1)
#' @export
ecdf_dat_g <- function(data_1, data_2 , bins_size = 1) {
bin_scaling = 1/bins_size # define scaling parameter
sample_aux_1 <- data_1
sample_aux_2 <- data_2
min_ref <- floor(min(c(sample_aux_1, sample_aux_2))) - bins_size
max_ref <- ceiling(max(c(sample_aux_1, sample_aux_2))) + bins_size
data_dif <- data.frame(x_axis = seq(min_ref, max_ref, bins_size)) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
distinct() %>%
arrange(x_axis) %>%
mutate(x_axis = as.character(x_axis)) %>%
left_join(
sample_aux_1 %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_a = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_a = n_var_a/length(sample_aux_1),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
left_join(
sample_aux_2 %>% as.data.frame() %>% rename(x_axis = 1) %>%
mutate(x_axis = round(x_axis*bin_scaling, digits = 0)/bin_scaling) %>%
group_by(x_axis) %>%
summarize(n_var_b = n()) %>%
arrange(x_axis) %>%
mutate(pro_var_b = n_var_b/length(sample_aux_2),
x_axis = as.character(x_axis)),
by = "x_axis") %>%
mutate(pro_var_a = ifelse(!is.na(pro_var_a), pro_var_a, 0),
pro_var_b = ifelse(!is.na(pro_var_b), pro_var_b, 0),
cum_var_a = cumsum(pro_var_a),
cum_var_b = cumsum(pro_var_b),
x_axis = as.numeric(x_axis)) %>%
dplyr::select(x_axis, cum_var_a, cum_var_b) %>%
mutate(dif_cum = cum_var_a - cum_var_b)
# adding this
## this is added as we know the result and rounding and putting variables in to bins can cause mall errors
data_dif <- data_dif %>%
mutate(int_dif_cum = cumsum(dif_cum))
data_dif <- data_dif %>% mutate(int_dif_cum_s = int_dif_cum/abs(sum(data_dif$int_dif_cum))) # scale variable
return(data_dif)
}
#===============================================================================
# plot ecdfs
#===============================================================================
#' @title plot ecdfs
#'
#' @description This function computes the values of two empirical cumulative distribution function and plots the values.
#' @usage ecdf_plot(data_1, data_2, bins_size)
#' @param data_1 data 1.
#' @param data_2 data 2.
#' @param bins_size bin size.
#' @details This function computes the values of two empirical cumulative distribution function and plots the values.
#' @return The function returns a plot as a ggplot2 object.
#' @examples
#'
#' # load stodom
#' require(stodom)
#'
#' data_a <- rnorm(500, 3, 2)
#' data_b <- rnorm(500, 1, 2)
#'
#' # plot ecdfs
#' ecdf_plot(data_1 = data_a, data_2 = data_b, bins_size = 0.1)
#' @export
ecdf_plot <- function(data_1, data_2 , bins_size = 1) {
sample_aux_1 <- data_1
sample_aux_2 <- data_2
aux_1 <- ecdf_dat_g(data_1 = sample_aux_1,
data_2 = sample_aux_2, bins_size = bins_size)
# prepare for plotting
aux_2 <-
aux_1 %>% dplyr::select(x_axis, cum_var_a, cum_var_b) %>%
pivot_longer(!c(x_axis), names_to = "group", values_to = "cum")
theme_set(theme_bw())
# plot ecdfs
fig_aux <- aux_2 %>%
ggplot() +
geom_hline(yintercept = 0, color = c("#525252")) +
geom_hline(yintercept = 1, color = c("#525252")) +
geom_line(aes(x = x_axis, y = cum,
linetype = group), linewidth = 0.9) +
theme(
axis.title = element_text(size = 13),
axis.text = element_text(size = 13),
legend.text = element_text(size = 13),
legend.title = element_text(size = 13),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.grid.major = element_blank(),
legend.position = "bottom",
legend.key.size = unit(1, "cm"),
panel.border = element_rect(colour = "black", fill = NA, size = 1)) +
scale_x_continuous(name = expression(paste("outcome (x)"))) +
scale_y_continuous(
breaks = c(0,1),
name = expression(paste(widehat("F"),"(x)"[j]))) +
scale_linetype_manual(values=c("solid", "dashed"),
labels = c("a", "b")) +
guides(linetype = guide_legend(ncol = 2, title = "", order = 1))
return(fig_aux)
}
#===============================================================================
# plot cumulative difference between two ecdfs
#===============================================================================
#' @title plot difference ecdfs
#'
#' @description This function computes the values of the cumulative difference of two empirical cumulative distribution function and plots the values.
#' @usage dif_ecdf_plot(data_1, data_2, bins_size)
#' @param data_1 data 1.
#' @param data_2 data 2.
#' @param bins_size bin size.
#' @details This function computes the values of the cumulative difference of two empirical cumulative distribution function and plots the values. This relates two showing second-order stochastic dominance.
#' @return The function returns a plot as a ggplot2 object.
#' @examples
#'
#' # load stodom
#' require(stodom)
#'
#' data_a <- rnorm(500, 3, 2)
#' data_b <- rnorm(500, 1, 2)
#'
#' # plot cumulative difference between two ecdfs
#' dif_ecdf_plot(data_1 = data_a, data_2 = data_b, bins_size = 0.1)
#' @export
dif_ecdf_plot <- function(data_1, data_2 , bins_size = 1) {
sample_aux_1 <- data_1
sample_aux_2 <- data_2
aux_1 <- ecdf_dat_g(data_1 = sample_aux_1,
data_2 = sample_aux_2, bins_size = bins_size)
theme_set(theme_bw())
# plot cum difference of ecdfs
fig_aux <-
aux_1 %>%
ggplot() +
geom_hline(yintercept = 0, color = c("#525252")) +
geom_line(aes(x = x_axis, y = int_dif_cum ), size = 0.9) +
theme(
axis.title = element_text(size = 13),
axis.text = element_text(size = 13),
legend.text = element_text(size = 13),
legend.title = element_text(size = 13),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.grid.major = element_blank(),
legend.position = c(0.15, 0.85),
legend.key.size = unit(0.5, "cm"),
panel.border = element_rect(colour = "black", fill = NA, size = 1)) +
scale_x_continuous(name = expression(paste("outcome (x)"))) +
scale_y_continuous(
breaks = 0,
name = expression(paste(Sigma[paste("-",infinity)]^x^"*", " ", widehat("F"),"(x)"[j], " - ", widehat("F"),"(x)"[j], " ", " dx"))) +
guides(linetype = guide_legend(ncol = 2, title = "period:", order = 1))
fig_aux
return(fig_aux)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.