## {{{ docs }}}
#' Get true values from simulated data
#'
#' @param sim_data an object from \code{ef_simulateData}
#'
#' @export
## }}}
ef_get_true <- function(sim_data)
{
true = sim_data$parameters %>%
tibble::data_frame(Parameter=names(.), True=.) %>%
dplyr::mutate(Parameter = paste0(stringr::str_extract(Parameter, 'beta.tau|beta.nu|pi|.*chi.(m|s)|.*iota.(m|s)|n') , '[', stringr::str_extract(Parameter, '[0-9]+') ,']'),
Parameter = stringr::str_replace(string=Parameter, pattern="\\[NA\\]", replacement=""))
return(true)
}
## {{{ docs }}}
#' Classify observations into fraud distributions
#'
#' This function classifies the data points into one of three distributions: incremental fraud, extreme fraud, and no fraud.
#'
#'
#' @param data a data frame with the data
#' @param samples the output of the function \code{eforensics}
#'
#'
#' @export
## }}}
ef_classify <- function(data, samples){
if (!"k.hat" %in% names(samples[[1]]))
stop("You need to monitor parameter Z as well in order to classify the data. Use 'parameters=\"all\" in the function eforensics()")
k.list = list()
for (i in 1:length(samples))
{
k.list[[i]] = samples[[i]]$k.hat
}
k.list = k.list %>%
do.call(cbind,.) %>%
tibble::as_data_frame(.) %>%
data.table::setnames(., paste0("chain.", 1:length(samples), sep='')) %>%
base::apply(., 1, function(zi) which.max(c(sum(zi==1), sum(zi==2), sum(zi==3)) ) )
data = cbind(data, fraud.distribution.idx=k.list) %>%
dplyr::mutate(fraud.distribution.label = dplyr::case_when(fraud.distribution.idx == 1 ~ "no fraud",
fraud.distribution.idx == 2 ~ "incremental fraud",
fraud.distribution.idx == 3 ~ "extreme fraud",
))
return(data)
}
## {{{ docs }}}
#' Summary
#'
#' This function return summaries of the posterior distribution estimated by the function \code{eforensics}
#'
#'
#' @param object the output of the function \code{eforensics}
#' @param ... join.chains=TRUE can be used to provide summaries of chains after they are combined together
#'
#' @export
## }}}
summary.eforensics <- function(object, ...)
{
args = as.list(match.call())
if('join.chains' %in% names(args)) {
join.chains = as.logical(as.character(args$join.chains))
}else{
join.chains = FALSE
}
x = object
terms = attr(x, "terms")
if (join.chains) {
samp = x %>% purrr::map(.x=., ~.x['parameters'][[1]]) %>% do.call(rbind,.)
HPD = samp %>%
coda::as.mcmc(.) %>%
coda::HPDinterval(.) %>%
data.frame(Parameter = row.names(.), Covariate=terms,., row.names = 1:nrow(.)) %>%
dplyr::rename(HPD.lower = lower, HPD.upper = upper)
samp = samp %>%
coda::as.mcmc(.) %>%
summary(.) %>%
.[[1]] %>%
data.frame(Parameter = row.names(.), Covariate=terms, ., row.names = 1:nrow(.)) %>%
dplyr::select(Parameter, Mean, SD) %>%
dplyr::full_join(., HPD , by=c("Parameter")) %>%
dplyr::select(Parameter, Covariate, dplyr::everything())
}else{
samp = list()
for (i in 1:length(x))
{
HPD = x[[i]]$parameters %>%
coda::HPDinterval(.) %>%
data.frame(Parameter = row.names(.), Covariate=terms,., row.names = 1:nrow(.)) %>%
dplyr::rename(HPD.lower = lower, HPD.upper = upper)
tab = x[[i]]$parameters %>%
summary(.) %>%
.[[1]] %>%
data.frame(Parameter = row.names(.), Covariate=terms, ., row.names = 1:nrow(.)) %>%
dplyr::select(Parameter, Mean, SD) %>%
dplyr::full_join(., HPD , by=c("Parameter"))
samp[[i]] = tab %>%
dplyr::select(Parameter, Covariate, dplyr::everything())
}
names(samp) = paste0('Chain ', 1:length(x))
}
return(samp)
}
#' @export
summary.eforensics_sim_data <- function(object,...)
{
## get additional parameters ...
eligible.voters.provided =FALSE
args = as.list(match.call())
if(!'eligible.voters' %in% names(args)) {
eligible.voters = NULL
}else{
eligible.voters = eval(args$eligible.voters)
eligible.voters.provided =TRUE
}
sim_data = object
nrows.data = nrow(sim_data$data)
par = sim_data$parameters %>%
as.data.frame(.) %>%
cbind(., Parameter=row.names(.)) %>%
dplyr::rename(True=".") %>%
tidyr::spread(., key=Parameter, value=True) %>%
base::replicate(., n=nrows.data, simplify=FALSE) %>%
base::do.call(base::rbind,.) %>%
tibble::as_data_frame(.) %>%
dplyr::rename_all(dplyr::funs(paste0(., ".True") ))
sim_data_summ = sim_data$data %>%
dplyr::bind_cols(.,
sim_data$latent %>%
base::do.call(base::cbind, .) %>%
tibble::as_data_frame(.) %>%
dplyr::rename_all(dplyr::funs(paste0(., ".True") ))
) %>%
tibble::as_data_frame(.) %>%
dplyr::bind_cols(., par)
if ('alpha.True' %in% names(sim_data_summ)) {
sim_data_summ = sim_data_summ %>%
dplyr::mutate(iota.s.True = iota.m.True^alpha.True,
chi.s.True = chi.m.True^alpha.True)
}
sim_data_summ = sim_data_summ %>%
dplyr::mutate(Manufactured.Fraud.True = dplyr::case_when(z.True == 1 ~ 0,
z.True == 2 ~ iota.m.True * (1-tau.True),
z.True == 3 ~ chi.m.True * (1-tau.True)) ,
Stolen.Fraud.True = dplyr::case_when(z.True == 1 ~ 0,
z.True == 2 ~ iota.s.True * tau.True * (1-nu.True),
z.True == 3 ~ chi.s.True * tau.True * (1-nu.True)),
Total.Fraud.True = Manufactured.Fraud.True + Stolen.Fraud.True )
if (eligible.voters.provided) {
N = sim_data_summ[,N] %>% dplyr::pull(.)
sim_data_summ = sim_data_summ %>%
dplyr::mutate(Manufactured.Fraud.True = N*Manufactured.Fraud.True,
Stolen.Fraud.True = N*Stolen.Fraud.True ,
Total.Fraud.True = N*Total.Fraud.True)
}
return(sim_data_summ)
}
#' @export
print.eforensics <- function(x, ...)
{
print(summary(x))
return(summary(x))
}
## {{{ docs }}}
#' Get parameters
#'
#' This function returns the parameters of the model
#'
#'
#' @param samples the output of the function \code{eforensics}
#'
#' @export
## }}}
ef_get_parameters <- function(samples)
{
samp = list()
for (i in 1:length(samples))
{
samp[[i]] = samples[[i]]$parameters
}
return(samp)
}
## {{{ docs }}}
#' Compute Fraud Distribution
#'
#' This function returns the estimated posterior distribution of frauds at the individual-level observation
#'
#'
#' @param data the data set used in the estimation
#' @param samples the output of \code{eforensics} function
#' @param model a string with the model used in the estimation (e.g., 'rn', 'bl'). See \code{eforensics} function
#' @param eligible.voters either \code{NULL} (default) or a string with the name of the column containing the number of eligible voters
#' @inheritParams plot_local_fraud
#' @return It returns a tibble data.frame with the data points classified into no fraud, incremental, or extreme fraud distribution along with the posterior distribution of fraud for each observation
#'
#' @export
## }}}
ef_get_local_fraud <- function(data, samples, model, eligible.voters=NULL, sim_data=NULL)
{
op.default <- options()
on.exit(options(op.default), add=TRUE)
options(warn=-1)
## check_local_fraud(samples, model)
if(!'fraud.distribution.idx' %in% names(data)) dat = ef_classify(data, samples) %>% tibble::as_data_frame(.)
## getting parameters
## ------------------
samples.par = ef_get_parameters(samples) %>% runjags::combine.mcmc(.) %>% tibble::as_data_frame(.)
beta.nu = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("beta.nu"))
beta.tau = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("beta.tau"))
iota.m = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("mu.iota.m"))
chi.m = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("mu.chi.m"))
if(model=='bl') {
iota.s = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("mu.iota.s"))
chi.s = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("mu.chi.s"))
sigma.tau = NULL
sigma.nu = NULL
}
if(model=='rn') {
alpha = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("alpha")) %>% dplyr::summarise(mean(alpha)) %>% dplyr::pull(.)
iota.s = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("mu.iota.m")) %>% dplyr::mutate(mu.iota.s = mu.iota.m^alpha) %>% dplyr::select(mu.iota.s)
chi.s = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("mu.chi.m")) %>% dplyr::mutate(mu.chi.s = mu.chi.m ^alpha) %>% dplyr::select(mu.chi.s)
sigma.tau = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("sigma.tau"))
sigma.nu = samples.par %>% tibble::as_data_frame(.) %>% dplyr::select(dplyr::contains("sigma.nu"))
}
## getting data (Xw and Xa) (design matrix)
## ------------------------
getX <- function(formula1, data)
{
func.call <- match.call(expand.dots = FALSE)
mat = getRegMatrix(func.call, data, weights=NULL, formula_number=1)
Xw = mat$X
return(Xw)
}
formula = attr(samples, 'formula.w')
Xw = getX(formula, data)
formula = attr(samples, 'formula.a')
Xa = getX(formula, data)
## Get fraud distribution
## ---------------------------------------------
Z = dat$fraud.distribution.idx
N=rep(NA, nrow(dat))
if (!is.null(eligible.voters) ) {
if (eligible.voters %in% names(dat) ) {
N = dat[,eligible.voters]
N = N %>% dplyr::pull(.)
cat("\nLocal fraud will be returned in counts.\n")
}else{
cat(paste0("\n\nNOTE: The column ", eligible.voters, "is not in the data. The function will return fraud proportions instead of counts.") )
}
}else{
cat("\nLocal fraud will be returned in proportion of votes.\n")
}
## Debug/Monitoring message --------------------------
msg <- paste0('\n','Computing local fraud distribution.\n\nIt may take some time to compute ...', '\n'); cat(msg)
## ---------------------------------------------------
doParallel::registerDoParallel(parallel::detectCores() - 1) ## -- Parallel (start) -----------------------------------------
"%dopar%" <- foreach::"%dopar%"
dat$fraud.distribution = foreach::foreach(Zi = Z,
Ni = N,
Xwi=split(Xw, 1:nrow(Xw)),
Xai=split(Xa, 1:nrow(Xa)),
.maxcombine = nrow(dat),
.combine = list, .multicombine=TRUE) %dopar% ef_local_fraud_get_distribution(beta.tau, beta.nu, iota.m, iota.s, chi.m, chi.s, Zi, Xwi, Xai, Ni, sigma.tau, sigma.nu, model)
if (2 %in% unique(dat$fraud.distribution.idx) | 3 %in% unique(dat$fraud.distribution.idx) ) {
## Total
fraud.summary = foreach::foreach(dist = dat$fraud.distribution,
.maxcombine = nrow(dat),
.combine = list, .multicombine=T) %dopar% if(!is.null(dist[1])){c(mean(dist$Total), coda::HPDinterval(coda::as.mcmc(dist$Total)))}else{c(0,0,0)}
dat = dat %>%
dplyr::bind_cols(., do.call(rbind,fraud.summary) %>%
tibble::as_data_frame() %>%
dplyr::rename_at(., 1:3, dplyr::funs(c('Total.Fraud.Mean', "Total.Fraud.HPD.lower", "Total.Fraud.HPD.upper")) )
)
## Manufactured
fraud.summary = foreach::foreach(dist = dat$fraud.distribution,
.maxcombine = nrow(dat),
.combine = list, .multicombine=T) %dopar% if(!is.null(dist[1])){c(mean(dist$Manufactured), coda::HPDinterval(coda::as.mcmc(dist$Manufactured)))}else{c(0,0,0)}
dat = dat %>%
dplyr::bind_cols(., do.call(rbind,fraud.summary) %>%
tibble::as_data_frame() %>%
dplyr::rename_at(., 1:3, dplyr::funs(c('Manufactured.Fraud.Mean', "Manufactured.Fraud.HPD.lower", "Manufactured.Fraud.HPD.upper")) )
)
## Stolen
fraud.summary = foreach::foreach(dist = dat$fraud.distribution,
.maxcombine = nrow(dat),
.combine = list, .multicombine=T) %dopar% if(!is.null(dist[1])){c(mean(dist$Stolen), coda::HPDinterval(coda::as.mcmc(dist$Stolen)))}else{c(0,0,0)}
dat = dat %>%
dplyr::bind_cols(., do.call(rbind,fraud.summary) %>%
tibble::as_data_frame() %>%
dplyr::rename_at(., 1:3, dplyr::funs(c('Stolen.Fraud.Mean', "Stolen.Fraud.HPD.lower", "Stolen.Fraud.HPD.upper")) )
)
}
doParallel::stopImplicitCluster() ## --------------------------- Parallel (stop) -------------------------------------------
## get and merge true value if provided
if (!is.null(sim_data)) {
dat = dat %>% dplyr::full_join(., summary(sim_data), by=c())
if (!is.null(eligible.voters) ) {
if (eligible.voters %in% names(dat)) {
N = dat[,eligible.voters] %>% dplyr::pull(.)
dat = dat %>%
dplyr::mutate(Manufactured.Fraud.True = N*Manufactured.Fraud.True,
Stolen.Fraud.True = N*Stolen.Fraud.True ,
Total.Fraud.True = N*Total.Fraud.True)
}
}
}
return(dat)
}
ef_local_fraud_get_distribution <- function(beta.tau, beta.nu, iota.m, iota.s, chi.m, chi.s, Zi, Xwi, Xai, Ni, sigma.tau=NULL, sigma.nu=NULL, model )
{
if (Zi==1) {
fraud.density = NULL
}else{
mu.tau = as.matrix(beta.tau) %*% matrix(t(Xai))
mu.nu = as.matrix(beta.nu ) %*% matrix(t(Xwi))
if (model=='bl') {
mu.tau = 1/(1+exp(-mu.tau))
mu.nu = 1/(1+exp(-mu.nu))
}
if (model=='rn') {
## compute the expectation of a restricted normal distribution
sigma.tau = sigma.tau %>% dplyr::pull(.)
a.tau = (0 - mu.tau)/sigma.tau
b.tau = (1 - mu.tau)/sigma.tau
k.tau = stats::pnorm(b.tau, 0, 1) - stats::pnorm(a.tau, 0, 1)
mu.tau = mu.tau + ( ( stats::dnorm(a.tau) - stats::dnorm(b.tau) ) / k.tau ) * sigma.tau
sigma.nu = sigma.nu %>% dplyr::pull(.)
a.nu = (0 - mu.nu)/sigma.nu
b.nu = (1 - mu.nu)/sigma.nu
k.nu = stats::pnorm(b.nu, 0, 1) - stats::pnorm(a.nu, 0, 1)
mu.nu = mu.nu + ( ( stats::dnorm(a.nu) - stats::dnorm(b.nu) ) / k.nu ) * sigma.nu
}
if (Zi == 2) {
manufactured = iota.m * (1-mu.tau)
stolen = iota.s * mu.tau * (1-mu.nu)
}
if(Zi == 3){
manufactured = chi.m * (1-mu.tau)
stolen = chi.s * mu.tau * (1-mu.nu)
}
if (!is.na(Ni)) {
manufactured = Ni*manufactured
stolen = Ni*stolen
}
total = manufactured + stolen
fraud.density = data.frame(manufactured=manufactured, stolen=stolen, total=total) %>% tibble::as_data_frame(.)
names(fraud.density) = c('Manufactured', 'Stolen', 'Total')
}
return(fraud.density)
}
## =====================================================
## plots
## =====================================================
parameter.rename <- function(parameter)
{
parameter = parameter %>%
## stringr::str_replace(string=., pattern="\\[", replacement="_[") %>%
## stringr::str_replace(string=., pattern="]", replacement="}") %>%
stringr::str_replace(string=., pattern=".tau\\[", replacement="[tau*") %>%
stringr::str_replace(string=., pattern=".nu\\[", replacement="[nu*") %>%
stringr::str_replace(string=., pattern=".chi.", replacement="[chi]^") %>%
stringr::str_replace(string=., pattern=".iota.", replacement="[iota]^") %>%
stringr::str_replace_all(string=., pattern=" ", replacement="~")
## stringr::str_replace(string=., pattern="\\.w~", replacement="[w]~") %>%
## stringr::str_replace(string=., pattern="\\.a~", replacement="[a]~")
return(parameter)
}
## {{{ docs }}}
#' Plot summary of posterior distribution
#'
#' Plot summary of the samples from posterior distribution produced by \code{eforensics} function
#'
#'
#' @inheritParams summary.eforensics
#' @param samples the output of the function \code{eforensics}.
#' @param true used when data was generated by simulation produced by \code{ef_simulateData} function. The value must the be output of the function \code{ef_get_true()}
#' @param parse boolean, if \code{TRUE}, text in the tick marks of the y-axis is parsed.
#' @param plots a string vector with any combination of "Abstention and Vote", "Local Fraud Probabilities", and "Fraud Probability". It defines with plot will be included in the figure. Default includes all.
#' @param title a string with the title of the plot
#' @param subtitle a string wiht the subtitle of the plot
#' @param xlab a string with the lable of the x-axis
#' @param ylab a string with the lable of the y-axis
#'
#'
#' @export
## }}}
ef_plot <- function(samples, true=NULL, parse=FALSE, plots=c("Abstention and Vote", "Local Fraud Probabilities", "Fraud Probability" ), title=NULL, subtitle=NULL, xlab=NULL, ylab=NULL)
{
options(warn=-1)
on.exit(options(warn=0))
## creating summary table
tab = summary(samples, join.chains=T)
if (!is.null(true)) {
## joining true if 'True' value of the parameters are provided
tab = tab %>% dplyr::left_join(., True)
}
tab = tab %>%
dplyr::filter( ! (stringr::str_detect(Parameter, pattern="beta.*[1]") & stringr::str_detect(Covariate, pattern="Intercept")) ) %>%
dplyr::mutate(Group = dplyr::case_when(stringr::str_detect(Parameter, pattern="pi")~"Fraud Probability",
stringr::str_detect(Parameter, pattern="beta.(nu|tau)")~"Coefficients: Abstention and Vote for the winner",
TRUE~"Coefficients: Local Fraud Probabilities"
),
Parameter = dplyr::case_when(stringr::str_detect(Parameter, pattern="nu") ~ "votes to the winner",
stringr::str_detect(Parameter, pattern="tau") ~ "abstension",
stringr::str_detect(Parameter, pattern="iota.s") ~ "Incremental/Stolen",
stringr::str_detect(Parameter, pattern="iota.m") ~ "Incremental/Manufactured",
stringr::str_detect(Parameter, pattern="chi.s") ~ "Extreme/Stolen",
stringr::str_detect(Parameter, pattern="chi.m") ~ "Extreme/Manufactured",
## stringr::str_detect(Parameter, pattern="iota.s") ~ "Incremental Fraud/Stolen votes",
## stringr::str_detect(Parameter, pattern="iota.m") ~ "Incremental Fraud/Manufactured votes",
## stringr::str_detect(Parameter, pattern="chi.s") ~ "Extreme Fraud/Stolen votes",
## stringr::str_detect(Parameter, pattern="chi.m") ~ "Extreme Fraud/Manufactured votes",
## stringr::str_detect(Parameter, pattern="pi.1.") ~ "expression(p[1])",
## stringr::str_detect(Parameter, pattern="pi.2.") ~ "expression(p[2])",
## stringr::str_detect(Parameter, pattern="pi.3.") ~ "expression(p[3])",
TRUE ~ Parameter %>% as.character),
label = paste0(Covariate, " (", Parameter, ")"),
label = stringr::str_replace(string=label, pattern=".Intercept.", replacement="Intercept"),
) %>%
dplyr::filter(stringr::str_detect(Group, pattern=paste0(plots, collapse="|") ))
if (parse) {
tab = tab %>% dplyr::mutate(label = parameter.rename(label))
}else{
tab = tab %>% dplyr::mutate(label = stringr::str_replace_all(string=label, pattern="\\(pi.[0-9].\\)", replacement="") )
}
## plot
## ----
if (is.null(xlab )) {xlab = "Posterior Expectation"}
if (is.null(ylab )) {ylab = ""}
g = tab %>%
ggplot2::ggplot(.) +
ggplot2::geom_point(ggplot2::aes(x=Mean, y=label, shape="Posterior Mean", colour='Posterior Mean'), size=2, alpha=1) +
ggplot2::geom_errorbarh(ggplot2::aes(y = label, xmin=HPD.lower, xmax=HPD.upper, linetype="95% HPD interval"), height=.1) +
ggplot2::xlab(xlab) +
ggplot2::ylab(ylab) +
ggplot2::scale_linetype(name="") +
ggplot2::scale_shape(name="") +
ggplot2::facet_wrap( ~ Group, ncol = 1, scales='free' )
if (!is.null(true)) {
g = g +
ggplot2::geom_point(ggplot2::aes(x=True, y=label, shape="True", colour="True"), size=2, alpha=.6) +
ggplot2::scale_colour_manual(values = c("Posterior Mean" = "black", "True"='red'), name="")
}else{
g = g +
ggplot2::scale_colour_manual(values = c("Posterior Mean" = "black"), name="")
}
if (parse) {
g = g +
ggplot2::scale_y_discrete(labels=function(l) parse(text=l))
}
if (!is.null(title)) {
g = g +
ggplot2::ggtitle(label=title, subtitle="")
}
if (!is.null(title) & !is.null(subtitle)) {
g = g +
ggplot2::ggtitle(label=title, subtitle=subtitle)
}
## theme
g = g +
ggplot2::theme_bw() +
ggplot2::theme(legend.position = "top") +
ggplot2::theme(strip.background = ggplot2::element_rect(colour="white", fill="white"),
strip.text.x = ggplot2::element_text(size=11, face='bold', hjust=0),
strip.text.y = ggplot2::element_text(size=11, face="bold", vjust=0))
return(g)
}
## {{{ docs }}}
#' Countor plot
#'
#' Create a contour plot with marginal densities
#'
#' @param data a data frame
#' @param samples the output of the function \code{eforensics}. It is optional. If provided, the plot will display the classification of the points in each fraud distribution
#' @param x a string with the name of the variable in \code{data} that will be plotted in the x-axis
#' @param y a string with the name of the variable in \code{data} that will be plotted in the y-axis
#' @param xlab a string with the text to appear in the x-axis
#' @param ylab a string with the text to appear in the y-axis
#' @param contour.fill.color boolean, if \code{TRUE}, it uses the contour.fill.color.key to color the contour of the bivariate density
#' @param contour.fill.color.key a string with the name of the variable in \code{data} to use as a color key of the contour of the bivariate density. It is used only if \code{samples} is not provided
#' @param pch.color.key a string with the name of the variable in \code{data} that will be used to color the points. Default is \code{NULL}. It is used only if \code{samples} is not provided
#' @param pch.size a integer with the size of the points. Default is 1
#' @param pch.show a boolean value indicating if the points should be displayed in the plot (T) or not (F). Default is TRUE
#' @param legend.title a string with the title of the legend
#'
#' @return a ggplot object
#'
#' @export
## }}}
ef_byplot <- function(data, samples=NULL, x, y, xlab, ylab, contour.fill.color=FALSE, contour.fill.color.key=NULL, pch.color.key=NULL, pch.size=2, pch.show=TRUE, legend.title="")
{
options(warn=-1)
on.exit(options(warn=0))
if (!is.null(samples)) {
data = ef_classify(data, samples)
pch.color.key = "fraud.distribution.label"
legend.title = "Fraud Distribution (posterior proportion)"
if(contour.fill.color){
contour.fill.color.key = "fraud.distribution.label"
}
data = data %>%
dplyr::left_join(.,
summary(samples, join.chains=T) %>% dplyr::filter(stringr::str_detect(Parameter, pattern="pi")) %>%
dplyr::select(Parameter, Mean) %>%
dplyr::mutate_if(.predicate=is.numeric, .funs=list(~round(., digits=3))) %>%
dplyr::mutate(fraud.distribution.label = dplyr::case_when(Parameter == "pi[1]"~"no fraud",
Parameter == "pi[2]"~"incremental fraud",
Parameter == "pi[3]"~"extreme fraud")) ,
by = "fraud.distribution.label"
) %>%
dplyr::mutate(fraud.distribution.label = paste0(fraud.distribution.label , " (",Mean,")") )
data$fraud.distribution.label = stringr::str_to_title(data$fraud.distribution.label, locale = "en")
}
g = data %>%
ggplot2::ggplot(ggplot2::aes_string(x = x, y = y))
if (pch.show & is.null(pch.color.key)) {
g = g + ggplot2::geom_point(colour="#00000024", size=pch.size)
}
if (pch.show & !is.null(pch.color.key)) {
g = g +
ggplot2::geom_point(ggplot2::aes_string(colour=pch.color.key), size=pch.size, alpha=.3) +
ggplot2::guides(color=ggplot2::guide_legend(title=legend.title))
}
if (!contour.fill.color) {
contour.fill.color.key="..level.."
g = g + ggplot2::guides(fill = FALSE)
}else{
if (is.null(contour.fill.color.key)) {
stop("\n\nYou must provide the contour.fill.color.key")
}
g = g + ggplot2::guides(fill=ggplot2::guide_legend(legend.title))
}
g = g +
ggplot2::stat_density_2d(ggplot2::aes_string(fill=contour.fill.color.key), geom = "polygon", alpha=.2) +
ggplot2::xlab(xlab)+
ggplot2::ylab(ylab)+
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = c(1,1), # position of the legend (top/bottom/(x,y coord)/...)
legend.justification = c(1,1), # justification the position: "center" or c(0,0)=(r,t), c(1,1)=(l,b)
legend.background= ggplot2::element_rect(fill='transparent'), # backg of each ind. level contour.fill.color for border
)
if (is.null(pch.color.key)) {
g = g + ggplot2::scale_colour_distiller(palette='Blues')
}
g = g %>% ggExtra::ggMarginal(., type = "histogram", fill="lightsteelblue1", colour='white')
return(g)
}
## put in the package
## ------------------
## {{{ docs }}}
#' Plot samples from eforensics
#'
#' This function creates a plot with posterior distribution of fraud in a given election unit (ballot box, pooling place, etc)
#'
#' @param data either the data set used to estimate fraud or the output of the function \code{ef_get_local_fraud}
#' @param samples eigher \code{NULL} (default) (which requires \code{data} to be the output of the function \code{ef_get_local_fraud}) or the outuput of the function \code{eforensics}
#' @param model either \code{NULL} (default) (which requires \code{data} to be the output of the function \code{ef_get_local_fraud}) or a string with the name of the model used to estimate fraud (see \code{eforensics})
#' @param row an integer with the row number of data to plot
#' @param by.types boolean, if \code{TRUE} incremental, extreme, and total fraud are displayed. Otherwise, only distribution of total fraud is shown
#' @param election.unit a string with the name of the column that contains a label that identifies the election unit. Default \code{NULL}
#' @param plot.mean boolean, if \code{TRUE} the posterior average of the distribution is also displayed as well as the 95\% HPD
#' @param legend.position a string with the position of the legend when posterior mean is displayed. Accepts \code{top}, \code{bottom}, \code{left}, \code{right}
#' @param sim_data the output of the function \code{ef_simulateData}. It is used only if the fraud was estimated using simulated data. If provided, the plot also display the true fraud value. Default \code{NULL}
#' @param title a string with the title of the plot. Default \code{NULL}
#' @param subtitle a string with the subtitle of the plot. Default \code{NULL}
#'
#' @export
## }}}
plot_local_fraud <- function(data, samples=NULL, model=NULL, row=NULL, election.unit=NULL, title=NULL, subtitle=TRUE, plot.mean=TRUE, legend.position='bottom', sim_data=NULL, by.types=TRUE)
{
## check if data provided contains classification and if not, classify it
## ----------------------------------------------------------------------
if (!("fraud.distribution.idx" %in% names(data) & "fraud.distribution.label" %in% names(data))) {
if (is.null(samples) | is.null(model)) {
stop("\n\n The 'data' provided does not contain classification of observations. Columns 'fraud.distribution.idx' and 'fraud.distribution.label' must be in the data. Either provide 'samples' and 'model' or use the function ef_get_local_fraud() to classify the data first.\n\n")
}
data = ef_get_local_fraud(data, samples, model)
}
## checke if row was provided and if it was, if that case was faudulent
## --------------------------------------------------------------------
if (is.null(row)) {
stop("\n\nThe row number with the observation to plot must be provided. Use the parameter 'row'\n\n")
}else{
if (data$fraud.distribution.idx[row]==1) return(cat("\n\nObservation in row ", row, " in the data was classified as a non-fraudulent case.\n\n") )
}
## if simulated data is provided, get summary to plot true values
## --------------------------------------------------------------
if (!is.null(sim_data)) {
## Debug/Monitoring message --------------------------
msg <- paste0('\n','Getting true parameter values ...', '\n'); cat(msg)
## ---------------------------------------------------
true = summary(sim_data)
data = data %>%
dplyr::left_join(., true)
}
data = data %>% dplyr::filter(dplyr::row_number()==row)
if (by.types) {
g = plot_frauds(data, plot.mean, sim_data)
}else{
g = plot_frauds_total(data, plot.mean, sim_data)
}
g = g +
ggplot2::theme(legend.position = legend.position)
## plot title and subtitle
if (!is.null(title)) {
g = g + ggplot2::ggtitle(title)
}else{
title=''
}
if (subtitle) {
subtitle = paste0("(", data$fraud.distribution.label, ")")
g = g + ggplot2::ggtitle(title,subtitle=subtitle)
}
if (!is.null(election.unit)) {
if (election.unit %in% names(data)) {
election.unit = paste0(" Election unit: ", data[,election.unit] %>% dplyr::pull(.))
g = g + ggplot2::annotate("text", label = election.unit, x = -Inf, y = Inf, hjust=0, vjust=1.5 )
}
}
return(g)
}
plot_frauds_total <- function(tab, plot.mean, sim_data)
{
g = tab %>%
dplyr::select(fraud.distribution) %>%
dplyr::pull(.) %>%
.[[1]] %>%
dplyr::select(Total) %>%
ggplot2::ggplot(.) +
ggplot2::geom_histogram(ggplot2::aes(x=Total, y=..density..),fill="lightsteelblue1", colour='white') +
ggplot2::geom_density(ggplot2::aes(x=Total), fill="#00000044", adjust=1, alpha=.2, colour="white") +
ggplot2::theme_bw()+
ggplot2::scale_x_continuous(expand = c(0, 0)) +
ggplot2::scale_y_continuous(expand = c(0, 0))
if (plot.mean & !is.null(sim_data)) {
g = g +
ggplot2::geom_vline(data=tab, ggplot2::aes(xintercept=Total.Fraud.Mean, linetype="Mean", colour='Mean'))+
ggplot2::geom_vline(data=tab, ggplot2::aes(xintercept=Total.Fraud.HPD.lower, linetype="95% HPD", colour="95% HPD"))+
ggplot2::geom_vline(data=tab, ggplot2::aes(xintercept=Total.Fraud.HPD.upper, linetype="95% HPD", colour="95% HPD"))+
ggplot2::geom_vline(data=tab, ggplot2::aes(xintercept=Total.Fraud.True, linetype="True", colour='True'))+
ggplot2::scale_linetype_manual(values=c( "95% HPD"='dashed', "Mean"='solid', 'True'='solid'), name='') +
ggplot2::scale_colour_manual(values = c( "95% HPD"='red', "Mean"='red', 'True'='black'), name='')
}
if (plot.mean & is.null(sim_data)) {
g = g +
ggplot2::geom_vline(data=tab, ggplot2::aes(xintercept=Total.Fraud.Mean, linetype="Mean", colour='Mean'))+
ggplot2::geom_vline(data=tab, ggplot2::aes(xintercept=Total.Fraud.HPD.lower, linetype="95% HPD", colour="95% HPD"))+
ggplot2::geom_vline(data=tab, ggplot2::aes(xintercept=Total.Fraud.HPD.upper, linetype="95% HPD", colour="95% HPD"))+
ggplot2::scale_linetype_manual(values=c( "95% HPD"='dashed', "Mean"='solid'), name='') +
ggplot2::scale_colour_manual(values = c( "95% HPD"='red', "Mean"='red'), name='')
}
return(g)
}
plot_frauds <- function(tab, plot.mean, sim_data)
{
g = tab %>%
## dplyr::filter(dplyr::row_number()==row.number) %>%
dplyr::select(fraud.distribution) %>%
dplyr::pull(.) %>%
.[[1]] %>%
tidyr::gather(key = Fraud, value=value) %>%
ggplot2::ggplot(.) +
ggplot2::geom_histogram(ggplot2::aes(x=value, y=..density..),fill="lightsteelblue1", colour='white') +
ggplot2::geom_density(ggplot2::aes(x=value), fill="#00000044", adjust=1, alpha=.2, colour='white') +
ggplot2::theme_bw()+
ggplot2::facet_wrap( ~ Fraud, ncol = 1, scales='free',labeller=ggplot2::label_parsed) +
ggplot2::scale_x_continuous(expand = c(0, 0)) +
ggplot2::scale_y_continuous(expand = c(0, 0)) +
ggplot2::scale_fill_brewer(palette='Blues', name="Fraud")+
ggplot2::theme(legend.position = "top") +
ggplot2::theme(strip.background = ggplot2::element_rect(colour="white", fill="white"),
strip.text.x = ggplot2::element_text(size=12, face='bold', hjust=0),
strip.text.y = ggplot2::element_text(size=12, face="bold", vjust=0))
if (plot.mean & !is.null(sim_data)) {
tab2 = tab %>%
dplyr::select(dplyr::contains("Manufactured"), dplyr::contains("Stolen"), dplyr::contains("Total")) %>%
tidyr::gather(key = Fraud, value=value) %>%
tidyr::separate(., col=Fraud, into=c("Fraud", "stat"), sep=".Fraud.") %>%
tidyr::spread(., key=stat, value=value)
g = g +
ggplot2::geom_vline(data=tab2, ggplot2::aes(xintercept=Mean, linetype="Mean", colour='Mean'))+
ggplot2::geom_vline(data=tab2, ggplot2::aes(xintercept=HPD.lower, linetype="95% HPD", colour="95% HPD"))+
ggplot2::geom_vline(data=tab2, ggplot2::aes(xintercept=HPD.upper, linetype="95% HPD", colour="95% HPD"))+
ggplot2::geom_vline(data=tab2, ggplot2::aes(xintercept=True, linetype="True", colour='True'))+
ggplot2::scale_linetype_manual(values=c( "95% HPD"='dashed', "Mean"='solid', 'True'='solid'), name='') +
ggplot2::scale_colour_manual(values = c( "95% HPD"='red', "Mean"='red', 'True'='black'), name='')
}
if (plot.mean & is.null(sim_data)) {
tab2 = tab %>%
dplyr::select(dplyr::contains("Manufactured"), dplyr::contains("Stolen"), dplyr::contains("Total")) %>%
tidyr::gather(key = Fraud, value=value) %>%
tidyr::separate(., col=Fraud, into=c("Fraud", "stat"), sep=".Fraud.") %>%
tidyr::spread(., key=stat, value=value)
g = g +
ggplot2::geom_vline(data=tab2, ggplot2::aes(xintercept=Mean, linetype="Mean", colour='Mean'))+
ggplot2::geom_vline(data=tab2, ggplot2::aes(xintercept=HPD.lower, linetype="95% HPD", colour="95% HPD"))+
ggplot2::geom_vline(data=tab2, ggplot2::aes(xintercept=HPD.upper, linetype="95% HPD", colour="95% HPD"))+
ggplot2::scale_linetype_manual(values=c( "95% HPD"='dashed', "Mean"='solid'), name='') +
ggplot2::scale_colour_manual(values = c( "95% HPD"='red', "Mean"='red'), name='')
}
return(g)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.