R/edar_eda.R

Defines functions recode rescale.axis

Documented in recode rescale.axis

## {{{ ancillary }}}

## {{{ docs }}}

#' Rescale values
#'
#' @param y.reference values to use as reference to rescale
#' @param y.to.rescale values to rescale into y.left range 
#'
#' @examples
#'
#' @export

## }}}
rescale.axis <- function(y.reference, y.to.rescale)
{
    ## y  = alpha + beta*x
    ## y0 = alpha + beta*x0
    ## => beta  = (y -y0) / (x - x0)
    ## => alpha = y - beta*x
    x  = max(y.reference)
    x0 = min(y.reference)
    y  = max(y.to.rescale)
    y0 = min(y.to.rescale)
    beta  = (y - y0)/(x - x0)
    alpha = y - beta*x
    ## y.rescaled = (alpha - y.to.rescale)/(-beta)
    ## return(y.rescaled)
    return(c(alpha=alpha, beta=beta))
}

## }}}
## {{{ data manipulation }}}


## {{{ docs }}}

#' Recode labelled columns
#'
#' This function recode labelled columns
#'
#' @param df a data frame
#' @param var a string with the name of the variable in the data frame
#'            to be recoded
#' @param newvar a string with the name of the new variable that will
#'               receive the recoded columns. By default, it uses the
#'               original variable (var)
#' @param newvar_label string with the label for the new variable 
#' @param recode a named vector. The names are the code of the labels in the
#'               old variable. The values are the new values
#' @param sep a string to use to separate labels when joining values
#'            during the recoding
#' 
#' @return It returns a tibble data frame with recoded column
#'
#' @examples
#'
#' 
#' @export
## }}}
recode <- function(df, var, newvar=var, newvar_label=NULL, missing=NULL,
                   recode, sep=' / ')
{
    v = df[,var] %>%  dplyr::pull(.)
    df[, newvar] = dplyr::recode(.x=v, !!!recode, 
                              ## .default = NULL,
                              ## .missing = 77,
                              ## .keep_value_labels=T,
                              .combine_value_labels = TRUE,
                              .sep = sep
                               )
    if (!is.null(missing)) {
        v = df[,newvar] %>% dplyr::pull(.)
        df[,newvar] = sjlabelled::set_na(x=v, na=missing, drop.levels=T)
    }
    if (!is.null(newvar_label)) {
        lab = newvar_label
        names(lab) = newvar
        df = df %>%
            labelled::set_variable_labels(., !!!lab)
    }
    return(df)
}

## }}}
## {{{ Describe data (dplyr extension) }}}

## {{{ docs }}}

#' Summarise numerical variables of the data set
#'
#' This function summarises all the numerical variables in the data set.
#'
#' @param df data frame
#' @param group string vector with the names of the grouping variables (can be more than one). The summary will be computed by groups. Default \code{NULL}.
#' @param weight string vector with the name of the column in the data set containing the weights. Default \code{NULL}.
#' @param spread boolean, if \code{TRUE} the summaries for each group are spread along the columns. Default \code{FALSE}
#' @param digits integer, number of digiits to return in the summaries (defualt 4)
#'
#' @return It returns a tibble data frame with summaries for all numerical variables
#'
#' @examples
#' data(starwars, package='dplyr')
#' 
#' summarise_alln(starwars, group=NULL, weight=NULL, spread=FALSE)
#' summarise_alln(starwars, group="gender", weight=NULL, spread=FALSE)
#' summarise_alln(starwars, group="gender", weight=NULL, spread=TRUE)
#'
#' # or use with pipe
#' # starwars  %>% summarise_alln(., group="gender", weight=NULL, spread=T)
#' @export

## }}}
summarise_alln <- function(df, group=NULL, weight=NULL, spread=F, digits=4)
{
    options(warn=-1)
    on.exit(options(warn=0))
    
    flag=FALSE
    ifelse(is.null(weight), df$weight<-1, df$weight <- df %>% dplyr::select_(weight) %>% dplyr::pull(.)) 
    if (is.null(group)) {
        flag=TRUE
        df$grouping_null__ = 1
        group="grouping_null__"
    }
    tab = df %>%
        dplyr::select(-dplyr::one_of(group))  %>% 
        dplyr::select_if(is.numeric) %>%
        dplyr::bind_cols(., df[,group])  %>%
        tidyr::gather(var, value,  -group, -weight) %>%
        dplyr::mutate(na = 1 - is.na(value)) %>%
        dplyr::group_by_("var", .dots=group) %>%
        dplyr::summarise(N      = sum(na),
                         NAs    = sum(is.na(value)),
                         Mean   = ifelse(N!=0, stats::weighted.mean(value, w=weight, na.rm=TRUE), NA_real_),
                         sd     = ifelse(N!=0, sqrt(Hmisc::wtd.var(value, weights=weight, na.rm=TRUE)), NA_real_),
                         se     = ifelse(N!=0, sd/sqrt(N), NA_real_),
                         Median = ifelse(N!=0, isotone::weighted.median(value, w=weight), NA_real_),
                         Min    = ifelse(N!=0, min(value, na.rm = TRUE), NA_real_),
                         Max    = ifelse(N!=0, max(value, na.rm = TRUE), NA_real_),
                         q.025  = ifelse(N!=0, reldist::wtd.quantile (value, q=0.025, na.rm = TRUE, weight=weight), NA_real_),
                         q.25   = ifelse(N!=0, reldist::wtd.quantile (value, q=0.25, na.rm = TRUE, weight=weight), NA_real_),
                         q.75   = ifelse(N!=0, reldist::wtd.quantile (value, q=0.75, na.rm = TRUE, weight=weight), NA_real_),
                         q.975  = ifelse(N!=0, reldist::wtd.quantile (value, q=0.975, na.rm = TRUE, weight=weight), NA_real_),
                         )  %>%
        dplyr::mutate(sd = dplyr::case_when(is.nan(sd) ~ 0, TRUE ~ sd)) %>%   
        dplyr::arrange(var)  %>%
        dplyr::ungroup(.)
    if (flag) {
        tab = tab %>% dplyr::select(-dplyr::one_of(group)) 
    }
    if(spread){
        tab = tab %>%
            tidyr::gather(key = Statistic, value=value, -var,-group)  %>%
            tidyr::unite(., Stat, Statistic, group) %>%
            dplyr::select(var, Stat, value)  %>%
            tidyr::spread(., key=Stat, value=value)
    }
    tab = tab %>%
        dplyr::mutate_if(is.numeric, round, digits=digits)
    return(tab)
}
## {{{ docs }}}
#' Summarise the categorical variables of the data set
#'
#' This function summarises all the categorical variables in the data set.
#'
#' @inheritParams summarise_alln 
#'
#' @return It returns a tibble data frame with summaries for all categorical variables. 
#'
#' @examples
#' data(starwars, package='dplyr')
#'
#' summarise_allc(starwars, group=NULL)
#' summarise_allc(starwars, group="gender")
#' summarise_allc(starwars, group=c("gender", "eye_color"))
#'
#' # or with pipe
#' # starwars %>% summarise_allc(., group=c("gender", "eye_color"))
#' 
#' @export
## }}}
summarise_allc <- function(df, group=NULL)
{
    options(warn=-1)
    on.exit(options(warn=0))
    
    flag=FALSE
    if (is.null(group)) {
        flag=TRUE
        df$grouping_null__ = 1
        group="grouping_null__"
    }
    tab = df %>%
        dplyr::select_if(function(col) class(col)[1] != 'list') %>% 
        dplyr::select(-dplyr::one_of(group))  %>% 
        dplyr::select_if(function(col) !is.numeric(col)) %>%
        dplyr::bind_cols(., df[,group]) %>%
        tidyr::gather(var, value,  -group) %>%
        dplyr::mutate(na = 1 - is.na(value)) %>%
        dplyr::group_by_("var", .dots=group) %>%
        dplyr::summarise(N      = sum(na),
                         NAs    = sum(is.na(value)),
                         Categories = length(unique(value[!is.na(value)])),
                         Frequency = paste0(stringr::str_pad(stringr::str_sub(names(table(value)),1,5),
                                                             width=5, side='right'),
                                            " (", formatC(100*table(value)/sum(table(value)),format="f",digits=2), " %)",
                                            collapse=", ") ,
                         Table     = list(table(value, useNA="always")
                                          %>% stats::setNames(., nm=(names(.) %>% sapply(., toString)) )
                                          %>% rbind 
                                          %>% tibble::as_data_frame(.)  
                                          %>% cbind(Variable=unique(var)) 
                                          %>% dplyr::select(Variable, dplyr::everything())  ),
                         tabl     = list(table(value, useNA="always")
                                           %>% stats::setNames(., nm=(names(.) %>% sapply(., toString)) )
                                           %>% rbind 
                                           %>% tibble::as_data_frame(.)  
                                           %>% cbind(Variable=unique(var)) 
                                           %>% dplyr::select(Variable, dplyr::everything())  ),
                         Categories.Labels = paste0(sort(unique(value)), collapse=", ") ,
                         )   %>% 
        dplyr::ungroup(.) %>% 
        dplyr::arrange(var) 
    if (flag) {
        tab = tab %>% dplyr::select(-dplyr::one_of(group)) 
    }
    return(tab)
}
## {{{ docs }}}

#' Summarise categorical variables in bundles
#'
#' This function summarises all the categorical variables in the data set and create bundles based on the category labels
#'
#' @inheritParams summarise_allc 
#'
#' @return It returns a tibble data frame with summaries for all categorical variables. The summaries are aggregated into single tables based on the category labels. 
#'
#' @examples
#'
#' data(edar_survey)
#' ## help(edar_survey)
#' tab =  summarise_allcbundle(edar_survey)
#' tab
#' tab$Table[[4]]
#' tab$tabp[[4]]
#' tab$tabl[[4]]
#' tab = summarise_allcbundle( edar_survey, group="gender")
#' tab
#' tab$Table[[5]]
#' tab$tabp[[5]]
#' tab$tabl[[5]]
#'
#' # or with pipe
#' # edar_survey  %>% summarise_allcbundle(., group="gender")
#' 
#' @export

## }}}
summarise_allcbundle <- function(df, group=NULL)
{
    if( is.null(group) )    return( summarise_allcbundle_g0(df, group) )
    if( length(group) == 1) return( summarise_allcbundle_g1(df, group) )
    if( length(group) > 1 ) return( cat("\n\nCurrently, this function only supports aggragation by 1 group\n\n"))
}
summarise_allcbundle_g0 <- function(df, group=NULL)
{
    options(warn=-1)
    on.exit(options(warn=0))
    
    vars = df %>%  summarise_allc(., group=group) %>%
        dplyr::group_by(Categories.Labels) %>%
        dplyr::summarise(Variables = list(var),
                         N.Variables = purrr::map_int(.x=Variables, ~length(.x)))
    tab = df %>% summarise_allc(., group=group) %>%
        dplyr::group_by(Categories.Labels) %>%
        dplyr::select(Categories.Labels, Table)  %>%
        dplyr::summarise(Table   = list(do.call(rbind,Table))) %>%
        dplyr::full_join(., vars , by=c("Categories.Labels"))  %>%
        dplyr::select(N.Variables, Variables, dplyr::contains("Labels"), Table)  %>%
        dplyr::ungroup(.) %>%
        dplyr::mutate(tabp = purrr::map(.x=Table, ~ .x %>% 
                                                 tidyr::gather(key = cat, value=N, -Variable)  %>%
                                                 dplyr::group_by(Variable) %>% 
                                                 dplyr::mutate(Frequency = round(100*N/sum(N),2)) %>%
                                                 dplyr::arrange(Variable)  %>%
                                                 dplyr::select(-N)  %>% 
                                                 tidyr::spread(., key=cat, value=Frequency) %>%
                                                 dplyr::ungroup(.)  %>%
                                                 as.data.frame %>%
                                                 dplyr::select(Variable,   'NA', dplyr::everything() ) %>% 
                                                 dplyr::select(c(1, 3:ncol(.)), 2 )
                                   ),
                      tabl = purrr::map(.x=Table, ~ .x %>% 
                                                 tidyr::gather(key = cat, value=N, -Variable)  %>%
                                                 dplyr::group_by(Variable) %>% 
                                                 dplyr::mutate(Frequency = paste0(round(100*N/sum(N),2), " % (N=",  N,")") ) %>%
                                                 dplyr::arrange(Variable)  %>%
                                                 dplyr::select(-N)  %>% 
                                                 tidyr::spread(., key=cat, value=Frequency) %>%
                                                 dplyr::ungroup(.)  %>%
                                                 as.data.frame %>%
                                                 dplyr::select(Variable,   'NA', dplyr::everything() ) %>% 
                                                 dplyr::select(c(1, 3:ncol(.)), 2 )
                                     ),
                      ) 
    tab = tab %>% dplyr::rename(tab=Table) 

    return(tab)
}
summarise_allcbundle_g1 <- function(df, group=NULL)
{
    options(warn=-1)
    on.exit(options(warn=0))
    
    vars = df %>%  summarise_allc(., group=group) %>%
        dplyr::rename(group=!!group) %>% 
        dplyr::group_by(Categories.Labels, group) %>%
        dplyr::summarise(Variables = list(var),
                         N.Variables = purrr::map_int(.x=Variables, ~length(.x)))
    tab = df %>% summarise_allc(., group=group) %>%
        dplyr::rename(group=!!group) %>% 
        dplyr::group_by(Categories.Labels, group) %>%
        dplyr::select(Categories.Labels, group, Table)  %>%
        dplyr::summarise(Table   = list(do.call(rbind,Table))) %>%
        dplyr::full_join(., vars , by=c("Categories.Labels", "group"))  %>%
        dplyr::select(group, N.Variables, Variables, dplyr::contains("Labels"), Table)  %>%
        dplyr::group_by(Categories.Labels) %>%
        dplyr::summarise(Table   = list(purrr::map2(.x=group,.y = Table, .f=function(.x,.y) cbind(group=.x,.y)) %>% do.call(rbind,. )%>% dplyr::arrange(Variable, group))  ) %>% 
        dplyr::ungroup(.) %>%
        dplyr::mutate(tabp = purrr::map(.x=Table, ~ .x %>% 
                                                 tidyr::gather(key = cat, value=N, -Variable, -group)  %>%
                                                 dplyr::group_by(Variable, group) %>% 
                                                 dplyr::mutate(Frequency = round(100*N/sum(N),2)) %>%
                                                 dplyr::arrange(Variable)  %>%
                                                 dplyr::select(-N)  %>% 
                                                 tidyr::spread(., key=cat, value=Frequency) %>%
                                                 dplyr::ungroup(.)  %>%
                                                 as.data.frame %>%
                                                 dplyr::select(group, Variable,   'NA', dplyr::everything() ) %>% 
                                                 dplyr::select(c(1, 2, 4:ncol(.)), 3 ) %>%
                                                 dplyr::arrange(Variable, group) 
                                   ),
                      tabl = purrr::map(.x=Table, ~ .x %>% 
                                                 tidyr::gather(key = cat, value=N, -Variable, -group)  %>%
                                                 dplyr::group_by(Variable, group) %>% 
                                                 dplyr::mutate(Frequency = paste0(N, " (", round(100*N/sum(N),2) ," %)") ) %>%
                                                 dplyr::arrange(Variable)  %>%
                                                 dplyr::select(-N)  %>% 
                                                 tidyr::spread(., key=cat, value=Frequency) %>%
                                                 dplyr::ungroup(.)  %>%
                                                 as.data.frame %>%
                                                 dplyr::select(group, Variable,   'NA', dplyr::everything() ) %>% 
                                                 dplyr::select(c(1, 2, 4:ncol(.)), 3 ) %>%
                                                 dplyr::arrange(Variable, group) 
                                   ),
                      )
    tab = tab %>% dplyr::rename(tab=Table) 
    return(tab)
}

## }}}
## {{{ Describe data (tables) }}}

## {{{ doc }}}

#' Descriptive Statistics in the Treatment and Control Groups
#'
#' This function generates a table with descriptive statistics of
#' numerical variables by group (Treatment versus Control groups)
#'
#'
#' @param data A data frame that contains a dummy indicator
#'             variable for the treatment and control
#'             groups
#' @param treatmentVar a string with the name of the indicator
#'                     variable of the treatment
#' @param alpha number between 0 and 1. It is used to ...
#' @param ttest boolean, indicates if a t-test of difference in
#'              means between treatment and control groups is
#'              computed
#' @param showMahalanobisDist boolean, indicates if the Mahalanobis
#'                            distance is computed
#' @param showpscore boolean, indicates if the pscore is computed
#' @param showLinPscore ...
#' @param col.size integer, size of the columns in the result
#'                 printed in the screen
#' @return The function returns a list with named elements. The
#'         element \code{balance} constains a table with the
#'         statistics of all numerical variables by treatment
#'         and control groups. The element \code{pscore} contains
#'         the pscore of all observations. The element \code{lpscore}
#'         of the list is the logarithm of pscore.
#'
#' @details The statistics computed here are recomended in
#' \itemize{
#'  \item Imbens, G. W., & Rubin, D. B. (2015). Causal inference in statistics, social, and biomedical sciences: an introduction. : Cambridge University Press.
#' }
#'
#' @examples
#' data(edar_survey)
#'
#' ebalance(edar_survey , treatmentVar='treat')
#' 
#' # or with paper
#' # edar_survey  %>% ebalance(., treatmentVar='treat')
#' 
#' @export

## }}}
ebalance   <- function(data, treatmentVar, alpha=0.05, ttest=F, showMahalanobisDist=T,showpscore=T,showLinPscore=T, col.size=120)
{
    op.default <- options()
    on.exit(options(op.default), add=TRUE)

    options(width=col.size)
    data    = data %>% dplyr::rename(treatmentVar=!!treatmentVar) %>% dplyr::mutate(treatmentVar=as.numeric(treatmentVar)) %>% dplyr::select_if(is.numeric)
    treated = data %>% dplyr::filter(treatmentVar==1) %>%  dplyr::select(-treatmentVar)
    control = data %>% dplyr::filter(treatmentVar==0) %>%  dplyr::select(-treatmentVar)
    Tr      = data  %>% dplyr::select(treatmentVar)  %>% dplyr::pull(.)
    ## Std diff in mean
    ## ----------------
    muc = apply(control,2, mean,na.rm=T)
    mut = apply(treated,2,mean,na.rm=T)
    s2t = apply(treated,2,function(x) stats::var(x,na.rm=T))
    s2c = apply(control,2,function(x) stats::var(x,na.rm=T))
    delta = (mut - muc)/sqrt((s2t+s2c)/2)

    ## t-test difference in means
    ## --------------------------
    nt     <- nrow(treated)
    nc     <- nrow(control)
    t      <- (mut-muc) / sqrt( s2t/nt + s2c/nc)
    df     <- ( s2t/nt + s2c/nc)^2 / ( (s2t/nt)^2/(nt-1) +  (s2c/nc)^2/(nc-1) )
    pttest <- 2*(1-stats::pt(abs(t), df=df))

    ## log ratio of std dev
    ## --------------------
    gamma <- log(sqrt(s2t)/sqrt(s2c))

    ## proportion of tail of the other group
    ## -------------------------------------
    ltt <- apply(treated,2,stats::quantile,prob=c(alpha/2), na.rm=T)
    utt <- apply(treated,2,stats::quantile,prob=c(1-alpha/2), na.rm=T)

    ltc <- apply(control,2,stats::quantile,prob=c(alpha/2), na.rm=T)
    utc <- apply(control,2,stats::quantile,prob=c(1-alpha/2), na.rm=T)

    pit <- vector()
    pic <- vector()
    for (i in 1:ncol(treated)){
        x <- treated[,i] %>% dplyr::pull(.)
        pit[i] <- (sum(x<ltc[i],na.rm=T) + sum(x>utc[i],na.rm=T))/ length(x[!is.na(x)])
        x <- control[,i] %>% dplyr::pull(.)
        pic[i] <- (sum(x<ltt[i],na.rm=T) + sum(x>utt[i],na.rm=T))/ length(x[!is.na(x)])
    }

    ## prop score
    ## ----------
    dat          <- data %>% cbind(., Tr) 
    pscore       <- stats::glm(Tr ~ ., data=dat, family=stats::binomial(link="logit")) 
    phat         <- pscore$fitted.values
    phatlin      <- pscore$linear.predictors
    ## statistics for the raw pscore
    pscore_meant <- base::mean(phat[Tr==1], na.rm=T)
    pscore_vart  <- stats::var(phat[Tr==1], na.rm=T)
    pscore_meanc <- base::mean(phat[Tr==0], na.rm=T)
    pscore_varc  <- stats::var(phat[Tr==0], na.rm=T)
    pscore_dif   <- (pscore_meant - pscore_meanc)/sqrt((pscore_vart+pscore_varc)/2)
    pscore_lnSTD <- log(sqrt(pscore_vart)/sqrt(pscore_varc))
    pscore_pit <- stats::quantile(phat[Tr==1],prob=c(alpha/2,1-alpha/2), na.rm=T)
    pscore_pic <- stats::quantile(phat[Tr==0],prob=c(alpha/2,1-alpha/2), na.rm=T)
    pscore_pialphat <- (sum(phat[Tr==1]<pscore_pic[1],na.rm=T) +
                     sum(phat[Tr==1]>pscore_pic[2],na.rm=T))/
                     length(phat[Tr==1][!is.na(phat[Tr==1])])
    pscore_pialphac    <- (sum(phat[Tr==0]<pscore_pit[1],na.rm=T) +
                     sum(phat[Tr==0]>pscore_pit[2],na.rm=T))/
                     length(phat[Tr==1][!is.na(phat[Tr==1])])
    ## statistics for the linearized pscore
    lpscore_meant <- base::mean(phatlin[Tr==1], na.rm=T)
    lpscore_vart  <- stats::var(phatlin[Tr==1], na.rm=T)
    lpscore_meanc <- base::mean(phatlin[Tr==0], na.rm=T)
    lpscore_varc  <- stats::var(phatlin[Tr==0], na.rm=T)
    lpscore_dif   <- (lpscore_meant - lpscore_meanc)/
      sqrt((lpscore_vart+lpscore_varc)/2)
    lpscore_lnSTD <- log(sqrt(lpscore_vart)/sqrt(lpscore_varc))
    lpscore_pit <- stats::quantile(phatlin[Tr==1],prob=c(alpha/2,1-alpha/2), na.rm=T)
    lpscore_pic <- stats::quantile(phatlin[Tr==0],prob=c(alpha/2,1-alpha/2), na.rm=T)
    lpscore_pialphat <- (sum(phatlin[Tr==1]<lpscore_pic[1],na.rm=T) +
                         sum(phatlin[Tr==1]>lpscore_pic[2],na.rm=T))/
        length(phatlin[Tr==1][!is.na(phatlin[Tr==1])])
    lpscore_pialphac    <- (sum(phatlin[Tr==0]<lpscore_pit[1],na.rm=T) +
                            sum(phatlin[Tr==0]>lpscore_pit[2],na.rm=T))/
        length(phatlin[Tr==1][!is.na(phatlin[Tr==1])])
    
    pscore <- round(c(pscore_meant,sqrt(pscore_vart),pscore_meanc,
                      sqrt(pscore_varc), pscore_dif,pscore_lnSTD,pscore_pialphat,
                      pscore_pialphac),3)
    lpscore <- round(c(lpscore_meant,sqrt(lpscore_vart),lpscore_meanc,
                       sqrt(lpscore_varc),
                       lpscore_dif,lpscore_lnSTD,lpscore_pialphat,
                      lpscore_pialphac),3)
    ## Mahalanobis distance
    ## --------------------
    sigmat <- stats::var(treated, na.rm=T)
    sigmac <- stats::var(control, na.rm=T)
    avCovInv <- solve( (sigmat + sigmac)/2 )
    mahalanobis <- sqrt( t(mut - muc) %*% avCovInv %*% (mut - muc) )

    ## final results
    ## -------------
    if (ttest){
        pscore <- round(c(pscore_meant,sqrt(pscore_vart),pscore_meanc,
                          sqrt(pscore_varc),
                          pscore_dif,NA,pscore_lnSTD,pscore_pialphat,
                          pscore_pialphac),3)
        lpscore <- round(c(lpscore_meant,sqrt(lpscore_vart),lpscore_meanc,
                           sqrt(lpscore_varc),
                           lpscore_dif,NA,lpscore_lnSTD,lpscore_pialphat,
                           lpscore_pialphac),3)        
        results <- cbind(mut=mut,st=sqrt(s2t), muc=muc, sc=sqrt(s2c),
                         NorDiff=delta, "p(T>|t|)"=pttest,lnRatioSdtDev=gamma,
                         pit=pit,pic=pic)
        mahalan <- as.numeric(c(rep('',4),round(mahalanobis,3),rep('',4)))
        N <- as.numeric(c(round(sum(Tr==1),0),'',round(sum(Tr==0),0),c(rep('',6))))
    }else{
        pscore <- round(c(pscore_meant,sqrt(pscore_vart),pscore_meanc,
                      sqrt(pscore_varc), pscore_dif,pscore_lnSTD,pscore_pialphat,
                      pscore_pialphac),3)
    lpscore <- round(c(lpscore_meant,sqrt(lpscore_vart),lpscore_meanc,
                       sqrt(lpscore_varc),
                       lpscore_dif,lpscore_lnSTD,lpscore_pialphat,
                      lpscore_pialphac),3)
        results <- cbind(mut=mut,st=sqrt(s2t), muc=muc, sc=sqrt(s2c),
                         NorDiff=delta, lnRatioSdtDev=gamma,
                         pit=pit,pic=pic)
        mahalan <- as.numeric(c(rep('',4),round(mahalanobis,3),rep('',3)))
        N <- as.numeric(c(round(sum(Tr==1),0),'',round(sum(Tr==0),0),
                          c(rep('',5))))        
    }
    results <- rbind(results,MahalanobisDist=mahalan)
    results <- rbind(results,pscore=pscore, LinPscore=lpscore, N=N)
    
    if(!showMahalanobisDist){
        results <- results[!rownames(results) == 'MahalanobisDist',]}
    if(!showpscore){
        results <- results[!rownames(results) == 'pscore',]}
    if(!showLinPscore){
        results <- results[!rownames(results) == 'LinPscore',]}

    results = results %>% data.frame(Variable=rownames(results))  %>%
    tibble::as_data_frame(.)  %>%
        dplyr::select(Variable, dplyr::everything())
    results = list(balance=results, pscore=phat, lpscore=phatlin)
    class(results) = "edar_balance"
    return(results)
}

## {{{ docs }}}

#' Print
#'
#' Generic method to print the output of a \code{ebalance}
#'
#' @param x an output of the function \code{ebalance}
#' @param digits integer, number of digits to display 
#' @param ... ingored
#'
#'
#' @export
## }}}
print.edar_balance <- function(x, digits=4, ...)
{
    x$balance = x$balance %>% dplyr::mutate_if(is.numeric, round, digits=digits)
    return(x$balance)
}

## Note: funcions summarise_alln and summarise_allcbundle in this package are better option to used instead of edescribe edar_bundle_cat 
## {{{ doc }}}

#' Describe the variables of a data frame
#'
#' This function generate two tables. One summarizes the non-numerical
#' variables, the other the numerical ones.
#'
#' @param data a data frame
#' @param weight a vector with the weights
#' @param minMax boolean, display the minimum and maximum in the summary
#'               when minMax=T
#' @param digits the number of digits to display
#' @param maxVars integer, the maximum number of variables in the data to summarize.
#'        It accepts any number, by it may result in slow computaiton of the summary
#'        depending on the size of the data set
#' @param col.size integer, the size of the column to display the summary
#'
#' @return It returns a list of tibbles. The first contains the summary of the
#'         categorical variables, the second the summary of the numerical ones.
#'
#' @examples
#' data(edar_survey)
#' edescribe(edar_survey)
#' 
#' tb = edescribe(edar_survey)
#' tb$Cat
#' tb$Num
#'
#' # or with pipe
#' # edar_survey %>% edescribe()
#' @export

## }}}
edescribe  <- function(data, weight=NULL, minMax=T, digits=2, maxVars=NULL, col.size=120)
{

    op.default <- options()
    on.exit(options(op.default), add=TRUE)
    options(width=col.size)
    
    idxChar <- sapply(sapply(data, class), function(x) x[[1]]) == 'factor' |
        sapply(sapply(data, class), function(x) x[[1]]) == 'character'  |
        sapply(sapply(data, class), function(x) x[[1]]) == 'ordered'

    ## Numerical Variables
    ## -------------------
    if (sum(!idxChar)>0)
    {
        NumVars = sum(!idxChar)
        CharVars =  sapply(data, class) == 'character'
        Factors = sapply(sapply(data, class), function(x) x[[1]]) == 'factor'
        Ordered = sapply(sapply(data, class), function(x) x[[1]]) == 'ordered'
        
        dataVars <- data.frame(N=dim(data)[1], nVars=dim(data)[2],
                               Char=sum(CharVars), Factor=sum(Factors),
                               Numerical=NumVars)
        dataVars <- list(VarsType=dataVars,Strings=names(data)[CharVars],
                         Factors=names(data)[Factors],
                         Ordered=names(data)[Ordered],
                         Numerical=names(data)[!idxChar])

        ## summary statistics
        N      = apply(data[,!idxChar],2,length)
        NAs    = apply(data[,!idxChar],2,function (x) sum(is.na(x)))
        if(is.null(weight)){
            mean   = apply(data[,!idxChar],2,mean,na.rm=TRUE)
            median = apply(data[,!idxChar],2,median,na.rm=T)
            sd     = apply(data[,!idxChar],2,sd,na.rm=T)
            se     = sd / sqrt(N)
            q      = apply(data[,!idxChar],2,stats::quantile,probs=c(.05,.95),na.rm=T) 
        }else{
            mean   = apply(data[,!idxChar],2,function(x) questionr::wtd.mean(x,na.rm=TRUE, weight=weight))
            median = apply(data[,!idxChar],2,function(x) isotone::weighted.median(x,w=weight))
            sd     = apply(data[,!idxChar],2,function(x) sum(weight*(x - questionr::wtd.mean(x,na.rm=TRUE, weight=weight))^2))
            se     = sd/sqrt(N)
            q      = apply(data[,!idxChar],2,function(x) Hmisc::wtd.quantile(x, probs=c(.05,.95), weights=weight, na.rm=T)) 
        }
        numericSummary <- data.frame(name = names(data[,!idxChar]),
                                     N=N, NAs=NAs, mean=mean
                                    ,sd=sd,se=se,q.05= q[1,],
                                     median=median,q.95=q[2,])
        ## minimum and maximum
        if(minMax==T) {
            minimum <- apply(data[,!idxChar],2,min, na.rm=T)
            maximum <- apply(data[,!idxChar],2,max, na.rm=T)        
            numericSummary <- cbind(numericSummary, minimum, maximum)
            numericSummary <- numericSummary[c("name","N", "NAs", "mean","sd","se","median",
                                               "minimum","maximum","q.05","q.95")]
        }
        row.names(numericSummary) = NULL
        numericSummary = tibble::as_data_frame(numericSummary) %>%
            dplyr::arrange(name)
    }else
    {
        numericSummary = NULL
    }

    ## categorical variables
    ## ---------------------
    if (sum(idxChar)>0)
    {
        tables <- list()
        if (sum(idxChar)>0){
            for (i in 1:sum(idxChar)){        
                if (is.null(weight)){
                    tables[[i]] <- table(as.data.frame(data[,idxChar])[,i],useNA = "always")
                }else{
                    tables[[i]] <- round(questionr::wtd.table(x = as.data.frame(data[,idxChar])[,i], weights = weight, na.show=T, na.rm=F), digits=digits)
                }
            }
            names(tables) <- names(idxChar)[idxChar==T]
        }else{
            tables <- 'no categorical or factor variables'
        }
        tables.summary =  do.call(rbind,lapply(tables,data.frame))
        tables.summary = cbind(tables.summary, round(100*tables.summary[,2]/nrow(data), 2))
        names(tables.summary) = c('Category','N',"Freq")
        tables.summary$Variable = row.names(tables.summary)
        tables.summary = tables.summary[,c(4,1:3)]
        tables.summary$Variable= gsub(tables.summary$Variable, pattern=".1$",replacement='')
        tables.summary$Variable[grepl(tables.summary$Variable, pattern="[[0-9]]*$")]=''
        row.names(tables.summary) = 1:nrow(tables.summary)
        levels(tables.summary$Category) = c(levels(tables.summary$Category), "NAs")
        tables.summary$Category[is.na(tables.summary$Category)] = "NAs"

        ## rows with NA
        idxNA <- which(apply(data,1,function (x) sum(is.na(x)))!=0)
        rowNA <- apply(data,1,function (x) sum(is.na(x)))[idxNA]
        rowNA <- data.frame(line=idxNA,total.NAs=rowNA)

        n = nrow(data)
        tables_summary = lapply(tables, function(x) paste0(names(x),' (',round(100*x/n,2),"%)",  collapse='; ') )
        tables_summary = tibble::data_frame(name = names(tables),
                                            N    = nrow(data),
                                            levels = unlist(lapply(tables, length)),
                                            missing = unlist(lapply(tables, function(x) x[is.na(names(x))])),
                                            class =unlist(lapply(subset(data, select=names(tables)), class)),
                                            freq_tables = tables,
                                            distribution = c(do.call(rbind, tables_summary))
                                            ) %>%
            dplyr::arrange(name)
    }else
    {
        tables_summary = NULL
    }
    
    results <- list(Num = numericSummary,
                    Cat = tables_summary
                    ##Vars=dataVars,
                    ##CatVarsSummaryTable=tables.summary
                    ## NAsRow=rowNA
                    )
    op.default <- options()
    on.exit(options(op.default), add=TRUE)
    options(tibble.print_max = max(nrow(results$Num), nrow(results$Cat)))

    return(results)

}
## {{{ docs }}}

#' Smart univariate summary of the categorical variables
#'
#' This function summarizes the distribution of the non-numerical
#' variables. It bundles together all the variables that have the same
#' categories and displays their summary in a single table.
#'
#'
#' @param data a data frame
#' @param vars a vector with the name of the variables to display.
#'             If none is provided (vars=NULL), it summarizes all
#'             the categorical variables
#' @param print a string with the type of summary to print in the
#'              screen (see \code{value} below for the complete
#'              description of the return). Values can be one of
#'              'prop' (display the proportions), 'count' (display
#'              the counts), or 'prop_count' (display both the proportion
#'              and counts).
#' @param weight vector with the weights
#' @param digits integer, the number of digits to print
#' @param print.summary boolean, if \code{TRUE} the function will print a summary
#' @param width.size numeric, size of the columns in the screen to print the results
#'
#' @return It retuns a tibble in which the columns are lists. The columns
#'         of the tibble are \code{var.idx}, which is a list with the idx
#'         of the variables, \code{vars}, which is a list with the
#'         variable names bundled together. Finally \code{table.p},
#'         \code{table.n}, and \code{table.pn} are the tables with,
#'         respectively, the proportions, counts, and both proportions
#'         and counts for the bundled variables
#'
#' @examples
#' data(edar_survey)
#' edar_bundle_cat(edar_survey)
#' tb = edar_bundle_cat(edar_survey )
#' tb
#' tb$var.idx
#' tb$table.n
#' tb$table.p
#' tb$table.pn
#'
#' # with pipe:
#' # edar_survey %>% edar_bundle_cat()
#' 
#'@export
## }}}
edar_bundle_cat <- function(data, vars=NULL, print.summary=NULL, weight=NULL, digits=4, width.size=150){
    if(!is.null(vars)){
        data_cat = data.frame(data[,vars])
    }else{
        data_cat = .edar_select_categorical(data) %>% dplyr::mutate_if(is.character, as.factor)
    }

    op.default <- options()
    options(width=width.size)
    on.exit(options(op.default), add=TRUE)

    list.of.levels = lapply(data_cat,levels) %>% purrr::map(sort) %>% purrr::map(paste, collapse='') %>% unlist
    duplic = lapply(1:length(list.of.levels), function(i) which(list.of.levels[i] == list.of.levels ))
    bundles = duplic[!duplicated(duplic)]    
    summaries = tibble::data_frame()
    for (bundle in bundles){
        tab.p=t(apply(data.frame(data_cat[,names(bundle)]),2, function(x)
            round(prop.table(questionr::wtd.table(x = x, weights = weight, na.show=T, na.rm=F)),2)))
        tab.n=t(apply(data.frame(data_cat[,names(bundle)]),2, function(x)
            questionr::wtd.table(x = x, weights = weight, na.show=T, na.rm=F)))
        tab.pn=t(apply(data.frame(data_cat[,names(bundle)]),2, function(x)
            paste0(paste(100*round(prop.table(questionr::wtd.table(x = x, weights = weight, na.show=T, na.rm=F)),digits), '%'),
                   paste0(' (N=',questionr::wtd.table(x = x, weights = weight, na.show=T, na.rm=F),')'))))
        row.names(tab.p)=names(bundle)
        row.names(tab.n)=names(bundle)
        row.names(tab.pn)=names(bundle)
        colnames(tab.p) = colnames(tab.n)
        colnames(tab.pn) = colnames(tab.n)
        summaries = summaries %>%
            dplyr::bind_rows(
                tibble::data_frame(var.idx = list(bundle),
                                   vars = list(names(bundle)),
                                   table.p = list(tab.p),
                                   table.n = list(tab.n),
                                   table.pn = list(tab.pn))
                )
    }
    if (! is.null(print.summary ))
    {
        if (! print.summary %in% c('prop', 'count','prop_count'))
        {
            stop("\n\n The parameter print.summary must be NULL or one of: 'prop', 'count','prop_count'\n\n")
        }else{
            if(print.summary[1]=='prop') print(summaries$table.p)
            if(print.summary[1]=='count') print(summaries$table.n)
            if(print.summary[1]=='prop_count') print(summaries$table.pn)
        }
    }
    invisible(summaries)
}

## }}}
## {{{ Describe data (plots) }}}

## {{{ docs }}}

#' Descriptive Plots
#'
#' This function generate plots with all the variables in the data frame individually
#'
#' @param df a data frame
#' @param group a string with the name of the varible(s) to group the plots
#' @param weight a string with the name of the varible containing the weights. They are used to compute the summary stattistics
#' @param mean boolean, if \code{TRUE}, displays a line with the averave value
#' @param median boolean, if \code{TRUE}, displays a line with the median value
#' @param conf.int boolean, if \code{TRUE}, displays lines with the 95\% confidence interval (it is meaningful for variables that normally distributed)
#' @param quantile boolean, if \code{TRUE}, displays lines with the 25\% and 75\% quantiles
#' @param n.plots integer, the number of plots to display in the grid
#' @param common.legend boolean, if \code{TRUE} the plots uses a single legend
#' @param hist boolean, if \code{TRUE}, histograms are displayed instead of densities for the numerical variables
#' @param legend.position a string (\code{top}, \code{bottom}, \code{left} (Default)), \code{right} 
#'
#' @export

## }}}
gge_describe <- function(df, group=NULL, weight=NULL, mean=TRUE, median=FALSE, conf.int=TRUE, quantile=FALSE, n.plots=16, common.legend=TRUE, hist=FALSE, legend.position='top')
{
    if (length(group)>1) {
        stop("\n\nCurrently, this function supports only 1 group")
    }
    ## Debug/Monitoring message --------------------------
    msg <- paste0('\n','Generating the plots ...',  '\n'); cat(msg)
    ## ---------------------------------------------------
    options(warn=-1)
    on.exit(options(warn=0))

    g=list()
    j = 1
    idx.cat=NA
    idx.num=NA
    vars = names(df)
    for (i in 1:ncol(df))
    {
        var = vars[i]
        if (!var %in% c(group, weight)) {
            numeric = df[,var] %>% dplyr::pull(.)  %>% is.numeric
            if (numeric) {
                if (hist) {
                    g[[j]] = gge_histogram(df[,c(var, group, weight)], group, weight)
                    idx.num = c(idx.num, j)
                }else{
                    g[[j]] = gge_density(df[,c(var, group, weight)], group, weight)
                    idx.num = c(idx.num, j)
                }
            }else{
                g[[j]] = gge_barplot_one(df[,c(var, group)], variable=var, group=group)
                idx.cat = c(idx.cat, j)
            }
        }
        j=j+1
    }
    idx.num=idx.num[-1]
    idx.cat=idx.cat[-1]
    remaining = length(g)
    ## display continuous covariates first
    chunks = list()
    i=1
    idx.plots = 1:length(g)
    idx.plots =idx.plots[c(idx.num,idx.cat)]
    while(length(idx.plots)>0){
        n = min(n.plots, length(idx.plots))
        chunks[[i]] = idx.plots[1:n]
        idx.plots = idx.plots[-c(1:n)]
        i=i+1
    }
    for (chunk in chunks)
    {
        print(ggpubr::ggarrange(plotlist=g[chunk],  common.legend = common.legend, legend="top"))

        remaining = remaining - length(chunk)
        if(remaining>0){
            cat(paste0("\n",remaining, " plots remaining to display.") )
            readline(prompt="\nPress any key to continue ...")
        }
    }
    invisible()
}
## {{{ docs }}}

#' Plot densities by group and facet, with Kolmogorov-Smirnov statistics
#'
#' @param df a data frame
#' @param value a string with the name of the numerical value to plot the density
#' @param group a string with the name of the variable to group the densities
#' @param facet a string with the name of the variable to create the grid of plots
#' @param scale see \code{\link[ggplot2]{facet_wrap}}
#'
#' @details All the variables in the data frame will be plotted
#' @export

## }}}
gge_fdensity <- function(df, value, group, facet, scale='fixed')
{
    if (length(group)>1) {
        stop("\n\nCurrently, this function supports only 1 group")
    }
    ## Debug/Monitoring message --------------------------
    ## msg <- paste0('\n','Generating the plots ...',  '\n'); cat(msg)
    ## ---------------------------------------------------
    options(warn=-1)
    on.exit(options(warn=0))

    tab = df %>%
        dplyr::rename_(group=group, value=value, facet=facet) %>% 
        dplyr::mutate(group=factor(group), facet=factor(facet))
    tab = tab %>% 
        dplyr::group_by(facet) %>%
        dplyr::summarise(d1 = list(base::split(value, group)[[1]]),
                         d2 = list(base::split(value, group)[[2]])) %>%
        dplyr::mutate(KS.test = purrr::map2_dbl(d1,d2, ~ ifelse( all(is.na(.x)) | all(is.na(.y)), 1,  ks.test(.x,.y)$p.value)),
                      KS.text = paste0("K-S test : ", round(KS.test,4)) ) %>%
        dplyr::select(facet, dplyr::contains("KS")) %>%
        dplyr::full_join(., tab  , by=c("facet"))
    g = tab  %>% 
        ggplot2::ggplot(.) +
        ggplot2::geom_density(ggplot2::aes(x=value, fill=group), alpha=.6) +
        ggplot2::facet_wrap( ~ facet,  scales=scale,labeller=ggplot2::label_parsed) +
        ggplot2::scale_x_continuous(expand = c(0, 0)) +
        ggplot2::scale_y_continuous(expand = c(0, 0)) +
        viridis::scale_fill_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
        viridis::scale_colour_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
        ggplot2::geom_text(data=tab %>% dplyr::select(facet, KS.text)  %>% dplyr::filter(!duplicated(.)), ggplot2::aes(x=-Inf, y=Inf, label=KS.text), vjust=2, hjust=-.5)
    g = g +
        ggplot2::theme_bw() +
        ggplot2::theme(strip.background = ggplot2::element_rect(colour="white", fill="white"),
                           strip.text.x = ggplot2::element_text(size=12, face='bold'),
                           strip.text.y = ggplot2::element_text(size=12, face="bold")) 
    g = g + ggplot2::theme(legend.position = "top") 

    return(g)
}
## {{{ docs }}}
#' Plot densities
#'
#' Plot densities of all numerical variables in the data frame
#'
#' @inheritParams gge_describe
#' 
#' @export
## }}}
gge_density <- function(df, group=NULL, weight=NULL, mean=TRUE, median=FALSE, conf.int=TRUE, quantile=FALSE)
{
    if (length(group)>1) {
        stop("\n\nCurrently, this function supports only 1 group")
    }
    ## Debug/Monitoring message --------------------------
    ## msg <- paste0('\n','Generating the plots ...',  '\n'); cat(msg)
    ## ---------------------------------------------------
    options(warn=-1)
    on.exit(options(warn=0))
    
    flag=FALSE
    ifelse(is.null(weight), df$weight<-1, df$weight <- df %>% dplyr::select_(weight) %>% dplyr::pull(.)) 
    if (is.null(group)) {
        flag=TRUE
        df$grouping_null__ = 1
        group="grouping_null__"
    }

    ## preparing the data to plot
    df[,group] = df[,group] %>% 
        dplyr::mutate_all(as.factor)
    tab = df %>% 
        summarise_alln(., group=group, weight="weight", spread=F)
    aes.statistics = c("Mean","Median",  "Quantile (2.5%, 97.5%)",  "Quantile (25%, 75%)")
    aes.var.names = c("Average", "Med", "CI.l", "CI.u", "q25", "q75")
    df$Average = aes.statistics [1]
    df$Med     = aes.statistics [2]
    df$CI.l    = aes.statistics [3]
    df$CI.u    =aes.statistics [3]
    df$q25     = aes.statistics [4]
    df$q75     = aes.statistics [4]

    tab = df %>%
        dplyr::select(-dplyr::one_of(group))  %>% 
        dplyr::select_if(is.numeric) %>%
        dplyr::bind_cols(., df[,c(group, aes.var.names)])  %>%
        tidyr::gather(var, value,  -group, -weight,  -aes.var.names)  %>%
        dplyr::full_join(., tab , by=c("var", group))  %>%
        dplyr::mutate(value=value*weight) 

    
    if(!is.null(group)){
        g = tab %>% 
            ggplot2::ggplot(.) + 
            ggplot2::geom_density(ggplot2::aes_string(x="value",  fill=group), adjust=1, alpha=.5) +
            ggplot2::facet_wrap( ~ var , scales='free',labeller=ggplot2::label_parsed)  +
            ggplot2::scale_x_continuous(expand = c(0, 0)) +
            ggplot2::scale_y_continuous(expand = c(0, 0)) +
            viridis::scale_fill_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
            viridis::scale_colour_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
            ggplot2::guides(linetype=ggplot2::guide_legend("")) 
    }else{
        g = tab %>% 
            ggplot2::ggplot(.) + 
            ggplot2::geom_density(ggplot2::aes(value), adjust=1, alpha=.5) +
            ggplot2::facet_wrap( ~ var , scales='free',labeller=ggplot2::label_parsed)  +
            ggplot2::scale_x_continuous(expand = c(0, 0)) +
            ggplot2::scale_y_continuous(expand = c(0, 0)) +
            ggplot2::guides(linetype=ggplot2::guide_legend("")) 
    }

    if(mean)
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="Mean", col=group, linetype="Average")) 

    if(median) 
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="Median", col=group, linetype="Med")) 

    if(conf.int) 
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.025", col=group, linetype="CI.l")) +
            ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.975", col=group, linetype="CI.u")) 

    if(quantile) 
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.25", col=group, linetype="q25")) +
            ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.75", col=group, linetype="q75")) 

    g = g + ggplot2::theme_bw() + ggplot2::theme(legend.position = "top") 

    ## KS test if two groups
    if (length(unique(df[,group] %>% dplyr::pull(.)))==2) {
        gr = df[,group]
        ks = tab %>%
            dplyr::group_by(var) %>%
            dplyr::summarise(d1 = list(base::split(value, gr)[[1]]),
                             d2 = list(base::split(value, gr)[[2]])) %>%
            dplyr::mutate(KS.test = purrr::map2_dbl(d1,d2, ~ks.test(.x,.y)$p.value),
                          KS.text = paste0("K-S test : ", round(KS.test,4)) ) %>%
            dplyr::select(var, dplyr::contains("KS")) 
        tab = tab %>%
            dplyr::full_join(., ks , by=c("var")) 
        g = g + ggplot2::geom_text(data=tab %>% dplyr::select(var, KS.text)  %>% dplyr::filter(!duplicated(.)),
                          ggplot2::aes(x=-Inf, y=Inf, label=KS.text), vjust=2, hjust=-.5)
    }
    g= g+ggplot2::theme(strip.background = ggplot2::element_rect(colour="white", fill="white"),
               strip.text.x = ggplot2::element_text(size=12, face='bold'),
               strip.text.y = ggplot2::element_text(size=12, face="bold")) 
    if (group[1] == "grouping_null__") {
        g = g + ggplot2::guides(fill=FALSE, colour=F)
    }

    return(g)
}
## {{{ docs }}}

#' Plot histograms
#'
#' @inheritParams gge_describe
#'
#' @details All the variables in the data frame will be plotted
#' @export

## }}}
gge_histogram <- function(df, group=NULL, weight=NULL, mean=T, median=F, conf.int=T, quantile=F, legend.position='top')
{
    if (length(group)>1) {
        stop("\n\nCurrently, this function supports only 1 group")
    }
    ## Debug/Monitoring message --------------------------
    ## msg <- paste0('\n','Generating the plots ...',  '\n'); cat(msg)
    ## ---------------------------------------------------
    options(warn=-1)
    on.exit(options(warn=0))
    
    flag=FALSE
    ifelse(is.null(weight), df$weight<-1, df$weight <- df %>% dplyr::select_(weight) %>% dplyr::pull(.)) 
    if (is.null(group)) {
        flag=TRUE
        df$grouping_null__ = 1
        group="grouping_null__"
    }

    ## preparing the data to plot
    df[,group] = df[,group] %>% dplyr::mutate_all(as.factor)
    tab = df %>% summarise_alln(., group=group, weight="weight", spread=F)
    aes.statistics = c("Mean","Median",  "Quantile (2.5%, 97.5%)",  "Quantile (25%, 75%)")
    aes.var.names = c("Average", "Med", "CI.l", "CI.u", "q25", "q75")
    df$Average = aes.statistics [1]
    df$Med     = aes.statistics [2]
    df$CI.l    = aes.statistics [3]
    df$CI.u    =aes.statistics [3]
    df$q25     = aes.statistics [4]
    df$q75     = aes.statistics [4]

    tab = df %>%
        dplyr::select(-dplyr::one_of(group))  %>% 
        dplyr::select_if(is.numeric) %>%
        dplyr::bind_cols(., df[,c(group, aes.var.names)])  %>%
        tidyr::gather(var, value,  -group, -weight,  -aes.var.names)  %>%
        dplyr::full_join(., tab , by=c("var", group))  %>%
        dplyr::mutate(value=value*weight) 

    
    if(!is.null(group)){
        g = tab %>% 
            ggplot2::ggplot(.) + 
            ggplot2::geom_histogram(ggplot2::aes_string(x="value",  fill=group), alpha=.5) +
            ggplot2::facet_wrap( ~ var , scales='free',labeller=ggplot2::label_parsed)  +
            ggplot2::scale_x_continuous(expand = c(0, 0)) +
            ggplot2::scale_y_continuous(expand = c(0, 0)) +
            viridis::scale_fill_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
            viridis::scale_colour_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
            ggplot2::guides(linetype=ggplot2::guide_legend("")) +
            ggplot2::theme(legend.position = legend.position) 
    }else{
        g = tab %>% 
            ggplot2::ggplot(.) + 
            ggplot2::geom_histogram(ggplot2::aes(value), alpha=.5) +
            ggplot2::facet_wrap( ~ var , scales='free',labeller=ggplot2::label_parsed)  +
            ggplot2::scale_x_continuous(expand = c(0, 0)) +
            ggplot2::scale_y_continuous(expand = c(0, 0)) +
            ggplot2::guides(linetype=ggplot2::guide_legend("")) 
    }

    if(mean)
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="Mean", col=group, linetype="Average")) 

    if(median) 
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="Median", col=group, linetype="Med")) 

    if(conf.int) 
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.025", col=group, linetype="CI.l")) +
            ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.975", col=group, linetype="CI.u")) 

    if(quantile) 
        g = g + ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.25", col=group, linetype="q25")) +
            ggplot2::geom_vline(ggplot2::aes_string(xintercept="q.75", col=group, linetype="q75")) 

    g = g + ggplot2::theme(legend.position = "top") +
        ggplot2::theme_bw() +
        ggplot2::theme(strip.background = ggplot2::element_rect(colour="white", fill="white"),
              strip.text.x = ggplot2::element_text(size=12, face='bold'),
              strip.text.y = ggplot2::element_text(size=12, face="bold")) 
    if (group[1] == "grouping_null__") {
        g = g + ggplot2::guides(fill=FALSE, colour=F)
    }
    return(g)
}
## {{{ docs }}}

#' Plot categorical Variables using barplot
#'
#' @inheritParams gge_describe
#' @param ncol integer with the number of columns in the grid to plot. If \code{NULL} (default), plots will be places in a squared grid
#' @param nrow inteter with the number of rows in the grid to plot. If \code{NULL} (default), plots will be places in a squared grid
#'
#' @export

## }}}
gge_barplot <- function(df, group=NULL, ncol=NULL, nrow=NULL)
{
    options(warn=-1)
    on.exit(options(warn=0))

    if (length(group)>1) {
        stop("\n\nCurrently, this function supports only 1 group")
    }
    ## Debug/Monitoring message --------------------------
    ## msg <- paste0('\n','Generating the plots ...',  '\n'); cat(msg)
    ## ---------------------------------------------------
    vars = names(df) 
    g = list()
    j = 1
    for (i in 1:length(vars))
    {
        var = vars[i]
        if (!is.numeric(df[,var] %>% dplyr::pull(.))) {
            if (!is.null(group)) {
                if (var != group) {
                    g[[j]] = gge_barplot_one(df, var, group) 
                    j = j+1
                }
            }else{
                g[[j]] = gge_barplot_one(df, var, group) 
                j = j+1
            }
        }
    }
    print(ggpubr::ggarrange(plotlist=g, common.legend=TRUE, ncol=ncol, nrow=nrow))
    invisible()
}
## {{{ docs }}}

#' Descriptive Plots
#'
#' This function generate plots with all the variables in the data frame individually. It is a faster version of gge_describe
#'
#' @param data a data frame
#' @param qline boolean, if \code{TRUE} the plots will display lines in the quantiles
#' @param quantiles vector with values between 0 and 1 indicating the quantiles to display
#' 
#' @details All the variables in the data frame will be plotted
#' @export

## }}}
plot_edescribe <- function(data, qline=T, quantiles=c(.025,.975))
{
    op=graphics::par(no.readonly=TRUE)

    ## Debug/Monitoring message --------------------------
    msg <- paste0('\n','Generating the plots ...',  '\n'); cat(msg)
    ## ---------------------------------------------------

    dfc = data %>% 
        dplyr::select_if(function(col) !is.numeric(col)) 
    dfn = data %>%
        dplyr::select_if(is.numeric)
    ## check if there is any numerical variable for the summary
    maxVars <- 16
    if(ncol(data)>16)
        stop(cat("\n\n Max variables permited by default:",maxVars))

    m <- ceiling(sqrt(ncol(data)))
    if(m*(m-1)>=ncol(data))
        graphics::par(mfrow=c(m-1,m))
    else
        graphics::par(mfrow=c(m,m))

    if (nrow(dfn)>0){
        X <- dfn
        for (i in 1:ncol(X)){
            x <- X[,i] %>% dplyr::pull(.)
            x = x[!is.na(x)]
            edar_plotdensity(x,main=names(dfn)[i], qline)
            q <- stats::quantile(x,probs=quantiles)
            graphics::abline(v=q, col='red', lty=2)
            graphics::abline(v=mean(x, na.rm=T), col='red', lty=1)
            graphics::legend('topleft', lty=c(1,2), legend=c("Mean", paste0("Quantiles (", paste0(paste0(100*quantiles, "%"), collapse=','),  ")") ), col=c('red', 'red'), bty='n')
        }
    }
    if (ncol(dfc)>0 ){
        for (i in 1:ncol(dfc)){
            x <- dfc[,i] %>% dplyr::pull(.)
            edar_barPlot(as.factor(x), title=names(dfc)[i])
        }
    }
    graphics::par(op)
}


## {{{ docs }}}
#' Barplot
#'
#' This function generate a barplot
#'
#' @param data a data frame
#' @param variable string with the name of the categorical variable that is going to be used to create the plot
#' @param title either \code{NULL} or a string with the title of the plot
#' @param subtitle either \code{NULL} or a string with the subtitle of the plot
#' @inheritParams gge_describe
#' 
#' @export
## }}}
gge_barplot2 <- function(data, variable, group=NULL, title = NULL, subtitle=NULL, legend.position='top')
{
    options(warn=-1)
    on.exit(options(warn=0))
    
    cat = variable
    if (!is.null(group)) {
        g =  data %>% 
            dplyr::select(cat, group) %>%
            dplyr::rename(cat = !!cat, group=!!group) %>% 
            dplyr::mutate(group=factor(group))  %>%
            dplyr::group_by(cat) %>%
            dplyr::mutate(cat.label = paste0(cat, "\n(n=",n(),")" ))  %>% 
            dplyr::ungroup(.)  %>% 
            dplyr::mutate(cat = cat.label)  %>% 
            dplyr::group_by(cat, group) %>%
            dplyr::summarise(N=n())  %>% 
            dplyr::ungroup(.)  %>% 
            dplyr::mutate(Percentage= round(100*N/sum(N, na.rm=T),2) )  %>%
            dplyr::mutate(label = paste0(Percentage, " % (n=",N,")") )  %>%
            dplyr::ungroup(.)  %>%
            ggplot2::ggplot(.) +
            ggplot2::geom_col(ggplot2::aes(y= Percentage, x = cat, fill=group, group=group), stat='identity', position = 'dodge', alpha=.6) + 
            ggplot2::geom_text(ggplot2::aes(y= Percentage, x = cat, label=label, group=group), hjust=-.1, size=3, position = ggplot2::position_dodge(1))+
            ggplot2::coord_flip() +
            ggplot2::xlab("")+
            ggplot2::ggtitle(cat)+
            viridis::scale_fill_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
            ggplot2::theme(legend.position = "bottom") 
    }else{
        g = data %>% 
            dplyr::select(cat) %>%
            dplyr::rename(cat = !!cat) %>% 
            dplyr::group_by(cat) %>%
            dplyr::mutate(ylabel = paste0(unique(cat), "\n(N=",n(),")") )  %>% 
            dplyr::group_by(ylabel) %>% 
            dplyr::summarise(n=n())  %>% dplyr::mutate("Percentage"=round(100*n/sum(n),2)) %>% 
            dplyr::mutate(label = paste0(Percentage, "%") ,
                          ylabel=factor(ylabel, levels=as.character(ylabel[order(Percentage)]))) %>% 
            ggplot2::ggplot(.) +
            ggplot2::geom_bar(ggplot2::aes(y= Percentage, x = ylabel), stat='identity',position = 'dodge', alpha=.5) + 
            ggplot2::geom_text(ggplot2::aes(y= Percentage, x = ylabel, label=label), hjust=-.1, size=4)+
            ggplot2::coord_flip() +
            ggplot2::xlab("")+
            ggplot2::ggtitle(cat)
    }
    if (!is.null(title) | !is.null(subtitle)) {
        if (is.null(title)) title=""
        g = g + ggplot2::ggtitle(title, subtitle=subtitle)
    }
    g = g + ggplot2::theme_bw()
    if (!is.null(group)) {
        g = g + ggplot2::theme(legend.position = legend.position) 
    }
    g = g + ggplot2::scale_y_continuous(expand = c(0, 0), limits=c(0,105), breaks=c(0,25,50,75,100), labels= )
    return(g)
}

## ---------
## anxillary
## ---------
gge_barplot_one <- function(df, variable, group=NULL)
{
    options(warn=-1)
    on.exit(options(warn=0))

    cat = variable
    if (!is.null(group)) {
        g =  df %>% 
            dplyr::select(cat, group) %>%
            dplyr::rename(cat = !!cat, group=!!group) %>% 
            dplyr::mutate(group=factor(group))  %>%
            dplyr::group_by(cat, group) %>%
            dplyr::mutate(ylabel = paste0(unique(cat), "\n(",group,"; N=",n(),")") )  %>%
            dplyr::group_by(ylabel, group) %>% 
            dplyr::summarise(n=n())  %>% 
            dplyr::ungroup(.) %>%
            dplyr::group_by(group) %>% 
            dplyr::mutate("Percentage"=round(100*n/sum(n),2))  %>%
            dplyr::mutate(label = paste0(Percentage, "%") ,
                          ylabel=factor(ylabel, levels=as.character(ylabel[order(Percentage)]))) %>% 
            ggplot2::ggplot(.) +
            ggplot2::geom_bar(ggplot2::aes(y= Percentage, x = ylabel, fill=group), stat='identity',position = 'dodge', alpha=.5) + 
            ggplot2::geom_text(ggplot2::aes(y= Percentage, x = ylabel, label=label), hjust=-.1, size=4)+
            ggplot2::coord_flip() +
            ggplot2::ylim(0,100) +
            ggplot2::xlab("")+
            ggplot2::ggtitle(cat)+
            viridis::scale_fill_viridis(option="A", discrete=TRUE, alpha=1, name="", begin=.066, end=.77) +
            ggplot2::theme(legend.position = "bottom") 
    }else{
        g = df %>% 
            dplyr::select(cat) %>%
            dplyr::rename(cat = !!cat) %>% 
            dplyr::group_by(cat) %>%
            dplyr::mutate(ylabel = paste0(unique(cat), "\n(N=",n(),")") )  %>% 
            dplyr::group_by(ylabel) %>% 
            dplyr::summarise(n=n())  %>% dplyr::mutate("Percentage"=round(100*n/sum(n),2)) %>% 
            dplyr::mutate(label = paste0(Percentage, "%") ,
                          ylabel=factor(ylabel, levels=as.character(ylabel[order(Percentage)]))) %>% 
            ggplot2::ggplot(.) +
            ggplot2::geom_bar(ggplot2::aes(y= Percentage, x = ylabel), stat='identity',position = 'dodge', alpha=.5) + 
            ggplot2::geom_text(ggplot2::aes(y= Percentage, x = ylabel, label=label), hjust=-.1, size=4)+
            ggplot2::coord_flip() +
            ggplot2::ylim(0,100) +
            ggplot2::xlab("")+
            ggplot2::ggtitle(cat)
    }
    g = g + ggplot2::theme_bw()
    return(g)
}
edar_plotdensity  <- function(x, ub=stats::quantile(x,.975), lb=stats::quantile(x,.025), shaded.area=F, shaded.area.col='grey90', lty=1, bty='n', add=F, grid=T, main="")
{
    data.rug = sample(x, size=min(1000, length(x)))
    if (add) {
        graphics::lines(stats::density(x, adjust=1), lty=lty,  cex.axis=.9)
    }else{
        graphics::plot(stats::density(x, adjust=1), lty=lty,  cex.axis=.9, main=main)
    }
    graphics::rug(x=data.rug)
    if(shaded.area){
    ## shadded area
        ## x.new <- density(x, adjust=1)$x
        x.new<- stats::density(x, adjust=1)$x
        y <- stats::density(x, adjust=1)$y
        x.coord <- c(lb, x.new[lb <= x.new & x.new <= ub], ub)
        y.coord <- c(0,  y[lb <= x.new & x.new <= ub], 0)
        graphics::polygon(x.coord,y.coord,col=shaded.area.col, lty=lty, border= NA)
        length(x.coord)
        length(y.coord)
    }
    if(grid) graphics::grid()
}
edar_barPlot       <- function(y, title='', show.group.name=T, show.group.count=T, col='lightgrey')
{
    if(class(y)=='table') {
        n <- sum(table)
        NAs <- 0
        table <- y
        tableProp <- round(y/sum(y),3)
    }else{
        y <- factor(y)
        n <- length(y)
        NAs <- sum(is.na(y))
        table <- table(y)
        tableProp <- round(table(y)/length(y),3)
        
    }
    if (show.group.count){
        N <- paste('\n(N=',table,')', sep='')
        names(tableProp) <-  paste(names(table),N)
    }        
    ypos <- graphics::barplot(tableProp, horiz=T, xlim=c(0,1.1), main=title,plot=F)
    graphics::par(las=1,cex.axis=.7,bty='l', pch=20, cex.main=.9, mar=c(4,5,3,0))
    graphics::barplot(tableProp, horiz=T, xlim=c(0,1.3), col=col, border=NA, xaxt='n', cex.axis=1.2, cex.names=1.2)
    title(main=title, cex.main=1.1)
    graphics::axis(1, at=seq(0,1,by=.2), labels=seq(0,1,by=.2))
    graphics::mtext(text=paste('N=',n-NAs,', NA=',NAs,sep=''), cex=.7)
    if(show.group.name) graphics::text(x=tableProp,y=ypos, labels=tableProp,cex=.9,pos=4)
}

## }}}
## {{{ Bivariate    Analysis (plots) }}}

## continuous variables
## --------------------
## {{{ docs }}}

#' Heat plot
#'
#'
#' @param x,y numeric vector with values to plot
#' @param xlab string to be displayed in the x-axis
#' @param ylab string to be displayed in the y-axis
#' @param title string with the title of the plot
#' @param axis boolean, if \code{TRUE} the axis are displayed
#' @param text text annotation 
#'
#'
#' @export

## }}}
plot_heat <- function(x='', y='', xlab='x', ylab='y', title='', text='', axis=TRUE){
    colors_palette <- grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(11,'Spectral')))
    r <- colors_palette(32)
    h1 <- graphics::hist(x, breaks=30, plot=F)
    h2 <- graphics::hist(y, breaks=30, plot=F)
    top <- max(h1$counts, h2$counts)
    k <- MASS::kde2d(x, y, n=200)
    ## margins
    if(axis){
        raster::image(k, col=r, xlab=xlab, ylab=ylab, bg='black')     
    }else{
        raster::image(k, col=r, xlab='', ylab='', bg='black', xaxt='n', yaxt='n')
    }
    graphics::title(title)
}
## {{{ docs }}}

#' Heat plot with histogram
#'
#' @inheritParams plot_heat
#' @param ylim two dimensional numeric vector with limits of the y-axis 
#' @param xlim two dimensional numeric vector with limits of the x-axis
#' @param breaks integer, for the marginal histograms
#' @param subtitle a string with subtitles of the plot 
#'
#' @export

## }}}
plot_heathist <- function( x='', y='', title='', subtitle='', xlab='x', ylab='y',  breaks=50, 
                         xlim=c(min(x), max(x)), ylim=c(min(y), max(y)))
{
    color_palette<- grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(11,'Spectral')))
    r <- color_palette(32)
    h1 <- graphics::hist(x, breaks=seq(xlim[1],xlim[2],length=breaks), plot=F)
    h2 <- graphics::hist(y, breaks=seq(ylim[1],ylim[2],length=breaks), plot=F)
    top <- max(h1$counts, h2$counts)
    k <- MASS::kde2d(x, y, n=200)
                                        # margins
    oldpar <- graphics::par(no.readonly = T)
    graphics::layout(matrix(c(2,0,1,3),2,2,byrow=T),c(3,1), c(1,3))
    graphics::par(las=1,cex.axis=.7,bty='l', pch=20, cex.main=.9, mar=c(5,5,.1,.1))
    graphics::image(k, col=r, xlab=xlab, ylab=ylab,  useRaster=T, xlim=xlim, ylim=ylim) #plot the image
    ##points(df$x,df$y, cex=.3)
    graphics::par(mar=c(0,4.5,3,0))
    graphics::barplot(h1$counts, axes=F, ylim=c(0, top), space=0, col='lightgrey', border='white')
    title(title)
    graphics::mtext(text=subtitle, cex=.8)
    graphics::par(mar=c(5,0,0.1,4))
    graphics::barplot(h2$counts, axes=F, xlim=c(0, top), space=0, col='lightgrey', horiz=T, border='white')
    graphics::par(oldpar)
}
## {{{ docs }}}

#' Contour plot
#'
#' @param df a data frame
#' @param x string with the name of the variable to plot in the x-axis
#' @param y string with the name of the variable to plot in the y-axis
#' @param points boolean, if \code{TRUE} displays the points
#' @param contour.breaks integer, number of breaks to use in the contour plot
#' @param palette a string with the name of a color brewer continuous palette
#' @param hist.fill a string with the colour to fill the histogram bars
#' @param hist.col a string with the color of the border of the histogram bars
#' @param group a string with the categories to group the points
#' @param xlim two dimensional numeric vectors with the limits of the x-axis 
#' @param xlab a string to be displayed in the x-axis
#' @param ylab a string to be displayed in the y-axis
#' @param direction either 1 or -1 indicating the direction of the color sequence.
#'
#' @export

## }}}
gge_contour <- function(df, x, y, points=FALSE, group=NULL, palette='Blues', hist.fill="lightsteelblue2", hist.col='white', xlim=NULL, xlab=NULL, ylab=NULL, contour.breaks=200, direction=1)
{
    if (is.null(group))  g= gge_contour_g0(df, x, y, points, group, palette, hist.fill, hist.col, xlim, xlab, ylab,
                                           contour.breaks=contour.breaks, direction=direction) 
    if (!is.null(group)) g= gge_contour_g1(df, x, y, points, group, palette, hist.fill, hist.col, xlim, xlab, ylab,
                                           contour.breaks=contour.breaks, direction=direction) 
    if (!is.null(xlim)) {
        g = g + xlim(xlim)
    }
    if (!is.null(xlab)) {
        g = g + xlab(xlab)
    }
    if (!is.null(ylab)) {
        g = g + ylab(ylab)
    }
    g = ggExtra::ggMarginal(g, type = "histogram",  colour=hist.col, fill = hist.fill)
    return(g)
}
gge_contour_g0 <- function(df, x, y, points, group=NULL, palette='Blues', hist.fill="lightsteelblue2", hist.col='white', xlim=NULL, xlab=NULL, ylab=NULL, contour.breaks = 200, direction=1)
{
    g = df %>%
        dplyr::rename(x=!!x, y=!!y)  %>%
        ggplot2::ggplot(., ggplot2::aes(x = x , y = y)) +
        ggplot2::stat_density_2d(ggplot2::aes(fill = ..level..), n=contour.breaks, size=0, contour=T, geom='polygon', alpha=.4) +
        ggplot2::scale_fill_distiller(palette=palette, direction=direction) +
        ggplot2::theme_bw() +
        ggplot2::theme(legend.position = "bottom")+
        ggplot2::guides(fill=FALSE)
    if (points) g = g + ggplot2::geom_point( colour="#00000024", size=2) 
    return(g)
}
gge_contour_g1 <- function(df, x, y, points, group=NULL, palette='A', hist.fill="lightsteelblue2", hist.col='white', xlim=NULL, xlab=NULL, ylab=NULL, contour.breaks = 200, direction=1)
{
    g = df %>%
        dplyr::rename(group=!!group, x=!!x, y=!!y)  %>%
        dplyr::mutate(group=factor(group))  %>% 
        ggplot2::ggplot(., ggplot2::aes(x = x , y = y, colour=group)) +
        ggplot2::stat_density_2d(ggplot2::aes(fill = ..level..), n=contour.breaks, colour='white', size=0,contour=T, geom='polygon',alpha=.4) +
        ggplot2::scale_fill_distiller(palette=palette, direction=direction) +
        ggplot2::theme_bw() +
        ggplot2::theme(legend.position = "bottom")+
        ggplot2::guides(fill=FALSE)
    if (points) g = g +  ggplot2::geom_point(size=1.2) +  ggplot2::scale_colour_brewer(palette='Dark2', name="")
    return(g)

}

## }}}
DiogoFerrari/edar documentation built on May 8, 2022, 8:26 a.m.