Nothing
##############################################################
### ###
### Subject: Compare Samples on a bivariate level ###
### Date: May 2023 ###
### Author: Bjoern Rohr ###
### Version: 1.00 ###
### ###
### Bugfix: / ###
### ###
##############################################################
#' Plot Comparison of Multiple Data Frames on a Bivariate Level
#'
#' Plot a object generated by \link{biv_compare} function.
#' @param biv_data_object A object generated by the \link{biv_compare} function.
#' @param plot_title A character string containing the title of the plot.
#' @param plots_label A character string or vector of character strings
#' containing the new labels of the data frames that are used in the plot.
#' @param p_value A number between 0 and one to determine the maximum
#' significance niveau.
#' @param varlabels A character string or vector of character strings containing
#' the new labels of variables that are used in the plot.
#' @param mar A vector that determines the margins of the plot.
#' @param note If \code{note = TRUE}, a note will be displayed to describe the plot.
# #' @param p_adjust Can be either \code{TRUE} or a character string indicating a adjustment method.
# #' If p_adjust=TRUE the p_values will be adjusted with the Bonferroni adjustment method, by default,
# #' to account for the problem of multiple comparisons. All adjustment methods available
# #' in \code{\link{p.adjust}} can be used here, with the same character strings.
#' @param grid A character string, that determines the color of the lines
#' between the tiles of the heatmap.
#' @param diff_perc If \code{TRUE} a percental measure of difference between
#' \code{dfs} and \code{benchmarks} is displayed in the plot.
#' @param diff_perc_size A number to determine the size of the displayed percental
#' difference between surveys in the plot.
#' @param perc_diff_transparance A number to determine the transparency of the
#' displayed percental-difference between surveys in the plot.
#' @param gradient If gradient = TRUE, colors in the heatmap will be more or
#' less transparent, depending on the difference in Pearson's r of the data
#' frames of comparison.
#' @param sum_weights A vector containing information for every variable to weigh them in
#' the displayed percental difference calculation. It can be used if some variables are
#' over- or underrepresented in the analysis.
# #' @param legend_show_x If \code{TRUE} the X will be shown in the legend. At the moment, das
# #' does not yet work correctly.
#' @param order A character vector to determine in which order the variables should be
#' displayed in the plot.
#' @param breaks A vector to label the color scheme in the legend.
#' @param colors A vector to determine the colors in the plot.
#' @param missings_x If TRUE, missing pairs in the plot will be marked with an X.
#' @param ncol_facet Number of columns used in faced_wrap() for the plots.
#'
#' @return A object generated with the help of [ggplot2::ggplot2()], used to visualize
#' the differences between the data frames and benchmarks.
#' @details The plot shows a heatmap of a correlation matrix, where the colors are determined by
#' the similarity of the Pearson's r value in both sets of respondents. Leaving
#' default breaks and colors,
#' * \code{Same} (green) indicates, that the Pearson's r correlation is not significant > 0 in
#' the related data frame or benchmark or the Pearson's r correlations are not significant
#' different, between data frame and benchmark.
#' * \code{Small Diff} (yellow) indicates that the Pearson's r
#' correlation is significant > 0 in the related data frame or benchmark and the Pearson's r
#' correlations are significant different, between data frame and benchmark.
#' * \code{Large Diff} (red) indicates, that the same coditions of yellow are fulfilled, and
#' the correlations are either in opposite directions,or one is double the size of the other.
#'
#' @examples
#'
#' ## Get Data for comparison
#'
#' data("card")
#'
#' north <- card[card$south==0,]
#' white <- card[card$black==0,]
#'
#' ## use the function to plot the data
#' bivar_data<-sampcompR::biv_compare(dfs = c("north","white"),
#' benchmarks = c("card","card"),
#' variables= c("age","educ","fatheduc","motheduc","wage","IQ"),
#' data=TRUE)
#'
#' sampcompR::plot_biv_compare(bivar_data)
#'
#' @export
plot_biv_compare<-function (biv_data_object, plot_title=NULL, plots_label=NULL,
p_value=NULL, varlabels=NULL,
mar = c(0,0,0,0),note=FALSE, grid="white",diff_perc=TRUE,
diff_perc_size=4.5,perc_diff_transparance=0,gradient=FALSE,sum_weights= NULL,
missings_x = TRUE, order=NULL, breaks=NULL,colors=NULL,
ncol_facet = 3){
plot_list<- biv_data_object
if(is.null(colors)==TRUE) colors=plot_list$colors
if (is.null(breaks)) breaks<-plot_list$breaks
if(is.null(p_value)==F){
help<- plot_list[[1]]
if(is.null(breaks)) breaks<-plot_list$breaks
help$p<- as.numeric(help$p)
help$bench_p<- as.numeric(help$bench_p)
help$p_diff<- as.numeric(help$p_diff)
help$corr<- as.numeric(help$corr)
help$corr_bench<- as.numeric(help$corr_bench)
help$value<- ifelse(help$bench_p >= p_value & help$p >= p_value,breaks[1],
ifelse(help$p_diff>=p_value,breaks[1],
ifelse((abs(help$corr)<2*abs(help$corr_bench)&
abs(help$corr)*2>abs(help$corr_bench)) &
((help$corr>0 & help$corr_bench>0)|
(help$corr<0 & help$corr_bench<0)),
breaks[2],breaks[3])))
plot_list[[1]]$value[plot_list[[1]]$value!="X" & is.na(plot_list[[1]]$value)==F]<-
help$value[plot_list[[1]]$value!="X" & is.na(plot_list[[1]]$value)==F]
rm(help)
}
##########################################
### Calculate percentage of difference ###
##########################################
if(diff_perc==TRUE) {
summary_df<-difference_summary(plot_list[[1]],breaks=breaks, sum_weights=sum_weights)
}
#return(summary_df)
################################
### change color of the grid ###
################################
if (grid!="white"){ # create a matrix for NA, to exclude from grid
### buid a df where no grid shall be set ###
na_df<-plot_list[[1]][is.na(plot_list[[1]]$value),]
### build a df, where the diagonal is.
plot_df2<-plot_list[[1]]
names_var<-as.character(unique(plot_df2$x))
names_var<-c(names_var,names_var[1])
plot_df2$value[is.na(plot_df2$value)]<- "not_edge"
for (i in 1:length(names_var)){
plot_df2$value[plot_df2$x==names_var[i+1] & plot_df2$y==names_var[i]]<-NA
}
edge_df<- plot_df2[is.na(plot_df2$value),]
}
#######################################
### reorder plots to original order ###
#######################################
#if (is.null(plots_label)) plot_list[[1]]$samp_name <- factor(plot_list[[1]]$samp_name, levels = unique(plot_list[[1]]$samp_name))
#if (is.null(plots_label)==FALSE) plot_list[[1]]$samp_name <- factor(plot_list[[1]]$samp_name, levels = plots_label)
if (is.null(plots_label)) plots_label <- plot_list$plots_label
if(length(plots_label)< length(plot_list$plots_label)) plots_label[(length(plots_label)+1):length(plot_list$plots_label)]<-
plot_list$plots_label[(length(plots_label)+1):length(plot_list$plots_label)]
breaks2<-c(breaks,"X")
colors2<-c(colors, "white")
plot_list[[1]]$shape<-NA
plot_list[[1]]$shape[plot_list[[1]]$value=="X"]<-"X"
plot_list$shape<- plot_list[[1]]$shape
labellist_biv<-function(lables,values){
output<-lables
names(output)<-as.character(values)
output
}
labellist<-labellist_biv(plots_label,c(1:length(plots_label)))
##############################
### order variables ###
##############################
if (is.null(order)==FALSE) plot_list[[1]]$x<-factor(plot_list[[1]]$x, levels =order)
if (is.null(order)==FALSE) plot_list[[1]]$y<-factor(plot_list[[1]]$y, levels = order)
##############################
### Label variables ###
##############################
variables_in<-unique(plot_list[[1]]$x)
if (is.null(varlabels)) varlabels<- unique(plot_list[[1]]$x)
if (length(varlabels)<length(variables_in)) varlables<-c(varlabels,variables_in[(length(varlabels)+1):length(variables_in)])
# ########################
# ### edit plots_label ###
# ########################
#
# if (is.null(plots_label)) plots_label<-"dfs"
# if (length(plots_label)<length(unique(plot_list[[1]]$samp_name))) plots_label<-c(plots_label,unique(plot_list[[1]]$samp_name)[(length(plots_label)+1:length(unique(plot_list[[1]]$samp_name)))])
#
######################
### Plots ###
######################
comparison_plot<-
ggplot2::ggplot(plot_list[[1]], ggplot2::aes(x = plot_list[[1]]$y, y = plot_list[[1]]$x)) +
#{if (gradient==TRUE) ggplot2::aes(alpha= gradient)}+
{if (grid != "none") ggplot2::geom_tile(colour= grid, lwd =1,linetype=1,
ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
{if (grid == "none") ggplot2::geom_tile(ggplot2::aes( fill = factor(plot_list[[1]]$value, levels = breaks)))}+
# {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = na_df, colour = "white", lwd=1,linetype=1,
# ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
# {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = edge_df, colour = grid, lwd=1,linetype=1,
# ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#ggplot2::geom_point(data= subset(plot_list[[1]], value=="X"), ggplot2::aes(x = y, y = x), show.legend = TRUE)+
{if(missings_x==TRUE)
ggplot2::geom_point(show.legend = FALSE, na.rm = TRUE, ggplot2::aes(shape= factor(plot_list$shape, levels="X", labels=c("Missing"))))}+
ggplot2::coord_fixed()+
ggplot2::scale_fill_manual(values= colors, name="", na.translate = FALSE)+
ggplot2::scale_y_discrete(name="", limits = rev(levels(plot_list[[1]]$x)), labels= varlabels, breaks=unique(plot_list[[1]]$x))+
ggplot2::scale_x_discrete(name="", limits = levels(plot_list[[1]]$y), labels= varlabels, breaks=unique(plot_list[[1]]$y))+
ggplot2::scale_shape_manual(name="", values = c("Missing"=4))+
ggplot2::theme_classic()+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.33, hjust=0),
axis.text.y = ggplot2::element_text(vjust = 0.33, hjust=0),
axis.title.x= ggplot2::element_blank(),
axis.title.y= ggplot2::element_blank(),
plot.margin = grid::unit(mar, "cm"),
plot.caption=ggplot2::element_text(hjust = 0))+
ggplot2::ggtitle(plot_title)+
ggplot2::guides(alpha="none",
fill = ggplot2::guide_legend(order = 1),
shape = ggplot2::guide_legend(order = 2))+
ggplot2::facet_wrap(~ factor(samp,levels=unique(samp),labels = labellist), labeller = ggplot2::labeller(samp = labellist),ncol = ncol_facet)
if(note==TRUE) comparison_plot<-comparison_plot + ggplot2::labs(caption = plot_list$note_text)
if (diff_perc==TRUE) {
label<-summary_df$label
comparison_plot <- comparison_plot + ggplot2::geom_label(ggplot2::aes(x=rep(Inf,length(label)),
y=rep(Inf,length(label)),
hjust = rep(1,length(label)),
vjust = rep(1,length(label))),
data=summary_df,
label = summary_df$label,
fill = ggplot2::alpha("white", perc_diff_transparance),
color = ggplot2::alpha("black", 1), size= diff_perc_size, show.legend = FALSE)
}
return (comparison_plot)
}
#' Plot Difference or Relative Difference in Pearson's r for Multiple Data Frames
#'
#' Plot a object generated by \link{biv_compare} function as a heatmap.
#' @param biv_data_object A object generated by the \link{biv_compare} function.
#' @param plot_title A character string containing the title of the plot.
#' @param plots_label A character string or vector of character strings
#' containing the new labels of the data frames that are used in the plot.
#' @param varlabels A character string or vector of character strings containing
#' the new labels of variables that are used in the plot.
#' @param grid A character string, that determines the color of the lines
#' between the tiles of the heatmap.
#' @param summetric If \code{TRUE} Average Absolute Difference (AAB) and the
#' Average Absolute Relative Difference (AARB) of Pearson's r values between the
#' \code{dfs} and the \code{benchmarks} is displayed in the plot.
#' @param summet_size A number to determine the size of the displayed
#' \code{summetric} in the plot.
#' @param summet_transparance A number to determine the transparency of the
#' displayed \code{summetric}.
#' @param colors A vector of two colors used in the heatmap.
# #' @param missings_x If TRUE, missing pairs in the plot will be marked with an X.
#' @param ncol_facet Number of columns used in faced_wrap() for the plots.
#' @param corr_size The font size of correlation numbers displayed in the tiles of the heatmap.
#' @param ndigits_summet The maximum number of digits for numbers displayed in the summertic of the plot.
#' @param legend_title A character string indicating the title of the legend of the plot.
#' @param value A character string which is either \code{"AAB"} or \code{"AARB"}.
#' AAB means that the Absolute Difference in Pearson's r estimates between dfs
#' and the benchmarks should be displayed in the tiles by number and color. AARB
#' means that the Absolute Relative Difference should be displayed instead.
#' @param upper_limit,lower_limit A numeric value, indicating the highest or lowest
#' value that should be displayed in the tiles by number and color. This does
#' not affect the \code{summetric}. However, it can be used to keep differences
#' between values visible in the heatmap, even in the presence of strong ouliers.
#' @param number_color A character string indicating the color of the numbers,
#' displayed in the tiles.
#' @param ndigits_number The maximum digits of numbers displyed in the tiles of the
#' heatmap.
#' @param interest_breaks A numeric vector indicating the breaks for the color scheme
#' displayed in the legend of the heatmap.
#' @param interest_labels A character vector indicating the labels for the breaks
#' displayed in the legend of the heatmap.
#'
#' @return A object generated with the help of [ggplot2::ggplot2()], used to visualize
#' a heatmap of the bivariate differences between the data frames and benchmarks.
#' @details The plot shows a heatmap of a correlation matrix, where the colors are determined by
#' the Absolue Difference or the Absolute Relative Difference in Pearson's r estimates
#' between the data frames and the benchmarks.
#'
#' @examples
#'
#' ## Get Data for comparison
#'
#' data("card")
#'
#' north <- card[card$south==0,]
#' white <- card[card$black==0,]
#'
#' ## use the function to plot the data
#' bivar_data<-sampcompR::biv_compare(dfs = c("north","white"),
#' benchmarks = c("card","card"),
#' variables= c("age","educ","fatheduc","motheduc","wage","IQ"),
#' data=TRUE)
#'
#' Absolute_Bias_Plot<-sampcompR::heatmap_biv_compare(bivar_data,value = "AAB")
#' Absolute_Bias_Plot
#'
#' Absolute_Relative_Bias_Plot<-sampcompR::heatmap_biv_compare(bivar_data,value = "AARB")
#' Absolute_Relative_Bias_Plot
#'
#'
#' @export
heatmap_biv_compare<-function(biv_data_object,
value="AAB",
summet_transparance=0,
summetric=TRUE,
summet_size=4.5,
ndigits_summet=3,
upper_limit=NULL,
lower_limit=NULL,
corr_size=3,
ndigits_number=2,
varlabels=NULL,
plots_label=NULL,
grid="white",
colors=c("#8ECCEE","#1F45F9"),
number_color="white",
ncol_facet=3,
legend_title=NULL,
interest_breaks=NULL,
interest_labels=NULL,
plot_title=NULL){
### Chose Variable Labels ###
variables_in<-unique(biv_data_object[[1]]$x)
if (is.null(varlabels)) varlabels<- unique(biv_data_object[[1]]$x)
if (length(varlabels)<length(variables_in)) varlables<-c(varlabels,variables_in[(length(varlabels)+1):length(variables_in)])
labellist_biv<-function(lables,values){
output<-lables
names(output)<-as.character(values)
output
}
# define labels
if (is.null(plots_label)) plots_label <- biv_data_object$plots_label
if(length(plots_label)< length(biv_data_object$plots_label)) plots_label[(length(plots_label)+1):length(biv_data_object$plots_label)]<-
biv_data_object$plots_label[(length(plots_label)+1):length(biv_data_object$plots_label)]
labellist<-labellist_biv(plots_label,c(1:length(plots_label)))
biv_data_object[[1]]$difference_r[is.na(biv_data_object[[1]]$value)]<-NA
biv_data_object[[1]]$abs_rel_difference_r[is.na(biv_data_object[[1]]$value)]<-NA
samp<-factor(biv_data_object[[1]]$samp)
### Calculate AAB
AAB<-biv_data_object[[1]] %>%
dplyr::mutate(samp=factor(biv_data_object[[1]]$samp)) %>%
dplyr::group_by(samp) %>%
dplyr::summarise(AAB=mean(abs(as.numeric(biv_data_object[[1]]$difference_r)),na.rm=T))
### Calculate AARB
AARB<-biv_data_object[[1]] %>%
dplyr::mutate(samp=factor(biv_data_object[[1]]$samp)) %>%
dplyr::group_by(samp) %>%
dplyr::summarise(AARB=mean(abs(as.numeric(biv_data_object[[1]]$abs_rel_difference_r)),na.rm=T))
### Calculate summary data frame
summary_df<-AAB %>%
dplyr::transmute(samp = samp,
label = paste0("AAB ",format(round(AAB,digits=ndigits_summet),nsmall=ndigits_summet),"\nAARB ",format(round(AARB$AARB,digits=ndigits_summet),nsmall=ndigits_summet)))
### Chose value of interest
if(value=="AAB"){
value_of_interest<- abs(as.numeric(biv_data_object[[1]]$difference_r))
### set limits for heamt map
if(is.null(upper_limit)){
upper_limit=max(abs(as.numeric(biv_data_object[[1]]$difference_r)),na.rm=T)
}
if(is.null(lower_limit)){
lower_limit=min(abs(as.numeric(biv_data_object[[1]]$difference_r)),na.rm=T)
}
if(is.null(legend_title)) legend_title<-"Abs. Difference \nin Pearson's r"
}
if(value=="AARB"){
value_of_interest<- abs(as.numeric(biv_data_object[[1]]$abs_rel_difference_r))
### set limits for heamt map
if(is.null(upper_limit)){
upper_limit2=max(abs(as.numeric(biv_data_object[[1]]$abs_rel_difference_r)),na.rm=T)
}
if(is.null(lower_limit)){
lower_limit2=min(abs(as.numeric(biv_data_object[[1]]$abs_rel_difference_r)),na.rm=T)
}
if(is.null(legend_title)) legend_title<-"Abs. Relative Difference \nin Pearson's r"
}
max<-round(max(value_of_interest,na.rm=T),digits=1)
min<-round(min(value_of_interest,na.rm=T),digits=1)
### manage limits in plot
value_of_interest_txt<-format(round(value_of_interest,digits=ndigits_number),nsmall=ndigits_number)
if(is.null(upper_limit)==F){
upper_limit2<-upper_limit
value_of_interest_txt[value_of_interest>upper_limit2]=paste0(">",round(upper_limit2,digits=2))
value_of_interest_txt<-gsub("[[:space:]]", "", value_of_interest_txt)
value_of_interest_txt[biv_data_object[[1]]$value=="X"]<-"X"
value_of_interest_txt[is.na(value_of_interest)]=""
value_of_interest[value_of_interest>upper_limit2]<- upper_limit2
}
if(is.null(lower_limit)==F){
lower_limit2<-lower_limit
value_of_interest_txt[value_of_interest<lower_limit2]=paste0("<",round(lower_limit2,digits=2))
value_of_interest_txt<-gsub("[[:space:]]", "", value_of_interest_txt)
value_of_interest_txt[biv_data_object[[1]]$value=="X"]<-"X"
value_of_interest_txt[is.na(value_of_interest)]=""
value_of_interest[value_of_interest<lower_limit2]<- lower_limit2
}
if(is.null(interest_breaks)){
interest_breaks<-ggplot2::waiver()
}
if(is.null(interest_labels)){
interest_labels<-ggplot2::waiver()
}
##############################
### Edit grid if not white ###
##############################
# if (grid!="white"){ # create a matrix for NA, to exclude from grid
#
# ### buid a df where no grid shall be set ###
# na_df<-biv_data_object[[1]][is.na(biv_data_object[[1]]$value),]
#
# ### build a df, where the diagonal is.
# plot_df2<-biv_data_object[[1]]
# names_var<-as.character(unique(plot_df2$x))
# names_var<-c(names_var,names_var[1])
# plot_df2$value[is.na(plot_df2$value)]<- "not_edge"
#
# for (i in 1:length(names_var)){
# plot_df2$value[plot_df2$x==names_var[i+1] & plot_df2$y==names_var[i]]<-NA
# }
#
# edge_df<- plot_df2[is.na(plot_df2$value),]
# }
comparison_plot<-
ggplot2::ggplot(biv_data_object[[1]], ggplot2::aes(x = biv_data_object[[1]]$y, y = biv_data_object[[1]]$x)) +
{if (grid != "none") ggplot2::geom_tile(colour= grid, lwd =1,linetype=1,
ggplot2::aes(fill = value_of_interest))}+
{if (grid == "none") ggplot2::geom_tile(ggplot2::aes( fill = value_of_interest))}+
# {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = na_df, colour = "white", lwd=1,linetype=1,
# ggplot2::aes(fill = value))}+
# {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = edge_df, colour = grid, lwd=1,linetype=1,
# ggplot2::aes(fill = value))}+
ggplot2::geom_text(ggplot2::aes(label = value_of_interest_txt), color = number_color, size = corr_size) +
ggplot2::coord_fixed()+
#ggplot2::scale_fill_manual(values= colors, name="", na.translate = FALSE)+
ggplot2::scale_y_discrete(name="", limits = rev(levels(biv_data_object[[1]]$x)), labels= varlabels, breaks=unique(biv_data_object[[1]]$x))+
ggplot2::scale_x_discrete(name="", limits = levels(biv_data_object[[1]]$y), labels= varlabels, breaks=unique(biv_data_object[[1]]$y))+
ggplot2::scale_fill_gradient(na.value = 'white', low = colors[1], high=colors[2],limits=c(lower_limit2,upper_limit2),breaks=interest_breaks, labels=interest_labels)+
#ggplot2::scale_shape_manual(name="", values = c("Missing"=4))+
ggplot2::theme_classic()+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.33, hjust=0),
axis.text.y = ggplot2::element_text(vjust = 0.33, hjust=0),
axis.title.x= ggplot2::element_blank(),
axis.title.y= ggplot2::element_blank(),
# plot.margin = grid::unit(mar, "cm"),
plot.caption=ggplot2::element_text(hjust = 0))+
ggplot2::ggtitle(plot_title)+
ggplot2::guides(alpha="none",
#fill = ggplot2::guide_legend(order = scale_direction),
fill= ggplot2::guide_colourbar(title = legend_title))+
#shape = ggplot2::guide_legend(order = scale_direction))+
ggplot2::facet_wrap(~ factor(samp,levels = unique(samp),labels = labellist), labeller = ggplot2::labeller(samp = labellist),ncol = ncol_facet)
if(isTRUE(summetric)){
label<-summary_df$label
comparison_plot <- comparison_plot +
ggplot2::geom_label(ggplot2::aes(x=rep(Inf,length(label)),
y=rep(Inf,length(label)),
hjust = rep(1,length(label)),
vjust = rep(1,length(label))),
data=summary_df,
label = summary_df$label,
#fill = ggplot2::alpha("white", perc_diff_transparance),
color = ggplot2::alpha("black", 1), size= summet_size, show.legend = FALSE)
}
comparison_plot
}
# comb_biv_att_heat<-biv_compare_heat_map(attrition_all_biv,
# ncol_facet = 4, value="AARB",
# upper_limit = 5,
# interest_breaks = c(1,2,3,4,5),
# interest_labels = c("1","2","3","4",">5"),
# number_color="black",corr_size=2.3)
#
# comb_biv_att_heat<-comb_biv_att_heat+ theme(text=element_text(size=20))
# comb_biv_att_heat
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.