R/Bivdiff2.R
In sampcompR: Comparing and Visualizing Differences Between Surveys

#' ##############################################################
#' ###                                                        ###
#' ### 		Subject:	Compare Samples on a bivariate level     ###
#' ### 		Date: 		May 2023                                 ###
#' ### 		Author: 	Bjoern Rohr                              ###
#' ### 	Version:  	1.00                                     ###
#' ###                                                        ###
#' ### 		Bugfix:   	/                                      ###
#' ###                                                        ###
#' ##############################################################
#'
#' ############################################
#' ### function for only 1 benchmark and df ###
#' ############################################
#'
#' biv_comp_subfunction2<-function(df, benchmark, data = TRUE, corrtype="r",plot_title=NULL,
#'                                variables=NULL,ID=NULL,ID_bench=NULL,
#'                                weight=NULL,weight_bench=NULL, strata=NULL,strata_bench=NULL,
#'                                p_value=NULL, varlabels=NULL, mar = c(0,0,0,0),
#'                                note=T,p_adjust=NULL, full=FALSE, grid="white",diff_perc=F,
#'                                diff_perc_size=4.5,perc_diff_transparance=0 ,gradient=F,
#'                                breaks=breaks, colors=NULL, remove_nas="pairwise", nboots=0,
#'                                parallel = F) {
#'
#'
#'   ### Build title
#'   df1_label<-deparse(substitute(df))
#'   bench_label<-deparse(substitute(benchmark))
#'   plot_title<-ifelse(is.null(plot_title),paste("Compare ", df1_label," & ",bench_label, sep = "", collapse = NULL), plot_title)
#'
#'   weighted_cor<-function(x,weight, ID, strata=NULL,return="r"){
#'
#'     variables<-colnames(x)
#'     if(is.null(ID)==F) if(is.na(ID)==F)variables<-variables[!variables==ID]
#'     if(is.null(weight)==F) if(is.na(weight)==F) variables<-variables[!variables==weight]
#'     if(is.null(strata)==F) if(is.na(strata)==F) variables<-variables[!variables==strata]
#'
#'
#'     if(is.null(ID)==F) {
#'       if(is.na(ID)==F) ID<-x[,ID]
#'       else ID<-c(1:nrow(x))}
#'     if(is.null(ID)==T) ID<-c(1:nrow(x))
#'
#'     if(is.null(weight)==F) {
#'       if(is.na(weight)==F) weight<-x[,weight]
#'       else weight<-c(rep(1,nrow(df)))}
#'     if(is.null(weight)==T) weight<-c(rep(1,nrow(df)))
#'
#'     ### normalize the weight
#'     weight<- weight/(sum(weight)/nrow(x))
#'
#'     if(is.null(strata)==F){
#'       if (is.na(strata)==F) strata<-x[,strata]
#'       else strata<-NULL
#'     }
#'
#'
#'
#'
#'     df_weighted<- survey::svydesign(id      = ID,
#'                                     strata  = strata,
#'                                     weights = weight,
#'                                     nest    = FALSE,
#'                                     data    = x)
#'
#'     # insertform<-paste("~",colnames(x[1]))
#'     # for (i in 2:(length(x))) {
#'     #   insertform<-paste(insertform," + ",colnames(x[i]), collapse = "")
#'     # }
#'
#'
#'     matrix<-jtools::svycor(formula=stats::reformulate(variables), design = df_weighted, na.rm = TRUE,sig.stats=F)
#'     if(return=="p") matrix2<-jtools::svycor(formula=stats::reformulate(variables), design = df_weighted, na.rm = TRUE,sig.stats=T)
#'     matrix$cors[matrix$cors==1]<-NA
#'
#'     if(return=="r") return(matrix$cors)
#'     if(return=="p") return(matrix2$p.values)
#'   }
#'
#'   ############################
#'   ### equalize data frames ###
#'   ############################
#'
#'   ###############################################
#'   ### When benchmark is a object of dataframe ###
#'   ###############################################
#'   if (inherits(benchmark,"data.frame")) {
#'   df2<-dataequalizer(benchmark,df,variables=variables,silence=T)
#'   benchmark2<-dataequalizer(df2,benchmark,variables=variables,silence=T)
#'
#'
#'   #### add weight variables again ifany ###
#'   if (is.null(weight)==F) df2<-df[,c(colnames(df2),weight)]
#'   if (is.null(ID)==F) df2<-df[,c(colnames(df2),ID)]
#'   if (is.null(strata)==F) df2<-df[,c(colnames(df2),strata)]
#'
#'   if (is.null(weight_bench)==F) benchmark2<-benchmark[,c(colnames(benchmark2),weight_bench)]
#'   if (is.null(ID_bench)==F) benchmark2<-benchmark[,c(colnames(benchmark2),ID_bench)]
#'   if (is.null(strata_bench)==F) benchmark2<-benchmark[,c(colnames(benchmark2), strata_bench)]
#'
#'
#'   #colnames(df)<-c(as.character(1:(ncol(df))))
#'   ### Build correlation matrices
#'   if(remove_nas=="all") df2<-stats::na.omit(df2)
#'   df<-df2
#'   if (is.null(weight)==FALSE & is.null(ID)==FALSE) {
#'     df[,weight]<-NULL
#'     df[,ID]<-NULL
#'     #ID<-df2[,ID]
#'     #weight<-df2[,weight]
#'     if (is.null(strata)==F) {
#'       df[,strata]<-NULL
#'       #strata<-df2[,strata]
#'       }
#'   }
#'
#'   if(remove_nas=="all") benchmark2<-stats::na.omit(benchmark2)
#'   benchmark<-benchmark2
#'   if (is.null(weight_bench)==FALSE & is.null(ID_bench)==FALSE) {
#'     benchmark[,weight_bench]<-NULL
#'     benchmark[,ID_bench]<-NULL
#'     #ID_bench<-benchmark2[,ID_bench]
#'     #weight_bench<-benchmark2[,weight_bench]
#'     if (is.null(strata_bench)==F) {
#'       benchmark[,strata_bench]<-NULL
#'       #strata_bench<-benchmark2[,strata_bench]
#'       }
#'     }
#'   }
#'
#'   ###########################################
#'   ### When benchmark is a object of rcorr ###
#'   ###########################################
#'
#'   if(inherits(benchmark,"data.frame")==F){
#'
#'     fit_cor<-function(rcorr_object,df=NULL,variables=NULL){
#'
#'       ### check for variables in df and variables ###
#'       if (is.null(variables)==F) vars<-variables[variables %in% colnames(df)]
#'       if (is.null(variables)) vars<- colnames(df)
#'
#'
#'       rcorr_object[[1]]<-rcorr_object[[1]][rownames(rcorr_object[[1]]) %in% vars, colnames(rcorr_object[[1]]) %in% vars]
#'       rcorr_object[[2]]<-rcorr_object[[2]][rownames(rcorr_object[[2]]) %in% vars, colnames(rcorr_object[[2]]) %in% vars]
#'       rcorr_object[[3]]<-rcorr_object[[3]][rownames(rcorr_object[[3]]) %in% vars, colnames(rcorr_object[[3]]) %in% vars]
#'
#'       rcorr_object
#'
#'     }
#'
#'     benchmark<-fit_cor(benchmark,df=df,variables = variables)
#'     df2<-df[,colnames(benchmark[[1]])]
#'
#'     if (is.null(weight)==F) df2<-df[,c(colnames(df2),weight)]
#'     if (is.null(ID)==F) df2<-df[,c(colnames(df2),ID)]
#'     if (is.null(strata)==F) df2<-df[,c(colnames(df2),strata)]
#'
#'
#'     if(remove_nas=="all") df2<-stats::na.omit(df2)
#'     df<-df2
#'     if (is.null(weight)==FALSE & is.null(ID)==FALSE) {
#'       df[,weight]<-NULL
#'       df[,ID]<-NULL
#'       #ID<-df2[,ID]
#'       #weight<-df2[,weight]
#'       if (is.null(strata)==F) {
#'         df[,strata]<-NULL
#'         #strata<-df2[,strata]
#'         }
#'     }
#'
#'   }
#'
#'   if(corrtype=="r") corrtype<-"pearson"
#'
#'   if (is.null(ID)==F ){
#'     if(is.na(ID)==T) if(corrtype=="rho") corrtype<-"spearman"
#'     if(is.na(ID)==F) if(corrtype=="rho") corrtype<-"pearson"
#'   }
#'   if (is.null(ID)==T){
#'     if(corrtype=="rho") corrtype<-"spearman"
#'   }
#'
#'
#'
#'   cor_matrix_df <- Hmisc::rcorr(as.matrix(df), type = corrtype)
#'   if (is.null(weight)==FALSE & is.null(ID)==FALSE) cor_matrix_df<- weighted_correlation2(df2,weight=weight,ids=ID,stratas=strata,
#'                                                                                         variables=colnames(df), remove_nas=remove_nas,
#'                                                                                         nboots = nboots)
#'   # if (is.null(weight)==FALSE & is.null(ID)==FALSE) cor_matrix_df$r<- weighted_cor(df2, weight , ID=ID, strata = strata, return="r")
#'   # if (is.null(weight)==FALSE & is.null(ID)==FALSE) cor_matrix_df[[3]]<- weighted_cor(df2, weight , ID=ID, strata = strata, return="p")
#'   cor_matrix_df$r[cor_matrix_df$r=="NaN"]<-NA
#'   cor_matrix_df[[3]][cor_matrix_df[[3]]=="NaN"]<-NA
#'
#'   if (inherits(benchmark,"data.frame")) {
#'   cor_matrix_bench <- Hmisc::rcorr(as.matrix(benchmark), type = corrtype)
#'   if (is.null(weight_bench)==FALSE & is.null(ID_bench)==FALSE) cor_matrix_bench<- weighted_correlation2(benchmark2,weight=weight_bench,
#'                                                                            ids=ID_bench,stratas=strata_bench,
#'                                                                            variables=colnames(benchmark),remove_nas = remove_nas ,
#'                                                                            nboots = 0, parallel = parallel)
#'   # if (is.null(weight_bench)==FALSE & is.null(ID_bench)==FALSE)cor_matrix_bench$r<-
#'   #   weighted_cor(benchmark2, weight_bench , ID=ID_bench, strata = strata_bench, return = "r")
#'   # if (is.null(weight_bench)==FALSE & is.null(ID_bench)==FALSE)cor_matrix_bench[[3]]<-
#'   #     weighted_cor(benchmark2, weight_bench , ID=ID_bench, strata = strata_bench, return = "p")
#'   cor_matrix_bench$r[cor_matrix_bench$r=="NaN"]<-NA
#'   cor_matrix_bench[[3]][cor_matrix_bench[[3]]=="NaN"]<-NA
#'   }
#'
#'
#'   if (inherits(benchmark,"data.frame")==F) cor_matrix_bench<-benchmark
#'
#'
#'
#'
#'   fischer_cor_df<- psych::fisherz(cor_matrix_df$r)
#'   fischer_cor_bench<- psych::fisherz(cor_matrix_bench$r)
#'
#'   fischer_z_test<-psych::paired.r(cor_matrix_df$r,cor_matrix_bench$r,n=cor_matrix_df$n, n2=cor_matrix_bench$n)
#'   fischer_z_test$p[fischer_z_test$p=="NaN"]<-NA
#'
#'   p_value= ifelse(is.null(p_value),0.05,p_value)
#'
#'   ### implement p_adjustments if needed ###
#'   if (is.null(p_adjust)==F) {
#'     if (p_adjust!=F) fischer_z_test$p<- matrix(stats::p.adjust(p = fischer_z_test$p, method = p_adjust),
#'                                                       ncol = ncol(fischer_z_test$p))}
#'
#'
#'
#'
#'   ### Compute Comparison Matrix
#'   comp_matrix<-fischer_cor_df
#'   comp_matrix[comp_matrix=="Inf"]<-NA
#'   comp_matrix[fischer_z_test$p=="NaN"]<-NA
#'   comp_matrix[fischer_z_test$p>p_value]<-breaks[1]
#'   comp_matrix[fischer_z_test$p<p_value & (cor_matrix_df[[3]]>=p_value & cor_matrix_bench[[3]]>=p_value)]<-breaks[1]
#'   comp_matrix[fischer_z_test$p<p_value & (cor_matrix_df[[3]]<p_value | cor_matrix_bench[[3]]<p_value)]<-breaks[2]
#'   comp_matrix[fischer_z_test$p<p_value & (cor_matrix_df[[3]]<p_value | cor_matrix_bench[[3]]<p_value) &
#'                 (abs(cor_matrix_df$r)>2*abs(cor_matrix_bench$r) | abs(cor_matrix_bench$r)>2*abs(cor_matrix_df$r)) &
#'                 ((cor_matrix_df$r<0 & cor_matrix_bench$r<0) | (cor_matrix_df$r>0 & cor_matrix_bench$r>0))]<-breaks[3]
#'   comp_matrix[fischer_z_test$p<p_value & (cor_matrix_df[[3]]<p_value | cor_matrix_bench[[3]]<p_value) &
#'                 ((cor_matrix_df$r>0 & cor_matrix_bench$r<0) | (cor_matrix_df$r<0 & cor_matrix_bench$r>0))]<-breaks[3] # 1 sig and one positive, while the other negative
#'   comp_matrix<-as.matrix(comp_matrix)
#'   if (isFALSE(full)) comp_matrix[upper.tri(comp_matrix)]<-NA
#'
#'   if (is.null(colors)==T) colors=c('green','yellow','red')
#'
#'   note_text<- paste("Note: ",breaks[1]," ", colors[1],") means that the Pearson's rs are not significant different. \n" ,breaks[2]," (", colors[2], ") means, at least one is significant >0 or <0 and both are
#'   significant different from each other. \n",breaks[3]," (", colors[3], ") means all conditions for Small Diff are true and the
#'   coeficients differ in direction or one is double the value of the other. \nLevel of Significance is p < 0.05.")
#'
#'
#'
#'   ### Calculate percentage of difference ###
#'
#'   if(diff_perc==T) {
#'     percental_difference_b1<-length(comp_matrix[comp_matrix == breaks[1] & is.na(comp_matrix)==F ])/ length(comp_matrix[is.na(comp_matrix)==F])
#'     percental_difference_b2<-length(comp_matrix[comp_matrix == breaks[2] & is.na(comp_matrix)==F ])/ length(comp_matrix[is.na(comp_matrix)==F])
#'     if (length(breaks)>2) percental_difference_b3<-length(comp_matrix[comp_matrix == breaks[3] & is.na(comp_matrix)==F ])/ length(comp_matrix[is.na(comp_matrix)==F])
#'     if (length(breaks)>3) percental_difference_b4<-length(comp_matrix[comp_matrix == breaks[4] & is.na(comp_matrix)==F ])/ length(comp_matrix[is.na(comp_matrix)==F])
#'
#'     diff_summary<-paste(breaks[1]," :",(round((percental_difference_b1), digits = 3)*100),"% \n",
#'                         breaks[2]," :",(round(percental_difference_b2, digits = 3)*100),"%")
#'     if (length(breaks)>2) diff_summary<-paste (diff_summary, "\n",breaks[3], " :", (round(percental_difference_b3, digits = 3)*100),"%")
#'     if (length(breaks)>3) diff_summary<-paste (diff_summary, "\n",breaks[4], " :", (round(percental_difference_b4, digits = 3)*100),"%")
#'   }
#'
#'   ###########################
#'   # prepare data for ggplot
#'   ###########################
#'
#'   comp_matrix_df<-reshape2::melt(comp_matrix)
#'   colnames(comp_matrix_df) <- c("x", "y", "value")
#'
#'   if (grid!="white"){ # create a matrix for NA, to exclude from grid
#'     na_matrix<-comp_matrix_df[is.na(comp_matrix_df$value),]
#'
#'     comp_matrix_2<-comp_matrix
#'     comp_matrix_2[is.na(comp_matrix_2)]<- 99
#'
#'     for (i in 1:(nrow(comp_matrix_2)-1)){
#'       comp_matrix_2[i+1,i]<-NA
#'     }
#'
#'     comp_matrix_df2<-reshape2::melt(comp_matrix_2)
#'     colnames(comp_matrix_df2) <- c("x", "y", "value")
#'     edge_matrix<- comp_matrix_df2[is.na(comp_matrix_df2$value),]
#'   }
#'
#'   #### add difference to data frame
#'
#'   difference_r<-(cor_matrix_df$r-cor_matrix_bench$r)
#'   difference_r<-reshape2::melt(difference_r)
#'   comp_matrix_df$difference_r<-difference_r$value
#'
#'   ### change gradient ###
#'
#'   alpha_matrix<-(abs(cor_matrix_df$r-cor_matrix_bench$r)/2/5)+0.8
#'   alpha_matrix<-reshape2::melt(alpha_matrix)
#'   colnames(alpha_matrix) <- c("x", "y", "gradient")
#'   comp_matrix_df$gradient<-alpha_matrix$gradient
#'   #}
#'
#'   if (gradient==F) alpha_matrix$value<-1
#'
#'   #if (matrix==TRUE) return(comp_matrix_df)
#'
#'   ##############################
#'   ###     Label variables    ###
#'   ##############################
#'
#'   if (is.null(varlabels)) varlabels<- unique(comp_matrix_df$x)
#'
#'
#'   ### build biger out matrix ###
#'
#'
#'   cor_matrix_df[[1]][upper.tri(cor_matrix_df[[1]], diag= T)]<-NA
#'   cor_matrix_df[[2]][upper.tri(cor_matrix_df[[2]], diag= T)]<-NA
#'   cor_matrix_df[[3]][upper.tri(cor_matrix_df[[3]], diag = T)]<-NA
#'   cor_matrix_bench[[1]][upper.tri(cor_matrix_bench[[1]], diag= T)]<-NA
#'   cor_matrix_bench[[2]][upper.tri(cor_matrix_bench[[2]], diag= T)]<-NA
#'   cor_matrix_bench[[3]][upper.tri(cor_matrix_bench[[3]], diag = T)]<-NA
#'   diff_table<-cor_matrix_df[[1]]-cor_matrix_bench[[1]]
#'   fischer_z_test$p[upper.tri(fischer_z_test$p,diag=T)]<-NA
#'
#'   comp_matrix_list<-list(comp_matrix_df,list(cor_matrix_df[[1]],cor_matrix_df[[2]],cor_matrix_df[[3]],
#'                                              cor_matrix_bench[[1]],cor_matrix_bench[[2]],cor_matrix_bench[[3]],
#'                                              diff_table,fischer_z_test$p))
#'
#'   names(comp_matrix_list)<-c("comparison_dataframe", "correlation_data")
#'   names(comp_matrix_list[[2]])<-c("pearsons_matrix_df","n_matrix_df","p_matrix_df",
#'                                   "pearsons_r_bench","n_matrix_bench","p_matrix_bench",
#'                                   "r_diff_matrix","r_diff_p_matrix")
#'
#'   #  comp_matrix_list<- cor_matrix_df[[1]]
#'   if (data == TRUE) return(comp_matrix_list)
#'
#'   #############################
#'   # Plot Matrix with ggplot2
#'   #############################
#'
#'
#'   comparison_plot<-
#'     ggplot2::ggplot(comp_matrix_df, ggplot2::aes(x = comp_matrix_df$y, y = comp_matrix_df$x, fill = factor(comp_matrix_df$value, levels = breaks))) +
#'     {if (gradient==T) ggplot2::aes(alpha= alpha_matrix$gradient)}+
#'     {if (grid != "none") ggplot2::geom_tile(colour= grid, lwd =1,linetype=1)}+
#'     {if (grid == "none") ggplot2::geom_tile()}+
#'     {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = na_matrix, colour = "white", lwd=1,linetype=1)}+
#'     {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = edge_matrix, colour = grid, lwd=1,linetype=1)}+
#'     ggplot2::coord_fixed()+
#'     ggplot2::scale_fill_manual(values= colors, name="", na.translate = FALSE)+
#'     ggplot2::scale_y_discrete(name="", limits = rev(levels(comp_matrix_df$x)), labels= varlabels, breaks=unique(comp_matrix_df$x))+
#'     ggplot2::scale_x_discrete(name="", limits = levels(comp_matrix_df$y), labels= varlabels, breaks=unique(comp_matrix_df$y))+
#'     #{if(gradient==T) ggplot2::scale_alpha_continuous(values = alpha_matrix$values, na.translate=FALSE)}+
#'     ggplot2::theme_classic()+
#'     ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust=0.9),
#'                    axis.text.y = ggplot2::element_text(vjust = 1, hjust=0.9),
#'                    axis.title.x= ggplot2::element_blank(),
#'                    axis.title.y= ggplot2::element_blank(),
#'                    plot.margin = grid::unit(mar, "cm"),
#'                    plot.caption=ggplot2::element_text(hjust = 0))+
#'     ggplot2::ggtitle(plot_title)+
#'     ggplot2::guides(alpha="none")
#'
#'   if(note==T) comparison_plot<-comparison_plot + ggplot2::labs(caption = note_text)
#'
#'
#'   if (diff_perc==T) {
#'     comparison_plot <- comparison_plot + ggplot2::geom_label(ggplot2::aes( x = Inf, y = Inf, hjust = 1, vjust = 1), data = diff_summary,
#'                                                              label=diff_summary$label,
#'                                                              fill = ggplot2::alpha("white", perc_diff_transparance),
#'                                                              color = ggplot2::alpha("black", 1), size= diff_perc_size)}
#'
#'
#'
#'
#'
#'
#'
#'
#'
#'
#'   #if (gradient==T) return(alpha_matrix)
#'
#'   if (data == FALSE) return(comparison_plot)
#'
#' }
#'
#'
#'
#'
#'
#'
#'
#' #' Compare Multiple Data Frames on a Bivariate Level
#' #'
#' #' Compare multiple data frames on a bivariate level and plot them together.
#' #'
#' #' @param dfs A character vector containing the names of data frames to compare against the
#' #' benchmarks.
#' #' @param benchmarks A character vector containing the names of benchmarks to compare the
#' #' data frames against, or the names of a list. If it is a list, it has to be of the form,
#' #' as the output of \link[Hmisc]{rcorr}, with a pearson's r matrix in the first position, a
#' #' n matrix (matrix of n for every correlation) in the second position and a P matrix in the
#' #' third position. The vector must either to be the same length as \code{dfs}, or length 1.
#' #' If it has length 1 every survey will be compared against the same benchmark.
#' #' @param variables A character vector containing the names of the variables for the comparison.
#' #' If NULL, all variables named similar in both the dfs and the benchmarks will be compared.
#' #' Variables missing in one of the data frames or the benchmarks will be neglected for this comparison.
#' #' @param plot_title A character string containing the title of the plot.
#' #' @param plots_label A character string or vector of character strings containing the new
#' #' names of the data frames , also used in plot.
#' #' @param data If TRUE, a biv_compare object is returned, containing results of the
#' #' comparison.
#' #' @param id_bench,id A character vector determining id variables used to weight
#' #' the \code{dfs} or \code{benchmarks} with the help of the \code{survey} package. They have
#' #' to be part of the respective data frame. If less characters strings are provided,
#' #' than in \code{dfs}, the first input is used to weight every df or benchmark, where no input
#' #' is provided.
#' #' @param weight_bench,weight A character vector determining variables to weight
#' #' the \code{dfs} of \code{benchmarks}. They have to be part of the respective data frame.
#' #' If less characters strings are provided, than in \code{dfs}, the first input is used
#' #' to weight every df or benchmark, where no input is provided. If a weight variable is
#' #' provided also an id variable is needed. For weighting, the \code{survey} package is used.
#' #' @param strata,strata_bench A character vector determining strata variables used to
#' #' weight the \code{dfs} or \code{benchmarks} with the help of the \code{survey} package.
#' #' They have to be part of the respective data frame. If less characters strings are provided,
#' #' than in \code{dfs}, the first input is used to weight every df or benchmark, where no input
#' #' is provided.
#' #' @param p_value A number between zero and one to determine the maximum significance niveau.
#' #' @param varlabels A character string or vector of character strings containing the new
#' #' names of variables, also used in plot.
#' #' @param mar A vector that determines the margins of the plot.
#' #' @param note If note = True, a note will be displayed to describe the Plot.
#' #' @param p_adjust Can be either TRUE or a character string indicating a adjustment method.
#' #' If p_adjust=T the p_values will be adjusted with the Bonferroni adjustment method, by default,
#' #' to account for the problem of multiple comparisons. All adjustment methods available
#' #' in \code{\link{p.adjust}} can be used here, with the same character strings.
#' #' @param grid Grid determines the color of the lines between the tiles of the heatmap.
#' #' @param diff_perc If true a percental difference between surveys and benchmarks is
#' #' displayed in the plot.
#' #' @param diff_perc_size A number to determine the size of the displayed percental
#' #' difference between surveys in the plot.
#' #' @param perc_diff_transparance A number to determine the transparency of the displayed
#' #' percental difference between surveys in the plot.
#' #' @param gradient If gradient = TRUE, colors in the heatmap will be more or less
#' #' transparent, depending on the difference in Pearson's r of the data frames of comparison.
#' #' @param sum_weights A vector containing information for every variable to weight them in
#' #' the displayed percental difference calculation. It can be used if some variables are over-
#' #' or underrepresented in the analysis.
#' #' @param legend_show_x If true the X will be shown in the legend. At the moment, das does
#' #' not yet work correctly.
#' #' @param order A character vector to determine in which order the variables should be
#' #' displayed in the plot.
#' #' @param breaks A vector to label the color scheme in the in the legend.
#' #' @param colors A vector to determine the colors in the plot.
#' #' @param corrtype A character string, indicating the type of the bivariate correlation.
#' #' It can either be "r" for Pearson's r or "rho" for Spearman's "rho". At the moment,
#' #' rho is only applicable to unweighted data.
#' #' @param missings_x If TRUE, missing pairs in the plot will be marked with an X.
#' #' @param remove_nas A character string, that indicates how missing should be removed, can either be
#' #' "all", two remove all cases that contain NA in any of the variables, or "pairwise", to remove nas
#' #' seperately for every variable pair when calculating Pearson's r
#' #' @param ncol_facet Number of colomns used in faced_wrap() for the plots.
#' #' @param nboots A numeric value indicating the number of bootstrap replications.
#' #' If nboots = 0 no bootstrapping will be performed. Else nboots must be >2. Note,
#' #' that bootstraping can be very computational heavy and can therefore take a while.
#' #' @param parallel If True, all detected cors will be used to in bootstrapping.
#' #'
#' #' @return A object generated with the help of [ggplot2::ggplot2()], used to visualize
#' #' the differences between the data frames and benchmarks. If data = T instead of the plot
#' #' a list will be returned containing information of the analyses. This biv_compare object
#' #' can be used in plot_biv_compare to build a plot later on.
#' #'
#' #' @details
#' #' The plot shows a heatmap of a correlation matrix, where the colors are determined by
#' #' the similarity of the Pearson's r value in both sets of respondents. Leaving default
#' #' breaks and colors,
#' #' * \code{Same} (green) indicates, that the Pearson's r correlation is not significant > 0 in
#' #' the related data frame or benchmark or the Pearson's r correlations are not significant
#' #' different, between data frame and benchmark.
#' #' * \code{Small Diff} (yellow) indicates that the Pearson's r
#' #' correlation is significant > 0 in the related data frame or benchmark and the Pearson's r
#' #' correlations are significant different, between data frame and benchmark.
#' #' * \code{Large Diff} (red) indicates, that the same coditions of yellow are fulfilled, and
#' #' the correlations are either in opposite directions,or one is double the size of the other.
#' #'
#' #' @examples
#' #'
#' #' ## Get Data for comparison
#' #' card<-wooldridge::card
#' #'
#' #' south <- card[card$south==1,]
#' #' north <- card[card$south==0,]
#' #' black <- card[card$black==1,]
#' #' white <- card[card$black==0,]
#' #'
#' #' ## use the function to plot the data
#' #' bivar_comp<-sampcompR::biv_compare(dfs = c("north","white"),
#' #'                                    benchmarks = c("south","black"),
#' #'                                    variables= c("age","educ","fatheduc","motheduc","wage","IQ"),
#' #'                                    data=FALSE)
#' #' bivar_comp
#' #'
#' # #' @importFrom jtools svycor
#' #' @export
#'
#' biv_compare2<-function (dfs, benchmarks, variables=NULL, corrtype="r", data = FALSE,
#'                        id=NULL,weight=NULL, strata=NULL,  id_bench=NULL,
#'                        weight_bench=NULL, strata_bench=NULL, p_value=NULL,
#'                        p_adjust=NULL, varlabels=NULL, plot_title=NULL, plots_label=NULL,
#'                        diff_perc=T, diff_perc_size=4.5, perc_diff_transparance=0,
#'                        note=F, order=NULL,  breaks=NULL, colors=NULL,
#'                        mar = c(0,0,0,0), grid="white", gradient=F,sum_weights= NULL ,missings_x=T,
#'                        legend_show_x=F,remove_nas="pairwise", ncol_facet=3, nboots=0,
#'                        parallel = F){
#'
#'
#'
#'
#'
#'   if(is.null(colors)==T) colors=c('green','yellow','red')
#'   if (is.null(breaks)) breaks<-c("Same","Small Diff", "Large Diff")
#'
#'   plot_list<-NULL
#'   summary_df<-data.frame("samp"=NA,"label"=NA)
#'
#'   if (is.null(plots_label)) plots_label<-dfs[1:length(dfs)]
#'   if (is.null(plots_label)==F) {
#'     if (length(plots_label)<length(dfs)) plots_label[(length(plots_label)+1):length(dfs)]<-dfs[(length(plots_label)+1):length(dfs)]
#'     if (length(plots_label)>length(dfs)) plots_label<-plots_label[[1:length(dfs)]]
#'   }
#'
#'   ### prepare some inputs ###
#'   if (length(benchmarks)>=1 & length(benchmarks) < length(dfs)) benchmarks<-c(benchmarks,rep(benchmarks[1],length(dfs)-length(benchmarks)))
#'
#'   if (length(id)>=1 & length(id) < length(dfs)) id<- c(id,rep(id[1],(length(dfs)-length(id))))
#'   if (length(weight)>=1 & length(weight) < length(dfs)) weight<- c(weight,rep(weight[1],(length(dfs)-length(weight))))
#'   if (length(strata)>=1 & length(strata) < length(dfs)) strata<- c(strata,rep(strata[1],(length(dfs)-length(strata))))
#'   if (length(id_bench)>=1 & length(id_bench) < length(dfs)) id_bench<- c(id_bench,rep(id_bench[1],(length(dfs)-length(id_bench))))
#'   if (length(weight_bench)>=1 & length(weight_bench) < length(dfs)) weight_bench<- c(weight_bench,rep(weight_bench[1],(length(dfs)-length(weight_bench))))
#'   if (length(strata_bench)>=1 & length(strata_bench) < length(dfs)) strata_bench<- c(strata_bench,rep(strata_bench[1],(length(dfs)-length(strata_bench))))
#'
#'   ### if p_adjust==T get bonferroni as default ###
#'   if(is.null(p_adjust)==F) if(isTRUE(p_adjust)) if(is.character(p_adjust)==F) p_adjust<-"bonferroni"
#'
#'
#'   ### Get Dataframes ###
#'   for (i in 1:length(dfs)){
#'
#'     curr_df<-get(dfs[i])
#'     curr_bench<-get(benchmarks[i])
#'
#'     if (is.null(weight)==F) {
#'     if (is.na(weight[i])==F) {
#'       curr_id <- id[i]
#'       curr_weight<- weight[i]}
#'       if (is.null(strata)==F) {if (is.na(strata[i])==F) curr_strata<- strata[i]}}
#'
#'     if (is.null(weight_bench)==F){
#'       if (is.na(weight_bench[i])==F) {
#'         curr_id_bench <- id_bench[i]
#'         curr_weight_bench<- weight_bench[i]}
#'       if (is.null(strata_bench)==F) {if (is.na(strata_bench[i])==F) curr_strata_bench<- strata_bench[i]}}
#'
#'
#'
#'
#'     if (is.null(weight)==F) {
#'       if (is.na(weight[i])) {
#'         curr_id <- NULL
#'         curr_weight<- NULL
#'         if (is.null(strata)==F) {if(is.na(strata[i])) curr_strata<-NULL}}}
#'
#'     if (is.null(weight)) {
#'       curr_id <- NULL
#'       curr_weight<- NULL}
#'     if (is.null(strata)) curr_strata<-NULL
#'
#'     if (is.null(weight_bench)==F){
#'       if (is.na(weight_bench[i])) {
#'         curr_id <- NULL
#'         curr_weight<- NULL
#'         if (is.null(strata_bench)==F) {if(is.na(strata_bench[i])) curr_strata_bench<-NULL}}}
#'
#'     if (is.null(weight_bench)){
#'         curr_id_bench <- NULL
#'         curr_weight_bench<- NULL}
#'     if (is.null(strata_bench)) curr_strata_bench<-NULL
#'
#'
#'     help<-biv_comp_subfunction2(df=curr_df,benchmark = curr_bench, corrtype=corrtype, variables = variables ,
#'                                plot_title=plot_title, ID_bench=curr_id_bench,
#'                                weight_bench= curr_weight_bench, ID=curr_id,weight=curr_weight,
#'                                strata = curr_strata, strata_bench = curr_strata_bench,
#'                                p_value=p_value, varlabels=varlabels, note=note,p_adjust=p_adjust, gradient=T,
#'                                data = T, breaks=breaks,remove_nas=remove_nas, nboots = nboots,
#'                                parallel = parallel)
#'
#'
#'     help[[1]]$samp<-i
#'     if (is.null(plots_label)) help[[1]]$samp_name<-dfs[i]
#'     if (is.null(plots_label)==F) help[[1]]$samp_name<-plots_label[i]
#'
#'
#'     if (is.null(plot_list)==F) {
#'       plot_list[[1]]<-rbind(plot_list[[1]],help[[1]])
#'       plot_list[1+i]<-help[2]
#'     }
#'
#'     if(is.null(plot_list)) plot_list <- help
#'
#'
#'   }
#'
#'
#'   names(plot_list)<-c(names(plot_list[1]),paste("correlation_data_",plots_label[1:length(dfs)],sep=""))
#'
#'
#'
#'
#'
#'   #######################################
#'   ### reorder plots to original order ###
#'   #######################################
#'
#'  # if (is.null(plots_label)) plot_list[[1]]$samp_name <- factor(plot_list[[1]]$samp_name, levels = dfs)
#'  # if (is.null(plots_label)==F) plot_list[[1]]$samp_name <- factor(plot_list[[1]]$samp_name, levels = plots_label)
#'    if (is.null(plots_label)) plots_label <- dfs
#'    if(length(plots_label)< length(dfs)) plots_label[(length(plots_label)+1):length(dfs)]<-
#'     dfs[(length(plots_label)+1):length(dfs)]
#'
#'
#'
#'   labellist_biv<-function(lables,values){
#'
#'     # output<-list()
#'     # for (i in 1:length(lables)){
#'     #   output[i]<-lables[i]
#'     # }
#'     output<-lables
#'     names(output)<-as.character(values)
#'     output
#'   }
#'
#'
#'   labellist<-labellist_biv(plots_label,c(1:length(plots_label)))
#'
#'
#'   ##########################
#'   ### add X for missings ###
#'   ##########################
#'
#'   plot_list[[1]]<-empty_finder(plot_list[[1]],plots_label)
#'
#'
#'   ################################
#'   ### change color of the grid ###
#'   ################################
#'
#'   if (grid!="white"){ # create a matrix for NA, to exclude from grid
#'
#'     ### buid a df where no grid shall be set ###
#'     na_df<-plot_list[[1]][is.na(plot_list[[1]]$value),]
#'
#'     ### build a df, where the diagonal is.
#'     plot_df2<-plot_list[[1]]
#'     names_var<-as.character(unique(plot_df2$x))
#'     names_var<-c(names_var,names_var[1])
#'     plot_df2$value[is.na(plot_df2$value)]<- "not_edge"
#'
#'
#'
#'     for (i in 1:length(names_var)){
#'       #if (is.null(k)==F) k=k+1
#'       #if (is.null(k)) k<-1
#'       plot_df2$value[plot_df2$x==names_var[i+1] & plot_df2$y==names_var[i]]<-NA
#'     }
#'
#'     edge_df<- plot_df2[is.na(plot_df2$value),]
#'   }
#'
#'
#'   ##########################################
#'   ### Calculate percentage of difference ###
#'   ##########################################
#'
#'   if(diff_perc==T) {
#'     summary_df<-difference_summary(plot_list[[1]],breaks=breaks, sum_weights=sum_weights)
#'   }
#'
#'
#'   #####################################
#'   ### edit shape, breaks and colors ###
#'   #####################################
#'
#'   breaks2<-c(breaks,"X")
#'   colors2<-c(colors, "white")
#'
#'   plot_list[[1]]$shape<-NA
#'   plot_list[[1]]$shape[plot_list[[1]]$value=="X"]<-"X"
#'
#'
#'   ##############################
#'   ###    order variables     ###
#'   ##############################
#'   if (is.null(order)==F) plot_list[[1]]$x<-factor(plot_list[[1]]$x, levels =order)
#'   if (is.null(order)==F) plot_list[[1]]$y<-factor(plot_list[[1]]$y, levels = order)
#'
#'   ##############################
#'   ###     Label variables    ###
#'   ##############################
#'
#'   variables_in<-unique(plot_list[[1]]$x)
#'   if (is.null(varlabels)) varlabels<- unique(plot_list[[1]]$x)
#'   if (length(varlabels)<length(variables_in)) varlabels<-c(varlabels,variables_in[(length(varlabels)+1):length(variables_in)])
#'
#'
#'
#'
#'
#'   ####################
#'   ### Build a Note ###
#'   ####################
#'
#'   note_text<- paste("Note: ",breaks[1]," (", colors[1],") means that the Pearson's rs are not significant different. \n" ,breaks[2]," (", colors[2], ") means, at least one is significant >0 or <0 and both are
#'   significant different from each other. \n",breaks[3]," (", colors[3], ") means all conditions for Small Diff are true and the
#'   coeficients differ in direction or one is double the value of the other. \nLevel of Significance is p < 0.05.")
#'
#'   ####################################
#'   ### Add information to plot_list ###
#'   ####################################
#'
#'   plot_list$dfs<-dfs
#'   plot_list$benchmarks<-benchmarks
#'   plot_list$variables <-variables
#'   plot_list$colors<-colors2
#'   plot_list$breaks <-breaks
#'   plot_list$shape<-plot_list[[1]]$shape
#'   plot_list$plots_label<-as.character(unique(plot_list[[1]]$samp_name))
#'
#'   if (is.null(plot_title)==F) plot_list$plot_title<-plot_title
#'   if (is.null(plot_title)) plot_list$plot_title<-NA
#'
#'   if (is.null(varlabels)==F) plot_list$varlabels<-varlabels
#'   if (is.null(varlabels)) plot_list$varlabels<-NA
#'
#'   if (is.null(p_value)==F) plot_list$p_value <-p_value
#'   if (is.null(p_value)) plot_list$p_value <-NA
#'
#'   if (is.null(id)==F) plot_list$id  <-id
#'   if (is.null(id)) plot_list$id  <-NA
#'
#'   if (is.null(weight)==F) plot_list$weight  <-weight
#'   if (is.null(weight)) plot_list$weight  <-NA
#'
#'   if (is.null(strata)==F) plot_list$strata   <-strata
#'   if (is.null(strata)) plot_list$strata   <-NA
#'
#'   if (is.null(id_bench)==F) plot_list$id_bench  <-id_bench
#'   if (is.null(id_bench)) plot_list$id_bench  <-NA
#'
#'   if (is.null(weight_bench)==F) plot_list$weight_bench  <-weight_bench
#'   if (is.null(weight_bench)) plot_list$weight_bench  <-NA
#'
#'   if (is.null(strata_bench)==F) plot_list$strata_bench <-strata_bench
#'   if (is.null(strata_bench)) plot_list$strata_bench <-NA
#'
#'   if (is.null(p_adjust )==F) plot_list$p_adjust  <-p_adjust
#'   if (is.null(p_adjust )) plot_list$p_adjust  <-NA
#'
#'   plot_list$note_text  <-note_text
#'
#'
#'   for (i in 1:length(plot_list)){
#'     plot_list[i][is.na(plot_list[i])]<-list(NULL)}
#'
#'   ###########################
#'   ### Return if data=TRUE ###
#'   ###########################
#'
#'   if (data == T) return(plot_list)
#'
#'   ######################
#'   ###     Plots      ###
#'   ######################
#'
#'
#'   comparison_plot<-
#'     ggplot2::ggplot(plot_list[[1]], ggplot2::aes(x = plot_list[[1]]$y, y = plot_list[[1]]$x)) +
#'     {if (gradient==T) ggplot2::aes(alpha= gradient)}+
#'     {if (grid != "none") ggplot2::geom_tile(colour= grid, lwd =1,linetype=1,
#'                                             ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     {if (grid == "none") ggplot2::geom_tile(ggplot2::aes( fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = na_df, colour = "white", lwd=1,linetype=1,
#'                                                               ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = edge_df, colour = grid, lwd=1,linetype=1,
#'                                                               ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     #ggplot2::geom_point(data= subset(plot_list[[1]], value=="X"), ggplot2::aes(x = y, y = x), show.legend = T)+
#'     {if (missings_x==T)
#'       ggplot2::geom_point(show.legend = legend_show_x, na.rm = T, ggplot2::aes(shape= factor(plot_list[[1]]$shape, levels="X", labels=c("Missing"))))}+
#'     ggplot2::coord_fixed()+
#'     ggplot2::scale_fill_manual(values= colors, name="", na.translate = FALSE)+
#'     ggplot2::scale_y_discrete(name="", limits = rev(levels(plot_list[[1]]$x)), labels= varlabels, breaks=unique(plot_list[[1]]$x))+
#'     ggplot2::scale_x_discrete(name="", limits = levels(plot_list[[1]]$y), labels= varlabels, breaks=unique(plot_list[[1]]$y))+
#'     ggplot2::scale_shape_manual(name="", values = c("Missing"=4))+
#'     ggplot2::theme_classic()+
#'     ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.33, hjust=0),
#'                    axis.text.y = ggplot2::element_text(vjust = 0.33, hjust=0),
#'                    axis.title.x= ggplot2::element_blank(),
#'                    axis.title.y= ggplot2::element_blank(),
#'                    plot.margin = grid::unit(mar, "cm"),
#'                    plot.caption=ggplot2::element_text(hjust = 0))+
#'     ggplot2::ggtitle(plot_title)+
#'     ggplot2::guides(alpha="none",
#'                     fill  = ggplot2::guide_legend(order = 1),
#'                     shape = ggplot2::guide_legend(order = 2))+
#'     ggplot2::facet_wrap(~ samp, labeller = ggplot2::labeller(samp = labellist),ncol = ncol_facet)
#'
#'   if(note==T) comparison_plot<-comparison_plot + ggplot2::labs(caption = plot_list[[1]]$note_text)
#'
#'
#'   if (diff_perc==T) {
#'     comparison_plot <- comparison_plot + ggplot2::geom_label(x=Inf, y=Inf,
#'                                                              ggplot2::aes( hjust = 1, vjust = 1), data=summary_df,
#'                                                              label = summary_df$label,
#'                                                              fill = ggplot2::alpha("white", perc_diff_transparance),
#'                                                              color = ggplot2::alpha("black", 1), size= diff_perc_size, show.legend = F)}
#'
#'
#'
#'
#'   if (data == F) return (comparison_plot)
#'
#' }
#'
#'
#'
#'
#'
#'
#' #' Plot Comparison of Multiple Data Frames on a Bivariate Level
#' #'
#' #' Plot a object generated by \link{biv_compare} function.
#' #' @param biv_data_object A object generated by \link{biv_compare} function.
#' #' @param plot_title A character string containing the title of the plot.
#' #' @param plots_label A character string or vector of character strings containing the
#' #' new names of the data frames , also used in plot.
#' #' @param p_value A number between 0 and one to determine the maximum significance niveau.
#' #' @param varlabels A character string or vector of character strings containing the new
#' #' names of variables, also used in plot.
#' #' @param mar A vector that determines the margins of the plot.
#' #' @param note If note = True, a note will be displayed to describe the Plot.
#' # #' @param p_adjust Can be either TRUE or a character string indicating a adjustment method.
#' # #' If p_adjust=T the p_values will be adjusted with the Bonferroni adjustment method, by default,
#' # #' to account for the problem of multiple comparisons. All adjustment methods available
#' # #' in \code{\link{p.adjust}} can be used here, with the same character strings.
#' #' @param grid Grid determines the color of the lines between the tiles of the heatmap.
#' #' @param diff_perc If TRUE a percental measure of difference between dfs and benchmarks is
#' #' displayed in the plot.
#' #' @param diff_perc_size A number to determine the size of the displayed percental
#' #' difference between surveys in the plot.
#' #' @param perc_diff_transparance A number to determine the transparancy of the displayed
#' #' percental difference between surveys in the plot.
#' #' @param gradient If gradient = TRUE, colors in the heatmap will be more or less transparent,
#' #' depending on the difference in Pearson's r of the data frames of comparison.
#' #' @param sum_weights A vector containing information for every variable to weight them in
#' #' the displayed percental difference calculation. It can be used if some variables are
#' #' over- or underrepresented in the analysis.
#' #' @param legend_show_x If true the X will be shown in the legend. At the moment, das
#' #' does not yet work correctly.
#' #' @param order A character vector to determine in which order the variables should be
#' #' displayed in the plot.
#' #' @param breaks A vector to label the color sheme in the in the legend.
#' #' @param colors A vector to determine the colors in the plot.
#' #' @param missings_x If TRUE, missing pairs in the plot will be marked with an X.
#' #' @param ncol_facet Number of colomns used in faced_wrap() for the plots.
#' #'
#' #' @return A object generated with the help of [ggplot2::ggplot2()], used to visualize
#' #' the differences between the data frames and benchmarks.
#' #' @details The plot shows a heatmap of a correlation matrix, where the colors are determined by
#' #' the similarity of the Pearson's r value in both sets of respondents. Leaving
#' #' default breaks and colors,
#' #' * \code{Same} (green) indicates, that the Pearson's r correlation is not significant > 0 in
#' #' the related data frame or benchmark or the Pearson's r correlations are not significant
#' #' different, between data frame and benchmark.
#' #' * \code{Small Diff} (yellow) indicates that the Pearson's r
#' #' correlation is significant > 0 in the related data frame or benchmark and the Pearson's r
#' #' correlations are significant different, between data frame and benchmark.
#' #' * \code{Large Diff} (red) indicates, that the same coditions of yellow are fulfilled, and
#' #' the correlations are either in opposite directions,or one is double the size of the other.
#' #'
#' #' @examples
#' #'
#' #' ## Get Data for comparison
#' #' card<-wooldridge::card
#' #'
#' #' south <- card[card$south==1,]
#' #' north <- card[card$south==0,]
#' #' black <- card[card$black==1,]
#' #' white <- card[card$black==0,]
#' #'
#' #' ## use the function to plot the data
#' #' bivar_data<-sampcompR::biv_compare(dfs = c("north","white"),
#' #'                                    benchmarks = c("south","black"),
#' #'                                    variables= c("age","educ","fatheduc","motheduc","wage","IQ"),
#' #'                                    data=TRUE)
#' #'
#' #' sampcompR::plot_biv_compare(bivar_data)
#' #'
#' #' @export
#'
#'
#' plot_biv_compare<-function (biv_data_object, plot_title=NULL, plots_label=NULL,
#'                             p_value=NULL, varlabels=NULL,
#'                             mar = c(0,0,0,0),note=F, grid="white",diff_perc=T,
#'                             diff_perc_size=4.5,perc_diff_transparance=0,gradient=F,sum_weights= NULL,
#'                             missings_x = T, legend_show_x=F, order=NULL, breaks=NULL,colors=NULL,
#'                             ncol_facet = 3){
#'
#'
#'
#'   plot_list<- biv_data_object
#'   if(is.null(colors)==T) colors=plot_list$colors
#'   if (is.null(breaks)) breaks<-plot_list$breaks
#'
#'
#'   ##########################################
#'   ### Calculate percentage of difference ###
#'   ##########################################
#'
#'   if(diff_perc==T) {
#'
#'     summary_df<-difference_summary(plot_list[[1]],breaks=breaks, sum_weights=sum_weights)
#'   }
#'
#'   ################################
#'   ### change color of the grid ###
#'   ################################
#'
#'   if (grid!="white"){ # create a matrix for NA, to exclude from grid
#'
#'     ### buid a df where no grid shall be set ###
#'     na_df<-plot_list[[1]][is.na(plot_list[[1]]$value),]
#'
#'     ### build a df, where the diagonal is.
#'     plot_df2<-plot_list[[1]]
#'     names_var<-as.character(unique(plot_df2$x))
#'     names_var<-c(names_var,names_var[1])
#'     plot_df2$value[is.na(plot_df2$value)]<- "not_edge"
#'
#'
#'
#'     for (i in 1:length(names_var)){
#'       #if (is.null(k)==F) k=k+1
#'       #if (is.null(k)) k<-1
#'       plot_df2$value[plot_df2$x==names_var[i+1] & plot_df2$y==names_var[i]]<-NA
#'     }
#'
#'     edge_df<- plot_df2[is.na(plot_df2$value),]
#'   }
#'
#'
#'
#'   #######################################
#'   ### reorder plots to original order ###
#'   #######################################
#'
#'   #if (is.null(plots_label)) plot_list[[1]]$samp_name <- factor(plot_list[[1]]$samp_name, levels = unique(plot_list[[1]]$samp_name))
#'   #if (is.null(plots_label)==F) plot_list[[1]]$samp_name <- factor(plot_list[[1]]$samp_name, levels = plots_label)
#'   if (is.null(plots_label)) plots_label <- plot_list$plots_label
#'   if(length(plots_label)< length(plot_list$plots_label)) plots_label[(length(plots_label)+1):length(plot_list$plots_label)]<-
#'       plot_list$plots_label[(length(plots_label)+1):length(plot_list$plots_label)]
#'
#'
#'   breaks2<-c(breaks,"X")
#'   colors2<-c(colors, "white")
#'
#'   plot_list[[1]]$shape<-NA
#'   plot_list[[1]]$shape[plot_list[[1]]$value=="X"]<-"X"
#'
#'
#'   labellist_biv<-function(lables,values){
#'
#'     # output<-list()
#'     # for (i in 1:length(lables)){
#'     #   output[i]<-lables[i]
#'     # }
#'     output<-lables
#'     names(output)<-as.character(values)
#'     output
#'   }
#'
#'
#'   labellist<-labellist_biv(plots_label,c(1:length(plots_label)))
#'
#'
#'   ##############################
#'   ###    order variables     ###
#'   ##############################
#'   if (is.null(order)==F) plot_list[[1]]$x<-factor(plot_list[[1]]$x, levels =order)
#'   if (is.null(order)==F) plot_list[[1]]$y<-factor(plot_list[[1]]$y, levels = order)
#'
#'   ##############################
#'   ###     Label variables    ###
#'   ##############################
#'
#'   variables_in<-unique(plot_list[[1]]$x)
#'   if (is.null(varlabels)) varlabels<- unique(plot_list[[1]]$x)
#'   if (length(varlabels)<length(variables_in)) varlables<-c(varlabels,variables_in[(length(varlabels)+1):length(variables_in)])
#'
#'   # ########################
#'   # ### edit plots_label ###
#'   # ########################
#'   #
#'   # if (is.null(plots_label)) plots_label<-"dfs"
#'   # if (length(plots_label)<length(unique(plot_list[[1]]$samp_name))) plots_label<-c(plots_label,unique(plot_list[[1]]$samp_name)[(length(plots_label)+1:length(unique(plot_list[[1]]$samp_name)))])
#'   #
#'
#'   ######################
#'   ###     Plots      ###
#'   ######################
#'
#'   comparison_plot<-
#'     ggplot2::ggplot(plot_list[[1]], ggplot2::aes(x = plot_list[[1]]$y, y = plot_list[[1]]$x)) +
#'     {if (gradient==T) ggplot2::aes(alpha= gradient)}+
#'     {if (grid != "none") ggplot2::geom_tile(colour= grid, lwd =1,linetype=1,
#'                                             ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     {if (grid == "none") ggplot2::geom_tile(ggplot2::aes( fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = na_df, colour = "white", lwd=1,linetype=1,
#'                                                               ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     {if (grid != "white" & grid != "none") ggplot2::geom_tile(data = edge_df, colour = grid, lwd=1,linetype=1,
#'                                                               ggplot2::aes(fill = factor(plot_list[[1]]$value, levels = breaks)))}+
#'     #ggplot2::geom_point(data= subset(plot_list[[1]], value=="X"), ggplot2::aes(x = y, y = x), show.legend = T)+
#'     {if(missings_x==T)
#'       ggplot2::geom_point(show.legend = legend_show_x, na.rm = T, ggplot2::aes(shape= factor(plot_list$shape, levels="X", labels=c("Missing"))))}+
#'     ggplot2::coord_fixed()+
#'     ggplot2::scale_fill_manual(values= colors, name="", na.translate = FALSE)+
#'     ggplot2::scale_y_discrete(name="", limits = rev(levels(plot_list[[1]]$x)), labels= varlabels, breaks=unique(plot_list[[1]]$x))+
#'     ggplot2::scale_x_discrete(name="", limits = levels(plot_list[[1]]$y), labels= varlabels, breaks=unique(plot_list[[1]]$y))+
#'     ggplot2::scale_shape_manual(name="", values = c("Missing"=4))+
#'     ggplot2::theme_classic()+
#'     ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.33, hjust=0),
#'                    axis.text.y = ggplot2::element_text(vjust = 0.33, hjust=0),
#'                    axis.title.x= ggplot2::element_blank(),
#'                    axis.title.y= ggplot2::element_blank(),
#'                    plot.margin = grid::unit(mar, "cm"),
#'                    plot.caption=ggplot2::element_text(hjust = 0))+
#'     ggplot2::ggtitle(plot_title)+
#'     ggplot2::guides(alpha="none",
#'                     fill  = ggplot2::guide_legend(order = 1),
#'                     shape = ggplot2::guide_legend(order = 2))+
#'     ggplot2::facet_wrap(~ samp, labeller = ggplot2::labeller(samp = labellist),ncol = ncol_facet)
#'
#'   if(note==T) comparison_plot<-comparison_plot + ggplot2::labs(caption = plot_list$note_text)
#'
#'
#'   if (diff_perc==T) {
#'     comparison_plot <- comparison_plot + ggplot2::geom_label(x=Inf, y=Inf,
#'                                                              ggplot2::aes(  hjust = 1, vjust = 1), data=summary_df,
#'                                                              label = summary_df$label,
#'                                                              fill = ggplot2::alpha("white", perc_diff_transparance),
#'                                                              color = ggplot2::alpha("black", 1), size= diff_perc_size, show.legend = F)}
#'
#'
#'
#'
#'
#'     return (comparison_plot)
#'
#' }
#'
#'
#'
#'
#' empty_finder <- function(df,samp_names){
#'
#'   varnames<-as.character(unique(df$y))
#'   samps<-as.character(unique(df$samp))
#'   sampnames<-samp_names
#'
#'
#'
#'   for (i in 1:length (varnames)) {
#'     for (j in 1:length (varnames)){
#'       for (k in 1:length (samps)) {
#'
#'         if ((length(df$value[df[,1]==varnames[i] & df[,2]==varnames[j] & df$samp==samps[k]])==0) &
#'             (any((df[,1]==varnames[i] & df[,2]==varnames[j] & is.na(df[,3]) & df$samp!=samps[k])))==F) df<-rbind(df, c(varnames[i],varnames[j],"X",NA,NA,samps[k],sampnames[k]))
#'         if ((length(df$value[df[,1]==varnames[i] & df[,2]==varnames[j] & df$samp==samps[k]])==0) &
#'             (any((df[,1]==varnames[i] & df[,2]==varnames[j] & df$samp!=samps[k])))==T) df<-rbind(df, c(varnames[i],varnames[j],NA,NA,NA,samps[k],sampnames[k]))
#'       }
#'
#'     }
#'
#'   }
#'   return (df)
#' }
#'
#'
#' empty_finder2 <- function(df){
#'
#'   varnames<-as.character(unique(df$y))
#'   sampnames<-as.character(unique(df$samp))
#'   length(varnames)
#'
#'   for (i in 1:length (varnames)) {
#'     for (j in 1:length (varnames)){
#'       for (k in 1:length (sampnames)) {
#'
#'         if ((length(df$value[df[,1]==varnames[i] & df[,2]==varnames[j] & df["samp"]==sampnames[k]])==0) &
#'             (any((df[,1]==varnames[i] & df[,2]==varnames[j] & is.na(df[,3]) & df["samp"]!=sampnames[k])))==F) df<-rbind(df, c(varnames[i],varnames[j],"X",NA,NA,sampnames[k]))
#'         if ((length(df$value[df[,1]==varnames[i] & df[,2]==varnames[j] & df["samp"]==sampnames[k]])==0) &
#'             (any((df[,1]==varnames[i] & df[,2]==varnames[j] & df["samp"]!=sampnames[k])))==T) df<-rbind(df, c(varnames[i],varnames[j],NA,NA,NA,sampnames[k]))
#'       }
#'
#'     }
#'
#'   }
#'   return (df)
#' }
#'
#'
#'
#' difference_summary<-function(results_object,sum_weights=NULL,breaks){
#'
#'   ### prepare needed variables ###
#'   varnames<-as.character(unique(results_object$x))
#'   samps<-unique(results_object$samp)
#'   results_object$sum_weight<-NA
#'   summary_df<-data.frame("samp"=NA,"label"=NA)
#'
#'   ### check for sum_weights ###
#'   if (is.null(sum_weights)) {
#'     sum_weights<-matrix(data=1, nrow=length(samps), ncol=length(varnames))
#'   }
#'
#'   for (i in 1:length(samps)){
#'
#'
#'     help_matrix<-matrix(NA, nrow=length(varnames), ncol=length(varnames))
#'     rownames(help_matrix)<-varnames
#'     colnames(help_matrix)<-varnames
#'
#'     ### build a weight matrix ###
#'     for (f in 1:length(varnames)){
#'       for (g in 1:length(varnames)){
#'         help_matrix[f,g]<-sum_weights[i,][f]*sum_weights[i,][g]
#'       }
#'     }
#'
#'     ### turn weight matrix to df ###
#'     help_matrix_df<-reshape2::melt(help_matrix)
#'     help_matrix_df$samp<-samps[i]
#'
#'
#'
#'     ### add help_matrix to results_object ###
#'     results_object$sum_weight[results_object$samp==samps[i]]<-help_matrix_df$value
#'     results_object$sum_weight[is.na(results_object$value)]<-NA
#'
#'     ### build a summary for every sample ###
#'
#'     percental_difference_b1<-sum(results_object$sum_weight[results_object$value == breaks[1] & is.na(results_object$value)==F
#'                                                            & results_object$samp==samps[i] & results_object$value != "X"])/
#'       sum(results_object$sum_weight[is.na(results_object$value)==F & results_object$samp==samps[i] & results_object$value != "X"])
#'
#'     percental_difference_b2<-sum(results_object$sum_weight[results_object$value == breaks[2] & is.na(results_object$value)==F
#'                                                            & results_object$samp==samps[i] & results_object$value != "X"])/
#'       sum(results_object$sum_weight[is.na(results_object$value)==F & results_object$samp==samps[i] & results_object$value != "X"])
#'     if (length(breaks)>2) {
#'       percental_difference_b3<-sum(results_object$sum_weight[results_object$value == breaks[3] & is.na(results_object$value)==F
#'                                                              & results_object$samp==samps[i] & results_object$value != "X"])/
#'         sum(results_object$sum_weight[is.na(results_object$value)==F & results_object$samp==samps[i] & results_object$value != "X"])}
#'
#'     diff_summary<-paste(breaks[1]," :",(round((percental_difference_b1), digits = 3)*100),"% \n",
#'                         breaks[2]," :",(round(percental_difference_b2, digits = 3)*100),"%")
#'     if (length(breaks)>2) diff_summary<-paste (diff_summary, "\n",breaks[3], " :", (round(percental_difference_b3, digits = 3)*100),"%")
#'
#'     summary_df[i,]<- c(as.character(samps[i]), diff_summary)
#'   }
#'
#'   return(summary_df)
#' }
#'
#'
#'
#'
#'
#'
#'
#'
#' ### Function, to weight correlations ###
#'
#' wgt_cor2<-function(df, i = NULL, weight_var = NULL, stratas = NULL, ids = NULL,
#'                   variables = NULL, remove_nas = "pairwise", bootstrap = F){
#'
#'
#'   ### look if bootstrap or not ###
#'   if (bootstrap==T) df<-df[i,]
#'
#'   ### prepare variables if needed ###
#'   if(is.null(variables)){
#'     variables<-colnames(df)
#'     if(is.null(ids)==F) if(is.na(ids) == F)variables<-variables[!variables == ids]
#'     if(is.null(weight_var)==F) if(is.na(weight_var) == F) variables<-variables[!variables == weight_var]
#'     if(is.null(stratas)==F) if(is.na(stratas) == F) variables<-variables[!variables == stratas]
#'   }
#'
#'   ### pret dataframe for the boot function
#'   if(remove_nas=="all") df<-stats::na.omit(df)
#'   df$rows<-1
#'
#'   ### prepare weight variables ###
#'   if(is.null(ids)==F) {
#'     if(is.na(ids)==F) ids<-df[,ids]
#'     else ids<-c(1:nrow(df))}
#'   if(is.null(ids)==T) ids<-c(1:nrow(df))
#'
#'   if(is.null(weight_var)==F) {
#'     if(is.na(weight_var)==F) weight_var<-df[,weight_var]
#'     else weight_var<-c(rep(1,nrow(df)))}
#'   if(is.null(weight_var)==T) weight_var<-c(rep(1,nrow(df)))
#'
#'   ### normalize the weight_var
#'   weight_var<- weight_var/(sum(weight_var)/nrow(df))
#'
#'   if(is.null(stratas)==F){
#'     if (is.na(stratas)==F) stratas<-df[,stratas]
#'     else stratas<-NULL
#'   }
#'
#'   ### Get surveydesign ###
#'   df_design<- survey::svydesign(id      = ids,
#'                                 strata  = stratas,
#'                                 weights = weight_var,
#'                                 nest    = FALSE,
#'                                 data    = df)
#'
#'
#'   ### correlation function ###
#'
#'   r_cor_func2<-function(design,var1,var2){
#'     ### check if variable is NA ###
#'     #if(sum(is.na(design$variables[,var1]))==length(design$variables[,var1])) return(NA)
#'     #if(sum(is.na(design$variables[,var2]))==length(design$variables[,var2])) return(NA)
#'     variance <- survey::svyvar(stats::reformulate(var1,var2), design, na.rm = T)
#'     as.matrix(variance)
#'     cormatrix<-stats::cov2cor(as.matrix(variance))
#'     cormatrix[2]
#'   }
#'
#'   ### use the correlation function, to get a vector with all correlations ###
#'
#'   helpfunc<-function(design,var1,vars2){
#'     mapply(r_cor_func2,
#'            var2=vars2,
#'            MoreArgs = list(design = design, var1=var1))
#'   }
#'
#'   r<-mapply(helpfunc,
#'             var1=variables,
#'             MoreArgs = list(design = df_design, vars2=variables))
#'
#'   # r<-c()
#'   # c<-1
#'   # for (i in 1:length(variables)){
#'   #   for (l in 1:length(variables)){
#'   #     r[(c)]<-r_cor_func(df_design,variables[i],variables[l])
#'   #     c<-c+1
#'   #   }}
#'   #
#'   # r<-matrix(r,ncol=4)
#'   # colnames(r)<-variables
#'   # rownames(r)<-variables
#'
#'
#'   ### Function to calculate pairwise n
#'   n_func<-function(design,var1,var2){
#'     rows<-"rows"
#'     formula<-paste(rows,  " ~ " ,var1, " + ", var2)
#'     n<-survey::svytotal(stats::as.formula(formula), design, na.rm = T)[1]
#'     names(n)<-NULL
#'     n
#'   }
#'
#'   helpfunc2<-function(design,var1,vars2){
#'     mapply(n_func,
#'            var2=vars2,
#'            MoreArgs = list(design = design, var1=var1))
#'   }
#'
#'   if(bootstrap==F) {n<-mapply(helpfunc2,
#'                               var1=variables,
#'                               MoreArgs = list(design = df_design, vars2=variables))}
#'
#'   # if(bootstrap==F) {
#'   #   n<-c()
#'   #   c<-1
#'   #   for (i in 1:length(variables)){
#'   #     for (l in 1:length(variables)){
#'   #       n[(c)]<-n_func(df_design,variables[i],variables[l])
#'   #       c<-c+1
#'   #     }}
#'   #
#'   #   n<-matrix(n,ncol=4)
#'   #   colnames(n)<-variables
#'   #   rownames(n)<-variables}
#'
#'
#'
#'   ### calculate analytical p-values ###
#'   if(bootstrap==F) t<-(abs(r)*sqrt(n-2))/sqrt(1-(r^2))
#'   if(bootstrap==F) p<-2*stats::pt(t,(n-2),lower.tail = F)
#'   if(bootstrap==F) output<- list(r=r,n=n,P=p)
#'
#'   if(bootstrap ==F ) return(output)
#'   if(bootstrap ==T ) return(r)
#' }
#'
#'
#'
#' bootstrap_correlations2<-function(df, weight=NULL, stratas=NULL,
#'                                  ids=NULL, variables=NULL, remove_nas="pairwise",
#'                                  nboots = 2, parallel = F){
#'
#'   # if(is.null(weight)==F) df[,weight]<-as.numeric(df[,weight])
#'   # if(is.null(stratas)==F) df[,stratas]<-as.numeric(df[stratas])
#'   # if(is.null(ids)==F) df[,ids]<-as.numeric(df[,ids])
#'   ### calculate bootstrap p_values ###
#'
#'   if(is.null(variables)){
#'     variables<-colnames(df)
#'     if(is.null(ids)==F) if(is.na(ids) == F)variables<-variables[!variables == ids]
#'     if(is.null(weight)==F) if(is.na(weight) == F) variables<-variables[!variables == weight]
#'     if(is.null(stratas)==F) if(is.na(stratas) == F) variables<-variables[!variables == stratas]
#'   }
#'
#'     ### perform bootstrap ###
#'
#'   if (parallel==T) para<-"snow"
#'   if (parallel==F) para<-"no"
#'
#'     boot_out<-boot(data = df, variables = variables, statistic = wgt_cor2, R = nboots,
#'                    weight_var = weight, stratas = stratas, ids = ids, bootstrap = T,
#'                    ncpus = (parallel::detectCores()-1), parallel = para)
#'
#'
#'   # p_value_func<-function(i,boot_object){
#'   #
#'   #   if(is.na(boot_object$t0[i])) return(NA)
#'   #   if(is.na(boot_object$t0[i])==F) return(
#'   #     mean(abs(boot_object$t[,i]) >= abs(boot_object$t0[i]))
#'   #   )
#'   # }
#'     #return(boot_out)
#'
#'     boot_pvalues<-function(boot_object,variables){
#'
#'       subfunc_boot_pvalues<-function(boot_object,i){
#'         if(is.na(as.vector(boot_object$t0)[i])==F){
#'
#'           r_null<-0
#'           in_interval<-TRUE
#'           alpha<-0
#'
#'           ### get p_values up to 0.0001
#'           while(in_interval){
#'
#'             alpha <- alpha + 0.0001
#'
#'             ci<-suppressWarnings(boot::boot.ci(boot.out=boot_object,type = "perc",index = i, conf = (1 - alpha)))
#'             cis<-c(ci$percent[4],ci$percent[5])
#'
#'             in_interval<-  (r_null > cis[1] & r_null < cis[2])|(r_null < cis[1] & r_null > cis[2])
#'           }
#'
#'           alpha<-alpha-0.0001
#'           in_interval<-TRUE
#'           ### get more exact p_values
#'
#'           while(in_interval){
#'
#'             alpha <- alpha + 0.000001
#'
#'             ci<-suppressWarnings(boot::boot.ci(boot.out=boot_object,type = "perc",index = i, conf = (1 - alpha)))
#'             cis<-c(ci$percent[4],ci$percent[5])
#'
#'             in_interval<-  (r_null > cis[1] & r_null < cis[2])|(r_null < cis[1] & r_null > cis[2])
#'           }
#'
#'           alpha
#'         }
#'
#'         else {alpha<-NA}
#'
#'         alpha
#'
#'       }
#'
#'       ps<-sapply(1:(length(variables)*length(variables)),subfunc_boot_pvalues, boot_object=boot_object)
#'       ps<-matrix(ps,ncol=length(variables))
#'       colnames(ps)<-variables
#'       rownames(ps)<-variables
#'       ps
#'       }
#'
#' boot_pvalues2<-function(boot_object,variables){
#'
#'   subfunc_boot_pvalues2<-function(boot_object,i){
#'
#'     if(is.na(as.vector(boot_object$t0)[i])==F){
#'       alpha<-boot.pval::boot.pval(boot_object, type="perc",theta_null=0,index = i)}
#'
#'     else {alpha<-c(NA)}
#'
#'     alpha
#'
#'   }
#'
#'   ps<-sapply(1:(length(variables)*length(variables)),subfunc_boot_pvalues2, boot_object=boot_object)
#'   ps<-matrix(ps,ncol=length(variables))
#'   colnames(ps)<-variables
#'   rownames(ps)<-variables
#'   ps}
#'
#'
#'   p<-boot_pvalues2(boot_out,variables)
#'
#'
#'   output<-wgt_cor2(df = df, weight_var = weight, stratas = stratas,
#'           ids = ids, variables = variables, remove_nas = remove_nas)
#'   output$P<-p
#'
#'   output
#' }
#'
#'
#'
#'
#' weighted_correlation2<-function(df, weight = NULL, stratas = NULL,
#'                                 ids = NULL, variables = NULL, remove_nas = "pairwise",
#'                                 nboots = 0, parallel = F){
#'
#'   if(nboots == 0){
#'     output<-wgt_cor2(df = df, weight_var = weight, stratas = stratas,
#'                     ids = ids, variables = variables, remove_nas = remove_nas, bootstrap=F)
#'   }
#'
#'   if (nboots!=0 & nboots <=1) {
#'     stop("nboots must be 0 (for analytic p_values) or >1 for bootstrap p_values")}
#'
#'   if(nboots > 1) {
#'     output<- bootstrap_correlations2(df = df, weight = weight, stratas = stratas,
#'                                     ids = ids, variables = variables,
#'                                     remove_nas= remove_nas, nboots = nboots,
#'                                     parallel = parallel)
#'   }
#'
#'   output
#'
#'
#'
#' }
#'
#'
#'
#'
Any scripts or data that you put into this service are public.
sampcompR documentation built on Aug. 8, 2025, 7:35 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
sampcompR
Comparing and Visualizing Differences Between Surveys

R/Bivdiff2.R
In sampcompR: Comparing and Visualizing Differences Between Surveys

Try the sampcompR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sampcompR Comparing and Visualizing Differences Between Surveys

R/Bivdiff2.R In sampcompR: Comparing and Visualizing Differences Between Surveys

Try the sampcompR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sampcompR
Comparing and Visualizing Differences Between Surveys

R/Bivdiff2.R
In sampcompR: Comparing and Visualizing Differences Between Surveys