knitr::opts_chunk$set(echo = F, warning = F, message = F, eval = T)
library(knitr) library(ISLR) library(tabplot) library(tidyverse) library(tabplot) library(randomForest) library(stringr) library(GGally) library(forcats) library(caret) library(ggcorrplot) library(Amelia) library(e1071)
wellPanel( selectInput('sql_or_lib' , label = 'Select Data Source' , choices = c('SQL', 'Sample_Data') , selected = 'Sample_Data') , selectInput('sample_data' , label = 'Select Sample Data' , choices = c('ISLR::OJ' , 'ISLR::Caravan' , 'ISLR::Auto' , 'ISLR::Carseats' , 'ISLR::Wage' , 'ISLR::College' , 'ISLR::Hitters' , 'ISLR::Default' , 'ISLR::Weekly' , 'ILSR::Wage categoricals' , 'ISLR::OJ numericals' , 'data diamonds') , selected = 'ISLR::Wage') # , textInput('server' # , label = 'Enter Server' # , width = 'auto') # # , textInput('query' # , label = 'Paste SQL' # , width = '100%') , textInput('path' , label = 'Save Analysis to this directory' , width = '100%' , value = "C:/Users/erbla/Documents") ) inputPanel( actionButton('load_data' , label = 'Load Data') , checkboxInput('save' , label = 'Save Results' , value = F) ) rea_load = eventReactive(input$load_data,{ if(input$sql_or_lib == 'SQL'){ require(RODBCext) con = odbcDriverConnect( str_c('Driver=SQL Server;Trusted_Connection=Yes;Server=', input$server) ) query = input$query data = sqlQuery(con, query = query) close(con) } else{ if(input$sample_data == 'ILSR::Wage categoricals'){ data = ISLR::Wage bool = ! summarise_all(data, is.numeric) data= data[,bool] }else if(input$sample_data == 'ISLR::OJ numericals'){ data = ISLR::OJ %>% as.data.frame() bool = summarise_all(data, is.numeric) data= data[,as.logical(bool[1,])] }else if(startsWith(input$sample_data,'data')) { set = input$sample_data %>% stringr::str_extract(' [A-Za-z0-9]+$')%>% stringr::str_trim() exec_str = set %>% stringr::str_c('data(',.,')') eval(parse(text = exec_str) ) exec_str = set data = eval(parse(text = exec_str) ) }else{ exec_str = input$sample_data data = eval(parse(text = exec_str) ) } } return(data) }) save_plot = function(plot , path , name , yes , scale = 1 , excel = T){ require(tidyverse) if(yes == F) return() folder_name = lubridate::today() %>% as.character() %>% stringr::str_replace_all('-','')%>% stringr::str_c('_multiview') path = path %>% stringr::str_c('/',folder_name) dir.create(path) file_name_plot = path %>% stringr::str_c('/',name,'.png') file_name_excel = path %>% stringr::str_c('/',name,'.xls') # tabplot object should save with the same function safe_ggsave = safely(ggsave) safe_tableSafe = safely(tabplot::tableSave) s = safe_ggsave(file_name_plot , plot = plot , scale = scale) if( !purrr::is_empty(s$error) ){ print(s$error) s = safe_tableSafe(tab = plot ,filename = file_name_plot , scale = scale) } if( !purrr::is_empty(s$error) ){ print(s$error) plot dev.copy(png, filename = file_name_plot ) dev.off() } if(excel == T){ coords = ggplot_build(plot) if(file.exists(file_name_excel)) { file.remove(file_name_excel) } purrr::pwalk(list( x = coords$data , sheetName = as.character(1:length(coords$data) ) ) , xlsx::write.xlsx , file = file_name_excel , col.names = T , row.names = T , append = T ) } }
DT::renderDataTable( options = list( pageLength = 5) ,{ d_clean = rea_clean() data = d_clean$data all_variables = d_clean$all_variables return( data[ ,all_variables] ) })
renderPrint({ d_clean = rea_clean() data = d_clean$data numericals = d_clean$numericals print(summary( data[, numericals])) })
renderPrint({ d_clean = rea_clean() data = d_clean$data categoricals = d_clean$categoricals print(summary( data[, categoricals])) })
renderPlot({ d_clean = rea_clean() data = d_clean$data all_variables = d_clean$all_variables p = Amelia::missmap( as_data_frame(data)[, all_variables]) return(p) })
inputPanel( numericInput('max_no_lvls_fctr' , label = 'Max number of levels for categorical variables' , value = 10 , min = 1 , step = 1) , numericInput('min_no_vals_num' , label = 'Minimum number of distinct values for numerical variables' , value = 6 , min = 1 , step = 1) , checkboxInput('missing' , label = 'Exclude Missing Values' , value = F) ) renderUI({ data = rea_load() checkboxGroupInput('deselect_cols' , label = 'Select Variables to exclude' , choices = names(data) ) }) rea_clean = reactive({ data = rea_load() if(! purrr::is_empty(input$deselect_cols) ) { data = data %>% select( - one_of(input$deselect_cols) ) } d_clean = f_clean_data(data , as.numeric(input$max_no_lvls_fctr) , as.numeric(input$min_no_vals_num) ) return(d_clean) }) f_clean_data = function(data , max_number_of_levels_factors = 10 , min_number_of_levels_nums = 6){ require(tidyverse) require(stringr) require(caret) require(forcats) data = data %>% as_tibble() numericals = names(data)[ map_lgl( data, is.numeric)] categoricals = names(data)[ ! map_lgl( data, is.numeric)] all_variables = names(data) # add numericals with less than x distinc values to factors no_unique_vals = map_dbl(data[, numericals], function(x) length(unique(x)) ) categoricals = c(categoricals, numericals[no_unique_vals < min_number_of_levels_nums] ) numericals = numericals[ !numericals %in% categoricals ] # convert all non numericals and grouping variables to factors for (var in categoricals ){ data[[var]] = as.factor(data[[var]]) } # collapse smallest levels into one if factor levels exceed size x no_levels = map_dbl( data[, categoricals], function(x) length( levels(x) ) ) for (var in names( no_levels[no_levels > max_number_of_levels_factors] )) { data[[var]] = fct_lump(data[[var]], n = max_number_of_levels_factors) } # Boxcox transform numericals if(!is_empty(numericals)){ data_box_cox = data[, numericals] %>% mutate_all( function(x) x + min(x) + 0.00001 ) trans = preProcess( as.data.frame(data_box_cox), c('BoxCox')) pred = predict(trans, as.data.frame(data_box_cox) ) names_transformed = names(pred) %>% str_c('_boxcox') names(pred) = names_transformed data = data %>% bind_cols(pred) if(!ncol(data[,numericals]) == ncol(data[,names_transformed])) { stop( 'boxcox transformation unsucessfull') } } else { names_transformed = NULL } # drop categoricals with only one level if( !is_empty(categoricals)){ data = data %>% mutate_at( vars(one_of(categoricals) ) , fct_drop ) no_lvl = data[,categoricals] %>% summarise_all( function(x) list(levels(x)) ) %>% summarise_all( function(x) length(x[[1]]) ) only_one_lvl = categoricals[no_lvl == 1] if(length(only_one_lvl) > 0 ) { data = data %>% select( - one_of(only_one_lvl) ) all_variables = all_variables[ !all_variables %in% only_one_lvl ] categoricals = categoricals[ no_lvl > 1] } } # drop observations with missing Values if(input$missing == T) data = data[complete.cases(data),] # return statement return( list( data = data , numericals = numericals , categoricals = categoricals , all_variables = all_variables , boxcox = names_transformed ) ) }
renderUI({ d_clean = rea_clean() categoricals = d_clean$categoricals inputPanel( selectInput("group_stats", label = "select grouping variable", choices = categoricals, selected = categoricals[1]) ) }) f_bring_to_pos_range = function(x){ if( min(x)< 0) x = x + abs(min(x)) return(x) } f_diff_of_means_medians = function(df, group, variable){ data = df %>% select( group = one_of(group), variable = one_of(variable) )%>% mutate ( variable = f_bring_to_pos_range(variable) ) %>% group_by( group ) %>% summarise( means = mean(variable, na.rm = T) , medians = median(variable, na.rm = T) ) %>% ungroup() %>% summarise( diff_of_means = max(means) - min(means) , diff_of_means_perc = ( ( max(means) - min(means) ) /max(means) ) *100 , diff_of_medians = max(medians) - min(medians) , diff_of_medians_perc = ( ( max(medians) - min(medians) ) /max(means) ) *100 ) return(data) } f_max_diff_of_freq = function(df, var1, var2){ t = table( df[[var1]], df[[var2]] ) %>% as_tibble() %>% group_by( Var1 ) %>% mutate ( diff_var1 = ( max(n)-min(n) ) , diff_var1_perc = ( ( max(n)-min(n))/ max(n) *100) ) %>% group_by( Var2 ) %>% mutate ( diff_var2 = ( max(n)-min(n) ) , diff_var2_perc = ( ( max(n)-min(n))/ max(n) *100) ) %>% ungroup()%>% summarise( max_diff_freq = max( c(diff_var1, diff_var2) ) , max_diff_freq_perc = max( c(diff_var1_perc, diff_var2_perc) ) ) } f_anova_stats = function(df, group, variables) { # returns a dataframe with anova stats # df : dataframe # group : grouping variable as character vector, must indicate factor variable # variables : numerical variables to be analyzed as character vector, must indicate numerical variable data = df formula = stringr::str_c('value~',group) %>% as.formula() df_anova = data %>% as_tibble() %>% select( one_of( c(group, variables) ) ) %>% gather(key = 'variable', value = 'value', one_of( variables ) ) %>% group_by( variable ) %>% nest( one_of(group), value) %>% mutate( model_anova = purrr::map( data, ~aov( formula, data = .)) , summary_anova = purrr::map( model_anova, summary) , summary_anova = purrr::map( summary_anova , function(x) x[[1]]) , anova_pval = purrr::map(summary_anova, 'Pr(>F)') , anova_pval = purrr::map_dbl(anova_pval , function(x) x[1]) , model_kruskal = purrr::map( data, ~kruskal.test(formula, data = .) ) , kruskal_pval = purrr::map_dbl(model_kruskal, 'p.value') , model_shapiro = purrr::map(data , function(x) shapiro.test(sample(x$value, 5000, replace = T) ) ) , shapiro_stat = purrr::map_dbl(model_shapiro,'statistic') , shapiro_pval = purrr::map_dbl(model_shapiro,'p.value') , diff_df = purrr::map( data, f_diff_of_means_medians, group = group, variable = 'value') ) %>% unnest(diff_df) %>% select(variable , shapiro_stat , shapiro_pval , anova_pval , kruskal_pval , diff_of_means , diff_of_means_perc , diff_of_medians , diff_of_medians_perc ) return(df_anova) } f_chi_square = function(df, group, variables) { # returns a dataframe with anova stats # df : dataframe # group : grouping variable as character vector, must indicate factor variable # variables : numerical variables to be analyzed as character vector, must indicate factor variable data = df variables = variables[!variables == group] if(purrr::is_empty(variables)) return() df_chi = data %>% as_tibble() %>% select( one_of( c(group, variables) ) ) %>% gather(key = 'variable', value = 'value', one_of( variables ) ) %>% group_by( variable ) %>% nest( one_of( group ), value) %>% mutate( model_chi = purrr::map( data, ~chisq.test(x = .[[group]] , y = .[['value']] ) ) ,chi_pval = purrr::map_dbl(model_chi, 'p.value') ,diff_df = purrr::map(data, f_max_diff_of_freq, group, 'value') ) %>% unnest(diff_df) %>% select(variable, chi_pval, max_diff_freq, max_diff_freq_perc) return(df_chi) } rea_group_stats = reactive({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox numericals = d_clean$numericals categoricals = d_clean$categoricals if(! is_empty(numericals)) { df_num = f_anova_stats(df = data ,group = input$group_stats ,variables = numericals ) df_boxcox = f_anova_stats(df = data ,group = input$group_stats ,variables = boxcox ) sig_num = df_num %>% mutate( p_val = ifelse(shapiro_stat >= 0.9 , anova_pval , kruskal_pval) ) %>% filter(p_val <= 0.001) %>% .[['variable']] sig_boxcox = df_boxcox %>% mutate( p_val = ifelse(shapiro_stat >= 0.9 , anova_pval , kruskal_pval) ) %>% filter(p_val <= 0.001) %>% .[['variable']] } else { df_num = NULL df_boxcox = NULL sig_num = NULL sig_boxcox = NULL } # only works with minimum two categoricals vars if( length(categoricals) >= 2 ){ df_chi = f_chi_square(df = data ,group = input$group_stats ,variables = categoricals ) sig_chi = df_chi %>% filter(chi_pval <= 0.001) %>% .[['variable']] } else{ df_chi = NULL sig_chi = NULL } return( list( df = list(df_num = df_num ,df_boxcox = df_boxcox ,df_chi = df_chi ) , sig = list(sig_num = sig_num ,sig_boxcox = sig_boxcox ,sig_chi = sig_chi ) ) ) })
render_table_num_group_stat = function( boxcox = F) { DT::renderDataTable( extensions = 'Buttons' , options = list( dom = 'Bftrip' , buttons = c('copy', 'excel') , pageLength = 1 ) ,{ group = input$group_stats if( boxcox == F ){ df = rea_group_stats()$df$df_num }else{ df = rea_group_stats()$df$df_boxcox } df = DT::datatable(df , extensions = 'Buttons' , options = list( dom = 'Bftrip' , buttons = c('copy', 'excel') ) ) %>% DT::formatStyle('shapiro_stat' , color = DT::styleInterval(0.9, c('red','green') ) ) %>% DT::formatStyle(3:5 , color = DT::styleInterval(0.01, c('green','red') ) ) %>% DT::formatRound(2:5, 6) %>% DT::formatRound(c(6,8), 3) %>% DT::formatRound(c(7,9), 2) }) } render_table_num_group_stat( boxcox = F )
render_table_num_group_stat( boxcox = T )
DT::renderDataTable({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox numericals = d_clean$numericals categoricals = d_clean$categoricals group = input$group_stats df = rea_group_stats()$df$df_chi df = DT::datatable(df , extensions = 'Buttons' , options = list( dom = 'Bftrip' , buttons = c('copy', 'excel') ) ) %>% DT::formatStyle('chi_pval' , color = DT::styleInterval(0.01, c('green','red') ) ) %>% DT::formatRound( 2, 6) %>% DT::formatRound( 4, 2) return(df) })
renderUI({ d_clean = rea_clean() categoricals = d_clean$categoricals numericals = d_clean$numericals inputPanel( selectInput("group_imp" , label = "select predicted variable" , choices = c(categoricals, numericals) , selected = categoricals[1]) ) }) rea_imp = reactive({ d_clean = rea_clean() data = d_clean$data all_variables = d_clean$all_variables form = as.formula( stringr::str_c(input$group_imp,'~.') ) m = randomForest::randomForest(form, data[, all_variables] ) imp = tibble( mean_decrease_gini = m$importance[,1] ,variable = names(m$importance[,1]) ,group = input$group_imp) %>% arrange(desc(mean_decrease_gini)) return(imp) }) renderPlot({ imp = rea_imp() p = ggplot(imp) + geom_bar(aes(x = fct_reorder(variable , mean_decrease_gini) , y = mean_decrease_gini , fill = fct_reorder(variable , mean_decrease_gini , .desc = T) ) , stat = 'identity' , show.legend = F) + coord_flip() + labs( x = 'Variable' , y = 'Predictive capacity (Mean decrease in Gini)') name = stringr::str_c( input$sql_or_lib,'_predictive_capacity_', imp[[1,'group']] ) save_plot(plot = p , path = input$path , name = name , yes = input$save ) return(p) })
renderUI({ d_clean = rea_clean() categoricals = d_clean$categoricals numericals = d_clean$numericals inputPanel( selectInput("group_tree" , label = "select predicted variable" , choices = c(categoricals, numericals) , selected = categoricals[1] ) , numericInput('min_split' ,label = 'minimum node size to attempt split' , min = 2 , max = 10000 , step = 1 , value = 20 ) , numericInput('max_depth' , label = 'maximum tree depth' , min = 2 , max = 30 , step = 1 , value = 30 ) , numericInput('cp' ,label = 'minimum complexity reduction to attempt split' , min = 0.0001 , max = 0.99 , step = 0.001 , value = 0.01 ) , numericInput('xval' ,label = 'k-fold cross validation' , min = 0 , max = 100 , step = 1 , value = 10 ) ) }) rea_tree = reactive({ d_clean = rea_clean() data = d_clean$data pred = input$group_tree form = stringr::str_c(pred, '~.') %>% as.formula() m = rpart::rpart(form , data , minsplit = input$min_split , cp = input$cp #, maxcompete = 4 #, maxsurrogate = 5 #, usesurrogate = 2 #, xval = 10 #, surrogatestyle = 0 , maxdepth = input$max_depth , xval = input$xval) return(m) }) renderPlot({ m = rea_tree() rpart.plot::prp(m , branch.type = 5 , box.palette ="RdYlGn" , faclen = 0 , extra = 6 , fallen.leaves = input$fallen_leaves , tweak = input$tweak , gap = input$gap , space = input$space ) name = stringr::str_c( input$sql_or_lib,'_tree1_', input$group_tree ) save_plot(plot = p , path = input$path , name = name , yes = input$save , excel = F ) }) renderPlot({ m = rea_tree() rpart.plot::rpart.plot(m , fallen.leaves = T , tweak = input$tweak , gap = input$gap , space = input$space ) name = stringr::str_c( input$sql_or_lib,'_tree2_', input$group_tree ) save_plot(plot = p , path = input$path , name = name , yes = input$save , excel = F ) }) inputPanel( checkboxInput('fallen_leaves' , label = 'fallen_leaves' , value = T) , numericInput('tweak' ,label = 'increase_text_size' , min = 0.1 , max = 10 , step = 0.1 , value = 1.2 ) , numericInput('gap' , label = 'gap' , min = 0 , max = 100 , step = 1 , value = 2 ) , numericInput('space' , label = 'space' , min = 0 , max = 100 , step = 1 , value = 2 ) )
Berechnent die ersten beiden Hauptkomponenten der numerisch- kontinuierlichen Variablen.
rea_pca = reactive({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals if(is_empty(numericals) | is_empty(boxcox)) return() if(input$boxcox_pca == T) { vars = boxcox } else{ vars = numericals } pca = prcomp(x = select(data, one_of(vars) ) , scale. = input$scale_pca , center = input$center_pca) pca$cos2 = pca$rotation^2 pca$contrib = lmap( as_tibble(pca$cos2 ), function(y) y/ sum(y) *100 ) row.names(pca$contrib) = row.names(pca$cos2) pca$vae = as_tibble (t( pca$sdev / sum(pca$sdev) *100 ) ) colnames(pca$vae) = colnames(pca$contrib) pca$contrib_abs_perc = t( t( apply( pca$contrib/100, 1, function(x,y) x*pca$vae ) ) ) pca$contrib_abs_perc = unnest( as.data.frame( pca$contrib_abs_perc) ) row.names(pca$contrib_abs_perc) = row.names(pca$contrib) pca$contrib_abs_perc = as.data.frame( t(pca$contrib_abs_perc) ) pca$contrib_abs_perc$var = row.names(pca$contrib_abs_perc) pca$contrib_abs_perc = pca$contrib_abs_perc %>% gather(key = 'key', value = 'value', everything(), -var) #group variables with less than 2.5% contribution pca$contrib_abs_perc_reduced = pca$contrib_abs_perc %>% mutate(key = ifelse(value < 2.5 , ' sum contrib < 2.5%' , key)) %>% group_by( var, key) %>% summarise( value = sum(value) ) # filter principle components that explain less than # 2.5% of the variance pca$x = pca$x[, pca$vae > 2.5] pca$contrib_abs_perc = pca$contrib_abs_perc %>% filter(var %in% colnames(pca$x)) pca$contrib_abs_perc_reduced = pca$contrib_abs_perc_reduced %>% filter(var %in% colnames(pca$x)) data = data %>% cbind(pca$x) return(list(data = data, pca = pca) ) }) renderUI({ inputPanel( checkboxInput('center_pca', label = 'center', value = T) , checkboxInput('scale_pca', label = 'scale', value = T) , checkboxInput('boxcox_pca', label = 'boxcox', value = T) ) })
# rotation plot renderPlot({ data = rea_pca()$data pca = rea_pca()$pca group = input$group_pca x_axis = input$x_axis_pca y_axis = input$y_axis_pca if(input$group_pca == 'None') group = NULL p = ggplot(data) + geom_point( aes_string(x = x_axis, y = y_axis, color = group) ,alpha=0.4 ) + labs(title = 'plot1') name = stringr::str_c( input$sql_or_lib,'_principle_component_')%>% stringr::str_c( input$group_pca ) %>% stringr::str_c('_boxcox', input$boxcox_pca) %>% stringr::str_c('_scale', input$scale_pca) %>% stringr::str_c('_center', input$center_pca) save_plot(plot = p , path = input$path , name = name , yes = input$save ) return( p ) }) renderUI({ d_clean = rea_clean() d_pca = rea_pca() data = d_pca$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals prin_comp = select(data, starts_with('PC', ignore.case = F)) %>% names() inputPanel( selectInput("x_axis_pca" , label = "select x_axis" , choices = prin_comp , selected = prin_comp[2] ) , selectInput("y_axis_pca" , label = "select y_axis" , choices = prin_comp , selected = prin_comp[1] ) , selectInput("group_pca" , label = "select grouping variable" , choices = c(categoricals, 'None') , selected = categoricals[1] ) ) })
# variance explained renderPlot({ pca = rea_pca()$pca vae = tibble( value = pca$sdev , pca_n = str_c('pca', 1:length(pca$sdev)) ) p = ggplot(pca$contrib_abs_perc_reduced) + geom_bar(aes(x = fct_reorder(var, value, sum, .desc = T ) , y = value, fill = key) , stat = 'identity' , position='stack') + scale_fill_brewer(palette = 'Paired') name = stringr::str_c( input$sql_or_lib,'_variance_explained_')%>% stringr::str_c( input$group_pca ) %>% stringr::str_c('_boxcox', input$boxcox_pca) %>% stringr::str_c('_scale', input$scale_pca) %>% stringr::str_c('_center', input$center_pca) save_plot(plot = p , path = input$path , name = name , yes = input$save ) return(p) })
rea_corr = reactive({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals if(is_empty(boxcox)) return() corr = round( cor(data[, boxcox]), 1) p_val_corr = ggcorrplot::cor_pmat( data[, boxcox] ) return( list(corr = corr, p_val_corr = p_val_corr) ) }) renderPlot(width = 1024, height = 1024, { corr = rea_corr()$corr p_val_corr = rea_corr()$p_val_corr p = ggcorrplot::ggcorrplot( corr ,hc.order = TRUE , type = "lower" , lab = TRUE , p.mat = p_val_corr ) name = stringr::str_c( input$sql_or_lib,'_correlation') save_plot(plot = p , path = input$path , name = name , yes = input$save ) return(p) })
preselect = c('all' ,'none' ,'categoricals' ,'numericals' ,'top_10_importance' ,'boxcox' ,'none_boxcox' ,'pc1+pc2' ,'pc1+pc2 contrib >2.5%' ,'correlation' ,'group_stat_P<0.001' ,'2fac_2num' ) renderUI({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals inputPanel( sliderInput('n_breaks_tab' , label = "Number of bins:" , min = 1 , max = 500 , step = 1 , value = 30) , selectInput("sort_col_tab" , label = "Sort_by" , choices = names(data) , selected = numericals[1]) , checkboxInput('decreasing_tab' , label = 'decreasing' , value = T) , sliderInput('max_levels_tab' , label = "Maximum number of levels for categorical variables" , min = 3 , max = 30 , step = 1 , value = 10) , selectInput("preselect", label = "preselect" ,choices = preselect ,selected = 'top_10_importance') , selectInput("preselect_correlation", label = "select_correlation" ,choices = boxcox ,selected = boxcox[1]) ) }) renderPlot( { d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals imp = rea_imp() p = tabplot::tableplot(data , select_string = input$select_vars_tab , sortCol = input$sort_col_tab , decreasing = input$decreasing_tab , nBins = input$n_breaks_tab , max_levels = input$max_levels_tab , scales = 'lin') index_str = stringr::str_c( which(all_variables %in% input$select_vars_tab) , collapse = '+' ) name = stringr::str_c( input$sql_or_lib,'_tabplot_')%>% stringr::str_c('sort_', input$sort_col_tab, '_') %>% stringr::str_c( index_str) save_plot(plot = p , path = input$path , name = name , yes = input$save , excel = F ) return(p) }) chk_bx_grp_ui = function(x,y, name, label) { renderUI({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals imp = rea_imp() p_val_corr = rea_corr()$p_val_corr pca = rea_pca()$pca if(input[[x]] == 'all') preselect = names(data) if(input[[x]] == 'none') preselect = NULL if(input[[x]] == 'top_10_importance') preselect = imp$variable[1:10] if(input[[x]] == 'boxcox') preselect = boxcox if(input[[x]] == 'none_boxcox') preselect = names(data)[!names(data) %in% boxcox] if(input[[x]] == 'numericals') preselect = numericals if(input[[x]] == 'categoricals') preselect = categoricals if(input[[x]] == 'group_stat_P<0.001') { sig = rea_group_stats()$sig #remove boxcox that are also in numericals sig_boxcox = sig$sig_boxcox %>% stringr::str_extract('^[A-Za-z]*') %>% .[!. %in% sig$sig_num] %>% stringr::str_c('_boxcox') preselect = stringr::str_c( c(sig$sig_num, sig_boxcox, sig$sig_chi ) ) } if(input[[x]] == 'correlation') { correlating = p_val_corr %>% as_tibble() %>% mutate( vars = names(.) ) %>% select( filter_var = one_of(input[[y]]), vars) %>% filter( filter_var <= 0.05 ) %>% select( vars ) preselect = correlating$vars } if( startsWith( input[[x]], 'pc1+pc2')) { if(endsWith( input[[x]], '>2.5%')){ pca_1_2 = pca$contrib_abs_perc_reduced %>% filter(var %in% c('PC1', 'PC2') ) preselect = pca_1_2$key }else { pca_1_2 = pca$contrib_abs_perc %>% filter(var %in% c('PC1', 'PC2') ) preselect = pca_1_2$key } } if(input[[x]] == '2fac_2num') { preselect = c(numericals[1], categoricals[1] ,numericals[2], categoricals[2] ,numericals[3], categoricals[3] ,numericals[4], categoricals[4] )[1:4] } checkboxGroupInput( name , label = label , choices = names(data) , selected = preselect , inline = F ) }) } chk_bx_grp_ui('preselect', 'preselect_correlation', name = 'select_vars_tab', label = 'Select Variables')
renderUI({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals inputPanel( selectInput("preselect_duo_col" , label = "Preselect Column" ,choices = preselect ,selected = '2fac_2num') , selectInput("preselect_duo_row" , label = "Preselect Row" ,choices = preselect ,selected = '2fac_2num') ) }) renderUI({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals inputPanel( selectInput("preselect_corr_duo_col" , label = "Select Correlation Column" ,choices = boxcox ,selected = boxcox[1]) , selectInput("preselect_corr_duo_row" , label = "Select Correlation Row" ,choices = boxcox ,selected = boxcox[1]) ) }) rea_duo = eventReactive(input$but_duo , { return( list(col = input$select_vars_duo_col ,row = input$select_vars_duo_row ,group = input$group_duo ,corr = input$corr_duo) ) } ) renderPlot({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals ret = rea_duo() col = ret$col row = ret$row group = ret$group corr = ret$corr if(group == 'None') group = NULL lm_with_cor <- function(data, mapping, ..., method = "pearson") { x <- data[[deparse(mapping$x)]] y <- data[[deparse(mapping$y)]] cor <- cor(x, y, method = method) GGally::ggally_smooth_lm(data, mapping, ...) + ggplot2::geom_label( data = data.frame( x = min(x, na.rm = TRUE), y = max(y, na.rm = TRUE), lab = round(cor, digits = 3) ), mapping = ggplot2::aes(x = x, y = y, label = lab, color = NULL), hjust = 0, vjust = 1, size = 5, fontface = "bold" ) } if(corr == T) { p =GGally::ggduo( data, col, row, mapping = aes_string(color = group), types = list(continuous = GGally::wrap(lm_with_cor, alpha = 0.25)), showStrips = FALSE ) + theme(legend.position = "bottom") } else{ p = GGally::ggduo( data, col, row, mapping = aes_string(color = group), #types = list(continuous = wrap(lm_with_cor, alpha = 0.25)), showStrips = FALSE ) + theme(legend.position = "bottom") } index_str_row = stringr::str_c( which(all_variables %in% row) , collapse = '+' ) index_str_col = stringr::str_c( which(all_variables %in% col) , collapse = '+' ) name = stringr::str_c( input$sql_or_lib,'_ggduo')%>% stringr::str_c('_sort', input$sort_col_tab ) %>% stringr::str_c( '_col', index_str_col) %>% stringr::str_c( '_row', index_str_row) save_plot(plot = p , path = input$path , name = name , yes = input$save , excel = F ) return(p) }) renderUI({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals inputPanel( selectInput("group_duo" , label = "select grouping variable" , choices = c(categoricals, 'None') , selected = categoricals[length(categoricals)] ) , checkboxInput('corr_duo' , label = 'show corr coef' , value = T) , actionButton('but_duo', 'Render Plot') #, cellArgs = list(width = '50%') ) }) # chk_bx_grp_ui has its own gui rendering function splitLayout( chk_bx_grp_ui('preselect_duo_col', 'preselect_corr_duo_col' , name = 'select_vars_duo_col' , label = 'Select Variables Column') , chk_bx_grp_ui('preselect_duo_row', 'preselect_corr_duo_row' , name = 'select_vars_duo_row' , label = 'Select Variables Row') )
input_panel_histo = function(suffix) { renderUI({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals inputPanel( selectInput(str_c("n_breaks", suffix) , label = "Number of bins:" , choices = c(10, 20, 30, 50, 75, 100, 150, 200, 300, 500, 750, 1000) , selected = 30 ) , selectInput(str_c("variable", suffix) , label = "Select variable" , choices = names(data) , selected = numericals[1] ) , selectInput(str_c( "group", suffix) , label = "Select grouping variable" , choices = c(categoricals , 'None') , selected = categoricals[1] ) , selectInput(str_c("graph_type", suffix) , label = "Graph Type" , choices = c('bar', 'line', 'violin') , selected = 'line') , selectInput(str_c("y_axis", suffix) , label = "Y-Axis" , choices = c('density', 'counts') , selected = 'density') , checkboxInput(str_c( 'auto_range', suffix ) , label = 'use automatic range' , value = T) , textInput(str_c('x_min', suffix) , label = 'X-min' , value = '0') , textInput(str_c('x_max', suffix) , label = 'X-max' , value = '1000') ) }) } renderPlot_histo = function( suffix ){ renderPlot({ d_clean = rea_clean() data = d_clean$data boxcox = d_clean$boxcox categoricals = d_clean$categoricals all_variables = d_clean$all_variables numericals = d_clean$numericals # get input n_breaks = input[[ str_c("n_breaks" , suffix) ]] variable = input[[ str_c("variable" , suffix) ]] group = input[[ str_c( "group" , suffix) ]] graph_type = input[[ str_c("graph_type" , suffix) ]] y_axis = input[[ str_c("y_axis" , suffix) ]] auto_range = input[[ str_c( 'auto_range', suffix) ]] x_min = input[[ str_c('x_min' , suffix) ]] x_max = input[[ str_c('x_max' , suffix) ]] #y-axis if(y_axis == 'density') { y_axis = '..density..' } else{ y_axis = '..count..' } #group if(group == 'None') group = NULL # numericals ---------------------------------------------------------------------------- numericals = c(numericals, boxcox) #geom_freqpoly if(variable %in% numericals & graph_type == 'line'){ p = data %>% ggplot() + geom_freqpoly( aes_string(x = variable, y = y_axis, color = group) , bins = as.numeric(n_breaks)) } #geom_histo if(variable %in% numericals & graph_type == 'bar' ){ p = data %>% ggplot() + geom_histogram( aes_string(x = variable, y = y_axis, fill = group) , bins = as.numeric(n_breaks), alpha = 0.6 , position = 'identity') } #geom_violin if(variable %in% numericals & graph_type == 'violin'){ medians = data %>% group_by_(as.symbol(group)) %>% select( one_of(numericals ) ) %>% summarise_all( median ) p = data %>% ggplot() + geom_violin( aes_string(x = group , y = variable , fill = group) ) + geom_crossbar( data = medians, mapping = aes_string(x = group , y = variable , ymin = variable , ymax = variable) ) } # add x range if(variable %in% numericals & auto_range == F & !graph_type == 'violin'){ p = p + xlim( c( as.numeric(x_min), as.numeric(x_max)) ) } # add y range if(variable %in% numericals & auto_range == F & graph_type == 'violin'){ p = p + ylim( c( as.numeric(x_min), as.numeric(x_max)) ) } # categoricals ---------------------------------------------------------------------------- #geom_bar if(variable %in% categoricals ){ if(y_axis == '..density..') y_axis = '..prop..' p = data %>% ggplot() + geom_bar( aes_string(x = variable, y = y_axis, fill = group, group = group) , position = 'dodge') } y_axis_str = stringr::str_extract_all(y_axis , '[A-Za-z]') %>% unlist()%>% stringr::str_c(collapse = '') name = stringr::str_c( input$sql_or_lib, '_')%>% stringr::str_c(suffix ) %>% stringr::str_c('_', graph_type ) %>% stringr::str_c('_', y_axis_str) %>% stringr::str_c( '_group', group) %>% stringr::str_c( '_var', variable) %>% stringr::str_c( '_', n_breaks) save_plot(plot = p , path = input$path , name = name , yes = input$save ) return(p) }) }
input_panel_histo('histo_b') renderPlot_histo('histo_b')
input_panel_histo('histo_c') renderPlot_histo('histo_c')
input_panel_histo('histo_d') renderPlot_histo('histo_d')
# som can use contineous and categorical variables # categorical variables need to be converted to a # class matrix and added as an extra layer # variables data = diamonds map_dimension = 20 # prepare data numerics = summarise_all( data, is.numeric ) %>% as.logical() numerics factors = names(data)%>% .[!numerics] numerics = names(data)%>% .[numerics] fct_to_matrix = function(data, factor_vec){ name = names(factor_vec) mat = kohonen::classvec2classmat( factor_vec ) names(mat) = name return(mat) } data_list = list() distances = vector() for (fac in factors){ data_list[[fac]] = kohonen::classvec2classmat( data[[fac]] ) distances = c(distances, 'tanimoto') } data_list[['numerics']] = scale(data[,numerics]) distances = c( distances, 'euclidean') # create a grid onto which the som will be mapped som_grid = kohonen::somgrid(xdim = 20 , ydim=20 , topo="hexagonal") m = kohonen::supersom( data_list , grid=som_grid , rlen=100 , alpha = 0.05 , whatmap = c(factors, 'numerics') , dist.fcts = distances #, maxNA.fraction = .5 ) plot(m, type="changes") plot(m, type="counts") plot(m, type="dist.neighbours") plot(m, type="codes") plot(m, type="quality") # cluster map segments codes = tibble( layers = names(m$codes) ,codes = m$codes ) %>% mutate( codes = purrr::map(codes, as_tibble) ) %>% spread( key = layers, value = codes) %>% apply(1, bind_cols) %>% .[[1]] %>% as_tibble() dist_m = dist(codes) %>% as.matrix() # generate distance map based on position on map # map_dimension = 3 max_dist = Inf # max(dist_m) dist_on_map = tibble( r = rep(1:map_dimension, map_dimension) , c = rep(1:map_dimension, map_dimension) %>% sort() ) %>% dist() %>% as.matrix() %>% # distance for direct neighbours will be 1 and for # diagonal neighbours 1.41 apply( 1:2, function(x) ifelse(x>= 1.5, max_dist, x) ) %>% apply( 1:2, function(x) ifelse(x == max_dist, max_dist, 1) ) # multiply nodes distance with distance on map # this will set all distances between none neighbouring # points to infinity dist_adjusted = dist_m * dist_on_map dist_m = as.dist(dist_m) clust1 = hclust(dist_m, method = 'single') clust2 = hc(dist_adjusted, method = 'average') k_m = kmeans( codes, 6) k_m$cluster plot(clust1) plot(clust2) segments1 = cutree(clust1, 4 ) segments2 = cutree(clust2, 4 ) clust2$height clust2$order clust2$method clust2$call clust2$dist.method # Colour palette definition pretty_palette <- c("#1f77b4", '#ff7f0e', '#2ca02c','#d62728', '#9467bd', '#8c564b', '#e377c2') plot(m, bgcol = pretty_palette[segments1]) add.cluster.boundaries(m, segments1) plot(m, bgcol = pretty_palette[segments2], main = 'Clusters') add.cluster.boundaries(m, segments2) plot(m, bgcol = pretty_palette[k_m$cluster], main = 'Clusters') plot(clust2) coolBlueHotRed <- function(n, alpha = 1) {rainbow(n, end=4/6, alpha=alpha)[n:1]} l = 1:ncol(m$codes[[1]]) for (i in l) { plot( m, type = "property", property = m$codes[[1]][,i], main = colnames(m$codes[[1]])[i], palette.name = coolBlueHotRed ) } mydata <- m$codes [[1]] wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) for (i in 2:15) { wss[i] <- sum(kmeans(mydata, centers=i)$withinss) } plot(wss) n.elbow = 6 ## use hierarchical clustering to cluster the codebook vectors som_cluster <- cutree(hclust(dist(m$codes[[1]])), n.elbow) # plot these results: add.cluster.boundaries(m, som_cluster)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.