In erblast/oetteR: Collection of personal R functions

knitr::opts_chunk$set(echo = F, warning = F, message = F, eval = T)

library(knitr)
library(ISLR)
library(tabplot)
library(tidyverse)
library(tabplot)
library(randomForest)
library(stringr)
library(GGally)
library(forcats)
library(caret)
library(ggcorrplot)
library(Amelia)
library(e1071)

{.tabset}

Data{.tabset}

Load Data

wellPanel(

  selectInput('sql_or_lib'
              , label = 'Select Data Source'
              , choices = c('SQL', 'Sample_Data')
              , selected = 'Sample_Data')

  , selectInput('sample_data'
              , label = 'Select Sample Data'
              , choices = c('ISLR::OJ'
                            , 'ISLR::Caravan'
                            , 'ISLR::Auto'
                            , 'ISLR::Carseats'
                            , 'ISLR::Wage'
                            , 'ISLR::College'
                            , 'ISLR::Hitters'
                            , 'ISLR::Default'
                            , 'ISLR::Weekly'
                            , 'ILSR::Wage categoricals'
                            , 'ISLR::OJ numericals'
                            , 'data diamonds')
              , selected = 'ISLR::Wage')

  # , textInput('server'
  #             , label = 'Enter Server'
  #             , width = 'auto')
  # 
  # , textInput('query'
  #             , label = 'Paste SQL'
  #             , width = '100%')


  , textInput('path'
            , label = 'Save Analysis to this directory'
            , width = '100%'
            , value = "C:/Users/erbla/Documents")

)

inputPanel(

  actionButton('load_data'
               , label = 'Load Data')

  , checkboxInput('save'
                  , label = 'Save Results'
                  , value = F)

)

rea_load = eventReactive(input$load_data,{

  if(input$sql_or_lib == 'SQL'){

    require(RODBCext)

    con = odbcDriverConnect(
                             str_c('Driver=SQL Server;Trusted_Connection=Yes;Server=', input$server)
         )

    query = input$query

    data = sqlQuery(con, query = query)

    close(con)

  }  

  else{

    if(input$sample_data == 'ILSR::Wage categoricals'){

      data = ISLR::Wage

      bool = ! summarise_all(data, is.numeric)

      data= data[,bool]

    }else if(input$sample_data == 'ISLR::OJ numericals'){

      data = ISLR::OJ %>%
        as.data.frame()

      bool = summarise_all(data, is.numeric)


      data= data[,as.logical(bool[1,])]

    }else if(startsWith(input$sample_data,'data')) {

      set = input$sample_data %>%
        stringr::str_extract(' [A-Za-z0-9]+$')%>%
        stringr::str_trim() 

      exec_str = set %>%
        stringr::str_c('data(',.,')')

      eval(parse(text = exec_str) )

      exec_str = set 

      data = eval(parse(text = exec_str) )

    }else{
      exec_str = input$sample_data

      data = eval(parse(text = exec_str) )
    }


  }

  return(data)

})

save_plot = function(plot
                     , path
                     , name
                     , yes
                     , scale = 1
                     , excel = T){

  require(tidyverse)

  if(yes == F) return()

  folder_name = lubridate::today() %>%
    as.character() %>%
    stringr::str_replace_all('-','')%>%
    stringr::str_c('_multiview')

  path = path %>%
    stringr::str_c('/',folder_name)

  dir.create(path)

  file_name_plot = path %>%
    stringr::str_c('/',name,'.png')

  file_name_excel = path %>%
    stringr::str_c('/',name,'.xls')

  # tabplot object should save with the same function


  safe_ggsave    = safely(ggsave)
  safe_tableSafe = safely(tabplot::tableSave)

  s = safe_ggsave(file_name_plot
           , plot = plot
           , scale = scale)

  if( !purrr::is_empty(s$error) ){

    print(s$error)
    s = safe_tableSafe(tab = plot
                         ,filename = file_name_plot
                         , scale = scale)

  }


  if( !purrr::is_empty(s$error) ){

    print(s$error)

    plot

    dev.copy(png, filename = file_name_plot
            )
    dev.off()

  }




  if(excel == T){
    coords = ggplot_build(plot)

  if(file.exists(file_name_excel)) {
    file.remove(file_name_excel)
  }

  purrr::pwalk(list( x = coords$data
                    , sheetName = as.character(1:length(coords$data) ) 
                    )
              , xlsx::write.xlsx
              , file = file_name_excel
              , col.names = T
              , row.names = T
              , append = T
              )

  }

}

Sample View

DT::renderDataTable( options = list( pageLength = 5)

,{

  d_clean = rea_clean()

  data          = d_clean$data
  all_variables = d_clean$all_variables

  return( data[ ,all_variables] )
})

Summary

Numericals

renderPrint({

  d_clean = rea_clean()

  data          = d_clean$data
  numericals    = d_clean$numericals

    print(summary( data[, numericals]))
})

Categoricals

renderPrint({

  d_clean = rea_clean()

  data          = d_clean$data
  categoricals    = d_clean$categoricals

  print(summary( data[, categoricals]))
})

Missing Values

renderPlot({

  d_clean = rea_clean()

  data             = d_clean$data
  all_variables    = d_clean$all_variables

  p = Amelia::missmap( as_data_frame(data)[, all_variables])

  return(p)
})

Clean Data

inputPanel(

  numericInput('max_no_lvls_fctr'
               , label = 'Max number of levels for categorical variables'
               , value = 10
               , min = 1
               , step = 1)

  , numericInput('min_no_vals_num'
               , label = 'Minimum number of distinct values for numerical variables'
               , value = 6
               , min = 1
               , step = 1)

  , checkboxInput('missing'
                  , label = 'Exclude Missing Values'
                  , value = F)

)

renderUI({
  data = rea_load()

  checkboxGroupInput('deselect_cols'
                     , label = 'Select Variables to exclude'
                     , choices = names(data)
                     )

})



rea_clean = reactive({

  data = rea_load()

  if(! purrr::is_empty(input$deselect_cols) ) {
    data = data %>%
      select( - one_of(input$deselect_cols) )
  }

  d_clean = f_clean_data(data
                       ,  as.numeric(input$max_no_lvls_fctr)
                       ,  as.numeric(input$min_no_vals_num)
                        )


  return(d_clean)

})


f_clean_data = function(data
                       , max_number_of_levels_factors = 10
                       , min_number_of_levels_nums = 6){


  require(tidyverse)
  require(stringr)
  require(caret)
  require(forcats)

  data = data %>%
    as_tibble()

  numericals = names(data)[ map_lgl( data, is.numeric)]

  categoricals = names(data)[ ! map_lgl( data, is.numeric)]

  all_variables = names(data)


  # add numericals with less than x distinc values to factors

  no_unique_vals = map_dbl(data[, numericals], function(x) length(unique(x)) )

  categoricals = c(categoricals, numericals[no_unique_vals <  min_number_of_levels_nums] )

  numericals = numericals[ !numericals %in% categoricals ]


  # convert all non numericals and grouping variables to factors

  for (var in categoricals ){


    data[[var]] = as.factor(data[[var]])

  }

  # collapse smallest levels into one if factor levels exceed size x


  no_levels = map_dbl( data[, categoricals], function(x) length( levels(x) ) )  

  for (var in names( no_levels[no_levels > max_number_of_levels_factors] )) {

    data[[var]] = fct_lump(data[[var]], n = max_number_of_levels_factors)
  }

  # Boxcox transform numericals

  if(!is_empty(numericals)){

    data_box_cox = data[, numericals] %>%
      mutate_all( function(x) x + min(x) + 0.00001 )

    trans = preProcess( as.data.frame(data_box_cox), c('BoxCox'))

    pred = predict(trans, as.data.frame(data_box_cox) )

    names_transformed = names(pred) %>%
      str_c('_boxcox')

    names(pred) = names_transformed

    data = data %>%
      bind_cols(pred) 

    if(!ncol(data[,numericals]) 
         == ncol(data[,names_transformed])) {
      stop( 'boxcox transformation unsucessfull')
    }

  } else {
    names_transformed = NULL
  }
  # drop categoricals with only one level

  if( !is_empty(categoricals)){

    data = data %>%
      mutate_at( vars(one_of(categoricals) ) 
                 , fct_drop )

    no_lvl = data[,categoricals] %>%
      summarise_all( function(x) list(levels(x)) ) %>%
      summarise_all( function(x) length(x[[1]]) )

    only_one_lvl = categoricals[no_lvl == 1]

    if(length(only_one_lvl) > 0 ) {
      data = data %>%
        select( - one_of(only_one_lvl) )

      all_variables = all_variables[ !all_variables %in% only_one_lvl ]
      categoricals = categoricals[ no_lvl > 1]

    }
  } 

  # drop observations with missing Values

  if(input$missing == T) data = data[complete.cases(data),]

  # return statement

  return( list( data = data
                  , numericals = numericals
                  , categoricals = categoricals
                  , all_variables = all_variables
                  , boxcox = names_transformed
                )
  )
}

Analysis{.tabset}

Group Statistics{.tabset}

renderUI({

  d_clean = rea_clean()
  categoricals = d_clean$categoricals

  inputPanel(

  selectInput("group_stats", label = "select grouping variable",
              choices = categoricals, selected = categoricals[1])
  )

})

f_bring_to_pos_range = function(x){


  if( min(x)< 0) x = x + abs(min(x))


  return(x)
}

f_diff_of_means_medians = function(df, group, variable){


  data = df %>%
    select( group = one_of(group), variable = one_of(variable) )%>%
    mutate ( variable = f_bring_to_pos_range(variable) ) %>%
    group_by( group ) %>%
    summarise( means     = mean(variable,   na.rm = T)
               , medians = median(variable, na.rm = T) ) %>%
    ungroup() %>%
    summarise( diff_of_means        = max(means) - min(means)
               , diff_of_means_perc = ( ( max(means) - min(means) ) /max(means) ) *100
               , diff_of_medians      = max(medians) - min(medians)
               , diff_of_medians_perc = ( ( max(medians) - min(medians) ) /max(means) ) *100    )


  return(data)
}

f_max_diff_of_freq = function(df, var1, var2){

  t = table( df[[var1]], df[[var2]] ) %>%
    as_tibble() %>%
    group_by( Var1 ) %>% 
    mutate ( diff_var1          = ( max(n)-min(n) ) 
               , diff_var1_perc = ( ( max(n)-min(n))/ max(n) *100) 
             ) %>%
    group_by( Var2 ) %>%
    mutate ( diff_var2          = ( max(n)-min(n) ) 
               , diff_var2_perc = ( ( max(n)-min(n))/ max(n) *100) 
              ) %>%
    ungroup()%>%
    summarise( max_diff_freq          = max( c(diff_var1, diff_var2) )
               , max_diff_freq_perc  = max( c(diff_var1_perc, diff_var2_perc) ) 
    )
}

f_anova_stats = function(df, group, variables) {
  # returns a dataframe with anova stats
  # df        : dataframe
  # group     : grouping variable as character vector, must indicate factor variable
  # variables : numerical variables to be analyzed as character vector, must indicate numerical variable

  data = df

  formula = stringr::str_c('value~',group) %>%
    as.formula()

  df_anova = data %>%
    as_tibble() %>%
    select( one_of( c(group, variables) ) ) %>%
    gather(key = 'variable', value = 'value', one_of( variables ) ) %>%
    group_by( variable ) %>%
    nest( one_of(group), value) %>%
    mutate( model_anova     = purrr::map( data, ~aov( formula, data = .)) 
            , summary_anova = purrr::map( model_anova, summary)
            , summary_anova = purrr::map( summary_anova
                                          , function(x) x[[1]])
            , anova_pval    = purrr::map(summary_anova, 'Pr(>F)')
            , anova_pval    = purrr::map_dbl(anova_pval
                                             , function(x) x[1])
            , model_kruskal = purrr::map( data, ~kruskal.test(formula, data = .) )
            , kruskal_pval  = purrr::map_dbl(model_kruskal, 'p.value')
            , model_shapiro = purrr::map(data
                                      , function(x) shapiro.test(sample(x$value, 5000, replace = T) ) )
            , shapiro_stat  = purrr::map_dbl(model_shapiro,'statistic')
            , shapiro_pval  = purrr::map_dbl(model_shapiro,'p.value') 
            , diff_df       = purrr::map( data, f_diff_of_means_medians, group = group, variable = 'value') ) %>%
    unnest(diff_df) %>%
    select(variable
           , shapiro_stat
           , shapiro_pval
           , anova_pval
           , kruskal_pval
           , diff_of_means
           , diff_of_means_perc
           , diff_of_medians
           , diff_of_medians_perc
           )

  return(df_anova)
}

f_chi_square = function(df, group, variables) {
  # returns a dataframe with anova stats
  # df        : dataframe
  # group     : grouping variable as character vector, must indicate factor variable
  # variables : numerical variables to be analyzed as character vector, must indicate factor variable

  data = df

  variables = variables[!variables == group]

  if(purrr::is_empty(variables)) return()

  df_chi = data %>%
    as_tibble() %>%
    select( one_of( c(group, variables) ) ) %>%
    gather(key = 'variable', value = 'value', one_of( variables  ) ) %>%
    group_by( variable ) %>%
    nest( one_of( group ), value) %>%
    mutate(  model_chi = purrr::map( data, ~chisq.test(x = .[[group]]
                                                       , y = .[['value']]
                                                       ) )
            ,chi_pval  = purrr::map_dbl(model_chi, 'p.value')
            ,diff_df   = purrr::map(data, f_max_diff_of_freq, group, 'value')  ) %>%
    unnest(diff_df) %>%
    select(variable, chi_pval, max_diff_freq, max_diff_freq_perc)

  return(df_chi)
}

rea_group_stats = reactive({

  d_clean    = rea_clean()
  data       = d_clean$data
  boxcox     = d_clean$boxcox
  numericals = d_clean$numericals
  categoricals = d_clean$categoricals

  if(! is_empty(numericals)) {


    df_num    = f_anova_stats(df = data
                             ,group = input$group_stats
                             ,variables = numericals
                             )

    df_boxcox = f_anova_stats(df = data
                              ,group = input$group_stats
                              ,variables = boxcox
                              )

    sig_num = df_num %>%
      mutate( p_val = ifelse(shapiro_stat >= 0.9
                             , anova_pval
                             , kruskal_pval) 
              ) %>%
      filter(p_val <= 0.001) %>%
      .[['variable']]

    sig_boxcox = df_boxcox %>%
      mutate( p_val = ifelse(shapiro_stat >= 0.9
                             , anova_pval
                             , kruskal_pval) 
              ) %>%
      filter(p_val <= 0.001) %>%
      .[['variable']]


  } else {

    df_num     = NULL
    df_boxcox  = NULL
    sig_num    = NULL
    sig_boxcox = NULL

  }



  # only works with minimum two categoricals vars
  if( length(categoricals) >= 2 ){

    df_chi    = f_chi_square(df = data
                              ,group = input$group_stats
                              ,variables = categoricals
                             )

    sig_chi = df_chi %>%
      filter(chi_pval <= 0.001) %>%
      .[['variable']]


  } else{

    df_chi  = NULL
    sig_chi = NULL
  }

  return( list( df = list(df_num     = df_num
                          ,df_boxcox = df_boxcox
                          ,df_chi    = df_chi
                          )
                , sig = list(sig_num     = sig_num
                           ,sig_boxcox = sig_boxcox
                           ,sig_chi    = sig_chi
                           ) 
                ) 
          )

})

Numericals (ANOVA)

render_table_num_group_stat = function( boxcox = F) {

  DT::renderDataTable(
   extensions = 'Buttons'
   , options = list( dom       = 'Bftrip'
                    , buttons = c('copy', 'excel')
                    , pageLength = 1
                    )
  ,{

    group = input$group_stats

    if( boxcox == F ){ 

      df = rea_group_stats()$df$df_num

    }else{

      df = rea_group_stats()$df$df_boxcox

    }

      df = DT::datatable(df
                         , extensions = 'Buttons'
                         , options    = list( dom       = 'Bftrip'
                                              , buttons = c('copy', 'excel')
                                              )
                     ) %>%
      DT::formatStyle('shapiro_stat'
                      , color = DT::styleInterval(0.9, c('red','green') )
                      ) %>%
      DT::formatStyle(3:5
                      , color = DT::styleInterval(0.01, c('green','red') )
                      ) %>%
      DT::formatRound(2:5, 6) %>%
      DT::formatRound(c(6,8), 3) %>%
      DT::formatRound(c(7,9), 2)

  })

}

render_table_num_group_stat( boxcox = F )

Numericals boxcox (ANOVA)

render_table_num_group_stat( boxcox = T )

Categoricals (Chi-Square)

DT::renderDataTable({

  d_clean    = rea_clean()
  data       = d_clean$data
  boxcox     = d_clean$boxcox
  numericals = d_clean$numericals
  categoricals = d_clean$categoricals


  group = input$group_stats

  df = rea_group_stats()$df$df_chi 

  df = DT::datatable(df
                     , extensions = 'Buttons'
                     , options    = list( dom = 'Bftrip'
                                          , buttons = c('copy', 'excel')
                                          )
                     ) %>%
    DT::formatStyle('chi_pval'
                    , color = DT::styleInterval(0.01, c('green','red') )
                    ) %>%
    DT::formatRound( 2, 6)   %>%
    DT::formatRound( 4, 2)

  return(df)

})

Predictive Capacity

renderUI({

  d_clean = rea_clean()
  categoricals = d_clean$categoricals
  numericals   = d_clean$numericals

  inputPanel(

    selectInput("group_imp"
                , label = "select predicted variable"
                , choices = c(categoricals, numericals)
                , selected = categoricals[1])

  )

})

rea_imp = reactive({

  d_clean = rea_clean()
  data = d_clean$data
  all_variables = d_clean$all_variables

  form = as.formula( stringr::str_c(input$group_imp,'~.') )

  m = randomForest::randomForest(form, data[, all_variables] )

  imp = tibble( mean_decrease_gini = m$importance[,1]
                ,variable = names(m$importance[,1])
                ,group = input$group_imp) %>%
    arrange(desc(mean_decrease_gini))

  return(imp)

})

renderPlot({

  imp = rea_imp()

  p = ggplot(imp) +
    geom_bar(aes(x = fct_reorder(variable
                             , mean_decrease_gini) 
                , y = mean_decrease_gini
                , fill = fct_reorder(variable
                                 , mean_decrease_gini
                                 , .desc = T)
                )
            , stat = 'identity'
            , show.legend = F) +
        coord_flip() +
        labs( x = 'Variable'
              , y = 'Predictive capacity (Mean decrease in Gini)')


  name = stringr::str_c( input$sql_or_lib,'_predictive_capacity_', imp[[1,'group']] )

  save_plot(plot = p
            , path = input$path
            , name = name
            , yes = input$save
            )

  return(p)

})

Decision Tree

renderUI({

  d_clean = rea_clean()
  categoricals = d_clean$categoricals
  numericals   = d_clean$numericals

  inputPanel(

    selectInput("group_tree"
                , label    = "select predicted variable"
                , choices  = c(categoricals, numericals)
                , selected = categoricals[1]

                )

    , numericInput('min_split'
                  ,label = 'minimum node size to attempt split'
                  , min   = 2
                  , max   = 10000
                  , step  = 1
                  , value = 20
                  )

    , numericInput('max_depth'
                  , label = 'maximum tree depth'
                  , min   = 2
                  , max   = 30
                  , step  = 1
                  , value = 30
                  )

    , numericInput('cp'
                   ,label = 'minimum complexity reduction to attempt split'
                   , min   = 0.0001
                   , max   = 0.99
                   , step  = 0.001
                   , value = 0.01
                   )

    , numericInput('xval'
                   ,label = 'k-fold cross validation'
                   , min   = 0
                   , max   = 100
                   , step  = 1
                   , value = 10
                   )

  )

})

rea_tree = reactive({

  d_clean = rea_clean()
  data = d_clean$data

  pred = input$group_tree

  form = stringr::str_c(pred, '~.') %>%
    as.formula()

  m = rpart::rpart(form
                   , data
                   , minsplit = input$min_split
                   , cp       = input$cp
                   #, maxcompete = 4
                   #, maxsurrogate = 5
                   #, usesurrogate = 2
                   #, xval = 10
                   #, surrogatestyle = 0
                   , maxdepth = input$max_depth
                   , xval     = input$xval)

  return(m)

})


renderPlot({

  m = rea_tree()

  rpart.plot::prp(m
                  , branch.type   = 5
                  , box.palette   ="RdYlGn"
                  , faclen        = 0
                  , extra         = 6
                  , fallen.leaves = input$fallen_leaves
                  , tweak         = input$tweak
                  , gap           = input$gap
                  , space         = input$space
                  )

  name = stringr::str_c( input$sql_or_lib,'_tree1_', input$group_tree )

  save_plot(plot = p
            , path = input$path
            , name = name
            , yes = input$save
            , excel = F
            )
})

renderPlot({

  m = rea_tree()

  rpart.plot::rpart.plot(m
                  , fallen.leaves = T
                  , tweak         = input$tweak
                  , gap           = input$gap
                  , space         = input$space
                  )

  name = stringr::str_c( input$sql_or_lib,'_tree2_', input$group_tree )

  save_plot(plot = p
            , path = input$path
            , name = name
            , yes = input$save
            , excel = F
            )
})


inputPanel(

checkboxInput('fallen_leaves'
              , label = 'fallen_leaves'
              , value = T)

, numericInput('tweak'
              ,label = 'increase_text_size'
              , min   = 0.1
              , max   = 10
              , step  = 0.1
              , value = 1.2
              )

, numericInput('gap'
              , label = 'gap'
              , min   = 0
              , max   = 100
              , step  = 1
              , value = 2
              )

, numericInput('space'
              , label = 'space'
              , min   = 0
              , max   = 100
              , step  = 1
              , value = 2
               )
)

Principle Component Analysis

Plot Principle Components

Berechnent die ersten beiden Hauptkomponenten der numerisch- kontinuierlichen Variablen.

rea_pca = reactive({

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  if(is_empty(numericals) | is_empty(boxcox)) return()

  if(input$boxcox_pca == T) {

    vars = boxcox

  }
  else{

    vars = numericals
  }

  pca = prcomp(x = select(data, one_of(vars) )
               , scale. = input$scale_pca
               , center = input$center_pca)

  pca$cos2 = pca$rotation^2

  pca$contrib = lmap( as_tibble(pca$cos2 ), function(y) y/ sum(y) *100 )
  row.names(pca$contrib) = row.names(pca$cos2)

  pca$vae = as_tibble (t( pca$sdev / sum(pca$sdev) *100 ) )
  colnames(pca$vae) = colnames(pca$contrib)

  pca$contrib_abs_perc = t( t( apply( pca$contrib/100, 1, function(x,y) x*pca$vae ) ) )
  pca$contrib_abs_perc = unnest( as.data.frame( pca$contrib_abs_perc) )
  row.names(pca$contrib_abs_perc) = row.names(pca$contrib)

  pca$contrib_abs_perc = as.data.frame( t(pca$contrib_abs_perc) )
  pca$contrib_abs_perc$var = row.names(pca$contrib_abs_perc)

  pca$contrib_abs_perc = pca$contrib_abs_perc %>%
    gather(key = 'key', value = 'value', everything(), -var) 

  #group variables with less than 2.5% contribution
  pca$contrib_abs_perc_reduced = pca$contrib_abs_perc  %>%
    mutate(key = ifelse(value < 2.5
                        , ' sum contrib < 2.5%'
                        , key)) %>%
    group_by( var, key) %>%
    summarise( value = sum(value) )


  # filter principle components that explain less than
  # 2.5% of the variance

  pca$x                        = pca$x[, pca$vae > 2.5]

  pca$contrib_abs_perc         = pca$contrib_abs_perc %>%
    filter(var %in% colnames(pca$x))

  pca$contrib_abs_perc_reduced = pca$contrib_abs_perc_reduced %>%
    filter(var %in% colnames(pca$x))

  data = data %>%
    cbind(pca$x)

  return(list(data = data, pca = pca) )

})

renderUI({

  inputPanel(


     checkboxInput('center_pca', label = 'center', value = T)  

    , checkboxInput('scale_pca', label = 'scale', value = T)  

    , checkboxInput('boxcox_pca', label = 'boxcox', value = T)  



  )

})

Plot Principle Components

# rotation plot
renderPlot({

  data = rea_pca()$data
  pca  = rea_pca()$pca

  group  = input$group_pca
  x_axis = input$x_axis_pca
  y_axis = input$y_axis_pca


  if(input$group_pca == 'None') group = NULL 

  p = ggplot(data) +
    geom_point( aes_string(x = x_axis, y = y_axis, color = group)
                ,alpha=0.4 ) +
    labs(title = 'plot1')


  name = stringr::str_c( input$sql_or_lib,'_principle_component_')%>%
    stringr::str_c( input$group_pca ) %>%
    stringr::str_c('_boxcox', input$boxcox_pca) %>% 
    stringr::str_c('_scale',  input$scale_pca) %>% 
    stringr::str_c('_center', input$center_pca) 

  save_plot(plot = p
        , path = input$path
        , name = name
        , yes = input$save
        )

  return( p )

})

renderUI({

  d_clean       = rea_clean()
  d_pca         = rea_pca()
  data          = d_pca$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  prin_comp = select(data, starts_with('PC', ignore.case = F)) %>%
    names()

  inputPanel(


    selectInput("x_axis_pca"
                , label = "select x_axis"
                , choices = prin_comp 
                , selected = prin_comp[2]
                )

    , selectInput("y_axis_pca"
                , label = "select y_axis"
                , choices = prin_comp 
                , selected = prin_comp[1]
                )

    , selectInput("group_pca"
                , label = "select grouping variable"
                , choices = c(categoricals, 'None')
                , selected = categoricals[1]
                )

  )

})

Plot Variance Explained

# variance explained

renderPlot({

  pca = rea_pca()$pca

  vae = tibble( value = pca$sdev
                , pca_n = str_c('pca', 1:length(pca$sdev))
            ) 

  p =   ggplot(pca$contrib_abs_perc_reduced) +
    geom_bar(aes(x = fct_reorder(var, value, sum, .desc = T )
                 , y = value, fill = key)
             , stat = 'identity'
             , position='stack') +
    scale_fill_brewer(palette = 'Paired') 

  name = stringr::str_c( input$sql_or_lib,'_variance_explained_')%>%
    stringr::str_c( input$group_pca ) %>%
    stringr::str_c('_boxcox', input$boxcox_pca) %>% 
    stringr::str_c('_scale',  input$scale_pca) %>% 
    stringr::str_c('_center', input$center_pca)  

    save_plot(plot = p
          , path = input$path
          , name = name
          , yes = input$save
          )

  return(p)

})

Correlation

rea_corr = reactive({

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  if(is_empty(boxcox)) return()

  corr = round( cor(data[, boxcox]), 1)

  p_val_corr = ggcorrplot::cor_pmat( data[, boxcox] )

  return( list(corr = corr,
               p_val_corr = p_val_corr) )

})

renderPlot(width = 1024, height = 1024, {

  corr = rea_corr()$corr
  p_val_corr = rea_corr()$p_val_corr

  p = ggcorrplot::ggcorrplot( corr
                              ,hc.order = TRUE
                              , type = "lower"
                              , lab = TRUE
                              , p.mat = p_val_corr
                              )

  name = stringr::str_c( input$sql_or_lib,'_correlation')

  save_plot(plot = p
        , path = input$path
        , name = name
        , yes = input$save
        )

  return(p)

})

Data Visualisation{.tabset}

Tabplot

preselect = c('all'
              ,'none'
              ,'categoricals'
              ,'numericals'
              ,'top_10_importance'
              ,'boxcox'
              ,'none_boxcox'
              ,'pc1+pc2'
              ,'pc1+pc2 contrib >2.5%'
              ,'correlation'
              ,'group_stat_P<0.001'
              ,'2fac_2num'
              )

renderUI({

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  inputPanel(


      sliderInput('n_breaks_tab'
                  , label = "Number of bins:"
                  , min = 1
                  , max = 500
                  , step = 1
                  , value = 30)

    , selectInput("sort_col_tab"
                  , label = "Sort_by"
                  , choices = names(data) 
                  , selected = numericals[1])

    , checkboxInput('decreasing_tab'
                    , label = 'decreasing'
                    , value = T)  

    , sliderInput('max_levels_tab'
                  , label = "Maximum number of levels for categorical variables"
                  , min = 3
                  , max = 30
                  , step = 1
                  , value = 10)

    , selectInput("preselect", label = "preselect"
                  ,choices = preselect 
                  ,selected = 'top_10_importance')


    , selectInput("preselect_correlation", label = "select_correlation"
                  ,choices = boxcox 
                  ,selected = boxcox[1])



  )

})
renderPlot( {

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  imp = rea_imp()

  p = tabplot::tableplot(data
                , select_string = input$select_vars_tab
                , sortCol       = input$sort_col_tab
                , decreasing    = input$decreasing_tab
                , nBins         = input$n_breaks_tab
                , max_levels    = input$max_levels_tab
                , scales = 'lin')

  index_str = stringr::str_c( which(all_variables %in% input$select_vars_tab)
                              , collapse = '+'  )

  name = stringr::str_c( input$sql_or_lib,'_tabplot_')%>%
  stringr::str_c('sort_', input$sort_col_tab, '_') %>% 
  stringr::str_c( index_str) 

  save_plot(plot = p
        , path = input$path
        , name = name
        , yes = input$save
        , excel = F
        )

  return(p)

  })


chk_bx_grp_ui = function(x,y, name, label) { renderUI({


  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  imp        = rea_imp()
  p_val_corr = rea_corr()$p_val_corr
  pca        = rea_pca()$pca

  if(input[[x]] == 'all')               preselect = names(data)
  if(input[[x]] == 'none')              preselect = NULL
  if(input[[x]] == 'top_10_importance') preselect = imp$variable[1:10]
  if(input[[x]] == 'boxcox')            preselect = boxcox
  if(input[[x]] == 'none_boxcox')       preselect = names(data)[!names(data) %in% boxcox]
  if(input[[x]] == 'numericals')        preselect = numericals
  if(input[[x]] == 'categoricals')      preselect = categoricals

  if(input[[x]] == 'group_stat_P<0.001') {

    sig = rea_group_stats()$sig

    #remove boxcox that are also in numericals
    sig_boxcox = sig$sig_boxcox %>%
      stringr::str_extract('^[A-Za-z]*') %>%
      .[!. %in% sig$sig_num] %>%
      stringr::str_c('_boxcox')

    preselect = stringr::str_c( c(sig$sig_num, sig_boxcox, sig$sig_chi ) )
  }

  if(input[[x]] == 'correlation') {

    correlating = p_val_corr %>%
      as_tibble() %>%
      mutate( vars = names(.) ) %>%
      select( filter_var = one_of(input[[y]]), vars) %>%
      filter( filter_var <= 0.05  ) %>%
      select( vars )


    preselect = correlating$vars
  }

  if( startsWith( input[[x]], 'pc1+pc2')) {

    if(endsWith( input[[x]], '>2.5%')){

      pca_1_2 = pca$contrib_abs_perc_reduced %>%
        filter(var %in% c('PC1', 'PC2') )

      preselect = pca_1_2$key

    }else {

      pca_1_2 = pca$contrib_abs_perc %>%
        filter(var %in% c('PC1', 'PC2') )

      preselect = pca_1_2$key

    }

  }


  if(input[[x]] == '2fac_2num')  {
    preselect = c(numericals[1], categoricals[1]
                  ,numericals[2], categoricals[2]
                  ,numericals[3], categoricals[3]
                  ,numericals[4], categoricals[4]
                  )[1:4]
  }


  checkboxGroupInput( name
                    , label = label
                    , choices = names(data)
                    , selected = preselect
                    , inline = F )
 })
}

chk_bx_grp_ui('preselect', 'preselect_correlation', name = 'select_vars_tab', label = 'Select Variables')

ggduo

renderUI({

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  inputPanel(

    selectInput("preselect_duo_col"
                , label = "Preselect Column"
              ,choices = preselect 
              ,selected = '2fac_2num')

    , selectInput("preselect_duo_row"
                , label = "Preselect Row"
              ,choices = preselect 
              ,selected = '2fac_2num')


  )
})

renderUI({

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  inputPanel(

   selectInput("preselect_corr_duo_col"
                , label = "Select Correlation Column"
                ,choices = boxcox 
                ,selected = boxcox[1])

  , selectInput("preselect_corr_duo_row"
                , label = "Select Correlation Row"
                ,choices = boxcox 
                ,selected = boxcox[1])

  )

})

rea_duo = eventReactive(input$but_duo
                        , {
                          return( list(col    = input$select_vars_duo_col
                                       ,row   = input$select_vars_duo_row
                                       ,group = input$group_duo
                                       ,corr  = input$corr_duo) )
                        } )


renderPlot({

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals

  ret   = rea_duo()
  col   = ret$col
  row   = ret$row
  group = ret$group
  corr  = ret$corr

  if(group == 'None') group = NULL

  lm_with_cor <- function(data, mapping, ..., method = "pearson") {
    x <- data[[deparse(mapping$x)]]
    y <- data[[deparse(mapping$y)]]
    cor <- cor(x, y, method = method)
    GGally::ggally_smooth_lm(data, mapping, ...) +
      ggplot2::geom_label(
        data = data.frame(
          x = min(x, na.rm = TRUE),
          y = max(y, na.rm = TRUE),
          lab = round(cor, digits = 3)
        ),
        mapping = ggplot2::aes(x = x, y = y, label = lab, color = NULL),
        hjust = 0, vjust = 1,
        size = 5, fontface = "bold"
      )
  }


  if(corr == T)  {  
    p =GGally::ggduo(
      data, col, row,
      mapping = aes_string(color = group),
      types = list(continuous = GGally::wrap(lm_with_cor, alpha = 0.25)),
      showStrips = FALSE
    ) +
      theme(legend.position = "bottom")
  }

  else{
    p = GGally::ggduo(
      data, col, row,
      mapping = aes_string(color = group),
      #types = list(continuous = wrap(lm_with_cor, alpha = 0.25)),
      showStrips = FALSE
    ) +
      theme(legend.position = "bottom")
  }

  index_str_row = stringr::str_c( which(all_variables %in% row)
                              , collapse = '+'  )

  index_str_col = stringr::str_c( which(all_variables %in% col)
                              , collapse = '+'  )

  name = stringr::str_c( input$sql_or_lib,'_ggduo')%>%
  stringr::str_c('_sort', input$sort_col_tab ) %>% 
  stringr::str_c( '_col', index_str_col) %>%
  stringr::str_c( '_row', index_str_row) 

  save_plot(plot = p
        , path = input$path
        , name = name
        , yes = input$save
        , excel = F
        )

  return(p)
})

renderUI({

  d_clean       = rea_clean()
  data          = d_clean$data
  boxcox        = d_clean$boxcox
  categoricals  = d_clean$categoricals
  all_variables = d_clean$all_variables
  numericals    = d_clean$numericals


  inputPanel(

    selectInput("group_duo"
                , label = "select grouping variable"
                , choices = c(categoricals, 'None')
                , selected = categoricals[length(categoricals)]
                )

    , checkboxInput('corr_duo'
                    , label = 'show corr coef'
                    , value = T)  


    , actionButton('but_duo', 'Render Plot')

    #, cellArgs = list(width = '50%')
  )



})

# chk_bx_grp_ui has its own gui rendering function

splitLayout(


   chk_bx_grp_ui('preselect_duo_col', 'preselect_corr_duo_col'
                   , name = 'select_vars_duo_col'
                   , label = 'Select Variables Column')


   , chk_bx_grp_ui('preselect_duo_row', 'preselect_corr_duo_row'
                   , name = 'select_vars_duo_row'
                   , label = 'Select Variables Row')


 )

Histograms

Histogram A

input_panel_histo = function(suffix) {

  renderUI({

    d_clean       = rea_clean()
    data          = d_clean$data
    boxcox        = d_clean$boxcox
    categoricals  = d_clean$categoricals
    all_variables = d_clean$all_variables
    numericals    = d_clean$numericals



    inputPanel(

      selectInput(str_c("n_breaks", suffix)
                  , label = "Number of bins:"
                  , choices = c(10, 20, 30, 50, 75, 100, 150, 200, 300, 500, 750, 1000)
                  , selected = 30
                  )

      , selectInput(str_c("variable", suffix)
                    , label = "Select variable"
                    , choices = names(data) 
                    , selected = numericals[1]
                    )

      , selectInput(str_c( "group", suffix)
                    , label = "Select grouping variable"
                    , choices = c(categoricals , 'None')
                    , selected = categoricals[1]
                    )

      , selectInput(str_c("graph_type", suffix)
                    , label = "Graph Type"
                    , choices = c('bar', 'line', 'violin') 
                    , selected = 'line')

      , selectInput(str_c("y_axis", suffix)
                    , label = "Y-Axis"
                    , choices = c('density', 'counts') 
                    , selected = 'density')

      , checkboxInput(str_c( 'auto_range', suffix )
                      , label = 'use automatic range'
                      , value = T)  

      , textInput(str_c('x_min', suffix)
                  , label = 'X-min'
                  , value = '0')

      , textInput(str_c('x_max', suffix)
                  , label = 'X-max'
                  , value = '1000')

    )

  })
}

renderPlot_histo = function( suffix ){


  renderPlot({

    d_clean       = rea_clean()
    data          = d_clean$data
    boxcox        = d_clean$boxcox
    categoricals  = d_clean$categoricals
    all_variables = d_clean$all_variables
    numericals    = d_clean$numericals


    # get input

    n_breaks   = input[[ str_c("n_breaks"   , suffix) ]]
    variable   = input[[ str_c("variable"   , suffix) ]]
    group      = input[[ str_c( "group"     , suffix) ]]
    graph_type = input[[ str_c("graph_type" , suffix) ]]
    y_axis     = input[[ str_c("y_axis"     , suffix) ]]
    auto_range = input[[ str_c( 'auto_range', suffix) ]]
    x_min      = input[[ str_c('x_min'      , suffix) ]]
    x_max      = input[[ str_c('x_max'      , suffix) ]]


    #y-axis
    if(y_axis == 'density') {
      y_axis = '..density..'
    }
    else{
      y_axis = '..count..'
    }

    #group

    if(group == 'None') group = NULL 

    # numericals ----------------------------------------------------------------------------

    numericals = c(numericals, boxcox)

    #geom_freqpoly 
    if(variable %in% numericals & graph_type == 'line'){

      p = data %>%
      ggplot() +
        geom_freqpoly( aes_string(x = variable, y = y_axis, color = group)
                       , bins =  as.numeric(n_breaks))
    }


     #geom_histo 
    if(variable %in% numericals & graph_type == 'bar' ){


      p = data %>%
      ggplot() +
        geom_histogram( aes_string(x = variable, y = y_axis, fill = group)
                       , bins =  as.numeric(n_breaks), alpha = 0.6
                       , position = 'identity')
    }

    #geom_violin 
    if(variable %in% numericals & graph_type == 'violin'){

      medians = data %>%
        group_by_(as.symbol(group)) %>%
        select( one_of(numericals ) ) %>%
        summarise_all(  median )


      p = data %>%
      ggplot() +
        geom_violin( aes_string(x = group
                                , y = variable
                                , fill = group)
                      ) +
        geom_crossbar( data = medians,
                       mapping = aes_string(x = group
                                  , y = variable
                                  , ymin = variable
                                  , ymax = variable) )
    }


    # add x range

    if(variable %in% numericals & auto_range == F & !graph_type == 'violin'){

      p = p +
        xlim( c( as.numeric(x_min), as.numeric(x_max)) )
    }

    # add y range

    if(variable %in% numericals & auto_range == F & graph_type == 'violin'){

      p = p +
        ylim( c( as.numeric(x_min), as.numeric(x_max)) )
    }


    # categoricals ----------------------------------------------------------------------------

    #geom_bar
    if(variable %in% categoricals ){

      if(y_axis == '..density..')   y_axis = '..prop..'

      p = data %>%
      ggplot() +
        geom_bar( aes_string(x = variable, y = y_axis, fill = group, group = group)
                       , position = 'dodge')
      }

  y_axis_str = stringr::str_extract_all(y_axis
                                        , '[A-Za-z]') %>%
    unlist()%>%
    stringr::str_c(collapse = '')

  name = stringr::str_c( input$sql_or_lib, '_')%>%
  stringr::str_c(suffix ) %>% 
  stringr::str_c('_', graph_type ) %>%
  stringr::str_c('_', y_axis_str)  %>%
  stringr::str_c( '_group', group) %>%
  stringr::str_c( '_var', variable) %>%
  stringr::str_c( '_', n_breaks)

  save_plot(plot = p
        , path = input$path
        , name = name
        , yes = input$save
        )

    return(p)

    })
}

Histogram B

input_panel_histo('histo_b')

renderPlot_histo('histo_b')

Histogram C

input_panel_histo('histo_c')

renderPlot_histo('histo_c')

Histogram D

input_panel_histo('histo_d')

renderPlot_histo('histo_d')

Clustering{.tabset}

# som can use contineous and categorical variables
# categorical variables need to be converted to a
# class matrix and added as an extra layer


# variables

data = diamonds
map_dimension = 20

# prepare data

numerics = summarise_all( data, is.numeric ) %>%
  as.logical()

numerics

factors = names(data)%>%
  .[!numerics]

numerics = names(data)%>%
  .[numerics]


fct_to_matrix = function(data, factor_vec){

  name = names(factor_vec)

  mat = kohonen::classvec2classmat( factor_vec )

  names(mat) = name

  return(mat)

}

data_list = list()
distances = vector()

for (fac in factors){

  data_list[[fac]] = kohonen::classvec2classmat( data[[fac]] )

  distances = c(distances, 'tanimoto')

}

data_list[['numerics']] = scale(data[,numerics])
distances = c( distances, 'euclidean')


# create a grid onto which the som will be mapped
som_grid = kohonen::somgrid(xdim = 20
                            , ydim=20
                            , topo="hexagonal")



m = kohonen::supersom( data_list
                 , grid=som_grid
                 , rlen=100
                 , alpha = 0.05
                 , whatmap = c(factors, 'numerics')
                 , dist.fcts = distances
                 #, maxNA.fraction = .5
                  )



plot(m, type="changes")

plot(m, type="counts")

plot(m, type="dist.neighbours")

plot(m, type="codes")

plot(m, type="quality")

# cluster map segments

codes = tibble( layers = names(m$codes)
                ,codes = m$codes ) %>%
  mutate( codes = purrr::map(codes, as_tibble) ) %>%
  spread( key = layers, value = codes) %>%
  apply(1, bind_cols) %>%
  .[[1]] %>%
  as_tibble()


dist_m = dist(codes) %>%
  as.matrix()

# generate distance map based on position on map

# map_dimension = 3

max_dist = Inf # max(dist_m)

dist_on_map = tibble( r   = rep(1:map_dimension, map_dimension)
              , c = rep(1:map_dimension, map_dimension) %>% sort()
              ) %>%
  dist() %>%
  as.matrix() %>%
  # distance for direct neighbours will be 1 and for
  # diagonal neighbours 1.41
  apply( 1:2, function(x) ifelse(x>= 1.5, max_dist, x) ) %>% 
  apply( 1:2, function(x) ifelse(x == max_dist, max_dist, 1) ) 


# multiply nodes distance with distance on map
# this will set all distances between none neighbouring
# points to infinity

dist_adjusted = dist_m * dist_on_map 


dist_m = as.dist(dist_m)

clust1 = hclust(dist_m, method = 'single')

clust2 = hc(dist_adjusted, method = 'average')

k_m = kmeans( codes, 6)

k_m$cluster

plot(clust1)
plot(clust2)

segments1 = cutree(clust1, 4 )
segments2 = cutree(clust2, 4 )


clust2$height
clust2$order
clust2$method
clust2$call
clust2$dist.method

# Colour palette definition
pretty_palette <- c("#1f77b4", '#ff7f0e', '#2ca02c','#d62728', '#9467bd', '#8c564b', '#e377c2')

plot(m, bgcol = pretty_palette[segments1])
add.cluster.boundaries(m, segments1)

plot(m, bgcol = pretty_palette[segments2], main = 'Clusters')
add.cluster.boundaries(m, segments2)

plot(m, bgcol = pretty_palette[k_m$cluster], main = 'Clusters')



plot(clust2)

coolBlueHotRed <- function(n, alpha = 1) {rainbow(n, end=4/6, alpha=alpha)[n:1]}

l = 1:ncol(m$codes[[1]])
for (i in l) {
  plot( m, type = "property", property = m$codes[[1]][,i],
    main = colnames(m$codes[[1]])[i], palette.name =
      coolBlueHotRed   )
}


mydata <- m$codes [[1]]
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) 
for (i in 2:15) {
  wss[i] <- sum(kmeans(mydata, centers=i)$withinss)
}
plot(wss)

n.elbow = 6

## use hierarchical clustering to cluster the codebook vectors
som_cluster <- cutree(hclust(dist(m$codes[[1]])), n.elbow)
# plot these results:
add.cluster.boundaries(m, som_cluster)

erblast/oetteR documentation built on May 27, 2019, 12:11 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

erblast/oetteR
Collection of personal R functions

In erblast/oetteR: Collection of personal R functions

{.tabset}

Data{.tabset}

Load Data

Sample View

Summary

Numericals

Categoricals

Missing Values

Clean Data

Analysis{.tabset}

Group Statistics{.tabset}

Numericals (ANOVA)

Numericals boxcox (ANOVA)

Categoricals (Chi-Square)

Predictive Capacity

Decision Tree

Principle Component Analysis

Plot Principle Components

Plot Principle Components

Plot Variance Explained

Correlation

Data Visualisation{.tabset}

Tabplot

ggduo

Histograms

Histogram A

Histogram B

Histogram C

Histogram D

Clustering{.tabset}

R Package Documentation

Browse R Packages

We want your feedback!

erblast/oetteR Collection of personal R functions

In erblast/oetteR: Collection of personal R functions

{.tabset}

Data{.tabset}

Load Data

Sample View

Summary

Numericals

Categoricals

Missing Values

Clean Data

Analysis{.tabset}

Group Statistics{.tabset}

Numericals (ANOVA)

Numericals boxcox (ANOVA)

Categoricals (Chi-Square)

Predictive Capacity

Decision Tree

Principle Component Analysis

Plot Principle Components

Plot Principle Components

Plot Variance Explained

Correlation

Data Visualisation{.tabset}

Tabplot

ggduo

Histograms

Histogram A

Histogram B

Histogram C

Histogram D

Clustering{.tabset}

R Package Documentation

Browse R Packages

We want your feedback!

erblast/oetteR
Collection of personal R functions