R/f_pca.R

#'@title calculate principle components for a dataset
#'@description This function is an extended wrapper for prcomp(). I takes a
#'  data_ls object created by f_clean_data and calculates the contribution of
#'  each variable to each principle component in percent.
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#'  list( data = <dataframe>, numericals = < vector with column names of
#'  numerical columns>)
#'@param use_boxcox_tansformed_vars boolean, Default: T
#'@param center boolean, Default: T
#'@param scale boolean, Default: T
#'@param include_ordered_categoricals boolean, Default: T
#'@return a list with the original data complemented with the principle
#'  component vector data of each observation and an object returned by prcomp()
#'  supplemented with some extra features \item{data}{dataframe} \item{pca}{pca
#'  object created by prcomp()}
#'
#'  \strong{added features of pca}: \item{cos2}{The squared rotation vectors.A
#'  value between 0 and 1 denotes the amount of contribution of a variable to a
#'  specific principle component} \item{vae}{percent variance explained}
#'  \item{contrib_abs_perc}{The absolute contribution of one variable to the
#'  variance explained by one principle component in percent. The total
#'  contibution adds up to the total contibution of the principle componaent in
#'  percent. } \item{contrib_abs_perc_reduced}{as above but variables
#'  contibuting less than 2.5 percent are grouped}
#'  \item{threshold_vae_for_pc_perc}{principle components that explain less
#'  percent variance than this threshold are dropped}
#'@details
#'\href{http://www.sthda.com/english/wiki/principal-component-analysis-how-to-reveal-the-most-important-variables-in-your-data-r-software-and-data-mining}{Blog
#'post explaining how to calculate contributions}
#' @examples
#'pca_ls = f_clean_data(mtcars) %>%
#' f_boxcox() %>%
#' f_pca()
#'
#'@rdname f_pca
#'@export
#'@seealso \code{\link[stats]{prcomp}}
f_pca = function( data_ls
                  , center = T
                  , scale = T
                  , use_boxcox_tansformed_vars = T
                  , include_ordered_categoricals = T
                  ){


   data                 = data_ls$data
   boxcox               = data_ls$boxcox_data
   categoricals         = data_ls$categoricals
   categoricals_ordered = data_ls$categoricals_ordered
   all_variables        = data_ls$all_variables
   numericals           = data_ls$numericals

   if( is_empty(numericals) ){
     stop('Numerical variables missing')
   }

   if(use_boxcox_tansformed_vars
      & ! is.null(boxcox) ) {

     data = boxcox

   } else{

     data = data %>%
       select( one_of(numericals) )
   }

   if( include_ordered_categoricals == T
       & ! is.null(data_ls$categoricals_ordered) ){

     data = data %>%
       bind_cols( data_ls$data[, categoricals_ordered] ) %>%
       mutate_all( as.character ) %>%
       mutate_all( as.numeric )

   }

   suppressWarnings({
     pca = prcomp(x        = data
                  , scale. = scale
                  , center = center) %>%
       map( f_manip_matrix_2_tibble )
   })

   pca$cos2 = pca$rotation %>%
     mutate_if( is.numeric, function(y) y^2)

   pca$contrib = pca$cos2 %>%
     mutate_if(is.numeric, function(y) y/ sum(y) *100)

   pca$vae = as_tibble (t( pca$sdev / sum(pca$sdev) *100 ) ) %>%
     mutate( row_names = 'vae' ) %>%
     select( row_names, everything() )

   colnames(pca$vae) = colnames(pca$contrib)

   # i cant find a solution for this using purr

   contrib_mat = pca$contrib %>%
     select(2:ncol(.)) %>%
     as.matrix()

   vae_mat = pca$vae %>%
     select(2:ncol(.)) %>%
     as.matrix()

   pca$contrib_abs_perc = t( apply( contrib_mat/100, 1, function(x,y) vae_mat * x ) ) %>%
     as_tibble() %>%
     mutate(row_names = pca$contrib$row_names ) %>%
     select(row_names, everything() )

   names( pca$contrib_abs_perc ) = names(pca$vae)

   pca$contrib_abs_perc = pca$contrib_abs_perc %>%
     gather(key = 'PC', value = 'value', everything(), -row_names)


   data = data_ls$data %>%
     cbind(pca$x)

  return(list(data = data, pca = pca) )

}


#' @title plot principle components as a dot plot
#' @param pca_ls list created by f_pca()
#' @param x_axis character vector, Default: 'PC1'
#' @param y_axis character vector, Default: 'PC2'
#' @param group character vector denoting the grouping variable, determines dot
#'   colour, Default: NULL

#' @return htmltools taglist containing a plotly graph and tow DT datatables
#'   will only show if printed in a rmarkdown document
#' @examples
#' \dontrun{
#' tagls = f_clean_data(mtcars) %>%
#'   f_boxcox() %>%
#'   f_pca() %>%
#'   f_pca_plot_components(group = 'cyl')
#' }
#' @rdname f_pca_plot_components
#' @export
f_pca_plot_components = function(pca_ls
                                 , x_axis = 'PC1'
                                 , y_axis = 'PC2'
                                 , group = NULL
                                 ){

  data = pca_ls$data
  pca  = pca_ls$pca


  p = ggplot(data) +
    geom_point( aes_string(x = x_axis, y = y_axis, color = group)
                ,alpha=0.4 ) +
    labs(title = 'Principle Components') +
    scale_color_brewer( palette = 'Dark2')

  #convert rotation matrix to tibble and join contribution in percent
  tib = pca$rotation %>%
    gather( key = 'PC', value = 'rotation',  - row_names) %>%
    left_join( pca$contrib_abs_perc) %>%
    rename( abs_contrib_perc = value )

  #print contrib x_axis
  dt_x = tib %>%
    filter( PC == x_axis) %>%
    arrange( desc(abs_contrib_perc) ) %>%
    select(row_names, abs_contrib_perc, rotation) %>%
    mutate_if(is.numeric, round, 2 ) %>%
    DT::datatable( caption = x_axis)


  #print contrib y_axis
  dt_y = tib %>%
    filter( PC == y_axis) %>%
    arrange( desc(abs_contrib_perc) ) %>%
    select(row_names, abs_contrib_perc, rotation) %>%
    mutate_if(is.numeric, round, 2 ) %>%
    DT::datatable( caption = y_axis)

  # print(plotly::ggplotly(p))
  tagl = htmltools::tagList()
  tagl[[1]] = plotly::ggplotly(p)
  tagl[[2]] = dt_x
  tagl[[3]] = dt_y

  return(tagl)
}

#'@title plot varaince explained of principle components
#'@param pca_ls list created by f_pca()
#'@param threshold_vae_for_pc_perc double, filter principle components that
#'  explain a lesser percentage of the variance than this threshold, Default: 2.5
#'@return plotly graph
#' @examples
#' p = f_clean_data(mtcars) %>%
#'   f_boxcox() %>%
#'   f_pca() %>%
#'   f_pca_plot_variance_explained()
#'p
#'@rdname f_pca_plot_variance_explained
#'@export
#'@importFrom forcats fct_reorder
f_pca_plot_variance_explained = function(pca_ls
                                         , threshold_vae_for_pc_perc = 2.5){

  pca = pca_ls$pca

  # filter principle components that explain less than
  # threshold_vae_for_pc_perc of the variance

  bool = pca$vae %>%
    select_if( is.numeric) %>%
    summarise_all( function(x) x > threshold_vae_for_pc_perc) %>%
    as.matrix() %>%
    as.vector()

  pca$x                        = pca$x[,bool]

  pca$contrib_abs_perc         = pca$contrib_abs_perc %>%
    filter(PC %in% colnames(pca$x)) %>%
    mutate( PC       = forcats::fct_reorder(PC, value, sum, .desc = T )
            , values = round(value,2) ) %>%
    rename( variables = row_names)


  p =   ggplot(pca$contrib_abs_perc) +
    geom_bar(aes(x = PC
                 , y = value, fill = variables)
             , stat = 'identity'
             , position='stack') +
    scale_fill_brewer(palette = 'Paired') +
    labs( title = 'Variance Explained'
          , x   = 'Principle Components'
          , y   = 'Variance Explained'
          , fill = 'Variables')

  return( plotly::ggplotly(p, tooltip= c('y', 'fill') ) )

}
erblast/oetteR documentation built on May 27, 2019, 12:11 p.m.