#'@title calculate principle components for a dataset
#'@description This function is an extended wrapper for prcomp(). I takes a
#' data_ls object created by f_clean_data and calculates the contribution of
#' each variable to each principle component in percent.
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#' list( data = <dataframe>, numericals = < vector with column names of
#' numerical columns>)
#'@param use_boxcox_tansformed_vars boolean, Default: T
#'@param center boolean, Default: T
#'@param scale boolean, Default: T
#'@param include_ordered_categoricals boolean, Default: T
#'@return a list with the original data complemented with the principle
#' component vector data of each observation and an object returned by prcomp()
#' supplemented with some extra features \item{data}{dataframe} \item{pca}{pca
#' object created by prcomp()}
#'
#' \strong{added features of pca}: \item{cos2}{The squared rotation vectors.A
#' value between 0 and 1 denotes the amount of contribution of a variable to a
#' specific principle component} \item{vae}{percent variance explained}
#' \item{contrib_abs_perc}{The absolute contribution of one variable to the
#' variance explained by one principle component in percent. The total
#' contibution adds up to the total contibution of the principle componaent in
#' percent. } \item{contrib_abs_perc_reduced}{as above but variables
#' contibuting less than 2.5 percent are grouped}
#' \item{threshold_vae_for_pc_perc}{principle components that explain less
#' percent variance than this threshold are dropped}
#'@details
#'\href{http://www.sthda.com/english/wiki/principal-component-analysis-how-to-reveal-the-most-important-variables-in-your-data-r-software-and-data-mining}{Blog
#'post explaining how to calculate contributions}
#' @examples
#'pca_ls = f_clean_data(mtcars) %>%
#' f_boxcox() %>%
#' f_pca()
#'
#'@rdname f_pca
#'@export
#'@seealso \code{\link[stats]{prcomp}}
f_pca = function( data_ls
, center = T
, scale = T
, use_boxcox_tansformed_vars = T
, include_ordered_categoricals = T
){
data = data_ls$data
boxcox = data_ls$boxcox_data
categoricals = data_ls$categoricals
categoricals_ordered = data_ls$categoricals_ordered
all_variables = data_ls$all_variables
numericals = data_ls$numericals
if( is_empty(numericals) ){
stop('Numerical variables missing')
}
if(use_boxcox_tansformed_vars
& ! is.null(boxcox) ) {
data = boxcox
} else{
data = data %>%
select( one_of(numericals) )
}
if( include_ordered_categoricals == T
& ! is.null(data_ls$categoricals_ordered) ){
data = data %>%
bind_cols( data_ls$data[, categoricals_ordered] ) %>%
mutate_all( as.character ) %>%
mutate_all( as.numeric )
}
suppressWarnings({
pca = prcomp(x = data
, scale. = scale
, center = center) %>%
map( f_manip_matrix_2_tibble )
})
pca$cos2 = pca$rotation %>%
mutate_if( is.numeric, function(y) y^2)
pca$contrib = pca$cos2 %>%
mutate_if(is.numeric, function(y) y/ sum(y) *100)
pca$vae = as_tibble (t( pca$sdev / sum(pca$sdev) *100 ) ) %>%
mutate( row_names = 'vae' ) %>%
select( row_names, everything() )
colnames(pca$vae) = colnames(pca$contrib)
# i cant find a solution for this using purr
contrib_mat = pca$contrib %>%
select(2:ncol(.)) %>%
as.matrix()
vae_mat = pca$vae %>%
select(2:ncol(.)) %>%
as.matrix()
pca$contrib_abs_perc = t( apply( contrib_mat/100, 1, function(x,y) vae_mat * x ) ) %>%
as_tibble() %>%
mutate(row_names = pca$contrib$row_names ) %>%
select(row_names, everything() )
names( pca$contrib_abs_perc ) = names(pca$vae)
pca$contrib_abs_perc = pca$contrib_abs_perc %>%
gather(key = 'PC', value = 'value', everything(), -row_names)
data = data_ls$data %>%
cbind(pca$x)
return(list(data = data, pca = pca) )
}
#' @title plot principle components as a dot plot
#' @param pca_ls list created by f_pca()
#' @param x_axis character vector, Default: 'PC1'
#' @param y_axis character vector, Default: 'PC2'
#' @param group character vector denoting the grouping variable, determines dot
#' colour, Default: NULL
#' @return htmltools taglist containing a plotly graph and tow DT datatables
#' will only show if printed in a rmarkdown document
#' @examples
#' \dontrun{
#' tagls = f_clean_data(mtcars) %>%
#' f_boxcox() %>%
#' f_pca() %>%
#' f_pca_plot_components(group = 'cyl')
#' }
#' @rdname f_pca_plot_components
#' @export
f_pca_plot_components = function(pca_ls
, x_axis = 'PC1'
, y_axis = 'PC2'
, group = NULL
){
data = pca_ls$data
pca = pca_ls$pca
p = ggplot(data) +
geom_point( aes_string(x = x_axis, y = y_axis, color = group)
,alpha=0.4 ) +
labs(title = 'Principle Components') +
scale_color_brewer( palette = 'Dark2')
#convert rotation matrix to tibble and join contribution in percent
tib = pca$rotation %>%
gather( key = 'PC', value = 'rotation', - row_names) %>%
left_join( pca$contrib_abs_perc) %>%
rename( abs_contrib_perc = value )
#print contrib x_axis
dt_x = tib %>%
filter( PC == x_axis) %>%
arrange( desc(abs_contrib_perc) ) %>%
select(row_names, abs_contrib_perc, rotation) %>%
mutate_if(is.numeric, round, 2 ) %>%
DT::datatable( caption = x_axis)
#print contrib y_axis
dt_y = tib %>%
filter( PC == y_axis) %>%
arrange( desc(abs_contrib_perc) ) %>%
select(row_names, abs_contrib_perc, rotation) %>%
mutate_if(is.numeric, round, 2 ) %>%
DT::datatable( caption = y_axis)
# print(plotly::ggplotly(p))
tagl = htmltools::tagList()
tagl[[1]] = plotly::ggplotly(p)
tagl[[2]] = dt_x
tagl[[3]] = dt_y
return(tagl)
}
#'@title plot varaince explained of principle components
#'@param pca_ls list created by f_pca()
#'@param threshold_vae_for_pc_perc double, filter principle components that
#' explain a lesser percentage of the variance than this threshold, Default: 2.5
#'@return plotly graph
#' @examples
#' p = f_clean_data(mtcars) %>%
#' f_boxcox() %>%
#' f_pca() %>%
#' f_pca_plot_variance_explained()
#'p
#'@rdname f_pca_plot_variance_explained
#'@export
#'@importFrom forcats fct_reorder
f_pca_plot_variance_explained = function(pca_ls
, threshold_vae_for_pc_perc = 2.5){
pca = pca_ls$pca
# filter principle components that explain less than
# threshold_vae_for_pc_perc of the variance
bool = pca$vae %>%
select_if( is.numeric) %>%
summarise_all( function(x) x > threshold_vae_for_pc_perc) %>%
as.matrix() %>%
as.vector()
pca$x = pca$x[,bool]
pca$contrib_abs_perc = pca$contrib_abs_perc %>%
filter(PC %in% colnames(pca$x)) %>%
mutate( PC = forcats::fct_reorder(PC, value, sum, .desc = T )
, values = round(value,2) ) %>%
rename( variables = row_names)
p = ggplot(pca$contrib_abs_perc) +
geom_bar(aes(x = PC
, y = value, fill = variables)
, stat = 'identity'
, position='stack') +
scale_fill_brewer(palette = 'Paired') +
labs( title = 'Variance Explained'
, x = 'Principle Components'
, y = 'Variance Explained'
, fill = 'Variables')
return( plotly::ggplotly(p, tooltip= c('y', 'fill') ) )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.