#Install devtools,Rcpp(install again-some issue) and roxygen2 package if you don't already have it
#R.version
# installing/loading the package:
#install.packages("~/Desktop/devtools_1.13.6.tgz", repos = NULL, type = .Platform$pkgType)
if(!require(devtools)) { install.packages("devtools"); require(devtools)}
if(!require(Rcpp)) { install.packages("Rcpp"); require(Rcpp)}
if(!require(roxygen2)) {install.packages("roxygen2"); require(roxygen2)}
if(!require(shiny)) {install.packages("shiny"); require(shiny)}
library(Rcpp)
library(devtools)
library(roxygen2)
library(shiny)
# global items
# check if pkgs are installed already, if not, install automatically:
# (http://stackoverflow.com/a/4090208/1036500)
list.of.packages <- c("ggplot2",
"DT",
"GGally",
"psych",
"MASS")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# load all these
lapply(list.of.packages, require, character.only = TRUE)
#install.packages("devtools")
#library(devtools)
install_github("cran/latticeExtra")
install_github("sjmgarnier/viridisLite")
install_github("sjmgarnier/viridis")
install.packages("Hmisc")
install_github("cran/bit")
install_github("cran/ff")
install_github("cran/ffbase")
install_github("mtennekes/tabplot")
library(Hmisc)
shiny_gsk <- function(options = c("pca", "none")){
if (options=="none") {return(NULL)}
else if(options=="pca"){
server <- function(input, output) {
# read in the CSV
the_data_fn <- reactive({
inFile <- input$file1
if (is.null(inFile)) return(NULL)
the_data <- read.csv(inFile$datapath, header = (input$header == "Yes"),
sep = input$sep, quote = input$quote, stringsAsFactors=FALSE)
return(the_data)
})
# tableplot
output$tableplot <- renderPlot({
if(is.null(the_data_fn())) return()
the_data <- the_data_fn()
tabplot::tableplot(the_data)
})
# display a table of the CSV contents
output$contents <- DT::renderDataTable({
#
the_data_fn()
})
# display a summary of the CSV contents
output$summary <- renderTable({
the_data <- the_data_fn()
psych::describe(the_data)
})
# Check boxes to choose columns
output$choose_columns_biplot <- renderUI({
the_data <- the_data_fn()
colnames <- names(the_data)
# Create the checkboxes and select them all by default
checkboxGroupInput("columns_biplot", "Choose up to five columns to display on the scatterplot matrix",
choices = colnames,
selected = colnames[1:5])
})
# corr plot
output$corr_plot <- renderPlot({
the_data <- the_data_fn()
# Keep the selected columns
columns_biplot <- input$columns_biplot
the_data_subset_biplot <- the_data[, columns_biplot, drop = FALSE]
ggpairs(the_data_subset_biplot)
})
# corr tables
output$corr_tables <- renderTable({
the_data <- the_data_fn()
# we only want to show numeric cols
the_data_num <- the_data[,sapply(the_data,is.numeric)]
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
res <- Hmisc::rcorr(as.matrix(the_data_num))
cormat <- res$r
pmat <- res$P
ut <- upper.tri(cormat)
df <- data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor = (cormat)[ut],
p = pmat[ut]
)
with(df, df[order(-cor), ])
})
output$bartlett <- renderPrint({
the_data <- the_data_fn()
the_data_num <- na.omit(the_data[,sapply(the_data,is.numeric)])
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
cortest.bartlett(cor(the_data_num), n = nrow(the_data_num))
})
output$kmo <- renderPrint({
the_data <- the_data_fn()
the_data_num <- the_data[,sapply(the_data,is.numeric)]
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
# R <- cor(the_data_num)
# KMO(R)
# http://www.opensubscriber.com/message/r-help@stat.math.ethz.ch/7315408.html
# KMO Kaiser-Meyer-Olkin Measure of Sampling Adequacy
kmo = function( data ){
library(MASS)
X <- cor(as.matrix(data))
iX <- ginv(X)
S2 <- diag(diag((iX^-1)))
AIS <- S2%*%iX%*%S2 # anti-image covariance matrix
IS <- X+AIS-2*S2 # image covariance matrix
Dai <- sqrt(diag(diag(AIS)))
IR <- ginv(Dai)%*%IS%*%ginv(Dai) # image correlation matrix
AIR <- ginv(Dai)%*%AIS%*%ginv(Dai) # anti-image correlation matrix
a <- apply((AIR - diag(diag(AIR)))^2, 2, sum)
AA <- sum(a)
b <- apply((X - diag(nrow(X)))^2, 2, sum)
BB <- sum(b)
MSA <- b/(b+a) # indiv. measures of sampling adequacy
AIR <- AIR-diag(nrow(AIR))+diag(MSA) # Examine the anti-image of the
# correlation matrix. That is the
# negative of the partial correlations,
# partialling out all other variables.
kmo <- BB/(AA+BB) # overall KMO statistic
# Reporting the conclusion
if (kmo >= 0.00 && kmo < 0.50){
test <- 'The KMO test yields a degree of common variance
unacceptable for FA.'
} else if (kmo >= 0.50 && kmo < 0.60){
test <- 'The KMO test yields a degree of common variance miserable.'
} else if (kmo >= 0.60 && kmo < 0.70){
test <- 'The KMO test yields a degree of common variance mediocre.'
} else if (kmo >= 0.70 && kmo < 0.80){
test <- 'The KMO test yields a degree of common variance middling.'
} else if (kmo >= 0.80 && kmo < 0.90){
test <- 'The KMO test yields a degree of common variance meritorious.'
} else {
test <- 'The KMO test yields a degree of common variance marvelous.'
}
ans <- list( overall = kmo,
report = test,
individual = MSA,
AIS = AIS,
AIR = AIR )
return(ans)
} # end of kmo()
kmo(na.omit(the_data_num))
})
# Check boxes to choose columns
output$choose_columns_pca <- renderUI({
the_data <- the_data_fn()
# Get the data set with the appropriate name
# we only want to show numeric cols
the_data_num <- na.omit(the_data[,sapply(the_data,is.numeric)])
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
colnames <- names(the_data_num)
# Create the checkboxes and select them all by default
checkboxGroupInput("columns", "Choose columns",
choices = colnames,
selected = colnames)
})
# choose a grouping variable
output$the_grouping_variable <- renderUI({
the_data <- the_data_fn()
# for grouping we want to see only cols where the number of unique values are less than
# 10% the number of observations
grouping_cols <- sapply(seq(1, ncol(the_data)), function(i) length(unique(the_data[,i])) < nrow(the_data)/10 )
the_data_group_cols <- the_data[, grouping_cols, drop = FALSE]
# drop down selection
selectInput(inputId = "the_grouping_variable",
label = "Grouping variable:",
choices=c("None", names(the_data_group_cols)))
})
pca_objects <- reactive({
# Keep the selected columns
columns <- input$columns
the_data <- na.omit(the_data_fn())
the_data_subset <- na.omit(the_data[, columns, drop = FALSE])
# from http://rpubs.com/sinhrks/plot_pca
pca_output <- prcomp(na.omit(the_data_subset),
center = (input$center == 'Yes'),
scale. = (input$scale. == 'Yes'))
# data.frame of PCs
pcs_df <- cbind(the_data, pca_output$x)
return(list(the_data = the_data,
the_data_subset = the_data_subset,
pca_output = pca_output,
pcs_df = pcs_df))
})
output$the_pcs_to_plot_x <- renderUI({
pca_output <- pca_objects()$pca_output$x
# drop down selection
selectInput(inputId = "the_pcs_to_plot_x",
label = "X axis:",
choices= colnames(pca_output),
selected = 'PC1')
})
output$the_pcs_to_plot_y <- renderUI({
pca_output <- pca_objects()$pca_output$x
# drop down selection
selectInput(inputId = "the_pcs_to_plot_y",
label = "Y axis:",
choices= colnames(pca_output),
selected = 'PC2')
})
output$plot2 <- renderPlot({
pca_output <- pca_objects()$pca_output
eig = (pca_output$sdev)^2
variance <- eig*100/sum(eig)
cumvar <- paste(round(cumsum(variance),1), "%")
eig_df <- data.frame(eig = eig,
PCs = colnames(pca_output$x),
cumvar = cumvar)
ggplot(eig_df, aes(reorder(PCs, -eig), eig)) +
geom_bar(stat = "identity", fill = "white", colour = "black") +
geom_text(label = cumvar, size = 4,
vjust=-0.4) +
theme_bw(base_size = 14) +
xlab("PC") +
ylab("Variances") +
ylim(0,(max(eig_df$eig) * 1.1))
})
# PC plot
pca_biplot <- reactive({
pcs_df <- pca_objects()$pcs_df
pca_output <- pca_objects()$pca_output
var_expl_x <- round(100 * pca_output$sdev[as.numeric(gsub("[^0-9]", "", input$the_pcs_to_plot_x))]^2/sum(pca_output$sdev^2), 1)
var_expl_y <- round(100 * pca_output$sdev[as.numeric(gsub("[^0-9]", "", input$the_pcs_to_plot_y))]^2/sum(pca_output$sdev^2), 1)
labels <- rownames(pca_output$x)
grouping <- input$the_grouping_variable
if(grouping == 'None'){
# plot without grouping variable
pc_plot_no_groups <- ggplot(pcs_df,
aes_string(input$the_pcs_to_plot_x,
input$the_pcs_to_plot_y
)) +
geom_text(aes(label = labels), size = 5) +
theme_bw(base_size = 14) +
coord_equal() +
xlab(paste0(input$the_pcs_to_plot_x, " (", var_expl_x, "% explained variance)")) +
ylab(paste0(input$the_pcs_to_plot_y, " (", var_expl_y, "% explained variance)"))
# the plot
pc_plot_no_groups
} else {
# plot with grouping variable
pcs_df$fill_ <- as.character(pcs_df[, grouping, drop = TRUE])
pc_plot_groups <- ggplot(pcs_df, aes_string(input$the_pcs_to_plot_x,
input$the_pcs_to_plot_y,
fill = 'fill_',
colour = 'fill_'
)) +
stat_ellipse(geom = "polygon", alpha = 0.1) +
geom_text(aes(label = labels), size = 5) +
theme_bw(base_size = 14) +
scale_colour_discrete(guide = FALSE) +
guides(fill = guide_legend(title = "groups")) +
theme(legend.position="top") +
coord_equal() +
xlab(paste0(input$the_pcs_to_plot_x, " (", var_expl_x, "% explained variance)")) +
ylab(paste0(input$the_pcs_to_plot_y, " (", var_expl_y, "% explained variance)"))
# the plot
pc_plot_groups
}
})
output$brush_info <- renderTable({
# the brushing function
brushedPoints(pca_objects()$pcs_df, input$plot_brush)
})
# for zooming
output$z_plot1 <- renderPlot({
pca_biplot()
})
# zoom ranges
zooming <- reactiveValues(x = NULL, y = NULL)
observe({
brush <- input$z_plot1Brush
if (!is.null(brush)) {
zooming$x <- c(brush$xmin, brush$xmax)
zooming$y <- c(brush$ymin, brush$ymax)
}
else {
zooming$x <- NULL
zooming$y <- NULL
}
})
# for zooming
output$z_plot2 <- renderPlot({
pca_biplot() + coord_cartesian(xlim = zooming$x, ylim = zooming$y)
})
output$brush_info_after_zoom <- renderTable({
# the brushing function
brushedPoints(pca_objects()$pcs_df, input$plot_brush_after_zoom)
})
output$pca_details <- renderPrint({
#
print(pca_objects()$pca_output$rotation)
summary(pca_objects()$pca_output)
})
output$Colophon <- renderPrint({
})
}
ui <- bootstrapPage(
mainPanel(
titlePanel("Interactive PCA Explorer"),
tabsetPanel(
tabPanel("Data input",
p("Before uploading your data, check that it is clean, especially ensure that the the numeric variables contain only the digits 0-9 or NA (to indicate missing data)."),
p("Rows that contain one or more NAs will be excluded from the PCA."),
p("Columns that contain a mixture of numbers and text will not be included in the computation of the PCA results."),
p("Have a look at the ", a("iris.csv", href = "https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/master/iris.csv"), " file included with this app to see what a clean CSV file looks like."),
tags$hr(),
p("Select the options that match your CSV file, then upload your file:"),
radioButtons(inputId = 'header',
label = 'Header',
choices = c('Columns have headers'='Yes',
'Columns do not have headers'='No'),
selected = 'Yes'),
radioButtons('sep', 'Separator',
c(Comma=',',
Semicolon=';',
Tab='\t'),
','),
radioButtons('quote', 'Quote',
c(None='',
'Double Quote'='"',
'Single Quote'="'"),
'"'),
tags$hr(),
fileInput('file1', 'Choose a CSV file to upload:',
accept = c(
'text/csv',
'text/comma-separated-values',
'text/tab-separated-values',
'text/plain',
'.csv',
'.tsv'
)),
p("After uploading your CSV file, click on the 'Inspect the data' tab")
), # end file tab
tabPanel("Inspect the data",
p("The tableplot below (it will take a few seconds to appear) may be useful to explore the relationships between the variables, to discover strange data patterns, and to check the occurrence and selectivity of missing values."),
plotOutput("tableplot"),
tags$hr(),
p("Here is a summary of the data"),
tableOutput('summary'),
tags$hr(),
p("Here is the raw data from the CSV file"),
DT::dataTableOutput('contents')
), # end tab
tabPanel("Correlation Plots",
uiOutput("choose_columns_biplot"),
tags$hr(),
p("This plot may take a few moments to appear when analysing large datasets. You may want to exclude highly correlated variables from the PCA."),
plotOutput("corr_plot"),
tags$hr(),
p("Summary of correlations"),
tableOutput("corr_tables")
), # end tab
tabPanel("Diagnostics",
p("Among SPSS users, these tests are considered to provide some guidelines on the suitability of the data for a principal components analysis. However, they may be safely ignored in favour of common sense. Variables with zero variance are excluded."),
tags$hr(),
p("Here is the output of Bartlett's sphericity test. Bartlett's test of sphericity tests whether the data comes from multivariate normal distribution with zero covariances. If p > 0.05 then PCA may not be very informative"),
verbatimTextOutput("bartlett"),
tags$hr(),
p("Here is the output of the Kaiser-Meyer-Olkin (KMO) index test. The overall measure varies between 0 and 1, and values closer to 1 are better. A value of 0.6 is a suggested minimum. "),
verbatimTextOutput("kmo")
), # end tab
tabPanel("Compute PCA",
p("Choose the columns of your data to include in the PCA."),
p("Only columns containing numeric data are shown here because PCA doesn't work with non-numeric data."),
p("The PCA is automatically re-computed each time you change your selection."),
p("Observations (ie. rows) are automatically removed if they contain any missing values."),
p("Variables with zero variance have been automatically removed because they're not useful in a PCA."),
uiOutput("choose_columns_pca"),
tags$hr(),
p("Select options for the PCA computation (we are using the prcomp function here)"),
radioButtons(inputId = 'center',
label = 'Center',
choices = c('Shift variables to be zero centered'='Yes',
'Do not shift variables'='No'),
selected = 'Yes'),
radioButtons('scale.', 'Scale',
choices = c('Scale variables to have unit variance'='Yes',
'Do not scale variables'='No'),
selected = 'Yes')
), # end tab
tabPanel("PC Plots",
h2("Scree plot"),
p("The scree plot shows the variances of each PC, and the cumulative variance explained by each PC (in %) "),
plotOutput("plot2", height = "300px"),
tags$hr(),
h2("PC plot: zoom and select points"),
p("Select the grouping variable."),
p("Only variables where the number of unique values is less than 10% of the total number of observations are shown here (because seeing groups with 1-2 observations is usually not very useful)."),
uiOutput("the_grouping_variable"),
tags$hr(),
p("Select the PCs to plot"),
uiOutput("the_pcs_to_plot_x"),
uiOutput("the_pcs_to_plot_y"),
tags$hr(),
p("Click and drag on the first plot below to zoom into a region on the plot. Or you can go directly to the second plot below to select points to get more information about them."),
p("Then select points on zoomed plot below to get more information about the points."),
p("You can click on the 'Compute PCA' tab at any time to change the variables included in the PCA, and then come back to this tab and the plots will automatically update."),
plotOutput ("z_plot1", height = 400,
brush = brushOpts(
id = "z_plot1Brush",
resetOnNew = TRUE)),
tags$hr(),
p("Click and drag on the plot below to select points, and inspect the table of selected points below"),
plotOutput("z_plot2", height = 400,
brush = brushOpts(
id = "plot_brush_after_zoom",
resetOnNew = TRUE)),
tags$hr(),
p("Details of the brushed points"),
tableOutput("brush_info_after_zoom")
), # end tab
tabPanel("PCA output",
verbatimTextOutput("pca_details")
)
#, # end tab
# tabPanel("Colophon",
#p("The text is licensed ", a("CC-BY", href = "http://creativecommons.org/licenses/by/4.0/"), " and the code ", a(href = "https://opensource.org/licenses/MIT", "MIT"), ".")
# ) # end tab
)))
#shinyApp(ui = ui, server = server)
viewer <- paneViewer(300)
runGadget( ui, server, viewer = viewer)
}
}
library(shiny)
library(GGally)
library(ggplot2)
library(psych)
pca <- function() {
server <- function(input, output) {
# read in the CSV
the_data_fn <- reactive({
inFile <- input$file1
if (is.null(inFile)) return(NULL)
the_data <- read.csv(inFile$datapath, header = (input$header == "Yes"),
sep = input$sep, quote = input$quote, stringsAsFactors=FALSE)
return(the_data)
})
# tableplot
output$tableplot <- renderPlot({
if(is.null(the_data_fn())) return()
the_data <- the_data_fn()
tabplot::tableplot(the_data)
})
# display a table of the CSV contents
output$contents <- DT::renderDataTable({
#
the_data_fn()
})
# display a summary of the CSV contents
output$summary <- renderTable({
the_data <- the_data_fn()
psych::describe(the_data)
})
# Check boxes to choose columns
output$choose_columns_biplot <- renderUI({
the_data <- the_data_fn()
colnames <- names(the_data)
# Create the checkboxes and select them all by default
checkboxGroupInput("columns_biplot", "Choose up to five columns to display on the scatterplot matrix",
choices = colnames,
selected = colnames[1:5])
})
# corr plot
output$corr_plot <- renderPlot({
the_data <- the_data_fn()
# Keep the selected columns
columns_biplot <- input$columns_biplot
the_data_subset_biplot <- the_data[, columns_biplot, drop = FALSE]
ggpairs(the_data_subset_biplot)
})
# corr tables
output$corr_tables <- renderTable({
the_data <- the_data_fn()
# we only want to show numeric cols
the_data_num <- the_data[,sapply(the_data,is.numeric)]
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
res <- Hmisc::rcorr(as.matrix(the_data_num))
cormat <- res$r
pmat <- res$P
ut <- upper.tri(cormat)
df <- data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor = (cormat)[ut],
p = pmat[ut]
)
with(df, df[order(-cor), ])
})
output$bartlett <- renderPrint({
the_data <- the_data_fn()
the_data_num <- na.omit(the_data[,sapply(the_data,is.numeric)])
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
cortest.bartlett(cor(the_data_num), n = nrow(the_data_num))
})
output$kmo <- renderPrint({
the_data <- the_data_fn()
the_data_num <- the_data[,sapply(the_data,is.numeric)]
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
# R <- cor(the_data_num)
# KMO(R)
# http://www.opensubscriber.com/message/r-help@stat.math.ethz.ch/7315408.html
# KMO Kaiser-Meyer-Olkin Measure of Sampling Adequacy
kmo = function( data ){
library(MASS)
X <- cor(as.matrix(data))
iX <- ginv(X)
S2 <- diag(diag((iX^-1)))
AIS <- S2%*%iX%*%S2 # anti-image covariance matrix
IS <- X+AIS-2*S2 # image covariance matrix
Dai <- sqrt(diag(diag(AIS)))
IR <- ginv(Dai)%*%IS%*%ginv(Dai) # image correlation matrix
AIR <- ginv(Dai)%*%AIS%*%ginv(Dai) # anti-image correlation matrix
a <- apply((AIR - diag(diag(AIR)))^2, 2, sum)
AA <- sum(a)
b <- apply((X - diag(nrow(X)))^2, 2, sum)
BB <- sum(b)
MSA <- b/(b+a) # indiv. measures of sampling adequacy
AIR <- AIR-diag(nrow(AIR))+diag(MSA) # Examine the anti-image of the
# correlation matrix. That is the
# negative of the partial correlations,
# partialling out all other variables.
kmo <- BB/(AA+BB) # overall KMO statistic
# Reporting the conclusion
if (kmo >= 0.00 && kmo < 0.50){
test <- 'The KMO test yields a degree of common variance
unacceptable for FA.'
} else if (kmo >= 0.50 && kmo < 0.60){
test <- 'The KMO test yields a degree of common variance miserable.'
} else if (kmo >= 0.60 && kmo < 0.70){
test <- 'The KMO test yields a degree of common variance mediocre.'
} else if (kmo >= 0.70 && kmo < 0.80){
test <- 'The KMO test yields a degree of common variance middling.'
} else if (kmo >= 0.80 && kmo < 0.90){
test <- 'The KMO test yields a degree of common variance meritorious.'
} else {
test <- 'The KMO test yields a degree of common variance marvelous.'
}
ans <- list( overall = kmo,
report = test,
individual = MSA,
AIS = AIS,
AIR = AIR )
return(ans)
} # end of kmo()
kmo(na.omit(the_data_num))
})
# Check boxes to choose columns
output$choose_columns_pca <- renderUI({
the_data <- the_data_fn()
# Get the data set with the appropriate name
# we only want to show numeric cols
the_data_num <- na.omit(the_data[,sapply(the_data,is.numeric)])
# exclude cols with zero variance
the_data_num <- the_data_num[,!apply(the_data_num, MARGIN = 2, function(x) max(x, na.rm = TRUE) == min(x, na.rm = TRUE))]
colnames <- names(the_data_num)
# Create the checkboxes and select them all by default
checkboxGroupInput("columns", "Choose columns",
choices = colnames,
selected = colnames)
})
# choose a grouping variable
output$the_grouping_variable <- renderUI({
the_data <- the_data_fn()
# for grouping we want to see only cols where the number of unique values are less than
# 10% the number of observations
grouping_cols <- sapply(seq(1, ncol(the_data)), function(i) length(unique(the_data[,i])) < nrow(the_data)/10 )
the_data_group_cols <- the_data[, grouping_cols, drop = FALSE]
# drop down selection
selectInput(inputId = "the_grouping_variable",
label = "Grouping variable:",
choices=c("None", names(the_data_group_cols)))
})
pca_objects <- reactive({
# Keep the selected columns
columns <- input$columns
the_data <- na.omit(the_data_fn())
the_data_subset <- na.omit(the_data[, columns, drop = FALSE])
# from http://rpubs.com/sinhrks/plot_pca
pca_output <- prcomp(na.omit(the_data_subset),
center = (input$center == 'Yes'),
scale. = (input$scale. == 'Yes'))
# data.frame of PCs
pcs_df <- cbind(the_data, pca_output$x)
return(list(the_data = the_data,
the_data_subset = the_data_subset,
pca_output = pca_output,
pcs_df = pcs_df))
})
output$the_pcs_to_plot_x <- renderUI({
pca_output <- pca_objects()$pca_output$x
# drop down selection
selectInput(inputId = "the_pcs_to_plot_x",
label = "X axis:",
choices= colnames(pca_output),
selected = 'PC1')
})
output$the_pcs_to_plot_y <- renderUI({
pca_output <- pca_objects()$pca_output$x
# drop down selection
selectInput(inputId = "the_pcs_to_plot_y",
label = "Y axis:",
choices= colnames(pca_output),
selected = 'PC2')
})
output$plot2 <- renderPlot({
pca_output <- pca_objects()$pca_output
eig = (pca_output$sdev)^2
variance <- eig*100/sum(eig)
cumvar <- paste(round(cumsum(variance),1), "%")
eig_df <- data.frame(eig = eig,
PCs = colnames(pca_output$x),
cumvar = cumvar)
ggplot(eig_df, aes(reorder(PCs, -eig), eig)) +
geom_bar(stat = "identity", fill = "white", colour = "black") +
geom_text(label = cumvar, size = 4,
vjust=-0.4) +
theme_bw(base_size = 14) +
xlab("PC") +
ylab("Variances") +
ylim(0,(max(eig_df$eig) * 1.1))
})
# PC plot
pca_biplot <- reactive({
pcs_df <- pca_objects()$pcs_df
pca_output <- pca_objects()$pca_output
var_expl_x <- round(100 * pca_output$sdev[as.numeric(gsub("[^0-9]", "", input$the_pcs_to_plot_x))]^2/sum(pca_output$sdev^2), 1)
var_expl_y <- round(100 * pca_output$sdev[as.numeric(gsub("[^0-9]", "", input$the_pcs_to_plot_y))]^2/sum(pca_output$sdev^2), 1)
labels <- rownames(pca_output$x)
grouping <- input$the_grouping_variable
if(grouping == 'None'){
# plot without grouping variable
pc_plot_no_groups <- ggplot(pcs_df,
aes_string(input$the_pcs_to_plot_x,
input$the_pcs_to_plot_y
)) +
geom_text(aes(label = labels), size = 5) +
theme_bw(base_size = 14) +
coord_equal() +
xlab(paste0(input$the_pcs_to_plot_x, " (", var_expl_x, "% explained variance)")) +
ylab(paste0(input$the_pcs_to_plot_y, " (", var_expl_y, "% explained variance)"))
# the plot
pc_plot_no_groups
} else {
# plot with grouping variable
pcs_df$fill_ <- as.character(pcs_df[, grouping, drop = TRUE])
pc_plot_groups <- ggplot(pcs_df, aes_string(input$the_pcs_to_plot_x,
input$the_pcs_to_plot_y,
fill = 'fill_',
colour = 'fill_'
)) +
stat_ellipse(geom = "polygon", alpha = 0.1) +
geom_text(aes(label = labels), size = 5) +
theme_bw(base_size = 14) +
scale_colour_discrete(guide = FALSE) +
guides(fill = guide_legend(title = "groups")) +
theme(legend.position="top") +
coord_equal() +
xlab(paste0(input$the_pcs_to_plot_x, " (", var_expl_x, "% explained variance)")) +
ylab(paste0(input$the_pcs_to_plot_y, " (", var_expl_y, "% explained variance)"))
# the plot
pc_plot_groups
}
})
output$brush_info <- renderTable({
# the brushing function
brushedPoints(pca_objects()$pcs_df, input$plot_brush)
})
# for zooming
output$z_plot1 <- renderPlot({
pca_biplot()
})
# zoom ranges
zooming <- reactiveValues(x = NULL, y = NULL)
observe({
brush <- input$z_plot1Brush
if (!is.null(brush)) {
zooming$x <- c(brush$xmin, brush$xmax)
zooming$y <- c(brush$ymin, brush$ymax)
}
else {
zooming$x <- NULL
zooming$y <- NULL
}
})
# for zooming
output$z_plot2 <- renderPlot({
pca_biplot() + coord_cartesian(xlim = zooming$x, ylim = zooming$y)
})
output$brush_info_after_zoom <- renderTable({
# the brushing function
brushedPoints(pca_objects()$pcs_df, input$plot_brush_after_zoom)
})
output$pca_details <- renderPrint({
#
print(pca_objects()$pca_output$rotation)
summary(pca_objects()$pca_output)
})
output$Colophon <- renderPrint({
})
}
ui <- bootstrapPage(
mainPanel(
titlePanel("Interactive PCA Explorer"),
tabsetPanel(
tabPanel("Data input",
p("Before uploading your data, check that it is clean, especially ensure that the the numeric variables contain only the digits 0-9 or NA (to indicate missing data)."),
p("Rows that contain one or more NAs will be excluded from the PCA."),
p("Columns that contain a mixture of numbers and text will not be included in the computation of the PCA results."),
p("Have a look at the ", a("iris.csv", href = "https://raw.githubusercontent.com/benmarwick/Interactive_PCA_Explorer/master/iris.csv"), " file included with this app to see what a clean CSV file looks like."),
tags$hr(),
p("Select the options that match your CSV file, then upload your file:"),
radioButtons(inputId = 'header',
label = 'Header',
choices = c('Columns have headers'='Yes',
'Columns do not have headers'='No'),
selected = 'Yes'),
radioButtons('sep', 'Separator',
c(Comma=',',
Semicolon=';',
Tab='\t'),
','),
radioButtons('quote', 'Quote',
c(None='',
'Double Quote'='"',
'Single Quote'="'"),
'"'),
tags$hr(),
fileInput('file1', 'Choose a CSV file to upload:',
accept = c(
'text/csv',
'text/comma-separated-values',
'text/tab-separated-values',
'text/plain',
'.csv',
'.tsv'
)),
p("After uploading your CSV file, click on the 'Inspect the data' tab")
), # end file tab
tabPanel("Inspect the data",
p("The tableplot below (it will take a few seconds to appear) may be useful to explore the relationships between the variables, to discover strange data patterns, and to check the occurrence and selectivity of missing values."),
plotOutput("tableplot"),
tags$hr(),
p("Here is a summary of the data"),
tableOutput('summary'),
tags$hr(),
p("Here is the raw data from the CSV file"),
DT::dataTableOutput('contents')
), # end tab
tabPanel("Correlation Plots",
uiOutput("choose_columns_biplot"),
tags$hr(),
p("This plot may take a few moments to appear when analysing large datasets. You may want to exclude highly correlated variables from the PCA."),
plotOutput("corr_plot"),
tags$hr(),
p("Summary of correlations"),
tableOutput("corr_tables")
), # end tab
tabPanel("Diagnostics",
p("Among SPSS users, these tests are considered to provide some guidelines on the suitability of the data for a principal components analysis. However, they may be safely ignored in favour of common sense. Variables with zero variance are excluded."),
tags$hr(),
p("Here is the output of Bartlett's sphericity test. Bartlett's test of sphericity tests whether the data comes from multivariate normal distribution with zero covariances. If p > 0.05 then PCA may not be very informative"),
verbatimTextOutput("bartlett"),
tags$hr(),
p("Here is the output of the Kaiser-Meyer-Olkin (KMO) index test. The overall measure varies between 0 and 1, and values closer to 1 are better. A value of 0.6 is a suggested minimum. "),
verbatimTextOutput("kmo")
), # end tab
tabPanel("Compute PCA",
p("Choose the columns of your data to include in the PCA."),
p("Only columns containing numeric data are shown here because PCA doesn't work with non-numeric data."),
p("The PCA is automatically re-computed each time you change your selection."),
p("Observations (ie. rows) are automatically removed if they contain any missing values."),
p("Variables with zero variance have been automatically removed because they're not useful in a PCA."),
uiOutput("choose_columns_pca"),
tags$hr(),
p("Select options for the PCA computation (we are using the prcomp function here)"),
radioButtons(inputId = 'center',
label = 'Center',
choices = c('Shift variables to be zero centered'='Yes',
'Do not shift variables'='No'),
selected = 'Yes'),
radioButtons('scale.', 'Scale',
choices = c('Scale variables to have unit variance'='Yes',
'Do not scale variables'='No'),
selected = 'Yes')
), # end tab
tabPanel("PC Plots",
h2("Scree plot"),
p("The scree plot shows the variances of each PC, and the cumulative variance explained by each PC (in %) "),
plotOutput("plot2", height = "300px"),
tags$hr(),
h2("PC plot: zoom and select points"),
p("Select the grouping variable."),
p("Only variables where the number of unique values is less than 10% of the total number of observations are shown here (because seeing groups with 1-2 observations is usually not very useful)."),
uiOutput("the_grouping_variable"),
tags$hr(),
p("Select the PCs to plot"),
uiOutput("the_pcs_to_plot_x"),
uiOutput("the_pcs_to_plot_y"),
tags$hr(),
p("Click and drag on the first plot below to zoom into a region on the plot. Or you can go directly to the second plot below to select points to get more information about them."),
p("Then select points on zoomed plot below to get more information about the points."),
p("You can click on the 'Compute PCA' tab at any time to change the variables included in the PCA, and then come back to this tab and the plots will automatically update."),
plotOutput ("z_plot1", height = 400,
brush = brushOpts(
id = "z_plot1Brush",
resetOnNew = TRUE)),
tags$hr(),
p("Click and drag on the plot below to select points, and inspect the table of selected points below"),
plotOutput("z_plot2", height = 400,
brush = brushOpts(
id = "plot_brush_after_zoom",
resetOnNew = TRUE)),
tags$hr(),
p("Details of the brushed points"),
tableOutput("brush_info_after_zoom")
), # end tab
tabPanel("PCA output",
verbatimTextOutput("pca_details")
)
#, # end tab
# tabPanel("Colophon",
#p("The text is licensed ", a("CC-BY", href = "http://creativecommons.org/licenses/by/4.0/"), " and the code ", a(href = "https://opensource.org/licenses/MIT", "MIT"), ".")
# ) # end tab
)))
#shinyApp(ui = ui, server = server)
viewer <- paneViewer(300)
runGadget( ui, server, viewer = viewer)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.