# R output pre blocks are styled by default to indicate output knitr::opts_chunk$set(comment = NA) # shorthand for rd_link() - see ?packagedocs::rd_link for more information rdl <- function(x) packagedocs::rd_link(deparse(substitute(x)))
muvis is a visualization and analysis toolkit for multivariate datasets. To use this package, you will need the R statistical computing environment (version 3.0 or later).
muvis can be installed through github:
devtools::install_github("bAIo-lab/muvis") library(muvis)
# devtools::install_github("bAIo-lab/muvis", dependencies=F) library(muvis)
library(dplyr) library(kableExtra)
We'll use National Health and Nutrition Examination Survey (NHANES), 2005-2006 dataset to work with muvis
. The original dataset contains more than 7,000 variables and nearly 10,000 samples which we selected 4461 samples and 161 variables, including Id variable ("SEQN"), continuous (i.e., laboratory measurements), and categorical variables (i.e., questions). For more details about variable names and samples visit website- https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/25504/summary
data("NHANES") NHANES$SEQN <- NULL
The dataset has some missing values. We'll use data_preproc
to impute missing values and to specify categorical and continuous variables (Setting level = 15). The first 74 columns are continuous.
data <- data_preproc(NHANES, levels = 15, detect.outliers = T, alpha = .4) kableExtra::kable(head(data)) %>% kable_styling() %>% scroll_box(width = "100%", height = "200px")
Use ggm
to construct a Gaussian Graphical Model with glasso and significance test method. (Levels is set to default (NULL) as the data has been preprocessed using data_preproc
.)
ggm(data[,1:74], significance = 0.05, rho = 0.1, community = TRUE, methods = c("glasso", "sin"), plot = T, levels = NULL) -> g g$communities[1:10]
Employ min_forest
with BIC to construct a mixed-interaction model fitting the data.
mf <- min_forest(data, stat = "BIC", community = T, plot = F, levels = NULL) mf$network mf$betweenness[1:10]
Cluster the first 200 samples of data using min.forest
with community = TRUE. Before running function transpose the data and convert the values to numeric (Because the community detection applies on the columns of the dataset).
t_nhanes <- as.data.frame(sapply(as.data.frame(t(data[1:200, ])), function(x) as.numeric(as.character(x)))) cluster_mf <- min_forest(t_nhanes, plot = T)
Use dim_reduce
function with tsne method to plot the first 200 samples in the 2 dimensional space and also color the points based on communities resulted in the above minimal forest graph (cluster_mf).
communities <- cluster_mf$communities communities <- communities[match(c(1:200), as.integer(names(communities)))] dim_reduce(data[1:200,], method = "tsne", annot1 = as.factor(communities), annot1.name = "minimal forest communities")
Construct a causal network of continuous-valued variables of data using dgm
. (dtype parameter is set to 'gaussian' to run on continuous variables)
nhanes_dgm <- dgm(data, dtype = "gaussian", alpha = 1e-15, plot = F) nhanes_dgm$network
Use graph_vis
to find communities of an arbitrary graph object and plot them. (The graph "mf$graph" here is the one resulted by minforest
function.)
Here we plot just the two first communities.
mf.val <- graph_vis(mf$graph, directed = F, community = T, betweenness = T, plot = T, plot.community = T, plot.community.list = c(1,2))
Calculate variable-wise Kullback-Leibler divergence between two groups of samples g1 and g2. The first group is people watching TV less than an hour a day, and nobody smokes in their home. In contrast, the second one consists of people who watch television for more than 5 hours a day and smoke.
Use VKL
function to find the most different variables between the two groups.
g1 <- which(data$PAD590 == 1 & data$SMD410 == 2) g2 <- which(data$PAD590 == 6 & data$SMD410 == 1) KL <- VKL(data, group1 = g1, group2 = g2, permute = 100) KL[1:5, ]
Use VVKL
to find the most deferentially variables between people with relatively high Total Cholesterol (relative to vitamin E) and people with high Vitamin E (relative to Total Cholesterol).
"LBXTC" is Total Cholesterol and "LBXVIE" is Vitamin E. Here we use the function to find just the categorical variables.
"DSD010" is a questions about Dietary Supplements.
edges <- mf$network$x$edges lbxtc_edges <- edges[edges$from == "LBXTC" | edges$to == "LBXTC", ] lbxtc_edges[1:5, ] VVKL(data[, 75:160], var1 = data$LBXTC, var2 = data$LBXVIE, plot = T, var1.name = 'LBXTC', var2.name = 'LBXVIE', permute = 100, levels = NULL) -> KL kableExtra::kable(head(KL$kl, n = 5)) %>% kable_styling() %>% scroll_box(width = "300px", height = "300px") KL$plot
Use plot_assoc
for different visualizations.
levels
is set to 15 (As the data has been preprocessed by data_preproc
it can be NULL too.)
Histogram for "PAD600" Question (about number of hours use computer last month. 0: less than hour, 1: one hour, ..., 5: five hour, 6: None):
plot_assoc(data, vars = c("PAD600"), levels = 15, interactive = F)
Density plot for "RIDAGEYR" (age):
plot_assoc(data, vars = c("RIDAGEYR"), levels = 15, interactive = F)
Boxplot of "LBXTHG" (indicating total mercury amount in the blood (ug/L)) for different answers of "DRD360" (fish eaten during past 30 days. 1: Yes, 2: No, 3: Refused, 4: Don't know):
edges[edges$from == "DRD360" | edges$to == "DRD360", ] plot_assoc(data, vars = c("LBXTHG", "DRD360"), levels = 15, interactive = F)
Relative histogram of "DSD010" question (about taking any dietary supplements. 1: yes, 2: no, 3: refused, 4: don't know) vs "SMD410" question (about smoking in the home. 1: yes, 2: no):
edges[edges$from == 'SMD410'| edges$to == 'SMD410', ] plot_assoc(data, vars = c("DSD010", "SMD410"))
Scatter plot of "LBXHGB" (hemoglobin amount in the blood (g/dL)) and "LBXSIR" (refrigerated Iron in the blood (ug/dL)):
lbxhgb_edges <- edges[edges$from == "LBXHGB" | edges$to == "LBXHGB", ] lbxhgb_edges[1:5, ] plot_assoc(data, vars = c("LBXHGB", "LBXSIR"), levels = 15, interactive = F)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.