#' An R6 class representing a matrix dataset.
#'
#' EDAMatrix is a helper class for wrapping data matrices, with optional
#' support for row and column datadata. Methods are provided for common
#' exploratory data analysis summary statistics, transformations, and
#' visualizations.
#'
#' @section Usage:
#' ```
#' edm <- EDAMatrix$new(mat, row_mdata=row_mdata_df, row_color='some_var')
#' edm$summary()
#'
#' edm$plot_pca()
#'
#' edm$t$subsample(100)$plot_heatmap()
#' ```
#'
#' @section Arguments:
#' - `dat`: An m x n dataset.
#' - `row_mdata`: A matrix or data frame with rows corresponding to the row
#' names of `dat`
#' - `col_mdata`: A matrix or data frame with rows corresponding to the
#' column names of `dat`
#' - `row_names`: Column name or number containing row identifiers. If set to
#' `rownames` (default), row names will be used as identifiers.
#' - `col_names`: Column name or number containing column identifiers. If set to
#' `colnames` (default), column names will be used as identifiers.
#' - `row_mdata_row_names`: Column name or number containing row metadata row
#' identifiers. If set to `rownames` (default), row names will be used
#' as identifiers.
#' - `col_mdata_row_names`: Column name or number containing col metadata row
#' identifiers. If set to `rownames` (default), row names will be used
#' as identifiers.
#' - `row_color`: Row metadata field to use for coloring rowwise plot elements.
#' - `row_shape`: Row metadata field to use for determine rowwise plot
#' element shape.
#' - `row_label`: Row metadata field to use when labeling plot points or
#' other elements.
#' - `col_color`: Column metadata field to use for coloring columnwise plot elements.
#' - `col_shape`: Column metadata field to use for determine columnwise plot
#' element shape.
#' - `col_label`: Column metadata field to use when labeling plot points or
#' other elements.
#' - `color_pal`: Color palette to use for relevant plotting methods
#' (default: `Set1`).
#' - `title`: Text to use as a title or subtitle for plots.
#' - `ggplot_theme`: Default theme to use for ggplot2 plots
#' (default: `theme_bw`).
#'
#' @section Fields:
#' - `dat`: Underlying data matrix
#' - `row_mdata`: Dataframe containing row metadata
#' - `col_mdata`: Dataframe containing column metadata
#'
#' @section Methods:
#' - `clear_cache()`: Clears EDAMatrix cache.
#' - `clone()`: Creates a copy of the EDAMatrix instance.
#' - `cluster_tsne(num_clusters=10, ...)`: Clusters rows in dataset using a combination
#' of t-SNE and k-means clustering.
#' - `detect_col_outliers(num_sd=2, ctend='median', meas='pearson')`:
#' Measures average pairwise similarities between all columns in the dataset.
#' Outliers are considered to be those columns who mean similarity to
#' all other columns is greater than `num_sd` standard deviations from the
#' average of averages.
#' - `detect_row_outliers(num_sd=2, ctend='median', meas='pearson')`:
#' Measures average pairwise similarities between all rows in the dataset.
#' Outliers are considered to be those rows who mean similarity to
#' all other rows is greater than `num_sd` standard deviations from the
#' average of averages.
#' - `feature_cor()`: Detects dependencies between column metadata entries
#' (features) and dataset rows.
#' - `filter_col_outliers(num_sd=2, ctend='median', meas='pearson')`:
#' Removes column outliers from the dataset. See `detect_col_outliers()`
#' for details of outlier detection approach.
#' - `filter_row_outliers(num_sd=2, ctend='median', meas='pearson')`:
#' Removes row outliers from the dataset. See `detect_row_outliers()`
#' for details of outlier detection approach.
#' - `filter_cols(mask)`: Accepts a logical vector of length `ncol(obj$dat)`
#' and returns a new EDAMatrix instance with only the columns associated
#' with `TRUE` values in the mask.
#' - `filter_rows(mask)`: Accepts a logical vector of length `nrow(obj$dat)`
#' and returns a new EDAMatrix instance with only the rowsumns associated
#' with `TRUE` values in the mask.
#' - `impute(method='knn')`: Imputes missing values in the dataset and stores
#' the result _in-place_. Currently only k-Nearest Neighbors (kNN)
#' imputation is supported.
#' - `log(base=exp(1), offset=0)`: Log-transforms data.
#' - `log1p()`: Log(x + 1)-transforms data.
#' - `pca(...)`: Performs principle component analysis (PCA) on the dataset
#' and returns a new EDAMatrix instance of the projected data points.
#' Any additional arguements specified are passed to the `prcomp()` function.
#' - `pca_feature_cor(meas='pearson', ...)`: Measures correlation between
#' dataset features (column metadata fields) and dataset principle
#' components.
#' - `plot_cor_heatmap(meas='pearson', interactive=TRUE, ...)`: Plots a
#' correlation heatmap of the dataset.
#' - `plot_densities(color=NULL, title="", ...)`: Plots densities for each
#' column in the dataset.
#' - `plot_feature_cor(meas='pearson', color_scale=c('green', 'red')`:
#' Creates a tile plot of projected data / feature correlations. See
#' `feature_cor()` function.
#' - `plot_heatmap(interactive=TRUE, ...)`: Generates a heatmap plot of the
#' dataset
#' - `plot_pairwise_column_cors(color=NULL, title="", meas='pearson', mar=c(12,6,4,6))`:
#' Plot median pairwise column correlations for each variable (column)
#' in the dataset.
#' - `plot_pca(pcx=1, pcy=2, scale=FALSE, color=NULL, shape=NULL, title=NULL,
#' text_labels=FALSE, ...)`:
#' Generates a two-dimensional PCA plot from the dataset.
#' - `plot_tsne(color=NULL, shape=NULL, title=NULL, text_labels=FALSE, ...)`:
#' Generates a two-dimensional t-SNE plot from the dataset.
#' - `print()`: Prints an overview of the object instance.
#' - `subsample(row_n=NULL, col_n=NULL, row_ratio=NULL, col_ratio=NULL)`:
#' Subsamples dataset rows and/or columns.
#' - `summary(markdown=FALSE, num_digits=2)`: Summarizes overall
#' characteristics of a dataset.
#' - `t()`: Transposes dataset rows and columns.
#' - `tsne(...)`: Performs T-distributed stochastic neighbor embedding (t-SNE)
#' on the dataset and returns a new EDAMatrix instance of the projected
#' data points. Any additional arguements specified are passed to the
#' `Rtsne()` function.
#' - `tsne_feature_cor(meas='pearson', ...)`: Measures correlation between
#' dataset features (column metadata fields) and dataset t-SNE projected
#' axes.
#'
#' @section Examples:
#' ```
#' library('eda')
#'
#' dat <- as.matrix(iris[,1:4])
#' row_mdata <- iris[,5,drop=FALSE]
#'
#' edm <- EDAMatrix$new(dat, row_mdata=row_mdata, row_color='Species')
#'
#' edm
#' edm$summary()
#'
#' edm$plot_pca()
#' edm$log1p()$plot_cor_heatmap()
#' edm$subsample(100)$plot_tsne()
#' ```
#'
#' @importFrom R6 R6Class
#' @export
#' @name EDAMatrix
#'
NULL
EDAMatrix <- R6::R6Class("EDAMatrix",
inherit = eda:::AbstractMatrixDataSet,
public = list(
# EDAMatrix constructor
initialize = function(dat,
row_mdata=NULL, col_mdata=NULL,
row_names='rownames', col_names='colnames',
row_mdata_row_names='rownames',
col_mdata_row_names='rownames',
row_color=NULL, row_shape=NULL, row_label=NULL,
col_color=NULL, col_shape=NULL, col_label=NULL,
color_pal='Set1', title='', ggplot_theme=theme_bw) {
# verify input data type and call parent constructor
private$check_input(dat)
# associate styles with row / column metadata, if present
row_edat <- NULL
col_edat <- NULL
if (!is.null(row_mdata)) {
row_edat <- 'row_mdata'
}
if (!is.null(col_mdata)) {
col_edat <- 'col_mdata'
}
# create EDADat instances
edats <- list('dat' = EDADat$new(dat, xid = 'x', yid = 'y',
row_names = row_names, col_names = col_names,
row_color = row_color, row_shape = row_shape,
row_label = row_label, row_edat = row_edat,
col_color = col_color, col_shape = col_shape,
col_label = col_label, col_edat = col_edat))
# add row and column metadata, if provided
if (!is.null(row_mdata)) {
# create row metadata EDADat instance
row_edat <- EDADat$new(row_mdata, row_names = row_mdata_row_names)
# determine data orientation and assign x and y axis id's
if (length(intersect(rownames(edats[['dat']]$dat), rownames(row_edat$dat))) > 0) {
row_edat$xid <- 'x'
row_edat$yid <- 'row metadata'
} else {
row_edat$xid <- 'row metadata'
row_edat$yid <- 'x'
}
edats[['row_mdata']] <- row_edat
}
if (!is.null(col_mdata)) {
# create column metadata EDADat instance
col_edat <- EDADat$new(col_mdata, row_names = col_mdata_row_names)
# determine data orientation and assign x and y axis id's
if (length(intersect(colnames(edats[['dat']]$dat), rownames(col_edat$dat))) > 0) {
col_edat$xid <- 'y'
col_edat$yid <- 'column metadata'
} else {
col_edat$xid <- 'column metadata'
col_edat$yid <- 'y'
}
edats[['col_mdata']] <- col_edat
}
super$initialize(edats, color_pal, title, ggplot_theme)
},
cluster_tsne = function(num_clusters=10, ...) {
super$cluster_tsne(key='dat', num_clusters=k, ...)
},
detect_col_outliers = function(num_sd=2, ctend=median, meas='pearson', ...) {
super$detect_col_outliers(key='dat', num_sd=num_sd, ctend=ctend, meas = meas, ...)
},
detect_row_outliers = function(num_sd=2, ctend=median, meas='pearson', ...) {
super$detect_row_outliers(key='dat', num_sd=num_sd, ctend=ctend, meas = meas, ...)
},
# Detects dependencies between column metadata entries (features) and
# dataset rows
#
# Note: If metedata is not all-numeric, than a similarity measure which
# supports categorical data (currently only 'lm') must be chosen.
feature_cor = function(meas='lm') {
if (is.null(self$col_mdata)) {
stop("Error: missing column metadata.")
}
private$compute_cross_cor('dat', 'col_mdata', meas)
},
filter_col_outliers = function(num_sd=2, ctend=median, meas='pearson') {
super$filter_col_outliers(key = 'dat', num_sd = num_sd, ctend = ctend, meas = meas)
},
filter_rows = function(mask) {
super$filter_rows(key='dat', mask=mask)
},
filter_cols = function(mask) {
super$filter_cols(key='dat', mask=mask)
},
filter_row_outliers = function(num_sd=2, ctend=median, meas='pearson') {
super$filter_row_outliers(key = 'dat', num_sd = num_sd, ctend = ctend, meas = meas)
},
impute = function(method='knn') {
super$impute(key='dat', method=method)
},
log = function(base=exp(1), offset=0) {
super$log(key = 'dat', base = base, offset = offset)
},
log1p = function() {
self$log(base = 1, offset = 1)
},
log2p = function() {
self$log(base = 2, offset = 1)
},
pca_feature_cor = function(num_dims=10, meas='pearson', ...) {
x <- self$t()$pca(...)
x$dat <- x$dat[, 1:min(ncol(x$dat), num_dims)]
x$t()$feature_cor(meas)
},
tsne_feature_cor = function(num_dims=10, meas='pearson', ...) {
x <- self$t()$tsne(...)
x$dat <- x$dat[, 1:min(ncol(x$dat), num_dims)]
x$t()$feature_cor(meas)
},
plot_pairwise_column_cors = function(color=NULL, label=NULL, title="",
meas='pearson',
mar=c(12, 6, 4, 6), ...) {
super$plot_pairwise_column_cors(key = 'dat', color = color,
label = label, title = title,
meas = meas, mar = mar, ...)
},
# Creates a tile plot of projected data / feature correlations
#
# include Vector of strings indicating metadata columns which
# should be included in the analysis.
# exclude Features (column metadata variables) to exclude from
# the analysis.
# color_scale Character vector containing colors to sue for
# low-correlation and high-correlation values
# (default: c('green', 'red')).
#
# return ggplot plot instance
plot_feature_cor = function(meas='lm', color_scale=c('green', 'red')) {
# compute feature correlations
cor_mat <- self$feature_cor(meas = meas)$edat[[1]]$dat
dat <- melt(cor_mat)
colnames(dat) <- c('dim', 'variable', 'value')
# Labels
#if (method == 'pca') {
# xlab_text <- 'Principle Components'
#} else if (method == 't-sne') {
# xlab_text <- "t-SNE dimension"
#}
xlab_text <- ''
# create plot
ggplot(dat, aes(x = dim, y = variable)) +
geom_tile(aes(fill = value)) +
geom_text(aes(label = value), size = 2, show.legend = FALSE) +
scale_fill_gradient(low = color_scale[1], high = color_scale[2]) +
private$ggplot_theme() +
theme(axis.text.x = element_text(size = 8, angle = 45,
vjust = 1, hjust = 1),
axis.text.y = element_text(size = 8)) +
xlab(xlab_text) +
ylab("Features") +
guides(fill = guide_legend("R^2"))
},
# Correlation heatmap.
#
# Generates a correlation heatmap depicting the pairwise column
# correlations in the data.
#
# meas String name of correlation measure to use.
# ... Additional arguments
#
# @seealso \code{cor} for more information about supported correlation
# methods.
plot_cor_heatmap = function(meas='pearson', interactive=TRUE, cor_args = list(), heatmap_args = list()) {
# generate correlation matrix
#cor_mat <- private$similarity(self$edat[['dat']]$dat, meas = meas, ...)
cor_args <- c(list(self$edat[['dat']]$dat, meas = meas), cor_args)
cor_mat <- do.call(private$similarity, cor_args)
# list of parameters to pass to heatmaply
params <- list(
x = cor_mat,
showticklabels = c(FALSE, FALSE),
subplot_widths = c(0.65, 0.35),
subplot_heights = c(0.35, 0.65)
)
# if metadata is availble, display along side of heatmap
if (!is.null(self$col_mdata)) {
# get metadata with variables stored as columns
if (self$edat[['dat']]$yid == self$edat[['col_mdata']]$yid) {
mdata <- self$edat[['col_mdata']]$tdat
} else {
mdata <- self$edat[['col_mdata']]$dat
}
# for heatmaps, show binary/logical variables on one side of the heatmap and
# other variables on the other side
lens <- apply(mdata, 2, function(x) {
length(unique(x))
})
binary_vars <- lens == 2
# column colors (binary variables)
if (sum(binary_vars) >= 1) {
params[['col_side_colors']] <- mdata[rownames(cor_mat),
binary_vars, drop = FALSE]
params[['subplot_heights']] <- c(0.15, 0.3, 0.55)
}
# row colors (everything else)
if (sum(!binary_vars) >= 1) {
params[['row_side_colors']] <- mdata[rownames(cor_mat),
!binary_vars, drop = FALSE]
params[['subplot_widths']] <- c(0.55, 0.3, 0.15)
}
}
# add any additional function arguments
params <- c(params, heatmap_args)
private$construct_heatmap_plot(params, interactive)
},
# Generates a heatmap plot of the dataset
#
# ... Additional arguments
plot_heatmap = function(interactive=TRUE, ...) {
# list of parameters to pass to heatmaply
params <- list(
x = self$edat[['dat']]$dat,
showticklabels = c(FALSE, FALSE),
subplot_widths = c(0.65, 0.35),
subplot_heights = c(0.35, 0.65)
)
# if metadata is availble, display along side of heatmap
if (!is.null(self$row_mdata)) {
params[['row_side_colors']] <- self$row_mdata
params[['subplot_widths']] <- c(0.15, 0.3, 0.55)
}
if (!is.null(self$col_mdata)) {
params[['col_side_colors']] <- self$col_mdata
params[['subplot_heights']] <- c(0.55, 0.3, 0.15)
}
# add any additional function arguments
params <- c(params, list(...))
private$construct_heatmap_plot(params, interactive)
},
plot_densities = function(color=NULL, title="", ...) {
super$plot_densities(key='dat', color=color, title=title, ...)
},
plot_pca = function(method='prcomp', pcx=1, pcy=2,
color=NULL, shape=NULL, label=NULL,
title=NULL, text_labels=NULL, ...) {
super$plot_pca(key='dat', method=method, pcx=pcx, pcy=pcy,
color_var=color, shape_var=shape, label_var=label,
title=title, text_labels=text_labels, ...)
},
plot_tsne = function(dim1=1, dim2=2,
color=NULL, shape=NULL, label=NULL, title=NULL,
text_labels=NULL, ...) {
super$plot_tsne(key='dat', dim1=dim1, dim2=dim2, color_var=color,
shape_var=shape, label_var=label, title=title,
text_labels=text_labels, ...)
},
# Prints class greeting to the screen
print = function() {
rm <- ifelse(!is.null(self$row_mdata), '(m)', '')
cm <- ifelse(!is.null(self$col_mdata), '(m)', '')
cat(sprintf(entry_template, keys[i], class(ds), nrow(ds), ncol(ds), missing_flag))
cat("=========================================\n")
cat("=\n")
cat("= EDAMatrix\n")
cat("=\n")
cat(sprintf("= rows : %d %s\n", nrow(self$dat), rm))
cat(sprintf("= columns: %d %s\n", ncol(self$dat), cm))
cat("=\n")
# if data is incomplete, include number of missing entries
num_nas <- sum(is.na(self$dat))
if (num_nas > 0) {
cat(sprintf("= # missing: %d\n", num_nas))
}
cat("=========================================\n")
},
# Subsample dataset
subsample = function(row_n=NULL, col_n=NULL, row_ratio=NULL, col_ratio=NULL) {
# clone and subsample dataset
obj <- private$clone_()
obj$edat[['dat']]$subsample(row_n, col_n, row_ratio, col_ratio)
# update metadata
if ('row_mdata' %in% names(obj$edat)) {
row_ind <- rownames(obj$row_mdata) %in% rownames(obj$dat)
obj$row_mdata <- obj$row_mdata[row_ind, ]
}
if ('col_mdata' %in% names(obj$edat)) {
col_ind <- colnames(obj$col_mdata) %in% colnames(obj$dat)
obj$col_mdata <- obj$col_mdata[, col_ind]
}
obj
},
summary = function(markdown=FALSE, num_digits=2) {
super$summary(key='dat', markdown=markdown, num_digits=num_digits)
},
transpose = function() {
# transpose datasets in-place
super$transpose()
# swap keys for row/column metadata
row_mdat <- self$edat[['row_mdata']]
self$edat[['row_mdata']] <- self$edat[['col_mdata']]
self$edat[['col_mdata']] <- row_mdat
if (!is.null(self$edat[['dat']]$row_edat)) {
self$edat[['dat']]$row_edat <- 'row_mdata'
}
if (!is.null(self$edat[['dat']]$col_edat)) {
self$edat[['dat']]$col_edat <- 'col_mdata'
}
},
# transpose (out-of-place)
t = function() {
obj <- private$clone_()
obj$transpose()
obj
}
),
private = list(
# Verifies that input data is of type matrix
check_input = function(dat) {
if (!is.matrix(dat)) {
stop("Invalid input for EDAMatrix: dat must be a matrix.")
}
},
# Prints dataset summary to screen
print_summary = function(x) {
cat("=========================================\n")
cat("\n")
cat(sprintf(" %s\n", class(self)[1]))
cat("\n")
cat(" OVERVIEW\n")
cat(" --------\n")
cat("\n")
cat(sprintf(" Rows : %d\n", nrow(self$dat)))
cat(sprintf(" Columns : %d\n", ncol(self$dat)))
cat(sprintf(" Min value : %s\n", x$dat_range[1]))
cat(sprintf(" Max value : %s\n", x$dat_range[2]))
cat(sprintf(" # NA's : %d\n", x$dat_num_nas))
cat(sprintf(" # 0's : %d\n", x$dat_num_zeros))
cat("\n")
cat(' Quartiles:\n')
quartiles <- setNames(as.data.frame(x$dat_quartiles), '')
rownames(quartiles) <- paste0(" ", rownames(quartiles))
print(quartiles)
cat("\n")
cat(" Column types:\n")
col_types <- setNames(as.data.frame(x$col_types), c('', ''))
col_types[, 1] <- paste0(" ", col_types[, 1])
rownames(col_types) <- ""
print(col_types)
cat("\n")
cat(" COLUMNS\n")
cat(" -------\n")
cat("\n")
cat(sprintf(" Mean range : %s - %s\n", x$col_means[1], x$col_means[2]))
cat(sprintf(" Median range : %s - %s\n", x$col_medians[1], x$col_medians[2]))
cat(sprintf(" Stdev range : %s - %s\n", x$col_std_devs[1], x$col_std_devs[2]))
cat(sprintf(" Cor range : %s - %s\n",
min(x$col_cor_mat, na.rm = TRUE),
max(x$col_cor_mat, na.rm = TRUE)))
cat("\n")
cat(" Outliers:\n\n ")
cat(paste0(sprintf("%2d", 1:length(x$col_outliers)), ". ", x$col_outliers, "\n"))
cat("\n")
cat(" ROWS\n")
cat(" ----\n")
cat("\n")
cat(sprintf(" Mean range : %s - %s\n", x$row_means[1], x$row_means[2]))
cat(sprintf(" Median range : %s - %s\n", x$row_medians[1], x$row_medians[2]))
cat(sprintf(" Stdev range : %s - %s\n", x$row_std_devs[1], x$row_std_devs[2]))
cat(sprintf(" Cor range : %s - %s\n",
min(x$row_cor_mat, na.rm = TRUE),
max(x$row_cor_mat, na.rm = TRUE)))
cat("\n")
cat(" Outliers:\n\n ")
cat(paste0(sprintf("%2d", 1:length(x$row_outliers)), ". ", x$row_outliers, "\n"))
cat("\n")
cat("=========================================\n")
}
# Determines metadata columns to include for heatmap, etc. functions
#
# @param include Vector of strings indicating metadata columns which
# should be included in the analysis.
# @param exclude Vector of strings indicating metadata columns which
# should be excluded from the analysis.
#
# @return Character vector of fields to include in analysis.
#select_features = function(include, exclude) {
# # determine fields to include based on user arguments
# if (!is.null(include)) {
# include <- include
# } else if (!is.null(exclude)) {
# include <- colnames(self$row_mdata)[!colnames(self$row_mdata) %in% exclude]
# } else {
# include <- colnames(self$row_mdata)
# }
# # always exclude fields with all unique values (e.g. identifiers)
# mask <- sapply(self$row_mdata[, include], function(x) {
# max(table(x)) > 1
# })
# include[mask]
#}
),
# ------------------------------------------------------------------------
# active bindings
# ------------------------------------------------------------------------
active = list(
dat = function(value) {
if (missing(value)) {
self$edat[['dat']]$dat
} else {
self$edat[['dat']]$dat <- value
}
},
row_mdata = function(value) {
if (missing(value)) {
if ('row_mdata' %in% names(self$edat)) {
self$edat[['row_mdata']]$dat
} else {
NULL
}
} else {
if ('row_mdata' %in% names(self$edat)) {
self$edat[['row_mdata']]$dat <- value
} else {
self$edat[['row_mdata']] <- EDADat$new(value, xid = 'x', yid = 'row metadata')
}
}
},
col_mdata = function(value) {
if (missing(value)) {
if ('col_mdata' %in% names(self$edat)) {
self$edat[['col_mdata']]$dat
} else {
NULL
}
} else {
if ('col_mdata' %in% names(self$edat)) {
self$edat[['col_mdata']]$dat <- value
} else {
self$edat[['col_mdata']] <- EDADat$new(value, xid = 'y', yid = 'column metadata')
}
}
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.