#' An R6 class representing a generic collection of datasets linked by
#' either column or row identifiers.
#'
#' Includes fucntionality and methods that are relevant to all EDA child
#' subclasses.
#'
#' @import ggplot2
#' @import viridis
#' @importFrom R6 R6Class
#' @importFrom reshape2 melt
#' @importFrom NMF aheatmap nmf
#' @name AbstractMultiDataSet
#'
NULL
AbstractMultiDataSet <- R6Class("AbstractMultiDataSet",
# ------------------------------------------------------------------------
# public
# ------------------------------------------------------------------------
public = list(
edat = NULL,
# AbstractMultiDataSet constructor
initialize = function(datasets, color_pal='Set1', title="", ggplot_theme=theme_bw) {
# if a single dataset was provided, wrap as a list for consistency
if (class(datasets) != 'list') {
datasets <- list(dat = datasets)
}
# drop any empty datasets
datasets <- datasets[!sapply(datasets, is.null)]
# assign names to datasets if not provided
if (is.null(names(datasets))) {
names(datasets) <- paste0('dat', seq_along(datasets))
} else if (sum(names(datasets) == '') > 0) {
num_missing <- sum(names(datasets) == '')
names(datasets)[names(datasets) == ''] <- paste0('dat', 1:num_missing)
}
# store datasets as a list of EDADat objects
for (id in names(datasets)) {
if (!class(datasets[[id]])[1] == 'EDADat') {
# by default, do not assume that any datasets have
# shared axes; this must be explicitly specified by the user
datasets[[id]] <- EDADat$new(datasets[[id]],
xid=paste0(id, '_x'),
yid=paste0(id, '_y'))
}
}
self$edat <- datasets
# check to make sure valid edat keys specified
keys <- names(self$edat)
for (key in keys) {
if (!is.null(self$edat[[key]]$row_edat) && (!self$edat[[key]]$row_edat %in% keys)) {
stop(sprintf("Invalid row edat key specified for dataset: %s", key))
}
if (!is.null(self$edat[[key]]$col_edat) && (!self$edat[[key]]$col_edat %in% keys)) {
stop(sprintf("Invalid column edat key specified for dataset: %s", key))
}
}
# check to make sure datasets which are specified as sharing an axis
# have at least some row/column id's in common
for (k1 in keys) {
e1 <- self$edat[[k1]]
for (k2 in keys) {
e2 <- self$edat[[k2]]
# check for shared axis identifiers
# TODO: clean up / move to separate function
if ((e1$xid == e2$xid && length(intersect(rownames(e1$dat), rownames(e2$dat))) == 0) ||
(e1$xid == e2$yid && length(intersect(rownames(e1$dat), colnames(e2$dat))) == 0) ||
(e1$yid == e2$xid && length(intersect(colnames(e1$dat), rownames(e2$dat))) == 0) ||
(e1$yid == e2$yid && length(intersect(colnames(e1$dat), colnames(e2$dat))) == 0)) {
stop(sprintf("Datasets '%s' and '%s' specified as having a common axis, but no shared id's found.", k1, k2))
}
}
}
# share styles
private$color_pal <- color_pal
private$ggplot_theme <- ggplot_theme
private$title <- title
},
# Adds a new dataset to front of datasets list, in-place
add = function (key, edat) {
edat_names <- c(key, names(self$edat))
self$edat <- c(edat, self$edat)
names(self$edat) <- edat_names
},
# update edat data and move to front of the edat list
update = function (key, dat) {
edat <- self$edat[[key]]
edat$dat <- dat
# convert numeric key to string
key <- ifelse(is.numeric(key), names(self$edat)[key], key)
self$edat[[key]] <- NULL
self$edat <- c(edat, self$edat)
names(self$edat)[1] <- key
# remove any unlinked datasets
private$remove_unlinked(key)
},
# Clears any cached resuts and performs garbage collection to free
# up memory.
clear_cache = function() {
private$cache <- list()
invisible(gc())
},
# cluster dataset and return new object instance which including the
# results
cluster = function(key=1, method='kmeans', ...) {
# check to make sure specified clustering method is valid
if (!method %in% names(private$cluster_methods)) {
stop('Unsupported clustering method specified.')
}
# convert key to string name, if not already provided as such
if (is.numeric(key)) {
key <- names(self$edat)[key]
}
# key for cluster result dataset
cluster_key <- sprintf("clusters-%s", key)
# perform clustering
clusters <- private$cluster_methods[[method]](self$edat[[key]]$dat, ...)
# convert result to an n x 1 mapping matrix
mat <- as.matrix(clusters)
rownames(mat) <- rownames(self$edat[[key]])
# if cluster table doesn't exist, create it
if (!cluster_key %in% names(self$edat)) {
# crate a new EDADat instance
self$edat[[cluster_key]] <- EDADat$new(
mat,
xid = self$edat[[key]]$xid,
yid = cluster_key
)
} else {
# otherwise, add new column to clusters table
self$edat[[cluster_key]]$dat <- cbind(self$edat[[cluster_key]]$dat, mat)
}
clusters
},
# cluster dataset and apply function to resulting clusters
capply = function(key, method='kmeans', fun=median, result_key=NULL, fun_args=list(), ...) {
# check for valid dataset key
private$check_key(key)
# key form: <old key>_<annot_name>_<fun>
if (is.null(result_key)) {
# if a function is provided, require manually specifying result key
if (is.function(fun)) {
stop("When a function is specified for 'fun', 'result_key' must also be provided")
}
result_key <- paste(c(key, annot_key, fun), collapse='_')
}
# determine function to use
if (!is.function(fun)) {
if (fun %in% names(private$stat_fxns)) {
fun <- private$stat_fxns[[fun]]
} else {
stop("Invalid function specified.")
}
}
# check to see if clustering has been performed, and if not,
# compute clustering
clusters <- self$cluster(key = key, method = method, ...)
# dataset to compute statistics on
dat <- self$edat[[key]]$dat
message("Computing cluster statistics...")
# apply function to each cluster and drop "cluster" column from result
fun_args <- c(fun_args, list(x = dat, by = list(cluster = clusters), FUN = fun))
res <- do.call(aggregate, fun_args)[, -1]
# fix row names and return result
#cluster_ids <- sort(unique(clusters))
#rownames(res) <- cluster_ids
# convert numeric keys
key <- ifelse(is.numeric(key), names(self$edat)[key], key)
# clone dataset instance and add new edat
obj <- private$clone_()
obj$add(result_key, EDADat$new(res, xid=method, yid=self$edat[[key]]$yid))
obj
},
# Measure similarity between columns
cor = function(key=1, meas='pearson', ...) {
private$similarity(self$edat[[key]]$dat, meas=meas, ...)
},
# Applies a filter to rows of the dataset
#
# @param mask Logical vector of length equal to the number of rows in
# the dataset.
#
# @return A filtered version of the original EDADataSet object.
filter_rows = function(key=1, mask) {
obj <- private$clone_()
obj$update(key, obj$edat[[key]]$dat[mask,, drop = FALSE])
obj
},
# Applies a filter to columns of the dataset
#
# @param mask Logical vector of length equal to the number of columns
# in the dataset.
#
# @return A filtered version of the original EDADataSet object.
filter_cols = function(key=1, mask) {
obj <- private$clone_()
obj$update(key, obj$edat[[key]]$dat[, mask, drop = FALSE])
obj
},
# Imputes missing values in the dataset
#
# Imputes missing values in the dataset using a specified method
# and stores the result in-place. Currently only support k-Nearest
# Neighbors (kNN) method.
#
# Note: When using the `knn` method, it may be neccessary to set R
# the environmental variable `R_MAX_NUM_DLLS` to some value larger
# than its default of 100 (150 should be sufficient), prior to
# launching R. This can be set in the ~/.Renviron.
#
# @param method Character array specifying imputation method to use
# (options: knn)
impute = function(key=1, method='knn') {
# Keep track of original dataset class
dat <- self$edat[[key]]$dat
cls <- class(dat)
message(sprintf("Imputing %d missing values...", sum(is.na(dat))))
# kNN
if (method == 'knn') {
imputed <- VIM::kNN(dat)[, 1:ncol(dat)]
if (cls == 'matrix') {
imputed <- as.matrix(imputed)
}
rownames(imputed) <- rownames(dat)
}
message("Done.")
self$edat[[key]]$dat <- imputed
},
# Subsample dataset
#
# Randomly subsamples dataset and returns a new EDADataSet instance.
# If both `xx_n` and `xx_ratio` arguments are specified, the `xx_n`
# setting will be used.
# For removing specific rows and columns, see the `filter_rows` and
# `filter_cols` functions.
#
# @param key Dataset key
# @param row_n Number of rows to randomly select
# @param col_n Number of columns to randomly select
# @param row_ratio Ratio of rows to randomly select
# @param col_ratio Ratio of columns to randomly select
#
# @return An EDADataSet instance
subsample = function(key=1, row_n=NULL, col_n=NULL, row_ratio=NULL, col_ratio=NULL) {
# clone and subsample dataset
obj <- private$clone_()
obj$edat[[key]]$subsample(row_n, col_n, row_ratio, col_ratio)
# move to front of edat list
edat <- obj$edat[[key]]
obj$edat[[key]] <- NULL
obj$edat <- c(edat, obj$edat)
names(obj$edat)[1] <- key
obj
},
# Summarizes overall characteristics of a dataset
#
# @param markdown Logical indicating whether summary output should be
# formatted using markdown. (NOTE: Not yet implemented...)
# @param num_digits Number of digits to display when printing summary
# output (default: 2).
summary = function(key=1, markdown=FALSE, num_digits=2) {
# collection summary info
x <- self$edat[[key]]$dat
# list to store summary info
info <- list()
# include trailing zeros when rounding
round_ <- function(y, digits) {
formatC(round(y, digits), digits, format = "f")
}
# overall dataset
info[['dat_range']] <- round_(range(x, na.rm = TRUE), num_digits)
info[['dat_num_nas']] <- sum(is.na(x))
info[['dat_num_zeros']] <- sum(x == 0)
info[['dat_quartiles']] <- round_(quantile(x, na.rm = TRUE), num_digits)
# column types
info[['col_types']] <- table(apply(x, 2, class))
# rows
info[['row_means']] <- round_(range(apply(x, 1, mean, na.rm = TRUE)), num_digits)
info[['row_medians']] <- round_(range(apply(x, 1, median, na.rm = TRUE)), num_digits)
info[['row_std_devs']] <- round_(range(apply(x, 1, sd, na.rm = TRUE)), num_digits)
info[['row_outliers']] <- self$detect_row_outliers()
# columns
info[['col_means']] <- round_(range(apply(x, 2, mean, na.rm = TRUE)), num_digits)
info[['col_medians']] <- round_(range(apply(x, 2, median, na.rm = TRUE)), num_digits)
info[['col_std_devs']] <- round_(range(apply(x, 2, sd, na.rm = TRUE)), num_digits)
info[['col_outliers']] <- self$detect_col_outliers()
# clip outlier lists if more than a few
if (length(info[['row_outliers']]) > 5) {
info[['row_outliers']] <- c(head(info[['row_outliers']], 5), '...')
}
if (length(info[['col_outliers']]) > 5) {
info[['col_outliers']] <- c(head(info[['col_outliers']], 5), '...')
}
# row & column correlations
info[['col_cor_mat']] <- round_(cor(x), num_digits)
info[['row_cor_mat']] <- round_(cor(t(x)), num_digits)
diag(info[['col_cor_mat']]) <- NA
diag(info[['row_cor_mat']]) <- NA
# display using selected output format
if (markdown) {
# TODO
} else {
private$print_summary(info)
}
},
# returns a new object instance with only the top N rows / columns
# of specified dataset
bottom_n = function(key=1, num_rows=NULL, num_cols=NULL, fxn=sum) {
dat <- self$edat[[key]]$dat
# masks to keep track of which rows / columns to keep
row_mask <- rep(TRUE, nrow(dat))
col_mask <- rep(TRUE, ncol(dat))
# get top n rows
if (!is.null(num_rows)) {
row_scores <- apply(dat, 1, fxn)
row_mask <- rank(row_scores) <= num_rows
}
# get top n columns
if (!is.null(num_cols)) {
col_scores <- apply(dat, 2, fxn)
col_mask <- rank(col_scores) <= num_cols
}
obj <- private$clone_()
obj$update(key, dat[row_mask, col_mask])
obj
},
# returns a new object instance with only the top N rows / columns
# of specified dataset
top_n = function(key=1, num_rows=NULL, num_cols=NULL, fxn=sum) {
dat <- self$edat[[key]]$dat
# masks to keep track of which rows / columns to keep
row_mask <- rep(TRUE, nrow(dat))
col_mask <- rep(TRUE, ncol(dat))
# get top n rows
if (!is.null(num_rows)) {
row_scores <- apply(dat, 1, fxn)
row_mask <- rank(-row_scores) <= num_rows
}
# get top n columns
if (!is.null(num_cols)) {
col_scores <- apply(dat, 2, fxn)
col_mask <- rank(-col_scores) <= num_cols
}
obj <- private$clone_()
obj$update(key, dat[row_mask, col_mask])
obj
},
# filters dataset to include both the rows and columns associated with
# each of the top and/or bottom N values in the dataset
extreme_n = function(key=1, top=NULL, bottom=NULL, unique=FALSE) {
dat <- self$edat[[key]]$dat
# check limits
min_dim <- min(dim(dat))
if (top > min_dim || bottom > min_dim) {
stop("Limits must be at least as small as the smallest data dimension.")
}
# vector of rows / columns to keep
row_ind <- c()
col_ind <- c()
# indices of top N values
if (!is.null(top)) {
# require <top> unique rows/columns
if (unique) {
# iterate over max values until <top> unique rows and columns
# have been added
while (length(row_ind) < top || length(col_ind) < top) {
# get row and column indices for next highest value;
# in cases of ties, just use the first match
ind <- head(which(dat == max(dat, na.rm=TRUE), arr.ind=TRUE), 1)
rowi <- ind[, 1]
coli <- ind[, 2]
# set column / row to NA
dat[rowi, ] <- NA
dat[, coli] <- NA
if (length(row_ind) < top) {
row_ind <- c(row_ind, rowi)
}
if (length(col_ind) < top) {
col_ind <- c(col_ind, coli)
}
}
} else {
# non-unique rows / columns
ind <- which(dat >= sort(dat, decreasing=TRUE)[top], arr.ind=TRUE)
row_ind <- ind[, 'row']
col_ind <- ind[, 'col']
}
}
# indices of bottom N values
if (!is.null(bottom)) {
if (unique) {
# iterate until either all rows/cols have been added, or
# the requested number of bottom hits have been included
while (length(row_ind) < min(nrow(dat), (top + bottom)) ||
length(col_ind) < min(ncol(dat), (top + bottom))) {
ind <- head(which(dat == min(dat, na.rm=TRUE), arr.ind=TRUE), 1)
rowi <- ind[, 1]
coli <- ind[, 2]
# set column / row to NA
dat[rowi, ] <- NA
dat[, coli] <- NA
if (length(row_ind) < (top + bottom)) {
row_ind <- c(row_ind, rowi)
}
if (length(col_ind) < (top + bottom)) {
col_ind <- c(col_ind, coli)
}
}
} else {
# non-unique rows / columns
ind <- which(dat <= sort(dat, decreasing=FALSE)[bottom], arr.ind=TRUE)
row_ind <- unique(c(row_ind, ind[, 'row']))
col_ind <- unique(c(col_ind, ind[, 'col']))
}
}
# replace original dataset and move to front of edat list
obj <- private$clone_()
obj$update(key, self$edat[[key]]$dat[row_ind, col_ind])
obj
},
######################################################################
# plotting methods
######################################################################
# Plot column densities
#
# Plots densities for each column in the dataset. This is most
# useful when you are interested in similarties or differences in
# distributions across columns for a relatively small number of
# columns.
#
# @param color Variable to color density curves by. If not is
# specified, uses variable specified at object construction time,
# or else, uses a separate color for each column.
#
# @return ggplot plot instance.
plot_densities = function(key=1, color_var=NULL, color_key=NULL, title="", ...) {
dat <- setNames(melt(self$edat[[key]]$dat), c('row', 'column', 'val'))
styles <- private$get_geom_density_styles(key, color_var, color_key)
if (!is.null(styles$color)) {
dat <- cbind(dat, color_var = styles$color)
}
if (is.null(title)) {
title <- sprintf("Column densities: %s", private$title)
}
# construct density plot
plt <- ggplot(dat, aes(x = val, group = column, color = column)) +
geom_density(styles$aes) +
ggtitle(title) +
private$ggplot_theme()
# legend labels
if (length(styles$labels) > 0) {
plt <- plt + styles$labels
}
# only show legend if there are relatively few groups
if (length(unique(dat$column)) > 10) {
plt <- plt + guides(color = FALSE)
}
plt
},
# Prints an overview of the object instance
print = function() {
cls <- class(self)[1]
# create a vector of dataset keys to display
if (!is.null(names(self$edat))) {
keys <- names(self$edat)
# default to numeric key if no character key exists
keys[is.na(keys)] <- seq_along(keys)[is.na(keys)]
}
# determine lengtht to use for justification
key_format <- sprintf("%%%ds", max(nchar(keys)) + 1)
# entry output string
entry_template <- sprintf("= %s : %%s (%%d x %%d) %%s\n", key_format)
cat("=========================================\n")
cat("=\n")
cat(sprintf("= %s (n = %d)\n", cls, length(self$edat)))
cat("=\n")
for (i in seq_along(self$edat)) {
# print dataset entry
ds <- self$edat[[i]]$dat
missing_flag <- ifelse(sum(is.na(ds)) > 0, '[M]', '')
cat(sprintf(entry_template, keys[i], class(ds), nrow(ds), ncol(ds), missing_flag))
}
cat("=\n")
cat("=========================================\n")
},
# replaces a dataset with a new one, in-place
replace = function(old_key, new_key, dat, xid=NULL, yid=NULL) {
# Assume original dataset row and column ids, unless specified
orig_xid <- self$edat[[old_key]]$xid
orig_yid <- self$edat[[old_key]]$yid
# Remove any additional datasets which use the same
# identifiers as the target dataset
for (ds in names(self$edat)) {
if (self$edat[[ds]]$xid == orig_xid || self$edat[[ds]]$yid == orig_yid) {
self$edat[[ds]] <- NULL
}
}
# determine new row and column ids
xid <- ifelse(is.null(xid), orig_xid, xid)
yid <- ifelse(is.null(yid), orig_yid, yid)
# replace original dataset with annotation stat result matrix
self$edat[[old_key]] <- NULL
self$edat[[new_key]] <- EDADat$new(dat, xid=xid, yid=yid)
},
# transpose (out-of-place)
t = function(key=NULL) {
obj <- private$clone_()
obj$transpose(key)
obj
},
# transpose (in-place)
transpose = function(key=NULL) {
# if key provided, transpose specific dataset
if (!is.null(key)) {
self$edat[[key]]$transpose()
} else {
# otherwise, transpose all datasets
for (id in names(self$edat)) {
self$edat[[id]]$transpose()
}
}
}
),
# ------------------------------------------------------------------------
# private
# ------------------------------------------------------------------------
private = list(
# private properties
color_pal = NULL,
ggplot_theme = NULL,
title = NULL,
cache = list(),
# Helper functions for computing various statistics; used by the
# `AbstractMultiDataSet$capply` and `BioDataSet$aapply`
# method.
stat_fxns = list(
'num_nonzero' = function(x) {
sum(x != 0)
},
'num_zero' = function(x) {
sum(x == 0)
},
'num_above_cutoff' = function(x, cutoff=0) {
sum(x > cutoff)
},
'num_below_cutoff' = function(x, cutoff=0) {
sum(x < cutoff)
},
'ratio_nonzero' = function(x) {
sum(x != 0) / length(x)
},
'ratio_zero' = function(x) {
sum(x == 0) / length(x)
},
'ratio_above_cutoff' = function(x, cutoff=0) {
sum(x > cutoff) / length(x)
},
'ratio_below_cutoff' = function(x, cutoff=0) {
sum(x < cutoff) / length(x)
},
# aliases for built-in functions
'mad' = mad,
'mean' = mean,
'median' = median,
'max' = max,
'min' = min,
'sd' = sd,
'sum' = sum,
'var' = var
),
# check for valid dataset key
check_key = function(key) {
# check for valid dataset key
if (!key %in% c(1:length(self$edat), names(self$edat))) {
stop(sprintf("Invalid dataset specified: %s", key))
}
},
# clone object
clone_ = function() {
obj <- self$clone(deep = TRUE)
obj$clear_cache()
obj
},
# deep_clone function necessary to ensure edat list of R6 classes
# are copied during cloning
deep_clone = function(name, value) {
if (name == "edat") {
copied <- list()
for (key in names(value)) {
copied[[key]] <- value[[key]]$clone()
}
copied
} else {
value
}
},
# Generates ggplot aesthetics for bar plots
#
# @param color Color variable as passed into plot function call
#
# @return List of style information
get_geom_bar_styles = function(key, color_var, color_key) {
# list to store style properties
res <- list(aes = aes(), labels = list())
private$add_color_styles(key, res, color_var, color_key)
},
# Generates ggplot aesthetics for density plots
#
# @param color Color variable as passed into plot function call
#
# @return List of style information
get_geom_density_styles = function(key, color_var, color_key) {
# list to store style properties
res <- list(aes = aes(), labels = list())
private$add_color_styles(key, res, color_var, color_key)
},
# Generates ggplot aesthetics for histogram plots
#
# @param color Color variable as passed into plot function call
#
# @return List of style information
get_geom_histogram_styles = function(key, color_var, color_key) {
# list to store style properties
res <- list(aes = aes(), labels = list())
private$add_color_styles(key, res, color_var, color_key)
},
# Generates ggplot aesthetics for a geom_point plot
#
# @param color Color variable as passed into plot function call
# @param shape Shape variable as passed into plot function call
#
# @return List of geom_point style information
get_geom_point_styles = function(key,
color_var, color_key,
shape_var, shape_key) {
# list to store style properties
res <- list(
aes = aes(),
labels = list()
)
res <- private$add_color_styles(key, res, color_var, color_key)
res <- private$add_shape_styles(key, res, shape_var, shape_key)
res
},
# Determines color-related style information to use for a plot
#
# @param styles List of color-related style info
# @param color Color variable passed into plot function call
add_color_styles = function(key, styles, color_var, color_key) {
color_vec <- private$get_color_vector(key, color_var, color_key)
# if no color variable specified, or it is disabled, return styles as-is
if (is.null(color_vec)) {
return(styles)
}
styles[['color']] <- color_vec
# update styles with color info
styles[['aes']] <- modifyList(aes(color = color_var), styles[['aes']])
styles[['labels']] <- modifyList(labs(color = color_var), styles[['labels']])
styles
},
# returns a vector of color assignments to use for plotting
get_color_vector = function(key, color_var, color_key, target='rows') {
# determine color variable and data source to use
if (target == 'rows') {
if (is.null(color_var)) {
color_var <- self$edat[[key]]$row_color
}
if (is.null(color_key)) {
color_key <- self$edat[[key]]$row_edat
}
} else {
if (is.null(color_var)) {
color_var <- self$edat[[key]]$col_color
}
if (is.null(color_key)) {
color_key <- self$edat[[key]]$col_edat
}
}
# if no color variable specified, or disabled, stop here
if (is.null(color_var) || color_var == FALSE) {
return(NULL)
}
# convert numeric keys
key <- ifelse(is.numeric(key), names(self$edat)[key], key)
# check to make sure valid style key specified
if (!color_key %in% names(self$edat)) {
stop(sprintf("Invalid style source key specified: %s", color_key))
}
# otherwise, retrieve 1d vector to use for color assignment
if (target == 'rows') {
matched_axis <- self$edat[[key]]$xid
} else {
matched_axis <- self$edat[[key]]$yid
}
vals <- self$edat[[color_key]]$get(matched_axis, color_var, other_axis=TRUE)
# drop elements not found in target dataset and store result
matched_ids <- self$edat[[color_key]]$get(matched_axis, other_axis=TRUE)
# return color vector
if (target == 'rows') {
axis_names <- rownames(self$edat[[key]]$dat)
} else {
axis_names <- colnames(self$edat[[key]]$dat)
}
vals[match(axis_names, matched_ids)]
},
# Determines shape-related style information to use for a plot
#
# @param styles List of shape-related style info
# @param shape shape variable passed into plot function call
add_shape_styles = function(key, styles, shape_var, shape_key) {
if (is.null(shape_var)) {
shape_var <- self$edat[[key]]$row_shape
}
if (is.null(shape_key)) {
shape_key <- self$edat[[key]]$row_edat
}
# if no shape variable specified, or disabled, return styles as-is
if (is.null(shape_var) || shape_var == FALSE) {
return(styles)
}
# otherwise, retrieve 1d vector to use for shape assignment
matched_axis <- self$edat[[key]]$xid
vals <- self$edat[[shape_key]]$get(matched_axis, shape_var, other_axis=TRUE)
# drop elements not found in target dataset and store result
matched_ids <- self$edat[[shape_key]]$get(matched_axis, other_axis=TRUE)
styles[['shape']] <- vals[match(rownames(self$edat[[key]]$dat), matched_ids)]
# update styles with shape info
styles[['aes']] <- modifyList(aes(shape = shape_var), styles[['aes']])
styles[['labels']] <- modifyList(labs(shape = shape_var), styles[['labels']])
styles
},
# Returns a vector of color codes associated with the specified
# variable.
#
# @param color Variable to use for assigning colors.
# @param color_pal Color palette to use
#
# @return Vector of colors with length equal to the number of columns
# in the data.
get_row_colors = function(key, color_var=NULL, color_key=NULL) {
# determine color dataset and variable name
if (is.null(color_var)) {
color_var <- self$edat[[key]]$row_color
}
if (is.null(color_key)) {
color_key <- self$edat[[key]]$row_edat
}
# if no variable is specified, use default black for plots
if (is.null(color_var) || color == FALSE) {
return('black')
}
# if variable specified, but no edat, use self
if (is.null(color_key)) {
color_key <- key
}
# otherwise, assign colors based on the variable specified
color_vector <- self$edat[[color_key]]$get(self$edat[[key]]$xid, color_var, other_axis=TRUE)
color_vector <- as.numeric(factor(color_vector))
pal <- RColorBrewer::brewer.pal(9, private$color_pal)
colors <- colorRampPalette(pal)(min(1E4, length(unique(color_vector))))
# return vector of column color assignments
colors[as.integer(color_vector)]
},
# Returns a vector of text labels to use when plotting row elements of
# a dataset.
#
# @return Vector of labels with length equal to the number of columns
# in the data.
get_col_labels = function(key, label_var=NULL, label_key=NULL) {
# target dataset edat
edat <- self$edat[[key]]
# determine key and variable name associated with labels, if specified
if (is.null(label_var)) {
label_var <- edat$col_label
}
if (is.null(label_key)) {
label_key <- edat$col_edat
}
# colnames of target dataset
cnames <- colnames(edat$dat)
# if none provided, default to using colnames
if (is.null(label_var) || label_var == FALSE) {
return(cnames)
}
# if variable specified, but no edat, use self
if (is.null(label_key)) {
label_key <- key
}
# otherwise, generate a label mapping and determine labels to use
mapping <- data.frame(
id = self$edat[[label_key]]$get(edat$yid, other_axis = TRUE),
label = self$edat[[label_key]]$get(edat$yid, label_var, other_axis = TRUE)
)
# return labels in proper order
mapping$label[match(cnames, mapping$id)]
},
# Returns a vector of color codes associated with the specified
# variable.
#
# @param color Variable to use for assigning colors.
# @param color_pal Color palette to use
#
# @return Vector of colors with length equal to the number of columns
# in the data.
get_col_colors = function(key, color_var=NULL, color_key=NULL) {
# determine color dataset and variable name
if (is.null(color_var)) {
color_var <- self$edat[[key]]$col_color
}
if (is.null(color_key)) {
color_key <- self$edat[[key]]$col_edat
}
# if no variable is specified, use default black for plots
if (is.null(color_var) || color == FALSE) {
return('black')
}
# if variable specified, but no edat, use self
if (is.null(color_key)) {
color_key <- key
}
# otherwise, assign colors based on the variable specified
color_vector <- self$edat[[color_key]]$get(self$edat[[key]]$yid, color_var, other_axis=TRUE)
color_vector <- as.numeric(factor(color_vector))
pal <- RColorBrewer::brewer.pal(9, private$color_pal)
colors <- colorRampPalette(pal)(min(1E4, length(unique(color_vector))))
# return vector of column color assignments
colors[as.integer(color_vector)]
},
# Returns a vector of text labels to use when plotting row elements of
# a dataset.
#
# @return Vector of labels with length equal to the number of columns
# in the data.
get_row_labels = function(key, label_var=NULL, label_key=NULL) {
# target dataset edat
edat <- self$edat[[key]]
# determine key and variable name associated with labels, if specified
if (is.null(label_var)) {
label_var <- edat$row_label
}
if (is.null(label_key)) {
label_key <- edat$row_edat
}
# rownames of target dataset
rnames <- rownames(edat$dat)
# if none provided, default to using rownames
if (is.null(label_var) || label_var == FALSE) {
return(rnames)
}
# if variable specified, but no edat, use self
if (is.null(label_key)) {
label_key <- key
}
# otherwise, generate a label mapping and determine labels to use
mapping <- data.frame(
id = self$edat[[label_key]]$get(edat$xid, other_axis = TRUE),
label = self$edat[[label_key]]$get(edat$xid, label_var, other_axis = TRUE)
)
# return labels in proper order
mapping$label[match(rnames, mapping$id)]
},
#
# Supported cluster methods
#
cluster_methods = list(
'kmeans' = function(dat, num_clusters, ...) {
factor(as.numeric(kmeans(dat, centers = num_clusters, ...)$cluster))
},
'hclust' = function(dat, cor_method='cor',
hclust_method='average',
cutree_method='cutree', num_clusters=NULL, cut_height=NULL, ...) {
# hierachical clustering
# TODO: make similarity, etc. functions currently stored as
# private lists acessible to one another (store directly as
# methods in private, or make available somewhere else in eda
# package namespace)
#cor_mat <- private$similarity(dat, meas=cor_method)
cor_mat <- tryCatch({
get(cor_method)(t(dat))
}, warning = function(w) {
message("Warning encountered during correlation matrix generation:")
stop(w$message)
})
hc <- flashClust::flashClust(as.dist(1 - abs(cor_mat)), method=hclust_method)
# tree cut
cutree_fxn <- get(cutree_method)
factor(as.numeric(cutree_fxn(hc, k=num_clusters, h=h, ...)))
},
# k-means clustering of t-SNE projected data
#
# k Number of clusters to detect (default: 10)
# ... Additional arguments passed to Rtsne function
#
# return Vector of cluster assignments with length equal to the
# number of rows in the dataset.
'tsne-kmeans' = function(dat, num_clusters=10, ...) {
tsne <- Rtsne::Rtsne(dat, ...)
res <- setNames(as.data.frame(tsne$Y), c('x', 'y'))
kmeans(res, centers = num_clusters)$cluster
}
),
#
# Supported similarity measures
#
# Each of the below functions should accept either one or two matrices
# or data frames, measures the similarity/dependence either within
# (single dataset) or across (two datasets) columns in the dataset(s).
#
# The result is a matrix of similarity scores (e.g. correlation scores,
# r^2, etc.) of dimensions:
#
# ncol(dat1) x ncol(dat1) - single dataset
# ncol(dat1) x nrow(dat2) - two datasets
#
similarity_measures = list(
#
# linear model fit r^2
#
# Computes a pairwise matrix of linear model r^2 values; useful,
# for example, when measuring relationship between a numeric matrix
# and a categorical one.
#
'lm' = function(dat1, dat2=NULL, ...) {
# single dataset
if (is.null(dat2)) {
# construct linear model to measure pairwise dependence of
# dataset1 columns on dataset2 columns
cor_mat <- matrix(0, nrow = ncol(dat1), ncol = ncol(dat1))
# for each column in dataset1
for (i in 1:ncol(dat1)) {
# fit a linear model between the ith column of dataset1
# and the jth column of dataset1
feature_cor <- function(y) {
# y is the input from apply call; similar effect
# to having a nested for-loop.
round(summary(lm(y ~ dat1[, i]))$r.squared * 100, 2)
}
# for each column in dataset1
cor_mat[, i] <- apply(dat1, 2, feature_cor)
}
} else {
# two datasets
# construct linear model to measure pairwise dependence of
# dataset1 columns on dataset2 columns
cor_mat <- matrix(0, nrow = ncol(dat1), ncol = ncol(dat2))
# for each column in dataset2
for (i in 1:ncol(dat2)) {
# fit a linear model between the ith column of dat2 and the
# jth column of dat1
feature_cor <- function(y) {
round(summary(lm(y ~ dat2[, i]))$r.squared * 100, 2)
}
# for each column in dataset1
cor_mat[, i] <- apply(dat1, 2, feature_cor)
}
}
cor_mat
},
#
# Mutual Information (jackknife bias-corrected)
# https://cran.r-project.org/web/packages/mpmi/vignettes/Vignette.pdf
#
'cmi' = function(dat1, dat2=NULL, ...) {
if (!is.null(dat2)) {
# two datasets
cor_mat <- mpmi::cmi(cbind(dat1, dat2), ...)$bcmi
cor_mat[1:ncol(dat1), (ncol(dat1) + 1):nrow(cor_mat)]
} else {
# single dataset
mpmi::cmi(dat1, ...)$bcmi
}
},
#
# Maximal Information Coefficient (MIC)
# http://www.exploredata.net/
#
'mic' = function(dat1, dat2=NULL, ...) {
if (!is.null(dat2)) {
# two datasets
mic_mat <- minerva::mine(cbind(dat1, dat2), ...)$MIC
mic_mat[1:ncol(dat1), (ncol(dat1) + 1):nrow(mic_mat)]
} else {
# single dataset
minerva::mine(dat1, ...)$MIC
}
},
#
# Mutual Information (infotheo discretized estimate)
# https://cran.r-project.org/web/packages/infotheo/index.html
#
'mutinformation' = function(dat1, dat2=NULL, ...) {
# discretize continuous data
if (!is.integer(dat1)) {
dat1 <- infotheo::discretize(dat1, ...)
}
# two datasets
if (!is.null(dat2)) {
# result matrix
mi_mat <- matrix(nrow=ncol(dat1), ncol=ncol(dat2))
if (!is.integer(dat2)) {
dat2 <- infotheo::discretize(dat2, ...)
}
# iterate over pairs of columns and compute mutual information
# for each pair
for (i in 1:ncol(dat1)) {
for (j in 1:ncol(dat2)) {
mi_mat[i, j] <- infotheo::mutinformation(dat1[, i], dat2[, j])
}
}
} else {
# single dataset
# result matrix
mi_mat <- matrix(nrow=ncol(dat1), ncol=ncol(dat1))
# iterate over pairs of columns and compute mutual information
# for each pair
for (i in 1:ncol(dat1)) {
for (j in 1:ncol(dat1)) {
if (is.na(mi_mat[i, j])) {
mi_mat[i, j] <- infotheo::mutinformation(dat1[, i], dat1[, j])
mi_mat[j, i] <- mi_mat[i, j]
}
}
}
}
mi_mat
},
# Kendall correlation
#
'kendall' = function(dat1, dat2=NULL, ...) {
if (!is.null(dat2)) {
cor(dat1, dat2, method = 'kendall', ...)
} else {
cor(dat1, method = 'kendall', ...)
}
},
# Pearson correlation
#
'pearson' = function(dat1, dat2=NULL, ...) {
if (!is.null(dat2)) {
# data is passed in with shared rows, so columns can
# be correlation directly
cor(dat1, dat2, ...)
} else {
cor(dat1, ...)
}
},
#
# Spearman correlation
#
'spearman' = function(dat1, dat2=NULL, ...) {
if (!is.null(dat2)) {
cor(dat1, dat2, method = 'spearman', ...)
} else {
cor(dat1, method = 'spearman', ...)
}
}
),
check_input = function() {
# TODO: Check to make sure all datasets overlap in keys with dat
},
# Computes cross-dataset correlation matrix
#
# Measures similarity between rows or columns in two datasets which
# share the same set of column or row ids.
#
# @param key1 Numeric or character index of first dataset to use
# @param key2 Numeric or character index of second dataset to use
# @param meas Correlation measure to use. Supported options:
# - kendall (Kendall correlation)
# - pearson (Pearson correlation)
# - spearman (Spearman correlation)
# - lm (Linear model)
# - cmi (Mututal information)
#
# @return Matrix of pairwise dataset1 - dataset2 correlations
compute_cross_cor = function(key1=1, key2=2, meas='pearson', new_key=NULL, ...) {
# shortcuts to edats
e1 <- self$edat[[key1]]
e2 <- self$edat[[key2]]
# make sure datasets share some common ids
if (length(unique(c(e1$xid, e2$xid, e1$yid, e2$yid))) == 4) {
stop("Specified datasets must share a common axis.")
}
# determine orientation to use for comparison; similarity is
# measured across columns, so data will be rearranged such that
# shared ids are along rows (then column 1 from dat1 can be
# compared with column 1 from dat2, etc.)
dat1_shared_axis <- ifelse(e1$xid == e2$xid || e1$xid == e2$yid, 'rows', 'columns')
dat2_shared_axis <- ifelse(e2$xid == e1$xid || e2$xid == e1$yid, 'rows', 'columns')
# get row-oriented datasets and associated style information
if (dat1_shared_axis == 'rows') {
# if dat1 is already oriented with shared row ids, then it
# is in the expected orientation
dat1 <- e1$dat
# each row in the resulting correlation matrix will correspond
# to a column in dat1
xid <- e1$yid
# similarly, the styles and labels for the rows in the cor
# matrix correspond to that for the columns in dat1
xcolor <- e1$col_color
xshape <- e1$col_shape
xlabel <- e1$col_label
xedat <- e1$col_edat
} else {
# if dat1 shared its column names with dat2, transpose to
# match expected orientation
dat1 <- e1$tdat
# in this case, each row in the resulting dataset will correspond
# to a single row in dat1
xid <- e1$xid
xcolor <- e1$row_color
xshape <- e1$row_shape
xlabel <- e1$row_label
xedat <- e1$row_edat
}
# similar logic to above, but for dat2 which will appear as column
# entries in the resulting correlation matrix
if (dat2_shared_axis == 'rows') {
dat2 <- e2$dat
yid <- e1$yid
ycolor <- e2$col_color
yshape <- e2$col_shape
ylabel <- e2$col_label
yedat <- e2$col_edat
} else {
dat2 <- e2$tdat
yid <- e2$xid
ycolor <- e2$row_color
yshape <- e2$row_shape
ylabel <- e2$row_label
yedat <- e2$row_edat
}
# normalize order of shared axis entries (ordered as rows now) and
# only operate on shared entries
row_ind <- intersect(rownames(dat1), rownames(dat2))
dat1 <- dat1[order(match(rownames(dat1), row_ind)),, drop = FALSE]
dat2 <- dat2[order(match(rownames(dat2), row_ind)),, drop = FALSE]
# measure similarity between rows in datasets 1 and rows in dataset 2
cor_mat <- private$similarity(dat1, dat2, meas = meas, ...)
# clone object and add new dataset
obj <- private$clone_()
# map numeric keys to string names
key1 <- ifelse(is.numeric(key1), names(self$edat)[key1], key1)
key2 <- ifelse(is.numeric(key2), names(self$edat)[key2], key2)
# determine key to use for storing result
new_key <- ifelse(is.null(new_key), sprintf('%s_%s_%s', key1, key2, meas), new_key)
# add new matrix to front of edat list and return
obj$add(new_key,
EDADat$new(cor_mat, xid = xid, yid = yid,
row_color = xcolor, row_shape = xshape,
row_label = xlabel, row_edat = xedat,
col_color = ycolor, col_shape = yshape,
col_label = ylabel, col_edat = yedat))
obj
},
# Plots multidataset correlation heatmap
#
# @param key1 Numeric or character index of first dataset to use
# @param key2 Numeric or character index of second dataset to use
# @param meas Correlation measure to use (passed to `cor` function)
#
plot_cross_cor_heatmap = function(key1=1, key2=2, meas='pearson', show_tick_labels=c(TRUE, TRUE), interactive=TRUE) {
# compute cross correlations
cor_mat <- private$compute_cross_cor(key1, key2, meas)
# list of parameters to pass to heatmaply
params <- list(
x = cor_mat,
showticklabels = show_tick_labels,
subplot_widths = c(0.65, 0.35),
subplot_heights = c(0.35, 0.65)
)
# add any additional function arguments
private$construct_heatmap_plot(params, interactive)
},
# Creates a static or interactive heatmap plot
#
# @param params A list of plotting parameters
# @param interactive Logical indicating whether an interactive heatmap
# should be generated.
construct_heatmap_plot = function(params, interactive) {
# interactive heatmap
if (interactive) {
return(do.call(heatmaply::heatmaply, params))
}
# rename column and row annotation matrix parameters, if present
if ('col_side_colors' %in% names(params)) {
params$annCol <- params$col_side_colors
}
if ('row_side_colors' %in% names(params)) {
params$annRow <- params$row_side_colors
}
# remove irrelevant heatmaply function arguments
heatmaply_args <- c('showticklabels', 'subplot_widths',
'subplot_heights', 'col_side_colors',
'row_side_colors')
params <- params[!names(params) %in% heatmaply_args]
# default to viridis color scheme
if (!'color' %in% names(params)) {
params$color <- viridis(100)
}
# 2018/04/17 - adding border just around heatmap not working at
# this time and results in warnings being generated
#params$border <- list(matrix = TRUE)
do.call(NMF::aheatmap, params)
},
# Removes any datasets which are not linked to the target dataset by
# either shared row or column ids
remove_unlinked = function(key) {
# iterate over datasets
edat_keys <- names(self$edat)
if (is.numeric(key)) {
edat_keys <- edat_keys[-key]
} else {
edat_keys <- edat_keys[names(edat_keys) != key]
}
for (eid in edat_keys) {
# create a vector of axis ids for all other datasets
axis_ids <- c()
for (edat in self$edat[names(self$edat) != eid]) {
axis_ids <- c(axis_ids, edat$xid, edat$yid)
}
# if a dataset does not overlap in ids with any of the other
# datasets (e.g. after a projection which replaces the original
# dataset), remove it.
if (length(intersect(axis_ids, c(self$edat[[eid]]$xid, self$edat[[eid]]$yid))) == 0) {
self$edat[[eid]] <- NULL
}
}
},
#
# Measures similarity between columns within or across datasets
#
similarity = function(dat1, dat2=NULL, meas='pearson', ...) {
# check to make sure specified similarity measure is valid
if (!meas %in% names(private$similarity_measures)) {
stop('Unsupported similarity measure specified.')
}
# drop any columns with zero variance
var_mask1 <- apply(dat1, 2, function(x) length(table(x)) ) > 1
if (sum(!var_mask1) > 0) {
message(sprintf("Excluding %d zero-variance entries from first dataset.", sum(!var_mask1)))
dat1 <- dat1[, var_mask1, drop = FALSE]
}
if (!is.null(dat2)) {
var_mask2 <- apply(dat2, 2, function(x) length(table(x)) ) > 1
if (sum(!var_mask2) > 0) {
message(sprintf("Excluding %d zero-variance entries from second dataset.", sum(!var_mask2)))
dat2 <- dat2[, var_mask2, drop = FALSE]
}
}
# construct similarity matrix
cor_mat <- private$similarity_measures[[meas]](dat1, dat2, ...)
# fix row and column names
rownames(cor_mat) <- colnames(dat1)
if (!is.null(dat2)) {
colnames(cor_mat) <- colnames(dat2)
} else {
colnames(cor_mat) <- colnames(dat1)
}
cor_mat
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.