#' An R6 class representing collection of related biological datasets
#'
#' BioEDADataSet is a class for interacting with one or more biological
#' datasets, typically those collected from high-throughput experiments.
#'
#' @section Arguments:
#' - `dataset`: A list of datasets (matrices, data frames, etc.), each of
#' which shared some column / row identifiers with the first entry in
#' the list.
#'
#' - `row_color`: Row metadata field to use for coloring rowwise plot elements.
#' - `row_shape`: Row metadata field to use for determine rowwise plot
#' element shape.
#' - `row_label`: Row metadata field to use when labeling plot points or
#' other elements.
#' - `col_color`: Column metadata field to use for coloring columnwise plot elements.
#' - `col_shape`: Column metadata field to use for determine columnwise plot
#' element shape.
#' - `col_label`: Column metadata field to use when labeling plot points or
#' other elements.
#' - `color_pal`: Color palette to use for relevant plotting methods
#' (default: `Set1`).
#' - `title`: Text to use as a title or subtitle for plots.
#' - `ggplot_theme`: Default theme to use for ggplot2 plots
#' (default: `theme_bw`).
#'
#' @section Fields:
#' - `edat`: List of EDADat dataset instances
#' - `annotations`: A list of gene, etc. annotations from external sources.
#'
#' @section Methods:
#' - `clear_cache()`: Clears BioEDADataSet cache.
#' - `clone()`: Creates a copy of the BioEDADataSet instance.
#' - `cluster_tsne(num_clusters=10, ...)`: Clusters rows in dataset using a combination
#' of t-SNE and k-means clustering.
#' - `cpm()`: Performs counts-per-million (CPM) transformation.
#' - `detect_col_outliers(num_sd=2, avg='median', meas='pearson')`:
#' Measures average pairwise similarities between all columns in the dataset.
#' Outliers are considered to be those columns who mean similarity to
#' all other columns is greater than `num_sd` standard deviations from the
#' average of averages.
#' - `detect_row_outliers(num_sd=2, avg='median', meas='pearson')`:
#' Measures average pairwise similarities between all rows in the dataset.
#' Outliers are considered to be those rows who mean similarity to
#' all other rows is greater than `num_sd` standard deviations from the
#' average of averages.
#' - `feature_cor()`: Detects dependencies between column metadata entries
#' (features) and dataset rows.
#' - `filter_col_outliers(num_sd=2, avg='median', meas='pearson')`:
#' Removes column outliers from the dataset. See `detect_col_outliers()`
#' for details of outlier detection approach.
#' - `filter_row_outliers(num_sd=2, avg='median', meas='pearson')`:
#' Removes row outliers from the dataset. See `detect_row_outliers()`
#' for details of outlier detection approach.
#' - `filter_cols(mask)`: Accepts a logical vector of length `ncol(obj$dat)`
#' and returns a new BioEDADataSet instance with only the columns associated
#' with `TRUE` values in the mask.
#' - `filter_rows(mask)`: Accepts a logical vector of length `nrow(obj$dat)`
#' and returns a new BioEDADataSet instance with only the rowsumns associated
#' with `TRUE` values in the mask.
#' - `impute(method='knn')`: Imputes missing values in the dataset and stores
#' the result _in-place_. Currently only k-Nearest Neighbors (kNN)
#' imputation is supported.
#' - `log(base=exp(1), offset=0)`: Log-transforms data.
#' - `log1p()`: Logn(x + 1)-transforms data.
#' - `log2p()`: Log2(x + 1)-transforms data.
#' - `pca(...)`: Performs principle component analysis (PCA) on the dataset
#' and returns a new BioEDADataSet instance of the projected data points.
#' Any additional arguements specified are passed to the `prcomp()` function.
#' - `pca_feature_cor(meas='pearson', ...)`: Measures correlation between
#' dataset features (column metadata fields) and dataset principle
#' components.
#' - `plot_cor_heatmap(meas='pearson', interactive=TRUE, ...)`: Plots a
#' correlation heatmap of the dataset.
#' - `plot_densities(color=NULL, title="", ...)`: Plots densities for each
#' column in the dataset.
#' - `plot_feature_cor(meas='pearson', color_scale=c('green', 'red')`:
#' Creates a tile plot of projected data / feature correlations. See
#' `feature_cor()` function.
#' - `plot_heatmap(x=1, interactive=TRUE, ...)`: Generates a heatmap plot of the
#' dataset
#' - `plot_pairwise_column_cors(color=NULL, title="", meas='pearson', mar=c(12,6,4,6))`:
#' Plot median pairwise column correlations for each variable (column)
#' in the dataset.
#' - `plot_pca(pcx=1, pcy=2, scale=FALSE, color=NULL, shape=NULL, title=NULL,
#' text_labels=FALSE, ...)`:
#' Generates a two-dimensional PCA plot from the dataset.
#' - `plot_tsne(color=NULL, shape=NULL, title=NULL, text_labels=FALSE, ...)`:
#' Generates a two-dimensional t-SNE plot from the dataset.
#' - `print()`: Prints an overview of the object instance.
#' - `subsample(row_n=NULL, col_n=NULL, row_ratio=NULL, col_ratio=NULL)`:
#' Subsamples dataset rows and/or columns.
#' - `summary(markdown=FALSE, num_digits=2)`: Summarizes overall
#' characteristics of a dataset.
#' - `t()`: Transposes dataset rows and columns.
#' - `tsne(...)`: Performs T-distributed stochastic neighbor embedding (t-SNE)
#' on the dataset and returns a new BioEDADataSet instance of the projected
#' data points. Any additional arguements specified are passed to the
#' `Rtsne()` function.
#' - `tsne_feature_cor(meas='pearson', ...)`: Measures correlation between
#' dataset features (column metadata fields) and dataset t-SNE projected
#' axes.
#' - `cross_cor(key1=1, key2=2, meas='pearson')`: Computes cross-dataset
#' correlation matrix between rows in two specified datasets.
#' - `plot_cross_cor_heatmap(key1=1, key2=2, meas='pearson', interactive=TRUE)`:
#' Plots multidataset correlation heatmap.
#' - `print()`: Prints an overview of the object instance.
#'
#' @section Examples:
#' ```
#' TODO
#' ```
#'
#' @importFrom R6 R6Class
#' @name BioDataSet
#' @export
#'
NULL
BioDataSet <- R6Class("BioDataSet",
inherit = eda:::EDAMultiMatrix,
# ------------------------------------------------------------------------
# public
# ------------------------------------------------------------------------
public = list(
# List of loaded annotations
annotations = list(),
# BioDataSet constructor
initialize = function(datasets, color_pal='Set1', title="", ggplot_theme=theme_bw) {
# supported bioconductor data types
bioc_types <- c('RangedSummarizedExperiment', 'ExpressionSet')
# create a list of counters to keep track of the number of each object type encountered
bioc_counters <- as.list(setNames(rep(1, length(bioc_types)), bioc_types))
# if a single Bioconductor object instance is passed in, wrap it in a list to
# be consistent
if (class(datasets) %in% bioc_types) {
datasets <- list(datasets)
}
for (i in seq_along(datasets)) {
# get a single dataset entry
dat <- datasets[[i]]
# skip anything that isn't a supported bioc object instance
if (!any(class(dat) %in% c('RangedSummarizedExperiment', 'ExpressionSet'))) {
next
}
# remove original object from input list
datasets[[i]] <- NULL
# base axis labels (rowData xid, rowData yid, colData xid, colData yid)
axis_ids <- c(feat_xid = 'features', feat_yid = 'feature_annotations',
pheno_xid = 'samples', pheno_yid = 'phenotypes')
# unpack bioconductor object and determine base EDADat keys to use
if ('ExpressionSet' %in% class(dat)) {
edat_keys <- c(assay = 'exprs', col_dat = 'pdata', row_dat = 'fdata')
bioc_dat <- Biobase::exprs(dat)
bioc_row_dat <- Biobase::fData(dat)
bioc_col_dat <- Biobase::pData(dat)
} else if ('RangedSummarizedExperiment' %in% class(dat)) {
edat_keys <- c(assay = 'assays', col_dat = 'coldata', row_dat = 'rowdata')
# TODO: extend support for RSE objects with multiple datasets
bioc_dat <- SummarizedExperiment::assays(dat)[[1]]
bioc_row_dat <- SummarizedExperiment::rowData(dat)
bioc_col_dat <- SummarizedExperiment::colData(dat)
}
# if multiple instances of the same object are passed in, append a numeric suffix
# to differentiate them
if (bioc_counters[[class(dat)]] > 1) {
edat_keys <- sprintf('%s_%d', edat_keys, bioc_counters[[class(dat)]])
axis_ids <- sprintf('%s_%d', axis_ids, bioc_counters[[class(dat)]])
}
# add assay data
datasets[[edat_keys['assay']]] = EDADat$new(bioc_dat,
xid = axis_ids['feat_xid'],
yid = axis_ids['pheno_xid'])
# add feature data
if (!is.null(bioc_row_dat)) {
datasets[[edat_keys['row_dat']]] = EDADat$new(bioc_row_dat,
xid = axis_ids['feat_xid'],
yid = axis_ids['feat_yid'])
}
# add phenotype data
if (!is.null(bioc_col_dat)) {
datasets[[edat_keys['col_dat']]] = EDADat$new(bioc_col_dat,
xid = axis_ids['pheno_xid'],
yid = axis_ids['pheno_yid'])
}
# increment bioc obj counter
bioc_counters[[class(dat)]] <- bioc_counters[[class(dat)]] + 1
}
# call parent constructor
super$initialize(datasets, color_pal, title, ggplot_theme)
},
# Computes pathway- or annotation-level statistics for a given dataset and
# annotation mapping.
#
# @param key Name of dataset to analyze.
# @param annot
# @param stat The annotation-level statistic to compute. Can either be a
# function (e.g. sum, median, or var), or a string indicating a specific statistic
# to compute.
#
# Currently supported statistics include:
#
# - `num_above_cutoff` number of genes w/ values above some cutoff
# - `num_below_cutoff` number of genes w/ values below some cutoff
# - `ratio_above_cutoff` ratio of genes w/ values above some cutoff
# - `ratio_below_cutoff` ratio of genes w/ values below some cutoff
# - `num_nonzero` number of genes with values not equal to zero
# - `num_zero` number of genes with values equal to zero
# - `ratio_nonzero` ratio of genes with values not equal to zero
# - `ratio_zero` ratio of genes with values equal to zero
#
aapply = function(key, fun, annot_key, annot_keytype='ensgene', result_key=NULL, fun_args=list(), ...) {
# annotations are parsed into n x 2 dataframes consisting of
# with each row containing an (annotation, gene id) pair.
ANNOT_IND <- 1
ITEM_IND <- 2
# check for valid dataset key
private$check_key(key)
# check for valid function
if (!is.function(fun)) {
valid_funcs <- c('gsva', names(private$stat_fxns))
if (!fun %in% valid_funcs) {
stop("Invalid function specified.")
}
}
# convert numeric keys
key <- ifelse(is.numeric(key), names(self$edat)[key], key)
# key form: <old key>_<annot_name>_<fun>
if (is.null(result_key)) {
# if a function is provided, require manually specifying result key
if (is.function(fun)) {
stop("When a function is specified for 'fun', 'result_key' must also be provided")
}
result_key <- paste(c(key, annot_key, fun), collapse='_')
}
# check to see if result has already been computed, and if so, move to front and return
if (result_key %in% names(self$edat)) {
# move dataset key to top of list and return self-reference
self$edat <- self$edat[c(result_key, names(self$edat)[names(self$edat) != result_key])]
return(self)
}
# get annotation mapping
mapping <- self$load_annotations(annot_key, keytype = self$edat[[key]]$xid)
# dataset to compute statistics on
dat <- self$edat[[key]]$dat
# exclude mapping pairs for items not in target dataset
mapping <- mapping[mapping[, ITEM_IND] %in% rownames(dat), ]
# exclude any annotations with less than two entries in the target dataset
gset_counts <- table(mapping[, ANNOT_IND])
to_exclude <- names(gset_counts)[gset_counts == 1]
mapping <- mapping[!mapping[, ANNOT_IND] %in% to_exclude, ]
message("Computing annotation statistics...")
#
# GSVA-based aggregation
#
if (!is.function(fun) && fun == 'gsva') {
# convert gene set mapping dataframe to a list
gset_list <- split(mapping$gene, mapping$annotation)
# get any addition function arguments, and set verbose to FALSE
gsva_args <- list(expr = dat, gset.idx.list = gset_list, verbose = FALSE)
gsva_args <- modifyList(gsva_args, fun_args)
# call GSVA
res <- do.call(GSVA::gsva, gsva_args)
} else {
#
# All other aggegration methods
#
if (!is.function(fun)) {
# built-in aggregation functions
fun <- private$stat_fxns[[fun]]
}
# output data frame
res <- data.frame()
# iterate over annotations
for (annot_key in unique(mapping[, ANNOT_IND])) {
# get list of genes, etc. associated with the annotation
annot_items <- mapping[mapping[, ANNOT_IND] == annot_key, ITEM_IND]
# get data values for relevant genes
dat_subset <- dat[rownames(dat) %in% annot_items,, drop = FALSE]
# combine arguments to apply and the aggregation function into a single list
iter_args <- c(list(X = dat_subset, MARGIN = 2, FUN = fun), fun_args)
# compute statistic for each column (often, samples) and append to result
res <- rbind(res, do.call(apply, iter_args))
}
rownames(res) <- unique(mapping[, ANNOT_IND])
colnames(res) <- colnames(dat)
}
# fix rownames and remove any duplicates after converting to valid rownames
rn <- gsub('\\.+', '\\.', make.names(rownames(res)))
mask <- !duplicated(rn)
# fix row names and return result; annotation names
# may change as a result, but this will help to avoid downstream
# confusion when saving to CSV, etc.
res <- res[mask, ]
rownames(res) <- rn[mask]
# clone BioDataSet instance and add new edat
obj <- private$clone_()
obj$add(result_key, EDADat$new(res, xid=annot_key, yid=self$edat[[key]]$yid))
obj
},
# returns annotation mapping using specified gene identifiers
load_annotations = function(annot, keytype='ensgene', annot_file=NULL, annot_keytype='ensgene',
annot_filetype='gmt', annot_field=1, gene_field=2,
exclude_annotations=NULL) {
#
# TODO: decide on best place for exclude annotations logic...
#
# check if annotations have been previously loaded
if (annot %in% names(self$annotations)) {
# if annotations have been loaded for the requested gene identifiers, return mapping
if (keytype %in% names(self$annotations[[annot]])) {
return(self$annotations[[annot]][[keytype]])
}
# if annotations have been loaded for a different key type, map identifiers and return
from <- names(self$annotations[[annot]])[1]
self$annotations[[annot]][[keytype]] <- private$map_gene_ids(self$annotations[[annot]][[from]],
from, keytype)
return(self$annotations[[annot]][[keytype]])
}
#
if (is.null(annot_file)) {
stop("Missing required parameter: annot_file")
}
# make sure valid filepath provided (TODO: extend support to URLs?)
if (!file.exists(annot_file)) {
stop(sprintf("Invalid annotation filepath specified: %s", annot_file))
}
message(sprintf("Loading %s annotations...", annot))
# load annotations from file
if (annot_filetype == 'gmt') {
mapping <- private$load_gmt(annot_file)
} else if (annot_filetype == 'tsv') {
mapping <- private$load_tsv(annot_file, annot_field, gene_field, exclude_annotations)
}
# store returned annotation mapping
if (!annot %in% names(self$annotations)) {
self$annotations[[annot]] <- list()
}
self$annotations[[annot]][[annot_keytype]] <- mapping
# if dataset keytype differs from annotation keytype, map ids
if (keytype != annot_keytype) {
self$annotations[[annot]][[keytype]] <- private$map_gene_ids(mapping, annot_keytype, keytype)
}
return(self$annotations[[annot]][[keytype]])
},
# Log2 transforms data (adding 1 to ensure finite results).
#
# @return Log2-transformed version of the expression data.
log2p = function(key = 1) {
self$log(key = key, 2, offset = 1)
},
# Creates a histogram or bar plot of sample library sizes.
#
# Generates a bargraph or histogram plot of sample library sizes
# (sum of expression levels within each column/sample). For the bar
# graph plot, each sample is shown as a separate bar in the plot.
# For larger datasets, a histogram of library sizes may be more useful
# to display.
#
# @param color Column metadata field to use for coloring points.
# @param title Title to use for plot.
# @param geom ggplot geometry to use (bar|histogram)
#
# @return A ggplot instance.
plot_libsizes = function(key=1, color=NULL, title=NULL, geom='hist') {
# create a data frame of library sizes
dat <- data.frame(libsize = colSums(self$edat[[key]]$dat))
# determine plot styles to use
if (geom == 'hist') {
styles <- private$get_geom_histogram_styles(key, color)
} else {
styles <- private$get_geom_bar_styles(key, color)
}
# update styles and title
if (!is.null(styles$color)) {
dat <- cbind(dat, color = styles$color)
}
if (is.null(title)) {
title <- sprintf("Library sizes: %s", private$title)
}
# create plot
if (geom == 'hist') {
# histogram
plt <- ggplot(aes(x = libsize), data = dat) +
geom_histogram(styles$aes) +
private$ggplot_theme()
} else {
# bar plot
plt <- ggplot(aes(x = rownames(libsizes), y = libsize), data = dat) +
geom_bar(styles$aes, stat = 'identity') +
xlab("Samples") +
ylab("Total expression") +
private$ggplot_theme() +
theme(axis.text.x = element_text(angle = 90),
legend.text = element_text(size = 8))
}
# legend labels
if (length(styles$labels) > 0) {
plt <- plt + styles$labels
}
plt
},
# Prints an overview of the object instance
print = function() {
cls <- class(self)[1]
# create a vector of dataset keys to display
if (!is.null(names(self$edat))) {
keys <- names(self$edat)
# default to numeric key if no character key exists
keys[is.na(keys)] <- seq_along(keys)[is.na(keys)]
}
# determine length to use for justification
key_format <- sprintf("%%%ds", max(nchar(keys)) + 1)
# entry output string
entry_template <- sprintf("= %s : %%s (%%d x %%d) %%s\n", key_format)
cat("=========================================\n")
cat("=\n")
cat(sprintf("= %s (n=%d)\n", cls, length(self$edat)))
cat("=\n")
for (i in seq_along(self$edat)) {
ds <- self$edat[[i]]$dat
# print dataset entry
missing_flag <- ifelse(sum(is.na(ds)) > 0, '[M]', '')
cat(sprintf(entry_template, keys[i], class(ds), nrow(ds), ncol(ds), missing_flag))
}
cat("=\n")
if (length(self$annotations) > 0) {
cat("= Annotations:\n")
cat("=\n")
for (annot_name in names(self$annotations)) {
for (annot_keytype in names(self$annotations[[annot_name]])) {
annot <- self$annotations[[annot_name]][[annot_keytype]]
cat(sprintf("= - %s (%d annotations, %d genes)\n", annot_name,
length(unique(annot[, 1])),
length(unique(annot[, 2]))))
}
}
cat("=\n")
}
cat("=========================================\n")
},
# Plots multidataset correlation heatmap
#
# @param key1 Numeric or character index of first dataset to use
# @param key2 Numeric or character index of second dataset to use
# @param meas Correlation meas to use
#
plot_cross_cor_heatmap = function(key1=1, key2=2, meas='pearson', new_key=NULL, interactive=TRUE) {
super$plot_cross_cor_heatmap(key1, key2, meas, new_key, interactive)
}
),
# ------------------------------------------------------------------------
# private
# ------------------------------------------------------------------------
private = list(
# Verifies that input data is an acceptable format.
check_input = function(dat) {
if (!is.matrix(dat) && (class(dat) != 'ExpressionSet')) {
stop("Invalid input for BioEDADataSet: dat must be a matrix or ExpressionSet.")
}
},
# load gene set gmt file
load_gmt = function(annot_file=NULL) {
# check for valid annotation filepath
if (is.null(annot_file)) {
stop("Filepath to annotation gmt file must be provided.")
} else if (!file.exists(annot_file)) {
stop("Invalid filepath for annotations specified.")
}
mapping <- private$parse_gmt(annot_file)
colnames(mapping) <- c('annotation', 'gene')
rownames(mapping) <- NULL
# exclude any annotations with only a single gene (not interesting...)
mask <- mapping$annotation %in% names(which(table(mapping$annotation) > 1))
mapping <- mapping[mask, ]
# discard unused factor levels and return result
mapping$annotation <- factor(mapping$annotation)
mapping
},
load_tsv = function(infile, annot_field, gene_field, exclude_annotations=NULL) {
# load annotation tab-delimited file
mapping <- read.delim(infile, sep = '\t', header = TRUE, stringsAsFactors = FALSE)
mapping <- mapping[, c(annot_field, gene_field)]
# for some annotation sources (e.g. CPDB), there are multiple entries for the same annotation
# name, each with a different list of genes; for now, arbitrarily choose one of the mappings
# to use
mapping <- mapping[!duplicated(mapping[, annot_field]), ]
# exclude specific annotations, if requested
if (!is.null(exclude_annotations)) {
mapping <- mapping[!mapping[, annot_field] %in% exclude_annotations, ]
}
# convert to an n x 2 mapping of (annotation, gene) pairs
mapping_list <- apply(mapping, 1, function(x) {
cbind(x[annot_field], unlist(strsplit(x[gene_field], ',')))
})
mapping <- as.data.frame(do.call('rbind', mapping_list))
colnames(mapping) <- c('annotation', 'gene')
rownames(mapping) <- NULL
# exclude any annotations with only a single gene (not that interesting..)
mask <- mapping$annotation %in% names(which(table(mapping$annotation) > 1))
mapping <- mapping[mask, ]
# discard unused factor levels
mapping$annotation <- factor(mapping$annotation)
mapping
},
# maps gene identifiers for a specified annotation mapping
map_gene_ids = function(annot_mapping, from, to) {
# annotation mapping indices
GID_IND <- 2
# map identifier names, if needed
if (from == 'entrez-gene') {
from <- 'entrez'
} else if (from == 'hgnc-symbol') {
from <- 'symbol'
}
if (to == 'entrez-gene') {
to <- 'entrez'
} else if (to == 'hgnc-symbol') {
to <- 'symbol'
}
# mapping gene identifiers and return result
ind <- match(annot_mapping[, GID_IND], annotables::grch37[, from, drop = TRUE])
annot_mapping$gene <- annotables::grch37[, to, drop = TRUE][ind]
annot_mapping
},
parse_gmt = function(infile) {
# determine maximum number columns
max_cols <- max(count.fields(infile))
# read in table, filling empty cells with NA's
gmt <- read.delim(infile, sep = '\t', header = FALSE,
col.names = paste0("gene_", seq_len(max_cols)),
fill = TRUE, stringsAsFactors = FALSE)
# fix column names
colnames(gmt)[1:2] <- c('annotation', 'source')
# convert to an n x 2 (annotation, gene) mapping
res <- do.call('rbind', apply(gmt, 1, function(entry) {
# gene gene id's starting in third column
gids <- entry[3:length(entry)]
gids <- gids[!is.na(gids)]
data.frame(annotation = entry[1], gene = gids, row.names=NULL)
}))
# TODO: Don't assume entrez ids!!!
res$gene <- as.numeric(as.character(res$gene))
# return dataframe
res
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.