#' @include sigmas.r
#' @include s4-unions.r
#' @useDynLib destiny
NULL
sigma_msg <- function(sigma) sprintf(
    "The sigma parameter needs to be NULL, 'local', 'global', numeric or a %s object, not a %s.",
    sQuote('Sigmas'), sQuote(class(sigma)))
#' Create a diffusion map of cells
#'
#' The provided data can be a double \link[base]{matrix} of expression data or a \link[base]{data.frame} with all non-integer (double) columns
#' being treated as expression data features (and the others ignored), an \link[Biobase:class.ExpressionSet]{ExpressionSet}, or a \link[SingleCellExperiment]{SingleCellExperiment}.
#'
#' @param data Expression data to be analyzed and covariates. Provide \code{vars} to select specific columns other than the default: all double value columns.
#' If \code{distance} is a distance matrix, \code{data} has to be a \code{\link{data.frame}} with covariates only.
#' @param sigma Diffusion scale parameter of the Gaussian kernel. One of \code{'local'}, \code{'global'}, a (\link[base]{numeric}) global sigma or a \link{Sigmas} object.
#' When choosing \code{'global'}, a global sigma will be calculated using \code{\link{find_sigmas}}. (Optional, default: \code{'local'})
#' A larger sigma might be necessary if the eigenvalues cannot be found because of a singularity in the matrix.
#' @param k Number of nearest neighbors to consider (default: a guess between 100 and \eqn{n - 1}. See \code{\link{find_dm_k}}).
#' @param n_eigs Number of eigenvectors/values to return (default: 20)
#' @param density_norm logical. If TRUE, use density normalisation
#' @param ... Unused. All parameters to the right of the \code{...} have to be specified by name (e.g. \code{DiffusionMap(data, distance = 'cosine')})
#' @param distance Distance measurement method applied to \code{data} or a distance matrix/\code{\link[stats]{dist}}. For the allowed values, see \code{\link{find_knn}}.
#' If this is a \code{\link[Matrix]{sparseMatrix}}, zeros are interpreted as "not a close neighbor",
#' which allows the use of kNN-sparsified matrices (see the return value of \code{\link{find_knn}}).
#' @param n_pcs Number of principal components to compute to base calculations on. Using e.g. 50 PCs results in more regular looking diffusion maps.
#' The default NULL will not compute principal components, but use \code{reducedDims(data, 'pca')} if present. Set to NA to suppress using PCs.
#' @param n_local If \code{sigma == 'local'}, the \code{n_local}th nearest neighbor(s) determine(s) the local sigma
#' @param rotate logical. If TRUE, rotate the eigenvectors to get a slimmer diffusion map
#' @param censor_val Value regarded as uncertain. Either a single value or one for every dimension (Optional, default: no censoring)
#' @param censor_range Uncertainty range for censoring (Optional, default: none). A length-2-vector of certainty range start and end. TODO: also allow \eqn{2\times G} matrix
#' @param missing_range Whole data range for missing value model. Has to be specified if NAs are in the data
#' @param vars Variables (columns) of the data to use. Specifying NULL will select all columns (default: All floating point value columns)
#' @param knn_params Parameters passed to \code{\link{find_knn}}
#' @param verbose Show a progress bar and other progress information (default: do it if censoring is enabled)
#' @param suppress_dpt Specify TRUE to skip calculation of the (memory-intensive) information necessary for \code{\link{DPT}} in the returned object (default: FALSE)
#'
#' @return A DiffusionMap object:
#'
#' @slot eigenvalues Eigenvalues ranking the eigenvectors
#' @slot eigenvectors Eigenvectors mapping the datapoints to \code{n_eigs} dimensions
#' @slot sigmas \link{Sigmas} object with either information about the \link{find_sigmas} heuristic run or just the local or global \link{optimal_sigma}.
#' @slot data_env Environment referencing the data used to create the diffusion map
#' @slot eigenvec0 First (constant) eigenvector not included as diffusion component.
#' @slot transitions Transition probabilities. Can be NULL
#' @slot d Density vector of transition probability matrix
#' @slot d_norm Density vector of normalized transition probability matrix
#' @slot k The k parameter for kNN
#' @slot n_pcs Number of principal components used in kNN computation (NA if raw data was used)
#' @slot n_local The \code{n_local}th nearest neighbor(s) is/are used to determine local kernel density
#' @slot density_norm Was density normalization used?
#' @slot rotate Were the eigenvectors rotated?
#' @slot distance Distance measurement method used
#' @slot censor_val Censoring value
#' @slot censor_range Censoring range
#' @slot missing_range Whole data range for missing value model
#' @slot vars Vars parameter used to extract the part of the data used for diffusion map creation
#' @slot knn_params Parameters passed to \code{\link{find_knn}}
#'
#' @seealso \link{DiffusionMap methods} to get and set the slots. \code{\link{find_sigmas}} to pre-calculate a fitting global \code{sigma} parameter
#'
#' @examples
#' data(guo)
#' DiffusionMap(guo)
#' DiffusionMap(guo, 13, censor_val = 15, censor_range = c(15, 40), verbose = TRUE)
#'
#' covars <- data.frame(covar1 = rep(letters[1:4], each = 25))
#' dists <- dist(matrix(rnorm(100*10), 100))
#' DiffusionMap(covars, distance = dists)
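#'
#' # Sketch (not from the original examples): reusing a precomputed global sigma.
#' # Passing a Sigmas object from find_sigmas() as `sigma` is assumed as documented above.
#' sigs <- find_sigmas(guo, verbose = FALSE)
#' DiffusionMap(guo, sigs)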
#'
#' @importFrom methods setClass validObject
#' @rdname DiffusionMap-class
#' @export
setClass(
    'DiffusionMap',
    slots = c(
        eigenvalues = 'numeric',
        eigenvectors = 'matrix',
        sigmas = 'Sigmas',
        data_env = 'environment',
        eigenvec0 = 'numeric',
        transitions = 'dMatrixOrNULL',
        d = 'numeric',
        d_norm = 'numeric',
        k = 'integer',
        n_pcs = 'integer',
        n_local = 'integer',
        density_norm = 'logical',
        rotate = 'logical',
        distance = 'character',
        censor_val = 'numericOrNULL',
        censor_range = 'numericOrNULL',
        missing_range = 'numericOrNULL',
        vars = 'characterOrnumericOrNULL',
        knn_params = 'list'),
    validity = function(object) {
        # don't use getters, as this function is used in them!
        if (!is.vector(object@eigenvalues) || !is.numeric(object@eigenvalues))
            'eigenvalues(dm) has to be a numeric vector'
        else if (!is.matrix(object@eigenvectors) || !is.numeric(object@eigenvectors))
            'eigenvectors(dm) has to be a numeric matrix'
        else if (length(object@eigenvalues) != ncol(object@eigenvectors))
            'There must be exactly one eigenvalue per eigenvector: length(eigenvalues(dm)) == ncol(eigenvectors(dm))'
        else if (!isTRUE(validObject(object@sigmas, test = TRUE)))
            paste('sigmas invalid:', validObject(object@sigmas, test = TRUE))
        #TODO: validate data_env (data, accumulated_transitions) and eigenvec0
        else if (length(object@d) != nrow(object@eigenvectors))
            'd must be as long as each eigenvector'
        else if (length(object@k) != 1)
            'k must be a number'
        else if (length(object@density_norm) != 1)
            'density_norm must be TRUE or FALSE'
        else if (length(object@rotate) != 1)
            'rotate must be TRUE or FALSE'
        else if (!(object@distance %in% c('euclidean', 'cosine', 'rankcor', 'l2', 'custom')))
            'distance must be "euclidean", "cosine", "rankcor", "l2", or "custom"'
        else if (is.null(object@censor_val) != is.null(object@censor_range))
            'Both censor_val and censor_range either need to be NULL or not'
        else if (!is.null(object@censor_val) && length(object@censor_val) != 1)
            'censor_val must be a number'
        else if (!is.null(object@censor_range) && (length(object@censor_range) != 2 || diff(object@censor_range) <= 0))
            'censor_range must be an increasing range (two numbers, the left one being smaller)'
        else if (!is.null(object@missing_range) && (length(object@missing_range) != 2 || diff(object@missing_range) <= 0))
            'missing_range must be an increasing range (two numbers, the left one being smaller)'
        #TODO: validate vars
        else TRUE
    })
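# Informal usage sketch (comments only): slots of the class above are meant to be
# read through the accessors documented under `DiffusionMap methods`, e.g.
#   dm <- DiffusionMap(guo)
#   eigenvalues(dm)          # ranking of the diffusion components
#   eigenvectors(dm)[, 1:2]  # first two DCs
#   optimal_sigma(dm)        # global sigma or per-cell local sigmas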
#' @importFrom methods new as is
#' @importFrom Matrix Diagonal colSums rowSums t
#' @importFrom VIM hotdeck
#' @importFrom SingleCellExperiment reducedDimNames reducedDim<-
#' @rdname DiffusionMap-class
#' @export
DiffusionMap <- function(
    data = stopifnot_distmatrix(distance),
    sigma = 'local',
    k = find_dm_k(dataset_n_observations(data, distance) - 1L),
    n_eigs = min(20L, dataset_n_observations(data, distance) - 2L),
    density_norm = TRUE,
    ...,
    distance = c('euclidean', 'cosine', 'rankcor', 'l2'),
    n_pcs = NULL,
    n_local = seq(to = min(k, 7L), length.out = min(k, 3L)),
    rotate = FALSE,
    censor_val = NULL, censor_range = NULL,
    missing_range = NULL,
    vars = NULL,
    knn_params = list(),
    verbose = !is.null(censor_range),
    suppress_dpt = FALSE
) {
    # make sure those promises are resolved before we mess with `data`
    force(k)
    force(n_eigs)
    chkDots(...)
    if (is.null(sigma) || !is(sigma, 'Sigmas') && isTRUE(is.na(sigma)))
        sigma <- 'local'
    if (!is(sigma, 'Sigmas') && !(length(sigma) == 1L && sigma %in% c('local', 'global')) && !is.numeric(sigma))
        stop(sigma_msg(sigma))
    if (identical(sigma, 'local') && any(n_local > k))
        stop('For local sigma, all entries of n_local (', paste(n_local, collapse = ','), ') have to be \u2264 k (', k, ')')
    # store away data and continue using imputed, unified version
    data_env <- new.env(parent = .GlobalEnv)
    if (is_distmatrix(distance)) {
        if (!(is.data.frame(data) || is.null(data))) stop('If you provide a matrix for `distance`, `data` has to be NULL or a covariate `data.frame`, but is of class ', class(data))
        if (!is.null(n_pcs)) stop('If you provide a matrix for `distance`, `n_pcs` has to be NULL')
        data_env$data <- if (is.null(data)) distance else data  # put covariates or distance
        dists <- as(distance, 'symmetricMatrix')
        if (!is.null(rownames(data_env$data))) rownames(dists) <- colnames(dists) <- rownames(data_env$data)
        distance <- 'custom'
        imputed_data <- data_or_pca <- NULL
        n <- nrow(dists)
    } else {
        dists <- NULL
        distance <- match.arg(distance)
        data_env$data <- data
        data <- dataset_extract_doublematrix(data, vars)
        imputed_data <- data
        if (anyNA(imputed_data))
            imputed_data <- as.matrix(hotdeck(data, imp_var = FALSE))
        n <- nrow(imputed_data)
        # PCA
        pca <- get_pca(imputed_data, data_env$data, n_pcs, verbose)
        if (is.null(pca) && ncol(imputed_data) > 500L) {
            warning('You have ', ncol(imputed_data), ' genes. Consider passing e.g. n_pcs = 50 to speed up computation.')
        }
        if (is.null(pca)) {
            n_pcs <- NA_integer_
            data_or_pca <- imputed_data
        } else {
            n_pcs <- ncol(pca)
            data_or_pca <- pca
            # Update PCA in SCE if there was none
            if (is(data_env$data, 'SingleCellExperiment') && !('pca' %in% reducedDimNames(data_env$data)))
                reducedDim(data_env$data, 'pca') <- pca
        }
    }
    # arg validation
    if (n <= n_eigs + 1L) stop('Eigen decomposition not possible if n \u2264 n_eigs+1 (And ', n, ' \u2264 ', n_eigs + 1L, ')')
    if (is.null(k) || is.na(k)) k <- n - 1L
    #TODO: optimize case
    #dense <- k == n - 1L
    if (k >= n) stop(sprintf('k has to be < nrow(data) (And %s \u2265 nrow(data))', k))
    censor <- test_censoring(censor_val, censor_range, imputed_data, missing_range)
    if (censor && !identical(distance, 'euclidean')) stop('censoring model only valid with euclidean distance')
    knn <- get_knn(data_or_pca, dists, k, distance, knn_params, verbose)  # use dists if given, else compute from pca if available, else from data
    sigmas <- get_sigmas(imputed_data, knn$dist, sigma, n_local, distance, censor_val, censor_range, missing_range, vars, verbose)
    sigma <- optimal_sigma(sigmas)  # single number = global, multiple = local
    trans_p <- transition_probabilities(imputed_data, sigma, knn$dist_mat, censor, censor_val, censor_range, missing_range, verbose)
    rm(knn)  # free memory
    d <- rowSums(trans_p, na.rm = TRUE) + 1  # diagonal set to 1
    # normalize by density if requested
    norm_p <- get_norm_p(trans_p, d, d, density_norm)
    rm(trans_p)  # free memory
    d_norm <- rowSums(norm_p)
    # calculate the inverse of a diagonal matrix by inverting the diagonal
    d_rot <- Diagonal(x = d_norm ^ -.5)
    transitions <- as(d_rot %*% norm_p %*% d_rot, 'symmetricMatrix')
    rm(norm_p)  # free memory
    eig_transitions <- decomp_transitions(transitions, n_eigs + 1L, verbose)
    eig_vec <- eig_transitions$vectors
    eig_val <- eig_transitions$values
    if (rotate) eig_vec <- as.matrix(t(t(eig_vec) %*% d_rot))
    rownames(eig_vec) <-
        rownames(transitions) <-
        colnames(transitions) <-
        rownames(if (!is.null(rownames(dists))) dists else data)
    colnames(eig_vec) <-
        names(eig_val) <-
        paste0('DC', seq(0, n_eigs))
    new(
        'DiffusionMap',
        eigenvalues = eig_val[-1],
        eigenvectors = eig_vec[, -1, drop = FALSE],
        sigmas = sigmas,
        data_env = data_env,
        eigenvec0 = eig_vec[, 1],
        transitions = if (suppress_dpt) NULL else transitions,
        d = d,
        d_norm = d_norm,
        k = as.integer(k),
        n_pcs = as.integer(n_pcs),
        n_local = as.integer(n_local),
        rotate = rotate,
        density_norm = density_norm,
        distance = distance,
        censor_val = censor_val,
        censor_range = censor_range,
        missing_range = missing_range,
        vars = vars,
        knn_params = knn_params)
}
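# Informal overview of the steps in DiffusionMap() above:
#   1. extract numeric data / PCs                    -> data_or_pca
#   2. kNN search (or user-supplied distance matrix) -> knn / dists
#   3. Gaussian kernel with global or local sigma    -> trans_p
#   4. optional density normalisation                -> norm_p  (W_ij = T_ij / (d_i * d_j))
#   5. symmetric conditioning D^-1/2 W D^-1/2        -> transitions
#   6. partial eigendecomposition                    -> DC1..DC<n_eigs> (the trivial DC0 is kept as @eigenvec0)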
is_distmatrix <- function(distance) {
    if (is.character(distance))
        FALSE
    else if ((is.matrix(distance) && diff(dim(distance)) == 0L) || is(distance, 'dist') || is(distance, 'symmetricMatrix'))
        TRUE
    else
        stop(
            '`distance` needs to be a distance measure or a square matrix, but is a ',
            class(distance), ' of dim ', nrow(distance), ' by ', ncol(distance))
}
stopifnot_distmatrix <- function(distance) {
    if (is_distmatrix(distance)) NULL
    else stop('`data` needs to be set if `distance` is a distance measure')
}
#' @importFrom methods new is
get_sigmas <- function(imputed_data, nn_dists, sigma, n_local, distance = 'euclidean', censor_val = NULL, censor_range = NULL, missing_range = NULL, vars = NULL, verbose = FALSE) {
    unspecified_local <- identical(sigma, 'local')
    if (unspecified_local || is.numeric(sigma)) {
        if (unspecified_local) {
            sig_mat <- nn_dists[, n_local, drop = FALSE]
            sigma <- rowSums(sig_mat) / length(n_local) / 2
        }
        new('Sigmas',
            log_sigmas = NULL,
            dim_norms = NULL,
            optimal_sigma = sigma,
            optimal_idx = NULL,
            avrd_norms = NULL)
    } else if (identical(sigma, 'global')) {
        if (!identical(distance, 'euclidean'))
            stop(sprintf('You have to use euclidean distances with sigma estimation, not %s.', sQuote(distance)))
        find_sigmas(
            imputed_data,
            distance = distance,
            censor_val = censor_val,
            censor_range = censor_range,
            missing_range = missing_range,
            vars = vars,
            verbose = verbose)
    } else if (is(sigma, 'Sigmas')) {
        sigma
    } else {
        stop(sigma_msg(sigma))
    }
}
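# Sketch of the 'local' branch above (informal): with the default n_local = 5:7,
# each cell i gets sigma_i = mean(nn_dists[i, n_local]) / 2, i.e. half the average
# distance to its 5th-7th nearest neighbours.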
#' @importFrom methods hasMethod
#' @importFrom SingleCellExperiment reducedDimNames reducedDim
get_pca <- function(data_mat, data_raw, n_pcs, verbose = FALSE) {
    stopifnot(is.null(n_pcs) || length(n_pcs) == 1L)
    # If we suppress PCA computation, return NULL
    if (isTRUE(is.na(n_pcs))) return(NULL)
    # get PCs from SingleCellExperiment if possible
    existing_pca <-
        if (hasMethod('reducedDim', class(data_raw)) && 'pca' %in% reducedDimNames(data_raw))
            reducedDim(data_raw, 'pca')
    if (!is.null(existing_pca)) {
        if (is.null(n_pcs) || n_pcs == ncol(existing_pca)) {
            if (verbose) cat('Using reducedDim(data, "pca") to compute distances\n')
            return(existing_pca)
        } else if (n_pcs < ncol(existing_pca)) {
            warning('Specified n_pcs < ncol(reducedDim(data, "pca")), using subset')
            return(existing_pca[, seq_len(n_pcs), drop = FALSE])
        } else {  # n_pcs > ncol(existing_pca)
            warning('Specified n_pcs > ncol(reducedDim(data, "pca")), recalculating PCA')
            rm(existing_pca)
        }
    } else if (is.null(n_pcs)) {
        # If n_pcs is NULL, data would need to have a PCA
        return(NULL)
    }
    # No PCA in data or requested more PCs
    if (ncol(data_mat) < n_pcs) stop('Cannot compute ', n_pcs, ' PCs from data with ', ncol(data_mat), ' columns.')
    pca_scores(data_mat, n_pcs)
}
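# Behaviour summary of get_pca() (informal):
#   n_pcs = NULL -> use reducedDim(data, 'pca') if present, otherwise skip PCA
#   n_pcs = NA   -> never use PCs, compute distances on the (imputed) raw data
#   n_pcs = 50   -> reuse/truncate an existing PCA with >= 50 PCs, otherwise recompute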
get_knn <- function(imputed_data, dists, k, distance = 'euclidean', knn_params = list(), verbose = FALSE) {
    stopifnot(is.null(imputed_data) != is.null(dists))
    if (!is.null(dists)) {
        nn_dist <- t(apply(dists, 1, function(row) sort(row)[2:k]))
        list(dist = nn_dist, dist_mat = dists)
    } else tryCatch({
        verbose_timing(verbose, 'finding knns', do.call(find_knn, c(list(imputed_data, k, distance = distance), knn_params)))
    }, error = function(e) stop('Could not call find_knn. Consider specifying `knn_params = list(M = <larger number>)`. Original error:\n', e$message, call. = FALSE))
}
#' @importFrom Matrix sparseMatrix diag<- drop0 forceSymmetric skewpart symmpart
#' @importFrom utils txtProgressBar setTxtProgressBar
transition_probabilities <- function(imputed_data, sigma, dists, censor, censor_val, censor_range, missing_range, verbose) {
    n <- nrow(dists)
    # create markovian transition probability matrix (trans_p)
    cb <- if (verbose) {
        pb <- txtProgressBar(1, n, style = 3)
        function(i) setTxtProgressBar(pb, i)
    } else invisible
    # initialize trans_p
    trans_p <- verbose_timing(verbose, 'Calculating transition probabilities', {
        if (censor)
            censoring(imputed_data, sigma, dists, censor_val, censor_range, missing_range, cb)
        else
            no_censoring(dists, sigma, cb)
    })
    if (verbose) close(pb)
    #nnzero
    # normalize trans_p and only retain inter-cell transitions (zero out the diagonal)
    diag(trans_p) <- 0
    trans_p <- drop0(trans_p)
    stopifnot(is(trans_p, 'symmetricMatrix'))
    trans_p
}
#' @importFrom Matrix sparseMatrix which tcrossprod
no_censoring <- function(dists, sigma, cb = invisible) {
    d2 <- dists ^ 2
    stopifnot(isSymmetric(d2))
    t_p <- if (length(sigma) == 1L) {
        exp(-d2@x / (2 * sigma ^ 2))
    } else {
        stopifnot(d2@uplo == 'U')
        coords <- as(d2, 'dsTMatrix')
        i <- coords@i + 1L
        j <- coords@j + 1L
        sig2 <- sigma ^ 2
        S1 <- sigma[i] * sigma[j]
        S2 <- sig2[i] + sig2[j]
        sqrt(2 * S1 / S2) * exp(-d2@x / S2)
    }
    sparseMatrix(d2@i, p = d2@p, x = t_p, dims = dim(d2), symmetric = TRUE, index1 = FALSE)
}
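# Kernel evaluated above (informal notation), only on the sparse kNN structure of `dists`:
#   global sigma:  T_ij = exp(-d_ij^2 / (2 * sigma^2))
#   local sigmas:  T_ij = sqrt(2 * s_i * s_j / (s_i^2 + s_j^2)) * exp(-d_ij^2 / (s_i^2 + s_j^2))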
#' @importFrom methods as
#' @importFrom Matrix sparseMatrix
get_norm_p <- function(trans_p, d, d_new, density_norm) {
    if (density_norm) {
        trans_p <- as(trans_p, 'dgTMatrix')  # use non-symmetric triples to operate on all values
        stopifsmall(max(trans_p@x, na.rm = TRUE))
        # creates a dgCMatrix
        sparseMatrix(trans_p@i, trans_p@j, x = trans_p@x / (d_new[trans_p@i + 1] * d[trans_p@j + 1]), dims = dim(trans_p), index1 = FALSE)
    } else {
        trans_p
    }
}
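# Density normalisation above (informal): W_ij = T_ij / (d_i * d_j), with d the row
# sums of T. This discounts local sampling density before DiffusionMap() applies the
# symmetric D^-1/2 W D^-1/2 conditioning.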
decomp_transitions <- function(transitions, n_eigs, verbose)
    verbose_timing(verbose, 'performing eigen decomposition', eig_decomp(transitions, n_eigs))
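# Note: DiffusionMap() requests n_eigs + 1 eigenpairs here because the leading
# eigenpair (eigenvalue 1) is the trivial/steady-state component; it is stored as
# @eigenvec0 and dropped from the reported diffusion components.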