#' Perform UMAP dimensionality reduction and HDBSCAN clustering on copy number data
#'
#' This function takes copy number data, performs UMAP dimensionality reduction,
#' and then applies HDBSCAN clustering to identify cell populations. It can handle
#' both standard copy number data and haplotype-specific copy number (HSCN) data.
#'
#' @param CNbins A data frame containing copy number data. Must include columns
#' for 'cell_id' and the specified `field`.
#' @param n_neighbors Integer. The number of neighbors to consider in UMAP. Default is 10.
#' @param min_dist Numeric. The minimum distance between points in UMAP. Default is 0.1.
#' @param minPts Integer. The minimum number of points to form a cluster in HDBSCAN. Default is 30.
#' @param seed Integer or NULL. Random seed for reproducibility. Default is NULL.
#' @param field Character. The column name in `CNbins` to use for copy number values. Default is "copy".
#' @param umapmetric Character. The distance metric to use in UMAP. Default is "correlation".
#' @param hscn Logical. Whether to use haplotype-specific copy number data. Default is FALSE.
#' @param pca Integer or NULL. Number of principal components to use in UMAP. If NULL, pca not used, this is the default.
#'
#'
#' @return A list containing:
#' \item{clustering}{A data frame with UMAP coordinates and cluster assignments for each cell.}
#' \item{hdbscanresults}{The results of the HDBSCAN clustering.}
#' \item{umapresults}{The results of the UMAP dimensionality reduction.}
#' \item{tree}{A phylogenetic tree object representing the hierarchical structure of the clusters.}
#'
#' @details
#' The function performs the following steps:
#' 1. Creates a copy number matrix from the input data.
#' 2. Applies UMAP dimensionality reduction.
#' 3. Performs HDBSCAN clustering on the UMAP results.
#' 4. Generates a phylogenetic tree from the clustering results.
#'
#' If `hscn` is TRUE, the function expects columns 'copy' and 'BAF' in `CNbins`,
#' and creates separate matrices for A and B alleles.
#'
#' The function automatically adjusts `n_neighbors` if there are too few cells.
#' If UMAP fails, it attempts to rerun with small jitter added to the data points.
#' The function will reduce `minPts` if only one cluster is initially found.
#'
#' @export
umap_clustering <- function(CNbins,
n_neighbors = 10,
min_dist = 0.1,
minPts = 30,
seed = NULL,
field = "copy",
umapmetric = "correlation",
hscn = FALSE,
pca = NULL) {
if (length(unique(CNbins$cell_id)) < n_neighbors) {
n_neighbors <- length(unique(CNbins$cell_id)) - 1
}
minPts <- max(minPts, 2)
message("Creating CN matrix...")
if (hscn){
CNbins$A <- CNbins$copy * CNbins$BAF
CNbins$B <- CNbins$copy * (1 - CNbins$BAF)
cnmatrixA <- createCNmatrix(CNbins, fillna = TRUE, field = "A")
cnmatrixA <- subset(cnmatrixA, select = -c(chr, start, end, width))
cnmatrixA <- t(cnmatrixA)
cnmatrixA[!is.finite(cnmatrixA)] <- 0 # remove non finite values
cnmatrixB <- createCNmatrix(CNbins, fillna = TRUE, field = "B")
cnmatrixB <- subset(cnmatrixB, select = -c(chr, start, end, width))
cnmatrixB <- t(cnmatrixB)
cnmatrixB[!is.finite(cnmatrixB)] <- 0 # remove non finite values
colnames(cnmatrixA) <- paste0(colnames(cnmatrixA), "_A")
colnames(cnmatrixB) <- paste0(colnames(cnmatrixB), "_B")
cnmatrix <- cbind(cnmatrixA, cnmatrixB)
} else{
cnmatrix <- createCNmatrix(CNbins, fillna = TRUE, field = field)
cnmatrix <- subset(cnmatrix, select = -c(chr, start, end, width))
cnmatrix <- t(cnmatrix)
cnmatrix[!is.finite(cnmatrix)] <- 0 # remove non finite values
}
if (!is.null(seed)) {
set.seed(seed)
}
message("Calculating UMAP dimensionality reduction...")
if (nrow(cnmatrix) > 500 & is.null(seed) & !is.null(pca)) {
pca <- min(pca, ncol(cnmatrix))
fast_sgd <- TRUE
} else {
pca <- NULL
fast_sgd <- FALSE
}
# umapresults <- uwot::umap(cnmatrix,
# metric = umapmetric,
# n_neighbors = n_neighbors,
# n_components = 2,
# min_dist = min_dist,
# ret_model = TRUE,
# ret_nn = TRUE,
# pca = pca,
# fast_sgd = fast_sgd
# )
#TODO find out why umap gives an error for some cases, seems to be a new bug
umapresults <- tryCatch(
{
umapresults <- uwot::umap(cnmatrix,
metric = umapmetric,
n_neighbors = n_neighbors,
n_components = 2,
min_dist = min_dist,
ret_model = TRUE,
ret_nn = TRUE,
pca = pca,
fast_sgd = fast_sgd,
pca_method = "svdr")
},
error = function(e) {
# Handle error by rerunning UMAP with different parameters
message("An error occurred in umap calculation: ", e$message)
message("Rerunning UMAP after adding small jitter to data points...")
mat <- cnmatrix + matrix(runif(nrow(cnmatrix) * ncol(cnmatrix),
min=-0.005, max=0.005),
nrow=nrow(cnmatrix), ncol=ncol(cnmatrix))
umapresults <- uwot::umap(mat,
metric = umapmetric,
n_neighbors = n_neighbors,
n_components = 2,
min_dist = min_dist,
ret_model = TRUE,
ret_nn = TRUE,
pca = pca,
fast_sgd = fast_sgd,
pca_method = "svdr")
}
)
dfumap <- data.frame(
umap1 = umapresults$embedding[, 1],
umap2 = umapresults$embedding[, 2],
cell_id = row.names(cnmatrix)
)
dfumap$umap1 <- unlist(lapply(dfumap$umap1, function(x) ifelse(!is.finite(x), 0.0, x))) # remove non finite values
dfumap$umap2 <- unlist(lapply(dfumap$umap2, function(x) ifelse(!is.finite(x), 0.0, x))) # remove non finite values
rownames(dfumap) <- row.names(cnmatrix)
message("Clustering cells using hdbscan...")
gentree <- FALSE
while (gentree == FALSE) {
if (!is.null(seed)) {
set.seed(seed)
}
hdbscanresults <- try(dbscan::hdbscan(dfumap[, 1:2],
minPts = minPts,
gen_hdbscan_tree = FALSE,
gen_simplified_tree = FALSE
))
if (class(hdbscanresults) == "try-error") {
message("Only 1 cluster found, reducing minPts size by 10...")
minPts <- round(minPts - 10)
if (minPts <= 0) {
message("Only 1 cluster can be found")
gentree <- TRUE
}
message(paste0("Cluster size = ", minPts))
} else {
gentree <- TRUE
}
}
clusterids <- hdbscanresults$cluster
clusterids[clusterids == 0] <- 702
LETTERS702 <- c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS)))
dfumap$clone_id <- LETTERS702[clusterids]
dfumap <- dfumap %>%
dplyr::mutate(clone_id = ifelse(clone_id == "ZZ", "0", clone_id))
message(paste0("Identified ", length(unique(dfumap$clone_id)), " clusters"))
message("Distribution of clusters:")
f <- table(dfumap$clone_id)
for (cl in sort(unique(dfumap$clone_id))) {
message(paste0(" Cluster ", cl, ": ", f[[cl]]))
}
if (length(unique(dfumap$clone_id)) == 1) {
dfumap$clone_id <- "A"
}
tree <- ape::as.phylo(hdbscanresults$hc, use.labels = TRUE)
tree$tip.label <- row.names(cnmatrix)[as.numeric(tree$tip.label)]
return(list(
clustering = dfumap,
hdbscanresults = hdbscanresults,
umapresults = umapresults,
tree = tree
))
}
#' @export
umap_clustering_breakpoints <- function(CNbins,
n_neighbors = 10,
min_dist = 0.1,
minPts = 30,
seed = 1,
field = "state",
internalonly = TRUE,
use_state = FALSE,
state_remove = 2,
fixjitter = TRUE) {
if (length(unique(CNbins$cell_id)) < n_neighbors) {
n_neighbors <- length(unique(CNbins$cell_id)) - 1
}
minPts <- max(minPts, 2)
message("Creating breakpoint matrix...")
segs <- create_segments(CNbins, field = field)
segs_matrix <- createbreakpointmatrix(segs,
transpose = TRUE,
internalonly = internalonly,
use_state = use_state,
state_remove = state_remove,
fixjitter = fixjitter
)
segs_matrix <- segs_matrix$bps
if (!is.null(seed)) {
set.seed(seed)
}
message("Calculating UMAP dimensionality reduction...")
if (nrow(segs_matrix) > 500 & is.null(seed)) {
fast_sgd <- TRUE
} else {
fast_sgd <- FALSE
}
umapresults <- uwot::umap(segs_matrix,
metric = "hamming",
n_neighbors = n_neighbors,
n_components = 2,
min_dist = min_dist,
ret_model = TRUE,
ret_nn = TRUE,
fast_sgd = fast_sgd
)
dfumap <- data.frame(
umap1 = umapresults$embedding[, 1],
umap2 = umapresults$embedding[, 2],
cell_id = row.names(segs_matrix)
)
rownames(dfumap) <- row.names(segs_matrix)
message("Clustering cells using hdbscan...")
gentree <- FALSE
while (gentree == FALSE) {
if (!is.null(seed)) {
set.seed(seed)
}
hdbscanresults <- try(dbscan::hdbscan(dfumap[, 1:2],
minPts = minPts,
gen_simplified_tree = FALSE
))
if (class(hdbscanresults) == "try-error") {
message("Only 1 cluster found, reducing minPts size by 10...")
minPts <- round(minPts - 10)
message(paste0("Cluster size = ", minPts))
if (minPts < 0) {
message("Only 1 cluster can be found")
gentree <- TRUE
}
} else {
gentree <- TRUE
}
}
clusterids <- hdbscanresults$cluster
clusterids[clusterids == 0] <- 702
LETTERS702 <- c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS)))
dfumap$clone_id <- LETTERS702[clusterids]
dfumap <- dfumap %>%
dplyr::mutate(clone_id = ifelse(clone_id == "ZZ", "0", clone_id))
if (length(unique(dfumap$clone_id)) == 1) {
dfumap$clone_id <- "A"
}
tree <- ape::as.phylo(hdbscanresults$hc, use.labels = TRUE)
tree$tip.label <- row.names(segs_matrix)[as.numeric(tree$tip.label)]
message(paste0("Identified ", length(unique(dfumap$clone_id)), " clusters"))
message("Distribution of clusters:")
f <- table(dfumap$clone_id)
for (cl in sort(unique(dfumap$clone_id))) {
message(paste0(" Cluster ", cl, ": ", f[[cl]]))
}
return(list(
clustering = dfumap,
hdbscanresults = hdbscanresults,
umapresults = umapresults,
tree = tree
))
}
#' @export
hc_clustering <- function() {
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.