# NOTE: stray non-R text ("Nothing", an HTML-scrape artifact) commented out so the file parses.
#' @title Subtyping multi-omics data
#' @description Perform subtyping using multiple types of data
#'
#' @param dataList a list of data matrices. Each matrix represents a data type where the rows are items and the columns are features. The matrices must have the same set of items.
#' @param kMax The maximum number of clusters used for automatically detecting the number of clusters in \code{PerturbationClustering}. This parameter is passed to \code{PerturbationClustering} and does not affect the final number of clusters in \code{SubtypingOmicsData}. Default value is \code{5}.
#' @param kMin The minimum number of clusters used for automatically detecting the number of clusters in \code{PerturbationClustering}. This parameter is passed to \code{PerturbationClustering} and does not affect the final number of clusters in \code{SubtypingOmicsData}. Default value is \code{2}.
#' @param k The number of clusters. If k is set then kMin and kMax will be ignored.
#' @param agreementCutoff agreement threshold to be considered consistent. Default value is \code{0.5}.
#' @param ncore Number of cores that the algorithm should use. Default value is \code{1}.
#' @param verbose set it to \code{TRUE} or \code{FALSE} to get more or less details respectively.
#' @param sampledSetSize The number of samples used for the subsampling process when the dataset is big. Default value is \code{2000}.
#' @param knn.k The value of k for the k-nearest neighbors algorithm. If knn.k is not set, the elbow method will be used to determine k.
#' @param ... these arguments will be passed to \code{PerturbationClustering} algorithm. See details for more information
#'
#' @details
#'
#' \code{SubtypingOmicsData} implements the multi-omics data subtyping approach based on the perturbation clustering algorithm of Nguyen et al (2017), Nguyen et al (2019) and Nguyen, et al. (2021).
#' The input is a list of data matrices where each matrix represents the molecular measurements of a data type. The input matrices must have the same number of rows.
#' \code{SubtypingOmicsData} aims to find the optimum number of subtypes and location of each sample in the clusters from integrated input data \code{dataList} through two processing stages:
#'
#' 1. Stage I: The algorithm first partitions each data type using the function \code{PerturbationClustering}.
#' It then merges the connectivities across data types into similarity matrices.
#' Both kmeans and similarity-based clustering algorithms - partitioning around medoids \code{pam} are used to partition the built similarity.
#' The algorithm returns the partitioning that agrees the most with individual data types.\cr
#' 2. Stage II: The algorithm attempts to split each discovered group if there is a strong agreement between data types,
#' or if the subtyping in Stage I is very unbalanced.
#'
#' When clustering a large number of samples, this function uses a subsampling technique to reduce the computational complexity with the two parameters \code{sampledSetSize} and \code{knn.k}. Please consult Nguyen et al. (2021) for details.
#'
#' @return
#'
#' \code{SubtypingOmicsData} returns a list with at least the following components:
#' \item{cluster1}{A vector of labels indicating the cluster to which each sample is allocated in Stage I}
#' \item{cluster2}{A vector of labels indicating the cluster to which each sample is allocated in Stage II}
#' \item{dataTypeResult}{A list of results for individual data type. Each element of the list is the result of the \code{PerturbationClustering} for the corresponding data matrix provided in dataList.}
#'
#'
#' @references
#'
#' 1. H Nguyen, S Shrestha, S Draghici, & T Nguyen. PINSPlus: a tool for tumor subtype discovery in integrated genomic data. Bioinformatics, 35(16), 2843-2846, (2019).
#'
#' 2. T Nguyen, R Tagett, D Diaz, S Draghici. A novel method for data integration and disease subtyping. Genome Research, 27(12):2025-2039, 2017.
#'
#' 3. T. Nguyen, "Horizontal and vertical integration of bio-molecular data", PhD thesis, Wayne State University, 2017.
#'
#' 4. H Nguyen, D Tran, B Tran, M Roy, A Cassell, S Dascalu, S Draghici & T Nguyen. SMRT: Randomized Data Transformation for Cancer Subtyping and Big Data Analysis. Frontiers in oncology. 2021.
#'
#' @seealso \code{\link{PerturbationClustering}}
#'
#' @examples
#' \donttest{
#' # Load the kidney cancer carcinoma data
#' data(KIRC)
#'
#' # Perform subtyping on the multi-omics data
#' dataList <- list (as.matrix(KIRC$GE), as.matrix(KIRC$ME), as.matrix(KIRC$MI))
#' names(dataList) <- c("GE", "ME", "MI")
#' result <- SubtypingOmicsData(dataList = dataList)
#'
#' # Change Pertubation clustering algorithm's arguments
#' result <- SubtypingOmicsData(
#' dataList = dataList,
#' clusteringMethod = "kmeans",
#' clusteringOptions = list(nstart = 50)
#' )
#'
#' # Plot the Kaplan-Meier curves and calculate Cox p-value
#' library(survival)
#' cluster1=result$cluster1;cluster2=result$cluster2
#' a <- intersect(unique(cluster2), unique(cluster1))
#' names(a) <- intersect(unique(cluster2), unique(cluster1))
#' a[setdiff(unique(cluster2), unique(cluster1))] <-
#'   seq(setdiff(unique(cluster2), unique(cluster1))) + max(cluster1)
#' colors <- a[levels(factor(cluster2))]
#' coxFit <- coxph(
#' Surv(time = Survival, event = Death) ~ as.factor(cluster2),
#' data = KIRC$survival,
#' ties = "exact"
#' )
#' mfit <- survfit(Surv(Survival, Death == 1) ~ as.factor(cluster2), data = KIRC$survival)
#' plot(
#' mfit, col = colors,
#' main = "Survival curves for KIRC, level 2",
#' xlab = "Days", ylab = "Survival",lwd = 2
#' )
#' legend("bottomright",
#' legend = paste(
#' "Cox p-value:",
#' round(summary(coxFit)$sctest[3], digits = 5),
#' sep = ""
#' )
#' )
#' legend(
#' "bottomleft",
#' fill = colors,
#' legend = paste(
#' "Group ",
#' levels(factor(cluster2)),": ", table(cluster2)[levels(factor(cluster2))],
#' sep =""
#' )
#' )
#'
#' }
#' @importFrom FNN knnx.index
#' @importFrom entropy entropy
#' @importFrom impute impute.knn
#' @export
SubtypingOmicsData <- function (dataList, kMin = 2, kMax = 5, k = NULL, agreementCutoff = 0.5, ncore = 1, verbose = TRUE, sampledSetSize = 2000, knn.k = NULL, ...) {
  now <- Sys.time()

  # Logging helper: no-op unless verbose = TRUE.
  mlog <- if (!verbose) function(...) {} else function(...) {
    message(...)
    flush.console()
  }

  # Keep the complete input aside; restrict the working data to samples
  # that are present in ALL data types.
  dataListComplete <- dataList
  commonSamples <- Reduce(f = intersect, x = lapply(dataList, rownames))
  dataList <- lapply(dataList, function(d) d[commonSamples, ])

  # Samples missing from at least one data type; they are classified at the
  # end using a kNN classifier trained on the clustered common samples.
  notCommonData <- lapply(dataListComplete, function(d) {
    rn <- rownames(d)[(!rownames(d) %in% commonSamples)]
    d <- matrix(d[rn, ], ncol = ncol(d))
    rownames(d) <- rn
    d
  })

  dataListTrain <- NULL
  dataListTest <- NULL
  seed <- round(rnorm(1) * 10^6)
  dataList <- lapply(dataList, as.data.frame)

  # Big data: cluster a random subsample of size sampledSetSize, then
  # propagate the labels to the held-out samples via kNN (Nguyen et al. 2021).
  if (nrow(dataList[[1]]) > sampledSetSize) {
    n_samples <- nrow(dataList[[1]])
    ind <- sample.int(n_samples, size = sampledSetSize)
    dataListTrain <- lapply(dataList, function(x) x[ind, ])
    dataListTest <- lapply(dataList, function(x) x[-ind, , drop = FALSE])
    dataList <- dataListTrain
  }

  # Run PerturbationClustering on each data type, merge the per-type
  # connectivity matrices across types, and — when the types agree strongly
  # (or forceSplit is set) — derive a partitioning by hierarchical
  # clustering of the merged binary connectivity PW.
  # Returns: per-type results, merged original (orig) and perturbed (pert)
  # connectivities, PW, the groups (or NULL), and the agreement score.
  runPerturbationClustering <- function(dataList, kMin, kMax, stage = 1, forceSplit = FALSE, k = NULL) {
    dataTypeResult <- lapply(dataList, function(data) {
      set.seed(seed)
      data <- as.matrix(data)
      # Drop samples containing missing values for this data type.
      data <- data[rowSums(is.na(data)) == 0, ]
      PerturbationClustering(data, kMin, kMax, ncore = ncore, verbose = verbose, ...)
    })

    allSamples <- unique(unlist(lapply(dataList, rownames)))

    # Average the original connectivity matrices across data types,
    # aligning every matrix on the union of sample names (absent samples
    # produce NA rows/columns that are averaged out / imputed below).
    origList <- lapply(dataTypeResult, function(r) r$origS[[r$k]])
    origMerged <- do.call(
      what = rbind,
      args = lapply(origList, function(o) {
        o <- as.data.frame(o)
        as.numeric(as.matrix(t(as.data.frame(t(o[allSamples, ]))[allSamples, ])))
      })
    )
    orig <- matrix(colMeans(origMerged, na.rm = TRUE), nrow = length(allSamples))
    rownames(orig) <- colnames(orig) <- allSamples
    orig <- impute::impute.knn(orig)$data
    orig[is.na(orig)] <- 0

    # PW[i, j] == 1 iff samples i and j are co-clustered in EVERY data type.
    PW <- matrix(as.numeric(colSums(origMerged == 0, na.rm = TRUE) == 0), nrow = length(allSamples))
    rownames(PW) <- colnames(PW) <- allSamples

    # Fraction of off-diagonal entries on which all data types fully agree
    # (merged connectivity is exactly 0 or exactly 1).
    agreement <- (sum(orig == 0) + sum(orig == 1) - nrow(orig)) / (nrow(orig) ^ 2 - nrow(orig))

    # Same merging for the perturbed connectivity matrices.
    pertList <- lapply(dataTypeResult, function(r) r$pertS[[r$k]])
    pertMerged <- do.call(
      what = rbind,
      args = lapply(pertList, function(p) {
        p <- as.data.frame(p)
        as.numeric(as.matrix(t(as.data.frame(t(p[allSamples, ]))[allSamples, ])))
      })
    )
    pert <- matrix(colMeans(pertMerged, na.rm = TRUE), nrow = length(allSamples))
    rownames(pert) <- colnames(pert) <- allSamples
    pert <- impute::impute.knn(pert)$data
    pert[is.na(pert)] <- 0

    groups <- NULL
    mlog("STAGE : ", stage, "\t Agreement : ", agreement)
    if (agreement >= agreementCutoff | forceSplit) {
      hcW <- hclust(dist(PW))
      maxK <- min(kMax * 2, dim(unique(PW, MARGIN = 2))[2] - (stage - 1))
      maxHeight <- FindMaxHeight(hcW, maxK = min(2 * maxK, 10))
      groups <- cutree(hcW, maxHeight)
      # If k is specified, only enforce it when the detected number of
      # groups exceeds k; forcing the cut up to k when max(groups) < k
      # may create very small groups.
      if (!is.null(k) && max(groups) > k) {
        groups <- cutree(hcW, k)
      }
    }

    list(dataTypeResult = dataTypeResult, orig = orig, pert = pert, PW = PW,
         groups = groups, agreement = agreement)
  }

  # ---- Stage I: partition the integrated data ----
  pResult <- runPerturbationClustering(dataList, kMin, kMax, k = k)
  groups <- pResult$groups
  groups2 <- NULL

  if (!is.null(groups)) {
    # Data types agree strongly: attempt to refine each group (Stage II).
    groups2 <- groups
    if (is.null(k)) {
      for (g in sort(unique(groups))) {
        miniGroup <- names(groups[groups == g])
        # Do not attempt to split groups that are already very small.
        if (length(miniGroup) > 30) {
          groupsM <- runPerturbationClustering(
            dataList = lapply(dataList, function(d) d[miniGroup, ]),
            kMin = kMin, kMax = min(kMax, 5), stage = 2)$groups
          if (!is.null(groupsM))
            groups2[miniGroup] <- paste(g, groupsM, sep = "-")
        }
      }
    } else {
      agreements <- rep(1, length(groups))
      names(agreements) <- names(groups)
      # k was specified: keep force-splitting until we reach k groups,
      # relaxing the minimum splittable group size once every group is small.
      tbl <- sort(table(groups2), decreasing = TRUE)
      minGroupSize <- 30
      while (length(unique(groups2)) < k) {
        if (all(tbl <= 30)) {
          minGroupSize <- 10
        }
        # Cannot split any further.
        if (all(tbl <= 10)) break
        for (g in names(tbl)) {
          miniGroup <- names(groups2[groups2 == g])
          if (length(miniGroup) > minGroupSize) {
            splitRes <- runPerturbationClustering(
              dataList = lapply(dataList, function(d) d[miniGroup, ]),
              kMin = kMin, kMax = min(kMax, 5), stage = 2, forceSplit = TRUE)
            groupsM <- splitRes$groups
            if (!is.null(groupsM)) {
              groups2[miniGroup] <- paste(g, groupsM, sep = "-")
              agreements[miniGroup] <- splitRes$agreement
            }
          }
        }
        tbl <- sort(table(groups2), decreasing = TRUE)
      }
      # After forced splitting the number of groups can exceed k:
      # merge back the splits with the weakest agreement, then re-split
      # them into exactly the number of groups still needed.
      for (aggr in sort(unique(agreements))) {
        if (length(unique(groups2)) == k) break
        merge.group <- agreements == aggr
        k.smallGroup <- length(unique(groups2[merge.group]))
        k.need <- k - (length(unique(groups2)) - k.smallGroup + 1) + 1
        # Undo the last split level (strip the trailing "-<label>").
        groups2[merge.group] <- unlist(lapply(strsplit(groups2[merge.group], "-"), function(g) {
          paste0(g[1:(length(g) - 1)], collapse = "-")
        }))
        if (k.need > 1) {
          # BUG FIX: subset by merge.group (the samples being re-split),
          # not miniGroup, which is a stale variable from the loop above
          # (matches the correct parallel code in the other branch below).
          splitRes <- runPerturbationClustering(
            dataList = lapply(dataList, function(d) d[merge.group, ]),
            kMin = kMin, kMax = min(kMax, 5), stage = 2, forceSplit = TRUE,
            k = k.need)
          groupsM <- splitRes$groups
          groups2[merge.group] <- paste(groups2[merge.group], groupsM, sep = "-")
        }
        agreements[merge.group] <- 1
      }
    }
  } else {
    # Data types do NOT agree strongly: build a consensus partitioning from
    # the merged connectivity using both PAM and hierarchical clustering,
    # and keep whichever agrees more with the individual data types.
    set.seed(seed)
    orig <- pResult$orig
    dataTypeResult <- pResult$dataTypeResult
    clusteringAlgorithm <- GetClusteringAlgorithm(...)$fun
    groupings <- lapply(dataTypeResult, function(r) clusteringAlgorithm(data = r$origS[[r$k]], k = r$k))
    pGroups <- ClusterUsingPAM(orig = orig, kMax = kMax * 2, groupings = groupings)
    hGroups <- ClusterUsingHierarchical(orig = orig, kMax = kMax * 2, groupings = groupings)
    pAgree <- pGroups$agree
    hAgree <- hGroups$agree
    # On a tie, break it using the perturbed connectivity matrix.
    groups <- (if (pAgree > hAgree) pGroups else if (hAgree > pAgree) hGroups else {
      pAgree <- ClusterUsingPAM(orig = pResult$pert, kMax = kMax, groupings = groupings)$agree
      hAgree <- ClusterUsingHierarchical(orig = pResult$pert, kMax = kMax, groupings = groupings)$agree
      if (hAgree - pAgree >= 1e-3) hGroups else pGroups
    })$cluster
    names(groups) <- rownames(orig)
    groups2 <- groups

    if (is.null(k)) {
      mlog("Check if can proceed to stage II")
      # Proceed to Stage II only when the partitioning is very unbalanced.
      normalizedEntropy <- entropy::entropy(table(groups)) / log(length(unique(groups)), exp(1))
      if (normalizedEntropy < 0.5) {
        for (g in sort(unique(groups))) {
          miniGroup <- names(groups[groups == g])
          # Make sure we don't split a group that is already very small.
          if (length(miniGroup) > 30) {
            # Check whether the data types within this group can be split.
            groupsM <- runPerturbationClustering(
              dataList = lapply(dataList, function(d) d[miniGroup, ]),
              kMin = kMin, kMax = min(kMax, 5), stage = 2, forceSplit = TRUE, k = NULL)$groups
            if (!is.null(groupsM))
              groups2[miniGroup] <- paste(g, groupsM, sep = "-")
          }
        }
      }
    } else {
      if (length(unique(groups2)) > k) {
        # Too many groups: redo consensus clustering with k fixed.
        pGroups <- ClusterUsingPAM(orig = orig, kMax = kMax * 2, groupings = groupings, k)
        hGroups <- ClusterUsingHierarchical(orig = orig, kMax = kMax * 2, groupings = groupings, k)
        pAgree <- pGroups$agree
        hAgree <- hGroups$agree
        groups <- (if (pAgree > hAgree) pGroups else if (hAgree > pAgree) hGroups else {
          pAgree <- ClusterUsingPAM(orig = pResult$pert, kMax = kMax, groupings = groupings, k)$agree
          hAgree <- ClusterUsingHierarchical(orig = pResult$pert, kMax = kMax, groupings = groupings, k)$agree
          if (hAgree - pAgree >= 1e-3) hGroups else pGroups
        })$cluster
        names(groups) <- rownames(orig)
        groups2 <- groups
      } else if (length(unique(groups2)) < k) {
        # Too few groups: first split using the entropy criterion, as in
        # the automatic (k = NULL) case.
        normalizedEntropy <- entropy::entropy(table(groups)) / log(length(unique(groups)), exp(1))
        agreements <- rep(1, length(groups))
        names(agreements) <- names(groups)
        if (normalizedEntropy < 0.5) {
          for (g in sort(unique(groups))) {
            miniGroup <- names(groups[groups == g])
            # Make sure we don't split a group that is already very small.
            if (length(miniGroup) > 30) {
              splitRes <- runPerturbationClustering(
                dataList = lapply(dataList, function(d) d[miniGroup, ]),
                kMin = kMin, kMax = min(kMax, 5), stage = 2, forceSplit = TRUE)
              groupsM <- splitRes$groups
              if (!is.null(groupsM)) {
                groups2[miniGroup] <- paste(g, groupsM, sep = "-")
                agreements[miniGroup] <- splitRes$agreement
              }
            }
          }
        }
        # If the number of groups is still less than k, force-split.
        if (length(unique(groups2)) < k) {
          tbl <- sort(table(groups2), decreasing = TRUE)
          minGroupSize <- 30
          while (length(unique(groups2)) < k) {
            if (all(tbl <= 30)) {
              minGroupSize <- 10
            }
            # Cannot split any further.
            if (all(tbl <= 10)) break
            for (g in names(tbl)) {
              miniGroup <- names(groups2[groups2 == g])
              if (length(miniGroup) > minGroupSize) {
                splitRes <- runPerturbationClustering(
                  dataList = lapply(dataList, function(d) d[miniGroup, ]),
                  kMin = kMin, kMax = min(kMax, 5), stage = 2, forceSplit = TRUE)
                groupsM <- splitRes$groups
                if (!is.null(groupsM)) {
                  groups2[miniGroup] <- paste(g, groupsM, sep = "-")
                  agreements[miniGroup] <- splitRes$agreement
                }
              }
            }
            tbl <- sort(table(groups2), decreasing = TRUE)
          }
        }
        # After forced splitting the number of groups can exceed k:
        # merge clusters with the weakest agreement and re-split them
        # into the number of groups still needed.
        for (aggr in sort(unique(agreements))) {
          if (length(unique(groups2)) == k) break
          merge.group <- agreements == aggr
          k.smallGroup <- length(unique(groups2[merge.group]))
          k.need <- k - (length(unique(groups2)) - k.smallGroup + 1) + 1
          # Undo the last split level (strip the trailing "-<label>").
          groups2[merge.group] <- unlist(lapply(strsplit(groups2[merge.group], "-"), function(g) {
            paste0(g[1:(length(g) - 1)], collapse = "-")
          }))
          if (k.need > 1) {
            splitRes <- runPerturbationClustering(
              dataList = lapply(dataList, function(d) d[merge.group, ]),
              kMin = kMin, kMax = min(kMax, 5), stage = 2, forceSplit = TRUE,
              k = k.need)
            groupsM <- splitRes$groups
            groups2[merge.group] <- paste(groups2[merge.group], groupsM, sep = "-")
          }
          agreements[merge.group] <- 1
        }
      }
    }
  }

  # ---- Label propagation for the subsampled (big-data) case ----
  {
    train_y <- groups
    train_y2 <- groups2
    if (!is.null(dataListTest)) {
      set.seed(seed)
      RcppParallel::setThreadOptions(ncore)
      test_prob <- matrix(0, nrow = n_samples - sampledSetSize, ncol = length(unique(groups)))
      if (!is.null(train_y2)) {
        test_prob2 <- matrix(0, nrow = n_samples - sampledSetSize, ncol = length(unique(groups2)))
      }
      for (i in seq_along(dataListTrain)) {
        train <- dataListTrain[[i]]
        test <- dataListTest[[i]]
        if (ncol(train) * nrow(train) > 2e7) {
          # Large data type: reuse the PCA already computed by
          # PerturbationClustering instead of recomputing it.
          pca <- pResult$dataTypeResult[[i]]$pca
        } else {
          # BUG FIX: the PCA rank must be bounded by nrow(train); the
          # original referenced an undefined object `data` here (compare
          # the identical call in the not-common-samples block below).
          pca <- rpca.para(train, min(nrow(train), 20), scale = FALSE)
        }
        train <- pca$x
        test <- predict.rpca.para(pca, test)
        # Accumulate kNN class probabilities across data types.
        test_prob <- test_prob + classifierProb(train, groups, test, knn.k)
        if (!is.null(train_y2)) {
          test_prob2 <- test_prob2 + classifierProb(train, groups2, test, knn.k)
        }
      }
      test_y <- apply(test_prob, 1, which.max)
      groups <- rep(0, n_samples)
      groups[ind] <- train_y
      groups[-ind] <- test_y
      if (!is.null(train_y2)) {
        test_y2 <- apply(test_prob2, 1, which.max)
        groups2 <- rep(0, n_samples)
        groups2[ind] <- train_y2
        groups2[-ind] <- test_y2
      }
    }
  }

  # ---- Classify samples that are missing from some data types ----
  {
    # If the data was big, `groups` now also covers the held-out samples.
    train_y <- groups
    train_y2 <- groups2
    allSamples <- unique(unlist(lapply(dataListComplete, rownames)))
    n_samples <- length(allSamples)
    notCommonSamples <- allSamples[!(allSamples %in% commonSamples)]
    if (length(allSamples) > length(commonSamples)) {
      set.seed(seed)
      RcppParallel::setThreadOptions(ncore)
      # A non-NULL dataListTrain means the data was subsampled earlier.
      if (!is.null(dataListTrain)) {
        # NOTE(review): at this point dataList was reassigned to the
        # subsample, so this rbind appears to stack the subsample with
        # itself — kept as in the original; verify intent upstream.
        dataListTrain <- lapply(seq_along(dataList), function(i) {
          as.matrix(rbind(dataList[[i]], dataListTrain[[i]]))
        })
      } else {
        dataListTrain <- dataList
      }
      dataListTest <- notCommonData
      test_prob <- matrix(0, nrow = length(notCommonSamples), ncol = length(unique(groups)))
      rownames(test_prob) <- notCommonSamples
      colnames(test_prob) <- unique(groups)
      if (!is.null(train_y2)) {
        test_prob2 <- matrix(0, nrow = length(notCommonSamples), ncol = length(unique(groups2)))
        rownames(test_prob2) <- notCommonSamples
        colnames(test_prob2) <- unique(groups2)
      }
      for (i in seq_along(dataListTrain)) {
        train <- dataListTrain[[i]]
        test <- dataListTest[[i]]
        # Skip data types with no missing-sample rows to classify.
        if (nrow(test) == 0) next
        pca <- rpca.para(train, min(nrow(train), 20), scale = FALSE)
        train <- pca$x
        test <- predict.rpca.para(pca, test)
        test_prob_tmp <- classifierProb(train, groups, test, knn.k)
        test_prob[rownames(test_prob_tmp), ] <- test_prob[rownames(test_prob_tmp), ] + test_prob_tmp
        if (!is.null(train_y2)) {
          test_prob_tmp2 <- classifierProb(train, groups2, test, knn.k)
          test_prob2[rownames(test_prob_tmp2), ] <- test_prob2[rownames(test_prob_tmp2), ] + test_prob_tmp2
        }
      }
      test_y <- colnames(test_prob)[apply(test_prob, 1, which.max)]
      groups <- rep(0, n_samples)
      names(groups) <- allSamples
      groups[commonSamples] <- train_y
      groups[notCommonSamples] <- test_y
      if (!is.null(train_y2)) {
        test_y2 <- colnames(test_prob2)[apply(test_prob2, 1, which.max)]
        groups2 <- rep(0, n_samples)
        names(groups2) <- allSamples
        groups2[commonSamples] <- train_y2
        groups2[notCommonSamples] <- test_y2
      }
    }
  }

  timediff <- Sys.time() - now
  mlog("Done in ", timediff, " ", units(timediff), ".\n")

  list(
    cluster1 = groups,
    cluster2 = groups2,
    dataTypeResult = pResult$dataTypeResult
  )
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.