# == title
# Consensus partition
#
# == param
# -data A numeric matrix where subgroups are found on columns, i.e. samples are columns and features are rows.
# -top_value_method A single top-value method. Available methods are in `all_top_value_methods`.
#                   Use `register_top_value_methods` to add a new top-value method.
# -top_n Number of rows with top values. The value can be a vector with length > 1. When n > 5000, 
#        the function only randomly samples 5000 rows from the top n rows. If ``top_n`` is a vector, partitioning
#        will be applied to every value in ``top_n`` and the consensus partition is summarized from all partitions.
# -partition_method A single partitioning method. Available methods are in `all_partition_methods`.
#                   Use `register_partition_methods` to add a new partition method.
# -max_k Maximal number of subgroups to try. The function will try ``2:max_k`` subgroups.
# -sample_by Whether to randomly sample the matrix by rows or by columns.
# -p_sampling Proportion of the top-n-row submatrix to sample in each repetition.
# -partition_repeat Number of repeats for the random sampling.
# -partition_param Parameters for the partition method which are passed to ``...`` in a registered partitioning method. See `register_partition_methods` for detail.
# -anno A data frame with known annotation of samples. The annotations will be plotted in heatmaps and the correlation
#       to predicted subgroups will be tested.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
#       ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -scale_rows Whether to scale rows. If it is ``TRUE``, scaling method defined in `register_partition_methods` is used.
# -verbose Whether to print messages.
# -mc.cores Number of cores to use.
# -prefix Internally used.
# -.env An environment, internally used.
#
# == details
# The function performs the analysis in the following steps (a minimal sketch of one
# resampling round follows the list):
#
# - calculate scores for rows by the top-value method,
# - for each top_n value, take the top n rows,
# - randomly sample ``p_sampling`` rows from the top-n-row matrix and perform partitioning for ``partition_repeat`` times,
# - collect partitions from all individual runs and summarize a consensus partition.
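#
# A minimal sketch of one resampling round (for illustration only; the actual
# implementation is in the internal ``.consensus_partition()`` function, and
# ``score``/``partition_fun`` are placeholder names):
#
#     ind = order(score, decreasing = TRUE)[1:top_n]        # rows with top scores
#     ind_sub = sample(ind, round(p_sampling*length(ind)))  # randomly sample rows
#     cl = partition_fun(data[ind_sub, , drop = FALSE], k)  # one partition into k groups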
#
# == return
# A `ConsensusPartition-class` object. Simply type the object in an interactive R session
# to see which functions can be applied to it.
#
# == seealso
# `run_all_consensus_partition_methods` runs consensus partitioning with multiple top-value methods
# and multiple partitioning methods.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# set.seed(123)
# m = cbind(rbind(matrix(rnorm(20*20, mean = 1,   sd = 0.5), nr = 20),
#                 matrix(rnorm(20*20, mean = 0,   sd = 0.5), nr = 20),
#                 matrix(rnorm(20*20, mean = 0,   sd = 0.5), nr = 20)),
#           rbind(matrix(rnorm(20*20, mean = 0,   sd = 0.5), nr = 20),
#                 matrix(rnorm(20*20, mean = 1,   sd = 0.5), nr = 20),
#                 matrix(rnorm(20*20, mean = 0,   sd = 0.5), nr = 20)),
#           rbind(matrix(rnorm(20*20, mean = 0.5, sd = 0.5), nr = 20),
#                 matrix(rnorm(20*20, mean = 0.5, sd = 0.5), nr = 20),
#                 matrix(rnorm(20*20, mean = 1,   sd = 0.5), nr = 20))
#          ) + matrix(rnorm(60*60, sd = 0.5), nr = 60)
# res = consensus_partition(m, partition_repeat = 10, top_n = c(10, 20, 50))
# res
consensus_partition = function(data,
	top_value_method = "ATC",
	top_n = seq(min(1000, round(nrow(data)*0.1)), 
		        min(3000, round(nrow(data)*0.3)), 
		        length.out = 3),
	partition_method = "skmeans",
	max_k = 6, 
	sample_by = "row",
	p_sampling = 0.8,
	partition_repeat = 50,
	partition_param = list(),
	anno = NULL,
	anno_col = NULL,
	scale_rows = NULL,
	verbose = TRUE,
	mc.cores = 1,
	prefix = "",
	.env = NULL) {

	if(missing(data)) {
		data = .env$data
	}

	if(max_k < 2) {
		stop_wrap("max_k should be no less than 2.")
	}

	t = system.time(res <- .consensus_partition(
		data = data,
		top_value_method = top_value_method,
		top_n = top_n,
		partition_method = partition_method,
		k = 2:max_k, 
		p_sampling = p_sampling,
		sample_by = sample_by,
		partition_repeat = partition_repeat,
		partition_param = partition_param,
		anno = anno,
		anno_col = anno_col,
		scale_rows = scale_rows,
		verbose = verbose,
		mc.cores = mc.cores,
		prefix = prefix,
		.env = .env))
	res@hash = digest(res)
	res@running_time = t[["elapsed"]]

	if(verbose) {
		tc = Sys.time()
		tf = format(tc + structure(t[["elapsed"]], units = "secs", class = "difftime") - tc)
		qqcat("@{prefix}* @{top_value_method}:@{partition_method} used @{tf}.\n")
	}
	return(res)
}

.consensus_partition = function(data,
	top_value_method = "MAD",
	top_n = seq(min(1000, round(nrow(data)*0.1)), 
		        min(3000, round(nrow(data)*0.3)), 
		        length.out = 3),
	partition_method = "kmeans",
	k = 2:6, 
	p_sampling = 0.8,
	sample_by = c("row", "column"), 
	partition_repeat = 50,
	partition_param = list(),
	anno = NULL,
	anno_col = NULL,
	scale_rows = NULL,
	verbose = TRUE,
	mc.cores = 1,
	prefix = "",
	.env = NULL) {

	# .env is used to store the shared matrix, because the matrix will be used multiple times
	# in run_all_consensus_partition_methods() and hierarchical_partition() and we don't want
	# to copy it multiple times.
	# `column_index` is specifically for hierarchical_partition(), where in each level only a
	# subset of columns of the original matrix is used.
	# .env$column_index is only used to pass the hidden column_index parameter;
	# the real column_index is stored as a slot in this object.
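	# a sketch of the intended usage (hypothetical calls): the caller creates the
	# environment once and later calls reuse the matrix without copying it:
	#     .env = new.env(parent = emptyenv()); .env$data = m
	#     consensus_partition(top_value_method = "SD", .env = .env)
	#     consensus_partition(top_value_method = "MAD", .env = .env)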
	if(is.null(.env)) {
		if(is.data.frame(data)) data = as.matrix(data)
		# if(is.null(rownames(data))) rownames(data) = seq_len(nrow(data))

		.env = new.env(parent = emptyenv())
		.env$data = data
		.env$column_index = seq_len(ncol(data))
	} else if(is.null(.env$data)) {
		if(is.data.frame(data)) data = as.matrix(data)
		# if(is.null(rownames(data))) rownames(data) = seq_len(nrow(data))

		.env$data = data
		.env$column_index = seq_len(ncol(data))
	} else if(is.null(.env$column_index)) {
		data = .env$data
		.env$column_index = seq_len(ncol(data))
	} else {
		data = .env$data
	}

	data = data[, .env$column_index, drop = FALSE]

	if(verbose) qqcat("@{prefix}* on a @{nrow(data)}x@{ncol(data)} matrix.\n")

	k = sort(k)
	l = k <= ncol(data)
	if(sum(l) != length(k)) {
		qqcat("@{prefix}* Following k (@{paste(k[!l], collapse=', ')}) are removed.\n")
	}
	k = k[l]
	if(length(k) == 0) {
		stop_wrap("There is no valid k.\n")
	}
	
	top_n = round(top_n)
	l = top_n <= nrow(data)
	if(sum(l) != length(top_n)) {
		qqcat("@{prefix}* Following top_n (@{paste(top_n[!l], collapse = ', ')}) are removed.\n")
	}
	top_n = top_n[l]
	if(length(top_n) == 0) {
		stop_wrap("There is no valid top_n.\n")
	}

	partition_fun = get_partition_method(partition_method, partition_param)

	get_top_value_fun = get_top_value_method(top_value_method)

	# since one top-value metric may be used with several partitioning methods,
	# we cache the top values for repeated use
	if(is.null(.env$all_top_value_list)) {
		if(verbose) qqcat("@{prefix}* calculating @{top_value_method} values.\n")
		all_top_value = get_top_value_fun(data)
		all_top_value[is.na(all_top_value)] = -Inf
		.env$all_top_value_list = list()
		.env$all_top_value_list[[top_value_method]] = all_top_value
	} else if(is.null(.env$all_top_value_list[[top_value_method]])) {
		if(verbose) qqcat("@{prefix}* calculating @{top_value_method} values.\n")
		all_top_value = get_top_value_fun(data)
		all_top_value[is.na(all_top_value)] = -Inf
		.env$all_top_value_list[[top_value_method]] = all_top_value
	} else {
		if(verbose) qqcat("@{prefix}* @{top_value_method} values have already been calculated. Get from cache.\n")
		all_top_value = .env$all_top_value_list[[top_value_method]]
	}

	if(is.null(scale_rows)) {
		scale_rows = TRUE
	}
	if(scale_rows) {
		scale_method = attr(partition_fun, "scale_method")
		if("z-score" %in% scale_method) {
			if(verbose) qqcat("@{prefix}* rows are scaled before sent to partition, method: 'z-score' (x - mean)/sd\n")
			data = t(scale(t(data)))
		} else if("min-max" %in% scale_method) {
			if(verbose) qqcat("@{prefix}* rows are scaled before sent to partition, method: 'min-max' (x - min)/(max - min)\n")
			row_min = rowMins(data)
			row_max = rowMaxs(data)
			row_range = row_max - row_min
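			# row_min and row_range have length nrow(data); the arithmetic below
			# recycles them along columns, so every row is scaled by its own
			# minimum and range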
			data = (data - row_min)/row_range
		} else {
			scale_rows = FALSE
		}
		attr(scale_rows, "scale_method") = scale_method
	}

	# in case NA is produced in scaling
	l = apply(data, 1, function(x) any(is.na(x)))
	if(any(l)) {
		if(verbose) qqcat("@{prefix}* remove @{sum(l)} rows with NA values.\n")
		data = data[!l, , drop = FALSE]
		all_top_value = all_top_value[!l]
		l = top_n <= nrow(data)
		if(sum(l) != length(top_n)) {
			qqcat("@{prefix}* The following top_n (@{paste(top_n[!l], collapse = ', ')}) are removed.\n")
		}
		top_n = top_n[l]
		if(length(top_n) == 0) {
			stop_wrap("There is no valid top_n.\n")
		}
	}

	# now we do repetitive clustering
	param = data.frame(top_n = numeric(0), k = numeric(0), n_row = numeric(0))
	partition_list = list()

	sample_by = match.arg(sample_by)[1]
	for(i in seq_along(top_n)) {
		ind = order(all_top_value, decreasing = TRUE)[ 1:top_n[i] ]

		if(length(ind) > 5000) {
			ind = sample(ind, 5000)
			if(verbose) qqcat("@{prefix}* get top @{top_n[i]} (randomly sampled 5000) rows by @{top_value_method} method\n")
		} else {
			if(verbose) qqcat("@{prefix}* get top @{top_n[i]} rows by @{top_value_method} method\n")
		}

		if(verbose && mc.cores > 1) qqcat("@{prefix}  - @{partition_method} repeated for @{partition_repeat} times by @{sample_by}-sampling (p = @{p_sampling}) from top @{top_n[i]} rows (@{mc.cores} cores).\n")

		lt = mclapply(seq_len(partition_repeat), function(j) {

			param = data.frame(top_n = numeric(0), k = numeric(0), n_row = numeric(0))
			partition_list = list()
			
			if(sample_by == "row") {
				ind_sub = sample(ind, round(p_sampling*length(ind)))
				mat = data[ind_sub, , drop = FALSE]
				column_ind_sub = seq_len(ncol(data))
			} else if(sample_by == "column") {
				column_ind_sub = sample(ncol(data), round(p_sampling*ncol(data)))
				mat = data[ind, , drop = FALSE]
			}
			for(y in k) {
				if(interactive() && verbose && mc.cores == 1) {
					msg = qq("@{prefix}  [k = @{y}] @{partition_method} repeated for @{j}@{ifelse(j %% 10 == 1, 'st', ifelse(j %% 10 == 2, 'nd', ifelse(j %% 10 == 3, 'rd', 'th')))} @{sample_by}-sampling (p = @{p_sampling}) from top @{top_n[i]} rows.")
					cat(strrep("\r", nchar(msg)))
					cat(msg)
				}
				partition_list = c(partition_list, list(list(partition_fun(mat, y, column_ind_sub))))
				param = rbind(param, data.frame(top_n = top_n[i], k = y, n_row = nrow(mat), n_col = ncol(mat), stringsAsFactors = FALSE))
			}

			return(list(param = param, partition_list = partition_list))
		}, mc.cores = mc.cores)
		for(li in seq_along(lt)) {
			param = rbind(param, lt[[li]]$param)
			partition_list = c(partition_list, lt[[li]]$partition_list)
		}
		if(interactive() && verbose && mc.cores == 1) cat("\n")
	}

	construct_consensus_object = function(param, partition_list, k, prefix = "  - ", verbose = TRUE) {

		partition_list = do.call("c", partition_list)

		partition_list = cl_ensemble(list = partition_list)
		if(verbose) qqcat("@{prefix}merge @{length(partition_list)} (@{partition_repeat}x@{length(top_n)}) partitions into a single ensemble object.\n")
		if(sample_by == "row") {
			partition_consensus = cl_consensus(partition_list)
		} else {
			partition_consensus = cl_consensus2(partition_list, k)
		}

		# note: the number of distinct class ids may be less than k
		class_ids = as.vector(cl_class_ids(partition_consensus))
		# adjust the class labels according to the tightness of each subgroup
		if(verbose) qqcat("@{prefix}adjust the class labels according to the mean intra-group distance.\n")
		ri = order(all_top_value, decreasing = TRUE)[1:max(param[, "top_n"])]
		if(length(ri) > 5000) ri = sample(ri, 5000)
		mean_dist = tapply(seq_len(ncol(data)), class_ids, function(ind) {
			n = length(ind)
			if(n == 1) {
				return(Inf)
			}
			sum(dist(t(data[ri, ind, drop = FALSE]))^2)/(n*(n-1)/2)
		})
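		# mean_dist[g] is the mean squared pairwise distance between the samples in
		# group g (computed on the top rows); groups are relabelled below so that
		# group 1 is the tightest group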
		if(length(mean_dist) < k) {
			mean_dist_foo = structure(rep(Inf, k - length(mean_dist)), names = setdiff(seq_len(k), class_ids))
			mean_dist = c(mean_dist, mean_dist_foo)
		}
		map = structure(names = names(mean_dist)[order(mean_dist)], names(mean_dist))
		unclassified = setdiff(1:k, map)
		if(length(unclassified)) {
			map = c(map, structure(unclassified, names = unclassified))
		}
		class_ids = as.numeric(map[as.character(class_ids)])

		if(verbose) qqcat("@{prefix}calculate global membership from all partitions.\n")
		membership_mat = cl_membership(partition_consensus)
		class(membership_mat) = "matrix"
		if(ncol(membership_mat) < k) {
			membership_mat = cbind(membership_mat, matrix(0, nrow = nrow(membership_mat), ncol = k - ncol(membership_mat)))
		}
		map2 = structure(names(map), names = map)
		membership_mat = membership_mat[, as.numeric(map2[as.character(1:k)])]

		colnames(membership_mat) = paste0("p", 1:ncol(membership_mat))
		rownames(membership_mat) = colnames(data)
		attr(membership_mat, "n_of_classes") = NULL
		attr(membership_mat, "is_cl_hard_partition") = NULL
		
		if(verbose) qqcat("@{prefix}adjust class labels for every single partition.\n")
		class_ids_by_top_n = tapply(seq_along(partition_list), param$top_n, function(ind) {
			if(sample_by == "row") {
				partition_consensus = cl_consensus(cl_ensemble(list = partition_list[ind]))
			} else {
				partition_consensus = cl_consensus2(cl_ensemble(list = partition_list[ind]), k)
			}
			ci = as.vector(cl_class_ids(partition_consensus))
			map = relabel_class(ci, class_ids, full_set = 1:k)
			as.numeric(map[as.character(ci)])
		})

		# adjust class labels in each membership matrix to fit to the consensus class labels
		membership_each = do.call("cbind", lapply(seq_along(partition_list), function(i) {
			x = partition_list[[i]]
			class_ids = class_ids_by_top_n[[as.character(param$top_n[i])]]
			class = as.vector(cl_class_ids(x))
			map = relabel_class(class, class_ids, full_set = 1:k)
			class = as.numeric(map[as.character(class)])
			as.integer(class)
		}))
		rownames(membership_each) = rownames(membership_mat)

		if(verbose) qqcat("@{prefix}calculate consensus matrix.\n")
		# consensus_mat[i, j] is the fraction of all partitions in which sample i and
		# sample j are assigned to the same group, conceptually
		# mean(membership_each[i, ] == membership_each[j, ]);
		# get_consensus_matrix() below computes this efficiently
		membership_each2 = membership_each
		membership_each2[is.na(membership_each2)] = as.integer(0)
		consensus_mat = get_consensus_matrix(membership_each2)
		rownames(consensus_mat) = rownames(membership_mat)
		colnames(consensus_mat) = rownames(membership_mat)

		class_df = data.frame(
			class = class_ids,
			entropy = apply(membership_mat, 1, entropy),
			stringsAsFactors = FALSE
		)
		rownames(class_df) = colnames(data)

		if(length(unique(class_ids)) == 1) {
			class_df$silhouette = rep(0, length(class_ids))
		} else {
			class_df$silhouette = silhouette(class_ids, dist(t(consensus_mat)))[, "sil_width"]
		}

		if(verbose) qqcat("@{prefix}calculate statistics for the consensus partition.\n")
		if(length(class_df$silhouette)*0.05 > 1) {
			l = class_df$silhouette >= quantile(class_df$silhouette, 0.05)
		} else {
			l = rep(TRUE, length(class_df$silhouette))
		}
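		# statistics are calculated on the samples with reliable classifications,
		# i.e. excluding the 5% of samples with the lowest silhouette scores
		# (only when there are more than 20 samples)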
		stat = list(
			ecdf = ecdf(consensus_mat[lower.tri(consensus_mat)]),
			"1-PAC" = 1 - PAC(consensus_mat[l, l, drop = FALSE]),
			mean_silhouette = mean(class_df$silhouette),
			concordance = concordance(membership_each, class_ids),
			cophcor = cophcor(consensus_mat),
			aPAC = aPAC(consensus_mat),
			FCC = FCC(consensus_mat[l, l, drop = FALSE])
		)

		# loc = cmdscale(dist(t(data)), k = 2)
		# stat$mean_group_dist_2PC = mean_group_dist(loc, class_ids)
		# loc = cmdscale(dist(t(data)), k = 3)
		# stat$mean_group_dist_3PC = mean_group_dist(loc, class_ids)
		
		return(list(
			class_df = class_df, 
			membership = membership_mat, 
			consensus = consensus_mat, 
			param = param, 
			membership_each = membership_each,
			stat = stat
		))
	}

	object_list = lapply(k, function(y) {
		l = param$k == y
		if(verbose) qqcat("@{prefix}* wrap results for k = @{y}\n")
		construct_consensus_object(param[l, ], partition_list[l], y, verbose = FALSE)
	})
	names(object_list) = as.character(k)

	rm(partition_list)
	gc(verbose = FALSE, reset = TRUE)

	## class labels under different k should be adjusted to be consistent with each other
	if(verbose) qqcat("@{prefix}* adjust class labels between different k.\n")
	reference_class = object_list[[1]]$class_df$class
	for(i in seq_along(k)[-1]) {
		class_df = object_list[[i]]$class_df
		class = class_df[, "class"]

		map = relabel_class(class, reference_class, full_set = 1:(k[i]))
		map2 = structure(names(map), names = map)
		# unmapped = setdiff(as.character(1:k[i]), map)
		# map = c(map, structure(unmapped, names = unmapped))
		# map2 = c(map2, structure(unmapped, names = unmapped))
		object_list[[i]]$class_df$class = as.numeric(map[as.character(class)])

		# the class labels in the global membership matrix need to be adjusted
		object_list[[i]]$membership = object_list[[i]]$membership[, as.numeric(map2[as.character(1:k[i])]) ]
		colnames(object_list[[i]]$membership) = paste0("p", 1:k[i])

		# the class labels in the individual membership matrices need to be adjusted
		odim = dim(object_list[[i]]$membership_each)
		object_list[[i]]$membership_each = as.numeric(map[as.character(object_list[[i]]$membership_each)])
		dim(object_list[[i]]$membership_each) = odim

		reference_class = object_list[[i]]$class_df$class
	}

	# an additional metric for determining the "best k": the area under the eCDF curve
	# of the consensus matrix (approximated by the rectangle rule); delta_k below is the
	# relative increase of this area compared to the previous k
	ak = sapply(object_list, function(obj) {
		f = obj$stat$ecdf
		x = seq(0, 1, length = 1000)
		n = length(x)
		sum((x[2:n] - x[1:(n-1)])*f(x[2:n]))
	})
	names(ak) = NULL
	delta_k = ak
	for(i in seq_along(k)[-1]) {
		delta_k[i] = (ak[i] - ak[i-1])/ak[i-1]
	}
	for(i in seq_along(object_list)) {
		object_list[[i]]$stat$area_increased = delta_k[i]
	}

	n_sample = ncol(data)
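	# agreement between the partition under the current k and the partition under the
	# previous k (the smallest k is compared to the trivial one-group partition);
	# values close to 1 mean increasing k does not create essentially new subgroups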
	# for(method in c("Rand", "cRand", "NMI", "KP", "FM", "Jaccard", "purity", "PS")) {
	for(method in c("Rand", "Jaccard")) {
		for(i in seq_along(k)) {
			if(i == 1) {
				cl1 = rep(1, n_sample)
			} else {
				cl1 = object_list[[i - 1]]$class_df$class
			}
			cl2 = object_list[[i]]$class_df$class
			object_list[[i]]$stat[[method]] = cl_agreement(as.cl_hard_partition(cl1), as.cl_hard_partition(cl2), method = method)[[1]]
		}
	}
	
	# process the annotations so they can be shared in `run_all_consensus_partition_methods()` and `hierarchical_partition()`
	if(!is.null(anno)) {
		if(is.atomic(anno)) {
			known_nm = deparse(substitute(anno))
			anno = data.frame(anno)
			colnames(anno) = known_nm
			if(!is.null(anno_col)) {
				anno_col = list(anno_col)
				names(anno_col) = known_nm
			}
		}
		anno = anno[.env$column_index, , drop = FALSE]
	}

	if(is.null(anno_col)) {
		anno_col = lapply(anno, ComplexHeatmap:::default_col)
	} else {
		if(ncol(anno) == 1 && is.atomic(anno_col)) {
			anno_col = list(anno_col)
			names(anno_col) = colnames(anno)
		} else if(is.null(names(anno_col))) {
			if(length(anno_col) == ncol(anno)) {
				names(anno_col) = colnames(anno)
			} else {
				anno_col = lapply(anno, ComplexHeatmap:::default_col)
			}
		}
		for(nm in names(anno)) {
			if(is.null(anno_col[[nm]])) {
				anno_col[[nm]] = ComplexHeatmap:::default_col(anno[[nm]])
			}
		}
	}
	if(is.null(anno)) {
		anno_col = NULL
	}

	res = ConsensusPartition(object_list = object_list, k = k, n_partition = partition_repeat * length(top_n) * length(k),  
		partition_method = partition_method, top_value_method = top_value_method, top_n = top_n, top_value_list = all_top_value,
		anno = anno, anno_col = anno_col, scale_rows = scale_rows, sample_by = sample_by,
		column_index = .env$column_index, .env = .env)

	return(res)
}

# == title
# Print the ConsensusPartition object
#
# == param
# -object A `ConsensusPartition-class` object.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "show",
	signature = "ConsensusPartition",
	definition = function(object) {

	# fix objects generated by older package versions where there was no sample_by slot
	error = try(object@sample_by, silent = TRUE)
	if(inherits(error, "try-error")) {
		object@sample_by = "row"
	}
	qqcat("A 'ConsensusPartition' object with k = @{paste(object@k, collapse = ', ')}.\n")
	qqcat("  On a matrix with @{nrow(object@.env$data)} rows and @{length(object@column_index)} columns.\n")
	top_n_str = object@top_n
	qqcat("  Top rows (@{paste(top_n_str, collapse = ', ')}) are extracted by '@{object@top_value_method}' method.\n")
	qqcat("  Subgroups are detected by '@{object@partition_method}' method.\n")
	qqcat("  Performed in total @{object@n_partition} partitions by @{object@sample_by} resampling.\n")
	best_k = suggest_best_k(object)
	if(is.na(best_k)) {
		qqcat("  There is no best k.\n")
	} else {
		qqcat("  Best k for subgroups seems to be @{best_k}.\n")
	}
	qqcat("\n")
	qqcat("Following methods can be applied to this 'ConsensusPartition' object:\n")
	txt = showMethods(classes = "ConsensusPartition", where = topenv(), printTo = FALSE)
	txt = grep("Function", txt, value = TRUE)
	fname = gsub("Function: (.*?) \\(package.*$", "\\1", txt)
	print(fname)
})

# == title
# Plot the empirical cumulative distribution function (eCDF) curve of the consensus matrix
#
# == param
# -object A `ConsensusPartition-class` object.
# -... Other arguments.
#
# == details
# It plots the eCDF curve of the consensus matrix for each k.
#
# This function is mainly used in `collect_plots` and `select_partition_number` functions.
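#
# For each k, the curve is the empirical CDF of the entries in the lower
# triangle of the consensus matrix, essentially (a sketch):
#
#     cm = get_consensus(object, k)
#     f = ecdf(cm[lower.tri(cm)])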
#
# == value
# No value is returned.
#
# == seealso
# See `stats::ecdf` for a detailed explanation of the empirical cumulative distribution function.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# plot_ecdf(golub_cola["ATC", "skmeans"])
setMethod(f = "plot_ecdf",
	signature = "ConsensusPartition",
	definition = function(object, ...) {

	omar = par("mar")
	par(mar = c(4.1, 4.1, 1, 1))
	on.exit(par(mar = omar))
	plot(NULL, xlim = c(0, 1), ylim = c(0, 1), xlab = "consensus value (x)", ylab = "P(X <= x)")
	for(i in seq_along(object@k)) {
		consensus_mat = get_consensus(object, k = object@k[i])
		f = ecdf(consensus_mat[lower.tri(consensus_mat)])
		x = seq(0, 1, length = 100)
		y = f(x)
		x = c(0, x)
		y = c(0, y)
		lines(x, y, col = i)
	}
	legend("bottomright", pch = 15, legend = paste0("k = ", object@k), col = seq_along(object@k))
})

# == title
# Several plots for determining the optimal number of subgroups
#
# == param
# -object A `ConsensusPartition-class` object.
# -all_stats Whether to show all statistics that were calculated. Used internally.
#
# == details
# The following plots are made:
#
# - eCDF of the consensus matrix under each k, made by `plot_ecdf,ConsensusPartition-method`,
# - the `PAC` score,
# - the mean silhouette score,
# - the `concordance` of each partition to the consensus partition,
# - the increase of the area under the eCDF curve of the consensus matrix with increasing k (see the sketch below),
# - the Rand index for the current k compared to k - 1,
# - the Jaccard coefficient for the current k compared to k - 1.
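#
# As a sketch of how the area-increase statistic is computed in the code: let ``f``
# be the eCDF of the consensus values under a given k, evaluated on a fine grid
# ``x`` in [0, 1] of length n; then
#
#     A(k) = sum((x[2:n] - x[1:(n-1)]) * f(x[2:n]))
#
# and the reported value is (A(k) - A(k-1))/A(k-1), taking A(k) itself for the smallest k.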
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# select_partition_number(golub_cola["ATC", "skmeans"])
setMethod(f = "select_partition_number",
	signature = "ConsensusPartition",
	definition = function(object, all_stats = FALSE) {
	op = par(no.readonly = TRUE)

	m = get_stats(object, all_stats = all_stats)
	if(all_stats) STAT_USED = STAT_ALL
	m = m[, colnames(m) %in% c(STAT_USED, "area_increased", "Rand", "Jaccard"), drop = FALSE]
	nm = colnames(m)
	n = ncol(m)
	nr = floor(sqrt(n)+1)

	par(mfrow = c(nr, ceiling((n+1)/nr)), mar = c(4, 4, 1, 1))

	plot_ecdf(object, lwd = 1)

	best_k = suggest_best_k(object)
	if(is.na(best_k)) best_k = -1

	for(i in seq_len(ncol(m))) {
		l = object@k == best_k
		plot(object@k, m[, i], type = "b", xlab = "k", ylab = nm[i])
		if(any(l)) {
			points(object@k, m[, i],
				pch = ifelse(l, 16, 1),
				col = ifelse(l, "red", "black"))
		}
	}

	par(xpd = NA, mar = c(4, 2, 1, 1))
	plot(c(0, 1), c(0, 1), type = "n", axes = FALSE, ann = FALSE)
	text(x = 0, y = 1,
"Suggested rules:
Jaccard index < 0.95;
If 1-PAC >= 0.90,
  take the maximum k;
else take the k with the highest votes of
  1. max 1-PAC,
  2. max mean silhouette,
  3. max concordance.
", cex = 1.2, adj = c(0, 1))

	legend("topright", pch = 16, col = "Red", legend = "best k", cex = 1.5)

	par(op)
})


# == title
# Heatmap of the consensus matrix
#
# == param
# -object A `ConsensusPartition-class` object.
# -k Number of subgroups.
# -internal Used internally.
# -anno A data frame of annotations for the original matrix columns. 
#       By default it uses the annotations specified in `consensus_partition` or `run_all_consensus_partition_methods`.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
#       ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -show_row_names Whether to plot row names on the consensus heatmap (which are the column names of the original matrix).
# -simplify Internally used.
# -... Other arguments.
#
# == details
# For row i and column j in the consensus matrix, the corresponding value x_ij is the
# probability of sample i and sample j being in the same group, estimated over all partitions.
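#
# Equivalently, if ``class_es`` stores the class label of every sample (rows)
# in every single partition (columns), a sketch of the computation is
# (``class_es`` is a placeholder name):
#
#     consensus_mat[i, j] = mean(class_es[i, ] == class_es[j, ])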
#
# The following heatmaps are drawn from left to right:
#
# - the probability of each sample belonging to each group,
# - the silhouette score, which measures the distance of a sample to the second closest subgroup,
# - the predicted subgroups,
# - the consensus matrix,
# - more annotations if provided as ``anno``.
#
# Note that since the consensus subgroups are already known from the consensus partition,
# rows and columns of the heatmap are only clustered within each subgroup.
#
# == value
# No value is returned.
#
# == seealso
# `membership_heatmap,ConsensusPartition-method`
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# consensus_heatmap(golub_cola["ATC", "skmeans"], k = 3)
setMethod(f = "consensus_heatmap",
	signature = "ConsensusPartition",
	definition = function(object, k, internal = FALSE,
	anno = object@anno, anno_col = get_anno_col(object), 
	show_row_names = FALSE, simplify = FALSE, ...) {

	if(missing(k)) stop_wrap("k needs to be provided.")

	if(inherits(object, "DownSamplingConsensusPartition")) {
		class_df = get_classes(object, k, reduce = TRUE)
	} else {
		class_df = get_classes(object, k)
	}
	class_ids = class_df$class

	consensus_mat = get_consensus(object, k)

	mat_col_od = column_order_by_group(factor(class_ids, levels = sort(unique(class_ids))), consensus_mat)

	membership_mat = get_membership(object, k)

	if(simplify) {
		ht_list = NULL
	} else {
		ht_list = Heatmap(membership_mat, name = "Prob", cluster_columns = FALSE, show_row_names = FALSE,
				width = unit(5, "mm")*k, col = colorRamp2(c(0, 1), c("white", "red")),
				show_column_names = !internal) + 
			Heatmap(class_df$silhouette, name = "Silhouette", width = unit(5, "mm"),
				show_row_names = FALSE, col = colorRamp2(c(0, 1), c("white", "purple")),
				show_column_names = !internal)
	}
	ht_list = ht_list + Heatmap(class_ids, name = "Class", col = cola_opt$color_set_2,
		show_row_names = FALSE, width = unit(5, "mm"),
		show_column_names = !internal)
	
	ht_list = ht_list +	Heatmap(consensus_mat, name = "Consensus", show_row_names = FALSE, show_row_dend = FALSE,
		col = colorRamp2(c(0, 1), c("white", "blue")), row_order = mat_col_od, column_order = mat_col_od,
		cluster_rows = FALSE, cluster_columns = FALSE, show_column_names = FALSE)

	if(!is.null(anno)) {
		if(is.atomic(anno)) {
			anno_nm = deparse(substitute(anno))
			anno = data.frame(anno)
			colnames(anno) = anno_nm
			if(!is.null(anno_col)) {
				anno_col = list(anno_col)
				names(anno_col) = anno_nm
			}
		} else if(ncol(anno) == 1) {
			if(!is.null(anno_col)) {
				if(is.atomic(anno_col)) {
					anno_col = list(anno_col)
					names(anno_col) = colnames(anno)
				}
			}
		}
		if(is.null(anno_col))
			ht_list = ht_list + rowAnnotation(df = anno, show_annotation_name = !internal)
		else {
			ht_list = ht_list + rowAnnotation(df = anno, col = anno_col, show_annotation_name = !internal)
		}
	}

	if(show_row_names && !is.null(rownames(consensus_mat))) {
		ht_list = ht_list + rowAnnotation(rn = anno_text(rownames(consensus_mat)))
	}
	if(internal) {
		column_title = NULL
	} else {
		column_title = qq("consensus @{object@partition_method} with @{k} groups from @{object@n_partition/length(object@k)} partitions")
	}
	draw(ht_list, main_heatmap = "Consensus", column_title = column_title,
		show_heatmap_legend = !internal, show_annotation_legend = !internal)
})

# == title
# Heatmap of membership in each partition
#
# == param
# -object A `ConsensusPartition-class` object.
# -k Number of subgroups.
# -internal Used internally.
# -anno A data frame of annotations for the original matrix columns. 
#       By default it uses the annotations specified in `consensus_partition` or `run_all_consensus_partition_methods`.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
#       ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -show_column_names Whether to show column names in the heatmap (which are the column names of the original matrix).
# -... Other arguments
#
# == details
# Each row in the heatmap is the membership of the samples in one single partition.
#
# The heatmap is split on rows by ``top_n``.
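#
# For example, with ``partition_repeat = 50`` and three ``top_n`` values, the
# heatmap contains 50 x 3 = 150 rows, split into three blocks of 50 partitions each.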
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# membership_heatmap(golub_cola["ATC", "skmeans"], k = 3)
setMethod(f = "membership_heatmap",
	signature = "ConsensusPartition",
	definition = function(object, k, internal = FALSE, 
	anno = object@anno, anno_col = get_anno_col(object),
	show_column_names = FALSE, ...) {

	if(missing(k)) stop_wrap("k needs to be provided.")

	if(inherits(object, "DownSamplingConsensusPartition")) {
		class_df = get_classes(object, k, reduce = TRUE)
	} else {
		class_df = get_classes(object, k)
	}
	class_ids = class_df$class

	membership_mat = get_membership(object, k)
	col_fun = colorRamp2(c(0, 1), c("white", "red"))

	membership_each = get_membership(object, k, each = TRUE)
	membership_each = t(membership_each)
	mat_col_od = column_order_by_group(factor(class_ids, levels = sort(unique(class_ids))), membership_each)

	col = cola_opt$color_set_1[1:k]

	if(is.null(anno)) {
		bottom_anno = NULL
	} else {
		if(is.atomic(anno)) {
			anno_nm = deparse(substitute(anno))
			anno = data.frame(anno)
			colnames(anno) = anno_nm
			if(!is.null(anno_col)) {
				anno_col = list(anno_col)
				names(anno_col) = anno_nm
			}
		} else if(ncol(anno) == 1) {
			if(!is.null(anno_col)) {
				if(is.atomic(anno_col)) {
					anno_col = list(anno_col)
					names(anno_col) = colnames(anno)
				}
			}
		}

		if(is.null(anno_col)) {
			bottom_anno = HeatmapAnnotation(df = anno,
				show_annotation_name = !internal, annotation_name_side = "right")
		} else {
			bottom_anno = HeatmapAnnotation(df = anno, col = anno_col,
				show_annotation_name = !internal, annotation_name_side = "right")
		}
	}

	param = get_param(object, k, unique = FALSE)

	top_n_level = unique(param$top_n)
	suppressWarnings(n_row_col <- structure(brewer.pal(length(top_n_level), "Accent"), names = top_n_level))
	
	ht = Heatmap(as.character(param$top_n), name = "top_n", col = n_row_col,
		width = unit(10, "mm"), show_row_names = FALSE, show_heatmap_legend = FALSE,
		show_column_names = FALSE) +
	Heatmap(membership_each, name = "Class", show_row_dend = FALSE, show_column_dend = FALSE, col = cola_opt$color_set_2,
		column_title = ifelse(internal, "", qq("membership heatmap, k = @{k}")), 
		column_order = mat_col_od, cluster_columns = FALSE,
		row_split = factor(param$top_n, levels = sort(unique(param$top_n))),
		cluster_row_slices = FALSE,
		top_annotation = HeatmapAnnotation(Prob = membership_mat,
			Class = class_ids, col = c(list(Class = cola_opt$color_set_2), Prob = col_fun),
			show_annotation_name = !internal),
		bottom_annotation = bottom_anno,
		show_column_names = show_column_names,
		row_title = NULL
	)
	if(internal) {
		row_title = NULL
	} else {
		row_title = qq("@{round(object@n_partition/length(object@k)/length(top_n_level))} x @{length(top_n_level)} random samplings")
	}
	draw(ht, main_heatmap = "Class", row_title = row_title,
		show_heatmap_legend = FALSE, show_annotation_legend = !internal)
	if(!internal) {
		for(i in seq_along(top_n_level)) {
			decorate_heatmap_body("top_n", slice = i, {
				grid.text(qq("top @{top_n_level[i]} rows"), rot = 90, gp = gpar(fontsize = 10))
			})
		}
	}
})

# == title
# Visualize column after dimension reduction
#
# == description
# Visualize samples (the matrix columns) after dimension reduction
#
# == param
# -object A `ConsensusPartition-class` object.
# -k Number of subgroups.
# -top_n Top n rows to use. By default it uses all rows in the original matrix.
# -method Which method to use to reduce the dimension of the data. ``MDS`` uses `stats::cmdscale`,
#         ``PCA`` uses `stats::prcomp`. ``t-SNE`` uses `Rtsne::Rtsne`. ``UMAP`` uses
#         `umap::umap`.
# -control A list of parameters for `Rtsne::Rtsne` or `umap::umap`.
# -internal Internally used.
# -nr If the number of matrix rows is larger than this value, ``nr`` randomly sampled rows are used.
# -silhouette_cutoff Cutoff for silhouette scores. Data points with values less
#        than it are drawn as cross symbols.
# -remove Whether to remove columns whose silhouette scores are less than
#        the cutoff.
# -scale_rows Whether to perform scaling on matrix rows.
# -verbose Whether to print messages.
# -... Other arguments.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# dimension_reduction(golub_cola["ATC", "skmeans"], k = 3)
setMethod(f = "dimension_reduction",
	signature = "ConsensusPartition",
	definition = function(object, k, top_n = NULL,
	method = c("PCA", "MDS", "t-SNE", "UMAP"), 
	control = list(),
	internal = FALSE, nr = 5000,
	silhouette_cutoff = 0.5, remove = FALSE,
	scale_rows = TRUE, verbose = TRUE, ...) {

	if(missing(k)) stop_wrap("k needs to be provided.")

	cl = as.list(match.call())
	# default value
	if(! "method" %in% names(cl)) {
		method = NULL
		if(is.null(method)) {
			oe = try(loadNamespace("umap"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("umap package is not installed.\n")
			} else {
				if(verbose) cat("use UMAP\n")
				method = "UMAP"
			}
		}
		if(is.null(method)) {
			oe = try(loadNamespace("Rtsne"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("Rtsne package is not installed.\n")
			} else {
				if(verbose) cat("use t-SNE\n")
				method = "t-SNE"
			}
		}
		if(is.null(method)) {
			if(verbose) cat("use PCA\n")
			method = "PCA"
		}
	}

	method = match.arg(method)
	data = object@.env$data[, object@column_index, drop = FALSE]

	if(!is.null(top_n)) {
		top_n = min(c(top_n, nrow(data)))
		all_value = object@top_value_list
		ind = order(all_value, decreasing = TRUE)[1:top_n]
		if(length(ind) > nr) ind = sample(ind, nr)
		data = data[ind, , drop = FALSE]
	} else {
		top_n = nrow(data)
		if(nrow(data) > nr) data = data[sample(1:nrow(data), nr), , drop = FALSE]
	}

	class_df = get_classes(object, k)

	l = class_df$silhouette >= silhouette_cutoff
	
	op = par(c("mar", "xpd"))
	par(mar = c(4.1, 4.1, 4.1, 6), xpd = NA)
	on.exit(par(op))

	class_level = sort(as.character(unique(class_df$class)))
	n_class = length(class_level)
		
	if(remove) {
		dimension_reduction(data[, l], pch = 16, col = cola_opt$color_set_2[as.character(class_df$class[l])],
			cex = 1, main = qq("@{method} on @{top_n} rows with highest @{object@top_value_method} scores@{ifelse(scale_rows, ', rows are scaled', '')}\n@{sum(l)}/@{length(l)} confident samples (silhouette >= @{silhouette_cutoff})"),
			method = method, control = control, scale_rows = scale_rows, nr = nr, internal = internal, verbose = verbose, ...)
		if(!internal) {
			legend(x = par("usr")[2], y = mean(par("usr")[3:4]), legend = c(paste0("group", class_level), "ambiguous"), 
				pch = c(rep(16, n_class), 0),
				col = c(cola_opt$color_set_2[class_level], "white"), xjust = 0, yjust = 0.5,
				title = "Class", title.adj = 0.1, bty = "n",
				text.col = c(rep("black", n_class), "white"))
		}
	} else {
		dimension_reduction(data, pch = ifelse(l, 16, 4), col = cola_opt$color_set_2[as.character(class_df$class)],
			cex = 1, main = qq("@{method} on @{top_n} rows with highest @{object@top_value_method} scores@{ifelse(scale_rows, ', rows are scaled', '')}\n@{sum(l)}/@{length(l)} confident samples (silhouette >= @{silhouette_cutoff})"),
			method = method, control = control, scale_rows = scale_rows, nr = nr, internal = internal, verbose = verbose, ...)
		if(!internal) {
			if(any(!l)) {
				legend(x = par("usr")[2], y = mean(par("usr")[3:4]), legend = c(paste0("group", class_level), "ambiguous"), 
						pch = c(rep(16, n_class), 4),
						col = c(cola_opt$color_set_2[class_level], "black"), xjust = 0, yjust = 0.5,
						title = "Class", title.adj = 0.1, bty = "n")
			} else {
				legend(x = par("usr")[2], y = mean(par("usr")[3:4]), legend = c(paste0("group", class_level), "ambiguous"), 
					pch = c(rep(16, n_class), 0),
					col = c(cola_opt$color_set_2[class_level], "white"), xjust = 0, yjust = 0.5,
					title = "Class", title.adj = 0.1, bty = "n",
					text.col = c(rep("black", n_class), "white"))
			}
		}
	}
})


# == title
# Visualize columns after dimension reduction
#
# == param
# -object A numeric matrix.
# -method Which method to use to reduce the dimension of the data. ``MDS`` uses `stats::cmdscale`,
#         ``PCA`` uses `stats::prcomp`. ``t-SNE`` uses `Rtsne::Rtsne`. ``UMAP`` uses
#         `umap::umap`.
# -pc Which two principal components to visualize.
# -control A list of parameters for `Rtsne::Rtsne` or `umap::umap`.
# -pch Shape of points.
# -col Color of points.
# -cex Size of points.
# -main Title of the plot.
# -scale_rows Whether to perform scaling on matrix rows.
# -nr If the number of matrix rows is larger than this value, ``nr`` randomly sampled rows are used.
# -internal Internally used.
# -verbose Whether to print messages.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "dimension_reduction",
	signature = "matrix",
	definition = function(object, 
	pch = 16, col = "black", cex = 1, main = "",
	method = c("PCA", "MDS", "t-SNE", "UMAP"),
	pc = NULL, control = list(), 
	scale_rows = TRUE, nr = 5000,
	internal = FALSE, verbose = TRUE) {

	data = object
	if(nrow(data) > nr) {
		data = data[sample(1:nrow(data), nr), , drop = FALSE]
	}
	cl = as.list(match.call())

	# default value
	if(! "method" %in% names(cl)) {
		method = NULL
		if(is.null(method)) {
			oe = try(loadNamespace("umap"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("umap package is not installed.\n")
			} else {
				if(verbose) cat("use UMAP\n")
				method = "UMAP"
			}
		}
		if(is.null(method)) {
			oe = try(loadNamespace("Rtsne"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("Rtsne package is not installed.\n")
			} else {
				if(verbose) cat("use t-SNE\n")
				method = "t-SNE"
			}
		}
		if(is.null(method)) {
			if(verbose) cat("use PCA\n")
			method = "PCA"
		}
	}

	method = match.arg(method)[1]

	if(is.null(pc)) {
		if(method == "PCA") {
			pc = 1:2
		} else if(method %in% c("UMAP", "t-SNE")) {
			pc = seq_len(min(c(10, ncol(data))))
		}
	} else if(length(pc) == 1) {
		pc = 1:pc
	}

	if(method %in% c("UMAP", "t-SNE")) {
		main = paste0(main, qq(", with @{length(pc)} PCs"))
	}

	if(scale_rows) data = t(scale(t(data)))
	l = apply(data, 1, function(x) any(is.na(x)))
	data = data[!l, ]

	if(internal) {
		omar = par("mar")
		par(mar = c(4.1, 4.1, 1, 1))
		on.exit(par(mar = omar))

		main = NULL
	}

	if(method == "MDS") {
		loc = cmdscale(dist(t(data)))
		plot(loc, pch = pch, col = col, cex = cex, main = main, xlab = "Coordinate 1", ylab = "Coordinate 2")
	} else if(method == "PCA") {
		fit = prcomp(t(data))
		sm = summary(fit)
		prop = sm$importance[2, pc]
		loc = fit$x[, pc]

		plot(loc, pch = pch, col = col, cex = cex, main = main, xlab = qq("PC@{pc[1]} (@{round(prop[1]*100)}%)"), ylab = qq("PC@{pc[2]} (@{round(prop[2]*100)}%)"))
	} else if(method == "t-SNE") {

		check_pkg("Rtsne", bioc = FALSE)
		fit = prcomp(t(data))
		loc = fit$x[, pc]

		param = list(X = loc)
		param = c(param, control)
		fit = do.call(Rtsne::Rtsne, param)
		loc = fit$Y
		plot(loc, pch = pch, col = col, cex = cex, main = main, xlab = "t-SNE 1", ylab = "t-SNE 2")
	} else if(method == "UMAP") {

		check_pkg("umap", bioc = FALSE)

		fit = prcomp(t(data))
		loc = fit$x[, pc]

		param = list(d = loc)
		if(!"config" %in% names(control)) {
			# reset n_neighbors for small dataset
			control$config = umap::umap.defaults
			control$config$n_neighbors = min(umap::umap.defaults$n_neighbors, round(ncol(data)/2))
		} else {
			if(!"n_neighbors" %in% names(control$config)) {
				# reset n_neighbors for small dataset
				control$config$n_neighbors = min(umap::umap.defaults$n_neighbors, round(ncol(data)/2))
			}
		}
		param = c(param, control)
		fit = do.call(umap::umap, param)
		loc = fit$layout
		plot(loc, pch = pch, col = col, cex = cex, main = main, xlab = "UMAP 1", ylab = "UMAP 2")
	}
})
