# R/down_sampling.R

# == title
# The DownSamplingConsensusPartition class
#
# == alias
# DownSamplingConsensusPartition
#
# == details
# The ``DownSamplingConsensusPartition`` class performs consensus partitioning only with a small subset
# of columns; the classes of the remaining columns are predicted by `predict_classes,ConsensusPartition-method`.
#
# The ``DownSamplingConsensusPartition-class`` is a child class of `ConsensusPartition-class`. It inherits
# all methods of `ConsensusPartition-class`.
#
# == seealso
# The constructor function `consensus_partition_by_down_sampling`.
#
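# == example
# # A minimal sketch using the pre-computed example object; methods inherited
# # from `ConsensusPartition-class` can be applied directly:
# data(golub_cola_ds)
# golub_cola_ds
# suggest_best_k(golub_cola_ds)
#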
DownSamplingConsensusPartition = setClass("DownSamplingConsensusPartition",
	slots = list(predict = "ANY",
		         full_column_index = "ANY",
		         full_anno = "ANY"),
	contains = "ConsensusPartition"
)

# == title
# Consensus partitioning only with a subset of columns
#
# == param
# -data A numeric matrix where subgroups are found by columns.
# -subset Number of columns to randomly sample, or a vector of selected indices.
# -verbose Whether to print messages.
# -prefix Internally used.
# -anno Annotation data frame.
# -anno_col Annotation colors.
# -dist_method Method for predicting the classes of the remaining columns.
# -.env An environment, internally used.
# -... All other arguments are passed to `consensus_partition`.
#
# == details
# The function performs consensus partitioning only with a small subset
# of columns; the classes of the remaining columns are predicted by `predict_classes,ConsensusPartition-method`.
#
# == example
# \dontrun{
# data(golub_cola)
# m = get_matrix(golub_cola)
#
# set.seed(123)
# golub_cola_ds = consensus_partition_by_down_sampling(m, subset = 50,
# 	anno = get_anno(golub_cola), anno_col = get_anno_col(golub_cola),
# 	top_value_method = "SD", partition_method = "kmeans")
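#
# # `subset` can also be an explicit vector of column indices; a sketch with
# # a hypothetical object name:
# golub_cola_ds2 = consensus_partition_by_down_sampling(m,
# 	subset = sample(ncol(m), 50),
# 	top_value_method = "SD", partition_method = "kmeans")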
# }
consensus_partition_by_down_sampling = function(data, subset = min(round(ncol(data)*0.2), 250),
	verbose = TRUE, prefix = "", anno = NULL, anno_col = NULL,
	dist_method = c("euclidean", "correlation", "cosine"),
	.env = NULL, ...) {

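	# Resolve the working matrix and the usable columns: `.env` may be passed in
	# from an outer call and can already carry the data and/or a pre-selected set
	# of column indices; otherwise both are initialized here.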
	if(is.null(.env)) {
		if(is.data.frame(data)) data = as.matrix(data)

		.env = new.env(parent = emptyenv())
		.env$data = data
		.env$column_index = seq_len(ncol(data))
	} else if(is.null(.env$data)) {
		if(is.data.frame(data)) data = as.matrix(data)

		.env$data = data
		.env$column_index = seq_len(ncol(data))
	} else if(is.null(.env$column_index)) {
		data = .env$data
		.env$column_index = seq_len(ncol(data))
	} else {
		data = .env$data
	}

	data = data[, .env$column_index, drop = FALSE]

	if(!is.null(anno)) {
		if(is.atomic(anno)) {
			anno_nm = deparse(substitute(anno))
			anno = data.frame(anno)
			colnames(anno) = anno_nm
			if(!is.null(anno_col)) {
				anno_col = list(anno_col)
				names(anno_col) = anno_nm
			}
		} else if(ncol(anno) == 1) {
			if(!is.null(anno_col)) {
				if(is.atomic(anno_col)) {
					anno_col = list(anno_col)
					names(anno_col) = colnames(anno)
				}
			}
		}
		anno2 = anno[.env$column_index, , drop = FALSE]
	} else {
		anno2 = NULL
	}

	qqcat("@{prefix}* @{ifelse(length(subset) == 1, subset, length(subset))} columns are randomly sampled from @{ncol(data)} columns.\n")
		
	column_index = .env$column_index
	if(length(subset) == 1) {
		subset_index = sample(column_index, subset)
	} else {
		if(!is.numeric(subset)) {
			stop_wrap("If `subset` is specified as an indices vector, it should be in numeric.")
		}
		subset_index = column_index[subset]
	}
	.env$column_index = subset_index
	
	# the cached top values were calculated on the full set of columns and cannot be reused here
	.env$all_top_value_list = NULL
	cp = consensus_partition(.env = .env, prefix = prefix, anno = anno2, anno_col = anno_col, ...)
	
	obj = new("DownSamplingConsensusPartition")
	for(nm in slotNames(cp)) {
		slot(obj, nm) = slot(cp, nm)
	}

	cl = list()
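	# Reapply the row scaling used during partitioning so that the columns left
	# out of the subset are predicted on the same scale as the subset columns.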
	if(cp@scale_rows) {
		if("z-score" %in% attr(cp@scale_rows, "scale_method")) {
			data2 = t(scale(t(data)))
		} else if("min-max" %in% attr(cp@scale_rows, "scale_method")) {
			row_min = rowMins(data)
			row_max = rowMaxs(data)
			row_range = row_max - row_min
			data2 = (data - row_min)/row_range
		}
	}

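	# Predict classes for all columns (not only the down-sampled subset) at every
	# k. `p_cutoff = 1` keeps every label together with its prediction p-value;
	# filtering by p-value is left to downstream functions such as `get_classes()`.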
	for(k in cp@k) {
		qqcat("@{prefix}* predict class for @{ncol(data)} samples with k = @{k}\n")
		if(cp@scale_rows) {
			cl[[as.character(k)]] = predict_classes(cp, k = k, data2, p_cutoff = 1, dist_method = dist_method, 
				plot = FALSE, verbose = verbose, force = TRUE, help = FALSE, prefix = qq("@{prefix}  "))
		} else {
			cl[[as.character(k)]] = predict_classes(cp, k = k, data, p_cutoff = 1, dist_method = dist_method, 
				plot = FALSE, verbose = verbose, force = TRUE, help = FALSE, prefix = qq("@{prefix}  "))
		}
	}

	obj@predict$class = cl
	obj@full_anno = anno
	obj@full_column_index = column_index

	return(obj)
}

# == title
# Print the DownSamplingConsensusPartition object
#
# == param
# -object A `DownSamplingConsensusPartition-class` object.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_ds)
# golub_cola_ds
setMethod(f = "show",
	signature = "DownSamplingConsensusPartition",
	definition = function(object) {

	# fix objects from older versions where there was no sample_by slot
	error = try(object@sample_by, silent = TRUE)
	if(inherits(error, "try-error")) {
		object@sample_by = "row"
	}
	qqcat("A 'DownSamplingConsensusPartition' object with k = @{paste(object@k, collapse = ', ')}.\n")
	qqcat("  On a matrix with @{nrow(object@.env$data)} rows and @{length(object@column_index)} columns, randomly sampled from @{length(object@full_column_index)} columns.\n")
	top_n_str = object@top_n
	qqcat("  Top rows (@{paste(top_n_str, collapse = ', ')}) are extracted by '@{object@top_value_method}' method.\n")
	qqcat("  Subgroups are detected by '@{object@partition_method}' method.\n")
	qqcat("  Performed in total @{object@n_partition} partitions by @{object@sample_by} resampling.\n")
	best_k = suggest_best_k(object)
	if(is.na(best_k)) {
		qqcat("  There is no best k.\n")
	} else {
		qqcat("  Best k for subgroups seems to be @{best_k}.\n")
	}
	qqcat("\n")
	qqcat("Following methods can be applied to this 'DownSamplingConsensusPartition' object:\n")
	txt = showMethods(classes = c("DownSamplingConsensusPartition", "ConsensusPartition"), where = topenv(), printTo = FALSE)
	txt = grep("Function", txt, value = TRUE)
	fname = gsub("Function: (.*?) \\(package.*$", "\\1", txt)
	print(unique(fname))

})

# == title
# Get annotations
#
# == param
# -object A `DownSamplingConsensusPartition-class` object.
# -reduce Used internally.
#
# == value
# A data frame if ``anno`` was specified in `consensus_partition_by_down_sampling`, or else ``NULL``.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_ds)
# get_anno(golub_cola_ds)
setMethod(f = "get_anno",
	signature = "DownSamplingConsensusPartition",
	definition = function(object, reduce = FALSE) {
	if(reduce) {
		object@anno
	} else {
		object@full_anno
	}
})

# == title
# Get subgroup labels
#
# == param
# -object A `DownSamplingConsensusPartition-class` object.
# -k Number of subgroups.
# -p_cutoff Cutoff of p-values of class label prediction. It is only used when ``k`` is a vector.
# -reduce Used internally.
#
# == value
# If ``k`` is a scalar, it returns a data frame with two columns:
#
# - the class labels
# - the p-value for the prediction of class labels.
#
# If ``k`` is a vector, it returns a data frame of class labels for each k. Class
# labels with prediction p-values larger than ``p_cutoff`` are set to ``NA``.
#
# == example
# data(golub_cola_ds)
# get_classes(golub_cola_ds, k = 3)
# get_classes(golub_cola_ds)
setMethod(f = "get_classes",
	signature = "DownSamplingConsensusPartition",
	definition = function(object, k = object@k, p_cutoff = 0.05, reduce = FALSE) {

	if(reduce) {
		return(selectMethod("get_classes", signature = "ConsensusPartition")(object, k = k))
	}
	if(length(k) == 1) {
		i = which(object@k == k)
		object@predict$class[[i]]
	} else {
		df = do.call("cbind", lapply(k, function(i) {
			df = object@predict$class[[as.character(i)]]
			cl = df[, "class"]
			cl[df[, "p"] > p_cutoff] = NA
			cl
		}))
		colnames(df) = paste0("k=", k)
		rownames(df) = rownames(object@predict$class[[1]])
		df
	}
})

# == title
# Test correspondence between predicted subgroups and known factors
#
# == param
# -object A `DownSamplingConsensusPartition-class` object.
# -k Number of subgroups. All values in ``k`` are used if it is not specified.
# -known A vector or a data frame with known factors. By default it is the annotation table set in `consensus_partition_by_down_sampling`.
# -p_cutoff Cutoff for p-values of the class prediction. Samples with p-values higher than the cutoff are omitted.
# -verbose Whether to print messages.
#
# == details
# The test is performed by `test_between_factors` between the predicted classes and user's annotation table.
# 
# == value
# A data frame with the following columns:
#
# - number of samples used in the test after filtering by ``p_cutoff``,
# - p-values from the tests,
# - number of subgroups.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_ds)
# test_to_known_factors(golub_cola_ds, k = 3)
# test_to_known_factors(golub_cola_ds)
setMethod(f = "test_to_known_factors",
	signature = "DownSamplingConsensusPartition",
	definition = function(object, k, known = get_anno(object), 
	p_cutoff = 0.05, verbose = FALSE) {

	if(missing(k)) {
		all_k = object@k
		m = do.call("rbind", lapply(all_k, function(k) test_to_known_factors(object, k, known = known, 
			p_cutoff = p_cutoff, verbose = verbose)))
		return(m)
	}

	class_df = get_classes(object, k)
	l = class_df$p <= p_cutoff
	class = as.character(class_df$class)[l]

	if(is.null(known)) {
		stop_wrap("`known` should not be NULL.")
	}

	if(is.data.frame(known) || is.matrix(known)) {
		known = known[l, , drop = FALSE]
	} else {
		known = known[l]
	}

	m = test_between_factors(class, known, verbose = verbose)
	rownames(m) = paste(object@top_value_method, object@partition_method, sep = ":")
	colnames(m) = paste0(colnames(m), "(p-value)")
	m = cbind(n_sample = sum(l), m, k = k)
	return(m)
})

# == title
# Visualize column after dimension reduction
#
# == description
# Visualize samples (the matrix columns) after dimension reduction
#
# == param
# -object A `DownSamplingConsensusPartition-class` object.
# -k Number of subgroups.
# -top_n Top n rows to use. By default it uses all rows in the original matrix.
# -method Which method to reduce the dimension of the data. ``MDS`` uses `stats::cmdscale`,
#         ``PCA`` uses `stats::prcomp`. ``t-SNE`` uses `Rtsne::Rtsne`. ``UMAP`` uses
#         `umap::umap`.
# -control A list of parameters for `Rtsne::Rtsne` or `umap::umap`.
# -internal Internally used.
# -nr If the number of matrix rows is larger than this value, ``nr`` randomly sampled rows are used.
# -p_cutoff Cutoff for p-values of class label prediction. Data points with p-values higher
#        than the cutoff are drawn with cross symbols.
# -remove Whether to remove columns whose prediction p-values are higher than
#        the cutoff.
# -scale_rows Whether to perform scaling on matrix rows.
# -verbose Whether to print messages.
# -... Other arguments.
#
# == details
# This function is very similar to `dimension_reduction,ConsensusPartition-method`.
#
# == value
# No value is returned.
#
# == example
# data(golub_cola_ds)
# dimension_reduction(golub_cola_ds, k = 2)
# dimension_reduction(golub_cola_ds, k = 3)
setMethod(f = "dimension_reduction",
	signature = "DownSamplingConsensusPartition",
	definition = function(object, k, top_n = NULL,
	method = c("PCA", "MDS", "t-SNE", "UMAP"), 
	control = list(),
	internal = FALSE, nr = 5000,
	p_cutoff = 0.05, remove = FALSE,
	scale_rows = TRUE, verbose = TRUE, ...) {

	# the following code is basically the same as dimension_reduction,ConsensusPartition-method

	if(missing(k)) stop_wrap("k needs to be provided.")

	cl = as.list(match.call())
	# default value
	if(! "method" %in% names(cl)) {
		method = NULL
		if(is.null(method)) {
			oe = try(loadNamespace("umap"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("umap package is not installed.\n")
			} else {
				if(verbose) cat("use UMAP\n")
				method = "UMAP"
			}
		}
		if(is.null(method)) {
			oe = try(loadNamespace("Rtsne"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("Rtsne package is not installed.\n")
			} else {
				if(verbose) cat("use t-SNE\n")
				method = "t-SNE"
			}
		}
		if(is.null(method)) {
			if(verbose) cat("use PCA\n")
			method = "PCA"
		}
	}

	method = match.arg(method)
	data = object@.env$data[, object@full_column_index, drop = FALSE]

	if(!is.null(top_n)) {
		top_n = min(c(top_n, nrow(data)))
		all_value = object@top_value_list
		ind = order(all_value, decreasing = TRUE)[1:top_n]  # rows with the highest top values
		if(length(ind) > nr) ind = sample(ind, nr)
		data = data[ind, , drop = FALSE]
	} else {
		top_n = nrow(data)
		if(nrow(data) > nr) data = data[sample(1:nrow(data), nr), , drop = FALSE]
	}

	class_df = get_classes(object, k)

	l = class_df$p <= p_cutoff
	
	op = par(c("mar", "xpd"))
	par(mar = c(4.1, 4.1, 4.1, 6), xpd = NA)
	on.exit(par(op))

	class_level = sort(as.character(unique(class_df$class)))
	n_class = length(class_level)
		
	if(remove) {
		dimension_reduction(data[, l], pch = 16, col = cola_opt$color_set_2[as.character(class_df$class[l])],
			cex = ifelse(which(l) %in% object@column_index, 1, 0.5), 
			main = qq("@{method} on @{top_n} rows with highest @{object@top_value_method} scores@{ifelse(scale_rows, ', rows are scaled', '')}\n@{sum(l)}/@{length(l)} confident samples (p < @{p_cutoff})"),
			method = method, control = control, scale_rows = scale_rows, nr = nr, internal = internal, verbose = verbose, ...)
		if(!internal) {
			legend(x = par("usr")[2], y = mean(par("usr")[3:4]), 
				legend = c(paste0("group", class_level), "predicted"), 
				pch = c(rep(16, n_class), 16),
				pt.cex = c(rep(1, n_class), 0.5),
				col = c(cola_opt$color_set_2[class_level], "black"), 
				xjust = 0, yjust = 0.5,
				title = "Class", title.adj = 0.1, bty = "n",
				text.col = c(rep("black", n_class), "black"))
		}
	} else {
		dimension_reduction(data, pch = ifelse(l, 16, 4), col = cola_opt$color_set_2[as.character(class_df$class)],
			cex = ifelse(1:ncol(data) %in% object@column_index, 1, 0.5),
			main = qq("@{method} on @{top_n} rows with highest @{object@top_value_method} scores@{ifelse(scale_rows, ', rows are scaled', '')}\n@{sum(l)}/@{length(l)} confident samples (p < @{p_cutoff})"),
			method = method, control = control, scale_rows = scale_rows, nr = nr, internal = internal, verbose = verbose, ...)
		if(!internal) {
			if(any(!l)) {
				legend(x = par("usr")[2], y = mean(par("usr")[3:4]), 
					legend = c(paste0("group", class_level), "ambiguous", "predicted", "predicted"), 
						pch = c(rep(16, n_class), 4, 16, 4),
						pt.cex = c(rep(1, n_class), 1, 0.5, 0.5),
						col = c(cola_opt$color_set_2[class_level], "black", "black", "black"), 
						xjust = 0, yjust = 0.5,
						title = "Class", title.adj = 0.1, bty = "n")
			} else {
				legend(x = par("usr")[2], y = mean(par("usr")[3:4]), 
					legend = c(paste0("group", class_level), "predicted"), 
					pch = c(rep(16, n_class), 16),
					pt.cex = c(rep(1, n_class), 0.5),
					col = c(cola_opt$color_set_2[class_level], "black"), 
					xjust = 0, yjust = 0.5,
					title = "Class", title.adj = 0.1, bty = "n",
					text.col = c(rep("black", n_class), "black"))
			}
		}
	}
})

# == title
# Get signature rows
#
# == param
# -object A `DownSamplingConsensusPartition-class` object.
# -k Number of subgroups.
# -p_cutoff Cutoff for p-values of class label prediction. Samples with p-values
#        higher than the cutoff are not used for finding signature rows.
# -fdr_cutoff Cutoff for FDR of the difference test between subgroups.
# -group_diff Cutoff for the maximal difference between group means.
# -scale_rows Whether to apply row scaling when making the heatmap.
# -row_km Number of groups for performing k-means clustering on rows. By default it is automatically selected.
# -diff_method Methods to get rows which are significantly different between subgroups, see 'Details' section.
# -anno A data frame of annotations for the original matrix columns. 
#       By default it uses the annotations specified in `consensus_partition_by_down_sampling`.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
#       ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -internal Used internally.
# -show_row_dend Whether to show the row dendrogram.
# -show_column_names Whether to show column names in the heatmap.
# -use_raster Internally used.
# -plot Whether to make the plot.
# -verbose Whether to print messages.
# -seed Random seed.
# -left_annotation Annotation put on the left of the heatmap. It should be a `ComplexHeatmap::HeatmapAnnotation-class` object. 
#              The number of items should be the same as the number of rows in the original matrix. Subsetting to the significant 
#              rows is performed automatically on the annotation object.
# -right_annotation Annotation put on the right of the heatmap. Same format as ``left_annotation``.
# -col Colors.
# -simplify Only used internally.
# -... Other arguments.
# 
# == details
# This function is very similar to `get_signatures,ConsensusPartition-method`.
#
# == example
# \donttest{
# data(golub_cola_ds)
# get_signatures(golub_cola_ds, k = 2)
# get_signatures(golub_cola_ds, k = 3)
# }
setMethod(f = "get_signatures",
	signature = "DownSamplingConsensusPartition",
	definition = function(object, k,
	p_cutoff = 0.05, 
	fdr_cutoff = cola_opt$fdr_cutoff, 
	group_diff = cola_opt$group_diff,
	scale_rows = object@scale_rows,
	row_km = NULL,
	diff_method = c("Ftest", "ttest", "samr", "pamr", "one_vs_others"),
	anno = get_anno(object), 
	anno_col = get_anno_col(object),
	internal = FALSE,
	show_row_dend = FALSE,
	show_column_names = FALSE, use_raster = TRUE,
	plot = TRUE, verbose = TRUE, seed = 888,
	left_annotation = NULL, right_annotation = NULL,
	col = if(scale_rows) c("green", "white", "red") else c("blue", "white", "red"),
	simplify = FALSE,
	...) {

	if(missing(k)) stop_wrap("k needs to be provided.")
	
	class_df = get_classes(object, k)
	class_ids = class_df$class

	data = object@.env$data[, object@full_column_index, drop = FALSE]

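	# keep only confidently predicted samples (p <= p_cutoff) and drop classes
	# left with a single sample, since they cannot be tested for differences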
	l = class_df$p <= p_cutoff
	data2 = data[, l, drop = FALSE]
	class = class_df$class[l]
	column_used_index = which(l)
	tb = table(class)
	l = as.character(class) %in% names(which(tb <= 1))
	data2 = data2[, !l, drop = FALSE]
	class = class[!l]
	column_used_index = column_used_index[!l]
	column_used_logical = rep(FALSE, ncol(data))
	column_used_logical[column_used_index] = TRUE
	has_ambiguous = sum(!column_used_logical)
	n_sample_used = length(class)

	if(verbose) qqcat("* @{n_sample_used}/@{nrow(class_df)} samples (in @{length(unique(class))} classes) remain after filtering by p-value (<= @{p_cutoff}).\n")

	tb = table(class)
	if(sum(tb > 1) <= 1) {
		if(plot) {
			grid.newpage()
			fontsize = convertUnit(unit(0.1, "npc"), "char", valueOnly = TRUE)*get.gpar("fontsize")$fontsize
			grid.text("not enough samples", gp = gpar(fontsize = fontsize))
		}
		if(verbose) cat("* Not enough samples.\n")
		return(invisible(data.frame(which_row = integer(0))))
	}
	if(length(unique(class)) <= 1) {
		if(plot) {
			grid.newpage()
			fontsize = convertUnit(unit(0.1, "npc"), "char", valueOnly = TRUE)*get.gpar("fontsize")$fontsize
			grid.text("not enough classes", gp = gpar(fontsize = fontsize))
		}
		if(verbose) cat("* Not enough classes.\n")
		return(invisible(data.frame(which_row = integer(0))))
	}

	do_row_clustering = TRUE
	if(inherits(diff_method, "function")) {
		if(verbose) qqcat("* calculate row difference between subgroups by user-defined function.\n")
		diff_method_fun = diff_method
		diff_method = digest(diff_method)
	} else {
		diff_method = match.arg(diff_method)
	}

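	# cache key for the differential result; for "samr"/"pamr" the fdr_cutoff
	# enters the computation itself, so it is part of the hash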
	if(diff_method %in% c("samr", "pamr")) {
		hash = digest(list(used_samples = which(l), 
			               class = class,
			               n_group = k, 
			               diff_method = diff_method,
			               column_index = object@column_index,
			               fdr_cutoff = fdr_cutoff,
			               group_diff = group_diff,
			               seed = seed),
					algo = "md5")
	} else {
		hash = digest(list(used_samples = which(l), 
			               class = class,
			               n_group = k, 
			               diff_method = diff_method,
			               column_index = object@column_index,
			               group_diff = group_diff,
			               seed = seed),
					algo = "md5")
	}
	nm = paste0("signature_fdr_", hash)
	if(verbose) qqcat("* cache hash: @{hash} (seed @{seed}).\n")

	find_signature = TRUE
	if(!is.null(object@.env[[nm]])) {
		if(diff_method == "samr") {
			if(object@.env[[nm]]$diff_method == "samr" && 
			   object@.env[[nm]]$n_sample_used == n_sample_used && 
			   abs(object@.env[[nm]]$fdr_cutoff - fdr_cutoff) < 1e-10) {
				fdr = object@.env[[nm]]$fdr
				find_signature = FALSE
			}
		} else if(diff_method == "pamr") {
			if(object@.env[[nm]]$diff_method == "pamr" && 
			   object@.env[[nm]]$n_sample_used == n_sample_used && 
			   abs(object@.env[[nm]]$fdr_cutoff - fdr_cutoff) < 1e-10) {
				fdr = object@.env[[nm]]$fdr
				find_signature = FALSE
			}
		} else {
			if(object@.env[[nm]]$diff_method == diff_method &&
			   object@.env[[nm]]$n_sample_used == n_sample_used) {
				fdr = object@.env[[nm]]$fdr
				find_signature = FALSE
			}
		}
	}

	if(verbose) qqcat("* calculating row difference between subgroups by @{diff_method}.\n")
	if(find_signature) {
		if(diff_method == "ttest") {
			fdr = ttest(data2, class)
		} else if(diff_method == "samr") {
			fdr = samr(data2, class, fdr.output = fdr_cutoff)
		} else if(diff_method == "Ftest") {
			fdr = Ftest(data2, class)
		} else if(diff_method == "pamr") {
			fdr = pamr(data2, class, fdr.output = fdr_cutoff)
		} else {
			fdr = diff_method_fun(data2, class)
		}
	} else {
		if(verbose) qqcat("  - row difference is extracted from cache.\n")
	}

	object@.env[[nm]]$diff_method = diff_method
	object@.env[[nm]]$fdr_cutoff = fdr_cutoff
	object@.env[[nm]]$fdr = fdr
	object@.env[[nm]]$n_sample_used = n_sample_used
	object@.env[[nm]]$group_diff = group_diff

	if(scale_rows && !is.null(object@.env[[nm]]$row_order_scaled)) {
		row_order = object@.env[[nm]]$row_order_scaled
		if(verbose) qqcat("  - row order for the scaled matrix is extracted from cache.\n")
		do_row_clustering = FALSE
	} else if(!scale_rows && !is.null(object@.env[[nm]]$row_order_unscaled)) {
		row_order = object@.env[[nm]]$row_order_unscaled
		if(verbose) qqcat("  - row order for the unscaled matrix is extracted from cache.\n")
		do_row_clustering = FALSE
	}

	# filter by fdr
	fdr[is.na(fdr)] = 1

	l_fdr = fdr < fdr_cutoff
	mat = data[l_fdr, , drop = FALSE]
	fdr2 = fdr[l_fdr]
	
	if(!is.null(left_annotation)) left_annotation = left_annotation[l_fdr, ]
	if(!is.null(right_annotation)) right_annotation = right_annotation[l_fdr, ]

	returned_df = data.frame(which_row = which(l_fdr), fdr = fdr2)

	# filter by group_diff
	mat1 = mat[, column_used_logical, drop = FALSE]
	if(nrow(mat) == 1) {
		group_mean = rbind(tapply(mat1, class, mean))
	} else {
		group_mean = do.call("cbind", tapply(seq_len(ncol(mat1)), class, function(ind) {
			rowMeans(mat1[, ind, drop = FALSE])
		}))
	}
	colnames(group_mean) = paste0("mean_", colnames(group_mean))
	returned_df = cbind(returned_df, group_mean)
	returned_df$group_diff = apply(group_mean, 1, function(x) max(x) - min(x))

	if(group_diff > 0) {
		l_diff = returned_df$group_diff >= group_diff
		mat = mat[l_diff, , drop = FALSE]
		mat1 = mat1[l_diff, , drop = FALSE]
		returned_df = returned_df[l_diff, , drop = FALSE]
	}

	if(scale_rows) {
		mat1_scaled = t(scale(t(mat[, column_used_logical, drop = FALSE])))
		if(nrow(mat) == 1) {
			group_mean_scaled = rbind(tapply(mat1_scaled, class, mean))
		} else {
			group_mean_scaled = do.call("cbind", tapply(seq_len(ncol(mat1_scaled)), class, function(ind) {
				rowMeans(mat1_scaled[, ind, drop = FALSE])
			}))
		}
		colnames(group_mean_scaled) = paste0("scaled_mean_", colnames(group_mean_scaled))
		returned_df = cbind(returned_df, group_mean_scaled)
	}

	returned_obj = returned_df
	rownames(returned_obj) = NULL

	attr(returned_obj, "sample_used") = column_used_logical

	## add k-means
	row_km_fit = NULL
	if(!internal) {
		if(nrow(mat1) > 10) {
			do_kmeans = TRUE
			if(scale_rows) {
				mat_for_km = t(scale(t(mat1)))
				row_km_fit = object@.env[[nm]]$row_km_fit_scaled
			} else {
				mat_for_km = mat1
				row_km_fit = object@.env[[nm]]$row_km_fit_unscaled
			}

			if(nrow(mat_for_km) > 5000) {
				set.seed(seed)
				mat_for_km2 = mat_for_km[sample(nrow(mat_for_km), 5000), , drop = FALSE]
			} else {
				mat_for_km2 = mat_for_km
			}

			if(!is.null(row_km_fit)) {
				if(is.null(row_km) || identical(as.integer(row_km), length(row_km_fit$size))) {
					returned_obj$km = apply(pdist(row_km_fit$centers, mat_for_km, as.integer(1)), 2, which.min)
					do_kmeans = FALSE
					if(verbose) qqcat("* use k-means partition that are already calculated in previous runs.\n")
				}
			}
			if(do_kmeans) {
				set.seed(seed)
				if(is.null(row_km)) {
					wss = (nrow(mat_for_km2)-1)*sum(apply(mat_for_km2,1,var))
					max_km = min(c(nrow(mat_for_km) - 1, 15))
					# if(verbose) qqcat("* apply k-means on rows with 2~@{max_km} clusters.\n")
					for (i in 2:max_km) {
						# if(verbose) qqcat("  - applying k-means with @{i} clusters.\n")
						wss[i] = sum(kmeans(mat_for_km2, centers = i, iter.max = 50)$withinss)
					}
					row_km = min(elbow_finder(1:max_km, wss)[1], knee_finder(1:max_km, wss)[1])
					if(length(unique(class)) == 1) row_km = 1
					if(length(unique(class)) == 2) row_km = min(row_km, 2)
				}
				if(row_km > 1) {
					row_km_fit = kmeans(mat_for_km2, centers = row_km)
					returned_obj$km = apply(pdist(row_km_fit$centers, mat_for_km, as.integer(1)), 2, which.min)
					if(scale_rows) {
						object@.env[[nm]]$row_km_fit_scaled = row_km_fit
					} else {
						object@.env[[nm]]$row_km_fit_unscaled = row_km_fit
					}
				}
				if(verbose) qqcat("* split rows into @{row_km} groups by k-means clustering.\n")
			}
		}
	}

	if(verbose) qqcat("* @{nrow(mat)} signatures (@{sprintf('%.1f',nrow(mat)/nrow(object)*100)}%) under fdr < @{fdr_cutoff}, group_diff > @{group_diff}.\n")

	if(nrow(mat) == 0) {
		if(plot) {
			grid.newpage()
			fontsize = convertUnit(unit(0.1, "npc"), "char", valueOnly = TRUE)*get.gpar("fontsize")$fontsize
			grid.text("no sigatures", gp = gpar(fontsize = fontsize))
		}
		return(invisible(NULL))
	}

	if(!plot) {
		return(invisible(returned_obj))
	}

	set.seed(seed)
	more_than_5k = FALSE
	if(!is.null(object@.env[[nm]]$row_index)) {
		if(verbose) qqcat("  - use the 2000 signatures what are already generated in previous runs.\n")
		row_index = object@.env[[nm]]$row_index
		mat1 = mat[row_index, column_used_logical, drop = FALSE]
		mat2 = mat[row_index, !column_used_logical, drop = FALSE]
		more_than_5k = TRUE
		if(!is.null(left_annotation)) left_annotation = left_annotation[row_index, ]
		if(!is.null(right_annotation)) right_annotation = right_annotation[row_index, ]
	} else if(nrow(mat) > 2000) {
		more_than_5k = TRUE
		row_index = sample(1:nrow(mat), 2000)
		object@.env[[nm]]$row_index = row_index
		# mat1 = mat[order(fdr2)[1:top_k_genes], column_used_logical, drop = FALSE]
		# mat2 = mat[order(fdr2)[1:top_k_genes], !column_used_logical, drop = FALSE]
		mat1 = mat[row_index, column_used_logical, drop = FALSE]
		mat2 = mat[row_index, !column_used_logical, drop = FALSE]
		# group2 = group2[order(fdr2)[1:top_k_genes]]
		if(verbose) cat(paste0("  - randomly sample 2000 signatures.\n"))
		if(!is.null(left_annotation)) left_annotation = left_annotation[row_index, ]
		if(!is.null(right_annotation)) right_annotation = right_annotation[row_index, ]
	} else {
		row_index = seq_len(nrow(mat))
		mat1 = mat[, column_used_logical, drop = FALSE]
		mat2 = mat[, !column_used_logical, drop = FALSE]
		
	}
	base_mean = rowMeans(mat1)
	if(nrow(mat) == 1) {
		group_mean = matrix(tapply(mat1, class, mean), nrow = 1)
	} else {
		group_mean = do.call("cbind", tapply(seq_len(ncol(mat1)), class, function(ind) {
			rowMeans(mat1[, ind, drop = FALSE])
		}))
	}
	rel_diff = (rowMaxs(group_mean) - rowMins(group_mean))/base_mean/2

	if(is.null(anno)) {
		bottom_anno1 = NULL
	} else {
		if(is.atomic(anno)) {
			anno_nm = deparse(substitute(anno))
			anno = data.frame(anno)
			colnames(anno) = anno_nm
			if(!is.null(anno_col)) {
				if(is.atomic(anno_col)) {
					anno_col = list(anno_col)
					names(anno_col) = anno_nm
				}
			}
		} else if(ncol(anno) == 1) {
			if(!is.null(anno_col)) {
				if(is.atomic(anno_col)) {
					anno_col = list(anno_col)
					names(anno_col) = colnames(anno)
				}
			}
		}

		if(is.null(anno_col)) {
			bottom_anno1 = HeatmapAnnotation(df = anno[column_used_logical, , drop = FALSE],
				show_annotation_name = !has_ambiguous & !internal, annotation_name_side = "right")
		} else {
			bottom_anno1 = HeatmapAnnotation(df = anno[column_used_logical, , drop = FALSE], col = anno_col,
				show_annotation_name = !has_ambiguous & !internal, annotation_name_side = "right")
		}
	}

	if(scale_rows) {
		scaled_mean = base_mean
		scaled_sd = rowSds(mat1)
		scaled_mat1 = t(scale(t(mat1)))
		scaled_mat2 = mat2
		if(has_ambiguous) {
			for(i in seq_len(nrow(mat2))) {
				scaled_mat2[i, ] = (scaled_mat2[i, ] - scaled_mean[i])/scaled_sd[i]
			}
		}

		use_mat1 = scaled_mat1
		use_mat2 = scaled_mat2
		use_mat1[is.infinite(use_mat1)] = 0
		use_mat1[is.na(use_mat1)] = 0
		use_mat2[is.infinite(use_mat2)] = 0
		use_mat2[is.na(use_mat2)] = 0
		mat_range = quantile(abs(scaled_mat1), 0.95, na.rm = TRUE)
		col_fun = colorRamp2(c(-mat_range, 0, mat_range), col)
		heatmap_name = "z-score"
	} else {
		use_mat1 = mat1
		use_mat2 = mat2
		mat_range = quantile(mat1, c(0.05, 0.95))
		col_fun = colorRamp2(c(mat_range[1], mean(mat_range), mat_range[2]), col)
		heatmap_name = "Value"
	}

	if(has_ambiguous) {
		class2 = class_df$class[!column_used_logical]

		if(is.null(anno)) {
			bottom_anno2 = NULL
		} else {
			anno_col = lapply(bottom_anno1@anno_list, function(anno) {
				if(is.null(anno@color_mapping)) {
					return(NULL)
				} else {
					if(anno@color_mapping@type == "discrete") {
						anno@color_mapping@colors
					} else {
						anno@color_mapping@col_fun
					}
				}
			})
			names(anno_col) = names(bottom_anno1@anno_list)
			anno_col = anno_col[!sapply(anno_col, is.null)]

			if(!is.null(object@anno_col)) {
				nmd = setdiff(names(object@anno_col), names(anno_col))
				if(length(nmd)) {
					anno_col[nmd] = object@anno_col[nmd]
				}
			}

			bottom_anno2 = HeatmapAnnotation(df = anno[!column_used_logical, , drop = FALSE], col = anno_col,
				show_annotation_name = !internal, annotation_name_side = "right")	
		}
	}

	if(verbose) qqcat("* making heatmaps for signatures.\n")

	row_split = NULL
	if(!internal) {
		if(scale_rows) {
			row_km_fit = object@.env[[nm]]$row_km_fit_scaled
		} else {
			row_km_fit = object@.env[[nm]]$row_km_fit_unscaled
		}
		if(!is.null(row_km_fit)) {
			row_split = factor(returned_obj$km[row_index], levels = sort(unique(returned_obj$km[row_index])))
		}
	}

	# group2 = factor(group2, levels = sort(unique(group2)))
	# ht_list = Heatmap(group2, name = "Group", show_row_names = FALSE, width = unit(5, "mm"), col = cola_opt$color_set_2)
	ht_list = NULL

	# the internal/simplify/default branches all used identical annotation
	# settings, so a single construction is sufficient
	ha1 = HeatmapAnnotation(
			Class = class_df$class[column_used_logical],
			col = list(Class = cola_opt$color_set_2),
			show_annotation_name = !has_ambiguous & !internal,
			annotation_name_side = "right",
			show_legend = TRUE)
	ht_list = ht_list + Heatmap(use_mat1, name = heatmap_name, col = col_fun,
		top_annotation = ha1, row_split = row_split,
		cluster_columns = TRUE, cluster_column_slices = FALSE, cluster_row_slices = FALSE,
		column_split = factor(class_df$class[column_used_logical], levels = sort(unique(class_df$class[column_used_logical]))), 
		show_column_dend = FALSE,
		show_row_names = FALSE, show_row_dend = show_row_dend, column_title = {if(internal) NULL else qq("@{ncol(use_mat1)} confident samples")},
		use_raster = use_raster, raster_by_magick = requireNamespace("magick", quietly = TRUE),
		bottom_annotation = bottom_anno1, show_column_names = show_column_names, 
		left_annotation = left_annotation, right_annotation = {if(has_ambiguous) NULL else right_annotation})
 	
	all_value_positive = !any(data < 0)
 	if(scale_rows && all_value_positive && !simplify) {
		ht_list = ht_list + Heatmap(base_mean, show_row_names = FALSE, name = "base_mean", width = unit(5, "mm"), show_column_names = !internal) +
			Heatmap(rel_diff, col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), 
				show_row_names = FALSE, show_column_names = !internal, name = "rel_diff", width = unit(5, "mm"))
	}

	if(has_ambiguous) {
		# the internal/simplify/default branches were effectively identical here as well
		ha2 = HeatmapAnnotation(
			Class = class_df$class[!column_used_logical],
			col = list(Class = cola_opt$color_set_2),
			show_annotation_name = !internal,
			annotation_name_side = "right",
			show_legend = FALSE)
		ht_list = ht_list + Heatmap(use_mat2, name = paste0(heatmap_name, 2), col = col_fun,
			top_annotation = ha2,
			cluster_columns = TRUE, show_column_dend = FALSE,
			show_row_names = FALSE, show_row_dend = FALSE, show_heatmap_legend = FALSE,
			use_raster = use_raster, raster_by_magick = requireNamespace("magick", quietly = TRUE),
			bottom_annotation = bottom_anno2, show_column_names = show_column_names,
			right_annotation = right_annotation)
	}

	if(has_ambiguous) {
		lgd = Legend(title = "Status (barplots)", labels = c("confident", "ambiguous"), legend_gp = gpar(fill = c("black", "grey")))
		heatmap_legend_list = list(lgd)
	} else {
		heatmap_legend_list = NULL
	}

	if(do_row_clustering) {
		ht_list = draw(ht_list, main_heatmap = heatmap_name, column_title = ifelse(internal, "", qq("@{k} subgroups, @{nrow(mat)} signatures (@{sprintf('%.1f',nrow(mat)/nrow(object)*100)}%) with fdr < @{fdr_cutoff}@{ifelse(group_diff > 0, paste0(', group_diff > ', group_diff), '')}")),
			show_heatmap_legend = !internal, show_annotation_legend = !internal,
			heatmap_legend_list = heatmap_legend_list,
			row_title = {if(length(unique(row_split)) <= 1) NULL else qq("k-means with @{length(unique(row_split))} groups")}
		)
		
		row_order = row_order(ht_list)
		if(!is.list(row_order)) row_order = list(row_order)
		if(scale_rows) {
			object@.env[[nm]]$row_order_scaled = do.call("c", row_order)
		} else {
			object@.env[[nm]]$row_order_unscaled = do.call("c", row_order)
		}
		
	} else {
		if(verbose) cat("  - use row order from cache.\n")
		draw(ht_list, main_heatmap = heatmap_name, column_title = ifelse(internal, "", qq("@{k} subgroups, @{nrow(mat)} signatures (@{sprintf('%.1f',nrow(mat)/nrow(object)*100)}%) with fdr < @{fdr_cutoff}@{ifelse(group_diff > 0, paste0(', group_diff > ', group_diff), '')}")),
			show_heatmap_legend = !internal, show_annotation_legend = !internal,
			cluster_rows = FALSE, row_order = row_order, heatmap_legend_list = heatmap_legend_list,
			row_title = {if(length(unique(row_split)) <= 1) NULL else qq("k-means with @{length(unique(row_split))} groups")}
		)
	}

	return(invisible(returned_obj))
})



# == title
# Number of columns in the matrix
#
# == param
# -x A `DownSamplingConsensusPartition-class` object.
#
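# == example
# data(golub_cola_ds)
# # returns the number of columns of the full matrix, not only the subset
# ncol(golub_cola_ds)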
setMethod(f = "ncol",
	signature = "DownSamplingConsensusPartition",
	definition = function(x) {
	length(x@full_column_index)
})


# == title
# Column names of the matrix
#
# == param
# -x A `DownSamplingConsensusPartition-class` object.
#
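# == example
# data(golub_cola_ds)
# # column names are taken from the full matrix
# colnames(golub_cola_ds)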
setMethod(f = "colnames",
	signature = "DownSamplingConsensusPartition",
	definition = function(x) {
	colnames(x@.env$data)[x@full_column_index]
})


# == title
# Dimension of the matrix
#
# == param
# -x A `DownSamplingConsensusPartition-class` object.
#
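# == example
# data(golub_cola_ds)
# dim(golub_cola_ds)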
dim.DownSamplingConsensusPartition = function(x) {
	c(nrow(x), ncol(x))
}
