R/hierarchical.R

Defines functions dim.HierarchicalPartition get_children mean_dist_decrease farthest_subgroup tightest_subgroup calc_dend zero_height_dend random_dend get_hierarchy subgroup_dend has_hierarchy hierarchical_partition

Documented in dim.HierarchicalPartition hierarchical_partition

# == title
# The HierarchicalPartition class
#
# == alias
# HierarchicalPartition
#
# == methods
# The `HierarchicalPartition-class` has following methods:
#
# -`hierarchical_partition`: constructor method.
# -`collect_classes,HierarchicalPartition-method`: plot the hierarchy of subgroups predicted.
# -`get_classes,HierarchicalPartition-method`: get the class IDs of subgroups.
# -`suggest_best_k,HierarchicalPartition-method`: guess the best number of partitions for each node.
# -`get_matrix,HierarchicalPartition-method`: get the original matrix.
# -`get_signatures,HierarchicalPartition-method`: get the signatures for each subgroup.
# -`compare_signatures,HierarchicalPartition-method`: compare signatures from different nodes.
# -`dimension_reduction,HierarchicalPartition-method`: make dimension reduction plots.
# -`test_to_known_factors,HierarchicalPartition-method`: test correlation between predicted subgrouping and known annotations, if available.
# -`cola_report,HierarchicalPartition-method`: generate a HTML report for the whole analysis.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
HierarchicalPartition = setClass("HierarchicalPartition",
    slots = list(
        list = "list",
        best_k = "numeric",
        hierarchy = "matrix",
        subgroup = "character",
        subgroup_col = "character",
        call = "ANY",
        .env = "environment"
    )
)

# == title
# Hierarchical partition
#
# == param
# -data a numeric matrix where subgroups are found by columns.
# -top_value_method a single top-value method. Available methods are in `all_top_value_methods`.
# -partition_method a single partition method. Available methods are in `all_partition_methods`.
# -PAC_cutoff the cutoff of PAC scores to determine whether to continue looking for subgroups.
# -min_samples the cutoff of number of samples to determine whether to continue looking for subgroups.
# -subset Number of columns to randomly sample.
# -max_k maximal number of partitions to try. The function will try ``2:max_k`` partitions. Note this is the number of
#        partitions that will be tried out on each node of the hierarchical partition. Since more subgroups will be found
#        in the whole partition hierarchy, on each node, ``max_k`` should not be set to a large value.
# -verbose whether print message.
# -mc.cores multiple cores to use. 
# -... pass to `consensus_partition`
#
# == details
# The function looks for subgroups in a hierarchical way.
#
# There is a special way to encode the node in the hierarchy. The length of the node name
# is the depth of the node in the hierarchy and the substring excluding the last digit is the name
# node of the parent node. E.g. for the node ``0011``, the depth is 4 and the parent node is ``001``.
#
# == return
# A `HierarchicalPartition-class` object. Simply type object in the interactive R session
# to see which functions can be applied on it.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
hierarchical_partition = function(data, 
	top_value_method = "ATC", 
	partition_method = "skmeans",
	PAC_cutoff = 0.2, min_samples = 6, subset = Inf,
	max_k = 4, verbose = TRUE, mc.cores = 1, ...) {

	cl = match.call()

	if(min_samples < 3) {
		if(verbose) qqcat("! 'min_samples' was reset to 3.\n")
		min_samples = 3
	}

	if(verbose) {
		qqcat("* on a @{nrow(data)}x@{ncol(data)} matrix.\n")
		qqcat("* hierarchical partition by @{top_value_method}:@{partition_method}.\n")
	}
	
	.hierarchical_partition = function(.env, column_index, node_id = '0', 
		min_samples = 6, max_k = 4, verbose = TRUE, mc.cores = 1, ...) {

		prefix = ""
		if(node_id != "0") {
			prefix = paste(rep("  ", nchar(node_id) - 1), collapse = "")
		}

		if(verbose) qqcat("@{prefix}=========================================================\n")
		if(verbose) qqcat("@{prefix}* submatrix with @{length(column_index)} columns, node_id: @{node_id}.\n")
		
		## all_top_value_list is only used in run_all_consensus_partition_methods(), we remove it here
	   	.env$all_top_value_list = NULL
	   	.env$column_index = column_index  #note .env$column_index is only for passing to `consensus_partition()` function
		
		if(mc.cores > 1 && verbose) {
			qqcat("@{prefix}* running consensus partitioning with @{mc.cores} cores.\n")
		}

		if(length(column_index) <= subset) {
			part = consensus_partition(verbose = TRUE, .env = .env, max_k = max_k, prefix = prefix,
				top_value_method = top_value_method, partition_method = partition_method, mc.cores = mc.cores, ...)
		} else {
			part = consensus_partition_by_down_sampling(subset = subset, verbose = TRUE, .env = .env, max_k = max_k, prefix = prefix,
				top_value_method = top_value_method, partition_method = partition_method, mc.cores = mc.cores, ...)
		}
		attr(part, "node_id") = node_id

		lt = list(obj = part)

	    best_k = suggest_best_k(part)
	    if(is.na(best_k)) {
	    	if(verbose) qqcat("@{prefix}* Rand index is too high, no meaningful subgroups, stop.\n")
	    	return(lt)
	    }
	    cl = get_classes(part, k = best_k)

	    mat = .env$data[, column_index, drop = FALSE]
	    if(part@scale_rows) {
	    	mat = t(scale(t(mat)))
	    }
	    mat = mat[order(.env$all_top_value_list[[1]], decreasing = TRUE), , drop = FALSE]

	    ## split k classes into two groups
	    if(best_k == 2) {
	    	kg1 = 1
	    	kg2 = 2
	    	set1 = which(cl$class == kg1)
	    	set2 = which(cl$class == kg2)
	    } else {
	    	tb = table(cl$class)
	    	kg1 = as.numeric(names(tb[which.max(tb)[1]]))
	    	kg2 = sort(setdiff(cl$class, kg1))

	    	# group_mean = do.call(rbind, tapply(1:ncol(mat), cl$class, function(ind) rowMeans(mat[, ind, drop = FALSE])))
	    	# kg = kmeans(group_mean, centers = 2)$cluster
	    	# kg1 = as.numeric(rownames(group_mean)[kg == 1])
	    	# kg2 = as.numeric(rownames(group_mean)[kg == 2])

	    	set1 = which(cl$class %in% kg1)
	    	set2 = which(cl$class %in% kg2)
	    }

	    .env$all_top_value_list = NULL

	    if(verbose) qqcat("@{prefix}* best k = @{best_k}, split into two groups with class IDs (@{paste(kg1, collapse=',')}) and (@{paste(kg2, collapse=',')})\n")
		PAC_score = 1 - get_stats(part, k = best_k)[, "1-PAC"]
	    if(PAC_score > PAC_cutoff) {
	    	if(verbose) qqcat("@{prefix}* PAC score is too big (@{sprintf('%.2f', PAC_score)}), stop.\n")
	    	return(lt)
	    }

    	if(length(set1) < min_samples) {
    		if(verbose) qqcat("@{prefix}* one of the subgroup has too few columns (@{length(set1)}), won't split.\n")
    		return(lt)
    	}

    	if(length(set2) < min_samples) {
    		if(verbose) qqcat("@{prefix}* one of the subgroups has too few columns (@{length(set2)}), won't split.\n")
    		return(lt)
    	}

    	if(verbose) qqcat("@{prefix}* partition into two subgroups with @{length(set1)} and @{length(set2)} columns.\n")
    	# insert the two subgroups into the hierarchy
    	sub_node_1 = paste0(node_id, 1)
    	sub_node_2 = paste0(node_id, 0)

    	lt2 = lapply(1:2, function(ind) {
	    	if(length(set1) >= min_samples && ind == 1) {
	    		return(.hierarchical_partition(.env, column_index = column_index[set1], node_id = sub_node_1,
	    			min_samples = min_samples, max_k = min(max_k, length(set1)-1), mc.cores = mc.cores, verbose = verbose, ...))
	    	}

	    	if(length(set2) >= min_samples && ind == 2) {
	    		return(.hierarchical_partition(.env, column_index = column_index[set2], node_id = sub_node_2,
	    			min_samples = min_samples, max_k = min(max_k, length(set2)-1), mc.cores = mc.cores, verbose = verbose, ...))
	    	}

	    	return(NULL)
	    })

	    if(!is.null(lt2[[1]])) lt$child1 = lt2[[1]]
	    if(!is.null(lt2[[2]])) lt$child2 = lt2[[2]]

	    return(lt)
	}

	.env = new.env(parent = emptyenv())
	.env$data = data
	lt = .hierarchical_partition(.env = .env, column_index = seq_len(ncol(data)), min_samples = min_samples, 
		node_id = "0", max_k = min(max_k, ncol(data)-1), verbose = verbose, mc.cores = mc.cores, ...)

	# reformat lt
	.e = new.env(parent = emptyenv())
	.e$hierarchy = matrix(nrow = 0, ncol = 2)
	.e$lt = list()
	reformat_lt = function(lt, .e) {
		nm = names(lt)
		parent_id = attr(lt$obj, "node_id")
		.e$lt[[parent_id]] = lt$obj
		if("child1" %in% nm) {
			child_id = attr(lt$child1$obj, "node_id")
			.e$hierarchy = rbind(.e$hierarchy, c(parent_id, child_id))
			reformat_lt(lt$child1, .e)
		}
		if("child2" %in% nm) {
			child_id = attr(lt$child2$obj, "node_id")
			.e$hierarchy = rbind(.e$hierarchy, c(parent_id, child_id))
			reformat_lt(lt$child2, .e)
		}
	}
	reformat_lt(lt, .e)

	hp = new("HierarchicalPartition")
	hp@hierarchy = .e$hierarchy
	hp@list = .e$lt
	hp@best_k = sapply(.e$lt, suggest_best_k)
	leaves = all_leaves(hp)
	subgroup = rep("0", ncol(data))
	for(le in leaves) {
		subgroup[ .e$lt[[le]]@column_index ] = le
	}
	hp@subgroup = subgroup
	names(hp@subgroup) = colnames(data)

	le = unique(as.vector(hp@hierarchy))
	if(length(le) <= 16) {
		hp@subgroup_col = structure(brewer_pal_set2_col[seq_along(le)], names = le)
	} else {
		hp@subgroup_col = structure(rand_color(length(le), luminosity = "bright"), names = le)
	}
	hp@call = cl
	hp@.env = hp@list[[1]]@.env

	return(hp)
}

has_hierarchy = function(object) {
	nrow(object@hierarchy) > 0
}

subgroup_dend = function(object, hierarchy = object@hierarchy) {

	check_pkg("data.tree", bioc = FALSE)

	lt = list()
	lt[["0"]] = data.tree::Node$new("0")
	cn = colnames(object@list[["0"]]@.env$data)
	max_depth = max(nchar(hierarchy))
	lt[["0"]]$node_height = max_depth - 1
	for(i in seq_len(nrow(hierarchy))) {
		lt[[ hierarchy[i, 2] ]] = lt[[ hierarchy[i, 1] ]]$AddChildNode({
			node = data.tree::Node$new(hierarchy[i, 2])
			node$node_height = max_depth - nchar(hierarchy[i, 2])
			node
		})
		l = hierarchy[, 1] == hierarchy[i, 2]
	}
	dend = as.dendrogram(lt[["0"]], heightAttribute = "node_height", edgetext = TRUE)

	dend = dendextend::`order.dendrogram<-`(dend, value = 1:nobs(dend))

	dend = edit_node(dend, function(d, index) {
		if(is.leaf(d)) {
			attr(d, "node_id") = attr(d, "label")
		} else {
			attr(d, "node_id") = attr(d, "edgetext")
		}
		d
	})

	# make sure all nodes have a node_id attribute
	# depth first
	.get_node_id = function(d) {
		node_id = attr(d, "node_id")
		if(is.null(node_id)) {
			child_node_id = .get_node_id(d[[1]])
			if(is.null(child_node_id)) {
				child_node_id = .get_node_id(d[[2]])
			}
			return(gsub("\\d$", "", child_node_id))
		}
		return(node_id)
	}

	dend = edit_node(dend, function(d, index) {
		node_id = attr(d, "node_id")
		if(is.null(node_id)) {
			node_id = .get_node_id(d)
			attr(d, "node_id") = node_id
		}
		d
	})

	# od = structure(1:length(order.dendrogram(dend)), names = labels(dend))
	# dend_env = new.env(parent = emptyenv())
	# dend_env$dend = dend

	# update_midpoint = function(index = NULL) {
	# 	if(is.null(index)) {
	# 		if(is.leaf(dend_env$dend)) {
	# 			pos = od[attr(dend_env$dend, "label")]
	# 			midpoint = 0
	# 		} else {
	# 			x = NULL
	# 			for(i in seq_len(length(dend_env$dend))) {
	# 				if(is.null(attr(dend_env$dend[[i]], "x"))) {
	# 					update_midpoint(i)
	# 				}
	# 				x[i] = attr(dend_env$dend[[i]], "x")
	# 			}
	# 			pos = (max(x) + min(x))/2
	# 			midpoint = (max(x) - min(x))/2
	# 		}
	# 	} else {
	# 		if(is.leaf(dend_env$dend[[index]])) {
	# 			pos = od[attr(dend_env$dend[[index]], "label")]
	# 			midpoint = 0
	# 		} else {
	# 			x = NULL
	# 			for(i in seq_len(length(dend_env$dend[[index]]))) {
	# 				if(is.null(attr(dend_env$dend[[c(index, i)]], "x"))) {
	# 					update_midpoint(c(index, i))
	# 				}
	# 				x[i] = attr(dend_env$dend[[c(index, i)]], "x")
	# 			}
	# 			pos = (max(x) + min(x))/2
	# 			midpoint = (max(x) - min(x))/2
	# 		}
	# 	}
	# 	if(is.null(index)) {
	# 		attr(dend_env$dend, "x") = pos
	# 	} else {
	# 		attr(dend_env$dend[[index]], "x") = pos
	# 		attr(dend_env$dend[[index]], "midpoint") = midpoint
	# 	}
	# }
	# update_midpoint()

	oe = try(dend_tmp <- as.dendrogram(as.hclust(dend)), silent = TRUE)

	if(!inherits(oe, "try-error")) {
		dend = edit_node(dend, function(d, ind) {
			if(length(ind) == 0) {
				attr(d, "midpoint") = attr(dend_tmp, "midpoint")
			} else {
				attr(d, "midpoint") = attr(dend_tmp[[ind]], "midpoint")
			}
			d
		})
	}

	dendrapply(dend, function(d) {
		if(is.leaf(d)) {
			attr(d, "height") = 0
		}
		d
	})
}

get_hierarchy = function(object, depth = max_depth(object)) {

	hierarchy = object@hierarchy
	if(!is.null(depth)) {
		hierarchy = hierarchy[nchar(hierarchy[, 2]) <= depth , , drop = FALSE]
	}

	dend = subgroup_dend(object, hierarchy)
	dend
}

random_dend = function(n) {
	x = rnorm(n)
	dend = as.dendrogram(hclust(dist(1:n)))
	# set height to zero

	dendrapply(dend, function(x) {attr(x, "height") = 0; x})
}

zero_height_dend = function(n) {
	check_pkg("data.tree", bioc = FALSE)

	lt = data.tree::Node$new("foo")
	lt$node_height = 0
	for(i in 1:n) {
		lt$AddChildNode({
			node = data.tree::Node$new(paste0("foo", i))
			node$node_height = 0
			node
		})
	}
	dend = as.dendrogram(lt, heightAttribute = "node_height")
	
}

calc_dend = function(object, depth = max_depth(object)) {

	pd = get_hierarchy(object, depth = depth)
	classes = get_classes(object, depth = depth)[, 1]
	if(is.null(names(classes))) names(classes) = seq_along(classes)
	cd_list = lapply(tapply(names(classes), classes, function(x) x), function(x) {
		d = random_dend(length(x))
		d = dendextend::`labels<-`(d, value = x)
		d
	})
	cd_list = cd_list[labels(pd)]

	dend = merge_dendrogram(pd, cd_list)
	dend = adjust_dend_by_x(dend)
	dend = dendextend::`order.dendrogram<-`(dend, value = structure(1:length(classes), names = names(classes))[labels(dend)])
	dend
}

tightest_subgroup = function(mat, subgroup, top_n) {
	x = NULL
	for(n in top_n) {
		m2 = mat[1:n, , drop = FALSE]
		mean_dist = tapply(seq_len(ncol(m2)), subgroup, function(ind) {
			n = length(ind)
			if(n == 1) {
				return(Inf)
			}
			sum(dist(t(m2[, ind, drop = FALSE]))^2)/(n*(n-1)/2)
		})
		x = c(x, as.numeric(names(mean_dist[which.min(mean_dist)])))
	}
	tb = table(x)
	as.numeric(names(which.min(tb)))
}


farthest_subgroup = function(mat, subgroup, top_n) {
	x = NULL
	for(n in top_n) {
		m2 = mat[1:n, , drop = FALSE]
		group_mean = do.call("cbind", tapply(seq_len(ncol(m2)), subgroup, function(ind) {
			rowMeans(m2[, ind, drop = FALSE])
		}))
		d = as.matrix(dist(t(group_mean)))
		x = c(x, as.numeric(names(which.max(rowSums(d)))))
	}
	tb = table(x)
	as.numeric(names(which.max(tb)))
}

# mat = matrix(rnorm(100), 10)
# mat2 = cbind(matrix(rnorm(50, mean = -1), nrow = 10), matrix(rnorm(50, mean = 1), nrow = 10))
# subset = sample(10, 10)
# mean_dist_decrease(mat, subset[1:5], subset[6:10])
mean_dist_decrease = function(mat, subset1, subset2) {
	n = ncol(mat)
	global_mean_dist = sum(dist(t(mat))^2) / (n*(n-1)/2)

	n1 = length(subset1)
	n2 = length(subset2)
	mean_dist = (sum(dist(t(mat[, subset1, drop = FALSE]))^2) +  sum(dist(t(mat[, subset2, drop = FALSE]))^2)) / ( n1*(n1-1)/2 + n2*(n2-1)/2)

	(global_mean_dist - mean_dist)/global_mean_dist
}

# == title
# Get class IDs from the HierarchicalPartition object
#
# == param
# -object A `HierarchicalPartition-class` object.
# -depth Depth of the hierarchy.
#
# == return
# A data frame of classes IDs. The class IDs are the node IDs where the subgroup sits in the hierarchy.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "get_classes",
	signature = "HierarchicalPartition",
	definition = function(object, depth = max_depth(object)) {

	if(length(depth) > 1) {
		df = do.call(cbind, lapply(depth, function(d) get_classes(object, d)))
		colnames(df) = paste0("depth=", depth)
		return(df)
	}
	subgroup = object@subgroup
	if(!is.null(depth)) {
		l = nchar(subgroup) > depth
		subgroup[l] = substr(subgroup[l], 1, depth)
	}
	data.frame(class = subgroup, stringsAsFactors = FALSE)
})

# == title
# Print the HierarchicalPartition object
#
# == param
# -object a `HierarchicalPartition-class` object
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "show",
	signature = "HierarchicalPartition",
	definition = function(object) {

	qqcat("A 'HierarchicalPartition' object with '@{object@list[[1]]@top_value_method}:@{object@list[[1]]@partition_method}' method.\n")
	qqcat("  On a matrix with @{nrow(object@.env$data)} rows and @{ncol(object@.env$data)} columns.\n")
	qqcat("  Performed in total @{object@list[[1]]@n_partition*length(object@list)} partitions.\n")
	if(has_hierarchy(object)) {
		qqcat("  There are @{length(all_leaves(object))} groups.\n")
	}
	cat("\n")

	if(has_hierarchy(object)) {
		cat("Hierarchy of the partition:\n")
		hierarchy = object@hierarchy
		nodes = hierarchy[, 2]
		nc = nchar(nodes)
		names(nc) = nodes
		n = length(nc)

		parent = structure(hierarchy[, 1], names = hierarchy[, 2])

		lines = character(n)
		for(i in seq_len(n)) {
			lines[i] = paste0("  ", strrep("    ", nc[i] - 2), ifelse(grepl("0$", nodes[i]), "`-", "|-") ,"- ", nodes[i], qq(", @{length(object@list[[nodes[i]]]@column_index)} cols"))
			p = nodes[i]
			while(p != "0") {
				p = parent[p]
				if(!grepl("0$", p)) {
					substr(lines[i], (nc[p] - 2)*4+3, (nc[p] - 2)*4+3) = "|"
				}
			}
		}
		# substr(lines[1], 1, 1) = "+"
		lines = c(qq("  0, @{ncol(object@list[['0']])} cols"), lines)
		cat(lines, sep = "\n")
	} else {
		cat("No hierarchy found.\n")
	}
	
	# ne = nodes[which.max(nc)[1]]
	# qqcat("e.g. a node '@{ne}' which a parent node '@{gsub(\'.$\', \'\', ne)}'\n")

	qqcat("\n")
	qqcat("Following methods can be applied to this 'HierarchicalPartition' object:\n")
	txt = showMethods(classes = "HierarchicalPartition", where = topenv(), printTo = FALSE)
	txt = grep("Function", txt, value = TRUE)
	fname = gsub("Function: (.*?) \\(package.*$", "\\1", txt)
	print(fname)
	cat("\n")
	cat("You can get result for a single node by e.g. object[\"01\"]\n")
})

# == title
# Get signatures rows
#
# == param
# -object a `HierarchicalPartition-class` object.
# -depth depth of the hierarchy.
# -scale_rows whether apply row scaling when making the heatmap.
# -anno a data frame of annotations for the original matrix columns. 
#       By default it uses the annotations specified in `hierarchical_partition`.
# -anno_col a list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
#       ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -show_column_names whether show column names in the heatmap.
# -verbose whether to print messages.
# -plot whether to make the plot.
# -... other arguments pass to `get_signatures,ConsensusPartition-method`.
# 
# == details
# The function calls `get_signatures,ConsensusPartition-method` to find signatures at
# each node of the partition hierarchy.
#
# == value
# A list of row indices where rows are significantly different between subgroups in at least one node.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "get_signatures",
	signature = "HierarchicalPartition",
	definition = function(object, depth = max_depth(object),
	scale_rows = object[1]@scale_rows, 
	anno = get_anno(object), 
	anno_col = get_anno_col(object),
	show_column_names = FALSE, 
	verbose = TRUE, plot = TRUE,
	...) {

	if(depth <= 1) {
		stop_wrap("depth should be at least larger than 1.")
	}

	alf = all_leaves(object, depth = depth)
	ap = setdiff(all_nodes(object, depth = depth), all_leaves(object, depth = depth))

	sig_lt = list()
	for(p in ap) {
		best_k = suggest_best_k(object[[p]])
		if(verbose) qqcat("* get signatures at node @{p} with @{best_k} subgroups.\n")
		sig_tb = get_signatures(object[[p]], k = best_k, verbose = FALSE, plot = FALSE, ...)
		
		sig_lt[[p]] = sig_tb
		if(verbose) qqcat("  - find @{nrow(sig_tb)} signatures at node @{p}\n")
	}

	all_index = sort(unique(unlist(lapply(sig_lt, function(x) x[, 1]))))

	sig = data.frame(which_row = all_index)
	
	if(!plot) {
		return(invisible(sig))
	}

	all_sig = all_index
	if(verbose) qqcat("* in total @{length(all_sig)} signatures in all classes found\n")

	m = object@.env$data[all_sig, , drop = FALSE]

	class = get_classes(object, depth = depth)[, 1]

	if(nrow(m) > 2000) {
		if(verbose) qqcat("* randomly sample @{2000} rows from @{nrow(m)} total rows.\n")
		ind = sample(nrow(m), 2000)
	} else {
		ind = seq_len(nrow(m))
	}
	mat1 = m[ind, , drop = FALSE]

	base_mean = rowMeans(mat1)
	if(nrow(mat1) == 1) {
		group_mean = matrix(tapply(mat1, class, mean), nrow = 1)
	} else {
		group_mean = do.call("cbind", tapply(seq_len(ncol(mat1)), class, function(ind) {
			rowMeans(mat1[, ind, drop = FALSE])
		}))
	}
	rel_diff = (rowMaxs(group_mean) - rowMins(group_mean))/base_mean/2

	if(is.null(anno)) {
		bottom_anno1 = NULL
	} else {
		if(is.atomic(anno)) {
			anno_nm = deparse(substitute(anno))
			anno = data.frame(anno)
			colnames(anno) = anno_nm
			if(!is.null(anno_col)) {
				anno_col = list(anno_col)
				names(anno_col) = anno_nm
			}
		} else if(ncol(anno) == 1) {
			if(!is.null(anno_col)) {
				if(is.atomic(anno_col)) {
					anno_col = list(anno_col)
					names(anno_col) = colnames(anno)
				}
			}
		}

		if(is.null(anno_col)) {
			bottom_anno1 = HeatmapAnnotation(df = anno,
				show_annotation_name = TRUE, annotation_name_side = "right")
		} else {
			bottom_anno1 = HeatmapAnnotation(df = anno, col = anno_col,
				show_annotation_name = TRUE, annotation_name_side = "right")
		}
	}

	if(scale_rows) {
		scaled_mean = base_mean
		scaled_sd = rowSds(mat1)
		scaled_mat1 = t(scale(t(mat1)))

		use_mat1 = scaled_mat1
		mat_range = quantile(abs(scaled_mat1), 0.95, na.rm = TRUE)
		col_fun = colorRamp2(c(-mat_range, 0, mat_range), c("green", "white", "red"))
		heatmap_name = "z-score"
	} else {
		use_mat1 = mat1
		mat_range = quantile(mat1, c(0.05, 0.95))
		col_fun = colorRamp2(c(mat_range[1], mean(mat_range), mat_range[2]), c("blue", "white", "red"))
		heatmap_name = "expr"
	}

	row_split = NULL
	if(nrow(use_mat1) > 10) {
		wss <- (nrow(use_mat1)-1)*sum(apply(use_mat1,2,var))
		for (i in 2:15) wss[i] <- sum(kmeans(use_mat1, centers = i, iter.max = 50)$withinss)
		row_km = min(elbow_finder(1:15, wss)[1], knee_finder(1:15, wss)[1])
		if(length(unique(class)) == 1) row_km = 1
		if(length(unique(class)) == 2) row_km = min(row_km, 2)
		if(row_km > 1) {
			row_split = kmeans(use_mat1, centers = row_km)$cluster
		}
		if(verbose) qqcat("  - split rows into @{row_km} groups by k-means clustering.\n")
	}
			

	if(verbose) qqcat("* making heatmaps for signatures\n")

	ha1 = HeatmapAnnotation(
		Class = class,
		col = list(Class = object@subgroup_col))

	dend = cluster_within_group(use_mat1, class)
	
	ht_list = Heatmap(use_mat1, top_annotation = ha1,
		name = heatmap_name, show_row_names = FALSE, 
		show_column_names = show_column_names, col = col_fun,
		use_raster = TRUE, row_split = row_split,
		show_row_dend = FALSE, cluster_columns = dend, column_split = length(unique(class)),
		column_title = qq("@{length(unique(class))} groups, @{length(all_sig)} signatures"),
		bottom_annotation = bottom_anno1,
		row_title = {if(length(unique(row_split)) <= 1) NULL else qq("k-means with @{length(unique(row_split))} groups")})

	all_value_positive = !any(m < 0)
 	if(scale_rows && all_value_positive) {
		ht_list = ht_list + Heatmap(base_mean, show_row_names = FALSE, name = "base_mean", width = unit(5, "mm")) +
			Heatmap(rel_diff, col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), 
				show_row_names = FALSE, name = "rel_diff", width = unit(5, "mm"))
	}

	draw(ht_list)
	return(invisible(sig))
})


# == title
# Compare Signatures from Different Nodes
#
# == param
# -object A `HierarchicalPartition-class` object. 
# -depth Depth of the hierarchy.
# -method Method to visualize.
# -verbose Whether to print message.
# -... Other arguments passed to `get_signatures,HierarchicalPartition-method`.
#
# == details
# It plots an Euler diagram or a UpSet plot showing the overlap of signatures from different nodes.
# On each node, the number of subgroups is inferred by `suggest_best_k,ConsensusPartition-method`.
#
# == example
# data(golub_cola_rh)
# compare_signatures(golub_cola_rh)
setMethod(f = "compare_signatures",
	signature = "HierarchicalPartition",
	definition = function(object, depth = max_depth(object), 
	method = c("euler", "upset"), verbose = interactive(), ...) {

	lt = object@list
	al = all_leaves(object)
	lt = lt[! names(lt) %in% al]
	lt = lt[nchar(names(lt)) <= depth]

	sig_list = lapply(lt, function(x) {
		tb = get_signatures(x, k = suggest_best_k(x), verbose = verbose, ..., plot = FALSE)
		if(is.null(tb)) {
			return(integer(0))
		} else {
			return(tb$which_row)
		}
	})

	l = sapply(sig_list, length) > 0
	if(any(l) && verbose) {
		qqcat("Following nodes have no signature found: \"@{paste(names(sig_list)[l], collapse=', ')}\"\n")
	}
	sig_list = sig_list[l]

	if(missing(method)) {
		if(length(sig_list) <= 3) {
			method = "euler"
		} else {
			method = "upset"
		}
	} else {
		method = match.arg(method)[1]
	}

	if(method == "euler") {
		plot(eulerr::euler(sig_list), legend = TRUE, quantities = TRUE, main = "Signatures from different nodes")
	} else {
		m = make_comb_mat(sig_list)
		if(length(comb_size(m)) > 40) {
			m = m[order(comb_size(m), decreasing = TRUE)[1:40]]
		} else {
			m = m[order(comb_size(m), decreasing = TRUE)]
		}
		draw(UpSet(m, column_title = "Signatures from different nodes"))
	}

})

# == title
# Collect classes from HierarchicalPartition object
#
# == param
# -object A `HierarchicalPartition-class` object.
# -depth Depth of the hierarchy.
# -anno A data frame of annotations for the original matrix columns. 
#       By default it uses the annotations specified in `hierarchical_partition`.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
#       ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
#
# == details
# The function plots the hierarchy of the classes.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_rh)
# collect_classes(golub_cola_rh)
# collect_classes(golub_cola_rh, depth = 2)
setMethod(f = "collect_classes",
	signature = "HierarchicalPartition",
	definition = function(object, depth = max_depth(object), 
	anno = get_anno(object[1]), anno_col = get_anno_col(object[1])) {

	cl = get_classes(object, depth = depth)[, 1]
	dend = calc_dend(object, depth = depth)

	ht_list = Heatmap(cl, name = "Class", col = object@subgroup_col, width = unit(5, "mm"),
		row_title_rot = 0, cluster_rows = dend, row_dend_width = unit(2, "cm"),
		row_split = length(unique(cl)), show_row_names = FALSE, row_title = NULL)
	if(!is.null(anno)) {
		if(is.atomic(anno)) {
			anno_nm = deparse(substitute(anno))
			anno = data.frame(anno)
			colnames(anno) = anno_nm
			if(!is.null(anno_col)) {
				anno_col = list(anno_col)
				names(anno_col) = anno_nm
			}
		} else if(ncol(anno) == 1) {
			if(!is.null(anno_col)) {
				if(is.atomic(anno_col)) {
					anno_col = list(anno_col)
					names(anno_col) = colnames(anno)
				}
			}
		}
		if(is.null(anno_col))
			ht_list = ht_list + rowAnnotation(df = anno, show_annotation_name = TRUE,
				annotation_name_side = "bottom", width = unit(ncol(anno)*5, "mm"))
		else {
			ht_list = ht_list + rowAnnotation(df = anno, col = anno_col, show_annotation_name = TRUE,
				annotation_name_side = "bottom", width = unit(ncol(anno)*5, "mm"))
		}
	}
	draw(ht_list)
})

# == title
# Subset the HierarchicalPartition object
#
# == param
# -x A `HierarchicalPartition-class` object.
# -i Index. The value should be numeric or a node ID.
#
# == details
# On each node, there is a `ConsensusPartition-class` object.
#
# Note you cannot get a sub-hierarchy of the partition.
#
# == value
# A `ConsensusPartition-class` object.
#
# == example
# data(golub_cola_rh)
# golub_cola_rh["01"]
"[.HierarchicalPartition" = function(x, i) {
	if(length(i) > 1) {
		stop_wrap("length of the index should only be one.")
	}
	if(is.numeric(i)) {
		i = names(x@list)[i]
	}
	if(is.null(x@list[[i]])) {
		stop_wrap("index for the HierarchicalPartition object cannot be found.")
	}
	return(x@list[[i]])
}

# == title
# Subset the HierarchicalPartition object
#
# == param
# -x A `HierarchicalPartition-class` object
# -i Index. The value should be numeric or a node ID.
#
# == details
# On each node, there is a `ConsensusPartition-class` object.
#
# Note you cannot get a sub-hierarchy of the partition.
#
# == value
# A `ConsensusPartition-class` object.
#
"[[.HierarchicalPartition" = function(x, i) {
	x[i]
}


# == title
# Test correspondance between predicted classes and known factors
#
# == param
# -object A `HierarchicalPartition-class` object.
# -depth Depth of the hierarchy.
# -known A vector or a data frame with known factors. By default it is the annotation table set in `hierarchical_partition`.
# -verbose Whether to print messages.
#
# == value
# A data frame with columns:
#
# - number of samples
# - p-values from the tests
# - number of classes
#
# The classifications are extracted for each depth.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_rh)
# # golub_cola_rh already has known annotations, so test_to_known_factors()
# # can be directly applied
# test_to_known_factors(golub_cola_rh)
setMethod(f = "test_to_known_factors",
	signature = "HierarchicalPartition",
	definition = function(object, known = get_anno(object[1]),
	depth = 2:max_depth(object), verbose = FALSE) {

	if(!is.null(known)) {
		if(is.atomic(known)) {
			df = data.frame(known)
			colnames(df) = deparse(substitute(known))
			known = df
		}
	} else {
		stop_wrap("Known factors should be provided.")
	}

	class = get_classes(object, depth)
	m = test_between_factors(class, known, verbose = verbose)
	colnames(m) = paste0(colnames(m), "(p)")
	df = cbind(n = nrow(class), m, n_class = apply(class, 2, function(x) length(unique(x))))
	return(df)
})

# == title
# Visualize columns after dimension reduction
#
# == param
# -object A `HierarchicalPartition-class` object.
# -depth Depth of the hierarchy.
# -top_n Top n rows to use. By default it uses all rows in the original matrix.
# -parent_node Parent node. If it is set, the function call is identical to ``dimension_reduction(object[parent_node])``
# -method Which method to reduce the dimension of the data. ``MDS`` uses `stats::cmdscale`,
#         ``PCA`` uses `stats::prcomp`. ``t-SNE`` uses `Rtsne::Rtsne`. ``UMAP`` uses
#         `umap::umap`.
# -scale_rows Whether to perform scaling on matrix rows.
# -verbose Whether print messages.
# -... Other arguments passed to `dimension_reduction,ConsensusPartition-method`.
#
# == details
# The class IDs are extract at ``depth``.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "dimension_reduction",
	signature = "HierarchicalPartition",
	definition = function(object,
	depth = max_depth(object), parent_node,
	top_n = NULL, method = c("PCA", "MDS", "t-SNE", "UMAP"),
	scale_rows = TRUE, verbose = TRUE, ...) {

	cl = as.list(match.call())
	# default value
	if(! "method" %in% names(cl)) {
		method = NULL
		if(is.null(method)) {
			oe = try(loadNamespace("umap"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("umap package is not installed.\n")
			} else {
				if(verbose) cat("use UMAP\n")
				method = "UMAP"
			}
		}
		if(is.null(method)) {
			oe = try(loadNamespace("Rtsne"), silent = TRUE)
			if(inherits(oe, "try-error")) {
				if(verbose) cat("Rtsne package is not installed.\n")
			} else {
				if(verbose) cat("use t-SNE\n")
				method = "t-SNE"
			}
		}
		if(is.null(method)) {
			if(verbose) cat("use PCA\n")
			method = "PCA"
		}
	}

	method = match.arg(method)
	data = object@list[[1]]@.env$data

	op = par(c("mar", "xpd"))
	par(mar = c(4.1, 4.1, 4.1, 6), xpd = NA)

	if(missing(parent_node)) {
		if(!is.null(top_n)) {
			top_n = min(c(top_n, nrow(data)))
			all_top_value = object@list[["0"]]@.env$all_top_value_list[[object@list[["0"]]@top_value_method]]
			ind = order(all_top_value)[1:top_n]
			data = data[ind, , drop = FALSE]
		} else {
			top_n = nrow(data)
		}
		class = get_classes(object, depth = depth)[, 1]
		n_class = length(unique(class))
		dimension_reduction(data, pch = 16, col = object@subgroup_col[class],
			cex = 1, main = qq("@{method} on @{top_n} rows with highest @{object@list[[1]]@top_value_method} scores@{ifelse(scale_rows, ', rows are scaled', '')}\n@{ncol(data)} samples at depth @{depth} with @{n_class} classes"),
			method = method, scale_rows = scale_rows, ...)
		class_level = sort(unique(class))
		legend(x = par("usr")[2], y = mean(par("usr")[3:4]), legend = c(class_level, "ambiguous"), 
			pch = c(rep(16, n_class), 0),
			col = c(object@subgroup_col[class_level], "white"), xjust = 0, yjust = 0.5,
			title = "Class", title.adj = 0.1, bty = "n",
			text.col = c(rep("black", n_class), "white"))
	} else {
		if(!parent_node %in% setdiff(all_nodes(object), all_leaves(object))) {
			stop_wrap(qq("@{parent_node} has no children nodes."))
		}
		obj = object[parent_node]
		dimension_reduction(obj, k = suggest_best_k(obj), top_n = top_n, method = method,
			scale_rows = scale_rows, ...)
		legend(x = par("usr")[2], y = par("usr")[4], legend = qq("node @{parent_node}"))
	}

	par(op)
})

# == title
# Max depth of the hierarchy
#
# == param
# -object A `HierarchicalPartition-class` object.
#
# == value
# A numeric value.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_rh)
# max_depth(golub_cola_rh)
setMethod(f = "max_depth",
	signature = "HierarchicalPartition",
	definition = function(object) {

	if(has_hierarchy(object)) {
		max(nchar(object@hierarchy[, 2]))
	} else {
		1
	}
})

# == title
# All nodes in the hierarchy
#
# == param
# -object A `HierarchicalPartition-class` object.
# -depth Depth in the hierarchy.
#
# == value
# A vector of node ID.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_rh)
# all_nodes(golub_cola_rh)
setMethod(f = "all_nodes",
	signature = "HierarchicalPartition",
	definition = function(object, depth = max_depth(object)) {

	if(has_hierarchy(object)) {
		all_nodes = unique(as.vector(object@hierarchy))
		if(!is.null(depth)) {
			all_nodes = all_nodes[nchar(all_nodes) <= depth]
		}
		return(all_nodes)
	} else {
		return(character(0))
	}
})

# == title
# All leaves in the hierarchy
#
# == param
# -object A `HierarchicalPartition-class` object.
# -depth Depth in the hierarchy.
#
# == value
# A vector of node ID.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_rh)
# all_leaves(golub_cola_rh)
setMethod(f = "all_leaves",
	signature = "HierarchicalPartition",
	definition = function(object, depth = max_depth(object)) {

	if(has_hierarchy(object)) {
		hierarchy = unique(object@hierarchy)
		if(!is.null(depth)) {
			hierarchy = hierarchy[nchar(hierarchy[, 2]) <= depth, , drop = FALSE]
		}
		
		tb = table(hierarchy)
		tb = tb[tb <= 1]
		names(tb)	
	} else {
		"0"
	}
})

get_children = function(object, node = "0") {
	hierarchy = unique(object@hierarchy)
	hierarchy[hierarchy[, 1] == node, 2]
}

# == title
# Get the original matrix
#
# == param
# -object A `HierarchicalPartition-class` object.
#
# == value
# A numeric matrix.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "get_matrix",
	signature = "HierarchicalPartition",
	definition = function(object) {
	object@.env$data
})


# == title
# Get annotations
#
# == param
# -object A `HierarchicalPartition-class` object.
#
# == value
# A data frame if ``anno`` was specified in `hierarchical_partition`, or ``NULL``.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "get_anno",
	signature = "HierarchicalPartition",
	definition = function(object) {
	get_anno(object[1])
})



# == title
# Get annotation colors
#
# == param
# -object A `HierarchicalPartition-class` object.
#
# == value
# A list of color vectors or ``NULL``.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "get_anno_col",
	signature = "HierarchicalPartition",
	definition = function(object) {
	get_anno_col(object[1])
})

# == title
# Guess the best number of partitions
#
# == param
# -object A `HierarchicalPartition-class` object.
# -jaccard_index_cutoff The cutoff for Jaccard index for comparing to previous k.
#
# == details
# It basically gives the best k at each node.
#
# == value
# A data frame with the best k and other statistics for each node.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola_rh)
# suggest_best_k(golub_cola_rh)
setMethod(f = "suggest_best_k",
	signature = "HierarchicalPartition",
	definition = function(object, jaccard_index_cutoff = 0.95) {

	best_k = NULL
	stability = NULL
	mean_silhouette = NULL
	concordance = NULL
	for(i in seq_along(object@list)) {
		obj = object@list[[i]]
		k = suggest_best_k(obj, jaccard_index_cutoff)
		best_k[i] = k

		if(is.na(best_k[i])) {
			stability[i] = NA
			mean_silhouette[i] = NA
			concordance[i] = NA
		} else {
			stat = get_stats(obj, k = best_k[i])
			stability[i] = stat[1, "1-PAC"]
			mean_silhouette[i] = stat[1, "mean_silhouette"]
			concordance[i] = stat[1, "concordance"]
		}
	}

	tb = data.frame(
		node = names(object@list),
		best_k = best_k,
		"1-PAC" = stability,
		mean_silhouette = mean_silhouette,
		concordance = concordance,
		check.names = FALSE)
	tb$n_sample = sapply(tb$node, function(id) {
		ncol(object[[id]])
	})

	rntb = rownames(tb)
	l = tb$`1-PAC` >= 0.9 & !is.na(tb$best_k)

	tb = cbind(tb, ifelse(l, ifelse(tb$`1-PAC` <= 0.95, "*", "**"), ""), stringsAsFactors = FALSE)
	colnames(tb)[ncol(tb)] = ""
	
	# l = tb[, 1] %in% all_leaves(object)
	# tb = cbind(tb, ifelse(l, "leaf", "node"), stringsAsFactors = FALSE)
	# colnames(tb)[ncol(tb)] = ""

	return(tb)
})


# == title
# Make HTML report from the HierarchicalPartition object
#
# == param
# -object A `HierarchicalPartition-class` object.
# -output_dir The output directory where the report is put.
# -mc.cores Multiple cores to use.
# -title Title of the report.
# -env Where the objects in the report are found, internally used.
#
# == details
# This function generates a HTML report which contains all plots for all nodes
# in the partition hierarchy.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# if(FALSE) {
# # the following code is runnable
# data(golub_cola_rh)
# cola_report(golub_cola_rh, output_dir = "~/test_cola_rh_report")
# }
setMethod(f = "cola_report",
	signature = "HierarchicalPartition",
	definition = function(object, output_dir = getwd(), mc.cores = 1,
	title = qq("cola Report for Hierarchical Partitioning (@{object[1]@top_value_method}:@{object[1]@partition_method})"), 
	env = parent.frame()) {

	if(max_depth(object) <= 1) {
		cat("No hierarchy is detected, no report is generated.\n")
		return(invisible(NULL))
	}

	check_pkg("genefilter", bioc = TRUE)

	var_name = deparse(substitute(object, env = env))
	make_report(var_name, object, output_dir, mc.cores = mc.cores, title = title, class = "HierarchicalPartition")
})



# == title
# Number of rows in the matrix
#
# == param
# -x A `HierarchicalPartition-class` object.
#
setMethod(f = "nrow",
	signature = "HierarchicalPartition",
	definition = function(x) {
	nrow(x@.env$data)
})

# == title
# Number of columns in the matrix
#
# == param
# -x A `HierarchicalPartition-class` object.
#
setMethod(f = "ncol",
	signature = "HierarchicalPartition",
	definition = function(x) {
	ncol(x@.env$data)
})

# == title
# Row names of the matrix
#
# == param
# -x A `HierarchicalPartition-class` object.
#
setMethod(f = "rownames",
	signature = "HierarchicalPartition",
	definition = function(x) {
	rownames(x@.env$data)
})

# == title
# Column names of the matrix
#
# == param
# -x A `HierarchicalPartition-class` object.
#
setMethod(f = "colnames",
	signature = "HierarchicalPartition",
	definition = function(x) {
	colnames(x@.env$data)
})

# == title
# Dimension of the matrix
#
# == param
# -x A `HierarchicalPartition-class` object.
#
dim.HierarchicalPartition = function(x) {
	dim(x@.env$data)
}

Try the cola package in your browser

Any scripts or data that you put into this service are public.

cola documentation built on Nov. 8, 2020, 8:12 p.m.