R/methylation_subtype_classification.R

# == title
# Classify subtypes by methylation data
#
# == param
# -gr a `GenomicRanges::GRanges` which contains mean methylation, should be generated by `get_mean_methylation_in_genomic_features`
# -n_class number of classes expected
# -pct_cutoff percent of most variable rows
# -corr_cutoff cutoff for absolute correlation
# -k number of correlated windows
# -ha additional annotation
# -cgi a `GenomicRanges::GRanges` object which contains CpG islands
# -shore a `GenomicRanges::GRanges` object which contains CpG shores
#
# == details
# For the subtype classification which is based on clustering, if there are clear subtypes, 
# it is expected that there must be a group of rows that show high correlation to each other. 
# Based on this correlation feature, we select rows that under cutoff of ``corr_cutoff``, 
# each row should correlate to at least other ``k`` rows. On the second hand, since difference between 
# subtypes are not in an identical position, we first separate samples into two groups based on consensus clustering, 
# then, for the subgroup which contains more samples, we separate them again into two subgroups. 
# We apply it repeatedly until there are ``n_class`` subtypes. On every step of clustering, 
# we select rows based on the correlation criterion and the final rows are union of rows in all iterations.
#
# CpG islands and shores will be added as row annotations to the heatmap.
#
# == value
# A vector with predicted classification of samples
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
methylation_subtype_classfication = function(gr, n_class, pct_cutoff = 1, corr_cutoff = 0.5, 
	k, ha = NULL, cgi = NULL, shore = NULL) {

	mat = as.matrix(mcols(gr))
	rownames(mat) = seq_len(nrow(mat))
	mat = mat[, colnames(mat) != "ncpg"]

	get_class = function(mat, p, corr, k) {
		od = order(rowVars(mat), decreasing = TRUE)[1:round(nrow(mat)*p)]
		mat2 = mat[od, ]

		ct_cgi = cor_columns(t(mat2), abs_cutoff = corr, mc = 2)

		ind = which(ct_cgi[, as.character(corr)] >= k)
		mat2 = mat2[ind, ]
		if(nrow(mat2) > 5000) {
			l = sample(seq_len(nrow(mat2)), 5000)
		} else {
			l = rep(TRUE, nrow(mat2))
		}
		qqcat("@{sum(l)} rows remains\n")
		pdf(NULL)
		res = ConsensusClusterPlus(mat2[l, ], maxK = 2, 
			clusterAlg = "km", distance = "euclidean", reps = 1000, verbose = TRUE)
		dev.off()
		list(class = lapply(res[-1], function(x) x$consensusClass), row_index = rownames(mat2))
	}

	# recursive consensus clustering
	cl = get_class(mat, p = pct_cutoff, corr = corr_cutoff, k = k)
	class = cl$class[[1]]
	row_index = cl$row_index
	i_class = 2

	while(i_class < n_class) {

		tb = table(class); i = as.numeric(names(tb)[which.max(tb)])
		cl = get_class(mat[, class == i], p = pct_cutoff, corr = corr_cutoff, k = k)
		class[class == i] = cl$class[[1]] + length(unique(class)) + 1
		row_index = union(row_index, cl$row_index)

		i_class = i_class + 1
	}

	n = length(row_index)
	if(length(row_index) > 5000) row_index = sample(row_index, 5000)

	m = NULL
	class2 = NULL
	for(i in unique(class)) {
		dend = as.dendrogram(hclust(dist(t(mat[row_index, class == i]))))
		dend = reorder(dend, colMeans(mat[row_index, class == i]))
		col_order = order.dendrogram(dend)
		m = cbind(m, mat[row_index, class == i][, col_order])
		class2 = c(class2, class[class == i][col_order])
	}


	ha_cc = HeatmapAnnotation(consensusCL = class2, col = list(consensusCL = structure(seq_along(unique(class2)), names = unique(class2))), show_legend  = FALSE)
	
	if(is.null(cgi) && is.null(shore)) {
		ht = Heatmap(m, name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), show_row_names = FALSE,
			top_annotation = ha, bottom_annotation = ha_cc, bottom_annotation_height = unit(3, "mm"),
			cluster_columns = FALSE, column_title = qq("@{n} 1kb windows"))
	} else if(!is.null(cgi) && is.null(shore)) {
		gr2 = annotate_to_genomic_features(gr[as.numeric(row_index)], list(cgi), name = c("cgi"))
		anno = ifelse(gr2$overlap_to_cgi > 0, "CGI", "Others")

		ht = Heatmap(m, name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), show_row_names = FALSE,
			top_annotation = ha, bottom_annotation = ha_cc, bottom_annotation_height = unit(3, "mm"), split = anno, 
			cluster_columns = FALSE, column_title = qq("@{n} 1kb windows")) + 
		Heatmap(anno, name = "anno", col = c("CGI" = "orange", "Others" = "blue"))
	} else if(is.null(cgi) && !is.null(shore)) {
		gr2 = annotate_to_genomic_features(gr[as.numeric(row_index)], list(shore), name = c("shore"))
		anno = ifelse(gr2$overlap_to_cgi > 0, "Shore", "Others")

		ht = Heatmap(m, name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), show_row_names = FALSE,
			top_annotation = ha, bottom_annotation = ha_cc, bottom_annotation_height = unit(3, "mm"), split = anno, 
			cluster_columns = FALSE, column_title = qq("@{n} 1kb windows")) + 
		Heatmap(anno, name = "anno", col = c("Shore" = "green", "Others" = "blue"))
	} else if(!is.null(cgi) && !is.null(shore)) {
		gr2 = annotate_to_genomic_features(gr[as.numeric(row_index)], list(cgi, shore), name = c("cgi", "shore"))
		anno = ifelse(gr2$overlap_to_shore > 0, ifelse(gr2$overlap_to_cgi > 1 - gr2$overlap_to_shore, "CGI", "Shore"), "Others")

		ht = Heatmap(m, name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), show_row_names = FALSE,
			top_annotation = ha, bottom_annotation = ha_cc, bottom_annotation_height = unit(3, "mm"), split = anno, 
			cluster_columns = FALSE, column_title = qq("@{n} 1kb windows")) + 
		Heatmap(anno, name = "anno", col = c("CGI" = "orange", "Shore" = "green", "Others" = "blue"))
	}
	
	draw(ht)
	for(an in sapply(ha@anno_list, function(x) x@name)) {
		decorate_annotation(an, {
			grid.text(an, x = unit(-2, "mm"), just = "right")
		})
	}

	return(invisible(class2))
}
eilslabs/epic documentation built on May 16, 2019, 1:24 a.m.