R/correlated_regions_enrichedheatmap.R

Defines functions normalize_epigenomic_signals merge_row_order add_boxplot_as_column_annotation cr_enriched_heatmap_at_tss_cgi cr_enriched_heatmap_at_tss cr_enriched_heatmap_at_gene cr_enriched_heatmap_at_genomic_features

Documented in cr_enriched_heatmap_at_gene cr_enriched_heatmap_at_genomic_features cr_enriched_heatmap_at_tss cr_enriched_heatmap_at_tss_cgi

normalize_epigenomic_signals = function(cr, target, marks = NULL, expr = NULL, include_correlation_matrix = TRUE,
	extend = 5000, target_ratio = 0.1) {

	cr_param = metadata(cr)$cr_param
	sample_id = cr_param$sample_id
	subgroup = cr_param$subgroup
	subgroup_level = unique(subgroup)
	n_subgroup = length(subgroup_level)

	target_name = deparse(substitute(target))

	############ enriched to methylations ##################
	message(qq("normalizing methylation to @{target_name}"))

	if(include_correlation_matrix) {
		meth_mat_corr = normalizeToMatrix(cr, target, mapping_column = "gene_id", value_column = "corr",
			extend = extend, mean_mode = "absolute", target_ratio = target_ratio, background = 0)
	} else {
		meth_mat_corr = NULL
	}

	meth_mat_mean = enrich_with_methylation(target, sample_id, target_ratio = target_ratio, extend = extend)
	meth_mat_mean[attr(meth_mat_mean, "failed_rows"), ] = 0.5

	if(n_subgroup <= 1) {
		meth_mat_diff = enrich_with_methylation(target, sample_id, target_ratio = target_ratio, extend = extend, mode = rowIQRs)
		meth_mat_diff[attr(meth_mat_diff, "failed_rows"), ] = 0
	} else if(n_subgroup == 2) {
		meth_mat_mean_1 = enrich_with_methylation(target, sample_id[subgroup == subgroup_level[1]], target_ratio = target_ratio, extend = extend)
		failed_rows = attr(meth_mat_mean_1, "failed_rows")
		message(qq("There are @{length(failed_rows)} failed rows when normalizing methylation to the targets."))
		meth_mat_mean_1[failed_rows, ] = 0.5

		meth_mat_mean_2 = enrich_with_methylation(target, sample_id[subgroup == subgroup_level[2]], target_ratio = target_ratio, extend = extend)
		failed_rows = attr(meth_mat_mean_2, "failed_rows")
		message(qq("There are @{length(failed_rows)} failed rows when normalizing methylation to the targets."))
		meth_mat_mean_2[failed_rows, ] = 0.5
		meth_mat_diff = meth_mat_mean_1 - meth_mat_mean_2
	} else {
		meth_mat_mean_list = lapply(subgroup_level, function(le) {
			meth_mat_mean = enrich_with_methylation(target, sample_id[subgroup == le], target_ratio = target_ratio, extend = extend)
			failed_rows = attr(meth_mat_mean, "failed_rows")
			message(qq("There are @{length(failed_rows)} failed rows when normalizing methylation to the targets."))
			meth_mat_mean[failed_rows, ] = 0.5
			meth_mat_mean
		})
		meth_array_mean = array(dim = c(dim(meth_mat_mean), n_subgroup))
		for(i in seq_along(meth_mat_mean_list)) {
			meth_array_mean[, , i] = meth_mat_mean_list[[i]]
		}
		meth_mat_diff = apply(meth_array_mean, c(1, 2), max)  - apply(meth_array_mean, c(1, 2), min)
		meth_mat_diff = copyAttr(meth_mat_mean, meth_mat_diff)
	}


	################# enrich to histome modifications #################
	hist_mat_corr_list = list()
	hist_mat_mean_list = list()
	hist_mat_diff_list = list()
	
	for(k in seq_along(marks)) {
		
		message(qq("normalizing @{marks[k]} signals to @{target_name}"))

		hm_sample_id = intersect(sample_id, chipseq_hooks$sample_id(marks[k]))
		hm_subgroup = subgroup[sample_id %in% hm_sample_id]
		hm_subgroup_level = unique(hm_subgroup)
		n_hm_subgroup = length(hm_subgroup_level)

		# applied to each sample, each mark
		lt = enrich_with_histone_mark(target, sample_id = hm_sample_id, mark = marks[k], return_arr = TRUE, 
			target_ratio = target_ratio, extend = extend)
		hist_mat_mean_list[[k]] = lt[[2]]
		arr = lt[[1]]

		# only calculate the correlation when there are enough samples
		if(length(hm_sample_id) >= 5 && include_correlation_matrix) {
			# detect regions that histone MARKS correlate to expression
			expr2 = expr[target$gene_id, intersect(colnames(expr), hm_sample_id)]
			hist_mat_corr = matrix(nrow = nrow(expr2), ncol = ncol(meth_mat_mean))

			counter = set_counter(nrow(hist_mat_corr), fmt = "  calculate correlation for %s rows.")
			for(i in seq_len(nrow(hist_mat_corr))) {
				counter()
			    for(j in seq_len(ncol(hist_mat_corr))) {
			        suppressWarnings(x <- cor(arr[i, j, ], expr2[i, ], method = "spearman"))
			        hist_mat_corr[i, j] = x
			    }
			}
			hist_mat_corr[is.na(hist_mat_corr)] = 0
			hist_mat_corr = copyAttr(meth_mat_mean, hist_mat_corr)
			hist_mat_corr_list[[k]] = hist_mat_corr
		} else {
			hist_mat_corr_list[k] = list(NULL)
		}

		if(n_hm_subgroup <= 1) {
			hist_mat_diff_list[[k]] = apply(arr, c(1, 2), IQR, na.rm = TRUE)
			hist_mat_diff_list[[k]] = copyAttr(meth_mat_mean, hist_mat_diff_list[[k]])
		} else if(n_hm_subgroup == 2) {
			hm_sample_id_subgroup1 = hm_sample_id[hm_subgroup == hm_subgroup_level[1]]
			hm_sample_id_subgroup2 = hm_sample_id[hm_subgroup == hm_subgroup_level[2]]
			h1 = apply(arr[, , hm_sample_id_subgroup1], c(1, 2), mean, na.rm = TRUE)
			h2 = apply(arr[, , hm_sample_id_subgroup2], c(1, 2), mean, na.rm = TRUE)
			hist_mat_diff_list[[k]] = h1 - h2
			hist_mat_diff_list[[k]] = copyAttr(meth_mat_mean, hist_mat_diff_list[[k]])
		} else {
			h_list = lapply(hm_subgroup_level, function(le) {
				apply(arr[, , hm_sample_id[hm_subgroup == le]], c(1, 2), mean, na.rm = TRUE)
			})
			hm_array_mean = array(dim = c(dim(meth_mat_mean), n_hm_subgroup))
			for(i in seq_along(h_list)) {
				hm_array_mean[, , i] = h_list[[i]]
			}
			hist_mat_diff_list[[k]] = apply(hm_array_mean, c(1, 2), max)  - apply(hm_array_mean, c(1, 2), min)
			hist_mat_diff_list[[k]] = copyAttr(meth_mat_mean, hist_mat_diff_list[[k]])
		}
	}
	names(hist_mat_corr_list) = marks
	names(hist_mat_mean_list) = marks
	names(hist_mat_diff_list) = marks

	return(list(meth_mat_corr = meth_mat_corr,
		        meth_mat_mean = meth_mat_mean,
		        meth_mat_diff = meth_mat_diff,
		        hist_mat_corr_list = hist_mat_corr_list,
		        hist_mat_mean_list = hist_mat_mean_list,
		        hist_mat_diff_list = hist_mat_diff_list))
}

if(is.memoised(normalize_epigenomic_signals)) {
	normalize_epigenomic_signals = memoise(normalize_epigenomic_signals)
}

merge_row_order = function(mat, l_list) {
	do.call("c", lapply(l_list, function(l) {
		if(sum(l) == 0) return(integer(0))
		if(sum(l) == 1) return(which(l))
		dend1 = as.dendrogram(hclust(dist_by_closeness2(mat[l, ])))
		dend1 = reorder(dend1, rowMeans(mat[l, ]))
		od = order.dendrogram(dend1)
		which(l)[od]
	}))
}


add_boxplot_as_column_annotation = function(ht_list, width, anno_name, anno_title = anno_name) {
	gl = width

	row_order_list = row_order(ht_list)
	lt = lapply(row_order_list, function(ind) gl[ind])
	bx = boxplot(lt, plot = FALSE)$stats
	n = length(row_order_list)
	x_ind = (seq_len(n) - 0.5)/n
	w = 1/n*0.5
	decorate_annotation(anno_name, slice = 1, {
		rg = range(bx)
		rg[1] = rg[1] - (rg[2] - rg[1])*0.1
		rg[2] = rg[2] + (rg[2] - rg[1])*0.1
		pushViewport(viewport(y = unit(1, "npc") + unit(1, "mm"), just = "bottom", height = unit(2, "cm"), yscale = rg))
		grid.rect(gp = gpar(col = "black"))
		grid.segments(x_ind - w/2, bx[5, ], x_ind + w/2, bx[5, ], default.units = "native", gp = gpar(lty = 1:2))
		grid.segments(x_ind - w/2, bx[1, ], x_ind + w/2, bx[1, ], default.units = "native", gp = gpar(lty = 1:2))
		grid.segments(x_ind, bx[1, ], x_ind, bx[5, ], default.units = "native", gp = gpar(lty = 1:2))
		grid.rect(x_ind, colMeans(bx[c(4, 2), ]), width = w, height = bx[4, ] - bx[2, ], default.units = "native", gp = gpar(fill = "white", lty = 1:2))
		grid.segments(x_ind - w/2, bx[3, ], x_ind + w/2, bx[3, ], default.units = "native", gp = gpar(lty = 1:2))
		grid.yaxis(main = FALSE, gp = gpar(fontsize = 8))
		grid.text(anno_title, y = unit(1, "npc") + unit(2.5, "mm"), gp = gpar(fontsize = 14), just = "bottom")
		upViewport()
	})
}

# == title
# Visualizing enrichment for epigenomic signals at TSS-CGIs
#
# == param
# -cr correalted regions
# -txdb transcriptome annotation which was used in `correlated_regions`
# -expr expression matrix which was used in `correlated_regions`
# -cgi CpG island, a `GenomicRanges::GRanges` object
# -fdr_cutoff cutoff for fdr, used to filter significant CRs
# -meth_diff_cutoff cutoff for methylation difference. If there are no subgroup information or only one subgroup,
#             ``meth_IQR`` column is used for filtering. If there are more than one subgroups, ``meth_diameter``
#             column is used for filtering.
# -marks names of histone marks, should be supported in `chipseq_hooks`
# -type visualize negative correlated regions or positive correlated regions
# -extend base pairs extended to upstream and downstream
# -expr_ha a `ComplexHeatmap::HeatmapAnnotation` class object. It is used for the expression heatmap
#
# == details
# There are several heatmaps visualize various signals enriched at TSS-CGIs. In the plot, in the extended
# CGI region, one CGI only overlaps to one gene TSS and one gene TSS should only overlap to one extended CGI.
# Since one CGI only corresponds to one gene, in the heatmap, each row corresponds to one single gene.
#
# There are following heatmaps:
#
# - heatmap for gene expression
# - If ``cr`` is returned form `cr_enriched_heatmap`, there is a one column heatmap which
#   shows the k-means cluters genes belong to
# - heatmap for significant correlated regions
# - a point plot showing CGI length
# - heatmap for correlation between methylation and gene expression
# - heatmap for mean methylation
# - heatmap for metnylation difference
# - heatmap for correlation, mean signal and signal difference for histone marks
#
# If there are more than 12 heatmaps, they will be put into two pages.
# 
# Heatmaps are split into two sub-clusters by k-means clustering on the mean methylation matrix.
# If there are two subgroups in all samples, each subcluster are split by high expression/low expression
# in subgroup 1. In each high expression/low expression, rows are split by the k-means clusters calculated
# in `cr_enriched_heatmap`. Finally, rows are clustered by considering closeness of signals in the extended
# CGI regions.
#
# == value
# no value is returned
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
cr_enriched_heatmap_at_tss_cgi = function(cr, txdb, expr, cgi,
	fdr_cutoff = 0.05, meth_diff_cutoff = 0.1, marks = NULL, type = "neg", extend = 5000,
	expr_ha) {
	
	n_subgroup = NULL
	subgroup = NULL
	subgroup_level = NULL
	expr_col_od = NULL
	km_col = NULL
	mat_mix = NULL

	eval(SNIPPET_ATTACH_CR_PARAM)

	message(qq("filter sigCR by fdr_cutoff = @{fdr_cutoff}, meth_diff_cutoff = @{meth_diff_cutoff}"))
	eval(SNIPPET_FILTER_SIG_CR)
		
	gene = genes(txdb)

	message("extracting gene tss")
	tss = promoters(gene, upstream = 1, downstream = 0)
	tss = tss[names(tss) %in% sig_cr$gene_id]

	if(length(extend) == 1) extend = rep(extend, 2)

	cgi_extend = cgi
	start(cgi_extend) = start(cgi) - extend[1]
	end(cgi_extend) = end(cgi) + extend[2]
	
	# there should only be one tss in +-5kb of CGI and one tss should
	# only overlaps to one extended CGI
	mtch = as.matrix(findOverlaps(cgi_extend, tss))
	t1 = table(mtch[, 1])
	t2 = table(mtch[, 2])
	s1 = as.numeric(names(t1[t1 == 1]))
	s2 = as.numeric(names(t2[t2 == 1]))
	l = mtch[, 1] %in% s1 & mtch[, 2] %in% s2
	mtch = mtch[l, ]
	cgi2 = cgi[mtch[, 1]]
	cgi2$gene_id = names(tss[mtch[, 2]])
	names(cgi2) = cgi2$gene_id
	strand(cgi2) = strand(tss[mtch[, 2]])
	message(qq("@{length(cgi2)} left filtered by one-to-one mapping between tss and cgi"))

	target_ratio = mean(width(cgi2))/(sum(extend) + mean(width(cgi2)))
	target = cgi2
	target_name = "cgi"

	message("normalize to sigCR")
	eval(SNIPPET_NORMALIZE_SIG_CR)

	if(is.not.null(km)) {
		km = km[names(target)]
	}
	cgi2 = cgi2[names(target)]

	cgi_width = width(cgi2)
	cgi_width[cgi_width > quantile(cgi_width, 0.99)] = quantile(cgi_width, 0.99)
	width_anno_name = "cgi_width"
	width_anno = cgi_width

	message("normalize to epi signals")
	eval(SNIPPET_NORMALIZE_EPI_SIGNALS)

	########### prepare the order of rows and columns
	message("determing row and colum orders")
	eval(SNIPPET_ROW_ORDER_AND_COLUMN_ORDER)

	cor_col_fun = colorRamp2(c(-1, 0, 1), c("darkgreen", "white", "red"))
	meth_col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
	cr_col = c("-1" = "darkgreen", "0" = "white", "1" = "red")

	fixed_heatmap = 2

	eval(SNIPPET_HEATMAP_PAGE)

	if(missing(expr_ha)) {
		if(n_subgroup >= 2) {
			expr_ha = HeatmapAnnotation(subgroup = subgroup, col = list(subgroup = structure(rand_color(n_subgroup), names = subgroup_level)), 
				show_annotation_name = TRUE, annotation_name_side = "left", annotation_name_gp = gpar(fontsize = 10))
		}
	}

	######### construct heatmap list  ############
	## if there are too many heatmaps, they will be put in two pages.
	epi_color = c(brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))
	epi_mark_list = list()
	epi_title_list = list()

	n_heatmap = 0
	n_row_group = 2
	expr = t(apply(expr, 1, function(x) {
		qu = quantile(x, c(0.05, 0.95), na.rm = TRUE)
	    x[x < qu[1]] = qu[1]
	    x[x > qu[2]] = qu[2]
	    x
	}))
	expr = t(scale(t(expr)))
	ht_list = Heatmap(expr, name = "expr", show_row_names = FALSE,
		show_column_names = FALSE, width = unit(5, "cm"), show_column_dend = FALSE, cluster_columns = FALSE, column_order = expr_col_od,
		top_annotation = expr_ha, column_title = "Expression", show_row_dend = FALSE,
		use_raster = TRUE, raster_quality = 2)
	n_heatmap = n_heatmap + 1
	
	if(is.not.null(km)) {
		ht_list = ht_list + Heatmap(km[names(target)], name = "km_groups", col = km_col, show_row_names = FALSE,
			width = unit(0.5, "cm"))
	}

	ht_list = ht_list + EnrichedHeatmap(mat_mix, name = qq("@{type}CR"), col = cr_col,
	                top_annotation = HeatmapAnnotation(lines1 = anno_enriched(gp = gpar(neg_col = "darkgreen", pos_col = "red", lty = 1:n_row_group))), 
	                top_annotation_height = unit(2, "cm"), column_title = qq("sig@{type}CR"),
	                use_raster = TRUE, raster_quality = 2, combined_name_fun = NULL)
	n_heatmap = n_heatmap + 1

	ht_list = ht_list + rowAnnotation(foo_width = row_anno_points(width_anno, axis = TRUE, gp = gpar(col = "#00000040")),
	                width = unit(1, "cm"))

	message("append epi heatmaps")
	eval(SNIPPET_APPEND_EPI_HEATMAP)

	message("draw heatmaps")
	eval(SNIPPET_DRAW_HEATMAP)

	return(invisible(NULL))
}


# == title
# Visualizing enrichment for epigenomic signals at TSS
#
# == param
# -cr correalted regions
# -txdb transcriptome annotation which was used in `correlated_regions`
# -expr expression matrix which was used in `correlated_regions`
# -cgi CpG island, a `GenomicRanges::GRanges` object
# -fdr_cutoff cutoff for fdr
# -meth_diff_cutoff cutoff for methylation difference. If there are no subgroup information or only one subgroup,
#             ``meth_IQR`` column is used for filtering. If there are more than one subgroups, ``meth_diameter``
#             column is used for filtering.
# -marks names of histone marks, should be supported in `chipseq_hooks`
# -type visualize negative correlated regions or positive correlated regions
# -extend base pairs extended to upstream and downstream
# -expr_ha a `ComplexHeatmap::HeatmapAnnotation` class object.It is used for the expression heatmap
#
# == details
# There are several heatmaps visualize various signals enriched at gene TSS.
#
# There are following heatmaps:
#
# - heatmap for gene expression
# - a point plot showing gene length
# - If ``cr`` is returned form `cr_enriched_heatmap`, there is a one column heatmap which
#   shows the k-means cluters genes belong to
# - heatmap for CGI enrichment at TSS
# - heatmap for significant correlated regions
# - heatmap for correlation between methylation and gene expression
# - heatmap for mean methylation
# - heatmap for metnylation difference
# - heatmap for correlation, mean signal and signal difference for histone marks
#
# If there are more than 12 heatmaps, they will be put into two pages.
# 
# Heatmaps are split into two sub-clusters by k-means clustering on the mean methylation matrix.
# If there are two subgroups in all samples, each subcluster are split by high expression/low expression
# in subgroup 1. In each high expression/low expression, rows are split by the k-means clusters calculated
# in `cr_enriched_heatmap`. Finally, rows are clustered by considering closeness of signals in the extended
# TSS regions.
#
# == value
# no value is returned
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
cr_enriched_heatmap_at_tss = function(cr, txdb, expr, cgi, fdr_cutoff = 0.05, 
	meth_diff_cutoff = 0.1, marks = NULL, type = "neg", extend = c(5000, 10000),
	expr_ha) {
	
	sig_cr = NULL
	n_subgroup = NULL
	subgroup = NULL
	subgroup_level = NULL
	expr_col_od = NULL
	km_col = NULL
	mat_mix = NULL

	eval(SNIPPET_ATTACH_CR_PARAM)

	message(qq("filter sigCR by fdr_cutoff = @{fdr_cutoff}, meth_diff_cutoff = @{meth_diff_cutoff}"))
	eval(SNIPPET_FILTER_SIG_CR)
		
	gene = genes(txdb)

	message("extracting gene tss")
	tss = promoters(gene, upstream = 1, downstream = 0)
	tss = tss[names(tss) %in% sig_cr$gene_id]
	target = tss
	axis_name = c("-5KB", "TSS", "5KB")
	target_ratio = 0.1
	target_name = "tss"
	
	message("normalize to sigCR")
	eval(SNIPPET_NORMALIZE_SIG_CR)

	if(is.not.null(km)) {
		km = km[names(target)]
	}

	gl = width(gene[tss$gene_id])
	gl[gl > quantile(gl, 0.95)] = quantile(gl, 0.95)
	width_anno_name = "gene_width"
	width_anno = gl

	# normalize to CGI
	mat_cgi = normalizeToMatrix(cgi, target, extend = extend, mean_mode = "absolute")

	message("normalize to epi signals")
	eval(SNIPPET_NORMALIZE_EPI_SIGNALS)

	########### prepare the order of rows and columns
	message("determing row and colum orders")
	eval(SNIPPET_ROW_ORDER_AND_COLUMN_ORDER)

	cor_col_fun = colorRamp2(c(-1, 0, 1), c("darkgreen", "white", "red"))
	meth_col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
	cr_col = c("-1" = "darkgreen", "0" = "white", "1" = "red")

	fixed_heatmap = 3
	eval(SNIPPET_HEATMAP_PAGE)

	if(missing(expr_ha)) {
		if(n_subgroup >= 2) {
			expr_ha = HeatmapAnnotation(subgroup = subgroup, col = list(subgroup = structure(rand_color(n_subgroup), names = subgroup_level)), 
				show_annotation_name = TRUE, annotation_name_side = "left", annotation_name_gp = gpar(fontsize = 10))
		}
	}

	######### construct heatmap list  ############
	## if there are too many heatmaps, they will be put in two pages.
	epi_color = c(brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))
	epi_mark_list = list()
	epi_title_list = list()

	n_heatmap = 0
	n_row_group = 2
	expr = t(apply(expr, 1, function(x) {
		qu = quantile(x, c(0.05, 0.95), na.rm = TRUE)
	    x[x < qu[1]] = qu[1]
	    x[x > qu[2]] = qu[2]
	    x
	}))
	expr = t(scale(t(expr)))
	ht_list = Heatmap(expr, name = "expr", show_row_names = FALSE,
		show_column_names = FALSE, width = unit(5, "cm"), show_column_dend = FALSE, cluster_columns = FALSE, column_order = expr_col_od,
		top_annotation = expr_ha, column_title = "Expression", show_row_dend = FALSE,
		use_raster = TRUE, raster_quality = 2)
	n_heatmap = n_heatmap + 1
	
	ht_list = ht_list + rowAnnotation(foo_width = row_anno_points(width_anno, axis = TRUE, gp = gpar(col = "#00000040")),
	                width = unit(1, "cm"))

	if(is.not.null(km)) {
		ht_list = ht_list + Heatmap(km[names(target)], name = "km_groups", col = km_col, show_row_names = FALSE,
			width = unit(1, "cm"))
	}

	ht_list = ht_list + EnrichedHeatmap(mat_cgi, col = c("white", "darkorange"), name = "CGI",
	  top_annotation = HeatmapAnnotation(lines1 = anno_enriched(gp = gpar(col = "darkorange", lty = 1:n_row_group))), 
	  top_annotation_height = unit(2, "cm"), column_title = "CGI", axis_name = axis_name,
	  use_raster = TRUE, raster_quality = 2) 


	ht_list = ht_list + EnrichedHeatmap(mat_mix, name = qq("@{type}CR"), col = cr_col, 
	                top_annotation = HeatmapAnnotation(lines1 = anno_enriched(gp = gpar(neg_col = "darkgreen", pos_col = "red", lty = 1:n_row_group))), 
	                top_annotation_height = unit(2, "cm"), column_title = qq("sig@{type}CR"),
	                use_raster = TRUE, raster_quality = 2, combined_name_fun = NULL, axis_name = axis_name)
	n_heatmap = n_heatmap + 1

	message("append epi heatmaps")
	eval(SNIPPET_APPEND_EPI_HEATMAP)

	message("draw heatmaps")
	eval(SNIPPET_DRAW_HEATMAP)

	return(invisible(NULL))

}

# == title
# Visualizing enrichment for epigenomic signals at gene body
#
# == param
# -cr correalted regions
# -txdb transcriptome annotation which was used in `correlated_regions`
# -expr expression matrix which was used in `correlated_regions`
# -cgi CpG island, a `GenomicRanges::GRanges` object
# -K which k-means cluster which is generated by `cr_enriched_heatmap`
# -marks names of histone marks, should be supported in `chipseq_hooks`
# -extend base pairs extended to upstream and downstream
# -expr_ha a `ComplexHeatmap::HeatmapAnnotation` class object.It is used for the expression heatmap
#
# == details
# There are several heatmaps visualize various signals enriched at gene body
#
# There are following heatmaps:
#
# - heatmap for gene expression
# - a point plot showing gene length
# - If ``cr`` is returned form `cr_enriched_heatmap`, there is a one column heatmap which
#   shows the k-means cluters genes belong to
# - heatmap for CGI enrichment at TSS
# - heatmap for correlation between methylation and gene expression
# - heatmap for mean methylation
# - heatmap for metnylation difference
# - heatmap for correlation, mean signal and signal difference for histone marks
#
# If there are more than 12 heatmaps, they will be put into two pages.
# 
# Heatmaps are split into three sub-clusters by k-means clustering on the mean methylation matrix.
# If there are two subgroups in all samples, each subcluster are split by high expression/low expression
# in subgroup 1. In each high expression/low expression, rows are split by the k-means clusters calculated
# in `cr_enriched_heatmap`. Finally, rows are clustered by mean methylation matrix.
#
# == value
# no value is returned
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
cr_enriched_heatmap_at_gene = function(cr, txdb, expr, cgi, K = 1, marks = NULL, 
	extend = 5000, expr_ha) {

	sample_id = NULL
	n_subgroup = NULL
	subgroup = NULL
	subgroup_level = NULL
	meth_mat_mean = NULL
	km_col = NULL
	
	eval(SNIPPET_ATTACH_CR_PARAM)

	if(is.null(km)) {
		stop("`cr` should be returned by ``.")
	}

	gi = names(km[km == K])
	cr = cr[cr$gene_id %in% gi]
		
	gene = genes(txdb)
	gene = gene[names(gene) %in% cr$gene_id]

	target = gene
	target_name = "gene"
	target_ratio = 0.6
	axis_name = c("-5KB", "TSS", "TES", "5KB")

	expr = expr[names(gene), , drop = FALSE]
	
	km = km[names(gene)]

	# normalize to CGI
	mat_cgi = normalizeToMatrix(cgi, gene, extend = extend, target_ratio = target_ratio, mean_mode = "absolute")

	message("normalize to epi signals")
	eval(SNIPPET_NORMALIZE_EPI_SIGNALS)

	gl = width(gene[gene$gene_id])
	gl[gl > quantile(gl, 0.95)] = quantile(gl, 0.95)
	width_anno_name = "gene_width"
	width_anno = gl

	expr = expr[names(gene), sample_id, drop = FALSE]
	if(n_subgroup == 2) {
		expr_mean = rowMeans(expr[, subgroup == subgroup_level[1], drop = FALSE]) - 
					rowMeans(expr[, subgroup == subgroup_level[1], drop = FALSE])
		expr_split = ifelse(expr_mean > 0, "high", "low")
		expr_split = factor(expr_split, levels = c("high", "low"))
	} else {
		expr_split = NULL
	}

	n_upstream_index = length(attr(meth_mat_mean, "upstream_index"))
	meth_split = kmeans(meth_mat_mean[, seq(round(n_upstream_index*0.8), round(n_upstream_index*1.2))], centers = 3)$cluster
	x = tapply(rowMeans(meth_mat_mean[, seq(round(n_upstream_index*0.8), round(n_upstream_index*7/5))]), meth_split, mean)
	od = structure(order(x), names = names(x))
	meth_split = paste0("cluster", od[as.character(meth_split)])

	if(n_subgroup == 2) {
		combined_split = paste(meth_split, expr_split, sep = "|")
		row_order = merge_row_order(meth_mat_mean, list(
			combined_split == "cluster1|high",
			combined_split == "cluster1|low",
			combined_split == "cluster2|high",
			combined_split == "cluster2|low",
			combined_split == "cluster3|high",
			combined_split == "cluster3|low"
		))
	} else {
		combined_split = meth_split
		row_order = merge_row_order(meth_mat_mean, list(
			combined_split == "cluster1",
			combined_split == "cluster2",
			combined_split == "cluster3"
		))
	}

	expr_col_od = do.call("c", lapply(subgroup_level, function(le) {
		dend1 = as.dendrogram(hclust(dist(t(expr[, subgroup == le, drop = FALSE]))))
		hc1 = as.hclust(reorder(dend1, colMeans(expr[, subgroup == le, drop = FALSE])))
		col_od1 = hc1$order
		which(subgroup == le)[col_od1]
	}))

	cor_col_fun = colorRamp2(c(-1, 0, 1), c("darkgreen", "white", "red"))
	meth_col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
	cr_col = c("-1" = "darkgreen", "0" = "white", "1" = "red")

	fixed_heatmap = 2
	eval(SNIPPET_HEATMAP_PAGE)

	if(missing(expr_ha)) {
		if(n_subgroup >= 2) {
			expr_ha = HeatmapAnnotation(subgroup = subgroup, col = list(subgroup = structure(rand_color(n_subgroup), names = subgroup_level)), 
				show_annotation_name = TRUE, annotation_name_side = "left", annotation_name_gp = gpar(fontsize = 10))
		}
	}

	######### construct heatmap list  ############
	## if there are too many heatmaps, they will be put in two pages.
	epi_color = c(brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))
	epi_mark_list = list()
	epi_title_list = list()

	n_heatmap = 0
	n_row_group = 3
	expr = t(apply(expr, 1, function(x) {
		qu = quantile(x, c(0.05, 0.95), na.rm = TRUE)
	    x[x < qu[1]] = qu[1]
	    x[x > qu[2]] = qu[2]
	    x
	}))
	expr = t(scale(t(expr)))
	ht_list = Heatmap(expr, name = "expr", show_row_names = FALSE,
		show_column_names = FALSE, width = unit(5, "cm"), show_column_dend = FALSE, cluster_columns = FALSE, column_order = expr_col_od,
		top_annotation = expr_ha, column_title = "Expression", show_row_dend = FALSE,
		use_raster = TRUE, raster_quality = 2)
	n_heatmap = n_heatmap + 1
	
	ht_list = ht_list + rowAnnotation(foo_width = row_anno_points(width_anno, axis = TRUE, gp = gpar(col = "#00000040")),
	                width = unit(1, "cm"))

	ht_list = ht_list + Heatmap(km[names(target)], name = "km_groups", col = km_col, show_row_names = FALSE,
			width = unit(1, "cm"))

	ht_list = ht_list + EnrichedHeatmap(mat_cgi, col = c("white", "darkorange"), name = "CGI",
	  top_annotation = HeatmapAnnotation(lines1 = anno_enriched(gp = gpar(col = "darkorange", lty = 1:n_row_group))), 
	  top_annotation_height = unit(2, "cm"), column_title = "CGI", axis_name = axis_name,
	  use_raster = TRUE, raster_quality = 2) 
	n_heatmap = n_heatmap + 1

	message("append epi heatmaps")
	eval(SNIPPET_APPEND_EPI_HEATMAP)

	message("draw heatmaps")
	eval(SNIPPET_DRAW_HEATMAP)

	return(invisible(NULL))

}

# == title
# Visualizing enrichment for epigenomic signals at TSS-CGIs
#
# == param
# -cr correalted regions
# -txdb transcriptome annotation which was used in `correlated_regions`
# -expr expression matrix which was used in `correlated_regions`
# -gf genomic features, a `GenomicRanges::GRanges` object
# -fdr_cutoff cutoff for fdr
# -meth_diff_cutoff cutoff for methylation difference. If there are no subgroup information or only one subgroup,
#             ``meth_IQR`` column is used for filtering. If there are more than one subgroups, ``meth_diameter``
#             column is used for filtering.
# -marks names of histone marks, should be supported in `chipseq_hooks`
# -type visualize negative correlated regions or positive correlated regions
# -extend base pairs extended to upstream and downstream
# -min_reduce base pairs for merging neighbouring regions
# -min_width minimal width of regions
# -nearest_by "tss" or "gene", how to connect genomic features to genes
# -expr_ha a `ComplexHeatmap::HeatmapAnnotation` class object.It is used for the expression heatmap
#
# == details
# There are several heatmaps visualize various signals enriched at genomic features. After annotate to genes,
# in the extended regions, each region can only have one gene.
#
# There are following heatmaps:
#
# - heatmap for gene expression
# - If ``cr`` is returned form `cr_enriched_heatmap`, there is a one column heatmap which
#   shows the k-means cluters genes belong to
# - heatmap for significant correlated regions
# - a point plot showing region length
# - heatmap for correlation between methylation and gene expression
# - heatmap for mean methylation
# - heatmap for metnylation difference
# - heatmap for correlation, mean signal and signal difference for histone marks
#
# If there are more than 12 heatmaps, they will be put into two pages.
# 
# Heatmaps are split into two sub-clusters by k-means clustering on the mean methylation matrix.
# If there are two subgroups in all samples, each subcluster are split by high expression/low expression
# in subgroup 1. In each high expression/low expression, rows are split by the k-means clusters calculated
# in `cr_enriched_heatmap`. Finally, rows are clustered by considering closeness of signals in the extended
# gf regions.
#
# == value
# no value is returned
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
cr_enriched_heatmap_at_genomic_features = function(cr, txdb, expr, gf,
	fdr_cutoff = 0.05, meth_diff_cutoff = 0.1, marks = NULL, type = "neg", extend = 5000,
	min_reduce = 1, min_width = 1000, nearest_by = "tss", expr_ha) {

	gm_extend = NULL
	n_subgroup = NULL
	subgroup = NULL
	subgroup_level = NULL
	expr_col_od = NULL
	km_col = NULL
	mat_mix = NULL

	eval(SNIPPET_ATTACH_CR_PARAM)

	message(qq("filter sigCR by fdr_cutoff = @{fdr_cutoff}, meth_diff_cutoff = @{meth_diff_cutoff}"))
	eval(SNIPPET_FILTER_SIG_CR)
	
	gf_origin = gf

	# overlap gf to gene extended regions
	message("extracting gene tss")
	gm = genes(txdb)
	gm = gm[gm$gene_id %in% unique(cr$gene_id)]
	tss = promoters(gm, upstream = 1, downstream = 0)
	gl = width(gm)
	g = gm
	strand(g) = "*"
	start(g) = start(g) - gm_extend[1]
	end(g) = end(g) + ifelse(length(gm_extend) == 2, gm_extend[2], gm_extend[1])
	start(g) = ifelse(start(g) > 1, start(g), 1)

	mtch = as.matrix(findOverlaps(g, gf))
	gf = gf[unique(mtch[, 2])]
	message(qq("@{length(gf)} are in extended gene regios."))

	if(min_reduce >= 0) {
		gf = reduce(gf, min = min_reduce)
		message(qq("@{length(gf)} remain after merging by min_reduce <= @{min_reduce}"))
	}

	gf = gf[width(gf) >= min_width, ]
	message(qq("@{length(gf)} regions remain after removing regions with width <= @{min_width}"))
	
	# find associated gene (by tss for by gene)
	if(nearest_by == "tss") {
		message("look for nearest tss")
		d = distanceToNearest(gf, tss, select = "all")
		subjectHits = subjectHits(d)
		ind = tapply(seq_len(length(d)), queryHits(d), function(ind) {
			ind[which.max(gl[subjectHits[ind]])][1]
		})
		d = d[as.vector(ind)]
		gf2 = gf[queryHits(d)]
		gf2$distanceToNearest = mcols(d)$distance
		gf2$nearestGene = gm$gene_id[subjectHits(d)]
		gf2$nearestGeneStrand = strand(tss[subjectHits(d)])
		l = gf2$nearestGeneStrand == "+" & start(gf2) < start(tss[subjectHits(d)]) |
		    gf2$nearestGeneStrand == "-" & end(gf2) > end(tss[subjectHits(d)])
		l = as.vector(l)
		gf2$distanceToNearest[l] = -gf2$distanceToNearest[l]
	} else {
		message("look for nearest gene body")
		d = distanceToNearest(gf, gm, select = "all")
		subjectHits = subjectHits(d)
		ind = tapply(seq_len(length(d)), queryHits(d), function(ind) {
			ind[which.max(gl[subjectHits[ind]])][1]
		})
		d = d[as.vector(ind)]
		gf2 = gf[queryHits(d)]
		gf2$distanceToNearest = mcols(d)$distance
		gf2$nearestGene = gm$gene_id[subjectHits(d)]
		gf2$nearestGeneStrand = strand(gm[subjectHits(d)])
		# if the gf is overlapped to gene body, how much of it is overlapped by the gene
		gg = pintersect(gf2, gm[gf2$nearestGene], resolve.empty = "max")
		gf2$overlapGenePercent = width(gg)/width(gf2)
		l = gf2$nearestGeneStrand == "+" & start(gf2) < start(gm[subjectHits(d)]) |
		    gf2$nearestGeneStrand == "-" & end(gf2) > end(gm[subjectHits(d)])
		l = as.vector(l)
		gf2$distanceToNearest[l] = -gf2$distanceToNearest[l]
	}
	message(qq("@{length(gf2)} regions remain after overlapping to genes"))

	strand(gf2) = strand(gm[gf2$nearestGene])
	names(gf2) = gf2$nearestGene
	gf2$gene_id = gf2$nearestGene

	target_ratio = mean(width(gf2))/(sum(extend) + mean(width(gf2)))
	target = gf2
	target_name = "gf"

	message("normalize to sigCR")
	eval(SNIPPET_NORMALIZE_SIG_CR)

	message("normalize to gf")
	mat_gf = normalizeToMatrix(gf_origin, target, extend = extend, target_ratio = target_ratio)
	
	if(length(target) < 10) {
		message("too few regions left, just quit.")
		return(invisible(NULL))
	}

	if(is.not.null(km)) {
		km = km[names(target)]
	}

	gf_width = width(gf2)
	gf_width[gf_width > quantile(gf_width, 0.99)] = quantile(gf_width, 0.99)
	width_anno_name = "gf_width"
	width_anno = gf_width

	message("normalize to epi signals")
	eval(SNIPPET_NORMALIZE_EPI_SIGNALS)

	########### prepare the order of rows and columns
	message("determing row and colum orders")
	eval(SNIPPET_ROW_ORDER_AND_COLUMN_ORDER)

	cor_col_fun = colorRamp2(c(-1, 0, 1), c("darkgreen", "white", "red"))
	meth_col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
	cr_col = c("-1" = "darkgreen", "0" = "white", "1" = "red")
	gf_col = colorRamp2(c(0, 1), c("white", "blue"))

	fixed_heatmap = 3
	eval(SNIPPET_HEATMAP_PAGE)

	if(missing(expr_ha)) {
		if(n_subgroup >= 2) {
			expr_ha = HeatmapAnnotation(subgroup = subgroup, col = list(subgroup = structure(rand_color(n_subgroup), names = subgroup_level)), 
				show_annotation_name = TRUE, annotation_name_side = "left", annotation_name_gp = gpar(fontsize = 10))
		}
	}

	######### construct heatmap list  ############
	## if there are too many heatmaps, they will be put in two pages.
	epi_color = c(brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))
	epi_mark_list = list()
	epi_title_list = list()

	n_heatmap = 0
	n_row_group = 2
	ht_list = Heatmap(expr, name = "expr", show_row_names = FALSE,
		show_column_names = FALSE, width = unit(5, "cm"), show_column_dend = FALSE, cluster_columns = FALSE, column_order = expr_col_od,
		top_annotation = expr_ha, column_title = "Expression", show_row_dend = FALSE,
		use_raster = TRUE, raster_quality = 2)
	n_heatmap = n_heatmap + 1
	
	if(is.not.null(km)) {
		ht_list = ht_list + Heatmap(km[names(target)], name = "km_groups", col = km_col, show_row_names = FALSE,
			width = unit(0.5, "cm"))
	}

	ht_list = ht_list + EnrichedHeatmap(mat_gf, name = "gf", col = gf_col,
	                top_annotation = HeatmapAnnotation(lines1 = anno_enriched(gp = gpar(col = "blue", lty = 1:n_row_group))), 
	                top_annotation_height = unit(2, "cm"), column_title = qq("gf"),
	                use_raster = TRUE, raster_quality = 2, combined_name_fun = NULL)
	n_heatmap = n_heatmap + 1

	ht_list = ht_list + EnrichedHeatmap(mat_mix, name = qq("@{type}CR"), col = cr_col,
	                top_annotation = HeatmapAnnotation(lines1 = anno_enriched(gp = gpar(neg_col = "darkgreen", pos_col = "red", lty = 1:n_row_group))), 
	                top_annotation_height = unit(2, "cm"), column_title = qq("sig@{type}CR"),
	                use_raster = TRUE, raster_quality = 2, combined_name_fun = NULL)
	n_heatmap = n_heatmap + 1

	ht_list = ht_list + rowAnnotation(foo_width = row_anno_points(width_anno, axis = TRUE, gp = gpar(col = "#00000040")),
	                width = unit(1, "cm"))

	message("append epi heatmaps")
	eval(SNIPPET_APPEND_EPI_HEATMAP)

	message("draw heatmaps")
	eval(SNIPPET_DRAW_HEATMAP)

	return(invisible(NULL))
}
jokergoo/epik documentation built on Sept. 28, 2019, 9:20 a.m.