R/genomic_region_subgroup_specificity.R

Defines functions common_regions subgroup_specific_genomic_regions print.subgroup_specific_genomic_regions heatmap_subgroup_specificity

Documented in common_regions heatmap_subgroup_specificity print.subgroup_specific_genomic_regions subgroup_specific_genomic_regions

# == title
# Find common genomic regions across samples
#
# == param
# -gr_list a list of `GenomicRanges::GRanges`
# -min_recurrency minimal cross-sample recurrency at each base for the common regions
# -gap gap to merge common regions, pass to `reduce2`
# -max_gap maximum gap for merging common regions, pass to `reduce2`
# -min_width minimal width for the common regions. It can be used to remove a lot of 
#        very short regions.
#
# == details
# A common region is defined as a region which is recurrent in at least k samples. The process of 
# finding common regions are as follows:
#
# - merge regions in all samples into one object
# - calculate coverage which is the recurrency, removed regions with recurrency less than the cutoff
# - merge the segments and remove regions which are too short
#
# Please note in each sample, regions should not be overlapped.
#
# == value
# A `GenomicRanges::GRanges` object contains coordinates of common regions. The columns in meta data
# are percent of the common region which is covered by regions in every sample.
#
# == seealso
# The returned variable can be sent to `subgroup_specific_genomic_regions` to find subgroup specific regions.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr_list = list(
#     gr1 = GRanges(seqnames = "chr1", ranges = IRanges(1, 8)),
#     gr2 = GRanges(seqnames = "chr1", ranges = IRanges(3, 9)),
#     gr3 = GRanges(seqnames = "chr1", ranges = IRanges(2, 7)),
#     gr4 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 6), c(4, 10))),
#     gr5 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 9), c(3, 10)))
# )
# common_regions(gr_list, min_recurrency = 4, gap = bp(1))
# common_regions(gr_list, min_recurrency = 4, gap = 0.5)
common_regions = function(gr_list, min_recurrency = floor(length(gr_list)/4), 
	gap = bp(1000), max_gap = Inf, min_width = 0) {
	
	# merge all gr into one data frame
	gr_merge = gr_list[[1]]
	for(i in seq_along(gr_list)[-1]) {
		gr_merge = c(gr_merge, gr_list[[i]])

		message(qq("merging @{i}/@{length(gr_list)} samples"))
	}
	
	message("calculating cross-sample coverage")
	cov = coverage(gr_merge)
	gr_cov = as(cov, "GRanges")
	
	gr_common = reduce2(gr_cov[gr_cov$score >= min_recurrency], gap = gap, max_gap = max_gap)
	mcols(gr_common) = NULL

	if(length(gr_common) == 0) {
		message("No common region has been detected.")
		return(NULL)
	}
	
	gr_common = gr_common[ width(gr_common) >= min_width ]

	message(qq("there are @{length(gr_common)} common regions"))
	
	# calculate the percentage for each CR covered by gr_list
	message("overlapping `gr_list` to common regions.")
	gr_common = annotate_to_genomic_features(gr_common, gr_list, type = "percent")
	
	return(gr_common)
}


# == title
# Find subgroup specific regions
#
# == param
# -gr a `GenomicRanges::GRanges` object generated by `common_regions`
# -subgroup subgroup information which corresponds to samples in ``gr``
# -present how to define a common region that is specifically present in one subgroup.
#          The subgroup specificity is calculated based on the precent matrix stored in ``gr``.
#          For each subgroup which is defined in ``subgroup``,
#          if ``present`` is a single numeric value, it means the mean value should be larger than it.
#          It can also be a function for which the input is the vector of percent in corresponding subgroup
#          and the output should be a single logical value.
# -absent how to define a common region is specifically absent in one subgroup. Format is same as ``present``
# -type It uses a string containing 1 and 0 to represent types of specificity. E.g. '1100' means
#       present in subgroup 1 and 2 while absent in subgroup 3 and 4. By default, it will output
#       all combination of subgroup specificities.
#
# == value
# This function splits the original ``gr`` into a list in which each element
# contains regions corresponding to different subgroup specificity.
#
# == seealso
# The returned value can be sent to `heatmap_subgroup_specificity` to visualize the specificity.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# \dontrun{
# # following two calls are basically the same
# subgroup_specific_genomic_regions(gr, subgroup, 
#     present = 0.7, absent = 0.3)
# subgroup_specific_genomic_regions(gr, subgroup, 
#     present = function(x) mean(x) >= 0.7, 
#     absent = function(x) mean(x) <= 0.3)
# }
# NULL
subgroup_specific_genomic_regions = function(gr, subgroup, present = 0.7, absent = 0.3, type = NULL) {

	if(length(subgroup) != ncol(mcols(gr))) {
		stop("Length of `subgroup` should be same as number of samples in `gr`\n")
	}

	level = unique(as.character(subgroup))
	if(is.null(type)) {
		ss = expand.grid(rep(list(0:1), length(level)))
		n_ss = nrow(ss)
		ss = ss[-c(1, n_ss), , drop = FALSE]
		type = apply(ss, 1, paste, collapse = "")
	} else {
		ss = as.matrix(as.data.frame(strsplit(type, "")))
		ss = t(ss)
		dm = dim(ss)
		ss = as.integer(ss)
		dim(ss) = dm
		
		if(ncol(ss) != length(level)) {
			stop("`nchar` of `type` should be same as the number of levels.\n")
		}
	}
	
	mat = mcols(gr)[, grepl("^overlap_to_", colnames(mcols(gr)))]
	mat = as.matrix(mat)
	
	gr_list = vector("list", length(type))
	names(gr_list) = type
	
	for(i in seq_along(type)) {
		message(qq("extracting subgroup: '@{type[i]}', "), appendLF = FALSE)
		l = sapply(seq_len(nrow(mat)), function(k) {
			x = mat[k, ]
			for(j in seq_along(level)) {
				x2 = x[subgroup == level[j]]
				if(ss[i, j]) {  # present
					if(is.function(present)) {
						l2 = present(x2)
					} else {
						l2 = mean(x2) >= present
					}
				} else {  # absent
					if(is.function(absent)) {
						l2 = absent(x2)
					} else {
						l2 = mean(x2) <= absent
					}
				}
				
				if(!l2) {
					return(FALSE)
				}
			}
			return(TRUE)
		})
		message(qq("@{sum(l)} regions"))
		
		gr_list[[i]] = gr[l]
	}
	
	attr(gr_list, "subgroup") = subgroup

	class(gr_list) = c("subgroup_specific_genomic_regions", "list")

	return(gr_list)
}

# == title
# Print subgroup_specific_genomic_regions class object
#
# == param
# -x a `subgroup_specific_genomic_regions` class object
# -... additional arguments
#
# == value
# no value is returned
#
# == author
# ZUguang Gu <z.gu@dkfz.de>
print.subgroup_specific_genomic_regions = function(x, ...) {
	subgroup = attr(x, "subgroup")
	level = unique(subgroup)
	qqcat("Subgroups: @{paste(level, collapse = ', ')} with following specificity patterns:\n")

	len = sapply(x, length)
	for(nm in names(len)) {
		qqcat("- @{nm}: @{len[nm]} regions\n")
	}

	nm = names(len)
	l = grepl("0", nm) & grepl("1", nm) & len > 0
	if(sum(l) == 0) {
		l = len > 0
	}
	i = which(l)[1]

	qqcat("The digits mean, e.g. @{nm[i]}, ")
	digits = strsplit(nm[i], "")[[1]]
	for(k in seq_along(digits)) {
		if(digits[k] == 0) {
			qqcat("absent in @{level[k]}@{ifelse(k == length(digits), '', ', ')}")
		} else {
			qqcat("present in @{level[k]}@{ifelse(k == length(digits), '', ', ')}")
		}
	}
	qqcat("\n")
}

# == title
# Subset the subgroup_specific_genomic_regions class object
#
# == param
# -x a `subgroup_specific_genomic_regions` object
# -i index
#
# == value
# No value is returned
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
"[.subgroup_specific_genomic_regions" = function(x, i) {
	subgroup = attr(x, "subgroup")
	class(x) = NULL
	y = x[i]
	attr(y, "subgroup") = subgroup
	class(y) = c("subgroup_specific_genomic_regions", "list")
	y
}

# == title
# Heatmap for visualizing subgroup specific genomic regions 
#
# == param
# -gr_list a list of `GenomicRanges::GRanges` objects which is generated by `subgroup_specific_genomic_regions`
# -genomic_features Genomic features that are used for annotating regions in ``gr_list``, it should be a list or a single `GenomicRanges::GRanges` objects
# -ha the default column annotation is the subgroups used in `subgroup_specific_genomic_regions` step.
#      The value should be a `ComplexHeatmap::HeatmapAnnotation-class` object.
# -... pass to `ComplexHeatmap::draw,HeatmapList-method`
#
# == details
# Columns are clustered within each subgroup and rows are clustered for each type of specificity.
#
# == value
# A `ComplexHeatmap::HeatmapList-class` object
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
heatmap_subgroup_specificity = function(gr_list, genomic_features = NULL,
	ha = HeatmapAnnotation(subgroup = attr(gr_list, "subgroup")), ...) {
		
	if(!inherits(gr_list, "subgroup_specific_genomic_regions")) {
		stop("`gr_list` should come from `subgroup_specific_genomic_regions()`.")
	}
	subgroup = attr(gr_list, "subgroup")

	# column name
	cn = gsub("^overlap_to_", "", grep("^overlap_to_", colnames(mcols(gr_list[[1]])), value = TRUE))

	level = unique(subgroup)

	mat = NULL
	gr_combine = GRanges()
	type = NULL

	gr_list_name = names(gr_list)
	for(i in seq_along(gr_list)) {
		if(length(gr_list[[i]]) == 0) next
		
		sub_matrix = mcols(gr_list[[i]])[, grepl("^overlap_to_", colnames(mcols(gr_list[[i]])))]
		sub_matrix = as.matrix(sub_matrix)
		colnames(sub_matrix) = cn
		
		if(length(gr_list[[i]] == 1)) {
			mat = rbind(mat, sub_matrix)
			gr_combine = c(gr_combine, gr_list[[i]])
			type = c(type, rep(gr_list_name[i], length(gr_list[[i]])))
		} else {
			# cluster rows for each sub-matrix
			message(qq("cluster rows for sub-matrix @{gr_list_name[i]} (@{length(gr_list[[i]])} rows)."))
			if(nrow(sub_matrix) == 1) {
				rclust = list(order = 1)
			} else {
				rclust = hclust(dist(sub_matrix))
			}
			mat = rbind(mat, sub_matrix[rclust$order, , drop = FALSE])
			gr_combine = c(gr_combine, gr_list[[i]][rclust$order])
			type = c(type, rep(gr_list_name[i], length(gr_list[[i]])))
		}
	}
	mcols(gr_combine) = NULL

	# globally cluster inside each subgroup
	mat2 = NULL
	for(le in level) {
		message(qq("cluster columns for samples in subgroup @{le}."))
		m = mat[, subgroup == le, drop = FALSE]
		if(ncol(m) > 1) {
			cclust = hclust(dist(t(m)))
			mat2 = cbind(mat2, m[, cclust$order])
		} else {
			mat2 = cbind(mat2, m)
		}
	}

	type_level = unique(type)
	n_type = length(type_level)
	if(n_type <= 9) {
		type_col = brewer.pal(9, name = "Set1")[1:n_type]
	} else if(n_type <= 9+7) {
		type_col = c(brewer.pal(9, name = "Set1"), brewer.pal(7, "Set2"))[1:n_type]
	} else {
		type_col = c(brewer.pal(9, name = "Set1"), brewer.pal(7, "Set2"), rand_color(n_type - 16))
	}
	names(type_col) = type_level
	ht_list = Heatmap(type, name = "type", col = type_col, width = unit(4, "mm"))
	
	ht_list = ht_list + Heatmap(mat2, name = "pct", col = colorRamp2(c(0, 1), c("white", "blue")), 
		top_annotation = ha,
		cluster_rows = FALSE, cluster_columns = FALSE, split = type,
		use_raster = nrow(mat2) > 2500, raster_quality = 2)
		
	len = width(gr_combine)
	len[len > quantile(len, 0.95)] = quantile(len, 0.95)
	ht_list = ht_list + rowAnnotation(length = row_anno_points(len, axis = TRUE, axis_side = "top", size = unit(1, "mm"), gp = gpar(col = "#00000040")), width = unit(2, "cm"),
		show_annotation_name = TRUE)
	
	if(!is.null(genomic_features)) {
		if(inherits(genomic_features, "GRanges")) {
			gf_name = deparse(substitute(genomic_features))
			genomic_features = list(genomic_features)
			names(genomic_features) = gf_name

		}
		gr_combine = annotate_to_genomic_features(gr_combine, genomic_features, prefix = "")
		
		ht_list = ht_list + Heatmap(as.matrix(mcols(gr_combine)), name = "anno", col = colorRamp2(c(0, 1), c("white", "orange")),
			cluster_columns = FALSE, use_raster = nrow(mat2) > 2500, raster_quality = 2,
			width = unit(4, "mm")*length(genomic_features))
	}

	message(qq("generating heatmap, (nrow = @{nrow(mat2)}, ncol = @{ncol(mat2)})"))
	draw(ht_list, ...)
}
jokergoo/epik documentation built on Sept. 28, 2019, 9:20 a.m.