candidate/mutation.R

######################################################################
####
#### functions specifically for files generated by CO pipeline
####
######################################################################

snv_indel_rainfall = function(pid, type = c("all", "intragenic"),
	..., species = "hg19", chromosome = c(paste0("chr", 1:22), c("chrX", "chrY")),
	title = paste0("snv and indel for ", pid)) {

	type = match.arg(type)[1]
	vcf_list = list(...)
	###################
	## rainfall plot for snv
	vcf_list = lapply(vcf_list, function(f) read.table(f, sep = "\t", header = TRUE, comment.char = "", stringsAsFactor = FALSE, check.names=FALSE))
	vcf_density_list = vector("list", length(vcf_list))
	names(vcf_density_list) = names(vcf_list)

	for(i in seq_along(vcf_list)) {

		vcf = vcf_list[[i]]
		if(type == "intragenic") {
			vcf = vcf[vcf$ANNOVAR_FUNCTION != "intergenic", ]
		}

		annotated_to_db = rep(FALSE, nrow(vcf))
		db_column = grep("DBSNP|1K_GENOMES|ENSEMBL_SNP|SANGER", colnames(vcf))
		for(cn in db_column) {
			annotated_to_db = annotated_to_db | vcf[[cn]] != "."
		}

		col = ifelse(annotated_to_db, "#FF000080", "#00000080")
		vcf = vcf[c(1, 2, 2)]; vcf[[3]] = vcf[[3]] + 1; 
		vcf[[1]] = ifelse(grepl("^chr", vcf[[1]]), vcf[[1]], paste("chr", vcf[[1]], sep = ""))
		vcf = cbind(vcf, col, stringsAsFactor = FALSE)
		vcf_list[[i]] = vcf

		vcf_density = lapply(split(vcf, vcf[[1]]), function(gr) {
	    gr2 = circlize::genomicDensity(gr[2:3], window.size = 1e6)
		    cbind(chr = rep(gr[1,1], nrow(gr2)), gr2)
		})
		vcf_density_list[[i]] = do.call("rbind", vcf_density)
	}

	n = length(vcf_list)
	gtrellis_layout(n_track = n*2, ncol = 5, byrow = FALSE, species = species, category = chromosome,
	    track_height = rep(c(2, 1), n),
	    track_ylim =  unlist(lapply(vcf_density_list, function(df) c(0, 8, 0, max(df[[4]])))),
	    track_ylab = rep(names(vcf_list), times = 2),
	    add_name_track = TRUE, add_ideogram_track = TRUE,
	    title = title)

	for(i in seq_along(vcf_list)) {
		# track for rainfall plots
		add_track(vcf_list[[i]], panel.fun = function(gr) {
			if(nrow(gr) > 2) {
			    df = circlize::rainfallTransform(gr[2:3])
			    x = (df[[1]] + df[[2]])/2
			    y = log10(df[[3]])
			    col = as.vector(gr[[4]])
			    grid.points(x, y, pch = 16, size = unit(4, "bigpts"), gp = gpar(col = col))
			}
		})

		# track for genomic density
		add_track(vcf_density_list[[i]], panel.fun = function(gr) {
		    x = (gr[[3]] + gr[[2]])/2
		    y = gr[[4]]
		    grid.polygon(c(x[1], x, x[length(x)]), 
		                 c(0, y, 0), default.units = "native", gp = gpar(fill = "red"))
		})
	}

	lg = legendGrob(c("Annotated", "others"), pch = 16, vgap = 0, gp = gpar(col = c("red", "black")))
	pushViewport(viewport(0.99, 0.01, width = grobWidth(lg), height = grobHeight(lg), just = c(1, 0)))
	grid.draw(lg)
	upViewport()

}


# for(pid in dir("/icgc/dkfzlsdf/analysis/hipo/hipo_047/results_per_pid_pcawg")) {

#  	png(qq("@{pid}_snv_indel.png"), width = 1200, height = 1200)
# 	snv_indel_rainfall(pid = pid,
# 		snv_vcf = qq("/icgc/dkfzlsdf/analysis/hipo/hipo_047/results_per_pid_pcawg/@{pid}/mpileup/snvs_@{pid}_somatic_snvs_conf_8_to_10.vcf"),
# 		indel_vcf = qq("/icgc/dkfzlsdf/analysis/hipo/hipo_047/results_per_pid_pcawg/@{pid}/platypus_indel/indel_@{pid}_somatic_indels_conf_8_to_10.vcf")
# 	)
# 	dev.off()
# }

# for(f in dir("/home/guz/project/analysis/hipo47/snv_indel/germline_snv/")) {
# 	pid = gsub("snvs_(.*?)_germline.*$", "\\1", f)
# 	png(qq("@{pid}_germline_snv.png"), width = 1200, height = 1200)
# 	snv_indel_rainfall(pid = pid,
# 		germline_snv = qq("/home/guz/project/analysis/hipo47/snv_indel/germline_snv/@{f}"),
# 		title = qq("germline snv for @{pid}"))
# 	dev.off()
# }



# load("/icgc/dkfzlsdf/analysis/B080/guz/gencode/gencode_v19_transcript_merged.RData")
# gn = sapply(gene_annotation$gtf, function(x) x$name)
# gene_len = sapply(gene_annotation$gtf, function(x) {
# 	sum(sapply(x$transcript[[1]]$exon, function(y) y$end - y$start + 1))
# })

# load("/icgc/dkfzlsdf/analysis/hipo/hipo_047/hipo_047_rnaseq_gencode19_expression.RData")

# rpkm = log2(expression$rpkm + 1)
# l = !duplicated(gn)
# rpkm = rpkm[l, ]
# rownames(rpkm) = gn[rownames(rpkm)]
# colnames(rpkm) = gsub("_tumor$", "", colnames(rpkm))

read_snv_mutation = function(files, samples, type = "exonic") {

	genes = list()
	for(i in seq_along(files)) {
		tb = read.table(files[i], stringsAsFactors = FALSE, sep = "\t", quote = "", header = TRUE, comment.char = "")
		tb = tb[!tb[[18]] %in% c("synonymous SNV"), ,drop = FALSE]
		tb = tb[tb[[16]] == type, ,drop = FALSE]
		g = tb[[17]]
		g = gsub("\\(.*\\)", "", g)
		g = unique(unlist(strsplit(g, ",")))

		genes[[i]] = unique(g)
	}

	all_genes = unique(unlist(genes))

	qqcat("@{length(all_genes)} genes\n")

	mat = matrix(0, nrow = length(all_genes), ncol = length(samples))
	rownames(mat) = all_genes
	colnames(mat) = samples
	for(i in seq_along(genes)) {
		mat[genes[[i]], i] = 1
	}

	return(mat)
}

snv_oncoprint = function(files, samples, filename = "Rplots.pdf") {

	all_types = c("exonic", "exonic;splicing", "splicing",
		"ncRNA_intronic", "intronic", "intergenic", "upstream", "downstream",
		"UTR3", "UTR5")
	for(type in all_types) {
		qqcat("drawing snv oncoprint for @{type}\n")
		
		mat = read_snv_mutation(files, samples, type = type)
		if(nrow(mat) == 0) next

		pdf(gsub("pdf$", paste0(type, ".pdf"), filename), width = 12, height = nrow(mat)/5+5)
		ht_list = oncoPrint(list(snv = mat), 
			alter_fun_list = list(snv = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.9, gp = gpar(fill = "red", col = NA))),
			col = c("snv" = "red"), show_column_names = TRUE, remove_empty_columns = FALSE,
			heatmap_legend_param = list(title = "alterations"))
		draw(ht_list, column_title = qq("@{nrow(mat)} genes having snvs (@{type})"))
		dev.off()

		if(nrow(mat) > 100) {
			file = qq(filename)
			pdf(qq("short_version_@{file}"), width = 12, height = 16)
			draw(ht_list, column_title = qq("@{nrow(mat)} genes having snvs (@{type})"))
			dev.off()
		}
	}
}


# files = scan(pipe("ls /icgc/dkfzlsdf/analysis/hipo/hipo_047/results_per_pid_pcawg/*/mpileup/*_somatic_snvs_conf_8_to_10.vcf"), what = "character")
# samples = gsub("^.*results_per_pid_pcawg/(.*?)/mpileup.*$", "\\1", files)

# snv_oncoprint(files, samples, filename = "nonsynonymous_somatic_snvs_oncoprint_@{type}.pdf", expr = rpkm)


# files = dir("/home/guz/project/analysis/hipo47/snv_indel/germline_snv/")
# files = paste0("/home/guz/project/analysis/hipo47/snv_indel/germline_snv/", files)
# samples = gsub("^.*snvs_(.*?)_germline.*$", "\\1", files)

# snv_oncoprint(files, samples, filename = "nonsynonymous_germline_snvs_oncoprint_@{type}.pdf", expr = rpkm)


read_indel_mutation = function(files, samples, type = "exonic") {

	genes = list()
	for(i in seq_along(files)) {
		tb = read.table(files[i], stringsAsFactors = FALSE, sep = "\t", quote = "", header = TRUE, comment.char = "")
		
		tb = tb[tb[[14]] == type, ,drop = FALSE]
		g = tb[[15]]
		g = gsub("\\(.*\\)", "", g)
		g = unique(unlist(strsplit(g, ",")))

		genes[[i]] = unique(g)
	}

	all_genes = unique(unlist(genes))

	qqcat("@{length(all_genes)} genes\n")

	mat = matrix(0, nrow = length(all_genes), ncol = length(samples))
	rownames(mat) = all_genes
	colnames(mat) = samples
	for(i in seq_along(genes)) {
		mat[genes[[i]], i] = 1
	}
	return(mat)
}

indel_oncoprint = function(files, samples, filename = "Rplots.pdf") {
	# indels
	all_types = c("exonic", "downstream", "intronic", "upstream", "UTR3", "UTR5")
	for(type in all_types) {
		qqcat("drawing indel oncoprint for @{type}\n")
		
		mat = read_indel_mutation(files, samples, type = type)
		if(nrow(mat) == 0) next

		pdf(gsub("pdf$", paste0(type, ".pdf"), filename), width = 12, height = nrow(mat)/5+5)
		ht_list = oncoPrint(list(indel = mat), 
			alter_fun_list = list(indel = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.9, gp = gpar(fill = "blue", col = NA))),
			col = c("indel" = "blue"), show_column_names = TRUE, remove_empty_columns = FALSE,
			heatmap_legend_param = list(title = "alterations"))
		draw(ht_list, column_title = qq("@{nrow(mat)} genes having indel (@{type})"))
		dev.off()

		if(nrow(mat) > 100) {
			file = qq(filename)
			pdf(qq("short_version_@{file}"), width = 12, height = 16)
			draw(ht_list, column_title = qq("@{nrow(mat)} genes having indel (@{type})"))
			dev.off()
		}
		#print(head(mat))
	}
}


snv_indel_oncoprint = function(files_snv, files_indel, samples, filename = "Rplots.pdf") {

	if(!is.null(names(files_snv))) {
		files_snv = files_snv[intersect(samples, names(files_snv))]
		samples_snv = names(files_snv)
	} else if(length(files_snv) != length(samples)) {
		stop("length of `files_snv` and `samples` differ")
	} else {
		samples_snv = samples
	}

	if(!is.null(names(files_indel))) {
		files_indel = files_indel[intersect(samples, names(files_indel))]
		samples_indel = names(files_indel)
	} else if(length(files_indel) != length(samples)) {
		stop("length of `files_indel` and `samples` differ")
	} else {
		samples_indel = samples
	}

	mat_snv = read_snv_mutation(files_snv, samples_snv)
	mat_indel = read_indel_mutation(files_indel, samples_indel)

	common_rn = union(rownames(mat_snv), rownames(mat_indel))
	common_cn = union(colnames(mat_snv), colnames(mat_indel))

	mat_snv2 = matrix(0, nrow = length(common_rn), ncol = length(common_cn))
	dimnames(mat_snv2) = list(common_rn, common_cn)
	mat_snv2[rownames(mat_snv), colnames(mat_snv)] = mat_snv

	mat_indel2 = matrix(0, nrow = length(common_rn), ncol = length(common_cn))
	dimnames(mat_indel2) = list(common_rn, common_cn)
	mat_indel2[rownames(mat_indel), colnames(mat_indel)] = mat_indel

	pdf(filename, width = 12, height = nrow(mat_snv2)/5+5)
	ht_list = oncoPrint(list(snv = mat_snv2, indel = mat_indel2),
		alter_fun_list = list(
	        snv = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.9, gp = gpar(fill = "red", col = NA)),
	        indel = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.4, gp = gpar(fill = "blue", col = NA))
	    ), col = c(snv = "red", indel = "blue"), show_column_names = TRUE, remove_empty_columns = FALSE,
	    heatmap_legend_param = list(title = "alterations"))
	draw(ht_list, column_title = qq("@{nrow(mat_snv2)} genes having functional snv/indel"))
	dev.off()
}

# files = scan(pipe("ls /icgc/dkfzlsdf/analysis/hipo/hipo_047/results_per_pid_pcawg/*/platypus_indel/*_somatic_indels_conf_8_to_10.vcf"), what = "character")
# samples = gsub("^.*results_per_pid_pcawg/(.*?)/platypus_indel.*$", "\\1", files)

# indel_oncoprint(files, samples, filename = "somatic_indels_@{type}_oncoprint.pdf", expr = rpkm)


coverage_hilbert_curve = function(pid, tumor_coverage_file, control_coverage_file, output_folder = ".") {

	chr.len = read.chromInfo()$chr.len

	tumor = read.table(tumor_coverage_file, stringsAsFactors = FALSE)
	tumor = GRanges(seqnames = ifelse(grepl("^chr", tumor[[1]]), tumor[[1]], paste0("chr", tumor[[1]])),
		            ranges = IRanges(start = tumor[[2]],
		               				end = tumor[[2]] + 999),
		            value = tumor[[3]])

	x = tumor$value; x = x[x > 0]
	col_fun = colorRamp2(quantile(log10(x), seq(0.1, 0.9, length = 11)), rev(brewer.pal(11, "Spectral")), space = "RGB")
	cm = ColorMapping(col_fun = col_fun)
	legend = color_mapping_legend(cm, plot = FALSE, title = "log10(cov)")
	pdf(qq("@{output_folder}/@{pid}_tumor_hilbert_curve_coverage.pdf"), width = 8, height = 8)
	for(chr in paste0("chr", c(1:22, "X", "Y"))) {
		qqcat("  tumor: @{chr}\n")
		hc = HilbertCurve(1, chr.len[chr], level = 10, mode = "pixel", title = qq("@{pid}, tumor: @{chr}"), legend = legend)
		gr = tumor[seqnames(tumor) == chr]
		gr = gr[gr$value > 0]
		hc_layer(hc, ranges(gr), col = col_fun(log10(gr$value)))
		hc_png(hc, file = qq("@{output_folder}/@{pid}_tumor_hilbert_curve_coverage_@{chr}.png"))
	}
	dev.off()


	control = read.table(control_coverage_file, stringsAsFactors = FALSE)
	control = GRanges(seqnames = ifelse(grepl("^chr", control[[1]]), control[[1]], paste0("chr", control[[1]])),
		            ranges = IRanges(start = control[[2]],
		               				end = control[[2]] + 999),
		            value = control[[3]])

	pdf(qq("@{output_folder}/@{pid}_control_hilbert_curve_coverage.pdf"), width = 8, height = 8)
	for(chr in paste0("chr", c(1:22, "X", "Y"))) {
		qqcat("  control: @{chr}\n")
		hc = HilbertCurve(1, chr.len[chr], level = 10, mode = "pixel", title = qq("@{pid}, control: @{chr}"), legend = legend)
		gr = control[seqnames(control) == chr]
		gr = gr[gr$value > 0]
		hc_layer(hc, ranges(gr), col = col_fun(log10(gr$value)))
		hc_png(hc, file = qq("@{output_folder}/@{pid}_control_hilbert_curve_coverage_@{chr}.png"))
	}
	dev.off()

	l = tumor$value > 0 | control$value > 0
	tumor = tumor[l]
	control = control[l]
	ratio = tumor
	q = quantile(c(tumor$value, control$value), 0.01)
	ratio$value = (tumor$value+q)/(control$value+q) * (sum(control$value)/sum(tumor$value))
	ratio$value = log2(ratio$value)
	col_fun = colorRamp2(c(-1, 0, 1), c("green", "white", "red"), space = "RGB")
	cm = ColorMapping(col_fun = col_fun)
	legend = color_mapping_legend(cm, plot = FALSE, title = "log2(ratio)")
	pdf(qq("@{output_folder}/@{pid}_coverage_ratio_hilbert_curve.pdf"), width = 8, height = 8)
	for(chr in paste0("chr", c(1:22, "X", "Y"))) {
		qqcat("  ratio: @{chr}\n")
		hc = HilbertCurve(1, chr.len[chr], level = 10, mode = "pixel", title = qq("@{pid}, log2(ratio): @{chr}"), legend = legend)
		gr = ratio[seqnames(ratio) == chr]
		hc_layer(hc, ranges(gr), col = col_fun(gr$value))
		hc_png(hc, file = qq("@{output_folder}/@{pid}_coverage_ratio_hilbert_curve_coverage_@{chr}.png"))
	}
	dev.off()

	#### put all chromosome in one plot ####
	cat("merging all chromosomes together.\n")
	sum_chr = 0
	s = numeric(length(ratio))
	e = numeric(length(ratio))
	for(chr in paste0("chr", c(1:22, "X", "Y"))) {
		l = as.character(seqnames(ratio)) == chr
		s[l] = start(ratio[l]) + sum_chr
		e[l] = end(ratio[l]) + sum_chr
		sum_chr = sum_chr + chr.len[chr]
	}

	ratio2 = GRanges(seqnames = rep("chr_merged", length(ratio)),
		            ranges = IRanges(start = round(s/1000),
		            	             end = round(e/1000)),
		            value = ratio$value)
	chr_cumsum = cumsum(chr.len[paste0("chr", c(1:22, "X", "Y"))])
	chr_ir = IRanges(start = round(c(1, chr_cumsum[-length(chr_cumsum)]+1)/1000)+1,
		             end = round(chr_cumsum/1000),
		             names = names(chr_cumsum))
	col_fun = colorRamp2(c(-1, 0, 1), c("green", "white", "red"), space = "RGB")
	cm = ColorMapping(col_fun = col_fun)
	legend = color_mapping_legend(cm, plot = FALSE, title = "log2(ratio)")
	pdf(qq("@{output_folder}/@{pid}_coverage_ratio_hilbert_curve_all_chromosomes.pdf"), width = 8, height = 8)
	hc = HilbertCurve(0, round(max(chr_cumsum)/1000), level = 10, mode = "pixel", 
		title = qq("@{pid}, log2(ratio): all chromosomes"), legend = legend)
	hc_layer(hc, ranges(ratio2), col = col_fun(ratio2$value), grid_line = 3)
	#hc_layer(hc, chr_ir, col = rep(c("#00000020", "#20202020", "#40404020"), length(chr_ir))[seq_len(length(chr_ir))])
	hc_png(hc, file = qq("@{output_folder}/@{pid}_coverage_ratio_hilbert_curve_coverage_all_chromosomes.png"))
	
	hc = HilbertCurve(0, round(max(chr_cumsum)/1000), level = 8)
	hc_rect(hc, chr_ir, gp = gpar(fill = rand_color(length(chr_ir), transparency = 0.5), col = NA))
	hc_text(hc, chr_ir, names(chr_ir), gp = gpar(fontsize = 16))
	dev.off()
}


visualize_defuse = function(files, prefix = "", ...) {

	chr = NULL
	lt = list()
	for(f in files) {
		df = read.table(f, stringsAsFactors = FALSE, header = TRUE, sep = "\t", quote = "")
		x = df[,c("gene_chromosome1", "gene_chromosome2", "gene_name1", "gene_name2")]

		x2 = list()
		x2$gene1 = ifelse(x$gene_name1 > x$gene_name2, x$gene_name2, x$gene_name1)
		x2$gene2 = ifelse(x$gene_name1 > x$gene_name2, x$gene_name1, x$gene_name2)

		chr_info = ifelse(x$gene_chromosome1 == x$gene_chromosome2, "intra_chr", "inter_chr")

		chr[paste(x2$gene1, x2$gene2, sep = "|")] = chr_info

		lt[[basename(f)]] = table(paste(x2$gene1, x2$gene2, sep = "|"))
	}


	all_gn = unique(unlist(lapply(lt, names)))
	mat = matrix(0, nr = length(all_gn), nc = length(lt))
	rownames(mat) = all_gn
	colnames(mat) = gsub(".gene_fusion.annotated.txt", "", names(lt))

	for(i in seq_along(lt)) {
		mat[names(lt[[i]]), i] = lt[[i]]
	}

	pdf(qq("@{prefix}_gene_fusion_stat.pdf"), width = 10, height = 12)
	ht_list = rowAnnotation(bar = row_anno_barplot(apply(mat, 1, function(x) sum(x != 0)), 
		axis_direction = "reverse", axis = TRUE, axis_side = "bottom", baseline = 0), width = unit(2, "cm")) +
	Heatmap(mat, col = c("white", "red"), show_row_dend = FALSE, show_row_names = FALSE,
		heatmap_legend_param = list(title = "#fusion"), ...) + 
	Heatmap(chr[rownames(mat)], col = c("red", "blue"), name = "chr_info", show_row_names = FALSE, width = unit(5, "mm"))
	draw(ht_list, column_title = qq("@{nrow(mat)} gene pairs"))
	dev.off()

	write.csv(cbind(mat, chr = chr[rownames(mat)]), file = qq("@{prefix}_gene_fusion_stat.csv"))

	foo = apply(mat, 1, function(x) sum(x>0))
	foo_col = colorRamp2(seq(min(foo), max(foo), length = 11), rev(brewer.pal(11, "Spectral")))

	nc = ceiling(sqrt(length(files)))
	nr = ceiling(length(files)/nc)
	pdf(qq("@{prefix}_defuse_circos.pdf"), width = 4*nc, height = 4*nr)
	par(mfrow = c(nr, nc))
	chr = NULL
	lt = list()
	for(f in files) {
		qqcat("plotting @{f}\n")
		df = read.table(f, stringsAsFactors = FALSE, header = TRUE, sep = "\t", quote = "")
		x = df[,c("gene_chromosome1", "gene_chromosome2", "gene_name1", "gene_name2")]

		#x = x[x$gene_name1 %in% gn & x$gene_name2 %in% gn, ]

		x2 = list()
		x2$gene1 = ifelse(x$gene_name1 > x$gene_name2, x$gene_name2, x$gene_name1)
		x2$gene2 = ifelse(x$gene_name1 > x$gene_name2, x$gene_name1, x$gene_name2)

		gn= paste(x2$gene1, x2$gene2, sep = "|")

		bed1 = data.frame(chr = paste0("chr", df$gene_chromosome1),
			              start = df$genomic_break_pos1,
			              end = df$genomic_break_pos1)
		bed2 = data.frame(chr = paste0("chr", df$gene_chromosome2),
			              start = df$genomic_break_pos2,
			              end = df$genomic_break_pos2)
		circos.initializeWithIdeogram()
		circos.genomicLink(bed1, bed2, col = foo_col(foo[gn]))
		text(1, 1, gsub(".gene_fusion.annotated.txt", "", basename(f)), adj = c(1, 1))
	}
	plot(1,1, type = "n", axes = FALSE, ann = FALSE)
	legend = unique(round(seq(min(foo), max(foo), length = 5)))
	legend("topleft", lty = 1, lwd = 2, col = foo_col(legend), legend = legend)
	text(1, 1, "gene fusion", cex = 2)
	dev.off()

}

visualize_crest_tx = function(files, prefix = "", somatic_only = TRUE) {

	chr = NULL
	lt = list()
	for(f in files) {
		df = read.table(f, stringsAsFactors = FALSE, header = TRUE, sep = "\t", quote = "", comment.char = "")
		if(somatic_only) df = df[df$SOMATIC_GERMLINE_CLASSIFICATION == "somatic", , drop = FALSE]
		x = df[,c("X.CHROM_FROM", "CHROM_TO", "POS_FROM", "POS_TO")]
		
		x$GENE_LEFT = paste(df$X.CHROM_FROM, round(df$POS_FROM, -3), sep = "_")
		x$GENE_RIGHT = paste(df$CHROM_TO, round(df$POS_TO, -3), sep = "_")

		x2 = list()
		x2$gene1 = ifelse(x$GENE_LEFT > x$GENE_RIGHT, x$GENE_RIGHT, x$GENE_LEFT)
		x2$gene2 = ifelse(x$GENE_LEFT > x$GENE_RIGHT, x$GENE_LEFT, x$GENE_RIGHT)

		chr_info = ifelse(x[["X.CHROM_FROM"]] == x$CHROM_TO, "intra", "inter")

		chr[paste(x2$gene1, x2$gene2, sep = "|")] = chr_info

		lt[[basename(f)]] = table(paste(x2$gene1, x2$gene2, sep = "|"))
	}


	all_gn = unique(unlist(lapply(lt, names)))
	mat = matrix(0, nr = length(all_gn), nc = length(lt))
	rownames(mat) = all_gn
	colnames(mat) = gsub(".TX.annotated", "", names(lt))

	for(i in seq_along(lt)) {
		mat[names(lt[[i]]), i] = lt[[i]]
	}

	pdf(qq("@{prefix}_crest_tx_stat.pdf"), width = 8, height = 12)
	ht_list = rowAnnotation(bar = row_anno_barplot(apply(mat, 1, function(x) sum(x != 0)), 
		axis_direction = "reverse", axis = TRUE, axis_side = "bottom"), width = unit(2, "cm")) +
	Heatmap(mat, col = c("white", "red"), show_row_dend = FALSE, show_row_names = FALSE,
		heatmap_legend_param = list(title = "#TX")) + 
	Heatmap(chr[rownames(mat)], col = c("red", "blue"), name = "chr_info", show_row_names = FALSE, width = unit(5, "mm"))
	draw(ht_list, column_title = qq("@{nrow(mat)} pos pairs"))
	dev.off()

	write.csv(cbind(mat, chr = chr[rownames(mat)]), file = qq("@{prefix}_crest_tx_stat.csv"))


	foo = apply(mat, 1, function(x) sum(x>0))
	foo_col = colorRamp2(seq(min(foo), max(foo), length = length(files)), brewer.pal(length(files), "Spectral"))

	nc = ceiling(sqrt(length(files)))
	nr = ceiling(length(files)/nc)

	nc = ceiling(sqrt(length(files)))
	nr = ceiling(length(files)/nc)
	pdf(qq("@{prefix}_crest_tx_circos.pdf"), width = 4*nc, height = 4*nr)
	par(mfrow = c(nr, nc))
	chr = NULL
	lt = list()
	for(f in files) {
		df = read.table(f, stringsAsFactors = FALSE, header = TRUE, sep = "\t", quote = "", comment.char = "")
		if(somatic_only) df = df[df$SOMATIC_GERMLINE_CLASSIFICATION == "somatic", , drop = FALSE]
		x = df[,c("X.CHROM_FROM", "CHROM_TO", "POS_FROM", "POS_TO")]
		
		x$GENE_LEFT = paste(df$X.CHROM_FROM, round(df$POS_FROM, -3), sep = "_")
		x$GENE_RIGHT = paste(df$CHROM_TO, round(df$POS_TO, -3), sep = "_")

		x2 = list()
		x2$gene1 = ifelse(x$GENE_LEFT > x$GENE_RIGHT, x$GENE_RIGHT, x$GENE_LEFT)
		x2$gene2 = ifelse(x$GENE_LEFT > x$GENE_RIGHT, x$GENE_LEFT, x$GENE_RIGHT)

		gn= paste(x2$gene1, x2$gene2, sep = "|")

		bed1 = data.frame(chr = paste0("chr", df$X.CHROM_FROM),
			              start = df$POS_FROM,
			              end = df$POS_FROM)
		bed2 = data.frame(chr = paste0("chr", df$CHROM_TO),
			              start = df$POS_TO,
			              end = df$POS_TO)

		l = bed1$chr %in% paste0("chr", c(1:22, c("X", "Y"))) & bed2$chr %in% paste0("chr", c(1:22, c("X", "Y")))
		circos.initializeWithIdeogram()
		circos.genomicLink(bed1[l, ], bed2[l, ], col = foo_col[foo[gn[l]]])
		text(1, 1, gsub(".TX.annotated", "", basename(f)), adj = c(1, 1))
	}
	plot(1,1, type = "n", axes = FALSE, ann = FALSE)
	legend = unique(round(seq(min(foo), max(foo), length = 5)))
	legend("topleft", lty = 1, lwd = 2, col = foo_col(legend), legend = legend)
	text(1, 1, "crest tx", cex = 2)
	dev.off()
}

visualize_crest_deldupinv = function(files, prefix = "") {

	pdf(qq("@{prefix}_crest_deldupinv.pdf"), width = 10, height = 8)
	type_col = c("DEL" = "green", "DUP" = "red", "INV" = "blue")
	gtrellis_layout(ncol = 4, byrow = FALSE, n_track = 1, track_ylim = c(0, 6), add_name_track = TRUE, 
		add_ideogram_track = TRUE, title = "DEL(green)_DUP(red)_INV(blue)")
	for(i in seq_along(files)) {
		df = read.table(files[i], stringsAsFactors = FALSE, header = TRUE, sep = "\t", quote = "", comment.char = "")
		df = df[df$SOMATIC_GERMLINE_CLASSIFICATION == "somatic", , drop = FALSE]

		bed = data.frame(chr = paste0("chr", df[[1]]),
			             start = df$POS,
			             end = df$END,
			             type = df$SV_TYPE)
		
		add_track(NULL, track = 2, panel.fun = function(gr) {
			grid.lines(get_cell_meta_data("original_xlim"), unit(c(i, i), "native"), default.unit = "native", gp = gpar(lty = 2, col = "#CCCCCC"))
			if(get_cell_meta_data("name") == "chrY") {
				grid.text(gsub(".DELDUPINV.annotated", "", basename(files[i])), get_cell_meta_data("original_xlim")[2], i, 
					default.unit = "native", just = "left", gp = gpar(fontsize = 8))
			}
		})

		add_track(bed, track = 2, panel.fun = function(gr) {
			
			l = convertWidth(unit(gr$end - gr$start, "native"), "mm", valueOnly = TRUE) < 1
			if(sum(l)) {
				grid.points((gr$start[l] + gr$end[l])/2, rep(i, sum(l)), default.unit = "native", pch = 16,
					gp = gpar(col = type_col[gr$type[l]]), size = unit(1, "mm"))
			}
			if(sum(!l)) {
				grid.segments(gr$start[!l], rep(i, nrow(gr[!l])), gr$end[!l], rep(i, nrow(gr[!l])), default.unit = "native",
					gp = gpar(lwd = 4, col = type_col[gr$type[!l]]))
			}
		})

	}
	dev.off()	
}
jokergoo/cotools documentation built on May 19, 2019, 6:24 p.m.