R/genomic_utils.R
In circlize: Circular Visualization

Documented in cytoband.col generateRandomBed read.chromInfo read.cytoband

# == title
# Read/parse cytoband data from a data frame/file/UCSC database
#
# == param
# -cytoband Path of the cytoband file or a data frame that already contains cytoband data
# -species  Abbreviations of species. e.g. hg19 for human, mm10 for mouse. If this
#          value is specified, the function will download ``cytoBand.txt.gz`` from
#          UCSC website automatically.
# -chromosome.index subset of chromosomes, also used to reorder chromosomes.
# -sort.chr Whether chromosome names should be sorted (first sort by numbers then by letters).
#           If ``chromosome.index`` is set, this argument is enforced to ``FALSE``
#
# == details
# The function read the cytoband data, sort the chromosome names and calculate the length of each chromosome.
# By default, it is human hg19 cytoband data.
#
# You can find the data structure of the cytoband data from https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/cytoBand.txt.gz
#
# == values
# -``df``         Data frame for cytoband data (rows are sorted if ``sort.chr`` is set to ``TRUE``)
# -``chromosome`` Sorted chromosome names
# -``chr.len``    Length of chromosomes. Orders are same as ``chromosome``
#
# == example
# data = read.cytoband(species = "hg19")
# data = read.cytoband(cytoband = system.file(package = "circlize", "extdata", "cytoBand.txt"))
# cytoband = read.table(system.file(package = "circlize", "extdata", "cytoBand.txt"),
#     colClasses = c("character", "numeric", "numeric", "character", "character"), sep = "\t")
# data = read.cytoband(cytoband = cytoband)
read.cytoband = function(
	cytoband = system.file(package = "circlize", "extdata", "cytoBand.txt"),
	species = NULL,
    chromosome.index = usable_chromosomes(species),
    sort.chr = TRUE) {

	# this function should also take charge of the order of chromosome
	if(!is.null(chromosome.index)) sort.chr = FALSE

	# `species` is prior to `cytoband`
	if(!is.null(species)) {
		url = paste("https://hgdownload.cse.ucsc.edu/goldenPath/", species, "/database/cytoBand.txt.gz", sep = "")
		cytoband = paste0(circos.par("__tempdir__"), "/", species, "_cytoBand.txt.gz")
		if(!file.exists(cytoband)) {
			e = try(suppressWarnings(download.file(url, destfile = cytoband, quiet = TRUE)), silent = TRUE)
			if(inherits(e, "try-error")) {
				if(file.exists(cytoband)) file.remove(cytoband)
				cytoband_list = readRDS(system.file("extdata", "cytoband_list.rds", package = "circlize"))
				if(species %in% names(cytoband_list)) {
					cytoband = cytoband_list[[species]]
				} else {
					stop_wrap("It seems your species name is wrong or UCSC does not provide cytoband data for your species or internet connection was interrupted. If possible, download cytoBand file from ", url, " and use `read.cytoband(file)`.")
				}
			}
		}
	}

	if(is.data.frame(cytoband)) {
		d = cytoband
	} else {
		if(grepl("\\.gz$", cytoband)) {
			cytoband = gzfile(cytoband)
		}

		d = read.table(cytoband, colClasses = c("character", "numeric", "numeric", "character", "character"), sep = "\t", stringsAsFactors = FALSE)
	}

	# now cytoband data is normalized to `d`
	if(!is.null(chromosome.index)) {
		chromosome.index = intersect(chromosome.index, unique(as.vector(d[[1]])))
		d = d[d[[1]] %in% chromosome.index, , drop = FALSE]
		if(is.factor(d[[1]])) {
			# re-factor because levels may decrease due to `chromosome.index`
			levels(d[[1]]) = intersect(chromosome.index, unique(as.vector(d[[1]])))  # ensures the remaining order is same as in `chromosome.index`
		}
	}

	if(nrow(d) == 0) {
		stop_wrap("Cannot find any chromosome. It is probably related to your chromosome names having or not having 'chr' prefix.")
	}

	if(is.null(chromosome.index)) {
		if(is.factor(d[[1]])) {
			chromosome = levels(d[[1]])
		} else {
			chromosome = unique(d[[1]])
		}
	} else {
		chromosome = chromosome.index
	}

	if(sort.chr) {
		chromosome = sort_chr(chromosome)
	}

	chr.len = NULL
	dnew = NULL
	for(chr in chromosome) {
		d2 = d[d[[1]] == chr, ]
		chr.len = c(chr.len, max(d2[, 3]))
		dnew = rbind(dnew, d2)
	}
	names(chr.len) = chromosome

	return(list(df = dnew, chromosome = chromosome, chr.len = chr.len))
}

# == title
# Read/parse chromInfo data from a data frame/file/UCSC database
#
# == param
# -chromInfo Path of the chromInfo file or a data frame that already contains chromInfo data
# -species  Abbreviations of species. e.g. hg19 for human, mm10 for mouse. If this
#          value is specified, the function will download ``chromInfo.txt.gz`` from
#          UCSC website automatically.
# -chromosome.index subset of chromosomes, also used to reorder chromosomes.
# -sort.chr Whether chromosome names should be sorted (first sort by numbers then by letters).
#           If ``chromosome.index`` is set, this argument is enforced to ``FALSE``
#
# == details
# The function read the chromInfo data, sort the chromosome names and calculate the length of each chromosome.
# By default, it is human hg19 chromInfo data.
#
# You can find the data structure for the chromInfo data from https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/chromInfo.txt.gz
#
# == values
# -``df``         Data frame for chromInfo data (rows are sorted if ``sort.chr`` is set to ``TRUE``)
# -``chromosome`` Sorted chromosome names
# -``chr.len``    Length of chromosomes. Order are same as ``chromosome``
#
# == example
# data = read.chromInfo(species = "hg19")
# data = read.chromInfo(chromInfo = system.file(package = "circlize", "extdata", "chromInfo.txt"))
# chromInfo = read.table(system.file(package = "circlize", "extdata", "chromInfo.txt"),
#     colClasses = c("character", "numeric"), sep = "\t")
# data = read.chromInfo(chromInfo = chromInfo)
read.chromInfo = function(
	chromInfo = system.file(package = "circlize", "extdata", "chromInfo.txt"),
	species = NULL,
    chromosome.index = usable_chromosomes(species),
    sort.chr = TRUE) {

	# this function should also take charge of the order of chromosome
	if(!is.null(chromosome.index)) sort.chr = FALSE

	if(!is.null(species)) {
		url = paste("https://hgdownload.cse.ucsc.edu/goldenPath/", species, "/database/chromInfo.txt.gz", sep = "")
		chromInfo = paste0(circos.par("__tempdir__"), "/", species, "_chromInfo.txt.gz")
		if(!file.exists(chromInfo)) {
			e = try(suppressWarnings(download.file(url, destfile = chromInfo, quiet = TRUE)), silent = TRUE)
			if(inherits(e, "try-error")) {
				if(file.exists(chromInfo)) file.remove(chromInfo)
				chrom_info_list = readRDS(system.file("extdata", "chrom_info_list.rds", package = "circlize"))
				if(species %in% names(chrom_info_list)) {
					chromInfo = chrom_info_list[[species]]
				} else {
					stop_wrap("It seems your species name is wrong or UCSC does not provide chromInfo data for your species or internet connection was interrupted. If possible, download chromInfo file from ", url, " and use `read.chromInfo(file)`.")
				}
			}
		}
	}

	if(is.data.frame(chromInfo)) {
		d = chromInfo
	} else {
		if(grepl("\\.gz$", chromInfo)) {
			chromInfo = gzfile(chromInfo)
		}

		d = read.table(chromInfo, colClasses = c("character", "numeric"), sep = "\t", stringsAsFactors = FALSE)[1:2]  # only first two columns
	}
	rownames(d) = d[[1]]

	if(!is.null(chromosome.index)) {
		chromosome.index = intersect(chromosome.index, unique(as.vector(d[[1]])))
		d = d[d[[1]] %in% chromosome.index, , drop = FALSE]
		if(is.factor(d[[1]])) {
			# re-factor because levels may decrease due to `chromosome.index`
			levels(d[[1]]) = intersect(chromosome.index, unique(as.vector(d[[1]])))  # ensures the remaining order is same as in `chromosome.index`
		}
	}

	if(nrow(d) == 0) {
		stop_wrap("Cannot find any chromosome. It is probably related to your chromosome names having or not having 'chr' prefix.")
	}

	# now cytoband data is normalized to `d`
	if(!is.null(chromosome.index)) {
		chromosome.index = intersect(chromosome.index, unique(as.vector(d[[1]])))
		d = d[d[[1]] %in% chromosome.index, , drop = FALSE]
		if(is.factor(d[[1]])) {
			# re-factor because levels may decrease due to `chromosome.index`
			levels(d[[1]]) = intersect(chromosome.index, unique(as.vector(d[[1]])))  # ensures the remaining order is same as in `chromosome.index`
		}
	}

	if(is.null(chromosome.index)) {
		if(is.factor(d[[1]])) {
			chromosome = levels(d[[1]])
		} else {
			chromosome = unique(d[[1]])
		}
	} else {
		chromosome = chromosome.index
	}

	if(sort.chr) {
		chromosome = sort_chr(chromosome)
	}

	dnew = d[chromosome, , drop = FALSE]
	chr.len = dnew[[2]]
	names(chr.len) = chromosome

	dnew = data.frame(chr = chromosome, start = rep(0, nrow(dnew)), end = dnew[[2]], stringsAsFactors = FALSE)

	return(list(df = dnew, chromosome = chromosome, chr.len = chr.len))
}


usable_chromosomes = function(species) {
	if(is.null(species)) return(NULL)

	switch(gsub("\\d+$", "", species),
		"hg" = paste0("chr", c(1:22, "X", "Y")),
		"mm" = paste0("chr", c(1:19, "X", "Y")),
		"rn" = paste0("chr", c(1:20, "X", "Y")),
		"dm" = paste0("chr", c("2L", "2R", "3L", "3R", "4", "X")),
		"ce" = paste0("chr", c("I", "II", "III", "IV", "V", "X")),
		"sacCer" = paste0("chr", c("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", "XV")),
		NULL
	)
}

# == title
# Assign colors to cytogenetic band (hg19) according to the Giemsa stain results
#
# == param
# -x A vector containing the Giemsa stain results
#
cytoband.col = function(x) {
	x = as.vector(x)
	col.panel = c("gpos100" = rgb(0, 0, 0, maxColorValue = 255),
                  "gpos"    = rgb(0, 0, 0, maxColorValue = 255),
                  "gpos75"  = rgb(130, 130, 130, maxColorValue = 255),
                  "gpos66"  = rgb(160, 160, 160, maxColorValue = 255),
                  "gpos50"  = rgb(200, 200, 200, maxColorValue = 255),
                  "gpos33"  = rgb(210, 210, 210, maxColorValue = 255),
                  "gpos25"  = rgb(200, 200, 200, maxColorValue = 255),
                  "gvar"    = rgb(220, 220, 220, maxColorValue = 255),
                  "gneg"    = rgb(255, 255, 255, maxColorValue = 255),
                  "acen"    = rgb(217, 47, 39, maxColorValue = 255),
                  "stalk"   = rgb(100, 127, 164, maxColorValue = 255) )
    col = col.panel[x]
    col[is.na(col)] = "#FFFFFF"
	names(col) = NULL
    return(col)
}

# == title
# Generate random genomic data
#
# == param
# -nr  Number of rows
# -nc  Number of numeric columns / value columns
# -fun Function for generating random values
# -species species, pass to `read.cytoband`
#
# == details
# The function will uniformly sample positions from the genome. Chromosome names start with "chr"
# and positions are sorted. The final number of rows may not be exactly as same as ``nr``.
generateRandomBed = function(
	nr = 10000,
	nc = 1,
	fun = function(k) rnorm(k, 0, 0.5),
    species = NULL) {
	cyto = read.cytoband(species = species)
	chr.len = cyto$chr.len
	chromosome = cyto$chromosome
	dl = lapply(seq_along(chr.len), function(i) {
		k = round(nr*2 * chr.len[i] / sum(chr.len))
		k = ifelse(k %% 2, k + 1, k)
		breaks = sort(sample(chr.len[i], k))
		res = data.frame(chr = rep(chromosome[i], length(breaks)/2),
						  start = breaks[seq_along(breaks) %% 2 == 1],
						  end = breaks[seq_along(breaks) %% 2 == 0],
						  stringsAsFactors = FALSE)
		for(k in seq_len(nc)) {
			res = cbind(res, value = fun(length(breaks)/2), stringsAsFactors = FALSE)
		}
		res
	})

	df = NULL
	for(i in seq_along(dl)) {
		df = rbind(df, dl[[i]])
	}

	if(ncol(df) == 3) {
		colnames(df) = c("chr", "start", "end")
	} else {
		colnames(df) = c("chr", "start", "end", paste0("value", seq_len(ncol(df)-3)))
	}
	return(df)
}


sort_chr = function(chromosome) {
	
	chromosome = sort(chromosome)

	if(!grepl("^(chr|\\d)", chromosome[1])) {
		return(chromosome)
	}

	chromosome.ind = gsub("chr", "", chromosome)
	chromosome.ind = gsub("_.*$", "", chromosome.ind)
	l_num = grepl("^\\d+$", chromosome.ind)
	l_letter = !l_num

	chromosome.num = chromosome[l_num]
	chromosome.levels = chromosome[!l_num]

	chromosome.num = chromosome.num[order(as.numeric(chromosome.ind[l_num]))]
	chromosome.letter = chromosome.levels[order(chromosome.ind[!l_num])]
	chromosome = c(chromosome.num, chromosome.letter)
	return(chromosome)
}

validate_region = function(df, start_column = 2, end_column = 3, check_chr = FALSE) {
	if(any(df[, end_column] < df[, start_column])) {
		stop_wrap("End positions should not be smaller than the start positions.")
	}

	if(check_chr) {
		if(is.circos.initialized()) {
			all_chr = as.vector(df[, 1])
			all_chr_level = unique(all_chr)
			chr_start = sapply(all_chr_level, function(x) get.sector.data(x)["min.data"])
			names(chr_start) = all_chr_level
			chr_end = sapply(all_chr_level, function(x) get.sector.data(x)["max.data"])
			names(chr_end) = all_chr_level

			if(any(df[, start_column] < chr_start[all_chr])) {
				warning_wrap("Some of the regions have start position values smaller than the start of the chromosomes.")
			}
			if(any(df[, end_column] > chr_end[all_chr])) {
				warning_wrap("Some of the regions have end position values larger than the end of the chromosomes.")
			}
		}
	}
}