#' Plot Genome Tracks for NGS Data
#'
#' @description Plot genome tracks for NGS data, resembling GBrowse/JBrowse style. Supporting coverage data in bigWig format from RNA-seq, sRNA-seq, BS-seq, ChIP-seq, RIP-seq, GRO-seq, NET-seq, DNase-seq, MNase-seq, ATAC-seq, SHAPE-seq etc. Annotation should be provided in GTF format. Some codes refers to https://github.com/PoisonAlien/trackplot and `genomemodel` package with some major modifications. Requires `bedtools` and `bwtool` on Linux.
#'
#' @param bed_file BED file containing locus to plot. For each line, TAB-delimited information should containing: <chr>, <start>, <end>, <locus_name>.
#' @param track_conf configuration file containing track info. For each line, TAB-delimited information should containing: <track_name>, <path_to_track_file>, <y_min>, <y_max>, <color>, <group>, without header.
#' @param gene_model BED file containing all elements of gene and TE model, generated by `[tinyfuncr::extractPos]`
#' @param ann_region BED file of annotation regions. If not provided, no annotation will be added. For each line, TAB-delimited information should containing: <chr>, <start>, <end>, <annotation_name>. <annotation_name> could be empty but the last TAB should be kept.
#' @param ann_color color of annotated regions
#' @param out_format format of output figures, one of "png" and "pdf"
#' @param out_dir path to output directory, create it if not exist
#'
#' @import magrittr
#' @importFrom data.table data.table as.data.table rbindlist fread fwrite
#' @importFrom parallel mclapply
#' @importFrom stringr str_split
#'
#' @author Yujie Liu
#' @export genometracks
#'
######################### main IO #########################
genometracks <- function(bed_file,
track_conf,
gene_model,
ann_region = NULL,
ann_color = "goldenrod1",
out_format = "png",
out_dir = "out") {
# check dependencies
.check_windows()
.check_bedtools()
.check_bwtool()
.check_datatable()
# make output dirs
dir_name <- paste0(out_dir, "/")
system(paste0("mkdir -p ", dir_name))
# read in and parse track info
track_info <- read_tcsv(track_conf, header = F)
colnames(track_info) <-
c("track_name",
"track_file",
"y_min",
"y_max",
"color",
"group")
# read loci info
loci <- read_tcsv(bed_file, header = F)
colnames(loci) <- c("chr", "start", "end", "name")
# loop against each locus and plot tracks
for (idx in 1:nrow(loci)) {
# parse info of this locus
chrom <- loci[idx, 1]
begin <- loci[idx, 2] - 30
end <- loci[idx, 3] + 30
name <- loci[idx, 4]
# extract annotation region within this locus
write_tcsv(
data.frame(chrom, as.character(begin), as.character(end)),
file = "tmp_locus.bed",
col.names = F
)
if (!is.null(ann_region)) {
system(
paste(
"bedtools intersect -a tmp_locus.bed -b",
ann_region,
"-wb | cut -f 4- > tmp_ann.bed"
)
)
this_mark_region <-
read_tcsv("tmp_ann.bed", header = F)
} else {
this_mark_region <- NULL
}
# extract the signal for this locus
track_data <-
track_extract(
bigWigs = track_info$track_file,
chr = chrom,
start = begin,
end = end,
binsize = 5,
custom_names = track_info$track_name
)
# plot
if (out_format == "png") {
png(
filename = paste0(dir_name, name, ".png"),
res = 300,
width = 2000,
height = 3000
)
} else if (out_format == "pdf") {
pdf(
file = paste0(dir_name, name, ".pdf"),
width = 10,
height = 10
)
} else {
warning("Format not support, use PDF instead ...")
pdf(
file = paste0(dir_name, name, ".pdf"),
width = 10,
height = 10
)
}
track_plot(
summary_list = track_data,
draw_gene_track = T,
gene_model = gene_model,
y_min = track_info$y_min,
y_max = track_info$y_max,
col = track_info$color,
regions = this_mark_region,
boxcol = ann_color,
boxcolalpha = 0.4
)
dev.off()
system("rm -rf tmp_locus.bed tmp_ann.bed tmp_model.bed")
}
}
############################ check dependency ############################
.check_windows <- function() {
if (Sys.info()["sysname"] == "Windows") {
stop("Windows is not supported :(")
}
}
.check_bedtools <- function(warn = TRUE) {
check <- as.character(Sys.which(names = 'bedtools'))[1]
if (check != "") {
if (warn) {
message("Checking for bedtools installation")
message(paste0(" All good! Found bedtools at: ", check))
} else{
return(invisible(0))
}
} else{
stop("Could not locate bedtools !")
}
}
.check_bwtool <- function(warn = TRUE) {
check <- as.character(Sys.which(names = 'bwtool'))[1]
if (check != "") {
if (warn) {
message("Checking for bwtool installation")
message(paste0(" All good! Found bwtool at: ", check))
} else{
return(invisible(0))
}
} else{
stop(
"Could not locate bwtool. Download it from here: https://github.com/CRG-Barcelona/bwtool/releases"
)
}
}
.check_datatable <- function() {
if (!requireNamespace("data.table", quietly = TRUE)) {
message("Could not find data.table library. Attempting to install..")
install.packages("data.table")
}
}
########################## parse track info ##########################
track_extract <- function(bigWigs,
chr,
start,
end,
binsize = 10,
custom_names = NULL) {
# make tmp dirs
options(warn = -1)
op_dir <- paste0(tempdir(), "/")
if (!dir.exists(paths = op_dir)) {
dir.create(path = op_dir,
showWarnings = FALSE,
recursive = TRUE)
}
# parse locus
if (start >= end) {
stop("End must be larger than Start!")
}
message(" Queried region: ",
chr,
":",
start,
"-",
end,
" [",
end - start,
" bps]")
# check bigwig files
if (is.null(bigWigs)) {
stop("Provide at-least one bigWig file")
}
for (i in 1:length(bigWigs)) {
if (!file.exists(as.character(bigWigs)[i])) {
stop(paste0(as.character(bigWigs)[i], " does not exist!"))
}
}
# check track names
if (!is.null(custom_names)) {
if (length(custom_names) != length(bigWigs)) {
stop("Please provide names for all bigWigs")
}
}
# split to each bin and summary track info
windows <- .gen_windows(
chr = chr,
start = start,
end = end,
window_size = binsize,
op_dir = op_dir
)
track_summary <- .gen_tracks(bedSimple = windows,
bigWigs = bigWigs,
op_dir = op_dir)
if (!is.null(custom_names)) {
names(track_summary) <- custom_names
}
track_summary$loci <- c(chr, start, end)
track_summary
}
.gen_windows <- function(chr,
start,
end,
window_size,
op_dir) {
message(paste0("Generating windows ", "[", window_size, " bp window size]"))
window_dat <- data.table::data.table()
while (start <= end) {
window_dat <- data.table::rbindlist(l = list(
window_dat,
data.table::data.table(start, end = start + window_size)
),
fill = TRUE)
start = start + window_size
}
window_dat$chr <- chr
window_dat <- window_dat[, .(chr, start, end)]
temp_op_bed <- tempfile(pattern = "trackr",
tmpdir = op_dir,
fileext = ".bed")
data.table::fwrite(
x = window_dat,
file = temp_op_bed,
sep = "\t",
quote = FALSE,
row.names = FALSE,
col.names = FALSE
)
temp_op_bed
}
.gen_tracks <- function(bedSimple,
bigWigs,
op_dir = getwd(),
nthreads = 1) {
message(paste0("Extracting signals"))
# summary using bwtool
summaries <- parallel::mclapply(
bigWigs,
FUN = function(bw) {
bn = gsub(pattern = "\\.bw$|\\.bigWig$",
replacement = "",
x = basename(bw))
message(paste0(" Processing ", bn, " .."))
cmd = paste(
"bwtool summary -with-sum -keep-bed -header",
bedSimple,
bw,
paste0(op_dir, bn, ".summary")
)
system(command = cmd, intern = TRUE)
paste0(op_dir, bn, ".summary")
},
mc.cores = nthreads
)
summary_list <- lapply(summaries, function(x) {
x = data.table::fread(x)
colnames(x)[1] = 'chromosome'
x = x[, .(chromosome, start, end, size, max)]
if (all(is.na(x[, max]))) {
message(
"No signal! Possible cause: chromosome name mismatch between bigWigs and queried loci."
)
x[, max := 0]
}
x
})
# remove intermediate files
lapply(summaries, function(x)
system(command = paste0("rm ", x), intern = TRUE))
system(command = paste0("rm ", bedSimple), intern = TRUE)
names(summary_list) <- gsub(
pattern = "*\\.summary$",
replacement = "",
x = basename(path = unlist(summaries))
)
summary_list
}
############################# plot tracks #############################
track_plot <- function(summary_list,
draw_gene_track = TRUE,
gene_model = NULL,
y_min = NULL,
y_max = NULL,
col = "black",
regions = NULL,
boxcol = "goldenrod1",
boxcolalpha = 0.4,
gene_track_height = 1,
xaxis_track_height = 0.5,
gene_fsize = 0.6,
show_axis = TRUE,
track_names = NULL,
track_names_pos = 0,
region_width = 1) {
# check and parse input
if (is.null(summary_list)) {
stop("Missing input! Expecting output from track_extract()")
}
chr <- summary_list$loci[1]
start <- as.numeric(summary_list$loci[2])
end <- as.numeric(summary_list$loci[3])
summary_list$loci <- NULL
if (length(col) != length(summary_list)) {
col <- rep(x = col, length(summary_list))
}
plot_regions = FALSE
if (!is.null(regions)) {
if (is(object = regions, class2 = "data.frame")) {
regions <- data.table::as.data.table(x = regions)
colnames(regions)[1:3] <- c("chromsome", "startpos", "endpos")
regions <- regions[chromsome %in% chr]
if (nrow(regions) == 0) {
warning("None of the regions are within the requested chromosme: ",
chr)
plot_regions <- TRUE
} else{
plot_regions <- TRUE
}
} else{
stop(
"'mark_regions' must be a data.frame with first 3 columns containing : chr, start, end"
)
}
}
if (!is.null(track_names)) {
names(summary_list) <- track_names
}
if (!is.null(y_max)) {
if (length(y_max) != length(summary_list)) {
y_max <- rep(y_max, length(summary_list))
}
plot_height <- y_max
} else{
plot_height <- round(plot_height, digits = 2)
}
if (!is.null(y_min)) {
if (length(y_min) != length(summary_list)) {
y_min <- rep(y_min, length(summary_list))
}
plot_height_min <- y_min
} else{
plot_height_min <- round(plot_height_min, digits = 2)
}
# plot layout
ntracks <- length(summary_list)
lo <- layout(
mat = matrix(data = seq_len(ntracks + 2)),
heights = c(xaxis_track_height, gene_track_height, rep(3, ntracks))
)
# draw x-axis
if (show_axis) {
par(mar = c(0, 4, 0, 0))
} else{
par(mar = c(0, 1, 0, 0))
}
lab_at <- pretty(c(start, end))
plot(
NA,
xlim = c(start, end),
ylim = c(-0.5, 0.5),
frame.plot = FALSE,
axes = FALSE,
xlab = NA,
ylab = NA
)
text(x = lab_at, y = 0.3, labels = lab_at)
rect(
xleft = start,
ybottom = 0,
xright = end,
ytop = 0,
lty = 2,
xpd = TRUE
)
rect(
xleft = lab_at,
ybottom = -0.1,
xright = lab_at,
ytop = 0.1,
xpd = TRUE
)
#axis(side = 1, at = lab_at, lty = 2, line = -3)
#text(
# x = end,
# y = 0.75,
# labels = paste0(chr, ":", start, "-", end),
# adj = 1,
# xpd = TRUE
#)
##############add plot name
# draw gene models
if (draw_gene_track) {
if (show_axis) {
par(mar = c(0.25, 4, 0, 0.5))
} else{
par(mar = c(0.25, 1, 0, 0.5))
}
#par(mar = c(1, 1, 3, 1), cex = 1)
# extract gene models within this locus
system(
paste(
"bedtools intersect -a tmp_locus.bed -b",
gene_model,
"-wb | cut -f 4- > tmp_model.bed"
)
)
model <-
read_tcsv("tmp_model.bed", header = F)
tmp_model_info <-
model[[4]] %>% stringr::str_split("_") %>% as.data.frame() %>% t() %>% as.data.frame()
model_info <- data.frame()
for (i in 1:nrow(tmp_model_info)) {
tmp <-
data.frame(
Chr = model[i, 1],
Start = model[i, 2],
End = model[i, 3],
ID = paste(tmp_model_info[i, 2:ncol(tmp_model_info)], collapse = "_"),
Type = tmp_model_info[i, 1],
Strand = model[i, 6]
)
model_info <- rbind(model_info, tmp)
}
# draw for each gene
genes_to_plot <- unique(model_info$ID)
plot(
NA,
xlim = c(start, end),
ylim = c(0, length(genes_to_plot)),
frame.plot = FALSE,
axes = FALSE,
xlab = NA,
ylab = NA
)
for (idx in 1:length(genes_to_plot)) {
this_gene <- genes_to_plot[idx]
this_gene_model <- model_info[model_info$ID == this_gene,]
this_gene_model <-
this_gene_model[order(this_gene_model$Start),]
text(
x = start,
y = idx - 0.5,
labels = this_gene,
adj = 0,
cex = gene_fsize
)
for (i in 1:nrow(this_gene_model)) {
type <- this_gene_model[i, "Type"]
if (type == "CDS") {
rect(
xleft = this_gene_model$Start[i],
ybottom = idx - 0.6,
xright = this_gene_model$End[i],
ytop = idx - 0.4,
col = "black",
border = "black",
lwd = 0.5
)
} else if (type == "Intron") {
mid <-
mean(c(this_gene_model$Start[i],
this_gene_model$End[i]))
segments(
x0 = this_gene_model$Start[i],
y0 = idx - 0.5,
x1 = mid,
y1 = idx - 0.4,
lwd = 0.5,
col = "black"
)
segments(
x0 = this_gene_model$End[i],
y0 = idx - 0.5,
x1 = mid,
y1 = idx - 0.4,
lwd = 0.5,
col = "black"
)
} else if ("UTR" %in% type) {
rect(
xleft = this_gene_model$Start[i],
ybottom = idx - 0.6,
xright = this_gene_model$End[i],
ytop = idx - 0.4,
col = "grey",
border = "black",
lwd = 0.5
)
}
}
if (this_gene_model[1, 6] == "+") {
x <-
c(
this_gene_model$Start[nrow(this_gene_model)],
this_gene_model$Start[nrow(this_gene_model)],
this_gene_model$End[nrow(this_gene_model)],
this_gene_model$End[nrow(this_gene_model)] + .02 * (this_gene_model$End[nrow(this_gene_model)] - this_gene_model$Start[1]),
this_gene_model$End[nrow(this_gene_model)]
)
y <-
c(idx - 0.4, idx - 0.6, idx - 0.6, idx - 0.5, idx - 0.4)
endtype <- this_gene_model$Type[nrow(this_gene_model)]
if (endtype == "CDS") {
polygon(x,
y,
col = "black",
border = "black",
lwd = 0.5)
} else {
polygon(x,
y,
col = "grey",
border = "black",
lwd = 0.5)
}
} else {
x <-
c(
this_gene_model$End[1],
this_gene_model$End[1],
this_gene_model$Start[1],
this_gene_model$Start[1] - .02 * (this_gene_model$End[nrow(this_gene_model)] - this_gene_model$Start[1]),
this_gene_model$Start[1]
)
y <-
c(idx - 0.4, idx - 0.6, idx - 0.6, idx - 0.5, idx - 0.4)
endtype <- this_gene_model$Type[1]
if (endtype == "CDS") {
polygon(x,
y,
col = "black",
border = "black",
lwd = 0.5)
} else {
polygon(x,
y,
col = "grey",
border = "black",
lwd = 0.5)
}
}
#########add MITEs
########################
#if (attr(txtbl, "gene") == "MITEs") {
# rect(
# xleft = txtbl[[1]],
# ybottom = tx_id - 0.75,
# xright = txtbl[[2]],
# ytop = tx_id - 0.25,
# col = grDevices::adjustcolor("red", alpha.f = 0.6),
# border = NA
# )
#}
###################################################
}
}
# draw bigWig signals
lapply(1:length(summary_list), function(idx) {
x <- summary_list[[idx]]
if (show_axis) {
par(mar = c(0.5, 4, 2, 1))
} else{
par(mar = c(0.5, 1, 2, 1))
}
plot(
NA,
xlim = c(start, end),
ylim = c(plot_height_min[idx], plot_height[idx]),
frame.plot = FALSE,
axes = FALSE,
xlab = NA,
ylab = NA
)
segments(
x0 = start,
y0 = 0,
x1 = end,
y1 = 0,
lwd = 1,
col = "black"
)
rect(
xleft = x$start,
ybottom = 0,
xright = x$end,
ytop = x$max,
col = col[idx],
border = NA
)
if (show_axis) {
axis(
side = 2,
at = c(plot_height_min[idx], plot_height[idx]),
las = 2
)
} else{
text(
x = start,
y = plot_height[idx],
labels = paste0("[", plot_height_min[idx], "-", plot_height[idx], "]"),
adj = 0,
xpd = TRUE
)
}
if (plot_regions) {
boxcol <- grDevices::adjustcolor(boxcol, alpha.f = boxcolalpha)
if (nrow(regions) > 0) {
for (i in 1:nrow(regions)) {
if (idx == length(summary_list)) {
# if its a last plot, draw rectangle till 0
rect(
xleft = regions[i, startpos],
ybottom = 0,
xright = regions[i, endpos],
ytop = 1.5 * plot_height[idx],
col = boxcol,
border = NA,
xpd = TRUE
)
} else if (idx == 1) {
if (ncol(regions) > 3) {
text(
x = mean(c(regions[i, startpos], regions[i, endpos])),
y = plot_height[idx] + (plot_height[idx] * 0.1),
labels = regions[i, 4],
adj = 0.5,
xpd = TRUE,
font = 1,
cex = 1.2
)
} else{
text(
x = mean(c(regions[i, startpos], regions[i, endpos])),
y = plot_height[idx] + (plot_height[idx] * 0.1),
labels = paste0(regions[i, startpos], "-", regions[i, endpos]),
adj = 0.5,
xpd = TRUE,
font = 1,
cex = 1.2
)
}
rect(
xleft = regions[i, startpos],
ybottom = -0.5 * plot_height[idx],
xright = regions[i, endpos],
ytop = plot_height[idx],
col = boxcol,
border = NA,
xpd = TRUE
)
} else{
rect(
xleft = regions[i, startpos],
ybottom = -0.5 * plot_height[idx],
xright = regions[i, endpos],
ytop = 1.5 * plot_height[idx],
col = boxcol,
border = NA,
xpd = TRUE
)
}
}
}
}
title(
main = names(summary_list)[idx],
adj = track_names_pos,
font.main = 3
)
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.