#' Plot Epigenetics Tracks for NGS Data using ggplot2
#'
#' @description Plot genome tracks for NGS data using ggplot2, resembling GBrowse/JBrowse style. Supporting coverage data in bigWig format from RNA-seq, sRNA-seq, BS-seq, ChIP-seq, RIP-seq, GRO-seq, NET-seq, DNase-seq, MNase-seq, ATAC-seq, SHAPE-seq etc. Especially optimized for BS-seq data in single-base resolution. Requires `bedtools` on Linux.
#'
#' @param locus_file BED file containing locus to plot. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <locus_name>, without header.
#' @param track_conf configuration file containing track info. For each line, TAB-delimited information should contain: <track_name>, <track_type>, <track_file>, <y_min>, <y_max>, <color>, <group>, without header. NOTE: If you write "BS-seq" in "<track_type>", provide mC type (one of CG, CHG, CHH, All) to plot in "<path_to_track_file>" and provide detailed info in `bsseq_conf`, "<color>" is currently ignored in multi-mC mode of BS-seq.
#' @param bsseq_conf configuration file of BS-seq track. If no BS-seq track plotted, keep `NULL`. For each line, TAB-delimited information should contain: <track_name>, <CG_file>, <CHG_file>, <CHH_file>, <C_context_file>, without header. "<track_name>" here should be the same as in `track_conf`. If you choose only one mC type to plot, you can keep the column of other file as "NA", but provide all the file may be used has no side-effect. "<C_context_file>" can be "NA" to avoid drawing C context track.
#' @param gene_model BED file containing all elements of gene and/or TE model, generated by `[tinyfuncr::extractPos]`
#' @param ann_region BED file of annotation regions. If not provided, no annotation will be added. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <annotation_name>, without header. <annotation_name> could be empty but the last TAB should be kept.
#' @param ann_color color of annotated regions
#' @param out_dir path to output directory, create it if not exist
#'
#' @import magrittr
#' @import ggplot2
#' @importFrom rtracklayer import.bw
#' @importFrom GenomicRanges split seqnames restrict
#' @importFrom cowplot plot_grid
#' @importFrom stringr str_split
#'
#' @author Yujie Liu
#' @export ggepitracks
#'
######################### main IO #########################
ggepitracks <- function(locus_file,
track_conf,
bsseq_conf = NULL,
gene_model,
ann_region = NULL,
ann_color = "goldenrod1",
out_dir = "out") {
# check dependencies
.check_windows()
.check_bedtools()
# make output dirs
dir_name <- paste0(out_dir, "/")
system(paste0("mkdir -p ", dir_name))
# read in and parse track info
track_info <- read_tcsv(track_conf, header = F)
colnames(track_info) <-
c("track_name",
"track_type",
"track_file",
"y_min",
"y_max",
"color",
"group")
# read loci info
loci <- read_tcsv(locus_file, header = F)
colnames(loci) <- c("chr", "start", "end", "name")
# loop against each locus and plot tracks
for (locus_idx in 1:nrow(loci)) {
# parse info of this locus and make granges
chrom <- loci[locus_idx, 1]
begin <- loci[locus_idx, 2]
end <- loci[locus_idx, 3]
locus_name <- loci[locus_idx, 4]
write_tcsv(
data.frame(chrom, as.character(begin), as.character(end)),
file = "tmp_locus.bed",
col.names = F
)
# extract annotation region within this locus
if (!is.null(ann_region)) {
system(
paste(
"bedtools intersect -a tmp_locus.bed -b",
ann_region,
"-wb | cut -f 4- > tmp_ann.bed"
)
)
this_mark_region <-
read_tcsv("tmp_ann.bed", header = F)
colnames(this_mark_region) <- c("chr", "start", "end", "name")
this_mark_region$start <- this_mark_region$start + 1
this_mark_region$end <- this_mark_region$end + 1
} else {
this_mark_region <- NULL
}
# extract gene model within this locus
system(
paste(
"bedtools intersect -a tmp_locus.bed -b",
gene_model,
"-wb | cut -f 4- > tmp_model.bed"
)
)
this_locus_model <-
read_tcsv("tmp_model.bed", header = F)
# plot gene model for this locus
p_model <- epiplot_model(
model = this_locus_model,
start = begin,
end = end
)
p_track <- list()
p_track[[1]] <- p_model
# loop against each track and plot for this locus
for (track_idx in 1:nrow(track_info)) {
if (track_info[track_idx, "track_type"] == "BS-seq") {
bsseq_info <- read_tcsv(bsseq_conf, header = F)
colnames(bsseq_info) <-
c("track_name",
"CG",
"CHG",
"CHH",
"C_context_file")
track_idx_in_bsseq <-
which(track_info[track_idx, "track_name"] == bsseq_info$track_name)
if (track_info[track_idx, "track_file"] != "All") {
# read in track file
bw <-
rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, track_info[track_idx, "track_file"]])
bw_chr <-
GenomicRanges::split(bw, GenomicRanges::seqnames(bw))
bw_this_locus <-
as.data.frame(GenomicRanges::restrict(
x = bw_chr[[chrom]],
start = begin,
end = end
))[c(1, 2, 3, 6)]
bw_this_locus$end <- bw_this_locus$end + 1
# plot bsseq tracks
p_track[[track_idx + 1]] <-
epiplot_bsseq(
bw_df = bw_this_locus,
start = begin,
end = end,
y_min = track_info[track_idx, "y_min"],
y_max = track_info[track_idx, "y_max"],
track_name = track_info[track_idx, "track_name"],
track_color = track_info[track_idx, "color"],
ann_df = this_mark_region,
ann_color = ann_color
)
# merge plot
#if (track_idx == 1) {
# p_merge <-
# cowplot::plot_grid(
# p_model,
# p_track,
# nrow = 2,
# rel_heights = c(1, 3),
# align = "h"
# )
#} else {
# p_merge <-
# cowplot::plot_grid(
# p_merge,
# p_track,
# nrow = 2,
# rel_heights = c(1 + 3 * (track_idx - 1), 3),
# align = "h"
# )
#}
} else {
# CG
bw_CG <-
rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, "CG"])
bw_CG_chr <-
GenomicRanges::split(bw_CG, GenomicRanges::seqnames(bw_CG))
bw_CG_this_locus <-
as.data.frame(GenomicRanges::restrict(
x = bw_CG_chr[[chrom]],
start = begin,
end = end
))[c(1, 2, 3, 6)]
bw_CG_this_locus$end <- bw_CG_this_locus$end + 1
# CHG
bw_CHG <-
rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, "CHG"])
bw_CHG_chr <-
GenomicRanges::split(bw_CHG, GenomicRanges::seqnames(bw_CHG))
bw_CHG_this_locus <-
as.data.frame(GenomicRanges::restrict(
x = bw_CHG_chr[[chrom]],
start = begin,
end = end
))[c(1, 2, 3, 6)]
bw_CHG_this_locus$end <- bw_CHG_this_locus$end + 1
# CHH
bw_CHH <-
rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, "CHH"])
bw_CHH_chr <-
GenomicRanges::split(bw_CHH, GenomicRanges::seqnames(bw_CHH))
bw_CHH_this_locus <-
as.data.frame(GenomicRanges::restrict(
x = bw_CHH_chr[[chrom]],
start = begin,
end = end
))[c(1, 2, 3, 6)]
bw_CHH_this_locus$end <- bw_CHH_this_locus$end + 1
# plot multi C type bsseq
p_track[[track_idx + 1]] <-
epiplot_bsseq_multi(
CG_df = bw_CG_this_locus,
CHG_df = bw_CHG_this_locus,
CHH_df = bw_CHH_this_locus,
start = begin,
end = end,
y_min = track_info[track_idx, "y_min"],
y_max = track_info[track_idx, "y_max"],
track_name = track_info[track_idx, "track_name"],
ann_df = this_mark_region,
ann_color = ann_color
)
# merge plot
#if (track_idx == 1) {
# p_merge <-
# cowplot::plot_grid(
# p_model,
# p_track,
# nrow = 2,
# rel_heights = c(2, 3),
# align = "h"
# )
#} else {
# p_merge <-
# cowplot::plot_grid(
# p_merge,
# p_track,
# nrow = 2,
# rel_heights = c(2 + 3 * (track_idx - 1), 3),
# align = "h"
# )
#}
}
} else{
# read in track file for non BS-seq data
bw <-
rtracklayer::import.bw(track_info[track_idx, "track_file"])
bw_chr <-
GenomicRanges::split(bw, GenomicRanges::seqnames(bw))
bw_this_locus <-
as.data.frame(GenomicRanges::restrict(
x = bw_chr[[chrom]],
start = begin,
end = end
))[c(1, 2, 3, 6)]
bw_this_locus$end <- bw_this_locus$end + 1
# plot coverage
p_track[[track_idx + 1]] <-
epiplot_cov(
bw_df = bw_this_locus,
start = begin,
end = end,
y_min = track_info[track_idx, "y_min"],
y_max = track_info[track_idx, "y_max"],
track_name = track_info[track_idx, "track_name"],
track_color = track_info[track_idx, "color"],
ann_df = this_mark_region,
ann_color = ann_color
)
# merge plot
#if (track_idx == 1) {
# p_merge <-
# cowplot::plot_grid(
# p_model,
# p_track,
# nrow = 2,
# rel_heights = c(1, 3),
# align = "h"
# )
#} else {
# p_merge <-
# cowplot::plot_grid(
# p_merge,
# p_track,
# nrow = 2,
# rel_heights = c(1 + 3 * (track_idx - 1), 3),
# align = "h"
# )
#}
}
}
# merge plots
p_merge <-
cowplot::plot_grid(
plotlist = p_track,
nrow = 1 + nrow(track_info),
rel_heights = c(3, rep(3, nrow(track_info))),
align = "h"
)
ggsave(
paste0(dir_name, locus_name, ".pdf"),
p_merge,
width = 6,
height = 4
)
}
}
############################ check dependency ############################
.check_windows <- function() {
if (Sys.info()["sysname"] == "Windows") {
stop("Windows is not supported :(")
}
}
.check_bedtools <- function(warn = TRUE) {
check <- as.character(Sys.which(names = 'bedtools'))[1]
if (check != "") {
if (warn) {
message("Checking for bedtools installation")
message(paste0(" All good! Found bedtools at: ", check))
} else{
return(invisible(0))
}
} else{
stop("Could not locate bedtools !")
}
}
######################### plot tracks for gene model #########################
epiplot_model <- function(model, start, end) {
# parse model info
tmp_model_info <-
model[[4]] %>% stringr::str_split("_") %>% as.data.frame() %>% t() %>% as.data.frame()
model_info <- data.frame()
for (i in 1:nrow(tmp_model_info)) {
tmp <-
data.frame(
Chr = model[i, 1],
Start = model[i, 2] + 1,
End = model[i, 3] + 1,
ID = paste(tmp_model_info[i, 2:ncol(tmp_model_info)], collapse = "_"),
Type = tmp_model_info[i, 1],
Strand = model[i, 6]
)
model_info <- rbind(model_info, tmp)
}
model_info <- model_info[order(model_info$Start),]
# make draw data
# parse coord
genes_to_plot <- unique(model_info$ID)
model_info$Group <-
unlist(lapply(model_info$ID, function(x)
which(x == genes_to_plot)))
model_info$Start <-
unlist(lapply(model_info$Start, function(x)
max(x, start)))
model_info$End <-
unlist(lapply(model_info$End, function(x)
min(x, end)))
model_info <- model_info[model_info$Start <= model_info$End, ]
# split body and end
model_info_group_first <-
model_info[!duplicated(model_info$Group), ]
model_info_des <-
model_info[order(model_info$Start, decreasing = T),]
model_info_group_last <-
model_info_des[!duplicated(model_info_des$Group), ]
model_info_group_last <-
model_info_group_last[order(model_info_group_last$Group), ]
model_info_group_length <-
model_info_group_last$End - model_info_group_first$Start
for (group_idx in 1:nrow(model_info_group_first)) {
if (model_info_group_first[group_idx, 6] == "+") {
model_info_group_first[group_idx,] <-
model_info_group_last[group_idx,]
}
}
model_info_end <- data.frame()
for (group_idx in 1:nrow(model_info_group_first)) {
if (model_info_group_first[group_idx, "Strand"] == "+") {
x <- c(
model_info_group_first[group_idx, "Start"],
model_info_group_first[group_idx, "Start"],
model_info_group_first[group_idx, "End"],
model_info_group_first[group_idx, "End"] + 0.02 * model_info_group_length[group_idx],
model_info_group_first[group_idx, "End"]
)
} else {
x <- c(
model_info_group_first[group_idx, "End"],
model_info_group_first[group_idx, "End"],
model_info_group_first[group_idx, "Start"],
model_info_group_first[group_idx, "Start"] - 0.02 * model_info_group_length[group_idx],
model_info_group_first[group_idx, "Start"]
)
}
model_info_end <-
rbind(model_info_end,
data.frame(
Type = rep(model_info_group_first[group_idx, "Type"], 5),
X = x,
Y = c(
model_info_group_first[group_idx, "Group"] - 0.4,
model_info_group_first[group_idx, "Group"] - 0.6,
model_info_group_first[group_idx, "Group"] - 0.6,
model_info_group_first[group_idx, "Group"] - 0.5,
model_info_group_first[group_idx, "Group"] - 0.4
),
Group = rep(model_info_group_first[group_idx, "Group"], 5)
))
}
# extract three context
model_info_name <- model_info[!duplicated(model_info$Group), ]
model_info_CDS <- model_info[model_info$Type == "CDS", ]
model_info_intron <- model_info[model_info$Type == "Intron",]
model_info_intron$Mid <-
rowMeans(model_info_intron[c("Start", "End")])
model_info_UTR <-
model_info[model_info$Type == "5UTR" |
model_info$Type == "3UTR", ]
model_info_end_CDS <-
model_info_end[model_info_end$Type == "CDS", ]
model_info_end_UTR <-
model_info_end[model_info_end$Type == "5UTR" |
model_info_end$Type == "3UTR", ]
# plot
p_model <-
ggplot() +
scale_x_continuous(
limits = c(start, end),
position = "top",
expand = c(0, 0)
) +
scale_y_continuous(
limits = c(0, length(genes_to_plot)),
breaks = c(0, 0.5, length(genes_to_plot)),
expand = c(0, 0)
) +
theme_classic() + theme(
axis.title.x = element_blank(),
axis.line.x = element_line(color = "black", linewidth = 0.6),
axis.ticks.x = element_line(color = "black", linewidth = 0.4),
axis.text.x = element_text(color = "black"),
panel.grid = element_blank(),
axis.line.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.length.y = unit(4, "pt"),
axis.text.y = element_text(color = "black")
) + ylab("Gene") + geom_text(
data = model_info_name,
mapping = aes(x = Start,
y = Group - 0.2,
label = ID),
color = "black",
size = 3
) + geom_rect(
data = model_info_CDS,
mapping = aes(
xmin = Start,
ymin = Group - 0.6,
xmax = End,
ymax = Group - 0.4
),
color = "black",
fill = "black"
) + geom_segment(
data = model_info_intron,
mapping = aes(
x = Start,
y = Group - 0.5,
xend = Mid,
yend = Group - 0.4
),
color = "black",
linewidth = 0.5
) + geom_segment(
data = model_info_intron,
mapping = aes(
x = End,
y = Group - 0.5,
xend = Mid,
yend = Group - 0.4
),
color = "black",
linewidth = 0.5
) + geom_rect(
data = model_info_UTR,
mapping = aes(
xmin = Start,
ymin = Group - 0.6,
xmax = End,
ymax = Group - 0.4
),
color = "black",
fill = "grey"
) + geom_polygon(
data = model_info_end_CDS,
mapping = aes(x = X,
y = Y,
group = Group),
color = "black",
fill = "black"
) + geom_polygon(
data = model_info_end_UTR,
mapping = aes(x = X,
y = Y,
group = Group),
color = "black",
fill = "grey"
)
# return
p_model
}
######################### plot tracks for coverage data #########################
epiplot_cov <- function(bw_df,
start,
end,
y_min,
y_max,
track_name,
track_color,
ann_df = NULL,
ann_color = "goldenrod1") {
p <-
ggplot() +
scale_x_continuous(limits = c(start, end),
expand = c(0, 0)) +
scale_y_continuous(
limits = c(y_min, y_max),
breaks = c(-1, -0.5, 0, 0.5, 1),
expand = c(0, 0)
) +
theme_classic() + theme(
axis.title.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
panel.grid = element_blank(),
axis.line.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.length.y = unit(4, "pt"),
axis.text.y = element_text(color = "black")
) + ylab(track_name) + geom_hline(mapping = aes(yintercept = 0), linewidth = 0.6) + geom_rect(
data = bw_df,
mapping = aes(
xmin = start,
ymin = 0,
xmax = end,
ymax = score
),
color = NA,
fill = track_color
)
if (!is.null(ann_df)) {
p <- p +
geom_rect(
data = ann_df,
mapping = aes(
xmin = start,
ymin = y_min,
xmax = end,
ymax = y_max
),
color = NA,
fill = ann_color,
alpha = 0.3
) + geom_text(
data = ann_df,
mapping = aes(
x = mean(c(start, end)),
y = y_max - 0.2 * (y_max - y_min),
label = name
),
color = "black"
)
}
p
}
######################### plot tracks BS-seq data #########################
epiplot_bsseq <- function(bw_df,
start,
end,
y_min,
y_max,
track_name,
track_color,
ann_df = NULL,
ann_color = "goldenrod1") {
p <-
ggplot() +
scale_x_continuous(limits = c(start, end),
expand = c(0, 0)) +
scale_y_continuous(
limits = c(y_min, y_max),
breaks = c(-1, -0.5, 0, 0.5, 1),
expand = c(0, 0)
) +
theme_classic() + theme(
axis.title.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
panel.grid = element_blank(),
axis.line.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.length.y = unit(4, "pt"),
axis.text.y = element_text(color = "black")
) + ylab(track_name) + geom_hline(mapping = aes(yintercept = 0), linewidth = 0.6) + geom_rect(
data = bw_df,
mapping = aes(
xmin = start,
xmax = end,
ymin = 0,
ymax = score
),
color = NA,
fill = track_color
)
if (!is.null(ann_df)) {
p <- p +
geom_rect(
data = ann_df,
mapping = aes(
xmin = start,
ymin = y_min,
xmax = end,
ymax = y_max
),
color = NA,
fill = ann_color,
alpha = 0.3
) + geom_text(
data = ann_df,
mapping = aes(
x = mean(c(start, end)),
y = y_max - 0.2 * (y_max - y_min),
label = name
),
color = "black"
)
}
p
}
################# plot tracks BS-seq data (multi C type) #################
epiplot_bsseq_multi <- function(CG_df,
CHG_df,
CHH_df,
start,
end,
y_min,
y_max,
track_name,
ann_df = NULL,
ann_color = "goldenrod1") {
p <-
ggplot() +
scale_x_continuous(limits = c(start, end),
expand = c(0, 0)) +
scale_y_continuous(
limits = c(y_min, y_max),
breaks = c(-1, -0.5, 0, 0.5, 1),
expand = c(0, 0)
) +
theme_classic() + theme(
axis.title.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
panel.grid = element_blank(),
axis.line.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.y = element_line(color = "black", linewidth = 0.6),
axis.ticks.length.y = unit(4, "pt"),
axis.text.y = element_text(color = "black")
) + ylab(track_name) + geom_hline(mapping = aes(yintercept = 0), linewidth = 0.6) + geom_rect(
data = CG_df,
mapping = aes(
xmin = start,
xmax = end,
ymin = 0,
ymax = score
),
color = NA,
fill = "#FF0000",
alpha = 1
) + geom_rect(
data = CHG_df,
mapping = aes(
xmin = start,
xmax = end,
ymin = 0,
ymax = score
),
color = NA,
fill = "#0070C0",
alpha = 1
) + geom_rect(
data = CHH_df,
mapping = aes(
xmin = start,
xmax = end,
ymin = 0,
ymax = score
),
color = NA,
fill = "#00B050",
alpha = 1
)
if (!is.null(ann_df)) {
p <- p +
geom_rect(
data = ann_df,
mapping = aes(
xmin = start,
ymin = y_min,
xmax = end,
ymax = y_max
),
color = NA,
fill = ann_color,
alpha = 0.3
) + geom_text(
data = ann_df,
mapping = aes(
x = mean(c(start, end)),
y = y_max - 0.2 * (y_max - y_min),
label = name
),
color = "black"
)
}
p
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.