#' Plot Epigenetics Tracks for NGS Data using ggplot2 (version 3)
#'
#' @description Plot genome tracks for NGS data using ggplot2, resembling GBrowse style. Supporting coverage data in bigWig or bedGraph format from RNA-seq, sRNA-seq, BS-seq, ChIP-seq, RIP-seq, GRO-seq, NET-seq, DNase-seq, MNase-seq, ATAC-seq, SHAPE-seq etc. Especially optimized for BS-seq data in single-base resolution. Requires `bigWigToBedGraph` and `bedtools` on Linux.
#'
#' @param locus_file BED file containing locus to plot. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <locus_name>, without header.
#' @param track_conf Configuration file containing track info. For each line, TAB-delimited information should contain: <track_name>, <track_type>, <track_bw/bg>, <y_min>, <y_max>, <color>, <group>, without header. NOTE: If you write "BS-seq" in "<track_type>", provide mC type (one of CG, CHG, CHH, All) to plot in "<track_bw/bg>" and provide detailed info in `bsseq_conf`, "<color>" is ignored in BS-seq tracks. "<group>" is currently unused.
#' @param bsseq_conf Configuration file of BS-seq track. If there is no BS-seq track to plot, keep `NULL`. For each line, TAB-delimited information should contain: <track_name>, <CG_bw/bg>, <CHG_bw/bg>, <CHH_bw/bg>, <C_context_file>, without header. "<track_name>" here should be the same as in `track_conf`. If you choose only one mC type to plot, you can keep the column of other files as "NA", but provide all the bigWig files may be safe. "<C_context_file>" can be "NA" to ignore drawing C context track (currently unused).
#' @param gene_model BED file containing all elements of gene and/or TE model, generated by `[tinyfuncr::extractPos]`
#' @param ann_region BED file of annotation regions. If not provided, no annotation will be added. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <annotation_name>, without header. <annotation_name> could be empty but the last TAB should be kept.
#' @param ann_color Color of annotated regions, defalut "#FFC1254D" (i.e., "goldenrod1", alpha 30)
#' @param out_dir Path to output directory, will be created if not exist, default "out/". Output plots in PDF format will be saved as <out_dir>/<locus_name>.pdf
#' @param height Plot height, in "mm", defalut 6*ntracks+10
#' @param width Plot width, in "mm", defalut 80
#'
#' @import magrittr
#' @import ggplot2
#' @import ggh4x
#' @importFrom cowplot plot_grid
#' @importFrom stringr str_split
#' @importFrom scales minor_breaks_n
#'
#' @author Yujie Liu
#' @export ggepitracks3
#'
######################### main IO #########################
ggepitracks3 <- function(locus_file,
track_conf,
bsseq_conf = NULL,
gene_model,
ann_region = NULL,
ann_color = "#FFC1254D",
out_dir = "out",
height = NULL,
width = 80) {
# check dependencies
.check_windows()
.check_bedtools()
.check_bigWigToBedGraph()
# make output dirs
dir_name <- paste0(out_dir, "/")
system(paste0("mkdir -p ", dir_name, "/tmp"))
# read gene model info
system(
paste0(
"bedtools intersect -a ",
locus_file,
" -b ",
gene_model,
" -wo | cut -f 4-10 >",
dir_name,
"/tmp/gene_model.inter"
)
)
model_info <-
read_tcsv(
paste0(dir_name, "/tmp/gene_model.inter"),
header = F,
col.names = c("locus", "chr", "start", "end", "name", "score", "strand")
)
# read annotation info
if (!is.null(ann_region)) {
system(
paste0(
"bedtools intersect -a ",
locus_file,
" -b ",
ann_region,
" -wo | cut -f 4-8 >",
dir_name,
"/tmp/ann_region.inter"
)
)
mark_info <-
read_tcsv(
paste0(dir_name, "/tmp/ann_region.inter"),
header = F,
col.names = c("locus", "chr", "start", "end", "name")
)
mark_info$start <- mark_info$start + 1
mark_info$end <- mark_info$end + 1
} else {
mark_info <- NULL
}
# read track info
track_info <-
read_tcsv(
track_conf,
header = F,
col.names = c(
"track_name",
"track_type",
"track_file",
"y_min",
"y_max",
"color",
"group"
)
)
# calculate plot height according to num of tracks if not provided
if (is.null(height)) {
height <- 6 * nrow(track_info) + 10
}
# read loci info
loci <-
read_tcsv(
locus_file,
header = F,
col.names = c("chr", "start", "end", "locus")
)
locus_filename <- strsplit(basename(locus_file), "\\.")[[1]][1]
# convert each track into bedgraph format, and intersect with loci to plot
for (track_idx in 1:nrow(track_info)) {
track_name <- track_info[track_idx, "track_name"]
track_locus_interfile <- paste0(dir_name,
"/tmp/",
track_name,
".",
locus_filename,
".inter")
# Here, check whether pre-prepared intersection files exist (from prepEpiData.sh),
# if so, skip the conversion and intersection parts for specific tracks
if (file.exists(track_locus_interfile)) {
message(paste(track_locus_interfile, "exists, skip..."))
next
}
# get track types and files
if (track_info[track_idx, "track_type"] == "BS-seq") {
bsseq_info <- read_tcsv(
bsseq_conf,
header = F,
col.names = c("track_name",
"CG",
"CHG",
"CHH",
"C_context_file")
)
track_idx_in_bsseq <-
which(track_name == bsseq_info$track_name)
if (track_info[track_idx, "track_file"] == "All") {
types <- c("CG", "CHG", "CHH")
} else {
types <- track_info[track_idx, "track_file"]
}
track_files <-
as.character(bsseq_info[track_idx_in_bsseq, types])
} else {
types <- "Cov"
track_files <- track_info[track_idx, "track_file"]
}
# read in track files and do intersection
for (trackfile_idx in 1:length(types)) {
type <- types[trackfile_idx]
track_file <- track_files[trackfile_idx]
if (grepl("\\.(bw|bigWig|bigwig)$", track_file)) {
track_file_bg <- paste0(dir_name,
"/tmp/",
track_name,
".bg")
system(paste0("bigWigToBedGraph ",
track_file,
" ",
track_file_bg))
} else if (grepl("\\.(bg|bdg|bedGraph|bedgraph)$", track_file)) {
track_file_bg <- track_file
} else {
stop(
"File format not supported !! Please provide bigWig (ends with .bw/.bigWig/.bigwig) or bedGraph (ends with .bg/.bdg/.bedGraph/.bedgraph) files ..."
)
}
system(
paste0(
"bedtools intersect -a ",
locus_file,
" -b ",
track_file_bg,
" -wo | cut -f 4-8 | awk -v type=",
type,
" 'BEGIN{OFS=\"\t\"}; {print $1,type,$3,$4,$5}' >>",
dir_name,
"/tmp/",
track_name,
".",
locus_filename,
".inter"
)
)
system(paste0("rm -f ",
dir_name,
"/tmp/",
track_name,
".bg"))
}
}
# loop against each locus and plot tracks
for (locus_idx in 1:nrow(loci)) {
# parse info of this locus
chrom <- loci[locus_idx, 1]
begin <- loci[locus_idx, 2]
end <- loci[locus_idx, 3]
locus_name <- loci[locus_idx, 4]
# select gene model and annotation in this locus
model_info_this_locus <-
model_info[model_info$locus == locus_name, c(2, 3, 4, 5, 6, 7)]
mark_info_this_locus <-
mark_info[mark_info$locus == locus_name, c(2, 3, 4, 5)]
# plot gene model for this locus
p_model <- epiplot_model(model = model_info_this_locus,
start = begin,
end = end)
p_track <- list()
p_track[[1]] <- p_model
# loop against each track and plot for this locus
for (track_idx in 1:nrow(track_info)) {
track_name <- track_info[track_idx, "track_name"]
# read in track data
bgfile <- paste0(dir_name,
"/tmp/",
track_name,
".",
locus_filename,
".inter")
bg <-
read_tcsv(
bgfile,
header = F,
col.names = c("locus", "type", "start", "end", "score")
)
bg_this_locus <-
bg[bg$locus == locus_name, c(2, 3, 4, 5)]
bg_this_locus$start <- bg_this_locus$start + 1
bg_this_locus$end <- bg_this_locus$end + 1
# plot coverage
p_track[[track_idx + 1]] <-
epiplot_track(
bw_df = bg_this_locus,
start = begin,
end = end,
y_min = track_info[track_idx, "y_min"],
y_max = track_info[track_idx, "y_max"],
track_name = track_info[track_idx, "track_name"],
track_color = track_info[track_idx, "color"],
ann_df = mark_info_this_locus,
ann_color = ann_color
)
}
# merge plots
p_merge <-
cowplot::plot_grid(
plotlist = p_track,
nrow = nrow(track_info) + 1,
rel_heights = c(5, rep(3, nrow(track_info))),
align = "v"
)
ggsave(
paste0(dir_name, locus_name, ".pdf"),
p_merge,
height = height,
width = width,
units = "mm"
)
}
}
############################ check dependency ############################
.check_windows <- function() {
if (Sys.info()["sysname"] == "Windows") {
stop("Windows is not supported :(")
}
}
.check_bedtools <- function() {
check <- as.character(Sys.which(names = "bedtools"))[1]
if (check == "") {
stop("Could not locate bedtools !")
}
}
.check_bigWigToBedGraph <- function() {
check <- as.character(Sys.which(names = "bigWigToBedGraph"))[1]
if (check == "") {
stop("Could not locate bigWigToBedGraph !")
}
}
######################### plot tracks for gene model #########################
epiplot_model <- function(model, start, end) {
# parse model info
tmp_model_info <-
model[[4]] %>% stringr::str_split("_") %>% as.data.frame() %>% t() %>% as.data.frame()
model_info <- data.frame()
for (i in 1:nrow(tmp_model_info)) {
tmp <-
data.frame(
Chr = model[i, 1],
Start = model[i, 2] + 1,
End = model[i, 3] + 1,
ID = paste(tmp_model_info[i, 2:ncol(tmp_model_info)], collapse = "_"),
Type = tmp_model_info[i, 1],
Strand = model[i, 6]
)
model_info <- rbind(model_info, tmp)
}
model_info <- model_info[order(model_info$Start), ]
# make draw data
# parse coord
genes_to_plot <- unique(model_info$ID)
model_info$Group <-
unlist(lapply(model_info$ID, function(x)
which(x == genes_to_plot)))
model_info$Start <-
unlist(lapply(model_info$Start, function(x)
max(x, start)))
model_info$End <-
unlist(lapply(model_info$End, function(x)
min(x, end)))
model_info <- model_info[model_info$Start <= model_info$End, ]
# split body and end
model_info_group_first <-
model_info[!duplicated(model_info$Group), ]
model_info_des <-
model_info[order(model_info$Start, decreasing = T), ]
model_info_group_last <-
model_info_des[!duplicated(model_info_des$Group), ]
model_info_group_last <-
model_info_group_last[order(model_info_group_last$Group), ]
model_info_group_length <-
model_info_group_last$End - model_info_group_first$Start
for (group_idx in 1:nrow(model_info_group_first)) {
if (model_info_group_first[group_idx, 6] == "+") {
model_info_group_first[group_idx, ] <-
model_info_group_last[group_idx, ]
}
}
model_info_end <- data.frame()
for (group_idx in 1:nrow(model_info_group_first)) {
if (model_info_group_first[group_idx, "Strand"] == "+") {
x <- c(
model_info_group_first[group_idx, "Start"],
model_info_group_first[group_idx, "Start"],
model_info_group_first[group_idx, "End"],
model_info_group_first[group_idx, "End"] + 0.02 * model_info_group_length[group_idx],
model_info_group_first[group_idx, "End"]
)
} else {
x <- c(
model_info_group_first[group_idx, "End"],
model_info_group_first[group_idx, "End"],
model_info_group_first[group_idx, "Start"],
model_info_group_first[group_idx, "Start"] - 0.02 * model_info_group_length[group_idx],
model_info_group_first[group_idx, "Start"]
)
}
model_info_end <-
rbind(model_info_end,
data.frame(
Type = rep(model_info_group_first[group_idx, "Type"], 5),
X = x,
Y = c(
model_info_group_first[group_idx, "Group"] - 0.4,
model_info_group_first[group_idx, "Group"] - 0.6,
model_info_group_first[group_idx, "Group"] - 0.6,
model_info_group_first[group_idx, "Group"] - 0.5,
model_info_group_first[group_idx, "Group"] - 0.4
),
Group = rep(model_info_group_first[group_idx, "Group"], 5)
))
}
# extract three context
model_info_name <- model_info[!duplicated(model_info$Group), ]
model_info_CDS <- model_info[model_info$Type == "CDS", ]
model_info_intron <- model_info[model_info$Type == "Intron", ]
model_info_intron$Mid <-
rowMeans(model_info_intron[c("Start", "End")])
model_info_UTR <-
model_info[model_info$Type == "5UTR" |
model_info$Type == "3UTR", ]
model_info_end_CDS <-
model_info_end[model_info_end$Type == "CDS", ]
model_info_end_UTR <-
model_info_end[model_info_end$Type == "5UTR" |
model_info_end$Type == "3UTR", ]
#### Note for ggplot2 ####
# In ggplot2, 1 linewidth or size is about 2.141959pt
# (https://blog.csdn.net/weixin_43250801/article/details/130049894),
# so commonly used 0.75pt line can be set as 0.35 linewidth;
# for GBrowse, use 0.5pt for axis is better (set linewidth as 0.233);
# and use 0.3pt for gene border (set linewidth as 0.14);
# 7pt text can be set as 3.27 size (BUT wrong!),
# however, 7pt text should be set as size=7 due to pt in unit;
# in geom_text, size=1 means 3pt (?)
# And, if you want 5 minor ticks in two major ticks,
# you should use ggh4x and set scales::minor_breaks_n(7) (5+2) !
# plot gene models
p_model <-
ggplot() +
geom_text(
data = model_info_name,
mapping = aes(x = Start,
y = Group - 0.2,
label = ID),
color = "black",
size = 2
) + geom_rect(
data = model_info_CDS,
mapping = aes(
xmin = Start,
ymin = Group - 0.6,
xmax = End,
ymax = Group - 0.4
),
color = "black",
fill = "black",
linewidth = 0.14
) + geom_segment(
data = model_info_intron,
mapping = aes(
x = Start,
y = Group - 0.5,
xend = Mid,
yend = Group - 0.4
),
color = "black",
linewidth = 0.14
) + geom_segment(
data = model_info_intron,
mapping = aes(
x = End,
y = Group - 0.5,
xend = Mid,
yend = Group - 0.4
),
color = "black",
linewidth = 0.14
) + geom_rect(
data = model_info_UTR,
mapping = aes(
xmin = Start,
ymin = Group - 0.6,
xmax = End,
ymax = Group - 0.4
),
color = "black",
fill = "grey",
linewidth = 0.14
) + geom_polygon(
data = model_info_end_CDS,
mapping = aes(x = X,
y = Y,
group = Group),
color = "black",
fill = "black",
linewidth = 0.14
) + geom_polygon(
data = model_info_end_UTR,
mapping = aes(x = X,
y = Y,
group = Group),
color = "black",
fill = "grey",
linewidth = 0.14
) + ylab("Genes") +
scale_x_continuous(
limits = c(start, end),
guide = "axis_minor",
n.breaks = 6,
minor_breaks = scales::minor_breaks_n(7),
position = "top",
expand = c(0, 0)
) +
scale_y_continuous(limits = c(0, length(genes_to_plot)),
expand = c(0, 0)) +
theme_classic() + theme(
axis.title.x = element_blank(),
axis.line.x = element_line(
color = "black",
linewidth = 0.35,
lineend = "round"
),
axis.ticks.x = element_line(
color = "black",
linewidth = 0.35,
lineend = "round"
),
axis.ticks.length.x = unit(3, "pt"),
ggh4x.axis.ticks.length.minor = rel(0.5),
axis.text.x = element_text(color = "black", size = 5),
axis.title.y = element_text(
color = "black",
size = 7,
angle = 0,
vjust = 0.5,
hjust = 0.5
),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
plot.background = element_blank(),
plot.margin = margin()
)
# return
p_model
}
######################### plot tracks for epigenetics data #########################
epiplot_track <- function(bw_df,
start,
end,
y_min,
y_max,
track_name,
track_color,
ann_df,
ann_color) {
p <-
ggplot() +
geom_rect(
data = bw_df,
mapping = aes(
xmin = start,
ymin = 0,
xmax = end,
ymax = score,
fill = type
),
color = NA,
) + scale_fill_manual(values = c(
Cov = track_color,
CG = "#FF0000",
CHG = "#0070C0",
CHH = "#00B050"
)) +
geom_hline(
mapping = aes(yintercept = 0),
color = "black",
linewidth = 0.233
) +
ylab(track_name) +
scale_x_continuous(limits = c(start, end),
expand = c(0, 0)) +
scale_y_continuous(
limits = c(y_min, y_max),
breaks = c(y_min, (y_min + y_max) / 2, y_max),
expand = c(0, 0),
sec.axis = dup_axis(name = NULL)
) +
theme_classic() + theme(
axis.title.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(
color = "black",
size = 7,
angle = 0,
vjust = 0.5,
hjust = 0.5
),
axis.line.y = element_line(
color = "black",
linewidth = 0.233,
lineend = "round"
),
axis.ticks.y = element_line(
color = "black",
linewidth = 0.233,
lineend = "round"
),
axis.ticks.length.y = unit(1, "pt"),
axis.text.y = element_text(color = "black", size = 5),
legend.position = "none",
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
plot.background = element_blank(),
plot.margin = margin()
)
if (!is.null(ann_df)) {
p <- p +
geom_rect(
data = ann_df,
mapping = aes(
xmin = start,
ymin = y_min,
xmax = end,
ymax = y_max
),
color = NA,
fill = ann_color,
) + geom_text(
data = ann_df,
mapping = aes(
x = mean(c(start, end)),
y = y_max - 0.2 * (y_max - y_min),
label = name
),
color = "black",
size = 6
)
}
p
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.