tinyfuncr: tiny R functions

Documented in ggepitracks3

#' Plot Epigenetics Tracks for NGS Data using ggplot2 (version 3)
#'
#' @description Plot genome tracks for NGS data using ggplot2, resembling GBrowse style. Supporting coverage data in bigWig or bedGraph format from RNA-seq, sRNA-seq, BS-seq, ChIP-seq, RIP-seq, GRO-seq, NET-seq, DNase-seq, MNase-seq, ATAC-seq, SHAPE-seq etc. Especially optimized for BS-seq data in single-base resolution. Requires `bigWigToBedGraph` and `bedtools` on Linux.
#'
#' @param locus_file BED file containing locus to plot. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <locus_name>, without header.
#' @param track_conf Configuration file containing track info. For each line, TAB-delimited information should contain: <track_name>, <track_type>, <track_bw/bg>, <y_min>, <y_max>, <color>, <group>, without header. NOTE: If you write "BS-seq" in "<track_type>", provide mC type (one of CG, CHG, CHH, All) to plot in "<track_bw/bg>" and provide detailed info in `bsseq_conf`, "<color>" is ignored in BS-seq tracks. "<group>" is currently unused.
#' @param bsseq_conf Configuration file of BS-seq track. If there is no BS-seq track to plot, keep `NULL`. For each line, TAB-delimited information should contain: <track_name>, <CG_bw/bg>, <CHG_bw/bg>, <CHH_bw/bg>, <C_context_file>, without header. "<track_name>" here should be the same as in `track_conf`. If you choose only one mC type to plot, you can keep the column of other files as "NA", but provide all the bigWig files may be safe. "<C_context_file>" can be "NA" to ignore drawing C context track (currently unused).
#' @param gene_model BED file containing all elements of gene and/or TE model, generated by `[tinyfuncr::extractPos]`
#' @param ann_region BED file of annotation regions. If not provided, no annotation will be added. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <annotation_name>, without header. <annotation_name> could be empty but the last TAB should be kept.
#' @param ann_color Color of annotated regions, defalut "#FFC1254D" (i.e., "goldenrod1", alpha 30)
#' @param out_dir Path to output directory, will be created if not exist, default "out/". Output plots in PDF format will be saved as <out_dir>/<locus_name>.pdf
#' @param height Plot height, in "mm", defalut 6*ntracks+10
#' @param width Plot width, in "mm", defalut 80
#'
#' @import magrittr
#' @import ggplot2
#' @import ggh4x
#' @importFrom cowplot plot_grid
#' @importFrom stringr str_split
#' @importFrom scales minor_breaks_n
#'
#' @author Yujie Liu
#' @export ggepitracks3
#'


######################### main IO #########################
ggepitracks3 <- function(locus_file,
                         track_conf,
                         bsseq_conf = NULL,
                         gene_model,
                         ann_region = NULL,
                         ann_color = "#FFC1254D",
                         out_dir = "out",
                         height = NULL,
                         width = 80) {
  # check dependencies
  .check_windows()
  .check_bedtools()
  .check_bigWigToBedGraph()
  
  # make output dirs
  dir_name <- paste0(out_dir, "/")
  system(paste0("mkdir -p ", dir_name, "/tmp"))
  
  # read gene model info
  system(
    paste0(
      "bedtools intersect -a ",
      locus_file,
      " -b ",
      gene_model,
      " -wo | cut -f 4-10 >",
      dir_name,
      "/tmp/gene_model.inter"
    )
  )
  model_info <-
    read_tcsv(
      paste0(dir_name, "/tmp/gene_model.inter"),
      header = F,
      col.names = c("locus", "chr", "start", "end", "name", "score", "strand")
    )
  
  # read annotation info
  if (!is.null(ann_region)) {
    system(
      paste0(
        "bedtools intersect -a ",
        locus_file,
        " -b ",
        ann_region,
        " -wo | cut -f 4-8 >",
        dir_name,
        "/tmp/ann_region.inter"
      )
    )
    mark_info <-
      read_tcsv(
        paste0(dir_name, "/tmp/ann_region.inter"),
        header = F,
        col.names = c("locus", "chr", "start", "end", "name")
      )
    mark_info$start <- mark_info$start + 1
    mark_info$end <- mark_info$end + 1
  } else {
    mark_info <- NULL
  }
  
  # read track info
  track_info <-
    read_tcsv(
      track_conf,
      header = F,
      col.names = c(
        "track_name",
        "track_type",
        "track_file",
        "y_min",
        "y_max",
        "color",
        "group"
      )
    )
  
  # calculate plot height according to num of tracks if not provided
  if (is.null(height)) {
    height <- 6 * nrow(track_info) + 10
  }
  
  # read loci info
  loci <-
    read_tcsv(
      locus_file,
      header = F,
      col.names = c("chr", "start", "end", "locus")
    )
  locus_filename <- strsplit(basename(locus_file), "\\.")[[1]][1]
  
  # convert each track into bedgraph format, and intersect with loci to plot
  for (track_idx in 1:nrow(track_info)) {
    track_name <-  track_info[track_idx, "track_name"]
    track_locus_interfile <- paste0(dir_name,
                                    "/tmp/",
                                    track_name,
                                    ".",
                                    locus_filename,
                                    ".inter")
    # Here, check whether pre-prepared intersection files exist (from prepEpiData.sh),
    # if so, skip the conversion and intersection parts for specific tracks
    if (file.exists(track_locus_interfile)) {
      message(paste(track_locus_interfile, "exists, skip..."))
      next
    }
    
    # get track types and files
    if (track_info[track_idx, "track_type"] == "BS-seq") {
      bsseq_info <- read_tcsv(
        bsseq_conf,
        header = F,
        col.names = c("track_name",
                      "CG",
                      "CHG",
                      "CHH",
                      "C_context_file")
      )
      track_idx_in_bsseq <-
        which(track_name == bsseq_info$track_name)
      if (track_info[track_idx, "track_file"] == "All") {
        types <- c("CG", "CHG", "CHH")
      } else {
        types <- track_info[track_idx, "track_file"]
      }
      track_files <-
        as.character(bsseq_info[track_idx_in_bsseq, types])
    } else {
      types <- "Cov"
      track_files <- track_info[track_idx, "track_file"]
    }
    
    # read in track files and do intersection
    for (trackfile_idx in 1:length(types)) {
      type <- types[trackfile_idx]
      track_file <- track_files[trackfile_idx]
      if (grepl("\\.(bw|bigWig|bigwig)$", track_file)) {
        track_file_bg <- paste0(dir_name,
                                "/tmp/",
                                track_name,
                                ".bg")
        system(paste0("bigWigToBedGraph ",
                      track_file,
                      " ",
                      track_file_bg))
      } else if (grepl("\\.(bg|bdg|bedGraph|bedgraph)$", track_file)) {
        track_file_bg <- track_file
      } else {
        stop(
          "File format not supported !! Please provide bigWig (ends with .bw/.bigWig/.bigwig) or bedGraph (ends with .bg/.bdg/.bedGraph/.bedgraph) files ..."
        )
      }
      system(
        paste0(
          "bedtools intersect -a ",
          locus_file,
          " -b ",
          track_file_bg,
          " -wo | cut -f 4-8 | awk -v type=",
          type,
          " 'BEGIN{OFS=\"\t\"}; {print $1,type,$3,$4,$5}' >>",
          dir_name,
          "/tmp/",
          track_name,
          ".",
          locus_filename,
          ".inter"
        )
      )
      system(paste0("rm -f ",
                    dir_name,
                    "/tmp/",
                    track_name,
                    ".bg"))
    }
  }
  
  # loop against each locus and plot tracks
  for (locus_idx in 1:nrow(loci)) {
    # parse info of this locus
    chrom <- loci[locus_idx, 1]
    begin <- loci[locus_idx, 2]
    end <- loci[locus_idx, 3]
    locus_name <- loci[locus_idx, 4]
    
    # select gene model and annotation in this locus
    model_info_this_locus <-
      model_info[model_info$locus == locus_name, c(2, 3, 4, 5, 6, 7)]
    mark_info_this_locus <-
      mark_info[mark_info$locus == locus_name, c(2, 3, 4, 5)]
    
    # plot gene model for this locus
    p_model <- epiplot_model(model = model_info_this_locus,
                             start = begin,
                             end = end)
    p_track <- list()
    p_track[[1]] <- p_model
    
    # loop against each track and plot for this locus
    for (track_idx in 1:nrow(track_info)) {
      track_name <-  track_info[track_idx, "track_name"]
      # read in track data
      bgfile <- paste0(dir_name,
                       "/tmp/",
                       track_name,
                       ".",
                       locus_filename,
                       ".inter")
      bg <-
        read_tcsv(
          bgfile,
          header = F,
          col.names = c("locus", "type", "start", "end", "score")
        )
      bg_this_locus <-
        bg[bg$locus == locus_name, c(2, 3, 4, 5)]
      bg_this_locus$start <- bg_this_locus$start + 1
      bg_this_locus$end <- bg_this_locus$end + 1
      
      # plot coverage
      p_track[[track_idx + 1]] <-
        epiplot_track(
          bw_df = bg_this_locus,
          start = begin,
          end = end,
          y_min = track_info[track_idx, "y_min"],
          y_max = track_info[track_idx, "y_max"],
          track_name = track_info[track_idx, "track_name"],
          track_color = track_info[track_idx, "color"],
          ann_df = mark_info_this_locus,
          ann_color = ann_color
        )
    }
    # merge plots
    p_merge <-
      cowplot::plot_grid(
        plotlist = p_track,
        nrow = nrow(track_info) + 1,
        rel_heights = c(5, rep(3, nrow(track_info))),
        align = "v"
      )
    ggsave(
      paste0(dir_name, locus_name, ".pdf"),
      p_merge,
      height = height,
      width = width,
      units = "mm"
    )
  }
}


############################ check dependency ############################
.check_windows <- function() {
  if (Sys.info()["sysname"] == "Windows") {
    stop("Windows is not supported :(")
  }
}

.check_bedtools <- function() {
  check <- as.character(Sys.which(names = "bedtools"))[1]
  if (check == "") {
    stop("Could not locate bedtools !")
  }
}

.check_bigWigToBedGraph <- function() {
  check <- as.character(Sys.which(names = "bigWigToBedGraph"))[1]
  if (check == "") {
    stop("Could not locate bigWigToBedGraph !")
  }
}


######################### plot tracks for gene model #########################
epiplot_model <- function(model, start, end) {
  # parse model info
  tmp_model_info <-
    model[[4]] %>% stringr::str_split("_") %>% as.data.frame() %>% t() %>% as.data.frame()
  model_info <- data.frame()
  for (i in 1:nrow(tmp_model_info)) {
    tmp <-
      data.frame(
        Chr = model[i, 1],
        Start = model[i, 2] + 1,
        End = model[i, 3] + 1,
        ID = paste(tmp_model_info[i, 2:ncol(tmp_model_info)], collapse = "_"),
        Type = tmp_model_info[i, 1],
        Strand = model[i, 6]
      )
    model_info <- rbind(model_info, tmp)
  }
  model_info <- model_info[order(model_info$Start), ]
  
  # make draw data
  
  # parse coord
  genes_to_plot <- unique(model_info$ID)
  model_info$Group <-
    unlist(lapply(model_info$ID, function(x)
      which(x == genes_to_plot)))
  model_info$Start <-
    unlist(lapply(model_info$Start, function(x)
      max(x, start)))
  model_info$End <-
    unlist(lapply(model_info$End, function(x)
      min(x, end)))
  model_info <- model_info[model_info$Start <= model_info$End, ]
  
  # split body and end
  model_info_group_first <-
    model_info[!duplicated(model_info$Group), ]
  model_info_des <-
    model_info[order(model_info$Start, decreasing = T), ]
  model_info_group_last <-
    model_info_des[!duplicated(model_info_des$Group), ]
  model_info_group_last <-
    model_info_group_last[order(model_info_group_last$Group), ]
  model_info_group_length <-
    model_info_group_last$End - model_info_group_first$Start
  
  for (group_idx in 1:nrow(model_info_group_first)) {
    if (model_info_group_first[group_idx, 6] == "+") {
      model_info_group_first[group_idx, ] <-
        model_info_group_last[group_idx, ]
    }
  }
  model_info_end <- data.frame()
  for (group_idx in 1:nrow(model_info_group_first)) {
    if (model_info_group_first[group_idx, "Strand"] == "+") {
      x <- c(
        model_info_group_first[group_idx, "Start"],
        model_info_group_first[group_idx, "Start"],
        model_info_group_first[group_idx, "End"],
        model_info_group_first[group_idx, "End"] + 0.02 * model_info_group_length[group_idx],
        model_info_group_first[group_idx, "End"]
      )
    } else {
      x <- c(
        model_info_group_first[group_idx, "End"],
        model_info_group_first[group_idx, "End"],
        model_info_group_first[group_idx, "Start"],
        model_info_group_first[group_idx, "Start"] - 0.02 * model_info_group_length[group_idx],
        model_info_group_first[group_idx, "Start"]
      )
    }
    model_info_end <-
      rbind(model_info_end,
            data.frame(
              Type = rep(model_info_group_first[group_idx, "Type"], 5),
              X = x,
              Y = c(
                model_info_group_first[group_idx, "Group"] - 0.4,
                model_info_group_first[group_idx, "Group"] - 0.6,
                model_info_group_first[group_idx, "Group"] - 0.6,
                model_info_group_first[group_idx, "Group"] - 0.5,
                model_info_group_first[group_idx, "Group"] - 0.4
              ),
              Group = rep(model_info_group_first[group_idx, "Group"], 5)
            ))
  }
  
  # extract three context
  model_info_name <- model_info[!duplicated(model_info$Group), ]
  model_info_CDS <- model_info[model_info$Type == "CDS", ]
  model_info_intron <- model_info[model_info$Type == "Intron", ]
  model_info_intron$Mid <-
    rowMeans(model_info_intron[c("Start", "End")])
  model_info_UTR <-
    model_info[model_info$Type == "5UTR" |
                 model_info$Type == "3UTR", ]
  model_info_end_CDS <-
    model_info_end[model_info_end$Type == "CDS", ]
  model_info_end_UTR <-
    model_info_end[model_info_end$Type == "5UTR" |
                     model_info_end$Type == "3UTR", ]
  
  #### Note for ggplot2 ####
  # In ggplot2, 1 linewidth or size is about 2.141959pt
  # (https://blog.csdn.net/weixin_43250801/article/details/130049894),
  # so commonly used 0.75pt line can be set as 0.35 linewidth;
  # for GBrowse, use 0.5pt for axis is better (set linewidth as 0.233);
  # and use 0.3pt for gene border (set linewidth as 0.14);
  # 7pt text can be set as 3.27 size (BUT wrong!),
  # however, 7pt text should be set as size=7 due to pt in unit;
  # in geom_text, size=1 means 3pt (?)
  # And, if you want 5 minor ticks in two major ticks,
  # you should use ggh4x and set scales::minor_breaks_n(7) (5+2) !
  
  # plot gene models
  p_model <-
    ggplot() +
    geom_text(
      data = model_info_name,
      mapping = aes(x = Start,
                    y = Group - 0.2,
                    label = ID),
      color = "black",
      size = 2
    ) + geom_rect(
      data = model_info_CDS,
      mapping = aes(
        xmin = Start,
        ymin = Group - 0.6,
        xmax = End,
        ymax = Group - 0.4
      ),
      color = "black",
      fill = "black",
      linewidth = 0.14
    ) + geom_segment(
      data = model_info_intron,
      mapping = aes(
        x = Start,
        y = Group - 0.5,
        xend = Mid,
        yend = Group - 0.4
      ),
      color = "black",
      linewidth = 0.14
    ) + geom_segment(
      data = model_info_intron,
      mapping = aes(
        x = End,
        y = Group - 0.5,
        xend = Mid,
        yend = Group - 0.4
      ),
      color = "black",
      linewidth = 0.14
    ) + geom_rect(
      data = model_info_UTR,
      mapping = aes(
        xmin = Start,
        ymin = Group - 0.6,
        xmax = End,
        ymax = Group - 0.4
      ),
      color = "black",
      fill = "grey",
      linewidth = 0.14
    ) + geom_polygon(
      data = model_info_end_CDS,
      mapping = aes(x = X,
                    y = Y,
                    group = Group),
      color = "black",
      fill = "black",
      linewidth = 0.14
    ) + geom_polygon(
      data = model_info_end_UTR,
      mapping = aes(x = X,
                    y = Y,
                    group = Group),
      color = "black",
      fill = "grey",
      linewidth = 0.14
    ) + ylab("Genes") +
    scale_x_continuous(
      limits = c(start, end),
      guide = "axis_minor",
      n.breaks = 6,
      minor_breaks = scales::minor_breaks_n(7),
      position = "top",
      expand = c(0, 0)
    ) +
    scale_y_continuous(limits = c(0, length(genes_to_plot)),
                       expand = c(0, 0)) +
    theme_classic() + theme(
      axis.title.x = element_blank(),
      axis.line.x = element_line(
        color = "black",
        linewidth = 0.35,
        lineend = "round"
      ),
      axis.ticks.x = element_line(
        color = "black",
        linewidth = 0.35,
        lineend = "round"
      ),
      axis.ticks.length.x = unit(3, "pt"),
      ggh4x.axis.ticks.length.minor = rel(0.5),
      axis.text.x = element_text(color = "black", size = 5),
      axis.title.y = element_text(
        color = "black",
        size = 7,
        angle = 0,
        vjust = 0.5,
        hjust = 0.5
      ),
      axis.line.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.text.y =  element_blank(),
      panel.grid = element_blank(),
      panel.background = element_blank(),
      panel.border = element_blank(),
      plot.background = element_blank(),
      plot.margin = margin()
    )
  
  # return
  p_model
}


######################### plot tracks for epigenetics data #########################
epiplot_track <- function(bw_df,
                          start,
                          end,
                          y_min,
                          y_max,
                          track_name,
                          track_color,
                          ann_df,
                          ann_color) {
  p <-
    ggplot() +
    geom_rect(
      data = bw_df,
      mapping = aes(
        xmin = start,
        ymin = 0,
        xmax = end,
        ymax = score,
        fill = type
      ),
      color = NA,
    ) + scale_fill_manual(values = c(
      Cov = track_color,
      CG = "#FF0000",
      CHG = "#0070C0",
      CHH = "#00B050"
    )) +
    geom_hline(
      mapping = aes(yintercept = 0),
      color = "black",
      linewidth = 0.233
    ) +
    ylab(track_name) +
    scale_x_continuous(limits = c(start, end),
                       expand = c(0, 0)) +
    scale_y_continuous(
      limits = c(y_min, y_max),
      breaks = c(y_min, (y_min + y_max) / 2, y_max),
      expand = c(0, 0),
      sec.axis = dup_axis(name = NULL)
    ) +
    theme_classic() + theme(
      axis.title.x = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.x = element_blank(),
      axis.title.y = element_text(
        color = "black",
        size = 7,
        angle = 0,
        vjust = 0.5,
        hjust = 0.5
      ),
      axis.line.y = element_line(
        color = "black",
        linewidth = 0.233,
        lineend = "round"
      ),
      axis.ticks.y = element_line(
        color = "black",
        linewidth = 0.233,
        lineend = "round"
      ),
      axis.ticks.length.y = unit(1, "pt"),
      axis.text.y = element_text(color = "black", size = 5),
      legend.position = "none",
      panel.grid = element_blank(),
      panel.background = element_blank(),
      panel.border = element_blank(),
      plot.background = element_blank(),
      plot.margin = margin()
    )
  
  if (!is.null(ann_df)) {
    p <- p +
      geom_rect(
        data = ann_df,
        mapping = aes(
          xmin = start,
          ymin = y_min,
          xmax = end,
          ymax = y_max
        ),
        color = NA,
        fill = ann_color,
      ) + geom_text(
        data = ann_df,
        mapping = aes(
          x = mean(c(start, end)),
          y = y_max - 0.2 * (y_max - y_min),
          label = name
        ),
        color = "black",
        size = 6
      )
  }
  
  p
}