tinyfuncr: tiny R functions

Documented in ggepitracks

#' Plot Epigenetics Tracks for NGS Data using ggplot2
#'
#' @description Plot genome tracks for NGS data using ggplot2, resembling GBrowse/JBrowse style. Supporting coverage data in bigWig format from RNA-seq, sRNA-seq, BS-seq, ChIP-seq, RIP-seq, GRO-seq, NET-seq, DNase-seq, MNase-seq, ATAC-seq, SHAPE-seq etc. Especially optimized for BS-seq data in single-base resolution. Requires `bedtools` on Linux.
#'
#' @param locus_file BED file containing locus to plot. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <locus_name>, without header.
#' @param track_conf configuration file containing track info. For each line, TAB-delimited information should contain: <track_name>, <track_type>, <track_file>, <y_min>, <y_max>, <color>, <group>, without header. NOTE: If you write "BS-seq" in "<track_type>", provide mC type (one of CG, CHG, CHH, All) to plot in "<path_to_track_file>" and provide detailed info in `bsseq_conf`, "<color>" is currently ignored in multi-mC mode of BS-seq.
#' @param bsseq_conf configuration file of BS-seq track. If no BS-seq track plotted, keep `NULL`. For each line, TAB-delimited information should contain: <track_name>, <CG_file>, <CHG_file>, <CHH_file>, <C_context_file>, without header. "<track_name>" here should be the same as in `track_conf`. If you choose only one mC type to plot, you can keep the column of other file as "NA", but provide all the file may be used has no side-effect. "<C_context_file>" can be "NA" to avoid drawing C context track.
#' @param gene_model BED file containing all elements of gene and/or TE model, generated by `[tinyfuncr::extractPos]`
#' @param ann_region BED file of annotation regions. If not provided, no annotation will be added. For each line, TAB-delimited information should contain: <chr>, <start>, <end>, <annotation_name>, without header. <annotation_name> could be empty but the last TAB should be kept.
#' @param ann_color color of annotated regions
#' @param out_dir path to output directory, create it if not exist
#'
#' @import magrittr
#' @import ggplot2
#' @importFrom rtracklayer import.bw
#' @importFrom GenomicRanges split seqnames restrict
#' @importFrom cowplot plot_grid
#' @importFrom stringr str_split
#'
#' @author Yujie Liu
#' @export ggepitracks
#'


######################### main IO #########################
ggepitracks <- function(locus_file,
                        track_conf,
                        bsseq_conf = NULL,
                        gene_model,
                        ann_region = NULL,
                        ann_color = "goldenrod1",
                        out_dir = "out") {
  # check dependencies
  .check_windows()
  .check_bedtools()

  # make output dirs
  dir_name <- paste0(out_dir, "/")
  system(paste0("mkdir -p ", dir_name))

  # read in and parse track info
  track_info <- read_tcsv(track_conf, header = F)
  colnames(track_info) <-
    c("track_name",
      "track_type",
      "track_file",
      "y_min",
      "y_max",
      "color",
      "group")

  # read loci info
  loci <- read_tcsv(locus_file, header = F)
  colnames(loci) <- c("chr", "start", "end", "name")

  # loop against each locus and plot tracks
  for (locus_idx in 1:nrow(loci)) {
    # parse info of this locus and make granges
    chrom <- loci[locus_idx, 1]
    begin <- loci[locus_idx, 2]
    end <- loci[locus_idx, 3]
    locus_name <- loci[locus_idx, 4]

    write_tcsv(
      data.frame(chrom, as.character(begin), as.character(end)),
      file = "tmp_locus.bed",
      col.names = F
    )

    # extract annotation region within this locus
    if (!is.null(ann_region)) {
      system(
        paste(
          "bedtools intersect -a tmp_locus.bed -b",
          ann_region,
          "-wb | cut -f 4- > tmp_ann.bed"
        )
      )
      this_mark_region <-
        read_tcsv("tmp_ann.bed", header = F)
      colnames(this_mark_region) <- c("chr", "start", "end", "name")
      this_mark_region$start <- this_mark_region$start + 1
      this_mark_region$end <- this_mark_region$end + 1
    } else {
      this_mark_region <- NULL
    }

    # extract gene model within this locus
    system(
      paste(
        "bedtools intersect -a tmp_locus.bed -b",
        gene_model,
        "-wb | cut -f 4- > tmp_model.bed"
      )
    )
    this_locus_model <-
      read_tcsv("tmp_model.bed", header = F)

    # plot gene model for this locus
    p_model <- epiplot_model(
      model = this_locus_model,
      start = begin,
      end = end
    )
    p_track <- list()
    p_track[[1]] <- p_model

    # loop against each track and plot for this locus
    for (track_idx in 1:nrow(track_info)) {
      if (track_info[track_idx, "track_type"] == "BS-seq") {
        bsseq_info <- read_tcsv(bsseq_conf, header = F)
        colnames(bsseq_info) <-
          c("track_name",
            "CG",
            "CHG",
            "CHH",
            "C_context_file")
        track_idx_in_bsseq <-
          which(track_info[track_idx, "track_name"] == bsseq_info$track_name)
        if (track_info[track_idx, "track_file"] != "All") {
          # read in track file
          bw <-
            rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, track_info[track_idx, "track_file"]])
          bw_chr <-
            GenomicRanges::split(bw, GenomicRanges::seqnames(bw))
          bw_this_locus <-
            as.data.frame(GenomicRanges::restrict(
              x = bw_chr[[chrom]],
              start = begin,
              end = end
            ))[c(1, 2, 3, 6)]
          bw_this_locus$end <- bw_this_locus$end + 1

          # plot bsseq tracks
          p_track[[track_idx + 1]] <-
            epiplot_bsseq(
              bw_df = bw_this_locus,
              start = begin,
              end = end,
              y_min = track_info[track_idx, "y_min"],
              y_max = track_info[track_idx, "y_max"],
              track_name = track_info[track_idx, "track_name"],
              track_color = track_info[track_idx, "color"],
              ann_df = this_mark_region,
              ann_color = ann_color
            )

          # merge plot
          #if (track_idx == 1) {
          #  p_merge <-
          #    cowplot::plot_grid(
          #      p_model,
          #      p_track,
          #      nrow = 2,
          #      rel_heights = c(1, 3),
          #      align = "h"
          #    )
          #} else {
          #  p_merge <-
          #    cowplot::plot_grid(
          #      p_merge,
          #      p_track,
          #      nrow = 2,
          #      rel_heights = c(1 + 3 * (track_idx - 1), 3),
          #      align = "h"
          #    )
          #}
        } else {
          # CG
          bw_CG <-
            rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, "CG"])
          bw_CG_chr <-
            GenomicRanges::split(bw_CG, GenomicRanges::seqnames(bw_CG))
          bw_CG_this_locus <-
            as.data.frame(GenomicRanges::restrict(
              x = bw_CG_chr[[chrom]],
              start = begin,
              end = end
            ))[c(1, 2, 3, 6)]
          bw_CG_this_locus$end <- bw_CG_this_locus$end + 1

          # CHG
          bw_CHG <-
            rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, "CHG"])
          bw_CHG_chr <-
            GenomicRanges::split(bw_CHG, GenomicRanges::seqnames(bw_CHG))
          bw_CHG_this_locus <-
            as.data.frame(GenomicRanges::restrict(
              x = bw_CHG_chr[[chrom]],
              start = begin,
              end = end
            ))[c(1, 2, 3, 6)]
          bw_CHG_this_locus$end <- bw_CHG_this_locus$end + 1

          # CHH
          bw_CHH <-
            rtracklayer::import.bw(bsseq_info[track_idx_in_bsseq, "CHH"])
          bw_CHH_chr <-
            GenomicRanges::split(bw_CHH, GenomicRanges::seqnames(bw_CHH))
          bw_CHH_this_locus <-
            as.data.frame(GenomicRanges::restrict(
              x = bw_CHH_chr[[chrom]],
              start = begin,
              end = end
            ))[c(1, 2, 3, 6)]
          bw_CHH_this_locus$end <- bw_CHH_this_locus$end + 1

          # plot multi C type bsseq
          p_track[[track_idx + 1]] <-
            epiplot_bsseq_multi(
              CG_df = bw_CG_this_locus,
              CHG_df = bw_CHG_this_locus,
              CHH_df = bw_CHH_this_locus,
              start = begin,
              end = end,
              y_min = track_info[track_idx, "y_min"],
              y_max = track_info[track_idx, "y_max"],
              track_name = track_info[track_idx, "track_name"],
              ann_df = this_mark_region,
              ann_color = ann_color
            )

          # merge plot
          #if (track_idx == 1) {
          #  p_merge <-
          #    cowplot::plot_grid(
          #      p_model,
          #      p_track,
          #      nrow = 2,
          #      rel_heights = c(2, 3),
          #      align = "h"
          #    )
          #} else {
          #  p_merge <-
          #    cowplot::plot_grid(
          #      p_merge,
          #      p_track,
          #      nrow = 2,
          #      rel_heights = c(2 + 3 * (track_idx - 1), 3),
          #      align = "h"
          #    )
          #}
        }
      } else{
        # read in track file for non BS-seq data
        bw <-
          rtracklayer::import.bw(track_info[track_idx, "track_file"])
        bw_chr <-
          GenomicRanges::split(bw, GenomicRanges::seqnames(bw))
        bw_this_locus <-
          as.data.frame(GenomicRanges::restrict(
            x = bw_chr[[chrom]],
            start = begin,
            end = end
          ))[c(1, 2, 3, 6)]
        bw_this_locus$end <- bw_this_locus$end + 1

        # plot coverage
        p_track[[track_idx + 1]] <-
          epiplot_cov(
            bw_df = bw_this_locus,
            start = begin,
            end = end,
            y_min = track_info[track_idx, "y_min"],
            y_max = track_info[track_idx, "y_max"],
            track_name = track_info[track_idx, "track_name"],
            track_color = track_info[track_idx, "color"],
            ann_df = this_mark_region,
            ann_color = ann_color
          )

        # merge plot
        #if (track_idx == 1) {
        #  p_merge <-
        #    cowplot::plot_grid(
        #      p_model,
        #      p_track,
        #      nrow = 2,
        #      rel_heights = c(1, 3),
        #      align = "h"
        #    )
        #} else {
        #  p_merge <-
        #    cowplot::plot_grid(
        #      p_merge,
        #      p_track,
        #      nrow = 2,
        #      rel_heights = c(1 + 3 * (track_idx - 1), 3),
        #      align = "h"
        #    )
        #}
      }
    }

    # merge plots
    p_merge <-
      cowplot::plot_grid(
        plotlist = p_track,
        nrow = 1 + nrow(track_info),
        rel_heights = c(3, rep(3, nrow(track_info))),
        align = "h"
      )
    ggsave(
      paste0(dir_name, locus_name, ".pdf"),
      p_merge,
      width = 6,
      height = 4
    )
  }
}


############################ check dependency ############################
.check_windows <- function() {
  if (Sys.info()["sysname"] == "Windows") {
    stop("Windows is not supported :(")
  }
}

.check_bedtools <- function(warn = TRUE) {
  check <- as.character(Sys.which(names = 'bedtools'))[1]
  if (check != "") {
    if (warn) {
      message("Checking for bedtools installation")
      message(paste0("    All good! Found bedtools at: ", check))
    } else{
      return(invisible(0))
    }
  } else{
    stop("Could not locate bedtools !")
  }
}


######################### plot tracks for gene model #########################
epiplot_model <- function(model, start, end) {
  # parse model info
  tmp_model_info <-
    model[[4]] %>% stringr::str_split("_") %>% as.data.frame() %>% t() %>% as.data.frame()
  model_info <- data.frame()
  for (i in 1:nrow(tmp_model_info)) {
    tmp <-
      data.frame(
        Chr = model[i, 1],
        Start = model[i, 2] + 1,
        End = model[i, 3] + 1,
        ID = paste(tmp_model_info[i, 2:ncol(tmp_model_info)], collapse = "_"),
        Type = tmp_model_info[i, 1],
        Strand = model[i, 6]
      )
    model_info <- rbind(model_info, tmp)
  }
  model_info <- model_info[order(model_info$Start),]

  # make draw data

  # parse coord
  genes_to_plot <- unique(model_info$ID)
  model_info$Group <-
    unlist(lapply(model_info$ID, function(x)
      which(x == genes_to_plot)))
  model_info$Start <-
    unlist(lapply(model_info$Start, function(x)
      max(x, start)))
  model_info$End <-
    unlist(lapply(model_info$End, function(x)
      min(x, end)))
  model_info <- model_info[model_info$Start <= model_info$End, ]

  # split body and end
  model_info_group_first <-
    model_info[!duplicated(model_info$Group), ]
  model_info_des <-
    model_info[order(model_info$Start, decreasing = T),]
  model_info_group_last <-
    model_info_des[!duplicated(model_info_des$Group), ]
  model_info_group_last <-
    model_info_group_last[order(model_info_group_last$Group), ]
  model_info_group_length <-
    model_info_group_last$End - model_info_group_first$Start

  for (group_idx in 1:nrow(model_info_group_first)) {
    if (model_info_group_first[group_idx, 6] == "+") {
      model_info_group_first[group_idx,] <-
        model_info_group_last[group_idx,]
    }
  }
  model_info_end <- data.frame()
  for (group_idx in 1:nrow(model_info_group_first)) {
    if (model_info_group_first[group_idx, "Strand"] == "+") {
      x <- c(
        model_info_group_first[group_idx, "Start"],
        model_info_group_first[group_idx, "Start"],
        model_info_group_first[group_idx, "End"],
        model_info_group_first[group_idx, "End"] + 0.02 * model_info_group_length[group_idx],
        model_info_group_first[group_idx, "End"]
      )
    } else {
      x <- c(
        model_info_group_first[group_idx, "End"],
        model_info_group_first[group_idx, "End"],
        model_info_group_first[group_idx, "Start"],
        model_info_group_first[group_idx, "Start"] - 0.02 * model_info_group_length[group_idx],
        model_info_group_first[group_idx, "Start"]
      )
    }
    model_info_end <-
      rbind(model_info_end,
            data.frame(
              Type = rep(model_info_group_first[group_idx, "Type"], 5),
              X = x,
              Y = c(
                model_info_group_first[group_idx, "Group"] - 0.4,
                model_info_group_first[group_idx, "Group"] - 0.6,
                model_info_group_first[group_idx, "Group"] - 0.6,
                model_info_group_first[group_idx, "Group"] - 0.5,
                model_info_group_first[group_idx, "Group"] - 0.4
              ),
              Group = rep(model_info_group_first[group_idx, "Group"], 5)
            ))
  }

  # extract three context
  model_info_name <- model_info[!duplicated(model_info$Group), ]
  model_info_CDS <- model_info[model_info$Type == "CDS", ]
  model_info_intron <- model_info[model_info$Type == "Intron",]
  model_info_intron$Mid <-
    rowMeans(model_info_intron[c("Start", "End")])
  model_info_UTR <-
    model_info[model_info$Type == "5UTR" |
                 model_info$Type == "3UTR", ]
  model_info_end_CDS <-
    model_info_end[model_info_end$Type == "CDS", ]
  model_info_end_UTR <-
    model_info_end[model_info_end$Type == "5UTR" |
                     model_info_end$Type == "3UTR", ]

  # plot
  p_model <-
    ggplot() +
    scale_x_continuous(
      limits = c(start, end),
      position = "top",
      expand = c(0, 0)
    ) +
    scale_y_continuous(
      limits = c(0, length(genes_to_plot)),
      breaks = c(0, 0.5, length(genes_to_plot)),
      expand = c(0, 0)
    ) +
    theme_classic() + theme(
      axis.title.x = element_blank(),
      axis.line.x = element_line(color = "black", linewidth = 0.6),
      axis.ticks.x = element_line(color = "black", linewidth = 0.4),
      axis.text.x = element_text(color = "black"),
      panel.grid = element_blank(),
      axis.line.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.length.y = unit(4, "pt"),
      axis.text.y = element_text(color = "black")
    ) + ylab("Gene") + geom_text(
      data = model_info_name,
      mapping = aes(x = Start,
                    y = Group - 0.2,
                    label = ID),
      color = "black",
      size = 3
    ) + geom_rect(
      data = model_info_CDS,
      mapping = aes(
        xmin = Start,
        ymin = Group - 0.6,
        xmax = End,
        ymax = Group - 0.4
      ),
      color = "black",
      fill = "black"
    ) + geom_segment(
      data = model_info_intron,
      mapping = aes(
        x = Start,
        y = Group - 0.5,
        xend = Mid,
        yend = Group - 0.4
      ),
      color = "black",
      linewidth = 0.5
    ) + geom_segment(
      data = model_info_intron,
      mapping = aes(
        x = End,
        y = Group - 0.5,
        xend = Mid,
        yend = Group - 0.4
      ),
      color = "black",
      linewidth = 0.5
    ) + geom_rect(
      data = model_info_UTR,
      mapping = aes(
        xmin = Start,
        ymin = Group - 0.6,
        xmax = End,
        ymax = Group - 0.4
      ),
      color = "black",
      fill = "grey"
    ) + geom_polygon(
      data = model_info_end_CDS,
      mapping = aes(x = X,
                    y = Y,
                    group = Group),
      color = "black",
      fill = "black"
    ) + geom_polygon(
      data = model_info_end_UTR,
      mapping = aes(x = X,
                    y = Y,
                    group = Group),
      color = "black",
      fill = "grey"
    )

  # return
  p_model
}


######################### plot tracks for coverage data #########################
epiplot_cov <- function(bw_df,
                        start,
                        end,
                        y_min,
                        y_max,
                        track_name,
                        track_color,
                        ann_df = NULL,
                        ann_color = "goldenrod1") {
  p <-
    ggplot() +
    scale_x_continuous(limits = c(start, end),
                       expand = c(0, 0)) +
    scale_y_continuous(
      limits = c(y_min, y_max),
      breaks = c(-1, -0.5, 0, 0.5, 1),
      expand = c(0, 0)
    ) +
    theme_classic() + theme(
      axis.title.x = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.x = element_blank(),
      panel.grid = element_blank(),
      axis.line.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.length.y = unit(4, "pt"),
      axis.text.y = element_text(color = "black")
    )  + ylab(track_name) + geom_hline(mapping = aes(yintercept = 0), linewidth = 0.6) + geom_rect(
      data = bw_df,
      mapping = aes(
        xmin = start,
        ymin = 0,
        xmax = end,
        ymax = score
      ),
      color = NA,
      fill = track_color
    )

  if (!is.null(ann_df)) {
    p <- p +
      geom_rect(
        data = ann_df,
        mapping = aes(
          xmin = start,
          ymin = y_min,
          xmax = end,
          ymax = y_max
        ),
        color = NA,
        fill = ann_color,
        alpha = 0.3
      ) + geom_text(
        data = ann_df,
        mapping = aes(
          x = mean(c(start, end)),
          y = y_max - 0.2 * (y_max - y_min),
          label = name
        ),
        color = "black"
      )
  }

  p
}


######################### plot tracks BS-seq data #########################
epiplot_bsseq <- function(bw_df,
                          start,
                          end,
                          y_min,
                          y_max,
                          track_name,
                          track_color,
                          ann_df = NULL,
                          ann_color = "goldenrod1") {
  p <-
    ggplot() +
    scale_x_continuous(limits = c(start, end),
                       expand = c(0, 0)) +
    scale_y_continuous(
      limits = c(y_min, y_max),
      breaks = c(-1, -0.5, 0, 0.5, 1),
      expand = c(0, 0)
    ) +
    theme_classic() + theme(
      axis.title.x = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.x = element_blank(),
      panel.grid = element_blank(),
      axis.line.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.length.y = unit(4, "pt"),
      axis.text.y = element_text(color = "black")
    )  + ylab(track_name) + geom_hline(mapping = aes(yintercept = 0), linewidth = 0.6) + geom_rect(
      data = bw_df,
      mapping = aes(
        xmin = start,
        xmax = end,
        ymin = 0,
        ymax = score
      ),
      color = NA,
      fill = track_color
    )

  if (!is.null(ann_df)) {
    p <- p +
      geom_rect(
        data = ann_df,
        mapping = aes(
          xmin = start,
          ymin = y_min,
          xmax = end,
          ymax = y_max
        ),
        color = NA,
        fill = ann_color,
        alpha = 0.3
      ) + geom_text(
        data = ann_df,
        mapping = aes(
          x = mean(c(start, end)),
          y = y_max - 0.2 * (y_max - y_min),
          label = name
        ),
        color = "black"
      )
  }

  p
}


################# plot tracks BS-seq data (multi C type) #################
epiplot_bsseq_multi <- function(CG_df,
                                CHG_df,
                                CHH_df,
                                start,
                                end,
                                y_min,
                                y_max,
                                track_name,
                                ann_df = NULL,
                                ann_color = "goldenrod1") {
  p <-
    ggplot() +
    scale_x_continuous(limits = c(start, end),
                       expand = c(0, 0)) +
    scale_y_continuous(
      limits = c(y_min, y_max),
      breaks = c(-1, -0.5, 0, 0.5, 1),
      expand = c(0, 0)
    ) +
    theme_classic() + theme(
      axis.title.x = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.x = element_blank(),
      panel.grid = element_blank(),
      axis.line.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.y = element_line(color = "black", linewidth = 0.6),
      axis.ticks.length.y = unit(4, "pt"),
      axis.text.y = element_text(color = "black")
    ) + ylab(track_name) + geom_hline(mapping = aes(yintercept = 0), linewidth = 0.6) + geom_rect(
      data = CG_df,
      mapping = aes(
        xmin = start,
        xmax = end,
        ymin = 0,
        ymax = score
      ),
      color = NA,
      fill = "#FF0000",
      alpha = 1
    ) + geom_rect(
      data = CHG_df,
      mapping = aes(
        xmin = start,
        xmax = end,
        ymin = 0,
        ymax = score
      ),
      color = NA,
      fill = "#0070C0",
      alpha = 1
    ) + geom_rect(
      data = CHH_df,
      mapping = aes(
        xmin = start,
        xmax = end,
        ymin = 0,
        ymax = score
      ),
      color = NA,
      fill = "#00B050",
      alpha = 1
    )

  if (!is.null(ann_df)) {
    p <- p +
      geom_rect(
        data = ann_df,
        mapping = aes(
          xmin = start,
          ymin = y_min,
          xmax = end,
          ymax = y_max
        ),
        color = NA,
        fill = ann_color,
        alpha = 0.3
      ) + geom_text(
        data = ann_df,
        mapping = aes(
          x = mean(c(start, end)),
          y = y_max - 0.2 * (y_max - y_min),
          label = name
        ),
        color = "black"
      )
  }

  p
}