R/genometracks.R

Defines functions FUN .gen_windows track_extract .check_datatable .check_bwtool .check_bedtools .check_windows genometracks

Documented in genometracks

#' Plot Genome Tracks for NGS Data
#'
#' @description Plot genome tracks for NGS data, resembling GBrowse/JBrowse style. Supporting coverage data in bigWig format from RNA-seq, sRNA-seq, BS-seq, ChIP-seq, RIP-seq, GRO-seq, NET-seq, DNase-seq, MNase-seq, ATAC-seq, SHAPE-seq etc. Annotation should be provided in GTF format. Some codes refers to https://github.com/PoisonAlien/trackplot and `genomemodel` package with some major modifications. Requires `bedtools` and `bwtool` on Linux.
#'
#' @param bed_file BED file containing locus to plot. For each line, TAB-delimited information should containing: <chr>, <start>, <end>, <locus_name>.
#' @param track_conf configuration file containing track info. For each line, TAB-delimited information should containing: <track_name>, <path_to_track_file>, <y_min>, <y_max>, <color>, <group>, without header.
#' @param gene_model BED file containing all elements of gene and TE model, generated by `[tinyfuncr::extractPos]`
#' @param ann_region BED file of annotation regions. If not provided, no annotation will be added. For each line, TAB-delimited information should containing: <chr>, <start>, <end>, <annotation_name>. <annotation_name> could be empty but the last TAB should be kept.
#' @param ann_color color of annotated regions
#' @param out_format format of output figures, one of "png" and "pdf"
#' @param out_dir path to output directory, create it if not exist
#'
#' @import magrittr
#' @importFrom data.table data.table as.data.table rbindlist fread fwrite
#' @importFrom parallel mclapply
#' @importFrom stringr str_split
#'
#' @author Yujie Liu
#' @export genometracks
#'

######################### main IO #########################
genometracks <- function(bed_file,
                         track_conf,
                         gene_model,
                         ann_region = NULL,
                         ann_color = "goldenrod1",
                         out_format = "png",
                         out_dir = "out") {
  # check dependencies
  .check_windows()
  .check_bedtools()
  .check_bwtool()
  .check_datatable()

  # make output dirs
  dir_name <- paste0(out_dir, "/")
  system(paste0("mkdir -p ", dir_name))

  # read in and parse track info
  track_info <- read_tcsv(track_conf, header = F)
  colnames(track_info) <-
    c("track_name",
      "track_file",
      "y_min",
      "y_max",
      "color",
      "group")

  # read loci info
  loci <- read_tcsv(bed_file, header = F)
  colnames(loci) <- c("chr", "start", "end", "name")

  # loop against each locus and plot tracks
  for (idx in 1:nrow(loci)) {
    # parse info of this locus
    chrom <- loci[idx, 1]
    begin <- loci[idx, 2] - 30
    end <- loci[idx, 3] + 30
    name <- loci[idx, 4]

    # extract annotation region within this locus
    write_tcsv(
      data.frame(chrom, as.character(begin), as.character(end)),
      file = "tmp_locus.bed",
      col.names = F
    )
    if (!is.null(ann_region)) {
      system(
        paste(
          "bedtools intersect -a tmp_locus.bed -b",
          ann_region,
          "-wb | cut -f 4- > tmp_ann.bed"
        )
      )
      this_mark_region <-
        read_tcsv("tmp_ann.bed", header = F)
    } else {
      this_mark_region <- NULL
    }

    # extract the signal for this locus
    track_data <-
      track_extract(
        bigWigs = track_info$track_file,
        chr = chrom,
        start = begin,
        end = end,
        binsize = 5,
        custom_names = track_info$track_name
      )

    # plot
    if (out_format == "png") {
      png(
        filename = paste0(dir_name, name, ".png"),
        res = 300,
        width = 2000,
        height = 3000
      )
    } else if (out_format == "pdf") {
      pdf(
        file = paste0(dir_name, name, ".pdf"),
        width = 10,
        height = 10
      )
    } else {
      warning("Format not support, use PDF instead ...")
      pdf(
        file = paste0(dir_name, name, ".pdf"),
        width = 10,
        height = 10
      )
    }
    track_plot(
      summary_list = track_data,
      draw_gene_track = T,
      gene_model = gene_model,
      y_min = track_info$y_min,
      y_max = track_info$y_max,
      col = track_info$color,
      regions = this_mark_region,
      boxcol = ann_color,
      boxcolalpha = 0.4
    )
    dev.off()

    system("rm -rf tmp_locus.bed tmp_ann.bed tmp_model.bed")
  }
}


############################ check dependency ############################
.check_windows <- function() {
  if (Sys.info()["sysname"] == "Windows") {
    stop("Windows is not supported :(")
  }
}

.check_bedtools <- function(warn = TRUE) {
  check <- as.character(Sys.which(names = 'bedtools'))[1]
  if (check != "") {
    if (warn) {
      message("Checking for bedtools installation")
      message(paste0("    All good! Found bedtools at: ", check))
    } else{
      return(invisible(0))
    }
  } else{
    stop("Could not locate bedtools !")
  }
}

.check_bwtool <- function(warn = TRUE) {
  check <- as.character(Sys.which(names = 'bwtool'))[1]
  if (check != "") {
    if (warn) {
      message("Checking for bwtool installation")
      message(paste0("    All good! Found bwtool at: ", check))
    } else{
      return(invisible(0))
    }
  } else{
    stop(
      "Could not locate bwtool. Download it from here: https://github.com/CRG-Barcelona/bwtool/releases"
    )
  }
}

.check_datatable <- function() {
  if (!requireNamespace("data.table", quietly = TRUE)) {
    message("Could not find data.table library. Attempting to install..")
    install.packages("data.table")
  }
}


########################## parse track info ##########################
track_extract <- function(bigWigs,
                          chr,
                          start,
                          end,
                          binsize = 10,
                          custom_names = NULL) {
  # make tmp dirs
  options(warn = -1)
  op_dir <- paste0(tempdir(), "/")
  if (!dir.exists(paths = op_dir)) {
    dir.create(path = op_dir,
               showWarnings = FALSE,
               recursive = TRUE)
  }

  # parse locus
  if (start >= end) {
    stop("End must be larger than Start!")
  }
  message("    Queried region: ",
          chr,
          ":",
          start,
          "-",
          end,
          " [",
          end - start,
          " bps]")

  # check bigwig files
  if (is.null(bigWigs)) {
    stop("Provide at-least one bigWig file")
  }
  for (i in 1:length(bigWigs)) {
    if (!file.exists(as.character(bigWigs)[i])) {
      stop(paste0(as.character(bigWigs)[i], " does not exist!"))
    }
  }

  # check track names
  if (!is.null(custom_names)) {
    if (length(custom_names) != length(bigWigs)) {
      stop("Please provide names for all bigWigs")
    }
  }

  # split to each bin and summary track info
  windows <- .gen_windows(
    chr = chr,
    start = start,
    end = end,
    window_size = binsize,
    op_dir = op_dir
  )
  track_summary <- .gen_tracks(bedSimple = windows,
                               bigWigs = bigWigs,
                               op_dir = op_dir)
  if (!is.null(custom_names)) {
    names(track_summary) <- custom_names
  }
  track_summary$loci <- c(chr, start, end)
  track_summary
}


.gen_windows <- function(chr,
                         start,
                         end,
                         window_size,
                         op_dir) {
  message(paste0("Generating windows ", "[", window_size, " bp window size]"))

  window_dat <- data.table::data.table()
  while (start <= end) {
    window_dat <- data.table::rbindlist(l = list(
      window_dat,
      data.table::data.table(start, end = start + window_size)
    ),
    fill = TRUE)
    start = start + window_size
  }
  window_dat$chr <- chr
  window_dat <- window_dat[, .(chr, start, end)]

  temp_op_bed <- tempfile(pattern = "trackr",
                          tmpdir = op_dir,
                          fileext = ".bed")
  data.table::fwrite(
    x = window_dat,
    file = temp_op_bed,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )
  temp_op_bed
}


.gen_tracks <- function(bedSimple,
                        bigWigs,
                        op_dir = getwd(),
                        nthreads = 1) {
  message(paste0("Extracting signals"))

  # summary using bwtool
  summaries <- parallel::mclapply(
    bigWigs,
    FUN = function(bw) {
      bn = gsub(pattern = "\\.bw$|\\.bigWig$",
                replacement = "",
                x = basename(bw))
      message(paste0("    Processing ", bn, " .."))
      cmd = paste(
        "bwtool summary -with-sum -keep-bed -header",
        bedSimple,
        bw,
        paste0(op_dir, bn, ".summary")
      )
      system(command = cmd, intern = TRUE)
      paste0(op_dir, bn, ".summary")
    },
    mc.cores = nthreads
  )

  summary_list <- lapply(summaries, function(x) {
    x = data.table::fread(x)
    colnames(x)[1] = 'chromosome'
    x = x[, .(chromosome, start, end, size, max)]
    if (all(is.na(x[, max]))) {
      message(
        "No signal! Possible cause: chromosome name mismatch between bigWigs and queried loci."
      )
      x[, max := 0]
    }
    x
  })

  # remove intermediate files
  lapply(summaries, function(x)
    system(command = paste0("rm ", x), intern = TRUE))
  system(command = paste0("rm ", bedSimple), intern = TRUE)

  names(summary_list) <- gsub(
    pattern = "*\\.summary$",
    replacement = "",
    x = basename(path = unlist(summaries))
  )
  summary_list
}


############################# plot tracks #############################
track_plot <- function(summary_list,
                       draw_gene_track = TRUE,
                       gene_model = NULL,
                       y_min = NULL,
                       y_max = NULL,
                       col = "black",
                       regions = NULL,
                       boxcol = "goldenrod1",
                       boxcolalpha = 0.4,
                       gene_track_height = 1,
                       xaxis_track_height = 0.5,
                       gene_fsize = 0.6,
                       show_axis = TRUE,
                       track_names = NULL,
                       track_names_pos = 0,
                       region_width = 1) {
  # check and parse input
  if (is.null(summary_list)) {
    stop("Missing input! Expecting output from track_extract()")
  }

  chr <- summary_list$loci[1]
  start <- as.numeric(summary_list$loci[2])
  end <- as.numeric(summary_list$loci[3])
  summary_list$loci <- NULL

  if (length(col) != length(summary_list)) {
    col <- rep(x = col, length(summary_list))
  }

  plot_regions = FALSE
  if (!is.null(regions)) {
    if (is(object = regions, class2 = "data.frame")) {
      regions <- data.table::as.data.table(x = regions)
      colnames(regions)[1:3] <- c("chromsome", "startpos", "endpos")
      regions <- regions[chromsome %in% chr]
      if (nrow(regions) == 0) {
        warning("None of the regions are within the requested chromosme: ",
                chr)
        plot_regions <- TRUE
      } else{
        plot_regions <- TRUE
      }
    } else{
      stop(
        "'mark_regions' must be a data.frame with first 3 columns containing : chr, start, end"
      )
    }
  }

  if (!is.null(track_names)) {
    names(summary_list) <- track_names
  }

  if (!is.null(y_max)) {
    if (length(y_max) != length(summary_list)) {
      y_max <- rep(y_max, length(summary_list))
    }
    plot_height <- y_max
  } else{
    plot_height <- round(plot_height, digits = 2)
  }

  if (!is.null(y_min)) {
    if (length(y_min) != length(summary_list)) {
      y_min <- rep(y_min, length(summary_list))
    }
    plot_height_min <- y_min
  } else{
    plot_height_min <- round(plot_height_min, digits = 2)
  }

  # plot layout
  ntracks <- length(summary_list)
  lo <- layout(
    mat = matrix(data = seq_len(ntracks + 2)),
    heights = c(xaxis_track_height, gene_track_height, rep(3, ntracks))
  )

  # draw x-axis
  if (show_axis) {
    par(mar = c(0, 4, 0, 0))
  } else{
    par(mar = c(0, 1, 0, 0))
  }
  lab_at <- pretty(c(start, end))
  plot(
    NA,
    xlim = c(start, end),
    ylim = c(-0.5, 0.5),
    frame.plot = FALSE,
    axes = FALSE,
    xlab = NA,
    ylab = NA
  )
  text(x = lab_at, y = 0.3, labels = lab_at)
  rect(
    xleft = start,
    ybottom = 0,
    xright = end,
    ytop = 0,
    lty = 2,
    xpd = TRUE
  )
  rect(
    xleft = lab_at,
    ybottom = -0.1,
    xright = lab_at,
    ytop = 0.1,
    xpd = TRUE
  )
  #axis(side = 1, at = lab_at, lty = 2, line = -3)
  #text(
  #  x = end,
  #  y = 0.75,
  #  labels = paste0(chr, ":", start, "-", end),
  #  adj = 1,
  #  xpd = TRUE
  #)
  ##############add plot name

  # draw gene models
  if (draw_gene_track) {
    if (show_axis) {
      par(mar = c(0.25, 4, 0, 0.5))
    } else{
      par(mar = c(0.25, 1, 0, 0.5))
    }
    #par(mar = c(1, 1, 3, 1), cex = 1)

    # extract gene models within this locus
    system(
      paste(
        "bedtools intersect -a tmp_locus.bed -b",
        gene_model,
        "-wb | cut -f 4- > tmp_model.bed"
      )
    )
    model <-
      read_tcsv("tmp_model.bed", header = F)
    tmp_model_info <-
      model[[4]] %>% stringr::str_split("_") %>% as.data.frame() %>% t() %>% as.data.frame()
    model_info <- data.frame()
    for (i in 1:nrow(tmp_model_info)) {
      tmp <-
        data.frame(
          Chr = model[i, 1],
          Start = model[i, 2],
          End = model[i, 3],
          ID = paste(tmp_model_info[i, 2:ncol(tmp_model_info)], collapse = "_"),
          Type = tmp_model_info[i, 1],
          Strand = model[i, 6]
        )
      model_info <- rbind(model_info, tmp)
    }

    # draw for each gene
    genes_to_plot <- unique(model_info$ID)
    plot(
      NA,
      xlim = c(start, end),
      ylim = c(0, length(genes_to_plot)),
      frame.plot = FALSE,
      axes = FALSE,
      xlab = NA,
      ylab = NA
    )

    for (idx in 1:length(genes_to_plot)) {
      this_gene <- genes_to_plot[idx]
      this_gene_model <- model_info[model_info$ID == this_gene,]
      this_gene_model <-
        this_gene_model[order(this_gene_model$Start),]

      text(
        x = start,
        y = idx - 0.5,
        labels = this_gene,
        adj = 0,
        cex = gene_fsize
      )

      for (i in 1:nrow(this_gene_model)) {
        type <- this_gene_model[i, "Type"]

        if (type == "CDS") {
          rect(
            xleft = this_gene_model$Start[i],
            ybottom = idx - 0.6,
            xright = this_gene_model$End[i],
            ytop = idx - 0.4,
            col = "black",
            border = "black",
            lwd = 0.5
          )
        } else if (type == "Intron") {
          mid <-
            mean(c(this_gene_model$Start[i],
                   this_gene_model$End[i]))
          segments(
            x0 = this_gene_model$Start[i],
            y0 = idx - 0.5,
            x1 = mid,
            y1 = idx - 0.4,
            lwd = 0.5,
            col = "black"
          )
          segments(
            x0 = this_gene_model$End[i],
            y0 = idx - 0.5,
            x1 = mid,
            y1 = idx - 0.4,
            lwd = 0.5,
            col = "black"
          )
        } else if ("UTR" %in% type) {
          rect(
            xleft = this_gene_model$Start[i],
            ybottom = idx - 0.6,
            xright = this_gene_model$End[i],
            ytop = idx - 0.4,
            col = "grey",
            border = "black",
            lwd = 0.5
          )
        }
      }

      if (this_gene_model[1, 6] == "+") {
        x <-
          c(
            this_gene_model$Start[nrow(this_gene_model)],
            this_gene_model$Start[nrow(this_gene_model)],
            this_gene_model$End[nrow(this_gene_model)],
            this_gene_model$End[nrow(this_gene_model)] + .02 * (this_gene_model$End[nrow(this_gene_model)] - this_gene_model$Start[1]),
            this_gene_model$End[nrow(this_gene_model)]
          )
        y <-
          c(idx - 0.4, idx - 0.6, idx - 0.6, idx - 0.5, idx - 0.4)

        endtype <- this_gene_model$Type[nrow(this_gene_model)]
        if (endtype == "CDS") {
          polygon(x,
                  y,
                  col = "black",
                  border = "black",
                  lwd = 0.5)
        } else {
          polygon(x,
                  y,
                  col = "grey",
                  border = "black",
                  lwd = 0.5)
        }
      } else {
        x <-
          c(
            this_gene_model$End[1],
            this_gene_model$End[1],
            this_gene_model$Start[1],
            this_gene_model$Start[1] - .02 * (this_gene_model$End[nrow(this_gene_model)] - this_gene_model$Start[1]),
            this_gene_model$Start[1]
          )
        y <-
          c(idx - 0.4, idx - 0.6, idx - 0.6, idx - 0.5, idx - 0.4)

        endtype <- this_gene_model$Type[1]
        if (endtype == "CDS") {
          polygon(x,
                  y,
                  col = "black",
                  border = "black",
                  lwd = 0.5)
        } else {
          polygon(x,
                  y,
                  col = "grey",
                  border = "black",
                  lwd = 0.5)
        }
      }
      #########add MITEs
      ########################
      #if (attr(txtbl, "gene") == "MITEs") {
      #  rect(
      #    xleft = txtbl[[1]],
      #    ybottom = tx_id - 0.75,
      #    xright = txtbl[[2]],
      #    ytop = tx_id - 0.25,
      #    col = grDevices::adjustcolor("red", alpha.f = 0.6),
      #    border = NA
      #  )
      #}
      ###################################################

    }
  }

  # draw bigWig signals
  lapply(1:length(summary_list), function(idx) {
    x <- summary_list[[idx]]
    if (show_axis) {
      par(mar = c(0.5, 4, 2, 1))
    } else{
      par(mar = c(0.5, 1, 2, 1))
    }

    plot(
      NA,
      xlim = c(start, end),
      ylim = c(plot_height_min[idx], plot_height[idx]),
      frame.plot = FALSE,
      axes = FALSE,
      xlab = NA,
      ylab = NA
    )
    segments(
      x0 = start,
      y0 = 0,
      x1 = end,
      y1 = 0,
      lwd = 1,
      col = "black"
    )
    rect(
      xleft = x$start,
      ybottom = 0,
      xright = x$end,
      ytop = x$max,
      col = col[idx],
      border = NA
    )
    if (show_axis) {
      axis(
        side = 2,
        at = c(plot_height_min[idx], plot_height[idx]),
        las = 2
      )
    } else{
      text(
        x = start,
        y = plot_height[idx],
        labels = paste0("[", plot_height_min[idx], "-", plot_height[idx], "]"),
        adj = 0,
        xpd = TRUE
      )
    }

    if (plot_regions) {
      boxcol <- grDevices::adjustcolor(boxcol, alpha.f = boxcolalpha)
      if (nrow(regions) > 0) {
        for (i in 1:nrow(regions)) {
          if (idx == length(summary_list)) {
            # if its a last plot, draw rectangle till 0
            rect(
              xleft = regions[i, startpos],
              ybottom = 0,
              xright = regions[i, endpos],
              ytop = 1.5 * plot_height[idx],
              col = boxcol,
              border = NA,
              xpd = TRUE
            )
          } else if (idx == 1) {
            if (ncol(regions) > 3) {
              text(
                x = mean(c(regions[i, startpos], regions[i, endpos])),
                y = plot_height[idx] + (plot_height[idx] * 0.1),
                labels = regions[i, 4],
                adj = 0.5,
                xpd = TRUE,
                font = 1,
                cex = 1.2
              )
            } else{
              text(
                x = mean(c(regions[i, startpos], regions[i, endpos])),
                y = plot_height[idx] + (plot_height[idx] * 0.1),
                labels = paste0(regions[i, startpos], "-", regions[i, endpos]),
                adj = 0.5,
                xpd = TRUE,
                font = 1,
                cex = 1.2
              )
            }
            rect(
              xleft = regions[i, startpos],
              ybottom = -0.5 * plot_height[idx],
              xright = regions[i, endpos],
              ytop = plot_height[idx],
              col = boxcol,
              border = NA,
              xpd = TRUE
            )
          } else{
            rect(
              xleft = regions[i, startpos],
              ybottom = -0.5 * plot_height[idx],
              xright = regions[i, endpos],
              ytop = 1.5 * plot_height[idx],
              col = boxcol,
              border = NA,
              xpd = TRUE
            )
          }
        }
      }
    }

    title(
      main = names(summary_list)[idx],
      adj = track_names_pos,
      font.main = 3
    )
  })
}
liuyujie0136/tinyfuncr documentation built on Dec. 13, 2024, 8:49 a.m.