R/plot_hits.R

Defines functions gghits ggdotplot plot_hits

Documented in ggdotplot gghits plot_hits

#' @title plot hits as a xy dotplot
#' @description
#' \code{plot_hits} routines to make visually appealing dotplots
#' @name plot_hits
#'
#' @param gsParam A list of genespace parameters. This should be created
#' by init_genespace.
#' @param type character string of "all", "raw", or "syntenic" specifying which
#' type of dotplot to generate.
#' @param verbose logical, should updates be printed to the console?
#' @param hits data.table containg hits. See read_allBlast.
#' @param outDir file.path where pdf should be written
#' @param minGenes2plot integer specifying the minimum number of genes that can
#' be plotted
#' @param appendName character with text to append to the file name
#' @param dotsPerIn integer specifying how fine-scaled the heatmap is
#' @param quantileThresh integer specifying the top quantile to be thresholded
#' @param plotSize numeric smalled dimension of the plot
#' @param minScore numeric the minimum score to be permitted in the first plot
#' @param maxFacets integer the maximum number of facets to plot (doesn't plot
#' for genomes with lots of small chrs/scaffolds)
#' @param colorByBlks logical, should blocks be colored?
#' @param alpha numeric [0-1], specifying the transparency of the points
#' @param useOrder logical, should gene order or bp position be used?
#' @param minScore numeric, the minimum scoring hit to plot
#' @param minGenes2plot integer, the minimum number of hits to plot a chromosome
#' combination.
#'
#' \cr
#' If called, \code{plot_hits} returns its own arguments.
#'
#' @details Dotplots here aggregate across proximate positions to reduce file
#' size, especially for very large genomes. XY positions are always in gene-rank
#' order positions. Graphics are built with ggplot2.

#' @title Plot syntenic hits
#' @description
#' \code{plot_hits} The pipeline to plot syntenic hits in parallel
#' @rdname plot_hits
#' @import data.table
#' @import R.utils
#' @importFrom dbscan dbscan frNN
#' @importFrom parallel mclapply
#' @export
plot_hits <- function(gsParam,
                      verbose = TRUE,
                      type,
                      dotsPerIn = 256,
                      quantileThresh = .5,
                      plotSize = 12,
                      minScore = 50){
  ##############################################################################
  # 1. setup
  # -- 1.1 get env vars set up
  query <- target <- lab <- nRegionHits <- nRegions <- nAnchorHits <- nBlks <-
    nSVs <- selfOnly <- queryPloidy <- targetPloidy <- nGlobOgHits <- synHits <-
    nTotalHits <- chunk <- inBuffer <- NULL

  if(!"synteny" %in% names(gsParam))
    gsParam <- set_syntenyParams(gsParam)

  nCores <- gsParam$params$nCores

  # -- 1.2 check that files are all ok
  if(!"synteny" %in% names(gsParam))
    stop("must run set_syntenyParams prior to synteny")

  if(!all(file.exists(gsParam$synteny$blast$allBlast)) && type %in% c("all", "raw"))
    stop("some annotated blast files dont exist, run annotate_blast() first\n")
  if(!all(file.exists(gsParam$synteny$blast$synHits)) && type %in% c("all", "syntenic"))
    stop("some syntenic blast files dont exist, run synteny() first\n")
  # -- 1.1 split the metadata into chunks
  blMd <- data.table(gsParam$synteny$blast)
  blMd[,lab := align_charLeft(sprintf("%s v. %s:", query, target))]
  blMd[,selfOnly := query == target & queryPloidy == 1 & targetPloidy == 1]

  if(!"nGlobOgHits" %in% colnames(blMd))
    blMd[,nGlobOgHits := file.size(synHits)]

  if(!"nTotalHits" %in% colnames(blMd))
    blMd[,nTotalHits := file.size(synHits)]

  setorder(blMd, selfOnly, -nGlobOgHits, -nTotalHits)
  blMd[,chunk := rep(1:.N, each = nCores)[1:.N]]
  synMdSpl <- split(blMd, by = "chunk")

  ##############################################################################
  # -- 2. loop through each chunk
  blMdOut <- lapply(1:length(synMdSpl), function(chnki){

    chnk <- data.table(synMdSpl[[chnki]])

    ############################################################################
    # -- loop through each row in each chunk
    outChnk <- mclapply(1:nrow(chnk), mc.cores = nCores, function(i){

      # -- 2.1 read in the metadata and hits
      outMd <- data.table(chnk[i,])
      x <- data.table(outMd)
      rawHits <- read_allBlast(x$allBlast)

      l1 <- mean(table(rawHits$chr1[rawHits$sameOG]))/10
      l2 <- mean(table(rawHits$chr2[rawHits$sameOG]))/10
      minGenes = max(min(c(l1, l2)), 5)

      dps <- gsParam$params$dotplots
      if(dps == "check"){
        nchr1 <- uniqueN(rawHits$chr1)
        nchr2 <- uniqueN(rawHits$chr2)

        ggdotplot(
          hits = data.table(rawHits),
          outDir = gsParam$paths$dotplots,
          minGenes2plot = minGenes,
          maxFacets = 10000,
          type = type,
          dotsPerIn = dotsPerIn,
          quantileThresh = quantileThresh,
          plotSize = plotSize,
          minScore = minScore)
      }else{
        if(dps == "always"){
          ggdotplot(
            hits = data.table(rawHits),
            outDir = gsParam$paths$dotplots,
            minGenes2plot = minGenes,
            maxFacets = Inf,
            type = type,
            dotsPerIn = dotsPerIn,
            quantileThresh = quantileThresh,
            plotSize = plotSize,
            minScore = minScore)
        }
      }
    })
  })
  return(gsParam)
}



#' @title make dotplots of syntenic hits
#' @description
#' \code{ggdotplot} ggplot2 integrated graphics to produce dotplots
#' @rdname plot_hits
#' @import data.table
#' @import ggplot2
#' @importFrom grDevices pdf dev.off rgb
#' @importFrom dbscan dbscan frNN
#' @export
ggdotplot <- function(hits,
                      type,
                      outDir = NULL,
                      minGenes2plot = 100,
                      appendName = "synHits",
                      dotsPerIn = 256,
                      quantileThresh = .5,
                      plotSize = 12,
                      minScore = 50,
                      maxFacets = 10000,
                      verbose = is.null(outDir)){
  ofID1 <- ofID2 <- sameOg <- ngene1 <- ngene2 <- ord1 <- ord2 <- blkID <-
    inBuffer <- rnd2 <- rnd1 <- n <- isArrayRep2 <- isArrayRep1 <- chr1 <-
    noAnchor <- bitScore <- quantile <- chr2 <- sameOG <- isAnchor <- NULL

  ##############################################################################
  # 1. Get the plot size figured out
  tp <- data.table(hits)

  un1 <- uniqueN(tp$ofID1)
  un2 <- uniqueN(tp$ofID2)
  if(un1 > un2){
    ht <- plotSize
    wd <- ht * (un1/un2)
  }else{
    wd <- plotSize
    ht <- wd * (un2/un1)
  }

  x <- max(tp$ord1, na.rm = T)
  y <- max(tp$ord2, na.rm = T)

  ordPerIn <- x / dotsPerIn
  totDots <- wd * dotsPerIn
  xrnd2 <- floor(x / totDots)+1

  ordPerIn <- y / dotsPerIn
  totDots <- ht * dotsPerIn
  yrnd2 <- floor(y / totDots)+1


  tp[,`:=`(rnd1 = round_toInteger(ord1, xrnd2),
           rnd2 = round_toInteger(ord2, yrnd2))]
  tp <- subset(tp, complete.cases(tp[,c("rnd1", "rnd2", "chr1", "chr2")]))

  ##############################################################################
  # 2. Make the plot with all hits, regardless of og

  # -- 2.1 subset the hits to those with high enough score
  makeCrappyDotplots <- list(p0 = NULL, p1 = NULL, p2 = NULL)
  if(type %in% c("all", "raw")){
    hc <- subset(tp, bitScore > minScore)
    ng1 <- as.integer(uniqueN(hc$ofID1))
    ng2 <- as.integer(uniqueN(hc$ofID2))

    # -- 2.2 get axis labels
    xlab <- sprintf(
      "%s: gene rank order position (%s genes w/ blast hits), grids every 1000 genes",
      hits$genome1[1], ng1)
    ylab <- sprintf(
      "%s: gene rank order position (%s genes w/ blast hits), grids every 1000 genes",
      hits$genome2[1], ng2)

    # -- 2.3 subset to chrs with enough genes on them
    hc[,ngene1 := uniqueN(ofID1[!noAnchor & isArrayRep1], na.rm = T), by = "chr1"]
    hc[,ngene2 := uniqueN(ofID2[!noAnchor & isArrayRep2], na.rm = T), by = "chr2"]
    hc <- subset(hc, ngene1 > minGenes2plot & ngene2 > minGenes2plot)

    # -- 2.4 count n hits in each aggregated position
    hc <- hc[,c("chr1", "chr2", "rnd1", "rnd2")]
    hc <- subset(hc, complete.cases(hc))
    hc <- hc[,list(n = .N), by = c("chr1", "chr2", "rnd1", "rnd2")]
    setorder(hc, -n)
    hc <- subset(hc, !is.na(n))

    # -- 2.5 threshold n to not highlight super strong regions
    qthresh <- quantile(hc$n, quantileThresh)
    if(qthresh > 20)
      qthresh <- 20
    if(qthresh < 5)
      qthresh <- 5
    hc$n[hc$n > qthresh] <- qthresh

    # -- 2.6 get plot title
    titlab <- sprintf(
      "All blast hits with score > %s, %s/%s-gene x/y windows (heatmap range: 2-%s+ hits/window)",
      minScore, xrnd2, yrnd2, round(qthresh))

    # -- 2.7 make the plot
    setorder(hc, n)
    hc <- subset(hc, n > 1)
    nfacets <- nrow(with(hc, expand.grid(unique(chr1), unique(chr2))))
    if(nfacets < maxFacets){
      chrOrd1 <- unique(tp$chr1[order(tp$rnd1)])
      chrOrd2 <- unique(tp$chr2[order(tp$rnd2)])
      hc[,`:=`(chr1 = factor(chr1, levels = chrOrd1),
               chr2 = factor(chr2, levels = chrOrd2))]
      p0 <- ggplot(hc, aes(rnd1, rnd2, col = n)) +
        geom_point(pch = ".") +
        scale_color_viridis_c(begin = .1, trans = "log10", guide = "none") +
        scale_x_continuous(expand = c(0,0),
                           breaks = seq(from = 1e3, to = max(hc$rnd1), by = 1e3))+
        scale_y_continuous(expand = c(0,0),
                           breaks = seq(from = 1e3, to = max(hc$rnd2), by = 1e3))+
        theme_genespace()+
        facet_grid(chr2 ~ chr1, scales = "free",
                   space = "free", as.table = F, switch = "both")+
        labs(x = xlab, y = ylab, title = titlab)
    }else{
      p0 <- NULL
      makeCrappyDotplots[["p0"]] <- subset(tp, bitScore > minScore)
    }

    ##############################################################################
    # 3. Make the plot with just OG hits

    # -- 2.1 subset the hits to those with high enough score
    hc <- subset(tp, sameOG)
    ng1 <- as.integer(uniqueN(hc$ofID1))
    ng2 <- as.integer(uniqueN(hc$ofID2))

    # -- 2.2 get axis labels
    xlab <- sprintf(
      "%s: gene rank order position (%s genes w/ blast hits), grids every 1000 genes",
      hits$genome1[1], ng1)
    ylab <- sprintf(
      "%s: gene rank order position (%s genes w/ blast hits), grids every 1000 genes",
      hits$genome2[1], ng2)

    # -- 2.3 subset to chrs with enough genes on them
    hc[,ngene1 := uniqueN(ofID1[!noAnchor & isArrayRep1]), by = "chr1"]
    hc[,ngene2 := uniqueN(ofID2[!noAnchor & isArrayRep2]), by = "chr2"]
    hc <- subset(hc, ngene1 > minGenes2plot & ngene2 > minGenes2plot)

    # -- 2.4 count n hits in each aggregated position
    hc <- hc[,list(n = .N), by = c("chr1", "chr2", "rnd1", "rnd2")]
    setorder(hc, -n)
    hc <- subset(hc, !is.na(n))

    # -- 2.5 threshold n to not highlight super strong regions
    qthresh <- quantile(hc$n, quantileThresh)
    if(qthresh > 20)
      qthresh <- 20
    if(qthresh < 5)
      qthresh <- 5
    hc$n[hc$n > qthresh] <- qthresh

    # -- 2.6 get plot title
    titlab <- sprintf(
      "Blast hits where query and target are in the same orthogroup, %s/%s-gene x/y windows (heatmap range: 1-%s+ hits/window)",
      xrnd2, yrnd2, round(qthresh))

    # -- 2.7 make the plot
    setorder(hc, n)
    nfacets <- nrow(with(hc, expand.grid(unique(chr1), unique(chr2))))
    if(nfacets < maxFacets){
      chrOrd1 <- unique(tp$chr1[order(tp$rnd1)])
      chrOrd2 <- unique(tp$chr2[order(tp$rnd2)])
      hc[,`:=`(chr1 = factor(chr1, levels = chrOrd1),
               chr2 = factor(chr2, levels = chrOrd2))]
      p1 <- ggplot(hc, aes(rnd1, rnd2, col = n)) +
        geom_point(pch = ".") +
        scale_color_viridis_c(begin = .1, trans = "log10", guide = "none") +
        scale_x_continuous(expand = c(0,0),
                           breaks = seq(from = 1e3, to = max(hc$rnd1), by = 1e3))+
        scale_y_continuous(expand = c(0,0),
                           breaks = seq(from = 1e3, to = max(hc$rnd2), by = 1e3))+
        theme_genespace()+
        facet_grid(chr2 ~ chr1, scales = "free",
                   space = "free", as.table = F, switch = "both")+
        labs(x = xlab, y = ylab, title = titlab)
    }else{
      p1 <- NULL
      makeCrappyDotplots[["p1"]] <- subset(tp, sameOG)
    }
  }else{
    p1 <- p0 <- NULL
  }

  if(type %in% c("all", "syntenic")){
    ##############################################################################
    # 4. Make the plot with just anchors
    hcBlk <- subset(tp, isAnchor)
    hcBlk[,ngene1 := uniqueN(ofID1[!noAnchor & isArrayRep1]), by = "chr1"]
    hcBlk[,ngene2 := uniqueN(ofID2[!noAnchor & isArrayRep2]), by = "chr2"]
    hcBlk <- subset(hcBlk, ngene1 > minGenes2plot & ngene2 > minGenes2plot)
    hcBlk <- hcBlk[,list(n = .N), by = c("chr1", "chr2", "rnd1", "rnd2", "blkID")]
    blkCols <- sample(gs_colors(uniqueN(hcBlk$blkID)))

    ng1 <- as.integer(uniqueN(hcBlk$ofID1))
    ng2 <- as.integer(uniqueN(hcBlk$ofID2))

    nfacets <- nrow(with(hcBlk, expand.grid(unique(chr1), unique(chr2))))
    if(nfacets < maxFacets){
      chrOrd1 <- unique(hcBlk$chr1[order(hcBlk$rnd1)])
      chrOrd2 <- unique(hcBlk$chr2[order(hcBlk$rnd2)])
      hcBlk[,`:=`(chr1 = factor(chr1, levels = chrOrd1),
                  chr2 = factor(chr2, levels = chrOrd2))]
      hcBlk <- subset(hcBlk, !is.na(rnd1) & !is.na(rnd2))
      if(nrow(hcBlk) < 1){
        warning(sprintf("no syntenic hits found for %s vs. %s",
                        hits$genome1[1], hits$genome2[1]))
        p2 <- NULL
      }else{
        p2 <- ggplot(hcBlk, aes(x = rnd1, y = rnd2, col = blkID)) +
          geom_point(pch = ".") +
          scale_color_manual(values = blkCols, guide = "none") +
          scale_x_continuous(expand = c(0,0), breaks = seq(from = 1e3, to = max(hcBlk$rnd1), by = 1e3))+
          scale_y_continuous(expand = c(0,0), breaks = seq(from = 1e3, to = max(hcBlk$rnd2), by = 1e3))+
          theme_genespace() +
          facet_grid(chr2 ~ chr1, scales = "free", space = "free", as.table = F, switch = "both")+
          labs(x = sprintf("%s: gene rank order position (%s genes with blast hits), gridlines every 1000 genes",
                           hits$genome1[1], uniqueN(hits$ofID1[hits$isAnchor])),
               y = sprintf("%s: gene rank order position (%s genes with blast hits), gridlines every 1000 genes",
                           hits$genome2[1], uniqueN(hits$ofID2[hits$isAnchor])),
               title = sprintf("Syntenic anchor blast hits, colored by block ID"))
      }
    }else{
      p2 <- NULL
      makeCrappyDotplots[["p2"]] <- subset(tp, isAnchor)
    }
  }else{
    p2 <- NULL
  }

  if(is.null(outDir)){
    if(verbose)
      cat("writing to the present graphics device")
    if(!is.null(p0))
      print(p0)
    if(!is.null(p1))
      print(p1)
    if(!is.null(p2))
      print(p2)
  }else{
    dpFile <- file.path(outDir,
                        sprintf("%s_vs_%s.%sHits.pdf",
                                tp$genome1[1], tp$genome2[1], type))
    pdf(dpFile, height = ht, width = wd)
    if(verbose)
      cat(sprintf("writing to file: %s", dpFile))
    if(!is.null(makeCrappyDotplots[["p0"]])){
      with(makeCrappyDotplots[["p0"]], plot(
        rnd1, rnd2, pch = ".",
        xlab = "all gene rank order (genome1)",
        ylab = "all gene rank order (genome2)",
        main = "these genomes have too many chromosomes to plot nicely."))
    }else{
      if(!is.null(p0))
        print(p0)
    }

    if(!is.null(makeCrappyDotplots[["p0"]])){
      with(makeCrappyDotplots[["p0"]], plot(
        rnd1, rnd2, pch = ".",
        xlab = "orthogroup hit gene rank order (genome1)",
        ylab = "orthogroup gene rank order (genome2)",
        main = "these genomes have too many chromosomes to plot nicely."))
    }else{
      if(!is.null(p0))
        print(p1)
    }

    if(!is.null(makeCrappyDotplots[["p2"]])){
      with(makeCrappyDotplots[["p2"]], plot(
        rnd1, rnd2, pch = ".",
        xlab = "syntenic hit gene rank order (genome1)",
        ylab = "syntenic hit gene rank order (genome2)",
        main = "these genomes have too many chromosomes to plot nicely."))
    }else{
      if(!is.null(p2))
        print(p2)
    }

    de <- dev.off()
  }
}


#' @title simple dotplots from a hits data.table
#' @description
#' \code{gghits} ggplot2 integrated graphics to produce dotplots
#' @rdname plot_hits
#' @import data.table
#' @import ggplot2
#' @export
gghits <- function(hits,
                   colorByBlks = TRUE,
                   alpha = ifelse(colorByBlks, 1, .25),
                   useOrder = TRUE,
                   minScore = 0,
                   minGenes2plot = 0){
  ofID1 <- ofID2 <- sameOg <- ngene1 <- ngene2 <- ord1 <- ord2 <- blkID <-
    inBuffer <- rnd2 <- rnd1 <- n <- isArrayRep2 <- isArrayRep1 <- chr1 <-
    noAnchor <- bitScore <- quantile <- chr2 <- sameOG <- isAnchor <-
    start1 <- start2 <- x <- y <- NULL

  tp <- data.table(hits)

  if(colorByBlks){
    hc <- subset(tp, !is.na(blkID) & isAnchor)
  }else{
    hc <- subset(tp, bitScore > minScore)
  }

  ng1 <- as.integer(uniqueN(hc$ofID1))
  ng2 <- as.integer(uniqueN(hc$ofID2))

  hc[,ngene1 := uniqueN(ofID1[!noAnchor & isArrayRep1], na.rm = T), by = "chr1"]
  hc[,ngene2 := uniqueN(ofID2[!noAnchor & isArrayRep2], na.rm = T), by = "chr2"]
  hc <- subset(hc, ngene1 > minGenes2plot & ngene2 > minGenes2plot)

  if(useOrder){
    hc[,`:=`(x = ord1, y = ord2)]
    xlab <- "query gene rank order position"
    ylab <- "target gene rank order position"
  }else{
    hc[,`:=`(x = start1/1e6, y = start2/1e6)]
    xlab <- "query physical (Mb) gene position"
    ylab <- "target physical (Mb) gene position"
  }

  if(!colorByBlks){
    p <- ggplot(hc, aes(x = x, y = y)) +
      geom_point(pch = ".", alpha = alpha) +
      scale_x_continuous(expand = c(0,0), breaks = pretty(hc$x, n = 10))+
      scale_y_continuous(expand = c(0,0), breaks = pretty(hc$y, n = 10))+
      facet_grid(genome2 + chr2 ~ genome1 + chr1, scales = "free",
                 space = "free", as.table = F)+
      labs(x = xlab, y = ylab)+
      theme(panel.background = element_rect(fill = "black"),
            panel.grid.minor = element_blank(),
            panel.grid.major = element_line(
              color = rgb(1, 1, 1, .2), size = .2, linetype = 2),
            panel.spacing = unit(.1, "mm"),
            axis.ticks = element_blank(),
            strip.background = element_blank(),
            axis.text = element_text(family = "Helvetica", size = 5),
            axis.title = element_text(family = "Helvetica", size = 6),
            plot.title = element_text(family = "Helvetica", size = 7))
  }else{
    blkCols <- sample(gs_colors(uniqueN(hc$blkID)))
    p <- ggplot(hc, aes(x = x, y = y, col = blkID)) +
      geom_point(pch = ".", alpha = alpha) +
      scale_color_manual(values = blkCols, guide = "none") +
      scale_x_continuous(expand = c(0,0), breaks = pretty(hc$x, n = 10))+
      scale_y_continuous(expand = c(0,0), breaks = pretty(hc$y, n = 10))+
      facet_grid(genome2 + chr2 ~ genome1 + chr1, scales = "free", space = "free",
                 as.table = F)+
      labs(x = xlab, y = ylab)+
      theme(panel.background = element_rect(fill = "black"),
            panel.grid.minor = element_blank(),
            panel.grid.major = element_line(
              color = rgb(1, 1, 1, .2), size = .2, linetype = 2),
            panel.spacing = unit(.1, "mm"),
            axis.ticks = element_blank(),
            strip.background = element_blank(),
            axis.text = element_text(family = "Helvetica", size = 5),
            axis.title = element_text(family = "Helvetica", size = 6),
            plot.title = element_text(family = "Helvetica", size = 7))
  }
  print(p)
}
jtlovell/GENESPACE documentation built on Jan. 25, 2025, 6:39 a.m.