R/helper-allelic.R

Defines functions plotAllelicHeatmap grSelect plotAllelicGene joinNearbyTss makeTx2Tss importAllelicCounts

Documented in importAllelicCounts makeTx2Tss plotAllelicGene plotAllelicHeatmap

#' Import allelic counts as a SummarizedExperiment
#'
#' Read in Salmon quantification of allelic counts from a
#' diploid transcriptome. Assumes that diploid transcripts
#' are marked with the following suffix: an underscore and
#' a consistent symbol for each of the two alleles,
#' e.g. \code{ENST123_M} and \code{ENST123_P},
#' or \code{ENST123_alt} and \code{ENST123_ref}, etc.
#' \code{importAllelicCounts} requires the tximeta package.
#' Further information in Details below.
#'
#' \strong{Requirements} - There must be exactly two alleles for each
#' transcript, and the \code{--keep-duplicates} option should be used
#' in Salmon indexing to avoid removal of transcripts with identical
#' sequence. The output object has half the number of transcripts,
#' with the two alleles either stored in a \code{"wide"} object, or as
#' re-named \code{"assays"}. Note carefully that the symbol provided
#' to \code{a1} is used as the effect allele, and \code{a2} is used as
#' the non-effect allele (see the \code{format} argument description
#' and Value description below).
#'
#' \strong{tx2gene} - The two columns should include the \code{a1} and
#' \code{a2} suffix for the transcripts and genes/groups, or those
#' will be added internally, if it is detected that the first
#' transcript does not have these suffices. For example if
#' \code{_alt} or \code{_ref}, or \code{_M} or \code{_P} (as indicated
#' by the \code{a1} and \code{a2} arguments) are not present in the
#' table, the table rows will be duplicated with those suffices added
#' on behalf of the user. If \code{tx2gene} is not provided, the
#' output object will be transcript-level. Do not attempt to set the
#' \code{txOut} argument, it will conflict with internal calls to
#' downstream functions. If the a1/a2 suffices are not at the end of
#' the transcript name in the quantification files,
#' e.g. \code{ENST123_M|<metadata>}, then \code{ignoreAfterBar=TRUE}
#' can be used to match regardless of the string following \code{|} in
#' the quantification files.
#'
#' \code{skipMeta=TRUE} is used, as it is assumed the diploid
#' transcriptome does not match any reference transcript
#' collection. This may change in future iterations of the function,
#' depending on developments in upstream annotations and software.
#'
#' If \code{tx2gene} is a GRanges object, the rowRanges of the output
#' will be the reduced ranges of the grouped input ranges, with
#' \code{tx_id} collapsed into a CharacterList, and TSS positions
#' saved as an IntegerList, if these are not equal among the transcripts
#' of a group.
#' Other metadata columns are not manipulated, just the metadata
#' for the first range is returned.
#' 
#' @param coldata a data.frame as used in \code{tximeta}
#' @param a1 the symbol for the effect allele
#' @param a2 the symbol for the non-effect allele
#' @param format either \code{"wide"} or \code{"assays"} for whether
#' to combine the allelic counts as columns (wide) or put the allelic
#' count information in different assay slots (assays).
#' For wide output, the data for the non-effect allele (a2) comes first,
#' then the effect allele (a1), e.g. \code{[a2 | a1]}. The \code{ref} level
#' of the factor variable \code{se$allele} will be \code{"a2"}
#' (so by default comparisons will be: a1 vs a2).
#' For assays output, all of the original matrices are renamed with a prefix,
#' either \code{a1-} or \code{a2-}.
#' @param tx2gene optional, a data.frame with first column indicating
#' transcripts, second column indicating genes (or any other transcript
#' grouping). Alternatively, this can be a GRanges object with
#' required columns \code{tx_id}, and \code{group_id}
#' (see \code{makeTx2Tss}). For more information on this argument,
#' see Details.
#' @param ... any arguments to pass to tximeta
#'
#' @return a SummarizedExperiment, with allele counts (and other data)
#' combined into a wide matrix \code{[a2 | a1]}, or as assays (a1, then a2).
#' The original strings associated with a1 and a2 are stored in the
#' metadata of the object, in the \code{alleles} list element.
#' Note the reference level of \code{se$allele} will be \code{"a2"}, 
#' such that comparisons by default will be a1 vs a2
#' (effect vs non-effect). 
#'
#' @references
#'
#' Euphy Wu, Noor P. Singh, Kwangbom Choi, Mohsen Zakeri, Matthew
#' Vincent, Gary A. Churchill, Cheryl L. Ackert-Bicknell, Rob Patro,
#' Michael I. Love.
#' "Detecting isoform-level allelic imbalance accounting for
#' inferential uncertainty" bioRxiv (2022)
#' \url{https://doi.org/10.1101/2022.08.12.503785}
#' 
#' @export
importAllelicCounts <- function(coldata, a1, a2,
                                format=c("wide","assays"),
                                tx2gene=NULL, ...) {
  format <- match.arg(format)
  if (!requireNamespace("tximeta", quietly=TRUE)) {
    stop("this function requires installing the Bioconductor package 'tximeta'")
  }

  a1match <- paste0("_",a1,"$")
  a2match <- paste0("_",a2,"$")

  # output transcripts if no tx2gene provided
  txOut <- is.null(tx2gene) 
  # if tx2gene is ranges, some extra steps to modify output rowRanges
  t2gRanges <- is(tx2gene, "GRanges")
  if (!txOut) {
    # if tx2gene is ranges, then pull out the table for collapsing
    if (t2gRanges) {
      cols <- c("tx_id","group_id")
      stopifnot(all(cols %in% names(mcols(tx2gene))))
      # swap around variable names to run the data.frame-based code
      txps <- tx2gene
      tx2gene <- mcols(txps)[,cols]
    }
    stopifnot(ncol(tx2gene) == 2)
    # see if tx2gene already has tagged the txps and genes
    diploid <- FALSE # save whether tx2gene is diploid
    if (grepl(a1match, tx2gene[1,1]) | grepl(a2match, tx2gene[1,1])) {
      # ensure same number of a1 and a2 alleles in txps and genes
      sum1txp <- sum(grepl(a1match, tx2gene[,1]))
      sum2txp <- sum(grepl(a2match, tx2gene[,1]))
      sum1gene <- sum(grepl(a1match, tx2gene[,2]))
      sum2gene <- sum(grepl(a2match, tx2gene[,2]))
      stopifnot(sum1txp > 0 & sum2txp > 0 & sum1gene > 0 & sum2gene > 0)
      stopifnot(sum1txp == sum2txp)
      stopifnot(sum1gene == sum2gene)
      diploid <- TRUE
    } else {
      a2a1_vec <- rep(c(a2, a1), each=nrow(tx2gene))
      tx2gene <- data.frame(
        tx=paste0(rep(tx2gene[,1], 2), "_", a2a1_vec),
        gene=paste0(rep(tx2gene[,2], 2), "_", a2a1_vec)
      )
    }
  }

  se <- tximeta::tximeta(coldata, skipMeta=TRUE, tx2gene=tx2gene, txOut=txOut, ...)

  # remove any characters after "|"
  rownames(se) <- sub("\\|.*", "", rownames(se))

  ntxp <- nrow(se)/2
  n <- ncol(se)

  # gather transcript names for a1 and a2 alleles
  txp_nms_a1 <- grep(a1match, rownames(se), value=TRUE)
  stopifnot(length(txp_nms_a1) == ntxp)
  # here use 'a1match' to only match on the final suffix
  txp_nms_a2 <- sub(a1match, paste0("_",a2), txp_nms_a1)
  stopifnot(all(txp_nms_a2 %in% rownames(se)))
  stopifnot(length(txp_nms_a1) == length(txp_nms_a2))
  txp_nms <- sub(a1match, "", txp_nms_a1)
  
  if (format == "wide") {
    coldata_wide <- data.frame(
      allele=factor(rep(c("a2","a1"), each=n), levels=c("a2","a1"))
    )
    # add any other covariate data
    for (v in setdiff(names(coldata), c("files","names"))) {
      coldata_wide[[v]] <- coldata[[v]]
    }
    rownames(coldata_wide) <- paste0(colnames(se), "-", coldata_wide$allele)

    assays_wide <- lapply(assays(se), function(a) {
      a_wide <- cbind(a[txp_nms_a2,], a[txp_nms_a1,])
      rownames(a_wide) <- txp_nms
      colnames(a_wide) <- rownames(coldata_wide)
      a_wide
    })
    # make a new SE
    out <- SummarizedExperiment(assays=assays_wide,
                                colData=coldata_wide)
    metadata(out) <- c(metadata(se), list(alleles=c(a1=a1, a2=a2)))
  } else if (format == "assays") {
    se_a1 <- se[txp_nms_a1,]
    se_a2 <- se[txp_nms_a2,]
    rownames(se_a1) <- txp_nms
    rownames(se_a2) <- txp_nms
    # rename the assays
    assayNames(se_a1) <- paste0("a1-", assayNames(se_a1))
    assayNames(se_a2) <- paste0("a2-", assayNames(se_a2))
    # add the a2 matrices to the a1 SE object
    assays(se_a1) <- c(assays(se_a1), assays(se_a2))
    metadata(se_a1) <- c(metadata(se_a1), list(alleles=c(a1=a1, a2=a2)))
    out <- se_a1
  }
  # extra steps to assign rowRanges to output
  if (t2gRanges) {
    if (diploid) {
      txps <- txps[grepl(a1match, names(txps))]
      names(txps) <- sub(a1match,"",names(txps))
      mcols(txps)$tx_id <- sub(a1match,"",mcols(txps)$tx_id)
      mcols(txps)$group_id <- sub(a1match,"",mcols(txps)$group_id)
    }
    stopifnot(all(rownames(out) %in% mcols(txps)$group_id))
    tx_list <- CharacterList(split(mcols(txps)$tx_id, mcols(txps)$group_id))
    # detect if the TSS have been grouped by `maxgap`
    # (the group_id will not be equal to "gene-tss")
    tss_avail <- "tss" %in% names(mcols(txps))
    if (tss_avail) {
      tss_grouped <- mcols(txps)$group_id[1] != paste0(mcols(txps)$gene_id[1],
                                                       "-", mcols(txps)$tss[1])
    } else {
      tss_grouped <- FALSE
    }
    # want to save individual TSS information in a list
    if (tss_grouped) {
      tss_list <- IntegerList(split(mcols(txps)$tss, mcols(txps)$group_id))
    }
    tx_starts <- sapply(split(start(txps), mcols(txps)$group_id), min)
    tx_ends <- sapply(split(end(txps), mcols(txps)$group_id), max)
    txps <- txps[!duplicated(mcols(txps)$group_id)]
    names(txps) <- mcols(txps)$group_id
    start(txps) <- tx_starts[names(txps)]
    end(txps) <- tx_ends[names(txps)]
    txps <- txps[rownames(out)]
    mcols(txps)$tx_id <- tx_list[rownames(out)]
    if (tss_grouped) {
      mcols(txps)$tss <- tss_list[rownames(out)]
    }
    SummarizedExperiment::rowRanges(out) <- txps
  }
  out
}

#' Make a GRanges linking transcripts to TSS within gene
#'
#' This helper function takes either a TxDb/EnsDb or
#' GRanges object as input and outputs a GRanges object
#' where transcripts are aggregated to the gene + TSS
#' (transcription start site). For nearby TSS that should
#' be grouped together, see \code{maxgap}.
#'
#' @param x either TxDb/EnsDb or GRanges object. The GRanges
#' object should have metadata columns \code{tx_id} and
#' \code{gene_id}
#' @param maxgap integer, number of basepairs to use determining
#' whether to combine nearby TSS
#'
#' @return GRanges with columns \code{tx_id}, \code{tss}, and
#' \code{group_id}
#'
#' @examples
#' \dontrun{
#' library(EnsDb.Hsapiens.v86)
#' edb <- EnsDb.Hsapiens.v86
#' t2t <- makeTx2Tss(edb)
#' }
#' 
#' @export
makeTx2Tss <- function(x, maxgap=0) {
  if (is(x, "GRanges")) {
    txps <- x
    if (!all(c("tx_id","gene_id") %in% names(mcols(txps)))) {
      stop("'tx_id' and 'gene_id' must be mcols of 'x'")
    }
  } else if (is(x, "TxDb")) {
    if (!requireNamespace("GenomicFeatures", quietly=TRUE)) {
      stop("'x' as TxDb requires GenomicFeatures")
    }
    if (!requireNamespace("AnnotationDbi", quietly=TRUE)) {
      stop("'x' as TxDb requires AnnotationDbi")
    }
    txps <- GenomicFeatures::transcripts(x)
    mcols(txps)$tx_id <- mcols(txps)$tx_name
    tx_id_stripped <- sub("\\..*", "", mcols(txps)$tx_id)
    mcols(txps)$gene_id <- AnnotationDbi::mapIds(x, tx_id_stripped, "GENEID", "TXNAME")
    # if any missing gene IDs, just propagate the txp IDs
    missing.idx <- is.na(mcols(txps)$gene_id)
    mcols(txps)$gene_id[ missing.idx ] <- mcols(txps)$tx_id[ missing.idx ]
  } else if (is(x, "EnsDb")) {
    if (!requireNamespace("ensembldb", quietly=TRUE)) {
      stop("'x' as EnsDb requires ensembldb")
    }
    txps <- ensembldb::transcripts(x)
  } else {
    stop("'x' should be a GRanges or TxDb/EnsDb")
  }
  tss_ranges <- GenomicRanges::resize(txps, width=1)
  tss <- GenomicRanges::start(tss_ranges)
  mcols(txps)$tss <- tss
  if (maxgap == 0) {
    # group ID is just gene ID + TSS
    mcols(txps)$group_id <- paste0(mcols(txps)$gene_id, "-", mcols(txps)$tss)
  } else {
    # in this case, we will number the TSS groups with 1,2,...
    # nearby TSS are grouped together according to maxgap
    tss_ranges <- sort(tss_ranges)
    txps <- txps[ names(tss_ranges) ]
    txps <- txps[ order(mcols(txps)$gene_id) ]
    sp_tss <- split(mcols(txps)$tss, mcols(txps)$gene_id)
    tss_groups <- lapply(sp_tss, joinNearbyTss, maxgap)
    mcols(txps)$group_id <- paste0(mcols(txps)$gene_id, "-", unlist(tss_groups))
  }
  txps
}

joinNearbyTss <- function(tss, maxgap) {
  delta <- diff(tss, lag=1)
  new_tss_loc <- as.integer(delta > maxgap)
  tss_loc <- c(1,cumsum(new_tss_loc) + 1)
  tss_loc
}

#' Plot allelic counts in a gene context using Gviz
#'
#' Plot allelic data (allelic proportions, isoform propostions)
#' in a gene context leveraging the Gviz package. See the allelic
#' vignette for example usage. TPM and count filters are used by
#' default to clean up the plot of features with minimal signal;
#' note that the isoform proportion displayed at the bottom of the
#' plot is among the features that pass the filtering steps.
#' If the function is not responding, it is likely due to issues
#' connecting to UCSC servers to draw the ideogram, in this case
#' set \code{ideogram=FALSE}.
#' 
#' @param y a SummarizedExperiment (see \code{swish})
#' @param gene the name of the gene of interest, requires
#' a column \code{gene_id} in the metadata columns of the
#' rowRanges of y
#' @param db either a TxDb or EnsDb object to use for the gene model
#' @param region GRanges, the region to be displayed in the Gviz plot.
#' if not specified, will be set according to the gene plus 20%
#' of the total gene extent on either side
#' @param symbol alternative to \code{gene}, to specify
#' the gene of interest according to a column \code{symbol}
#' in the metadata columns of the rowRanges of y
#' @param genome UCSC genome code (e.g. \code{"hg38"},
#' if not specified it will use the \code{GenomeInfoDb::genome()}
#' of the rowRanges of \code{y}
#' @param tpmFilter minimum TPM value (mean over samples) to keep a feature
#' @param isoPropFilter minimum percent of isoform proportion to keep a feature
#' @param countFilter minimum count value (mean over samples) to keep a feature
#' @param pc pseudocount to avoid dividing by zero in allelic proportion calculation
#' @param transcriptAnnotation argument passed to Gviz::GeneRegionTrack
#' (\code{"symbol"}, \code{"gene"}, \code{"transcript"}, etc.)
#' @param labels list, labels for a2 (non-effect) and a1 (effect) alleles
#' @param qvalue logical, whether to inclue qvalue track
#' @param log2FC logical, whether to include log2FC track
#' @param ideogram logical, whether to include ideogram track
#' @param cov character specifying a factor or integer variable to use
#' to facet the allelic proportion plots, should be a column in
#' \code{colData(y)}
#' @param covFacetIsoform logical, if \code{cov} is provided,
#' should it also be used to facet the isoform proportion track,
#' in addition to the allelic proportion track
#' @param allelicCol the colors of the points and lines for allelic proportion
#' @param isoformCol the colors of the points and lines for isoform proportion
#' @param statCol the color of the lollipops for q-value and log2FC
#' @param gridCol the color of the grid in the data tracks
#' @param baselineCol the color of the horizontal baseline for q-value and lo2gFC
#' @param titleCol font color of the side titles (track labels)
#' @param titleAxisCol axis color of the side titles (track labels)
#' @param titleBgCol background color of the side titles (track labels)
#' @param geneBorderCol the color of the borders and font in gene region track
#' @param geneFillCol the color of the fill in the gene region track
#' @param genomeAxisCol line color of the genome axis track
#' @param innerFontCol font color of genome axis track, ideogram, and
#' allelic proportion legend
#' @param ... additional arguments passed to \code{Gviz::plotTracks()}
#'
#' @return nothing, a plot is displayed
#'
#' @references
#'
#' The methods for allelic expression analysis are described in:
#' 
#' Euphy Wu, Noor P. Singh, Kwangbom Choi, Mohsen Zakeri, Matthew
#' Vincent, Gary A. Churchill, Cheryl L. Ackert-Bicknell, Rob Patro,
#' Michael I. Love.
#' "Detecting isoform-level allelic imbalance accounting for
#' inferential uncertainty" bioRxiv (2022)
#' \url{https://doi.org/10.1101/2022.08.12.503785}
#'
#' This function makes use of the Gviz package that is described in:
#'
#' Hahne, F., Ivanek, R. (2016). Visualizing Genomic Data
#' Using Gviz and Bioconductor.
#' In: Mathé, E., Davis, S. (eds) Statistical Genomics.
#' Methods in Molecular Biology, vol 1418. Humana Press,
#' New York, NY. \url{https://doi.org/10.1007/978-1-4939-3578-9_16}
#' 
#' @export
plotAllelicGene <- function(y, gene, db,
                            region=NULL, symbol=NULL, genome=NULL,
                            tpmFilter=1, isoPropFilter=.05, countFilter=10,
                            pc=1, transcriptAnnotation="symbol",
                            labels=list(a2="a2",a1="a1"),
                            qvalue=TRUE, log2FC=TRUE, ideogram=FALSE,
                            cov=NULL, covFacetIsoform=FALSE,
                            allelicCol=c("dodgerblue","goldenrod1"),
                            isoformCol="firebrick",
                            statCol="black",
                            gridCol="grey80",
                            baselineCol="black",
                            titleCol="black",
                            titleAxisCol="black",
                            titleBgCol="white",
                            geneBorderCol="darkblue",
                            geneFillCol="darkblue",
                            genomeAxisCol="black",
                            innerFontCol="black", ...) {
  if (!requireNamespace("Gviz", quietly=TRUE)) {
    stop("plotAllelicGene() requires 'Gviz' Bioconductor package")
  }
  if (!requireNamespace("GenomeInfoDb", quietly=TRUE)) {
    stop("plotAllelicGene() requires 'GenomeInfoDb' Bioconductor package")
  }
  stopifnot("gene_id" %in% names(mcols(y)))
  if (!is.null(symbol)) {
    stopifnot("symbol" %in% names(mcols(y)))
    gene <- mcols(y)$gene_id[match(symbol, mcols(y)$symbol)]
  }
  stopifnot(is.character(gene))
  stopifnot(is(db, "TxDb") | is(db, "EnsDb"))
  stopifnot(length(allelicCol) == 2)
  stopifnot(all(c("a2","a1") %in% names(labels)))
  if (!is.null(cov)) {
    stopifnot(cov %in% names(colData(y)))
  } else {
    stopifnot(!covFacetIsoform)
  }
  # pull out the ranges of y to build the Gviz plot
  gr <- rowRanges(y)
  # standard chr's and UCSC for compatibility w Gviz
  gr <- GenomeInfoDb::keepStandardChromosomes(
                        gr, pruning.mode="coarse")
  GenomeInfoDb::seqlevelsStyle(gr) <- "UCSC"
  # gene must be in the metadata
  stopifnot(gene %in% gr$gene_id)
  # pull out the ranges for the gene of interest
  gr <- gr[gr$gene_id == gene]
  stopifnot(!any(duplicated(names(gr))))
  # use TPM and count filtering to remove lowly expressed features
  if (!is.null(tpmFilter) | !is.null(countFilter)) {
    keep <- rep(TRUE, length(gr))
    meanTPM <- rowMeans(assay(y[names(gr),,drop=FALSE],"abundance"))
    meanIsoProp <- meanTPM / sum(meanTPM)
    meanCounts <- rowMeans(assay(y[names(gr),,drop=FALSE],"counts"))
    if (!is.null(tpmFilter)) {
      keep <- keep & meanTPM >= tpmFilter
    }
    if (!is.null(tpmFilter)) {
      keep <- keep & meanIsoProp >= isoPropFilter
    }
    if (!is.null(countFilter)) {
      keep <- keep & meanCounts >= countFilter
    }
    stopifnot(sum(keep) > 0)
    gr <- gr[keep,]
  }
  if (qvalue & !"qvalue" %in% names(mcols(gr))) {
    stop("expecting qvalue, first run swish()")
  }
  if (log2FC & !"log2FC" %in% names(mcols(gr))) {
    stop("expecting log2FC, first run swish()")
  }  
  regionProvided <- TRUE
  if (is.null(region)) {
    regionProvided <- FALSE
    # the region to be displayed
    region <- range(gr)
    total_width <- width(region)
  }  
  # plot data at the TSS
  gr <- flank(gr, width=1)
  strand(gr) <- "*"
  # so data plots work, need to be sorted
  gr <- sort(gr)
  if (qvalue) {
    gr$minusLogQ <- -log10(gr$qvalue)
    # define upper bounds for the q-value and LFC
    qUpper <- 1.2 * max(gr$minusLogQ)
  }
  if (log2FC) {
    lfcUpper <- 1.2 * max(abs(gr$log2FC))
  }
  chr <- as.character(seqnames(region))
  ##########################################
  ## store allelic data in GRanges object ##
  ##########################################
  gr_allelic <- gr
  mcols(gr_allelic) <- NULL
  stopifnot("counts" %in% assayNames(y))
  allelic_counts <- assay(y[names(gr),,drop=FALSE], "counts")
  stopifnot(rownames(allelic_counts) == names(gr_allelic))
  n <- ncol(allelic_counts)/2
  stopifnot(all(y$allele == rep(c("a2","a1"),each=n)))
  total_counts <- (
    allelic_counts[,1:n,drop=FALSE] +
    allelic_counts[,(n+1):(2*n),drop=FALSE])
  allelic_prop <- (allelic_counts+pc) /
    cbind(total_counts+2*pc, total_counts+2*pc)
  mcols(gr_allelic) <- allelic_prop
  ##########################################
  ## store isoform data in GRanges object ##
  ##########################################
  gr_isoform <- gr
  mcols(gr_isoform) <- NULL
  stopifnot("abundance" %in% assayNames(y))
  allelic_tpm <- assay(y[names(gr),,drop=FALSE], "abundance")
  total_tpm <- (
    allelic_tpm[,1:n,drop=FALSE] +
    allelic_tpm[,(n+1):(2*n),drop=FALSE])
  gene_tpm <- colSums(total_tpm)
  # deal with alleles that aren't expressed at all for gene
  gene_tpm <- ifelse(gene_tpm > 0, gene_tpm, .01)
  isoform_prop <- t(t(total_tpm) / gene_tpm)
  isoUpper <- 1.1 * max(isoform_prop)
  mcols(gr_isoform) <- isoform_prop
  ####################
  ## ideogram track ##
  ####################
  if (ideogram) {
    if (is.null(genome)) {
      genome <- GenomeInfoDb::genome(gr)[1]
    }
    ideo_track <- Gviz::IdeogramTrack(genome=genome, chromosome=chr,
                                      fontcolor=innerFontCol)
  } else {
    ideo_track <- NULL
  }
  ##################
  ## genome track ##
  ##################
  genome_track <- Gviz::GenomeAxisTrack(col=genomeAxisCol,
                                        fontcolor=innerFontCol)
  # for ensembldb EnsDb
  if (is(db, "EnsDb")) {
    if (!requireNamespace("ensembldb", quietly=TRUE)) {
      stop("db=EnsDb requires 'ensembldb' Bioconductor package")
    }
    GenomeInfoDb::seqlevelsStyle(db) <- "UCSC"
    # warning is also showing up on ensembldb vignette...
    suppressWarnings({
      gene_region <- ensembldb::getGeneRegionTrackForGviz(
        db,
        chromosome=chr,
        start=start(region)+10,
        end=end(region)-10)
    })
    gene_track <- Gviz::GeneRegionTrack(
      range=gene_region,
      name="gene model",
      transcriptAnnotation=transcriptAnnotation,
      col=geneBorderCol, col.line=geneBorderCol,
      fontcolor.group=geneBorderCol, fill=geneFillCol)
  } else {
    message("using TxDb, assuming UCSC seqlevelsStyle")
    gene_track <- Gviz::GeneRegionTrack(
      range=db,
      chromosome=chr,
      start=start(region)+10,
      end=end(region)-10,
      name="gene model",
      transcriptAnnotation=transcriptAnnotation,
      col=geneBorderCol, col.line=geneBorderCol,
      fontcolor.group=geneBorderCol, fill=geneFillCol)
  }
  ##################################
  ## put together the data tracks ##
  ##################################
  if (qvalue) {
    qvalue_track <- Gviz::DataTrack(
      grSelect(gr, "minusLogQ"),
      type=c("p","h","g"), name="-log10 qvalue",
      col=statCol, col.grid=gridCol, cex=1.5, lwd=2,
      ylim=c(0,qUpper), baseline=0)
  } else {
    qvalue_track <- NULL
  }
  if (log2FC) {
    lfc_track <- Gviz::DataTrack(
      grSelect(gr, "log2FC"),
      type=c("p","h","g"), name="log2FC",
      col=statCol, col.grid=gridCol, cex=1.5, lwd=2,
      baseline=0,
      ylim=c(-lfcUpper,lfcUpper))
  } else {
    lfc_track <- NULL
  }
  allele <- y$allele
  # optionally relabelling of alleles
  if (!(labels$a2 == "a2" & labels$a1 == "a1")) {
    levels(allele) <- c(labels$a2, labels$a1)
  }
  # case where we are not faceting across a covariate
  if (is.null(cov)) {
    allele_track <- list(Gviz::DataTrack(
      gr_allelic, type=c("p","a","g"), name="allelic prop.",
      groups=allele, col=allelicCol, col.grid=gridCol, lwd=2,
      baseline=0.5, fontcolor.legend=innerFontCol))
  } else {
    # faceting case
    covariate <- colData(y)[[cov]]
    if (!is(covariate, "factor")) {
      covariate <- factor(covariate)
    }
    allele_track <- lapply(levels(covariate), function(i) {
      gr_allelic_sub <- gr_allelic
      idx <- covariate == i
      mcols(gr_allelic_sub) <- mcols(gr_allelic_sub)[,idx]
      Gviz::DataTrack(
        gr_allelic_sub, type=c("p","a","g"), name=i,
        groups=allele[idx], col=allelicCol, col.grid=gridCol, lwd=2,
        baseline=0.5, fontcolor.legend=innerFontCol)
    })
  }
  # case where we are not faceting across a covariate
  if (!covFacetIsoform) {
    isoform_track <- list(Gviz::DataTrack(
      gr_isoform, type=c("p","a","g"), name="isoform prop.",
      col=isoformCol, col.grid=gridCol, lwd=2,
      baseline=0, ylim=c(0, isoUpper)))
  } else {
    # faceting case
    isoform_track <- lapply(levels(covariate), function(i) {
      gr_isoform_sub <- gr_isoform
      # subset to one half of the covariate vector
      idx <- covariate[1:n] == i
      mcols(gr_isoform_sub) <- mcols(gr_isoform_sub)[,idx]
      Gviz::DataTrack(
        gr_isoform_sub, type=c("p","a","g"), name=i,
        col=isoformCol, col.grid=gridCol, lwd=2,
        baseline=0, ylim=c(0, isoUpper))
    })
  }
  if (!regionProvided) {
    eps <- round(.2 * total_width)
    gvizFrom <- start(region) - eps
    gvizTo <- end(region) + eps
  } else {
    gvizFrom <- start(region)
    gvizTo <- end(region)
  }
  ##############################
  ## finally, plot the tracks ##
  ##############################
  tracks <- c(list(ideo_track, genome_track, gene_track,
                   qvalue_track, lfc_track),
              allele_track, # already list
              isoform_track) # already list
  tracks <- tracks[!sapply(tracks, is.null)]
  Gviz::plotTracks(
    tracks, from=gvizFrom, to=gvizTo,
    col.title=titleCol,
    col.axis=titleAxisCol,
    background.title=titleBgCol,
    col.baseline=baselineCol, ...)
}

grSelect <- function(gr, col) {
  out <- gr
  mcols(out) <- NULL
  mcols(out)[col] <- mcols(gr)[col]
  out
}

#' Plot allelic ratio heatmap 
#'
#' Plot allelic ratio heatmap over features and samples
#' using the pheatmap package. The a1/(a2 + a1) ratio
#' is displayed.
#' 
#' @param y a SummarizedExperiment (see \code{swish})
#' @param idx a numeric or logical vector of which features
#' to plot
#' @param breaks breaks passed along to pheatmap
#' @param cluster_cols logical, passed to pheatmap
#' @param main title of the plot
#' @param stripAfterChar for the column names, if specified
#' will strip allelic identifiers after this character,
#' default is hyphen. set to NULL to avoid this action
#' @param ... other arguments passed to pheatmap
#'
#' @return nothing, a plot is displayed
#'
#' @references
#'
#' The methods for allelic expression analysis are described in:
#' 
#' Euphy Wu, Noor P. Singh, Kwangbom Choi, Mohsen Zakeri, Matthew
#' Vincent, Gary A. Churchill, Cheryl L. Ackert-Bicknell, Rob Patro,
#' Michael I. Love.
#' "Detecting isoform-level allelic imbalance accounting for
#' inferential uncertainty" bioRxiv (2022)
#' \url{https://doi.org/10.1101/2022.08.12.503785}
#'
#' This function makes use of the pheatmap package:
#'
#' Kolde, Raivo. "Pheatmap: pretty heatmaps."
#' R package version 1.2 (2012): 726.
#' 
#' @export
plotAllelicHeatmap <- function(y, idx,
                               breaks=NULL,
                               cluster_cols=FALSE,
                               main="Allelic ratio",
                               stripAfterChar="-",
                               ...) {
  if (!requireNamespace("GenomeInfoDb", quietly=TRUE)) {
    stop("plotAllelicHeatmap() requires 'pheatmap' CRAN package")
  }
  stopifnot(ncol(y) > 1)
  if ("mean" %in% assayNames(y)) {
    cts <- assay(y, "mean")[idx,,drop=FALSE]
    message("using posterior mean for calculating ratio")
  } else {
    cts <- assay(y, "counts")[idx,,drop=FALSE]
    message("using counts, for posterior mean, run computeInfRV")
  }
  stopifnot("allele" %in% names(colData(y)))
  cts_a2 <- cts[,y$allele == "a2",drop=FALSE]
  cts_a1 <- cts[,y$allele == "a1",drop=FALSE]
  tot <- cts_a2 + cts_a1
  ratio <- cts_a1 / tot
  if (!is.null(breaks)) {
    delta <- max(abs(ratio - 0.5))
    breaks <- seq(from=0.5 - delta, 0.5 + delta, length.out=101)
  }
  if (!is.null(stripAfterChar)) {
    colnames(ratio) <- sub(paste0(stripAfterChar,".*"),
                           "",colnames(ratio))
  }
  pheatmap::pheatmap(ratio, breaks=breaks,
                     cluster_cols=cluster_cols,
                     main=main, ...)
}
mikelove/fishpond documentation built on Aug. 29, 2023, 2:45 p.m.