R/utils.R

#library(shiny)
#library(geuvPack)
#if (!exists("geuFPKM")) data(geuFPKM)
#library(ggplot2)
#library(BiocRnaHap)
#library(magrittr)
#library(dplyr)
#
#allg = sort(genegr$gene_name)

# varnum_to_use = 2:6
# haptab = NA06986_rnahaps

#' produce a filtered RNA-haplotype table as GRanges, limiting
#' to SNP-sets of specific cardinality and (optionally) providing a set
#' of gene ranges 'nearest' to the initial SNP
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @param haptab data.frame corresponding to phaser haplotypes.txt table
#' @param genegr a GRanges with gene ranges
#' @param varnum_to_use the values of the 'variants' field for which records are retained
#' @param useNearestRubric logical(1) defaults to FALSE
#' @examples
#' head(filter_haptab())
#' @export
filter_haptab = function(haptab=BiocRnaHap::NA06986_rnahaps,
   genegr = ensembldb::genes(EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75),
   varnum_to_use=2:6, useNearestRubric=FALSE) {
 haptab_gr = GenomicRanges::GRanges(haptab$contig, 
              IRanges::IRanges(haptab$start, haptab$stop))
 mcols(haptab_gr) = haptab[,-c(1:4)]
 haptab_gr_filt = haptab_gr[haptab_gr$variants %in% varnum_to_use]
 haptab_gr_filt = haptab_gr_filt[ grep("^rs", haptab_gr_filt$variant_ids)]
 if (useNearestRubric) {
      gn = genegr[nearest(haptab_gr_filt, genegr)]
      gn = gn[which(nchar(gn$gene_name)>0)]
      }
 else gn = genegr
 list(haptab_filt = haptab_gr_filt, genes_near=gn)
}

#' produce a GRanges with RNA haplotypes near a specific gene
#' @param sym character(1) gene symbol
#' @param haptab data.frame corresponding to phaser haplotypes.txt table
#' @param genes a GRanges as produced by ensembldb::genes
#' @param radius numeric(1) interval upstream and downstream to be included with gene region for selection of RNA haplotypes
#' @param min_total_reads numeric(1) lower bound on number of reads needed to retain a haplotype
#' @param min_variants lower bound on number of variants in the haplotype to warrant retention
#' @param varnum_to_use the values of the 'variants' field for which records are retained
#' @note grep() is used with fixed=TRUE to find sym in genes()$gene_name
#' @examples
#' rnahapsNearGene("GSDMB")
#' @export
rnahapsNearGene = function(sym, haptab=NA06986_rnahaps,
       genes=ensembldb::genes(EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75),
       radius=10000, min_total_reads=15, min_variants=2, 
       varnum_to_use = 2:6) {
 ind = grep(sym,genes$gene_name,fixed=TRUE)
 if (length(ind)==0) stop("sym not found")
 if (length(ind)>1) {
    warning("multiple representations of symbol, using first")
    ind = ind[1]
    }
 haptab_gr_filt = filter_haptab(haptab)$haptab_filt
# haptab_gr = GRanges(haptab$contig, IRanges(haptab$start, haptab$start))
# mcols(haptab_gr) = haptab[,-c(1:4)]
# haptab_gr_filt = haptab_gr[haptab_gr$variants %in% varnum_to_use]
# haptab_gr_filt = haptab_gr_filt[ grep("^rs", haptab_gr_filt$variant_ids)]
 chk = subsetByOverlaps(haptab_gr_filt, genes[ind]+radius)
 chk[which(chk$reads_total>=min_total_reads & chk$variants >= min_variants)]
}
#' compute a data.frame with GEUVADIS expression for a selected gene
#' and associated haplotypes formed using entries in snps
#' @note it is assumed that the entries in `snps` are elements of
#' a 'long-range' RNA-based haplotype as
#' inferred by phaser
#' @param sym a character(1) gene symbol
#' @param snps a vector of SNP identifiers
#' @param vcf a CompressedVcf instance from VariantAnnotation
#' @param \dots not currently used
#' @examples
#' requireNamespace("geuvPack")
#' if (!exists("geuFPKM")) {
#'   require(geuvPack)
#'   data(geuFPKM)
#'   }
#' tab = geuvEvH(vcf=BiocRnaHap::gsd_vcf)
#' head(tab)
#' require(ggplot2)
#' ggplot(tab, aes(x=haplotypes, y=exprs, colour=popcode)) + 
#'   geom_boxplot(aes(group=haplotypes), outlier.size=0) + 
#'   geom_jitter() 
#' @export
geuvEvH = function (sym = "GSDMB", snps = c("rs2305480", "rs2305479"), 
   vcf = NULL, ...) 
{
    if (!exists("geuFPKM")) 
        stop("load geuvPack and data(geuFPKM) to use this function")
    ind = grep(sym, rowData(geuFPKM)$gene_name)
    stopifnot(length(ind) > 0)
    if (length(ind) > 1) 
        warning("multiple quantifications for requested gene, using first available")
    ex = log(assay(geuFPKM[ind, ]) + 1)
    names(ex) = colnames(geuFPKM)
    if (is.null(vcf)) vcf = look1kg(snps)
    gtvec = apply(geno(vcf)$GT, 2, paste, collapse = ":")
    okids = intersect(colnames(geuFPKM), names(gtvec))
    ex = as.numeric(ex[okids])
    names(ex) = okids
    pc = geuFPKM[,okids]$popcode
    data.frame(ids = okids, exprs = ex, haplotypes = gtvec[okids], 
        gene = sym, popcode=pc)
}

#' create list of snpids in RNA-defined haplotypes
#' @param sym character(1) gene symbol
#' @param radius numeric(1) interval around gene to include
#' @param rna_hap_tab a data.frame as generated by phaser
#' @param \dots passed to rnahapsNearGene
#' @return a list
#' @examples
#' variant_options()
#' @export
variant_options = function(sym="ORMDL3", radius=50000, rna_hap_tab=NA06986_rnahaps, ...) {
 ans = mcols(rnahapsNearGene(sym=sym, radius=radius, 
     haptab=rna_hap_tab, ...))$variant_ids
 strsplit(ans, ",")
}

setClass("HapTab", representation(table="data.frame",
  source="character"))
#' encapsulate phaser output text tables
#' @param datfr data.frame instance importing 'haplotypes.txt'
#' @param source character(1) tag characterizing origin
#' @examples
#' HapTab(NA06986_rnahaps, "phaser NA06986")
#' @export
HapTab = function(datfr, source)
 new("HapTab", table=datfr, source=source)
setMethod("show", "HapTab", function(object) {
 cat(sprintf("HapTab with %s records, source: %s\n",
    nrow(object@table), object@source))
})

setClass("RnaHapSet", representation(
  genesym = "character",
  radius = "numeric",
  haplist = "list",
  source = "HapTab"))
#' encapsulate a list of SNP configurations produced as haplotypes by phaser
#' @param genesym character(1) symbol
#' @param radius numeric(1) region around gene to include
#' @param haplist a list as returned by `variant_options`
#' @param source a HapTab instance
#' @examples
#' ht = HapTab(NA06986_rnahaps, "phaser NA06986")
#' RnaHapSet("ORMDL3", radius=50000, source=ht, 
#'     haplist=variant_options("ORMDL3", 50000))
#' @export
RnaHapSet = function(genesym, radius, source,
   haplist=variant_options(genesym, radius)) {
 new("RnaHapSet", genesym=genesym, radius=radius, haplist=haplist,
       source=source)
}
setMethod("show", "RnaHapSet", function(object) {
 cat(sprintf("RnaHapSet for gene %s (radius %s):\n", 
      object@genesym, object@radius))
 cat(sprintf(" there are %s haplotypes.\n", length(object@haplist)))
 ii = lapply(object@haplist, paste, collapse=":")
 jj = lapply(ii, function(x) paste0(x, ", "))
 hapegs = do.call(paste0, jj)
 if (nchar(hapegs)>55) hapegs = paste0(substr(hapegs, 1, 55), "...")
 cat("    ", hapegs, "\n")
})
vjcitn/BiocRnaHap documentation built on June 17, 2019, 9:31 p.m.