# R/lite.R


# We do not wrap mongolite in anything; all access is assumed to go through mongolite.

#' list collections in AWS mongo server for txregnet
#' @import rjson
#' @param ignore NULL by default; otherwise an integer vector giving the lines of the
#' mongo db.getCollectionNames() output that should be ignored
#' @param url a valid mongodb URL
#' @param db character(1) db name
#' @param cliparms character(1) arguments to 'mongo', defaults to '--quiet --eval'
#' @return a character vector of collection names
#' @note Different mongodb servers can have different response prologues.  The ignore
#' parameter is there to bypass some of the initial text.  However, with the --quiet option
#' this may not be needed.  We now search for "[" to start parsing the collection list output.
#' @examples
#' if (verifyHasMongoCmd()) txregCollections()[seq_len(5)]
#' @export
txregCollections = function(ignore = NULL, url = URL_txregInAWS(), db = "txregnet",
       cliparms="--quiet --eval") {
    url = gsub("test", db, url)
    #dbref = sprintf("%s/%s", url, db)
    lis = system2("mongo", args=c(url, cliparms, "'db.getCollectionNames()'"), stdout = TRUE)
    if (!is.null(ignore)) lis = lis[-ignore]
    cstart = grep("^\\[$", lis)
    if (length(cstart)==0) stop("could not find '[' isolated in the response.")
    rjson::fromJSON(paste0(lis[(cstart):length(lis)], collapse = ""))
}

#' get names of fields in a collection in remote txregnet
#' @param collection character(1) name of collection
#' @param check logical(1) if TRUE will verify that \code{collection} is present
#' @param url character(1) mongodb url
#' @param db character(1) mongodb db name
#' @param limitn numeric(1) number of records to probe to get field names
#' @return a vector of strings
#' @examples
#' getFieldNames('CD34_DS12274_hg19_FP', check=FALSE) # we know this collection is there
#' @export
getFieldNames = function(collection, check = TRUE, url = URL_txregInAWS(), db = "txregnet", 
    limitn = 1) {
    if (check) {
        allcoll = txregCollections(url = url, db = db)
        stopifnot(collection %in% allcoll)
    }
    ans = mongo(url = url, db = db, collection = collection)$find("{}", limit = limitn)
    names(ans)
}

#' operate on a character vector to derive a DataFrame, splitting on a token and retrieving the first and last split fragments as 'base' and 'type' fields
#' @param x character vector
#' @param spltok token to use in strsplit
#' @return a DataFrame instance
#' @examples
#' some = c('Adipose_Subcutaneous_allpairs_v7_eQTL',
#'   'CD14_DS17215_hg19_FP',
#'   'CD19_DS17186_hg19_FP',
#'   'ENCFF001WGV_hg19_HS',
#'   'ENCFF994OCD_hg19_HS')
#' basicFormatter(some)
#' @export
basicFormatter = function(x, spltok = "_") {
    xs = strsplit(x, spltok)
    type = vapply(xs, tail, character(1), n = 1)
    base = vapply(xs, "[", character(1), i = 1)
    mid = vapply(xs, function(z) paste(z[-c(1, length(z))], collapse = "_"), character(1))
    DataFrame(base = base, type = type, mid = mid)
}
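# Illustration (a sketch of the expected output, derived from the splitting rule above):
# basicFormatter('CD14_DS17215_hg19_FP') yields a one-row DataFrame with
#   base = "CD14", type = "FP", mid = "DS17215_hg19"
# i.e. the first token, the last token, and the middle tokens rejoined with '_'.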

#' generate a colData component corresponding to a mongodb database
#' @param url character(1) url for mongodb
#' @param db character(1) database name
#' @param formatter a function that takes in a character vector and
#' returns a DataFrame with number of rows equal to the length of input
#' @return a DataFrame instance
#' @examples
#' if (verifyHasMongoCmd()) makeColData()
#' @export
makeColData = function(url = URL_txregInAWS(), db = "txregnet", formatter = basicFormatter) {
    cd = formatter(tt <- txregCollections(url = url, db = db))  # use the supplied formatter
    rownames(cd) = tt
    cd
}

#' generate JSON to aggregate (counting records, and, by default, averaging
#' a given variable) within a collection
#' @param by character(1) telling the field for stratifying records for aggregation 
#' @param vbl character(1) telling field with numerical value for which a statistic will be computed within strata defined by 'by'
#' @param opname character(1) giving the name under which the computed statistic appears in the aggregation output
#' @param op character(1) evaluating to a mongo aggregation operator like `$avg` or `$min`
#' @return a JSON document as produced by rjson::toJSON
#' @note This produces json that can be used as an argument to m$aggregate() for m a mongolite::mongo instance
#' @examples
#' makeAggregator()
#' if (interactive() & verifyHasMongoCmd()) {
#'    remURL = URL_txregInAWS()
#'    colls = listAllCollections( url=remURL, db = 'txregnet')
#'    m1 = mongo(url = remURL, db = 'txregnet', 
#'      collection='CD14_DS17215_hg19_FP')
#' # find minimum value of statistic 'stat' per chromosome
#'    newagg = makeAggregator( by='chr',
#'      vbl='stat', op='$min', opname='min')
#'    tab = m1$aggregate( newagg )
#'    head(tab)
#'    }
#' @export
makeAggregator = function(by = "chrom", vbl = "chromStart", opname = "average", op = "$avg") {
    prependedBy = paste("$", by, sep = "")
    prependedVbl = paste("$", vbl, sep = "")
    counter = list(`_id` = prependedBy, count = list(`$sum` = 1))
    realop = list(prependedVbl)
    names(realop) = op
    opl = list(realop)
    names(opl) = opname
    rjson::toJSON(list(list(`$group` = c(counter, opl))))
}
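# With the defaults, the JSON produced should look like the following (a sketch derived
# from the code above, not verified against a live server):
# [{"$group":{"_id":"$chrom","count":{"$sum":1},"average":{"$avg":"$chromStart"}}}]
# i.e. group records by 'chrom', count them, and average 'chromStart' within each group.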
# m1$aggregate( jq )

#' convert a GRanges to a JSON query for mongodb
#' @importFrom GenomicRanges GRanges start end
#' @param queryGRange a \code{\link[GenomicRanges]{GRanges-class}} instance of length 1
#' @param cfields a named character(3) vector with names 'chrom', 'start',
#' 'end'; the element values will be used to name document fields in the query
#' @return a JSON document generated by rjson::toJSON
#' @examples
#' gr = GenomicRanges::GRanges('chr1', IRanges::IRanges(1,25000))
#' grConverter(gr, cfields=c(chrom='chr', start='start', end='end'))
#' @export
grConverter = function(queryGRange, cfields = c(chrom = "chrom", start = "chromStart", 
    end = "chromEnd")) {
    stopifnot(length(queryGRange) == 1)
    stopifnot(all(names(cfields) == c("chrom", "start", "end")))
    # lis has provisional names which will be replaced
    ss = as.character(GenomeInfoDb::seqnames(queryGRange))
    g = grep("chr", ss)  # assume mongo will use integer if NCBI annotation used, ignoring M for now
    if (length(g) == 0) 
        ss = as.integer(ss)
    lis = list(chr = ss, chromStart = c(`$gte` = start(queryGRange)), chromEnd = c(`$lte` = end(queryGRange)))
    names(lis) = cfields
    rjson::toJSON(lis)  # bplapply cannot find without ::
}
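# Illustration (a sketch derived from the code above; field names follow the default cfields
# and the exact numeric formatting comes from rjson::toJSON):
# grConverter(GenomicRanges::GRanges('chr1', IRanges::IRanges(1, 25000)))
# should produce the JSON query
# {"chrom":"chr1","chromStart":{"$gte":1},"chromEnd":{"$lte":25000}}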

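# Column names that GenomicRanges reserves for GRanges internals; sbov drops any
# query-result columns with these names before placing the remainder in mcols().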
bannedColnames = function() c("seqnames", "ranges", "strand", "seqlevels", "seqlengths", 
    "isCircular", "start", "end", "width", "element")

# The purpose of this class is (1) to establish an iterator over collections S in
# X[,S] and (2) to permit use of the sparseAssay and compactAssay methods.

setOldClass("mongo")
#' @importClassesFrom RaggedExperiment RaggedExperiment
#' @importFrom RaggedExperiment RaggedExperiment
#' @importFrom GenomicRanges GRangesList
#' @importFrom SummarizedExperiment colData 'colData<-' 'metadata<-' mcols assays rowRanges
setClass("RaggedMongoExpt", representation(con = "mongo"), contains = "RaggedExperiment")

#' bind colData to a mongo-based ragged-experiment incubator
#' @importFrom methods is new
#' @param con a mongolite::mongo instance
#' @param colData a DataFrame instance
#' @return instance of RaggedMongoExpt
#' @export
RaggedMongoExpt = function(con, colData) {
    nsamp = nrow(colData)
    # need a dummy GRanges
    if (nsamp > 0) {
        dum = GRanges("chr0", IRanges(2, 1))
        initass = GRangesList(lapply(seq_len(nsamp), function(x) dum))
        names(initass) = rownames(colData)
    } else initass = GRangesList()
    ans = RaggedExperiment(initass)  #
    colData(ans) = cbind(colData(ans), colData)  #, colData=colData)
#    colnames(ans) = rownames(colData) -> mergeROWS changes triggered error here 31 March 2019
    ans = new("RaggedMongoExpt", ans, con = con)
    colData(ans) = cbind(colData(ans), colData)
    ans
}
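# A minimal construction sketch (mirrors the sbov example below; assumes network access
# and that TxRegInfra::basicColData describes the collections on the server):
#   m1 = mongolite::mongo(url = URL_txregInAWS(), db = "txregnet")
#   cd = TxRegInfra::basicColData
#   rme = RaggedMongoExpt(m1, cd[which(cd$type == "FP"), ])
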
#' determine the fields present in a txregnet document
#' @param rme instance of RaggedMongoExpt
#' @param docTypeName character(1) telling the name of the column of colData(rme) that supplies information on document type
#' @return a character vector
#' @examples
#' getDocumentFields
#' @export
getDocumentFields = function(rme, docTypeName = "type") {
    curmongo = rme@con
    conmeta = parent.env(curmongo)$orig  # risky, not in API?
    cd = colData(rme)
    stopifnot(docTypeName %in% names(cd))
    allcolls = rownames(cd)
    allty = cd[[docTypeName]]
    utypes = unique(colData(rme)[[docTypeName]])
    inds = vapply(utypes, function(x) min(which(allty == x)), numeric(1))
    fields = lapply(inds, function(x) getFieldNames(url = conmeta$url, db = conmeta$db, 
        collection = allcolls[x], check = FALSE))
    names(fields) = utypes
    fields
}

# buildQueries = function(gr, chrtags, starttags, endtags) {
# stopifnot(length(chrtags)==length(starttags), length(chrtags)==length(endtags))
# lapply(seq_len(length(chrtags)), function(x) grConverter(gr,
# c(chrom=chrtags[x], start=starttags[x], end=endtags[x]))) }

# Example output of getQueryFields(rme1), by document type:
#   $eQTL: 'gene_id' 'variant_id' 'tss_distance' 'ma_samples' 'ma_count' 'maf'
#          'pval_nominal' 'slope' 'slope_se' 'qvalue' 'chr' 'snp_pos' 'A1' 'A2' 'build'
#   $FP:   'chr' 'start' 'end' 'id' 'stat'
#   $HS:   'chrom' 'chromStart' 'chromEnd' 'name' 'score' 'strand' 'signalValue'
#          'pValue' 'qValue'

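# basicCfieldsMap records, for each document type, which document fields play the roles
# of chromosome, start, and end; makeGRConverterList and sbov look entries up by the
# 'type' value in colData, via map[[type]]$cfields.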
basicCfieldsMap = function() {
    list(
        eQTL = list(cfields = c(chrom = "chr", start = "snp_pos", end = "snp_pos")),
        FP   = list(cfields = c(chrom = "chr", start = "start", end = "end")),
        HS   = list(cfields = c(chrom = "chrom", start = "chromStart", end = "chromEnd"))
    )
}

#' generate a list of GRanges to JSON for queries to mongo
#' @param rme RaggedMongoExpt instance
#' @param map list of lists of named character vectors
#' @param docTypeName character(1) that identifies sample 'type'
#' @return a list of JSON documents
#' @export
makeGRConverterList = function(rme, map = basicCfieldsMap(), docTypeName = "type") {
    # return closures that will behave properly when evaluated on a GRanges input
    doctypes = colData(rme)[[docTypeName]]
    utypes = unique(doctypes)
    ans = lapply(utypes, function(x) function(gr) grConverter(gr, cfields = map[[x]]$cfields))
    names(ans) = utypes
    ans
}

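# simplify_sbov flattens a single-type RaggedExperiment returned by sbov into a GRanges:
# the assay components become mcols columns, the originating collection names are kept
# in mcols()$origin, and metadata (including docType) is carried over.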
simplify_sbov = function(x) {
 DF = DataFrame(as.list(assays(x)))
 names(DF) = names(assays(x))
 rownames(DF) = NULL
 myr = rowRanges(x)
 mcols(myr) = DF
 metadata(myr) = metadata(x)
 mcols(myr)$origin = colnames(x)
 myr
}


#' prototype of subsetter for mongo resource
#' @importFrom BiocParallel bplapply
#' @importFrom S4Vectors 'mcols<-' tail DataFrame metadata
#' @param rme RaggedMongoExpt instance
#' @param gr GRanges instance to subset by
#' @param map list with one element per document type telling what fields are chr, start, stop
#' @param docTypeName character(1) naming column of colData(rme) that has document type
#' @param simplify logical(1) if TRUE, a GRanges is assembled; if FALSE,
#' a RaggedExperiment is returned 
#' @note This is a placeholder for a proper subsetByOverlaps method.  While
#' it would appear sensible to allow subsetting for diverse assay types
#' (e.g., eQTL, DGF), in version 1.5.x this is not supported.  Each
#' call can operate on only one sample type.
#' @return if simplify is TRUE (the default), a GRanges is returned with assay components in the mcols; otherwise a RaggedExperiment instance, with metadata component 'docType' set to propagate the input type
#' @examples
#' requireNamespace('mongolite')
#' if (verifyHasMongoCmd()) { # makeColData must enumerate collections, so it needs
#'                            # the system 'mongo' command, not just mongolite functions
#'  m1 = mongolite::mongo(url=URL_txregInAWS(), db='txregnet')
#'  #cd = makeColData(url=URL_txregInAWS(), db='txregnet')
#'  cd = TxRegInfra::basicColData
#'  rme1 = RaggedMongoExpt(m1, cd[which(cd$type=='FP'),][seq_len(8),])
#'  BiocParallel::register(BiocParallel::SerialParam())
#'  qrng = GRanges('chr1', IRanges(1e6, 1.5e6))
#'  GenomeInfoDb::genome(qrng) = "hg19"
#'  ss = sbov(rme1, GRanges('chr1', IRanges(1e6, 1.5e6)), simplify=FALSE)
#' } 
#' @export
sbov = function(rme, gr, map = basicCfieldsMap(), docTypeName = "type",
   simplify=TRUE) {
    stopifnot(is(gr, "GRanges"), length(gr) == 1)
    typecol = colData(rme)[[docTypeName]]
    stopifnot(length(unique(typecol))==1)
    if (is.na(GenomeInfoDb::genome(gr)[1])) warning("genome is not set for the query GRanges")
    meta = parent.env(rme@con)$orig
    theurl = meta$url
    thedb = meta$db
    convs = makeGRConverterList(rme = rme, map = map, docTypeName = docTypeName)
    cd = colData(rme)
    allcols = rownames(cd)
    content = bplapply(seq_len(length(allcols)), function(x) {
        cat(".")
        ty = cd[[docTypeName]][x]
        conn = mongo(url = theurl, db = thedb, collection = rownames(cd)[x])
        res = conn$find(convs[[ty]](gr))
        if (nrow(res) == 0) 
            return(NULL)
        curmap = map[[ty]]$cfields
        curstrand = rep("*", nrow(res))
        if ("strand" %in% names(res)) 
            curstrand = ifelse(res$strand %in% c("+", "-"), res$strand, "*")
        gr = GRanges(res[, curmap["chrom"]], IRanges(res[, curmap["start"]], res[, 
            curmap["end"]]), strand = curstrand)
        badnm = match(names(res), bannedColnames())
        if (any(tt <- !is.na(badnm))) 
            res = res[, -which(tt)]
        mcols(gr) = res
        gr
    })
    badcon = vapply(content, is.null, logical(1))
    if (length(dr <- which(badcon)) > 0) {
        content = content[-dr]
        cd = cd[-dr, ]
    }
    names(content) = rownames(cd)
    ans = RaggedExperiment(assay = GRangesList(content), colData = cd)
    GenomeInfoDb::seqinfo(ans) = GenomeInfoDb::seqinfo(gr)
    metadata(ans)$docType = typecol[1]
    if (simplify) return(simplify_sbov(ans))
    ans
}

#' add gene symbols to a GRanges that uses ensembl identifiers
#' @param x a GRanges assumed to have mcols element `gene_id` with ensembl gene identifiers
#' @param EnsDb an instance of EnsDb
#' @note At this time no checking of reference genome consistency is performed.
#' @return a GRanges with mcols column 'symbol'
#' @examples
#' 
addsyms = function(x, EnsDb=EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75) {
  ensids = gsub("\\..*", "", x$gene_id) # strip the version suffix following the period
  gns = ensembldb::genes(EnsDb)
  x$symbol = gns[ensids]$symbol
  x
}
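# Usage sketch (hypothetical object 'ee': a GRanges returned by sbov on an eQTL
# collection, whose mcols carry the 'gene_id' field listed above):
#   ee = addsyms(ee)   # adds mcols(ee)$symbol via EnsDb.Hsapiens.v75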
