#' @importFrom methods new is
#' @import BiocGenerics
#' @import trena
#' @import MotifDb
#' @import RPostgreSQL
#' @import GenomicRanges
#' @importFrom AnnotationDbi select
#'
#' @name trenaSGM-class
#' @rdname trenaSGM-class
#' @exportClass trenaSGM
.trenaSGM <- setClass("trenaSGM",
slots=c(
genomeName="character",
targetGene="character",
quiet="logical",
state="environment"
),
)
#------------------------------------------------------------------------------------------------------------------------
setGeneric('calculate', signature='obj', function (obj, strategies) standardGeneric ('calculate'))
setGeneric('summarizeModels', signature='obj', function (obj, orderBy, maxTFpredictors, models=NA) standardGeneric ('summarizeModels'))
setGeneric('execute.footprint.strategy', signature='obj', function (obj, strategy) standardGeneric ('execute.footprint.strategy'))
#------------------------------------------------------------------------------------------------------------------------
#' Create a trenaSGM object
#'
#' @description
#' tell us what we need to know
#'
#' @rdname trenaSGM-class
#'
#' @param genomeName hg38, mm10, ...
#' @param targetGene in same vocabulary as rownames in the expression matrix
#' @param quiet do or do not print progress information
#'
#' @return An object of the trenaSGM class
#'
#'
#' @examples
#' if(interactive()){
#' load(system.file(package="trenaSGM", "extdata", "mayo.tcx.RData"))
#' pfms <- query(MotifDb, c("sapiens", "jaspar2018"))
#' sgm <- trenaSGM("hg38", "TREM2", mtx, pfms,
#' strategies=list(footprints="5000.5000.remapped"),
#' tfCorrelation=0.2)
#' }
#'
#' @export
trenaSGM <- function(genomeName, targetGene, quiet=TRUE)
{
obj <- .trenaSGM(genomeName=genomeName,
targetGene=targetGene,
quiet=quiet,
state=new.env(parent=emptyenv()))
obj
} # trenaSGM
#------------------------------------------------------------------------------------------------------------------------
#' create regulatory model of the gene, following all the specified options
#'
#' @rdname calculate
#' @aliases calculate
#'
#' @param obj An object of class trenaSGM
#' @param strategies a list of lists, specifying all the options to build one or more models
#'
#' @return A list with a bunch of tables...
#'
#' @examples
#' if(interactive()){
#' pfms <- query2(MotifDb, c("sapiens", "jaspar2018"))
#' load(system.file(package="trenaSGM", "extdata", "mayo.tcx.RData"))
#' sgm <- trenaSGM("hg38", "TREM2", mtx, pfms,
#' strategies=list(footprints="5000.5000.remapped"),
#' tfCorrelation=0.3)
#' calculate(sgm)
#' }
#'
#' @export
setMethod('calculate', 'trenaSGM',
function (obj, strategies) {
strategy.types <- unlist(lapply(strategies, function(s) s$type), use.names=FALSE)
stopifnot(all(strategy.types %in% c("footprint.database",
"regions.motifMatching",
"noDNA.tfsSupplied")))
obj@state$strategies <- strategies
models <- list()
for(name in names(strategies)){
strategy <- strategies[[name]]
if(!obj@quiet)
printf(" trenaSGM::calculate building %s of type %s", name, strategy$type)
if(strategy$type == "footprint.database")
builder <- FootprintDatabaseModelBuilder(obj@genomeName, obj@targetGene, strategy, quiet=obj@quiet)
if(strategy$type == "regions.motifMatching")
builder <- RegionsMotifMatchingModelBuilder(obj@genomeName, obj@targetGene, strategy, quiet=TRUE)
if(strategy$type == "noDNA.tfsSupplied")
builder <- NoDnaModelBuilder(obj@genomeName, obj@targetGene, strategy, quiet=TRUE)
models[[name]] <- build(builder)
} # for strategy name
obj@state$models <- models
return(models)
})
#------------------------------------------------------------------------------------------------------------------------
#' build a model based on the specified footprint strategy
#'
#'
#' @rdname execute.footprint.strategy
#' @aliases execute.footprint.strategy
#'
#' @param obj An object of class trenaSGM
#' @param footprint.strategy a list of parameters for footprint-driven model building
#'
#'
#' @return A list with two data.frames: a model, and motif regions
#'
#' @examples
#' if(interactive()){
#' }
#'
#' @export
setMethod('execute.footprint.strategy', 'trenaSGM',
function (obj, strategy) {
return(list(model=data.frame(), regions=data.frame()))
})
#------------------------------------------------------------------------------------------------------------------------
#' summarize the gene regulatory models preciously calculated by this trensSGM instance
#'
#' @rdname summarizeModels
#' @aliases summarizeModels
#'
#' @param obj An object of class trenaSGM
#' @param orderBy a characters string, the name of the column in the standard trena model data.frame, typically pcaMax, rfScore, pearsonCoeff
#' @param maxTFpredictors an integer, the number of tfs to extract from each model (when present)
#' @param models a list of lists of model + regulatoryRegion data.fames, default NA, in which case the list is found in object state data
#'
#' @return A data.frame summarizing tfs by model type, by count (the number of models in which it appears) and rank in each model
#'
#' @examples
#' if(interactive()){
#' load(system.file(package="trenaSGM", "extdata", "mayo.tcx.RData"))
#' genome <- "hg38"
#' targetGene <- "TREM2"
#' sgm <- trenaSGM(genome, targetGene)
#' chromosome <- "chr6"
#' upstream <- 2000
#' downstream <- 200
#' tss <- 41163186
#'
#' # strand-aware start and end: trem2 is on the minus strand
#' start <- tss - downstream
#' end <- tss + upstream
#'
#' build.spec <- list(title="fp.2000up.200down.04",
#' type="footprint.database",
#' chrom=chromosome,
#' start=start,
#' end=end,
#' tss=tss,
#' matrix=mtx,
#' db.host="khaleesi.systemsbiology.net",
#' databases=list("brain_hint_20"),
#' motifDiscovery="builtinFimo",
#' tfMapping="MotifDB",
#' tfPrefilterCorrelation=0.4,
#' solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman"))
#'
#'
#' build.spec.2 <- build.spec
#' build.spec.2$title <- "fp.2000up.200down.02"
#' build.spec.2$tfPrefilterCorrelation=0.2
#'
#' strategies <- list(one=build.spec, two=build.spec.2)
#' models <- calculate(sgm)
#' tbl.summary <- summarizeModels(sgm, orderBy="rfScore", maxTFpredictors=5)
#' }
#'
#' @export
setMethod('summarizeModels', 'trenaSGM',
function(obj, orderBy, maxTFpredictors, models=list()) {
if(length(models) == 0)
models <- obj@state$models
# collect the top tfs from each model
top.tfs <- c()
for(model in models){
tbl.model <- model$model
stopifnot(orderBy %in% colnames(tbl.model))
tbl.model <- tbl.model[order(tbl.model[, orderBy], decreasing=TRUE),]
tfs.this.model <- tbl.model$gene
if(nrow(tbl.model) > maxTFpredictors)
tfs.this.model <- tfs.this.model[1:maxTFpredictors]
top.tfs <- c(top.tfs, tfs.this.model)
} # for model
top.tfs <- sort(unique(top.tfs))
#------------------------------------------------------------
# build up an empty table
#------------------------------------------------------------
#strategies <- obj@state$strategies
#model.names <- unlist(lapply(strategies, function(strategy) strategy$title), use.names=FALSE)
model.names <- names(models)
count <- length(top.tfs)
empty.column <- rep(0, count)
tbl <- data.frame(bogus.column=empty.column)
for(i in seq_len(length(model.names))){
tbl <- cbind(tbl, empty.column, stringsAsFactors=FALSE)
} # for i
bogus.column.position <- grep("bogus.column", colnames(tbl))
tbl <- tbl[, -bogus.column.position]
model.columns <- grep("empty.column", colnames(tbl))
colnames(tbl)[model.columns] <- model.names
rownames(tbl) <- top.tfs
#------------------------------------------------------------
# now populate the table
#------------------------------------------------------------
failed.match <- NA
stopifnot(length(model.names) == length(models))
for(i in seq_len(length(model.names))){
model.name <- model.names[i]
tbl.model <- models[[i]]$model
tbl[top.tfs, model.name] <- match(top.tfs, tbl.model$gene, nomatch=failed.match)
} # for model.name
rank.sum <- apply(tbl, 1, function(row) sum(row, na.rm=TRUE))
observed <- apply(tbl, 1, function(row) length(which(!is.na(row))))
tbl$rank.sum <- rank.sum
tbl$observed <- observed
tbl <- tbl[order(tbl$rank.sum, decreasing=FALSE),]
tbl
})
#------------------------------------------------------------------------------------------------------------------------
#' specify the universe of recognized transcription factor gene symbols
#'
#' @rdname allKnownTFs
#' @aliases allKnownTFs
#'
#' @param obj An object of class ModelBuilder, or any subclass
#' @param source A character string, currently only (and defaulted to) "GO:DNAbindingTranscriptionFactorActivity"
#' @param identifierType A string, one of "geneSymbol" (the default), "entrezGeneID", "ensemblGeneID"
#'
#' @export
allKnownTFs <- function(source="GO:DNAbindingTranscriptionFactorActivity", identifierType="geneSymbol")
{
# todo: this function promises flexibility for GO term, but does not deliver.
stopifnot(source =="GO:DNAbindingTranscriptionFactorActivity")
stopifnot(identifierType %in% c("geneSymbol", "entrezGeneID", "ensemblGeneID", "ensembl|geneSymbol"))
load(system.file(package="trenaSGM", "extdata", "tfCollections",
"GO_00037000_DNAbindingTranscriptionFactorActivityHuman.RData"))
if(identifierType != "geneSymbol"){
printf("----------- select(org.Hs.eg.db, ...) in trenaSGM::allKnownTFs");
suppressMessages(tbl.ids <- select(org.Hs.eg.db, keytype="SYMBOL",
keys=tfs,
columns=c("ENSEMBL", "ENTREZID")))
tfs <- switch(identifierType,
"ensemblGeneID" = {unique(tbl.ids$ENSEMBL)},
"entrezGeneID" = {unique(tbl.ids$ENTREZID)},
"ensembl|geneSymbol" = {unique(paste(tbl.ids$ENSEMBL, tbl.ids$SYMBOL, sep="|"))}
)
removers <- which(is.na(tfs))
if(length(removers) > 0)
tfs <- tfs[-removers]
} # if !geneSymbol
tfs
} # allKnownTFs
#------------------------------------------------------------------------------------------------------------------------
#' trim an expansive model to a more credible one, using parameterized voting scheme
#'
#' @rdname trimModel
#' @aliases trimModel
#'
#' @param tbl.model a data.frame, an expansive, unfiltered trena model data.frame, with perhaps > 200 tf rows
#' @param tbl.reg a data.frame, included here only so that it can be returned, filtered to exclude binding sites for tfs
#' which did not make it into the trimmed model
#' @param votesNeeded, numeric, typically 2 or 3, at least this many votes, by any solver method, is required to keep
#' @param tf.keepers, a character vector, keep these in the model no matter how well they performed.
#' a tf in the model
#' @export
trimModel <- function(tbl.model, tbl.reg, votesNeeded=3, tf.keepers=c())
{
matched.keeper.rows <- unlist(lapply(tf.keepers, function(tf) grep(tf, tbl.model$gene)))
pvals <- -log10(tbl.model$lassoPValue)
good.lassoPval <- which(pvals > 5)
good.betaLasso <- which(abs(tbl.model$betaLasso) > 0.1)
good.betaRidge <- which(abs(tbl.model$betaRidge) > 0.1)
spearman.cutoff <- fivenum(abs(tbl.model$spearmanCoeff))[4]
good.spearmanCoeff <- which(abs(tbl.model$spearmanCoeff) >= spearman.cutoff)
randomForest.cutoff <- fivenum(tbl.model$rfScore)[4]
forest.tfs <- subset(tbl.model, rfScore >= randomForest.cutoff)$gene
good.rfScore <- unlist(lapply(forest.tfs, function(tf) grep(tf, tbl.model$gene)))
all.counts <- c(good.lassoPval, good.betaLasso, good.betaRidge, good.spearmanCoeff, good.rfScore)
tbl.freq <- as.data.frame(table(all.counts), stringsAsFactors=FALSE)
colnames(tbl.freq) <- c("rowNumber", "count")
tbl.freq <- tbl.freq[order(tbl.freq$count, decreasing=TRUE),]
tbl.freq$rowNumber <- as.integer(tbl.freq$rowNumber)
good.tf.rows <- subset(tbl.freq, count >= votesNeeded)$rowNumber
not.yet.included <- setdiff(matched.keeper.rows, good.tf.rows)
if(length(not.yet.included) > 0)
good.tf.rows <- c(good.tf.rows, not.yet.included)
tbl.model <- tbl.model[good.tf.rows,]
new.order <- order(abs(tbl.model$spearmanCoeff), decreasing=TRUE)
tbl.model <- tbl.model[new.order,]
if("geneSymbol" %in% colnames(tbl.reg))
tbl.reg <- subset(tbl.reg, geneSymbol %in% tbl.model$gene)
if("tf" %in% colnames(tbl.reg))
tbl.reg <- subset(tbl.reg, tf %in% tbl.model$gene)
# otherwise - shoddy programming, this! - don't reduce tbl.reg
rownames(tbl.model) <- NULL
rownames(tbl.reg) <- NULL
return(list(model=tbl.model, regulatoryRegions=tbl.reg))
} # trimModel
#------------------------------------------------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.