# runMany.R
#----------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#----------------------------------------------------------------------------------------------------
if(!interactive()) {
args <- commandArgs(trailingOnly=TRUE)
if(length(args) != 2){
printf("usage: Rscript runMany_noDNA.R <startGeneNumber> <endGeneNumber>")
stop()
}
stopifnot(length(args) == 2)
startGeneIndex <- as.integer(args[1])
endGeneIndex <- as.integer(args[2])
}
#------------------------------------------------------------------------------
library(RUnit)
library(trenaSGM)
library(MotifDb)
library(BiocParallel)
library(futile.logger)
library(RPostgreSQL)
library(org.Hs.eg.db)
#------------------------------------------------------------------------------
stopifnot(packageVersion("trena") >= "1.5.13")
stopifnot(packageVersion("trenaSGM") >= "0.99.76")
stopifnot(packageVersion("TrenaProjectBrainCell") >= "1.0.0")
stopifnot(packageVersion("TrenaProjectHG38") >= "1.0.2")
stopifnot(packageVersion("TrenaProject") >= "1.2.1")
stopifnot(packageVersion("trenaSGM") >= "0.99.92")
stopifnot(packageVersion("trena") >= "1.7.7")
#------------------------------------------------------------------------------
# we need a configuration file (of R commands), specified on the command line
# informing us of how to run this whole genome parallelized script
#------------------------------------------------------------------------------
configurationFile <- "config_noDNA.R"
stopifnot(file.exists(configurationFile))
if(!exists("configurationFileRead") || !configurationFileRead)
source(configurationFile)
#------------------------------------------------------------------------------
if(!interactive()){
args <- commandArgs(trailingOnly=TRUE)
stopifnot(length(args) == 2)
startGeneIndex <- as.integer(args[1])
endGeneIndex <- as.integer(args[2])
}
#-----------------------------------------------------------------------------
stopifnot(exists("trenaProject"))
stopifnot(exists("mtx"))
stopifnot(exists("goi"))
if(!file.exists(OUTPUTDIR)) dir.create(OUTPUTDIR)
if(!file.exists(LOGDIR)) dir.create(LOGDIR)
#----------------------------------------------------------------------------------------------------
basic.build.spec <- list(title=".noDNA.allTFs",
type="noDNA.tfsSupplied",
stageDirectory=OUTPUTDIR,
genomeName="hg38",
matrix=mtx,
candidateTFs=intersect(rownames(mtx), allKnownTFs()),
tfPool=allKnownTFs(),
tfs=allKnownTFs(),
tfPrefilterCorrelation=0.5,
annotationDbFile=dbfile(org.Hs.eg.db),
orderModelByColumn="rfScore",
solverNames=SOLVERS,
solvers=SOLVERS,
quiet=FALSE)
#----------------------------------------------------------------------------------------------------
buildModel <- function(short.spec)
{
required.fields <- c("targetGene", "runParallel")
missing.fields <- setdiff(required.fields, names(short.spec))
if(length(missing.fields) > 0){
msg <- sprintf("buildModel finds fields missing in short.spec: %s", paste(missing.fields, collapse=", "))
stop(msg)
}
printf("********* [building model for %s, parallel = %s]", short.spec$targetGene, short.spec$runParallel)
filename <- sprintf("%s/%s.RData", OUTPUTDIR, short.spec$targetGene)
system(sprintf("touch %s", filename)) # so an empty file exists if the model building fails
# this should be used if you need to skip genes for which the models are already in the OUTPUTDIR
# all.models <- list.files(OUTPUTDIR, full=T)
# good.models <- all.models[sapply(all.models, file.size) > 200] # I chose 200 because empty dataframes have a size of 160
#----------------
#tbl.geneLoc <- subset(tbl.geneInfo, geneSymbol==targetGene)[1,]
#chromosome <- tbl.geneLoc$chrom
#tss <- tbl.geneLoc$tss
#genomeName <- spec$genomeName
targetGene <- short.spec$targetGene
spec <- basic.build.spec
spec$targetGene <- targetGene
#spec$tss=tss
#spec$regions <- determineRegulatoryRegions(targetGene)
spec$geneSymbol <- targetGene
goodEnoughCor <- checkCor(targetGene, mtx)
if(!goodEnoughCor)
results <- list(model=data.frame(), regulatoryRegions=data.frame())
else{
builder <- NoDnaModelBuilder(spec$genomeName, targetGene, spec, quiet=FALSE)
timing.info <- system.time(results <- build(builder))
message(sprintf("time for %s: %f", targetGene, timing.info[["elapsed"]]))
}
# to not save the footprints, go into results and make the footprints an empty dataframe.
save(results, file=filename)
return(results)
} # buildModel
#----------------------------------------------------------------------------------------------------
# precheck to see if model already exists (helpful if genome-scale-model is interupted)
# this should be used if you need to skip genes for which the models are already in the OUTPUTDIR
checkFile <- function(OUTPUTDIR)
{
all.models <- list.files(OUTPUTDIR, full=T, pattern = ".RData")
good.models <- all.models[sapply(all.models, file.size) > 200] # I chose 200 because empty dataframes have a size of 160
good.names <- strsplit(good.models, "/")
just.names <- lapply(good.names, "[", 4)
just.names <- gsub(".RData", "", just.names)
return(just.names)
}
#----------------------------------------------------------------------------------------------------
# small check to see if there are enough TFs that correlate to run a model
checkCor <- function(targetGene, mtx)
{
target.gene.expression <- mtx[targetGene,]
other.tfs <- intersect(rownames(mtx), allKnownTFs())
# is our target.gene itself a TF? If so, eliminate it
target.gene.among.tfs <- match(targetGene, other.tfs)
if(!is.na(target.gene.among.tfs))
other.tfs <- other.tfs[-target.gene.among.tfs]
mtx.test <- mtx[other.tfs,]
correlations <- abs(apply(mtx.test, 1, function(row) cor(target.gene.expression, row)))
range.of.correlations <- fivenum(correlations)
print(fivenum(correlations))
third.quartile <- range.of.correlations[4]
max <- range.of.correlations[5]
return(third.quartile > 0.15)
}
#----------------------------------------------------------------------------------------------------
do.run <- function(genes, parallel=TRUE)
{
short.specs <- lapply(genes, function(gene)
list(targetGene=gene,
regionsMode="enhancers",
runParallel=parallel))
names(short.specs) <- as.character(genes)
if(parallel){
bp.params <- MulticoreParam(stop.on.error=FALSE, log=TRUE, logdir=LOGDIR, threshold="INFO", workers=WORKERS)
printf("running bplapply on %d genes, %d workers", length(genes), WORKERS)
results <- bptry({bplapply(short.specs, buildModel, BPPARAM=bp.params)})
} else {
results <- lapply(short.specs, buildModel)
names(results) <- genes
}
invisible(results)
} # do.run
#----------------------------------------------------------------------------------------------------
demoGenes <- function()
{
genes <- c("GSTM1", "RBMXP2", "INKA2", "APOE")
return(genes)
} # demoGenes
#----------------------------------------------------------------------------------------------------
# GSTM1: a "normal" gene, protein-coding, variable expression, with geneHancer regions
# RBMXP2: as above, but a pseuodgene with no footprints
# INKA2: as above, but with no geneHancer regions
# APOE: well-behaved should produce a strong model
test_fourGenes <- function(useParallel=FALSE)
{
genes <- demoGenes()
x <- do.run(genes, parallel=useParallel)
checkEquals(length(x), 4)
checkEquals(length(x), 4)
model.sizes <- lapply(x, function(element) nrow(element$model))
print(model.sizes)
checkTrue(model.sizes$GSTM1 > 20)
checkTrue(is.null(model.sizes$RBMKP2))
checkTrue(model.sizes$INKA2 > 100)
checkTrue(model.sizes$APOE > 100)
invisible(x)
} # test_fourGenes
#----------------------------------------------------------------------------------------------------
if(!interactive()){
stopifnot(startGeneIndex < length(goi))
stopifnot(endGeneIndex <= length(goi))
stopifnot(startGeneIndex < endGeneIndex)
completed.models <- checkFile(OUTPUTDIR)
goi.thisRun <- goi[startGeneIndex:endGeneIndex]
goi.thisRun <- setdiff(goi.thisRun, completed.models)
printf("running with genes %d - %d", startGeneIndex, endGeneIndex)
x <- do.run(goi.thisRun, parallel=TRUE)
}
if(interactive()){ # for development and debugging
startGeneIndex <- 61
endGeneIndex <- 70
completed.models <- checkFile(OUTPUTDIR)
goi.thisRun <- goi[startGeneIndex:endGeneIndex]
goi.thisRun <- setdiff(goi.thisRun, completed.models)
printf("running with genes %d - %d", startGeneIndex, endGeneIndex)
x <- do.run(goi.thisRun, parallel=FALSE)
}
#------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.