Nothing
# == title
# Class to store and retrieve GREAT results
#
# == details
# After submitting request to GREAT server, the generated results will be
# available on GREAT server for some time. The ``GreatJob-class`` is defined
# to store parameters that user has set and result tables what were retrieved
# from GREAT server.
#
# == Constructor
# Users don't need to construct by hand, `submitGreatJob` is used to generate a ``GreatJob-class`` instance.
#
# == Workflow
# After submitting request to GREAT server, users can perform following steps:
#
# - call `getEnrichmentTables` to get enrichment tables for selected ontologies catalogues.
# - call `plotRegionGeneAssociationGraphs` to get associations between regions and genes
# as well as making plots.
#
# == author
# Zuguang gu <z.gu@dkfz.de>
#
GreatJob = setClass("GreatJob",
slots = list(
job_env = "environment",
parameters = "list", # parameters that are sent to GREAT
enrichment_tables = "environment",
association_tables = "environment")
)
# == title
# Constructor method for GreatJob class
#
# == param
# -... arguments.
#
# == details
# There is no public constructor method for the `GreatJob-class`.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
GreatJob = function(...) {
new("GreatJob", ...)
}
# == title
# Send requests to GREAT web server
#
# == param
# -gr A `GenomicRanges::GRanges` object or a data frame which contains at least three columns (chr, start and end). Regions for test.
# -bg A `GenomicRanges::GRanges` object or a data frame. Background regions if needed. Note ``gr`` should be exactly subset of ``bg`` for all columns in ``gr``. Check http://great.stanford.edu/help/display/GREAT/File+Formats#FileFormats-Whatshouldmybackgroundregionsfilecontain\%3F for more explanation.
# -species Species. "hg38", "hg19", "mm10", "mm9" are supported in GREAT version 4.x.x, "hg19", "mm10", "mm9", "danRer7" are supported in GREAT version 3.x.x and "hg19", "hg18", "mm9", "danRer7" are supported in GREAT version 2.x.x.
# -includeCuratedRegDoms Whether to include curated regulatory domains.
# -rule How to associate genomic regions to genes. See 'details' section.
# -adv_upstream Unit: kb, only used when rule is ``basalPlusExt``
# -adv_downstream Unit: kb, only used when rule is ``basalPlusExt``
# -adv_span Unit: kb, only used when rule is ``basalPlusExt``
# -adv_twoDistance Unit: kb, only used when rule is ``twoClosest``
# -adv_oneDistance Unit: kb, only used when rule is ``oneClosest``
# -request_interval Time interval for two requests. Default is 300 seconds.
# -max_tries Maximum times trying to connect to GREAT web server.
# -version version of GREAT. The value should be "4.0.4", "3.0.0", "2.0.2". Shorten version numbers
# can also be used, such as using "4" or "4.0" is same as "4.0.4".
# -base_url the url of ``cgi-bin`` path, only used when explicitly specified.
# -help Whether to print help messages.
#
# == details
# Note: [On Aug 19 2019 GREAT released version 4](http://great.stanford.edu/help/display/GREAT/Version+History where it supports ``hg38`` genome and removes some ontologies such pathways. `submitGreatJob` still
# takes ``hg19`` as default. ``hg38`` can be specified by the ``species = "hg38"`` argument.
# To use the older versions such as 3.0.0, specify as ``submitGreatJob(..., version = "3.0.0")``.
#
# Note it is not the standard GREAT API. This function directly send data to GREAT web server
# by HTTP POST.
#
# Following text is copied from GREAT web site ( http://great.stanford.edu/public/html/ )
#
# Explanation of ``rule`` and settings with names started with 'adv_' (advanced settings):
#
# -basalPlusExt Mode 'Basal plus extension'. Gene regulatory domain definition:
# Each gene is assigned a basal regulatory domain of a minimum distance upstream
# and downstream of the TSS (regardless of other nearby genes, controlled by ``adv_upstream`` and
# ``adv_downstream`` argument). The gene regulatory domain is extended in both directions
# to the nearest gene's basal domain but no more than the maximum extension in one direction
# (controlled by ``adv_span``).
# -twoClosest Mode 'Two nearest genes'. Gene regulatory domain definition:
# Each gene is assigned a regulatory domain that extends in both directions to the nearest
# gene's TSS (controlled by ``adv_twoDistance``) but no more than the maximum extension in one direction.
# -oneClosest Mode 'Single nearest gene'. Gene regulatory domain definition:
# Each gene is assigned a regulatory domain that extends in both directions to the midpoint
# between the gene's TSS and the nearest gene's TSS (controlled by ``adv_oneDistance``) but no more than the maximum
# extension in one direction.
#
# == value
# A `GreatJob-class` class object which can be used to get results from GREAT server.
#
# == seealso
# `GreatJob-class`
#
# == author
# Zuguang gu <z.gu@dkfz.de>
#
# == example
# set.seed(123)
# bed = circlize::generateRandomBed(nr = 1000, nc = 0)
# job = submitGreatJob(bed, version = "3.0.0")
# job
#
# # more parameters can be set for the job
# if(FALSE) { # suppress running it when building the package
# # current GREAT version is 4.0.1
# job = submitGreatJob(bed, species = "mm9")
# job = submitGreatJob(bed, bg, species = "mm9", bgChoise = "data")
# job = submitGreatJob(bed, adv_upstream = 10, adv_downstream = 2, adv_span = 2000)
# job = submitGreatJob(bed, rule = "twoClosest", adv_twoDistance = 2000)
# job = submitGreatJob(bed, rule = "oneClosest", adv_oneDistance = 2000)
# }
#
submitGreatJob = function(gr, bg = NULL,
species = "hg19",
includeCuratedRegDoms = TRUE,
rule = c("basalPlusExt", "twoClosest", "oneClosest"),
adv_upstream = 5.0,
adv_downstream = 1.0,
adv_span = 1000.0,
adv_twoDistance = 1000.0,
adv_oneDistance = 1000.0,
request_interval = 60,
max_tries = 10,
version = DEFAULT_VERSION,
base_url = "http://great.stanford.edu/public/cgi-bin",
help = TRUE
) {
version = version[1]
if(!version %in% names(SPECIES)) {
stop(paste0("'version' should be in ", paste(names(SPECIES), collapse = ", "), "."))
}
species = species[1]
if(!species %in% SPECIES[[version]]) {
stop(paste0("GREAT with version '", version, "' only supports following species:\n ", paste(SPECIES[[version]], collapse = ", ")))
}
rule = match.arg(rule)[1]
if(help) {
cl = as.list(match.call())
if(version == "4.0.4" && (!"species" %in% names(cl))) {
message_wrap('Note: On Aug 19 2019 GREAT released version 4 which supports hg38 genome and removes some ontologies such
pathways. submitGreatJob() still takes hg19 as default. hg38 can be specified by argument `species = "hg38"`. To use
the older versions such as 3.0.0, specify as submitGreatJob(..., version = "3"). Set argument `help` to `FALSE`
to turn off this message.')
}
}
includeCuratedRegDoms = as.numeric(includeCuratedRegDoms[1])
op = qq.options(READ.ONLY = FALSE)
on.exit(qq.options(op))
qq.options(code.pattern = "@\\{CODE\\}")
if(inherits(gr, "data.frame")) {
gr = GRanges(seqnames = gr[[1]],
ranges = IRanges(start = gr[[2]],
end = gr[[3]]))
} else if(inherits(gr, "character")) {
gr = read.table(gr, stringsAsFactors = FALSE)
gr = GRanges(seqnames = gr[[1]],
ranges = IRanges(start = gr[[2]],
end = gr[[3]]))
}
mcols(gr) = NULL
# gr = reduce(sort(gr))
bgChoice = ifelse(is.null(bg), "wholeGenome", "file")
if(!is.null(bg)) {
if(inherits(bg, "data.frame")) {
bg = GRanges(seqnames = bg[[1]],
ranges = IRanges(start = bg[[2]],
end = bg[[3]]))
} else if(inherits(bg, "character")) {
bg = read.table(bg, stringsAsFactors = FALSE)
bg = GRanges(seqnames = bg[[1]],
ranges = IRanges(start = bg[[2]],
end = bg[[3]]))
}
mcols(bg) = NULL
# # check whether all grs are subsets of bg
# ov = findOverlaps(gr, bg)
# if(length(ov) == 0) {
# stop("No overlapping between `gr` and `bg`.")
# }
# mtch = as.matrix(ov)
# if(length(setdiff(gr, bg)) != 0) {
# warning("For each interval in `gr`, there should be an interval in `bg` which is exactly the same.\nThe different intervals in `gr` will be removed.")
# }
# # gr should be exactly subset of bg
# gr = sort(pintersect(gr[mtch[, 1]], bg[mtch[, 2]]))
# mcols(gr) = NULL
# bg = sort(c(gr, setdiff(bg, gr)))
}
# check seqnames should have 'chr' prefix
if(!all(grepl("^chr", seqnames(gr)))) {
stop("Chromosome names (in `gr`) should have 'chr' prefix.\n")
}
if(!is.null(bg)) {
if(!all(grepl("^chr", seqnames(bg)))) {
stop("Chromosome names (in `bg`) should have 'chr' prefix.\n")
}
}
# transform GRanges to data frame
bed = as.data.frame(gr)
bed_bg = NULL
if(!is.null(bg)) {
bed_bg = as.data.frame(bg)[, 1:3]
}
# check request frequency
time_interval = as.numeric(Sys.time()) - as.numeric(rGREAT_env$LAST_REQUEST_TIME)
if(time_interval < request_interval) {
message(qq("Don't make too frequent requests. The time break is @{request_interval}s.\nPlease wait for @{round(request_interval - time_interval)}s for the next request.\nThe time break can be set by `request_interval` argument.\n"))
sleep(request_interval - time_interval)
}
#message("sending request to GREAT web server...")
if(missing(base_url)) {
BASE_URL = BASE_URL_LIST[version]
} else {
BASE_URL = base_url
}
# save file into temporary files
bed = cbind(bed[, 1:3], name = paste(bed[[1]], ":", bed[[2]], "-", bed[[3]], sep = ""))
f_bed = tempfile(fileext = ".gz")
con = gzcon(file(f_bed, "wb"))
write.table(bed, con, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
close(con)
f_bed_bg = ""
if(!is.null(bed_bg)) {
bed_bg = cbind(bed_bg[, 1:3], name = paste(bed_bg[[1]], ":", bed_bg[[2]], "-", bed_bg[[3]], sep = ""))
f_bed_bg = tempfile(fileext = ".gz")
con = gzcon(file(f_bed_bg, "wb"))
write.table(bed_bg, con, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
close(con)
}
i_try = 0
while(1) {
if(bgChoice == "wholeGenome") {
error = try(response <- postForm(qq("@{BASE_URL}/greatWeb.php"),
"species" = species,
"rule" = rule,
"span" = adv_span,
"upstream" = adv_upstream,
"downstream" = adv_downstream,
"twoDistance" = adv_twoDistance,
"oneDistance" = adv_oneDistance,
"includeCuratedRegDoms" = includeCuratedRegDoms,
"fgChoice" = "file",
"fgFile" = fileUpload(f_bed),
"bgChoice" = bgChoice,
"adv_upstream" = adv_upstream,
"adv_downstream" = adv_downstream,
"adv_span" = adv_span,
"adv_twoDistance" = adv_twoDistance,
"adv_oneDistance" = adv_oneDistance
))
} else {
error = try(response <- postForm(qq("@{BASE_URL}/greatWeb.php"),
"species" = species,
"rule" = rule,
"span" = adv_span,
"upstream" = adv_upstream,
"downstream" = adv_downstream,
"twoDistance" = adv_twoDistance,
"oneDistance" = adv_oneDistance,
"includeCuratedRegDoms" = includeCuratedRegDoms,
"fgChoice" = "file",
"fgFile" = fileUpload(f_bed),
"bgChoice" = "file",
"bgFile" = fileUpload(f_bed_bg),
"adv_upstream" = adv_upstream,
"adv_downstream" = adv_downstream,
"adv_span" = adv_span,
"adv_twoDistance" = adv_twoDistance,
"adv_oneDistance" = adv_oneDistance
))
}
i_try = i_try + 1
if(class(error) != "try-error") {
break
} else {
message(error, appendLF = FALSE)
if(i_try > max_tries) {
stop(qq("max try: @{max_tries} reached. Stop with error.\n"))
} else {
message(qq("failed with the request, try after @{ceiling(request_interval/60)} min (@{i_try}@{ifelse(i_try %% 10 == 1, 'st', ifelse(i_try %% 10 == 2, 'nd', 'th'))} try)"))
sleep(request_interval)
}
}
}
file.remove(f_bed)
if(!is.null(bg)) {
file.remove(f_bed_bg)
}
rGREAT_env$LAST_REQUEST_TIME = Sys.time()
# parsing error
if(any(grepl("encountered a user error", response))) {
if(grepl("^.*<blockquote><b>Error details:<\\/b>(.*?)<\\/blockquote>.*$", response)) {
msg = gsub("^.*<blockquote><b>Error details:<\\/b>(.*?)<\\/blockquote>.*$", "\\1", response)
} else {
msg = gsub("^.*<blockquote>(.*?)<\\/blockquote>.*$", "\\1", response)
}
msg = gsub("<.*?>", "", msg)
msg = gsub(" +", " ", msg)
msg = strwrap(msg)
msg = paste(msg, collapse = "\n")
if(grepl("The foreground set is not a subset of the background set", msg)) {
msg = paste0(msg, "\nVisit http://great.stanford.edu/help/display/GREAT/File+Formats#FileFormats-Whatshouldmybackgroundregionsfilecontain%3F for more explanation.\n")
}
stop(paste0("GREAT encountered a user error (message from GREAT web server):\n", msg))
}
# parsing warning
if(any(grepl("<strong>Warning:<\\/strong>", response))) {
msg = gsub("^.*<strong>Warning:<\\/strong>(.*?)<\\/p>.*$", "\\1", response)
msg = gsub("<.*?>", "", msg)
msg = gsub(" +", " ", msg)
msg = strwrap(msg)
msg = paste(msg, collapse = "\n")
warning(paste0("GREAT gives a warning:\n", msg))
}
jobid = gsub("^.*var _sessionName = \"(.*?)\";.*$", "\\1", response)
jobid = as.vector(jobid)
job = GreatJob()
job@job_env = new.env()
job@enrichment_tables = new.env()
job@association_tables = new.env()
if(version == "default") version = DEFAULT_VERSION
job@parameters = list(
"species" = species,
"rule" = rule,
"span" = adv_span,
"upstream" = adv_upstream,
"downstream" = adv_downstream,
"twoDistance" = adv_twoDistance,
"oneDistance" = adv_oneDistance,
"includeCuratedRegDoms" = includeCuratedRegDoms,
"fgChoice" = "data",
"bgChoice" = bgChoice,
"adv_upstream" = adv_upstream,
"adv_downstream" = adv_downstream,
"adv_span" = adv_span,
"adv_twoDistance" = adv_twoDistance,
"adv_oneDistance" = adv_oneDistance,
"n_region" = nrow(bed),
"version" = version,
"f_bed" = f_bed,
"f_bed_bg" = f_bed_bg)
job@parameters$n_bg = nrow(bed_bg)
job@job_env$id = jobid
job@job_env$submit_time = Sys.time()
job@job_env$gr = bed
job@job_env$bg = bed_bg
td = tempdir()
dir.create(td, showWarnings = FALSE)
job@job_env$tempdir = td
### parse the response html code and extract the ontogolies
match = gregexpr("<div class=\"gc_header gc_column\">.*?<\\/div>", response)[[1]]
match.length = attr(match, "match.length")
category = sapply(seq_along(match), function(i) {
substr(response, match[i], match[i]+match.length[i]-1)
})
category = gsub("<.*?>", "", category)
match = gregexpr("<div class=\"gc_info_item gc_column\">.*?<\\/div>", response)[[1]]
match.length = attr(match, "match.length")
term = sapply(seq_along(match), function(i) {
substr(response, match[i], match[i]+match.length[i]-1)
})
term_value = gsub("^.*value=\"(\\w+?)\".*$", "\\1", term)
term_value = gsub(" ", "", term_value)
term_value = gsub("<.*?>", "", term_value)
term_value = gsub("^\\s+|\\s+$", "", term_value)
term_name = gsub("<.*?>", "", term)
term_name = gsub(" ", "", term_name)
term_name = gsub("^\\s+|\\s+$", "", term_name)
ONTOLOGY_KEYS = term_value
names(ONTOLOGY_KEYS) = term_name
ONTOLOGY_KEYS = ONTOLOGY_KEYS[ONTOLOGY_KEYS != ""]
CATEGORY = lapply(seq_along(category), function(i) {
index = seq_along(term_name)
tn = term_name[index %% length(category) == i %% length(category)]
tn[tn != ""]
})
names(CATEGORY) = category
job@job_env$CATEGORY = CATEGORY
job@job_env$ONTOLOGY_KEYS = ONTOLOGY_KEYS
return(job)
}
## reference class for GreatJob
setMethod(f = "show",
signature = "GreatJob",
definition = function(object) {
cat("Submit time:",
format(object@job_env$submit_time, "%Y-%m-%d %H:%M:%S"), "\n")
cat("Version:", param(object, "version"), "\n")
#cat("Session ID:", id(object), "\n")
cat("Species:", param(object, "species"), "\n")
cat("Inputs:", param(object, "n_region"), "regions\n")
if(param(object, "bgChoice") == "wholeGenome") {
cat("Background:", param(object, "bgChoice"), "\n")
} else {
cat("Background: user-defined,", param(object, "n_bg"), "regions\n")
}
if(param(object, "rule") == "basalPlusExt") {
cat("Model:", "Basal plus extension", "\n")
cat(" Proximal:", param(object, "adv_upstream"), "kb upstream,",
param(object, "adv_downstream"), "kb downstream,\n plus Distal: up to",
param(object, "adv_span"), "kb\n")
} else if(param(object, "rule") == "twoClosest") {
cat("Model:", "Two nearest genes", "\n")
cat(" within", param(object, "adv_twoDistance"), "kb\n")
} else if(param(object, "rule") == "oneClosest") {
cat("Model:", "Single nearest gene", "\n")
cat(" within", param(object, "adv_oneDistance"), "kb\n")
}
if(param(object, "includeCuratedRegDoms")) {
cat("Include curated regulatory domains\n")
}
cat("\n")
cat("Enrichment tables for following ontologies have been downloaded:\n")
if(length(ls(envir = object@enrichment_tables)) == 0) {
cat(" None\n")
} else {
for(nm in ls( envir = object@enrichment_tables)) {
cat(" ", nm, "\n", sep = "")
}
}
cat("\n")
})
setGeneric(name = "id",
def = function(job) {
standardGeneric("id")
})
setMethod(f = "id",
signature = "GreatJob",
definition = function(job) {
job@job_env$id
})
setGeneric(name = "param",
def = function(job, name) {
standardGeneric("param")
})
setMethod(f = "param",
signature = "GreatJob",
definition = function(job, name) {
job@parameters[[name]]
})
# == title
# Get enrichment tables from GREAT web server
#
# == param
# -job a `GreatJob-class` instance
# -ontology ontology names. Valid values are in `availableOntologies`. ``ontology`` is prior to
# ``category`` argument.
# -category Pre-defined ontology categories. One category can contain more than one ontologies. Valid values are in
# `availableCategories`
# -request_interval time interval for two requests. Default is 300 seconds.
# -max_tries maximum tries
# -download_by Internally used.
# -verbose Whether print messages.
#
# == details
# The table contains statistics for the each term in each ontology catalogue.
#
# Please note there is no FDR column in original tables. Users should
# calculate by themselves by functions such as `stats::p.adjust`
#
# == value
# The returned value is a list of data frames in which each one corresponds to
# result for a single ontology. The structure of the data frames are same as
# the tables available on GREAT website.
#
# == see also
# `availableOntologies`, `availableCategories`
#
# == author
# Zuguang gu <z.gu@dkfz.de>
#
# == example
# # note the `job` was generated from GREAT 3.0.0
# job = readRDS(system.file("extdata", "job.rds", package = "rGREAT"))
# tb = getEnrichmentTables(job)
# names(tb)
# head(tb[[1]])
# job
#
# tb = getEnrichmentTables(job, ontology = "GO Molecular Function")
# tb = getEnrichmentTables(job, category = "GO")
#
setMethod(f = "getEnrichmentTables",
signature = "GreatJob",
definition = function(job, ontology = NULL, category = "GO",
request_interval = 10, max_tries = 100, download_by = c("json", "tsv"),
verbose = TRUE) {
jobid = id(job)
species = param(job, "species")
version = param(job, "version")
BASE_URL = BASE_URL_LIST[version]
if(!file.exists(job@job_env$tempdir)) {
td = tempdir()
dir.create(td, showWarnings = FALSE)
job@job_env$tempdir = td
}
if(is.null(ontology)) {
if(is.null(category)) {
stop("`ontology` and `category` can not be both NULL.\n")
}
if(length(setdiff(category, availableCategories(job)))) {
stop("Only categories in `availableCategories()` are allowed.\n")
}
ontology = availableOntologies(job, category = category)
}
ontology = unique(ontology)
if(length(setdiff(ontology, availableOntologies(job)))) {
stop("Only ontologies in `availableOntologies()` are allowed.\n")
}
ONTOLOGY_KEYS = job@job_env$ONTOLOGY_KEYS
download_by = match.arg(download_by)[1]
if(download_by == "json" && verbose) {
message_wrap("The default enrichment tables contain no associated genes for the input regions.",
"You can set `download_by = 'tsv'` to download the complete table,",
"but note only the top 500 regions can be retreived. See the following link:\n\n",
"https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport\n")
}
res = lapply(ontology, function(onto) {
if(!is.null(job@enrichment_tables[[onto]]) && download_by == "json") {
res = job@enrichment_tables[[onto]]
return(res)
} else {
if(download_by == "json") {
res = GREAT.read.json(job, qq("@{BASE_URL}/readJsFromFile.php?path=/scratch/great/tmp/results/@{jobid}.d/@{ONTOLOGY_KEYS[onto]}.js"), onto,
request_interval = request_interval, max_tries = max_tries)
job@enrichment_tables[[onto]] = res
} else {
res = download_enrichment_table(job, ONTOLOGY_KEYS[onto], request_interval = request_interval, max_tries = max_tries)
}
return(res)
}
})
names(res) = ontology
return(res)
})
message_wrap = function (...) {
x = paste(..., collapse = " ")
x = paste(strwrap(x), collapse = "\n")
message(x)
}
download_enrichment_table = function(job, onto, request_interval = 10, max_tries = 100) {
jobid = id(job)
species = param(job, "species")
bgChoice = param(job, "bgChoice")
version = param(job, "version")
BASE_URL = BASE_URL_LIST[version]
i_try = 0
while(1) {
error = try(
response <- postForm(qq("@{BASE_URL}/downloadAllTSV.php"),
outputDir = qq("/scratch/great/tmp/results/@{jobid}.d/"),
outputPath = qq("/tmp/results/@{jobid}.d/"),
ontoName = onto,
species = species,
ontoList = qq("@{onto}@1-Inf"),
binom = ifelse(bgChoice == "wholeGenome", "true", "false")
)
)
i_try = i_try + 1
if(class(error) != "try-error") {
break
} else {
if(i_try > max_tries) {
stop(qq("max try: @{max_tries} reached. Stop with an error.\n"))
} else {
message("failed to download, try after 30s")
sleep(request_interval)
}
}
}
response = strsplit(response, "\n")[[1]]
response = response[-(1:3)]
response[[1]] = gsub("^#", "", response[1])
response = response[!grepl("^#", response)]
response = paste(response, collapse = "\n")
error = try(tb <- read.table(textConnection(response), sep = "\t", quote = "", header = TRUE, stringsAsFactors = FALSE))
if(inherits(error, "try-error")) {
stop("downloading enrichment table failed.")
}
return(tb)
}
# == title
# All available ontology names
#
# == param
# -job a `GreatJob-class` instance
# -category one or multiple categories. All available categories can be get by `availableCategories`
#
# == details
# The values of the supported ontologies sometime change. You should run the function to get the real-time
# values. The meaning of ontology returned is quite self-explained by the name.
#
# == value
# The returned values is a vector of ontologies.
#
# == author
# Zuguang gu <z.gu@dkfz.de>
#
# == example
# # note the `job` was generated from GREAT 3.0.0
# job = readRDS(system.file("extdata", "job.rds", package = "rGREAT"))
# availableOntologies(job)
# availableOntologies(job, category = "Pathway Data")
#
setMethod(f = "availableOntologies",
signature = "GreatJob",
definition = function(job, category = NULL) {
species = param(job, "species")
CATEGORY = job@job_env$CATEGORY
if(is.null(category)) {
onto = unlist(CATEGORY)
names(onto) = NULL
return(onto)
} else {
if(length(setdiff(category, availableCategories(job))) > 0) {
stop("Value of `category` is invalid. Please use availableCategories(job) to find supported categories.\n")
}
onto = unlist(CATEGORY[category])
names(onto) = NULL
return(onto)
}
})
# == title
# Available ontology categories
#
# == param
# -job a `GreatJob-class` instance
#
# == details
# The values of the supported categories sometime change. You should run the function to get the real-time
# values. The meaning of categories returned is quite self-explained by the name.
#
# == value
# The returned value is a vector of categories.
#
# == author
# Zuguang gu <z.gu@dkfz.de>
#
# == example
# # note the `job` was generated from GREAT 3.0.0
# job = readRDS(system.file("extdata", "job.rds", package = "rGREAT"))
# availableCategories(job)
#
setMethod(f = "availableCategories",
signature = "GreatJob",
definition = function(job) {
species = param(job, "species")
CATEGORY = job@job_env$CATEGORY
names(CATEGORY)
})
# download from `url` and save to `file`
download = function(url, file, request_interval = 10, max_tries = 100) {
op = qq.options(READ.ONLY = FALSE)
on.exit(qq.options(op))
qq.options(code.pattern = "@\\{CODE\\}")
#message(qq("try to download from @{url}"))
i_try = 0
while(1) {
error = try(download.file(url, destfile = file, quiet = TRUE))
i_try = i_try + 1
if(class(error) != "try-error") {
break
} else {
if(file.exists(file)) file.remove(file)
if(i_try > max_tries) {
stop(qq("max try: @{max_tries} reached. Stop with an error.\n"))
} else {
message("failed to download, try after 30s")
sleep(request_interval)
}
}
}
}
GREAT.read.json = function(job, url, onto, request_interval = 10, max_tries = 100) {
jobid = id(job)
if(!file.exists(job@job_env$tempdir)) {
td = tempdir()
dir.create(td, showWarnings = FALSE)
job@job_env$tempdir = td
}
TEMP_DIR = job@job_env$tempdir
op = qq.options(READ.ONLY = FALSE)
on.exit(qq.options(op))
qq.options(code.pattern = "@\\{CODE\\}")
f1 = qq("@{TEMP_DIR}/@{jobid}_@{onto}.js")
f1 = gsub(" ", "_", f1)
download(url, file = f1, request_interval = request_interval, max_tries = max_tries)
# if something wrong, the returned js has length of 0
if(file.info(f1)$size == 0) {
stop("Retrieved 0 byte from remote server. Probably your job on GREAT server expires\nand you need to submit it again.\n")
}
# just in case downloading json file is interrupted
error = try(json <- fromJSON(file = f1))
if(class(error) == "try-error") {
stop("Downloading seems interrupted. Please re-run your command.\n")
}
file.remove(f1)
if(length(json) == 0) {
return(NULL)
}
lt = vector("list", length(json[[1]]))
for(i in seq_along(json)) {
for(j in seq_along(json[[i]])) {
lt[[j]] = c(lt[[j]], json[[i]][[j]])
}
}
res = as.data.frame(lt, stringsAsFactors = FALSE)
if (param(job, "bgChoice") == "file") {
colnames(res) = c("ID", "name", "Hyper_Total_Regions", "Hyper_Expected", "Hyper_Foreground_Region_Hits",
"Hyper_Fold_Enrichment", "Hyper_Region_Set_Coverage", "Hyper_Term_Region_Coverage",
"Hyper_Foreground_Gene_Hits", "Hyper_Background_Gene_Hits", "Total_Genes_Annotated", "Hyper_Raw_PValue")
res$Hyper_Adjp_BH = p.adjust(res$Hyper_Raw_PValue, method = "BH")
} else {
colnames(res) = c("ID", "name", "Binom_Genome_Fraction", "Binom_Expected", "Binom_Observed_Region_Hits", "Binom_Fold_Enrichment",
"Binom_Region_Set_Coverage", "Binom_Raw_PValue", "Hyper_Total_Genes", "Hyper_Expected",
"Hyper_Observed_Gene_Hits", "Hyper_Fold_Enrichment", "Hyper_Gene_Set_Coverage",
"Hyper_Term_Gene_Coverage", "Hyper_Raw_PValue")
res$Binom_Adjp_BH = p.adjust(res$Binom_Raw_PValue, method = "BH")
res$Hyper_Adjp_BH = p.adjust(res$Hyper_Raw_PValue, method = "BH")
res = res[, c("ID", "name", "Binom_Genome_Fraction", "Binom_Expected", "Binom_Observed_Region_Hits", "Binom_Fold_Enrichment",
"Binom_Region_Set_Coverage", "Binom_Raw_PValue", "Binom_Adjp_BH", "Hyper_Total_Genes", "Hyper_Expected",
"Hyper_Observed_Gene_Hits", "Hyper_Fold_Enrichment", "Hyper_Gene_Set_Coverage",
"Hyper_Term_Gene_Coverage", "Hyper_Raw_PValue", "Hyper_Adjp_BH")]
}
return(res)
}
# transform the original file into the standard data frame
parseRegionGeneAssociationFile = function(f1) {
error = try(data <- read.table(f1, sep = "\t", stringsAsFactors = FALSE))
if(class(error) == "try-error") {
stop("Downloading seems interrupted. Please re-run your command.\n")
}
region = NULL
gene = NULL
distance = NULL
for(i in seq_len(nrow(data))) {
if(data[i, 2] == "NONE") {
region = c(region, data[i, 1])
gene = c(gene, NA)
distance = c(distance, NA)
} else {
r = gregexpr("([a-zA-Z0-9\\-_.]+) \\(([+-]?\\d+)\\)", data[i, 2], perl = TRUE)[[1]]
capture.start = attr(r, "capture.start")
capture.length = attr(r, "capture.length")
k = nrow(capture.start)
region = c(region, rep(data[i, 1], k))
gene = c(gene, substr(rep(data[i, 2], k), capture.start[, 1], capture.start[, 1] + capture.length[, 1] - 1))
distance = c(distance, substr(rep(data[i, 2], k), capture.start[, 2], capture.start[, 2] + capture.length[, 2] - 1))
}
}
da = strsplit(region, "[-:]")
chr = sapply(da, function(x) x[1])
start = as.integer(sapply(da, function(x) x[2]))
end = as.integer(sapply(da, function(x) x[3]))
df = data.frame(chr = chr,
start = start,
end = end,
gene = gene,
distTSS = as.numeric(distance),
stringsAsFactors = FALSE)
rownames(df) = NULL
return(df)
}
# == title
# Plot region-gene association figures
#
# == param
# -job a `GreatJob-class` instance
# -type type of plots, should be in ``1, 2, 3``. See details section for explanation
# -ontology ontology name
# -termID term id which corresponds to the selected ontology
# -request_interval time interval for two requests. Default is 300 seconds.
# -max_tries maximum tries
# -verbose whether show message
# -plot whether make plots
#
# == details
# Generated figures are:
#
# - association between regions and genes
# - distribution of distance to TSS
# - distribution of absolute distance to TSS
#
#
# If ``ontology`` and ``termID`` are set, only regions and genes corresponding to
# selected ontology term will be used. Valid value for ``ontology`` is in
# `availableOntologies` and valid value for ``termID`` is from 'id' column
# in the table which is returned by `getEnrichmentTables`.
#
# == value
# a `GenomicRanges::GRanges` object. Columns in metadata are:
#
# -gene genes that are associated with corresponding regions
# -distTSS distance from the regions to TSS of the associated gene
#
# The returned values corresponds to whole input regions or only regions in specified ontology term,
# depending on user's setting.
#
# If there is no gene associated with the region, corresponding ``gene`` and ``distTSS``
# columns will be ``NA``.
#
# == author
# Zuguang gu <z.gu@dkfz.de>
#
# == example
# # note the `job` was generated from GREAT 3.0.0
# job = readRDS(system.file("extdata", "job.rds", package = "rGREAT"))
#
# res = plotRegionGeneAssociationGraphs(job)
# res
#
# plotRegionGeneAssociationGraphs(job, type = 1)
#
# res = plotRegionGeneAssociationGraphs(job, ontology = "GO Molecular Function",
# termID = "GO:0004984")
# res
#
setMethod(f = "plotRegionGeneAssociationGraphs",
signature = "GreatJob",
definition = function(job, type = 1:3, ontology = NULL,
termID = NULL, request_interval = 10, max_tries = 100, verbose = TRUE,
plot = TRUE) {
if(!file.exists(job@job_env$tempdir)) {
td = tempdir()
dir.create(td, showWarnings = FALSE)
job@job_env$tempdir = td
}
jobid = id(job)
species = param(job, "species")
TEMP_DIR = job@job_env$tempdir
version = param(job, "version")
BASE_URL = BASE_URL_LIST[version]
opqq = qq.options(READ.ONLY = FALSE)
on.exit(qq.options(opqq))
qq.options(code.pattern = "@\\{CODE\\}")
# make plot and return value for a single term
using_term = FALSE
if(!is.null(ontology) && !is.null(termID)) {
using_term = TRUE
ontology = ontology[1]
termID = termID[1]
if(! ontology %in% availableOntologies(job)) {
stop("Value of `ontology` should be in `availableOntologies()`\n")
}
}
if(sum(c(is.null(ontology), is.null(termID))) == 1) {
stop("You should set both of `ontology` and `termID` or neither of them.\n")
}
if(using_term) {
# check whether termID is in ontology if ontology table is already downloaded
if(!is.null(job@enrichment_tables[[ontology]])) {
if(! termID %in% job@enrichment_tables[[ontology]]$ID) {
stop(qq("Cannot find '@{termID}' in enrichment table of '@{ontology}'"))
}
}
}
ONTOLOGY_KEYS = job@job_env$ONTOLOGY_KEYS
if(using_term) {
# prepare file names for local table
f_term = qq("@{jobid}-@{ontology}-@{termID}-@{species}-region.txt")
f_term = gsub('[\\/:*?"<>|]', "_", f_term)
f_term = qq("@{TEMP_DIR}/@{f_term}")
if(!is.null(job@association_tables[[qq("@{ontology}-@{termID}")]])) {
df_term = job@association_tables[[qq("@{ontology}-@{termID}")]]
} else {
if(verbose) qqcat("The webpage for '@{ONTOLOGY_KEYS[ontology]}:@{termID}' is available at:\n @{BASE_URL}/showTermDetails.php?termId=@{termID}&ontoName=@{ONTOLOGY_KEYS[ontology]}&ontoUiName=@{ontology}&sessionName=@{jobid}&species=@{species}&foreName=@{basename(param(job, 'f_bed'))}&backName=@{basename(param(job, 'f_bed_bg'))}&table=region\n")
if (param(job, "bgChoice") != "data") {
url = qq("@{BASE_URL}/downloadAssociations.php?termId=@{termID}&ontoName=@{ONTOLOGY_KEYS[ontology]}&sessionName=@{jobid}&species=@{species}&foreName=@{basename(param(job, 'f_bed'))}&backName=@{basename(param(job, 'f_bed_bg'))}&table=region")
} else {
url = qq("@{BASE_URL}/downloadAssociations.php?termId=@{termID}&ontoName=@{ONTOLOGY_KEYS[ontology]}&sessionName=@{jobid}&species=@{species}&foreName=@{basename(param(job, 'f_bed'))}&backName=@{basename(param(job, 'f_bed_bg'))}&table=region")
}
download(url, file = f_term, request_interval = request_interval, max_tries = max_tries)
check_asso_file(f_term)
df_term = parseRegionGeneAssociationFile(f_term)
df_term$gene[df_term$gene == ""] = NA
job@association_tables[[qq("@{ontology}-@{termID}")]] = df_term
file.remove(f_term)
}
}
f_all = qq("@{TEMP_DIR}/@{jobid}-@{species}-all-region.txt")
# download
if(!is.null(job@association_tables[["all"]])) {
df_all = job@association_tables[["all"]]
} else {
url = qq("@{BASE_URL}/downloadAssociations.php?sessionName=@{jobid}&species=@{species}&foreName=@{basename(param(job, 'f_bed'))}&backName=@{basename(param(job, 'f_bed_bg'))}&table=region")
download(url, file = f_all, request_interval = request_interval, max_tries = max_tries)
check_asso_file(f_all)
df_all = parseRegionGeneAssociationFile(f_all)
df_all$gene[df_all$gene == ""] = NA
job@association_tables[["all"]] = df_all
file.remove(f_all)
}
# some values of gene is NA
if(using_term) {
df_term_NA = df_term[is.na(df_term$gene), , drop = FALSE]
df_term = df_term[!is.na(df_term$gene), , drop = FALSE]
}
df_all_NA = df_all[is.na(df_all$gene), , drop = FALSE]
df_all = df_all[!is.na(df_all$gene), , drop = FALSE]
if(plot) {
op_mfrow = par("mfrow")
on.exit(par(mfrow = op_mfrow))
par(mfrow = c(1, length(intersect(type, 1:3))))
} else {
type = 0
}
# make plots
if(1 %in% type) {
if(using_term) {
tb = table(table(df_term$gene))
vt = numeric(11)
vt[as.numeric(names(tb))] = tb
vt[is.na(vt)] = 0
v = c(vt[1:10], sum(vt[10:length(vt)]))
names(v) = c(as.character(1:10), ">10")
v[is.na(v)] = 0
p = v/sum(v)
pos = barplot(p, col = "black", xlab = "Number of associated regions per gene", ylab = "This term's genes", ylim = c(0, max(p)*1.5), main = qq("Number of associated regions per gene\n@{ontology}\n@{termID}"))
text(pos[, 1], p + 0.01, v, adj = c(0.5, 0), cex = 0.8)
} else {
tb = table(table(paste(df_all$chr, df_all$start, df_all$end, sep = ",")))
v = c(nrow(df_all_NA), tb["1"], tb["2"], sum(tb[as.numeric(names(tb)) > 2]))
names(v) = c("0", "1", "2", "> 3")
v[is.na(v)] = 0
p = v/sum(v)
pos = barplot(p, col = c("red", "grey", "grey", "grey"), xlab = "Number of associated genes per region", ylab = "Genomic regions", ylim = c(0, max(p)*1.5), main = "Number of associated genes per region")
text(pos[, 1], p + 0.01, v, adj = c(0.5, 0), col = c("red", "black", "black", "black"), cex = 0.8)
legend("topright", pch = 15, col = c("grey", "red"), legend = c("Genomic regions associated with one or more genes", "Genomic regions not associated with any genes"), cex = 0.8)
}
}
if(2 %in% type) {
v = cbind(
c("<-500" = sum(df_all$distTSS <= -500000),
"-500 to -50" = sum(df_all$distTSS > -500000 & df_all$distTSS <= -50000),
"-50 to -5" = sum(df_all$distTSS > -50000 & df_all$distTSS <= -5000),
"-5 to 0" = sum(df_all$distTSS > -5000 & df_all$distTSS < 0),
"0 to 5" = sum(df_all$distTSS >= 0 & df_all$distTSS <= 5000),
"5 to 50" = sum(df_all$distTSS > 5000 & df_all$distTSS <= 50000),
"50 to 500" = sum(df_all$distTSS > 50000 & df_all$distTSS <= 500000),
"> 500" = sum(df_all$distTSS > 500000)))
p = v/sum(v)
if(using_term) {
v = cbind(
c("<-500" = sum(df_term$distTSS <= -500000),
"-500 to -50" = sum(df_term$distTSS > -500000 & df_term$distTSS <= -50000),
"-50 to -5" = sum(df_term$distTSS > -50000 & df_term$distTSS <= -5000),
"-5 to 0" = sum(df_term$distTSS > -5000 & df_term$distTSS < 0),
"0 to 5" = sum(df_term$distTSS >= 0 & df_term$distTSS <= 5000),
"5 to 50" = sum(df_term$distTSS > 5000 & df_term$distTSS <= 50000),
"50 to 500" = sum(df_term$distTSS > 50000 & df_term$distTSS <= 500000),
"> 500" = sum(df_term$distTSS > 500000)), v)
p = v/sum(v)
}
p[is.na(p)] = 0
rownames(p) = NULL
if(using_term) {
pos = barplot(t(p), beside = TRUE, col = {if(using_term) c("green", "blue") else "blue"}, xlab = "Distance to TSS (kb)", ylab = "Region-gene associations", ylim = c(0, max(p)*1.5), main = qq("Binned by orientation and distance to TSS\n@{ontology}\n@{termID}"), axes = FALSE)
} else {
pos = barplot(t(p), beside = TRUE, col = {if(using_term) c("green", "blue") else "blue"}, xlab = "Distance to TSS (kb)", ylab = "Region-gene associations", ylim = c(0, max(p)*1.5), main = qq("Binned by orientation and distance to TSS"), axes = FALSE)
}
text(t(pos), p + 0.01, v, adj = c(0.5, 0), cex = 0.8)
axis(side = 2)
op = par("xpd")
par(xpd = NA)
text(colMeans(pos), -0.02,rownames(v), srt = 45, adj = c(1, 0.5))
par(xpd = op)
x1 = (mean(pos[, 4]) + mean(pos[, 5]))/2
x2 = (mean(pos[, 5]) + mean(pos[, 6]))/2
y = max(p)*1.2
lines( c(x1, x1), c(0, y))
arrows(x1, y, x2, y, angle = 15, length = 0.1, code = 2)
text(mean(pos[, 5]), y+0.01, "TSS", adj = c(0.5, 0), cex = 0.8)
if(using_term) {
legend("topright", pch = 15, col = c("green", "blue"), legend = c("This term's region-gene associations", "Set-wide region-gene associations"), cex = 0.8)
}
}
if(3 %in% type) {
v = cbind(
c("0 to 5" = sum(abs(df_all$distTSS) >= 0 & abs(df_all$distTSS) <= 5000),
"5 to 50" = sum(abs(df_all$distTSS) > 5000 & abs(df_all$distTSS) <= 50000),
"50 to 500" = sum(abs(df_all$distTSS) > 50000 & abs(df_all$distTSS) <= 500000),
"> 500" = sum(abs(df_all$distTSS) > 500000)))
p = v/sum(v)
if(using_term) {
v = cbind(
c("0 to 5" = sum(abs(df_term$distTSS) >= 0 & abs(df_term$distTSS) <= 5000),
"5 to 50" = sum(abs(df_term$distTSS) > 5000 & abs(df_term$distTSS) <= 50000),
"50 to 500" = sum(abs(df_term$distTSS) > 50000 & abs(df_term$distTSS) <= 500000),
"> 500" = sum(abs(df_term$distTSS) > 500000)), v)
p = v/sum(v)
}
p[is.na(p)] = 0
rownames(p) = NULL
if(using_term) {
pos = barplot(t(p), beside = TRUE, col = {if(using_term) c("green", "blue") else "blue"}, xlab = "Absolute distance to TSS (kb)", ylab = "Region-gene associations", ylim = c(0, max(p)*1.5), main = qq("Binned by absolute distance to TSS\n@{ontology}\n@{termID}"), axes = FALSE)
} else {
pos = barplot(t(p), beside = TRUE, col = {if(using_term) c("green", "blue") else "blue"}, xlab = "Absolute distance to TSS (kb)", ylab = "Region-gene associations", ylim = c(0, max(p)*1.5), main = qq("Binned by absolute distance to TSS"), axes = FALSE)
}
text(t(pos), p + 0.01, v, adj = c(0.5, 0), cex = 0.8)
axis(side = 2)
op = par("xpd")
par(xpd = NA)
text(colMeans(pos), -0.02, rownames(v), srt = 45, adj = c(1, 0.5))
par(xpd = op)
x1 = 0.9
y = max(p)*1.2
arrows(x1, 0, x1, y, angle = 15, length = 0.1, code = 2)
text(x1, y+0.01, "TSS", adj = c(0, 0), cex = 0.8)
if(using_term) {
legend("topright", pch = 15, col = c("green", "blue"), legend = c("This term's region-gene associations", "Set-wide region-gene associations"))
}
}
if(using_term) {
df = rbind(df_term, df_term_NA)
} else {
df = rbind(df_all, df_all_NA)
}
gr = GRanges(seqnames = factor(df[[1]], levels = sort_chr(unique(df[[1]]))),
ranges = IRanges(start = df[[2]],
end = df[[3]]),
gene = df[[4]],
distTSS = df[[5]])
gr = sort(gr)
return(invisible(gr))
})
# just want to print something while waiting
sleep = function(time) {
pb = txtProgressBar(style = 3)
for(i in seq_len(time)/time) {Sys.sleep(1); setTxtProgressBar(pb, i)}
close(pb)
}
sort_chr = function(x) {
y = gsub("^chr(\\d)$", "chr0\\1", x)
y = gsub("^chr(\\d)_", "chr0\\1_", y)
x[order(y)]
}
check_asso_file = function(file) {
# for downloading association tables
if(file.info(file)$size < 1000) {
# if session expires, there is a error page
file_text = readLines(file)
if(any(grepl("GREAT is unable to locate a session", file_text))) {
stop("Your job on GREAT server expires and you need to submit it again.\n")
}
# wrong termID returns a table only contain a newline
if(all(grepl("^\\s*$", file_text))) {
stop("Empty data, probably your 'termID' is invalid. Please check the URL above.\n")
}
}
}
# # == title
# # Add region-gene association to the internal enrichment tables
# #
# # == param
# # -job a `GreatJob-class` instance
# # -ontology ontology name
# # -verbose whether show message
# #
# # == details
# # After successfully run this function, users need to rerun `getEnrichmentTables` to
# # get the enrichment tables that contains the new gene association column.
# setMethod(f = "addRegionGeneAssociation",
# signature = "GreatJob",
# definition = function(job, ontology = names(job@enrichment_tables), verbose = TRUE) {
# if(length(ontology) == 0) {
# stop("You should run `getEnrichmentTables()` first.")
# }
# ontology = intersect(ontology, names(job@enrichment_tables))
# if(length(ontology) == 0) {
# stop("Cannot find some of the ontologies.")
# }
# for(onto in ontology) {
# enrichment_tb = job@enrichment_tables[[onto]]
# all_termID = enrichment_tb$ID
# n_term = length(all_termID)
# genes = character(n_term)
# for(i in seq_along(all_termID)) {
# tid = all_termID[i]
# if(verbose) {
# qqcat("Downloading associated genes for @{onto}, @{tid}, @{i}/@{n_term}...\n")
# }
# gr = plotRegionGeneAssociationGraphs(job, ontology = onto, termID = tid, request_interval = 5, verbose = FALSE, plot = FALSE)
# genes[i] = paste(unique(sort(gr$gene)), collapse = ",")
# }
# enrichment_tb$Asso_Genes = genes
# job@enrichment_tables[[onto]] = enrichment_tb
# }
# if(verbose) {
# cat("Done. Please rerun `getEnrichmentTables()` to get the tables.\n")
# }
# invisible(NULL)
# })
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.