#' Find orthologs.
#'
#' Interface to the g:Orth web toolkit. Organism names are constructed by combining the first letter of
#' the name and family name. For example human - 'hsapiens' and mouse - 'mmusculus'.
#'
#' To alleviate the problem of having many orthologs per gene (most of
#' them uninformative) one can set a threshold for the number of results. The program
#' tries to find the most informative by selecting the most popular ones.
#'
#' @param genelist list of gene IDs to be translated.
#' @param source_organism name of the source organism.
#' @param target_organism name of the target organism.
#' @param region_query interpret query as chromosomal ranges.
#' @param numeric_ns namespace to use for fully numeric IDs.
#' @param mthreshold how many ortholog names per gene to show.
#' @param df logical indicating whether the output will be a data.frame or list.
#' @return The output can be either a list or a data.frame. The list has an entry for every input gene.
#' The data frame is just a two column table with inputs and corrsponding outputs. The input names may be duplicated.
#' @references J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler -- a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
#' @author Raivo Kolde <rkolde@@gmail.com>, Juri Reimand <juri.reimand@@ut.ee>, Tambet Arak <tambet.arak@@gmail.com>
#' @examples
#' gorth(c("Klf4", "Pax5", "Sox2", "Nanog"), source_organism = "mmusculus", target_organism = "hsapiens")
#' @export
gorth <- function(
genelist,
source_organism = "hsapiens",
target_organism = "mmusculus",
region_query = F,
numeric_ns = "",
mthreshold = 3,
df = T
){
my_url <- "http://biit.cs.ut.ee/gprofiler/gorth.cgi"
if(length(genelist) == 0) return(NULL)
raw_query <- RCurl::postForm(my_url,
output = 'mini',
query = paste(genelist, collapse = " "),
organism = source_organism,
target = target_organism,
region_query = ifelse(region_query, "1", "0"),
prefix = numeric_ns
)
conn <- textConnection(raw_query)
tab <- read.table(conn, sep = "\t")
close(conn)
colnames(tab)[2] <- "Source"
res = plyr::dlply(tab, "Source", function(x){
x = x[x$V6 != "N/A",]
if(nrow(x) == 0){
return(NULL)
}
if(length(x$V6) < mthreshold + 1){
return(as.character(x$V6))
}
else{
return(names(rev(sort(table(as.character(x$V6)))))[1:mthreshold])
}
})
if(df){
res <- plyr::ldply(res, function(x) data.frame(Target = x))
}
return(res)
}
#' Annotate gene list functionally.
#'
#' Interface to the g:Profiler web toolkit for finding enrichments in gene lists.
#'
#' @param organism gene list origin.
#' @param query vector of gene IDs or a list of such vectors.
#' @param ordered_query when output gene lists are ranked one can use this option to get GSEA style
#' p-values.
#' @param significant whether all or only statistically significant results should be
#' returned.
#' @param exclude_iea exclude electronic annotations (IEA).
#' @param region_query interpret query as chromosomal ranges.
#' @param max_p_value custom p-value threshold, results with a larger p-value are excluded.
#' @param max_set_size maximum size of functional category, larger categories are excluded.
#' @param correction_method the algorithm used for determining the significance threshold, one of
#' "gSCS", "fdr", "bonferroni".
#' @param hier_filtering hierarchical filtering strength, one of "none", "moderate", "strong".
#' @param domain_size statistical domain size, one of "annotated", "known".
#' @param custom_bg vector of gene names to use as a statistical background.
#' @param numeric_ns namespace to use for fully numeric IDs.
#' @return Data frame with the enrichment analysis results. If the input consisted of several lists the corresponding list is indicated with a variable 'query number'.
#' @references J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler - a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
#' @author Juri Reimand <jyri.reimand@@ut.ee>, Raivo Kolde <rkolde@@gmail.com>, Tambet Arak <tambet.arak@@gmail.com>
#' @examples
#' gprofiler(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")
#' @export
gprofiler <- function(
query,
organism='hsapiens',
ordered_query=F,
significant=T,
exclude_iea=F,
region_query = F,
max_p_value=1.0,
max_set_size=0,
correction_method="analytical",
hier_filtering="none",
domain_size="annotated",
custom_bg="",
numeric_ns = ""
) {
my_url <- "http://biit.cs.ut.ee/gprofiler/gcocoa.cgi"
query_url = ""
# Query
if (is.list(query)) {
for (i in 1:length(query)) {
query_url <- paste(sep="\n", query_url, paste("> query", i), paste(query[[i]], collapse=" "))
}
} else if (is.vector(query)) {
query_url <- paste(query, collapse=" ")
} else {
print("ERROR missing query")
return()
}
# Significance threshold
if (correction_method == "gSCS")
correction_method = "analytical"
if (!correction_method %in% c("analytical", "fdr", "bonferroni"))
stop("Multiple testing correction method not recognized")
# Hierarchical filtering
if (!hier_filtering %in% c("none", "moderate", "strong"))
stop("hier_filtering must be one of \"none\", \"moderate\" or \"strong\"")
if (hier_filtering == "strong") hier_filtering = "compact_ccomp"
else if (hier_filtering == "moderate") hier_filtering = "compact_rgroups"
else hier_filtering = ""
# Domain size
if (!domain_size %in% c("annotated", "known"))
stop("domain_size must be one of \"annotated\" or \"known\"")
# Custom background
if (is.vector(custom_bg))
custom_bg = paste(custom_bg, collapse=" ")
else
stop("custom_bg must be a vector")
# Max. set size
if (max_set_size < 0) max_set_size = 0
raw_query <- RCurl::postForm(my_url,
organism=organism,
query=query_url,
output='mini',
analytical="1",
sort_by_structure="1",
ordered_query = ifelse(ordered_query, "1", "0"),
significant = ifelse(significant, "1", "0"),
no_iea = ifelse(exclude_iea, "1", "0"),
region_query = ifelse(region_query, "1", "0"),
user_thr = as.character(max_p_value),
max_set_size = as.character(max_set_size),
threshold_algo = correction_method,
hierfiltering = hier_filtering,
domain_size_type = domain_size,
custbg_file = "",
custbg = custom_bg,
prefix = numeric_ns
)
split_query <- unlist(strsplit(raw_query, split="\n"))
commented_lines <- grep("^#", split_query)
if (length(commented_lines)>0) {
split_query <- split_query[-commented_lines]
}
empty_lines <- which(split_query == "")
if (length(empty_lines)>0) {
split_query <- split_query[-empty_lines]
}
if(length(split_query) > 0){
conn <- textConnection(paste(split_query, collapse = "\n"))
split_query <- read.table(conn, sep = "\t", quote = "")
close(conn)
}
else{
split_query <- as.data.frame(matrix(NA, 0, 14))
}
rownames(split_query) <- NULL
colnames(split_query) <- c(
"query.number", "significant", "p.value",
"term.size", "query.size", "overlap.size",
"precision", "recall", "term.id",
"domain", "subgraph.number", "term.name",
"relative.depth", "intersection"
)
split_query$term.name <- gsub("^\\s+", "", split_query$term.name)
split_query$significant <- ifelse(split_query$significant == "!", T, F)
if(is.list(query) & !is.null(names(query))){
split_query$query.number <- names(query)[split_query$query.number]
}
return(split_query)
}
#' Convert gene IDs.
#'
#' Interface to the g:Convert web toolkit.
#'
#' @param ids a vector of gene IDs.
#' @param organism gene list origin.
#' @param target target namespace.
#' @param region_query interpret query as chromosomal ranges.
#' @param numeric_ns namespace to use for fully numeric IDs.
#' @param df logical indicating whether the output will be a data.frame or list.
#' @return The output can be either list or data.frame. List has an entry for every input gene. Data frame is just a two column table with inputs and corrsponding outputs. The input names may be duplicated.
#' @references J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler - a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
#' @author Juri Reimand <jyri.reimand@@ut.ee>, Raivo Kolde <rkolde@@gmail.com>, Tambet Arak <tambet.arak@@gmail.com>
#' @examples
#' gconvert(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")
#'
#' # Get all mouse Cell cycle genes
#' gconvert(c("GO:0007049"), organism = "mmusculus")
#'
#' @export
gconvert = function(
ids,
organism = "hsapiens",
target = "ENSG",
region_query = F,
numeric_ns = "",
df = T
) {
url = "http://biit.cs.ut.ee/gprofiler/gconvert.cgi"
field = 4
if (target == "DESC") {
field = 6
target = "ENSG"
}
raw_query = RCurl::postForm(url,
organism=organism,
output='mini',
query=paste(ids, collapse="+"),
target=target,
region_query = ifelse(region_query, "1", "0"),
prefix = numeric_ns
)
split_query = unlist(strsplit(raw_query, split="\n"))
commented_lines = grep("^#", split_query)
if (length(commented_lines)>0) {
split_query = split_query[-commented_lines]
}
empty_lines = which(split_query == "")
if (length(empty_lines)>0) {
split_query = split_query[-empty_lines]
}
split_query = t(as.data.frame(strsplit(split="\t", split_query)))
rownames(split_query) = c()
resulting_mapping = list()
if (nrow(split_query)) {
for (id in unique(toupper(ids))) {
my_map = split_query[split_query[,2]==id,,drop=F][,field]
my_map = my_map[my_map != "N/A"]
resulting_mapping[[id]] = my_map
}
}
if(df){
resulting_mapping <- plyr::ldply(resulting_mapping, function(x) data.frame(Target = x))
}
return(resulting_mapping)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.