Nothing
##########################
#biomaRt source code #
##########################
# #
#Licence: Artistic #
#Author: Steffen Durinck #
##########################
###############
#messageToUser#
###############
messageToUser <- function(message){
if(interactive()){
cat(message)
}
}
##############################################################
#martCheck #
# #
#This function checks if there is a valid Mart object, #
#if a dataset is selected and #
#if the correct BioMart database has been selected (optional)#
##############################################################
martCheck = function(mart, biomart = NULL){
if( missing( mart ) || class( mart ) != 'Mart'){
stop("You must provide a valid Mart object. To create a Mart object use the function: useMart. Check ?useMart for more information.")
}
if(!is.null(biomart)){
martcheck = martBM(mart)
bmok = FALSE
for(k in seq_along(biomart)) {
if(martcheck[1] == biomart[k]) {
bmok = TRUE
}
}
if(!bmok){
stop(paste("This function only works when used with the ",biomart," BioMart.",sep=""))
}
}
if(martDataset(mart)==""){
stop("No dataset selected, please select a dataset first. You can see the available datasets by using the listDatasets function see ?listDatasets for more information. Then you should create the Mart object by using the useMart function. See ?useMart for more information");
}
}
checkWrapperArgs = function(id, type, mart){
if(missing(type))stop("Specify the type of identifier you are using, see ?getGene for details. Valid values for the type argument can be found with the listFilters function.")
if(!type %in% listFilters(mart)[,1])stop(paste("Invalid identifier type:",type," see ?getGene for details. Use the listFilters function to get the valid value for the type argument.", sep=""))
if(missing(id))stop("No identifiers specified. Use the id argument to specify a vector of identifiers for which you want to retrieve the annotation.")
}
bmRequest <- function(request, verbose = FALSE){
if(verbose) writeLines(paste("Attempting web service request:\n",request, sep=""))
result <- tryCatch(httr::GET(request, content_type("text/plain"), timeout(10)),
error = function(c) { "timeout" } )
tryAgain <- any(result == "timeout") || httr::status_code(result) == 500
## try an alternative mirror if ensembl returns 500
if(tryAgain && grepl("ensembl", request)) {
mirrors <- c("www", "uswest", "useast", "asia")
subdomain <- stringr::str_match(request, "://([a-z]{3,5})\\.")[1,2]
mirror_option <- sample(mirrors[!mirrors %in% subdomain], size = 1)
message("Ensembl site unresponsive, trying ", mirror_option, " mirror")
request2 <- str_replace(request, pattern = "://([a-z]{3,5})\\.",
replacement = paste0("://", mirror_option, "."))
result <- httr::GET(request2, content_type("text/plain"))
} else { ## this isn't ensembl run the query again so we can get the error
result <- httr::GET(request, content_type("text/plain"))
}
stop_for_status(result)
result2 <- content(result, encoding = "UTF-8")
if(is.na(result2)) {
result2 <- content(result, encoding = "Latin1")
}
return(result2)
}
#######################################################
#listMarts: #
#list all available BioMart databases by default #
#listMarts will check the central service to see which#
#BioMart databases are present #
#######################################################
listMarts <- function( mart = NULL, host="www.ensembl.org", path="/biomart/martservice",
port, includeHosts = FALSE, archive = FALSE, ensemblRedirect = NULL, verbose = FALSE){
if(missing(port)) {
port <- ifelse(grepl("https", host), yes = 443, no = 80)
}
if(!is.null(ensemblRedirect)) {
warning('The argument "ensemblRedirect" has been deprecated and does not do anything.',
'\nSee ?useEnsembl for details on using mirror sites.')
}
.listMarts(mart = mart, host = host, path = path, port = port, includeHosts = includeHosts,
archive = archive, verbose = verbose, ensemblRedirect = TRUE)
}
.listMarts <- function( mart = NULL, host="www.ensembl.org", path="/biomart/martservice",
port=80, includeHosts = FALSE, archive = FALSE, verbose = FALSE,
ensemblRedirect = NULL){
request = NULL
if(is.null(mart)){
host <- .cleanHostURL(host)
if(archive) {
stop("The archive = TRUE argument is now defunct.\nUse listEnsemblArchives() to find the URL to directly query an Ensembl archive.")
} else {
request <- paste0(host, ":", port, path, "?type=registry&requestid=biomaRt")
}
}
else{
if(class(mart) == 'Mart'){
request = paste0(martHost(mart), "?type=registry&requestid=biomaRt")
}
else{
warning(paste(mart,"object needs to be of class Mart created with the useMart function. If you don't have a Mart object yet, use listMarts without arguments or only specify the host argument",sep=" "))
}
}
if(!ensemblRedirect && grepl(x = request, pattern = "ensembl.org")) {
request <- paste0(request, "&redirect=no")
}
registry = bmRequest(request = request, verbose = verbose)
## check this looks like the MartRegistry XML, otherwise throw an error
if(!grepl(x = registry, pattern = "^\n*<MartRegistry>")) {
if(grepl(x = registry, pattern = "status.ensembl.org")) {
stop("Your query has been redirected to http://status.ensembl.org ",
"indicating this Ensembl service is currently unavailable",
"\nLook at ?useEnsembl for details on how to try a mirror site.",
call. = FALSE)
} else {
stop('Unexpected format to the list of available marts.\n',
'Please check the following URL manually, ',
'and try ?listMarts for advice.\n',
request,
call. = FALSE)
}
}
registry = xmlTreeParse(registry, asText=TRUE)
registry = registry$doc$children[[1]]
marts = list(biomart = NULL, version = NULL, host = NULL, path = NULL, database = NULL)
index = 1
# if(host != "www.biomart.org" || archive){
for(i in seq(len=xmlSize(registry))){
if(xmlName(registry[[i]])=="MartURLLocation"){
if(xmlGetAttr(registry[[i]],"visible") == 1){
if(!is.null(xmlGetAttr(registry[[i]],"name"))) marts$biomart[index] = as.character(xmlGetAttr(registry[[i]],"name"))
if(!is.null(xmlGetAttr(registry[[i]],"database"))) marts$database[index] = as.character(xmlGetAttr(registry[[i]],"database"))
if(!is.null(xmlGetAttr(registry[[i]],"displayName"))) marts$version[index] = as.character(xmlGetAttr(registry[[i]],"displayName"))
if(!is.null(xmlGetAttr(registry[[i]],"host"))) marts$host[index] = as.character(xmlGetAttr(registry[[i]],"host"))
if(!is.null(xmlGetAttr(registry[[i]],"path"))) marts$path[index] = as.character(xmlGetAttr(registry[[i]],"path"))
if(!is.null(xmlGetAttr(registry[[i]],"port"))) marts$port[index] = as.character(xmlGetAttr(registry[[i]],"port"))
if(!is.null(xmlGetAttr(registry[[i]],"serverVirtualSchema"))){
marts$vschema[index] = as.character(xmlGetAttr(registry[[i]],"serverVirtualSchema"))
}
index=index+1
}
}
}
if(includeHosts){
return(marts)
}
else{
ret = data.frame(biomart = as.character(marts$biomart),
version = as.character(marts$version),
stringsAsFactors=FALSE)
return(ret)
}
}
#################################
# # # #
# # Generic BioMart functions # #
# # # #
#################################
useMart <- function(biomart, dataset, host = "https://www.ensembl.org", path = "/biomart/martservice", port,
archive = FALSE, ensemblRedirect = NULL, version, verbose = FALSE) {
if(missing(port)) {
port <- ifelse(grepl("https", host), yes = 443, no = 80)
}
if(!is.null(ensemblRedirect)) {
warning('The argument "ensemblRedirect" has been deprecated and will be removed in the next biomaRt release.')
}
mart <- .useMart(biomart, dataset, host = host, path = path, port = port,
archive = archive, version = version, verbose = verbose, ensemblRedirect = TRUE)
}
.useMart <- function(biomart, dataset, host = "www.ensembl.org", path = "/biomart/martservice", port = 80,
archive = FALSE, ensemblRedirect = NULL, version, verbose = FALSE){
if(missing(biomart) && missing(version))
stop("No biomart databases specified. Specify a biomart database to use using the biomart or version argument")
if(!missing(biomart)){
if(!(is.character(biomart)))
stop("biomart argument is not a string. ",
"The biomart argument should be a single character string")
}
if(biomart == "ensembl" & grepl(x = host, pattern = "ensembl.org")) {
biomart = "ENSEMBL_MART_ENSEMBL"
}
reqHost = host
host <- .cleanHostURL(host)
marts <- listMarts(host=host, path=path, port=port, includeHosts = TRUE,
archive = archive)
mindex = NA
if(!missing(biomart)){
mindex=match(biomart,marts$biomart)
}
if(!missing(version)){
mindex=match(version,marts$version)
}
if(is.na(mindex) || archive){
mindex=match(biomart,marts$database)
}
if(is.na(mindex))
stop("Incorrect BioMart name, use the listMarts function to see which BioMart databases are available")
if(is.na(marts$path[mindex]) || is.na(marts$vschema[mindex]) ||
is.na(marts$host[mindex]) || is.na(marts$port[mindex]) ||
is.na(marts$path[mindex]))
stop("The selected biomart databases is not available due to error in the BioMart central registry, please report so the BioMart registry file can be fixed.")
if(marts$path[mindex]=="") marts$path[mindex]="/biomart/martservice" #temporary to catch bugs in registry
#if(archive) biomart = marts$biomart[mindex]
if(!missing(version)) biomart = marts$biomart[mindex]
biomart = sub(" ","%20",biomart, fixed = TRUE, useBytes = TRUE)
## adding option to force use of specificed host with ensembl
redirect <- ifelse(!ensemblRedirect && grepl(x = host, pattern = "ensembl.org"),
"?redirect=no",
"")
mart <- new("Mart",
biomart = biomart,
vschema = marts$vschema[mindex],
host = paste0(host, ":",
port,
marts$path[mindex],
redirect))
if(length(grep("archive",martHost(mart)) > 0)){
## hack to work around redirection of most recent mirror URL
archives <- listEnsemblArchives()
current_release <- archives[archives$current_release == "*", 'url']
if(grepl(mart@host, pattern = current_release)) {
mart@host <- stringr::str_replace(mart@host, pattern = current_release, "https://www.ensembl.org")
mart@host <- stringr::str_replace(mart@host, pattern = ":80/", ":443/")
}
}
BioMartVersion=bmVersion(mart, verbose=verbose)
if(verbose){
writeLines(paste("BioMartServer running BioMart version:",BioMartVersion,sep=" "))
writeLines(paste("Mart virtual schema:",martVSchema(mart),sep=" "))
if(length(grep(reqHost,martHost(mart))) == 0){
writeLines(paste("Requested host was redirected from ", reqHost, " to " ,martHost(mart),sep=""))
}
writeLines(paste("Mart host:",martHost(mart),sep=" "))
}
if(!missing(dataset)){
mart = useDataset(mart = mart, dataset=dataset, verbose = verbose)
}
return(mart)
}
listDatasets <- function(mart, verbose = FALSE) {
.listDatasets(mart = mart, verbose = verbose, sort = TRUE)
}
.listDatasets <- function(mart, verbose = FALSE, sort = FALSE) {
if(missing(mart) || !is(mart, 'Mart'))
stop("No Mart object given or object not of class 'Mart'")
## we choose a separator based on whether 'redirect=no' is present
## should always be '?' now
sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
request = paste0(martHost(mart), sep, "type=datasets&requestid=biomaRt&mart=", martBM(mart))
bmResult = bmRequest(request = request, verbose = verbose)
con = textConnection(bmResult)
txt = scan(con, sep="\t", blank.lines.skip=TRUE, what="character", quiet=TRUE, quote = "\"")
close(con)
## select visible ("1") table sets
i = intersect(which(txt=="TableSet"), which(txt=="1")-3L)
res = data.frame(dataset = I(txt[i+1L]),
description = I(txt[i+2L]),
version = I(txt[i+4L]))
## sort alphabetically
if(sort)
res <- res[ order(res$dataset), ]
rownames(res) <- NULL
return(res)
}
## Check version of BioMart service
bmVersion <- function(mart, verbose=FALSE){
## we choose a separator based on whether 'redirect=no' is present
sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
request = paste0(martHost(mart), sep, "type=version", "&requestid=biomaRt&mart=", martBM(mart))
BioMartVersion = bmRequest(request = request, verbose = verbose)
bmv = ""
if(BioMartVersion == "\n" | BioMartVersion == ""){
bmv = NA
if(verbose) warning(paste("BioMart version is not available from BioMart server:",request,sep="\n"))
}
else{
con = textConnection(BioMartVersion)
bmVersionParsed = read.table(con, sep="\t", header=FALSE, quote = "", comment.char = "", as.is=TRUE)
close(con)
if(verbose) print(bmVersionParsed)
if(dim(bmVersionParsed)[2] >= 1){
bmv=bmVersionParsed[1,1]
}
}
return(bmv)
}
## Retrieve attributes and filters from web service
bmAttrFilt <- function(type, mart, verbose=FALSE){
## we choose a separator based on whether 'redirect=no' is present
sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
request = paste0(martHost(mart), sep, "type=", type,
"&dataset=", martDataset(mart),
"&requestid=biomaRt&mart=", martBM(mart),
"&virtualSchema=", martVSchema(mart))
attrfilt = bmRequest(request = request, verbose = verbose)
con = textConnection(attrfilt)
attrfiltParsed = read.table(con, sep="\t", header=FALSE, quote = "", comment.char = "", as.is=TRUE)
close(con)
if(type=="attributes"){
if(dim(attrfiltParsed)[2] < 3)
stop("biomaRt error: looks like we're connecting to incompatible version of BioMart suite.")
cnames = seq_len(dim(attrfiltParsed)[2])
cnames=paste(type,cnames,sep="")
cnames[1] = "name"
cnames[2] = "description"
cnames[3] = "fullDescription"
if(dim(attrfiltParsed)[2] < 4){
warning("biomaRt warning: looks like we're connecting to an older ",
"version of BioMart suite.\nSome biomaRt functions might ",
"not work.")
}
else{
cnames[4] = "page"
}
colnames(attrfiltParsed) = cnames
}
if(type=="filters"){
if(dim(attrfiltParsed)[2] < 4)
stop("biomaRt error: looks like we're connecting to incompatible version of BioMart suite.")
cnames = seq(1:dim(attrfiltParsed)[2])
cnames=paste(type,cnames,sep="")
cnames[1] = "name"
cnames[2] = "description"
cnames[3] = "options"
cnames[4] = "fullDescription"
if(dim(attrfiltParsed)[2] < 7){
warning("biomaRt warning: looks like we're connecting to an older version of BioMart suite. Some biomaRt functions might not work.")
}
else{
cnames[5] = "filters"
cnames[6] = "type"
cnames[7] = "operation"
}
colnames(attrfiltParsed) = cnames
}
return(attrfiltParsed)
}
## Utilty function to check dataset specification
## Returns dataset name as a character assuming all checks
## have been passed.
checkDataset <- function(dataset, mart) {
validDatasets <- .listDatasets(mart, sort = FALSE)
## subseting data.frames can produce some weird classes
## which aren't character(), so we coerce it here
dataset <- as.character(dataset)
if(length(dataset) > 1)
stop("Please only specify a single dataset name")
if(is.na(match(dataset, validDatasets$dataset)))
stop(paste("The given dataset: ",dataset,", is not valid. Correct dataset names can be obtained with the listDatasets() function."))
return(dataset)
}
## Select a BioMart dataset
useDataset <- function(dataset, mart, verbose = FALSE){
if(missing(mart) || class(mart)!="Mart")
stop("No valid Mart object given, specify a Mart object with the attribute mart")
if(missing(dataset)) {
stop("No dataset given. Please use the dataset argument to specify which dataset you want to use. Correct dataset names can be obtained with the listDatasets() function.")
} else {
dataset <- checkDataset(dataset = dataset, mart = mart)
}
martDataset(mart) <- dataset
if(verbose) messageToUser("Checking attributes ...")
martAttributes(mart) <- bmAttrFilt("attributes",mart, verbose = verbose)
if(verbose){
messageToUser(" ok\n")
messageToUser("Checking filters ...")
}
martFilters(mart) <- bmAttrFilt("filters",mart, verbose = verbose)
if(verbose) messageToUser(" ok\n")
return( mart )
}
## listAttributes
listAttributes <- function(mart, page, what = c("name","description","page")) {
martCheck(mart)
if(!missing(page) && !page %in% attributePages(mart))
stop("The chosen page: ",page," is not valid, please use the correct page name using the attributePages function")
attrib=NULL
if(!missing(page)){
sel = which(martAttributes(mart)[,"page"] == page)
attrib = martAttributes(mart)[sel,what]
}
else{
attrib = martAttributes(mart)[,what]
}
return(attrib)
}
## attributePages
attributePages <- function(mart){
martCheck(mart)
pages = unique(martAttributes(mart)[,"page"])
return(pages)
}
## listFilters
listFilters <- function(mart, what = c("name", "description")) {
martCheck(mart)
filters = martFilters(mart)
badwhat = !(what %in% colnames(filters))
if(any(badwhat))
stop(sprintf("The function argument 'what' contains %s: %s\nValid are: %s\n",
if(sum(badwhat)>1) "invalid values" else "an invalid value",
paste(what[badwhat], collapse=", "),
paste(colnames(filters), collapse=", ")))
return(filters[, what])
}
## filterOptions
filterOptions <- function(filter, mart){
if(missing(filter)) stop("No filter given. Please specify the filter for which you want to retrieve the possible values.")
if(class(filter)!="character")stop("Filter argument should be of class character")
martCheck(mart)
if(!filter %in% listFilters(mart, what="name"))stop("Filter not valid, check for typo in filter argument.")
sel = which(listFilters(mart, what="name") == filter)
return(listFilters(mart,what="options")[sel])
}
## filterType
filterType <- function(filter, mart){
if(missing(filter)) stop("No filter given. Please specify the filter for which you want to retrieve the filter type")
if(class(filter)!="character")stop("Filter argument should be of class character")
martCheck(mart)
type="unknown"
sel = which(listFilters(mart, what="name") == filter)
if(is.null(sel))stop(paste("Invalid filter",filter, sep=": "))
type = listFilters(mart,what="type")[sel]
return(type)
}
##########################################
#getBM: generic BioMart query function #
##########################################
getBM <- function(attributes, filters = "", values = "", mart, curl = NULL,
checkFilters = TRUE, verbose=FALSE, uniqueRows=TRUE, bmHeader=FALSE, quote="\"",
useCache = TRUE){
## check the arguments are all valid
martCheck(mart)
if(missing( attributes ))
stop("Argument 'attributes' must be specified.")
if(is.list(filters) && !missing( values ))
warning("Argument 'values' should not be used when argument 'filters' is a list and will be ignored.")
if(is.list(filters) && is.null(names(filters)))
stop("Argument 'filters' must be a named list when sent as a list.")
if(!is.list(filters) && all(filters != "") && missing( values ))
stop("Argument 'values' must be specified.")
if(length(filters) > 0 && length(values) == 0)
stop("Values argument contains no data.")
if(is.list(filters)){
values = filters
filters = names(filters)
}
if(class(uniqueRows) != "logical")
stop("Argument 'uniqueRows' must be a logical value, so either TRUE or FALSE")
## determine if we should use the results cache
if(useCache) {
cache <- Sys.getenv(x = "BIOMART_CACHE",
unset = rappdirs::user_cache_dir(appname="biomaRt"))
bfc <- BiocFileCache::BiocFileCache(cache, ask = FALSE)
}
hash <- .createHash(mart, attributes, filters, values, uniqueRows, bmHeader)
if( useCache && .checkCache(bfc, hash) ) {
if(verbose) {
message("Cache found")
}
cache_hits <- bfcquery(bfc, hash, field = "rname")
if(nrow(cache_hits) > 1) {
stop("Multiple cache results found")
} else {
rid <- cache_hits$rid
load( bfc[[ rid ]] )
return(result)
}
} else {
## force the query to return the 'descriptive text' header names with the result
## we use these later to match and order attribute/column names
callHeader <- TRUE
xmlQuery = paste0("<?xml version='1.0' encoding='UTF-8'?><!DOCTYPE Query><Query virtualSchemaName = '",
martVSchema(mart),
"' uniqueRows = '",
as.numeric(uniqueRows),
"' count='0' datasetConfigVersion='0.6' header='",
as.numeric(callHeader),
"' formatter='TSV' requestid='biomaRt'> <Dataset name = '",
martDataset(mart),"'>")
#checking the Attributes
invalid = !(attributes %in% listAttributes(mart, what="name"))
if(any(invalid))
stop(paste("Invalid attribute(s):", paste(attributes[invalid], collapse=", "),
"\nPlease use the function 'listAttributes' to get valid attribute names"))
#attribute are ok lets add them to the query
attributeXML = paste("<Attribute name = '", attributes, "'/>", collapse="", sep="")
#checking the filters
if(filters[1] != "" && checkFilters){
invalid = !(filters %in% listFilters(mart, what="name"))
if(any(invalid))
stop(paste("Invalid filters(s):", paste(filters[invalid], collapse=", "),
"\nPlease use the function 'listFilters' to get valid filter names"))
}
## filterXML is a list containing filters with reduced numbers of values
## to meet the 500 value limit in BioMart queries
filterXmlList <- .generateFilterXML(filters, values, mart)
resultList <- list()
if(length(filterXmlList) > 1) {
pb <- progress_bar$new(total = length(filterXmlList),
width = options()$width - 10,
format = "Batch submitting query [:bar] :percent eta: :eta")
pb$tick(0)
on.exit( pb$terminate() )
}
## we submit a query for each chunk of the filter list
for(i in seq_along(filterXmlList)) {
if(i > 1) {
pb$tick()
}
filterXML <- filterXmlList[[ i ]]
fullXmlQuery = paste(xmlQuery, attributeXML, filterXML,"</Dataset></Query>",sep="")
if(verbose) {
message(fullXmlQuery)
}
## we choose a separator based on whether '?redirect=no' is present
sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
## create a unique name for this chunk & see if it has been run before
chunk_hash <- as(openssl::md5(paste(martHost(mart), fullXmlQuery)), "character")
tf <- file.path(tempdir(), paste0("biomaRt_", chunk_hash, ".rds"))
if(!file.exists(tf)) {
postRes <- .submitQueryXML(host = paste0(martHost(mart), sep),
query = fullXmlQuery)
result <- .processResults(postRes, mart = mart, sep = sep, fullXmlQuery = fullXmlQuery,
verbose = verbose, callHeader = callHeader,
quote = quote, attributes = attributes)
saveRDS(result, file = tf)
} else {
result <- readRDS(tf)
}
resultList[[i]] <- .setResultColNames(result, mart = mart, attributes = attributes, bmHeader = bmHeader)
}
## collate results
result <- do.call('rbind', resultList)
if(useCache) {
tf <- tempfile()
save(result, file = tf)
bfcadd(bfc, rname = hash, fpath = tf, action = "copy")
file.remove(tf)
}
## remove any temp chunk files
file.remove( list.files(tempdir(), pattern = "^biomaRt.*rds$", full.names = TRUE) )
return(result)
}
}
###################################
#getLDS: Multiple dataset linking #
###################################
getLDS <- function(attributes, filters = "", values = "", mart, attributesL, filtersL = "", valuesL = "", martL, verbose = FALSE, uniqueRows = TRUE, bmHeader = TRUE) {
martCheck(mart)
martCheck(martL)
if(martHost(mart) != martHost(martL)) {
stop('Both datasets must be located on the same host.')
}
if(martBM(mart) != martBM(martL)) {
stop('Both datasets must be located in the same Mart.\n',
'You are trying to combine datasets in ',
martBM(mart), ' and ', martBM(martL))
}
invalid = !(attributes %in% listAttributes(mart, what="name"))
if(any(invalid))
stop(paste("Invalid attribute(s):", paste(attributes[invalid], collapse=", "),
"\nPlease use the function 'listAttributes' to get valid attribute names"))
invalid = !(attributesL %in% listAttributes(martL, what="name"))
if(any(invalid))
stop(paste("Invalid attribute(s):", paste(attributesL[invalid], collapse=", "),
"\nPlease use the function 'listAttributes' to get valid attribute names"))
if(filters[1] != ""){
invalid = !(filters %in% listFilters(mart, what="name"))
if(any(invalid))
stop(paste("Invalid filters(s):", paste(filters[invalid], collapse=", "),
"\nPlease use the function 'listFilters' to get valid filter names"))
}
if(filtersL[1] != ""){
invalid = !(filtersL %in% listFilters(martL, what="name"))
if(any(invalid))
stop(paste("Invalid filters(s):", paste(filtersL[invalid], collapse=", "),
"\nPlease use the function 'listFilters' to get valid filter names"))
}
xmlQuery = paste("<?xml version='1.0' encoding='UTF-8'?><!DOCTYPE Query><Query virtualSchemaName = 'default' uniqueRows = '",as.numeric(uniqueRows),"' count = '0' datasetConfigVersion = '0.6' header='",as.numeric(bmHeader),"' formatter = 'TSV' requestid= 'biomaRt'> <Dataset name = '",martDataset(mart),"'>",sep="")
attributeXML = paste("<Attribute name = '", attributes, "'/>", collapse="", sep="")
if(length(filters) > 1){
if(class(values)!= "list")
stop("If using multiple filters, the 'value' has to be a list.\nFor example, a valid list for 'value' could be: list(affyid=c('1939_at','1000_at'), chromosome= '16')\nHere we select on affyid and chromosome, only results that pass both filters will be returned");
filterXML = NULL
for(i in seq(along=filters)){
if(filterType(filters[i],mart) == 'boolean' || filterType(filters[i],mart) == 'boolean_list'){
if(!is.logical(values[[i]])) stop(paste("biomaRt error: ",filters[i]," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="")
if(!values[[i]]){
values[[i]] = 1
}
else{
values[[i]] = 0
}
filterXML = paste(filterXML,paste("<Filter name = '",filters[i],"' excluded = \"",values[[i]],"\" />", collapse="",sep=""),sep="")
}
else{
valuesString = paste(values[[i]],"",collapse=",",sep="")
filterXML = paste(filterXML,paste("<Filter name = '",filters[i],"' value = '",valuesString,"' />", collapse="",sep=""),sep="")
}
}
}
else{
if(filters != ""){
if(filterType(filters,mart) == 'boolean' || filterType(filters,mart) == 'boolean_list'){
if(!is.logical(values)) stop(paste("biomaRt error: ",filters," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="")
if(!values){
values = 1
}
else{
values = 0
}
filterXML = paste("<Filter name = '",filters,"' excluded = \"",values,"\" />", collapse="",sep="")
}
else{
valuesString = paste(values,"",collapse=",",sep="")
filterXML = paste("<Filter name = '",filters,"' value = '",valuesString,"' />", collapse="",sep="")
}
}
else{
filterXML=""
}
}
xmlQuery = paste(xmlQuery, attributeXML, filterXML,"</Dataset>",sep="")
xmlQuery = paste(xmlQuery, "<Dataset name = '",martDataset(martL),"' >", sep="")
linkedAttributeXML = paste("<Attribute name = '", attributesL, "'/>", collapse="", sep="")
if(length(filtersL) > 1){
if(class(valuesL)!= "list")
stop("If using multiple filters, the 'value' has to be a list.\nFor example, a valid list for 'value' could be: list(affyid=c('1939_at','1000_at'), chromosome= '16')\nHere we select on affyid and chromosome, only results that pass both filters will be returned");
linkedFilterXML = NULL
for(i in seq(along=filtersL)){
if(filterType(filtersL,martL) == 'boolean' || filterType(filtersL,martL) == 'boolean_list'){
if(!is.logical(valuesL[[i]])) stop(paste("biomaRt error: ",filtersL[i]," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="")
if(!valuesL[[i]]){
valuesL[[i]] = 1
}
else{
valuesL[[i]] = 0
}
linkedFilterXML = paste(linkedFilterXML,paste("<Filter name = '",filtersL[i],"' excluded = \"",valuesL[[i]],"\" />", collapse="",sep=""),sep="")
}
else{
valuesString = paste(valuesL[[i]],"",collapse=",",sep="")
linkedFilterXML = paste(linkedFilterXML,paste("<Filter name = '",filtersL[i],"' value = '",valuesString,"' />", collapse="",sep=""),sep="")
}
}
}
else{
if(filtersL != ""){
if(filterType(filtersL,martL) == 'boolean' || filterType(filtersL,martL) == 'boolean_list'){
if(!is.logical(valuesL)) stop(paste("biomaRt error: ",filtersL," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="")
if(!valuesL){
valuesL = 1
}
else{
valuesL = 0
}
linkedFilterXML = paste("<Filter name = '",filtersL,"' excluded = \"",valuesL,"\" />", collapse="",sep="")
}
else{
valuesString = paste(valuesL,"",collapse=",",sep="")
linkedFilterXML = paste("<Filter name = '",filtersL,"' value = '",valuesString,"' />", collapse="",sep="")
}
}
else{
linkedFilterXML=""
}
}
xmlQuery = paste(xmlQuery, linkedAttributeXML, linkedFilterXML,"</Dataset></Query>",sep="")
if(verbose){
cat(paste(xmlQuery,"\n", sep=""))
}
#postRes = postForm(paste(martHost(mart),"?",sep=""),"query"=xmlQuery)
## we choose a separator based on whether '?redirect=no' is present
sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
## POST query
postRes <- .submitQueryXML(host = paste0(martHost(mart), sep),
query = xmlQuery)
if(length(grep("^Query ERROR", postRes))>0L)
stop(postRes)
if(postRes != ""){
con = textConnection(postRes)
result = read.table(con, sep="\t", header=bmHeader, quote = "\"", comment.char = "", as.is=TRUE, check.names = TRUE)
close(con)
if(nrow(result) > 0 && all(is.na(result[,ncol(result)])))
result = result[,-ncol(result),drop=FALSE]
res_attributes <- c(attributes,attributesL)
if(!(is(result, "data.frame") && (ncol(result)==length(res_attributes)))) {
print(head(result))
stop("The query to the BioMart webservice returned an invalid result: ",
"the number of columns in the result table does not equal the number of attributes in the query. \n",
"Please report this on the support site at http://support.bioconductor.org")
}
if(!bmHeader){ #assumes order of results same as order of attibutes in input
colnames(result) = res_attributes
}
} else {
warning("getLDS returns NULL.")
result=NULL
}
return(result)
}
######################
#getBMlist
######################
getBMlist <- function(attributes, filters = "", values = "", mart, list.names = NULL, na.value = NA, verbose=FALSE, giveWarning=TRUE){
if(giveWarning) writeLines("Performing your query using getBM is preferred as getBMlist perfoms a separate getBM query for each of the values one gives. This is ok for a short list but will definitely fail when used with longer lists. Ideally one does a batch query with getBM and then iterates over that result.")
out <- vector("list", length(attributes))
if(is.null(list.names))
names(out) <- attributes
else
names(out) <- list.names
for(j in seq(along = attributes)){
tmp2 <- vector("list", length(values))
names(tmp2) <- values
for(k in seq(along = tmp2)){
tst <- getBM(attributes = attributes[j], filters=filters, values = values[k], mart = mart, verbose = verbose)
if(class(tst) == "data.frame"){
tmp <- unlist(unique(tst[!is.na(tst)]), use.names = FALSE)
if(length(tmp) > 0)
tmp2[[k]] <- tmp
else
tmp2[[k]] <- na.value
}else{
tmp2[[k]] <- na.value
}
out[[j]] <- tmp2
}
}
return(out)
}
###############################
# #
#Ensembl specific functions #
###############################
getGene <- function( id, type, mart){
martCheck(mart,"ensembl")
checkWrapperArgs(id, type, mart)
symbolAttrib = switch(strsplit(martDataset(mart), "_", fixed = TRUE, useBytes = TRUE)[[1]][1],hsapiens = "hgnc_symbol",mmusculus = "mgi_symbol","external_gene_id")
typeAttrib = switch(type,affy_hg_u133a_2 = "affy_hg_u133a_v2",type)
attrib = c(typeAttrib,symbolAttrib,"description","chromosome_name","band","strand","start_position","end_position","ensembl_gene_id")
table = getBM(attributes = attrib,filters = type, values = id, mart=mart)
return(table)
}
getSequence <- function(chromosome, start, end, id, type, seqType, upstream, downstream, mart, verbose=FALSE){
martCheck(mart,c("ensembl","ENSEMBL_MART_ENSEMBL"))
if(missing(seqType) || !seqType %in% c("cdna","peptide","3utr","5utr", "gene_exon", "transcript_exon",
"transcript_exon_intron", "gene_exon_intron","coding","coding_transcript_flank",
"coding_gene_flank","transcript_flank","gene_flank")) {
stop("Please specify the type of sequence that needs to be retrieved when using biomaRt in web service mode. Choose either gene_exon, transcript_exon,transcript_exon_intron, gene_exon_intron, cdna, coding,coding_transcript_flank,coding_gene_flank,transcript_flank,gene_flank,peptide, 3utr or 5utr")
}
if(missing(type))
stop("Please specify the type argument. If you use chromosomal coordinates to retrieve sequences, then the type argument will specify the type of gene indentifiers that you will retrieve with the sequences. ",
"If you use a vector of identifiers to retrieve the sequences, the type argument specifies the type of identifiers you are using.")
if(missing(id) && missing(chromosome) && !missing(type))
stop("No vector of identifiers given. Please use the id argument to give a vector of identifiers for which you want to retrieve the sequences.")
if(!missing(chromosome) && !missing(id))
stop("The getSequence function retrieves sequences given a vector of identifiers specified with the id argument of a type specified by the type argument. Or alternatively getSequence retrieves sequences given a chromosome, a start and a stop position on the chromosome. As you specified both a vector of identifiers and chromsomal coordinates. Your query won't be processed.")
if(!missing(chromosome)){
if(!missing(start) && missing(end))
stop("You specified a chromosomal start position but no end position. Please also specify a chromosomal end position.")
if(!missing(end) && missing(start))
stop("You specified a chromosomal end position but no start position. Please also specify a chromosomal start position.")
if(!missing(start)){ start = as.integer(start)
end = as.integer(end)
}
if(missing(upstream) && missing(downstream)){
sequence = getBM(c(seqType,type),
filters = c("chromosome_name","start","end"),
values = list(chromosome, start, end),
mart = mart,
checkFilters = FALSE,
verbose=verbose)
}
else{
if(!missing(upstream) && missing(downstream)){
sequence = getBM(c(seqType,type),
filters = c("chromosome_name","start","end","upstream_flank"),
values = list(chromosome, start, end, upstream),
mart = mart,
checkFilters = FALSE,
verbose=verbose)
}
if(!missing(downstream) && missing(upstream)){
sequence = getBM(c(seqType,type),
filters = c("chromosome_name","start","end","downstream_flank"),
values = list(chromosome, start, end, downstream),
mart = mart,
checkFilters = FALSE,
verbose = verbose)
}
if(!missing(downstream) && !missing(upstream)){
stop("Currently getSequence only allows the user to specify either an upstream of a downstream argument but not both.")
}
}
}
if(!missing(id)){
if(missing(type)) {
stop("Type argument is missing. ",
"This will be used to retrieve an identifier along with the sequence so one knows which gene it is from. ",
"Use the listFilters() function to select a valid type argument.")
}
if(!type %in% listFilters(mart, what="name")) {
stop("Invalid type argument. Use the listFilters() function to select a valid type argument.")
}
valuesString = paste(id,"",collapse=",",sep="")
if(missing(upstream) && missing(downstream)){
sequence = getBM(c(seqType,type), filters = type, values = id, mart = mart, verbose=verbose)
}
else{
if(!missing(upstream) && missing(downstream)){
sequence = getBM(c(seqType,type), filters = c(type, "upstream_flank"), values = list(id, upstream), mart = mart, checkFilters = FALSE, verbose=verbose)
}
if(!missing(downstream) && missing(upstream)){
sequence = getBM(c(seqType,type), filters = c(type, "downstream_flank"), values = list(id, downstream), mart = mart, checkFilters = FALSE, verbose=verbose)
}
if(!missing(downstream) && !missing(upstream)){
stop("Currently getSequence only allows the user to specify either an upstream of a downstream argument but not both.")
}
}
}
return(sequence)
}
####################
#export FASTA #
####################
exportFASTA <- function( sequences, file ) {
if( missing( sequences ) || class( sequences ) != "data.frame"){
stop("No data.frame given to write FASTA. The data.frame should be the output of the getSequence function.");
}
if( missing(file)){
stop("Please provide filename to write to");
}
if(length(sequences[1,]) == 2){
for(i in seq(along = sequences[,2])){
cat(paste(">",sequences[i,2],"\n",sep=""),file = file, append=TRUE);
cat(as.character(sequences[i,1]),file = file, append = TRUE);
cat("\n\n", file = file, append = TRUE);
}
}
else{
for(i in seq(along = sequences[,2])){
cat(paste(">chromosome_",sequences[i,1],"_start_",sequences[i,2],"_end_",sequences[i,3],"\n",sep=""),file = file, append=TRUE);
cat(as.character(sequences[i,4]),file = file, append = TRUE);
cat("\n\n", file = file, append = TRUE);
}
}
}
###################
#Nature Protocol
###################
NP2009code <- function(){
edit(file=system.file('scripts', 'Integration-NP.R', package = 'biomaRt'))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.