R/biomaRt.R

Defines functions NP2009code exportFASTA getSequence getGene getBMlist getLDS getBM filterType filterOptions listFilters attributePages listAttributes useDataset checkDataset bmAttrFilt bmVersion .listDatasets listDatasets .useMart useMart .listMarts listMarts bmRequest checkWrapperArgs martCheck messageToUser

Documented in attributePages exportFASTA filterOptions filterType getBM getBMlist getGene getLDS getSequence listAttributes listDatasets listFilters listMarts NP2009code useDataset useMart

##########################
#biomaRt source code     #
##########################
#                        #
#Licence: Artistic       #
#Author: Steffen Durinck #
##########################

###############
#messageToUser#
###############

messageToUser <- function(message){
    if(interactive()){
        cat(message)
    }
}

##############################################################
#martCheck                                                   # 
#                                                            #
#This function checks if there is a valid Mart object,       # 
#if a dataset is selected and                                #
#if the correct BioMart database has been selected (optional)# 
##############################################################

martCheck = function(mart, biomart = NULL){
    if( missing( mart ) || class( mart ) != 'Mart'){
        stop("You must provide a valid Mart object. To create a Mart object use the function: useMart.  Check ?useMart for more information.")
    }
    if(!is.null(biomart)){
        martcheck = martBM(mart)
        bmok = FALSE
        for(k in seq_along(biomart)) {
            if(martcheck[1] == biomart[k]) {	
                bmok = TRUE
            }
        }		    
        if(!bmok){
            stop(paste("This function only works when used with the ",biomart," BioMart.",sep="")) 
        }      
    }
    if(martDataset(mart)==""){
        stop("No dataset selected, please select a dataset first.  You can see the available datasets by using the listDatasets function see ?listDatasets for more information.  Then you should create the Mart object by using the useMart function.  See ?useMart for more information");
    }
}

checkWrapperArgs = function(id, type, mart){
    if(missing(type))stop("Specify the type of identifier you are using, see ?getGene for details. Valid values for the type argument can be found with the listFilters function.") 
    if(!type %in% listFilters(mart)[,1])stop(paste("Invalid identifier type:",type," see ?getGene for details. Use the listFilters function to get the valid value for the type argument.", sep=""))   
    if(missing(id))stop("No identifiers specified.  Use the id argument to specify a vector of identifiers for which you want to retrieve the annotation.")
}


bmRequest <- function(request, verbose = FALSE){
    if(verbose) writeLines(paste("Attempting web service request:\n",request, sep=""))

    result <- tryCatch(httr::GET(request, content_type("text/plain"), timeout(10)),
                       error = function(c) { "timeout" } )
    
    tryAgain <- any(result == "timeout") || httr::status_code(result) == 500
    
    ## try an alternative mirror if ensembl returns 500
    if(tryAgain && grepl("ensembl", request)) {
        mirrors <- c("www", "uswest", "useast", "asia")
        subdomain <- stringr::str_match(request, "://([a-z]{3,5})\\.")[1,2]
        mirror_option <- sample(mirrors[!mirrors %in% subdomain], size = 1)
        message("Ensembl site unresponsive, trying ", mirror_option, " mirror")
        request2 <- str_replace(request, pattern = "://([a-z]{3,5})\\.", 
                                replacement = paste0("://", mirror_option, "."))
        result <- httr::GET(request2, content_type("text/plain"))
    } else { ## this isn't ensembl run the query again so we can get the error 
        result <- httr::GET(request, content_type("text/plain"))
    }
    
    stop_for_status(result)
    result2 <- content(result, encoding = "UTF-8")
    if(is.na(result2)) {
        result2 <- content(result, encoding = "Latin1")
    }
    return(result2)
}

#######################################################
#listMarts:                                           #
#list all available BioMart databases by default      #
#listMarts will check the central service to see which#
#BioMart databases are present                        #
#######################################################

listMarts <- function( mart = NULL, host="www.ensembl.org", path="/biomart/martservice", 
                       port, includeHosts = FALSE, archive = FALSE, ensemblRedirect = NULL, verbose = FALSE){
    
    if(missing(port)) {
        port <- ifelse(grepl("https", host), yes = 443, no = 80)
    }
    
    if(!is.null(ensemblRedirect)) {
        warning('The argument "ensemblRedirect" has been deprecated and does not do anything.',
                '\nSee ?useEnsembl for details on using mirror sites.')
    }
    
    .listMarts(mart = mart, host = host, path = path, port = port, includeHosts = includeHosts,
                archive = archive, verbose = verbose, ensemblRedirect = TRUE)
    
}

.listMarts <- function( mart = NULL, host="www.ensembl.org", path="/biomart/martservice", 
                       port=80, includeHosts = FALSE, archive = FALSE, verbose = FALSE, 
                       ensemblRedirect = NULL){

    request = NULL
    if(is.null(mart)){
        
        host <- .cleanHostURL(host)
        if(archive) {
            stop("The archive = TRUE argument is now defunct.\nUse listEnsemblArchives() to find the URL to directly query an Ensembl archive.")
        } else {
            request <- paste0(host, ":", port, path, "?type=registry&requestid=biomaRt")
        }
    }
    else{
        if(class(mart) == 'Mart'){
            request = paste0(martHost(mart), "?type=registry&requestid=biomaRt") 
        }
        else{
            warning(paste(mart,"object needs to be of class Mart created with the useMart function.  If you don't have a Mart object yet, use listMarts without arguments or only specify the host argument",sep=" "))
        }
    } 	
    
    if(!ensemblRedirect && grepl(x = request, pattern = "ensembl.org")) {
        request <- paste0(request, "&redirect=no")
    }
    registry = bmRequest(request = request, verbose = verbose)
    
    ## check this looks like the MartRegistry XML, otherwise throw an error
    if(!grepl(x = registry, pattern = "^\n*<MartRegistry>")) {
        
        if(grepl(x = registry, pattern = "status.ensembl.org")) {
            stop("Your query has been redirected to http://status.ensembl.org ",
                 "indicating this Ensembl service is currently unavailable",
                 "\nLook at ?useEnsembl for details on how to try a mirror site.",
                 call. = FALSE)
        } else {
            stop('Unexpected format to the list of available marts.\n',
                 'Please check the following URL manually, ',
                 'and try ?listMarts for advice.\n',
                 request, 
                 call. = FALSE)
        }
    }
    registry = xmlTreeParse(registry, asText=TRUE)
    registry = registry$doc$children[[1]]
    
    marts = list(biomart = NULL, version = NULL, host = NULL, path = NULL, database = NULL)
    index = 1
    
    # if(host != "www.biomart.org" || archive){
        for(i in seq(len=xmlSize(registry))){
            if(xmlName(registry[[i]])=="MartURLLocation"){  
                if(xmlGetAttr(registry[[i]],"visible") == 1){
                    if(!is.null(xmlGetAttr(registry[[i]],"name"))) marts$biomart[index] = as.character(xmlGetAttr(registry[[i]],"name"))
                    if(!is.null(xmlGetAttr(registry[[i]],"database"))) marts$database[index] = as.character(xmlGetAttr(registry[[i]],"database"))
                    if(!is.null(xmlGetAttr(registry[[i]],"displayName"))) marts$version[index] = as.character(xmlGetAttr(registry[[i]],"displayName"))
                    if(!is.null(xmlGetAttr(registry[[i]],"host"))) marts$host[index] = as.character(xmlGetAttr(registry[[i]],"host"))
                    if(!is.null(xmlGetAttr(registry[[i]],"path"))) marts$path[index] = as.character(xmlGetAttr(registry[[i]],"path"))
                    if(!is.null(xmlGetAttr(registry[[i]],"port"))) marts$port[index] = as.character(xmlGetAttr(registry[[i]],"port"))
                    if(!is.null(xmlGetAttr(registry[[i]],"serverVirtualSchema"))){
                        marts$vschema[index] =  as.character(xmlGetAttr(registry[[i]],"serverVirtualSchema"))
                    }
                    index=index+1
                }
            }
        }
    if(includeHosts){
        return(marts)
    }
    else{
        ret = data.frame(biomart = as.character(marts$biomart),
                         version = as.character(marts$version), 
                         stringsAsFactors=FALSE)
        return(ret)
    } 
}

#################################
# #                           # #
# # Generic BioMart functions # #
# #                           # #
#################################

useMart <- function(biomart, dataset, host = "https://www.ensembl.org", path = "/biomart/martservice", port, 
                     archive = FALSE, ensemblRedirect = NULL, version, verbose = FALSE) {
    
    if(missing(port)) {
        port <- ifelse(grepl("https", host), yes = 443, no = 80)
    }
    
    if(!is.null(ensemblRedirect)) {
        warning('The argument "ensemblRedirect" has been deprecated and will be removed in the next biomaRt release.')
    }
    
    mart <- .useMart(biomart, dataset, host = host, path = path, port = port, 
                     archive = archive, version = version, verbose = verbose, ensemblRedirect = TRUE)
}

.useMart <- function(biomart, dataset, host = "www.ensembl.org", path = "/biomart/martservice", port = 80, 
                    archive = FALSE, ensemblRedirect = NULL, version, verbose = FALSE){
    
    if(missing(biomart) && missing(version)) 
        stop("No biomart databases specified. Specify a biomart database to use using the biomart or version argument")
    if(!missing(biomart)){ 
        if(!(is.character(biomart)))
            stop("biomart argument is not a string. ",
                 "The biomart argument should be a single character string")
    }
    
    if(biomart == "ensembl" & grepl(x = host, pattern = "ensembl.org")) {
        biomart = "ENSEMBL_MART_ENSEMBL"
    }
    
    reqHost = host
    host <- .cleanHostURL(host)
    
    marts <- listMarts(host=host, path=path, port=port, includeHosts = TRUE,
                       archive = archive)
    mindex = NA
    if(!missing(biomart)){ 
        mindex=match(biomart,marts$biomart)
    }
    if(!missing(version)){
        mindex=match(version,marts$version)
    }
    if(is.na(mindex) || archive){
        mindex=match(biomart,marts$database)
    }
    if(is.na(mindex))
        stop("Incorrect BioMart name, use the listMarts function to see which BioMart databases are available")
    
    if(is.na(marts$path[mindex]) || is.na(marts$vschema[mindex]) || 
       is.na(marts$host[mindex]) || is.na(marts$port[mindex]) || 
       is.na(marts$path[mindex])) 
        stop("The selected biomart databases is not available due to error in the BioMart central registry, please report so the BioMart registry file can be fixed.")
    
    if(marts$path[mindex]=="") marts$path[mindex]="/biomart/martservice" #temporary to catch bugs in registry
    #if(archive) biomart = marts$biomart[mindex]
    if(!missing(version)) biomart = marts$biomart[mindex]
    biomart = sub(" ","%20",biomart, fixed = TRUE, useBytes = TRUE)
    
    ## adding option to force use of specificed host with ensembl
    redirect <- ifelse(!ensemblRedirect && grepl(x = host, pattern = "ensembl.org"), 
                       "?redirect=no",
                       "")
    
    mart <- new("Mart", 
                biomart = biomart,
                vschema = marts$vschema[mindex], 
                host = paste0(host, ":", 
                              port,
                              marts$path[mindex],
                              redirect))
    
    if(length(grep("archive",martHost(mart)) > 0)){
        
        ## hack to work around redirection of most recent mirror URL
        archives <- listEnsemblArchives()
        current_release <- archives[archives$current_release == "*", 'url']
        if(grepl(mart@host, pattern = current_release)) {
            mart@host <- stringr::str_replace(mart@host, pattern = current_release, "https://www.ensembl.org")
            mart@host <- stringr::str_replace(mart@host, pattern = ":80/", ":443/")
        }
    }
    
    BioMartVersion=bmVersion(mart, verbose=verbose)
    
    if(verbose){
        writeLines(paste("BioMartServer running BioMart version:",BioMartVersion,sep=" "))
        writeLines(paste("Mart virtual schema:",martVSchema(mart),sep=" "))
        if(length(grep(reqHost,martHost(mart))) == 0){
            writeLines(paste("Requested host was redirected from ", reqHost, " to " ,martHost(mart),sep=""))
        } 
        writeLines(paste("Mart host:",martHost(mart),sep=" "))
    }
    if(!missing(dataset)){
        mart = useDataset(mart = mart, dataset=dataset, verbose = verbose)
    }
    return(mart)
}

listDatasets <- function(mart, verbose = FALSE) {
    .listDatasets(mart = mart, verbose = verbose, sort = TRUE)
}

.listDatasets <- function(mart, verbose = FALSE, sort = FALSE) {
    if(missing(mart) || !is(mart, 'Mart'))
        stop("No Mart object given or object not of class 'Mart'")
    
    ## we choose a separator based on whether 'redirect=no' is present
    ## should always be '?' now
    sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
    
    request = paste0(martHost(mart), sep, "type=datasets&requestid=biomaRt&mart=", martBM(mart))
    bmResult = bmRequest(request = request, verbose = verbose)
    con = textConnection(bmResult)
    txt = scan(con, sep="\t", blank.lines.skip=TRUE, what="character", quiet=TRUE, quote = "\"")
    close(con)
    
    ## select visible ("1") table sets
    i = intersect(which(txt=="TableSet"), which(txt=="1")-3L)
    
    res = data.frame(dataset     = I(txt[i+1L]),
                     description = I(txt[i+2L]),
                     version     = I(txt[i+4L]))
    
    ## sort alphabetically
    if(sort)
        res <- res[ order(res$dataset), ]
    rownames(res) <- NULL
    
    return(res)
}

## Check version of BioMart service

bmVersion <- function(mart, verbose=FALSE){
    
    ## we choose a separator based on whether 'redirect=no' is present
    sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
    
    request = paste0(martHost(mart), sep, "type=version", "&requestid=biomaRt&mart=", martBM(mart))
    BioMartVersion = bmRequest(request = request, verbose = verbose)
    bmv = ""
    if(BioMartVersion == "\n" | BioMartVersion == ""){
        bmv = NA
        if(verbose) warning(paste("BioMart version is not available from BioMart server:",request,sep="\n"))
    }
    else{
        con = textConnection(BioMartVersion)
        bmVersionParsed = read.table(con, sep="\t", header=FALSE, quote = "", comment.char = "", as.is=TRUE)
        close(con)
        if(verbose) print(bmVersionParsed)
        
        if(dim(bmVersionParsed)[2] >= 1){
            bmv=bmVersionParsed[1,1]
        }
    }
    return(bmv)
}

## Retrieve attributes and filters from web service
bmAttrFilt <- function(type, mart, verbose=FALSE){
    
    ## we choose a separator based on whether 'redirect=no' is present
    sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
    
    request = paste0(martHost(mart), sep, "type=", type,
                     "&dataset=", martDataset(mart),
                     "&requestid=biomaRt&mart=", martBM(mart),
                     "&virtualSchema=", martVSchema(mart))
    attrfilt = bmRequest(request = request, verbose = verbose)
    con = textConnection(attrfilt)
    attrfiltParsed = read.table(con, sep="\t", header=FALSE, quote = "", comment.char = "", as.is=TRUE)
    close(con)
    if(type=="attributes"){
        if(dim(attrfiltParsed)[2] < 3)
            stop("biomaRt error: looks like we're connecting to incompatible version of BioMart suite.")
        cnames = seq_len(dim(attrfiltParsed)[2])
        cnames=paste(type,cnames,sep="")
        cnames[1] = "name"
        cnames[2] = "description"
        cnames[3] = "fullDescription"
        if(dim(attrfiltParsed)[2] < 4){
            warning("biomaRt warning: looks like we're connecting to an older ",
                    "version of BioMart suite.\nSome biomaRt functions might ",
                    "not work.")
        }
        else{
            cnames[4] = "page"
        }
        colnames(attrfiltParsed) = cnames
    }
    if(type=="filters"){
        if(dim(attrfiltParsed)[2] < 4)
            stop("biomaRt error: looks like we're connecting to incompatible version of BioMart suite.")
        cnames = seq(1:dim(attrfiltParsed)[2])
        cnames=paste(type,cnames,sep="")
        cnames[1] = "name"
        cnames[2] = "description"
        cnames[3] = "options"
        cnames[4] = "fullDescription"
        if(dim(attrfiltParsed)[2] < 7){
            warning("biomaRt warning: looks like we're connecting to an older version of BioMart suite. Some biomaRt functions might not work.")
        }
        else{
            cnames[5] = "filters"
            cnames[6] = "type"
            cnames[7] = "operation"
        }
        colnames(attrfiltParsed) = cnames
    }
    return(attrfiltParsed)
}

## Utilty function to check dataset specification
## Returns dataset name as a character assuming all checks
## have been passed.
checkDataset <- function(dataset, mart) {
    
    validDatasets <- .listDatasets(mart, sort = FALSE)
    ## subseting data.frames can produce some weird classes
    ## which aren't character(), so we coerce it here
    dataset <- as.character(dataset)
    
    if(length(dataset) > 1) 
        stop("Please only specify a single dataset name")
    
    if(is.na(match(dataset, validDatasets$dataset)))
        stop(paste("The given dataset: ",dataset,", is not valid.  Correct dataset names can be obtained with the listDatasets() function."))
    
    return(dataset)
}

## Select a BioMart dataset             
useDataset <- function(dataset, mart, verbose = FALSE){
    if(missing(mart) || class(mart)!="Mart") 
        stop("No valid Mart object given, specify a Mart object with the attribute mart")
    
    if(missing(dataset)) {
        stop("No dataset given.  Please use the dataset argument to specify which dataset you want to use. Correct dataset names can be obtained with the listDatasets() function.")
    } else {
        dataset <- checkDataset(dataset = dataset, mart = mart)
    }
    martDataset(mart) <- dataset  
    
    if(verbose) messageToUser("Checking attributes ...")
    martAttributes(mart) <- bmAttrFilt("attributes",mart, verbose = verbose)
    if(verbose){
        messageToUser(" ok\n")
        messageToUser("Checking filters ...")
    }
    martFilters(mart) <- bmAttrFilt("filters",mart, verbose = verbose)
    if(verbose) messageToUser(" ok\n")
    return( mart )
}

## listAttributes

listAttributes <- function(mart, page, what = c("name","description","page")) {
    martCheck(mart)
    if(!missing(page) && !page %in% attributePages(mart)) 
        stop("The chosen page: ",page," is not valid, please use the correct page name using the attributePages function")
    attrib=NULL
    if(!missing(page)){
        sel = which(martAttributes(mart)[,"page"] == page)
        attrib = martAttributes(mart)[sel,what]
    }
    else{
        attrib = martAttributes(mart)[,what]
    }
    return(attrib)
}

## attributePages

attributePages <- function(mart){
    
    martCheck(mart)
    pages = unique(martAttributes(mart)[,"page"])
    return(pages)
}

## listFilters

listFilters <- function(mart, what = c("name", "description")) {
    
    martCheck(mart)
    filters = martFilters(mart)
    badwhat = !(what %in% colnames(filters))
    if(any(badwhat))
        stop(sprintf("The function argument 'what' contains %s: %s\nValid are: %s\n",
                     if(sum(badwhat)>1) "invalid values" else "an invalid value",
                     paste(what[badwhat], collapse=", "),
                     paste(colnames(filters), collapse=", ")))
    return(filters[, what])
}

## filterOptions

filterOptions <- function(filter, mart){
    if(missing(filter)) stop("No filter given. Please specify the filter for which you want to retrieve the possible values.")
    if(class(filter)!="character")stop("Filter argument should be of class character")
    martCheck(mart)
    if(!filter %in% listFilters(mart, what="name"))stop("Filter not valid, check for typo in filter argument.")
    sel = which(listFilters(mart, what="name") == filter)
    return(listFilters(mart,what="options")[sel])
}

## filterType

filterType <- function(filter, mart){
    if(missing(filter)) stop("No filter given. Please specify the filter for which you want to retrieve the filter type")
    if(class(filter)!="character")stop("Filter argument should be of class character")
    martCheck(mart)
    type="unknown"
    sel = which(listFilters(mart, what="name") == filter)
    if(is.null(sel))stop(paste("Invalid filter",filter, sep=": "))
    type = listFilters(mart,what="type")[sel]
    return(type)
}

##########################################
#getBM: generic BioMart query function   # 
##########################################

getBM <- function(attributes, filters = "", values = "", mart, curl = NULL, 
                  checkFilters = TRUE, verbose=FALSE, uniqueRows=TRUE, bmHeader=FALSE, quote="\"",
                  useCache = TRUE){
    
    ## check the arguments are all valid
    martCheck(mart)
    if(missing( attributes ))
        stop("Argument 'attributes' must be specified.")
    
    if(is.list(filters) && !missing( values ))
        warning("Argument 'values' should not be used when argument 'filters' is a list and will be ignored.")
    if(is.list(filters) && is.null(names(filters)))
        stop("Argument 'filters' must be a named list when sent as a list.")
    if(!is.list(filters) && all(filters != "") && missing( values ))
        stop("Argument 'values' must be specified.")
    if(length(filters) > 0 && length(values) == 0)
        stop("Values argument contains no data.")
    if(is.list(filters)){
        values = filters
        filters = names(filters)
    }
    if(class(uniqueRows) != "logical")
        stop("Argument 'uniqueRows' must be a logical value, so either TRUE or FALSE")
    
    ## determine if we should use the results cache
    if(useCache) {
        cache <- Sys.getenv(x = "BIOMART_CACHE", 
                            unset = rappdirs::user_cache_dir(appname="biomaRt"))
        bfc <- BiocFileCache::BiocFileCache(cache, ask = FALSE)
    }
    hash <- .createHash(mart, attributes, filters, values, uniqueRows, bmHeader)
    if( useCache && .checkCache(bfc, hash) ) {
        
        if(verbose) {
            message("Cache found")
        }
        cache_hits <- bfcquery(bfc, hash, field = "rname")
        if(nrow(cache_hits) > 1) {
            stop("Multiple cache results found")
        } else {
            rid <- cache_hits$rid
            load( bfc[[ rid ]] )
            return(result)
        }

    } else { 
    
    ## force the query to return the 'descriptive text' header names with the result
    ## we use these later to match and order attribute/column names    
    callHeader <- TRUE
    xmlQuery = paste0("<?xml version='1.0' encoding='UTF-8'?><!DOCTYPE Query><Query  virtualSchemaName = '",
                      martVSchema(mart),
                      "' uniqueRows = '",
                      as.numeric(uniqueRows),
                      "' count='0' datasetConfigVersion='0.6' header='",
                      as.numeric(callHeader),
                      "' formatter='TSV' requestid='biomaRt'> <Dataset name = '",
                      martDataset(mart),"'>")
    
    #checking the Attributes
    invalid = !(attributes %in% listAttributes(mart, what="name"))
    if(any(invalid))
        stop(paste("Invalid attribute(s):", paste(attributes[invalid], collapse=", "),
                   "\nPlease use the function 'listAttributes' to get valid attribute names"))
    
    #attribute are ok lets add them to the query
    attributeXML = paste("<Attribute name = '", attributes, "'/>", collapse="", sep="")
    
    #checking the filters
    if(filters[1] != "" && checkFilters){
        invalid = !(filters %in% listFilters(mart, what="name"))
        if(any(invalid))
            stop(paste("Invalid filters(s):", paste(filters[invalid], collapse=", "),
                       "\nPlease use the function 'listFilters' to get valid filter names"))
    }
    
    ## filterXML is a list containing filters with reduced numbers of values
    ## to meet the 500 value limit in BioMart queries
    filterXmlList <- .generateFilterXML(filters, values, mart)
    
    resultList <- list()
    if(length(filterXmlList) > 1) {
        pb <- progress_bar$new(total = length(filterXmlList),
                               width = options()$width - 10,
                               format = "Batch submitting query [:bar] :percent eta: :eta")
        pb$tick(0)
        on.exit( pb$terminate() )
    }
    
    ## we submit a query for each chunk of the filter list
    for(i in seq_along(filterXmlList)) {
        
        if(i > 1) {
            pb$tick()
        }
        
        filterXML <- filterXmlList[[ i ]]
        fullXmlQuery = paste(xmlQuery, attributeXML, filterXML,"</Dataset></Query>",sep="")
        
        if(verbose) {
            message(fullXmlQuery)
        }      
        
        ## we choose a separator based on whether '?redirect=no' is present
        sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
        
        ## create a unique name for this chunk & see if it has been run before
        chunk_hash <- as(openssl::md5(paste(martHost(mart), fullXmlQuery)), "character")
        tf <- file.path(tempdir(), paste0("biomaRt_", chunk_hash, ".rds"))
        if(!file.exists(tf)) {
            postRes <- .submitQueryXML(host = paste0(martHost(mart), sep),
                                   query = fullXmlQuery)
            result <- .processResults(postRes, mart = mart, sep = sep, fullXmlQuery = fullXmlQuery,
                                      verbose = verbose, callHeader = callHeader, 
                                      quote = quote, attributes = attributes)
            saveRDS(result, file = tf)
        } else {
            result <- readRDS(tf)
        }
        resultList[[i]] <- .setResultColNames(result, mart = mart, attributes = attributes, bmHeader = bmHeader)
    }
    ## collate results
    result <- do.call('rbind', resultList)

    if(useCache) {
        tf <- tempfile()
        save(result, file = tf)
        bfcadd(bfc, rname = hash, fpath = tf, action = "copy")
        file.remove(tf)
    }
    
    ## remove any temp chunk files
    file.remove( list.files(tempdir(), pattern = "^biomaRt.*rds$", full.names = TRUE) )
    return(result)
    }
}

###################################
#getLDS: Multiple dataset linking #
###################################

getLDS <- function(attributes, filters = "", values = "", mart, attributesL, filtersL = "", valuesL = "", martL, verbose = FALSE, uniqueRows = TRUE, bmHeader = TRUE) {
    
    martCheck(mart)
    martCheck(martL)
    
    if(martHost(mart) != martHost(martL)) {
        stop('Both datasets must be located on the same host.')
    }
    
    if(martBM(mart) != martBM(martL)) {
        stop('Both datasets must be located in the same Mart.\n',
             'You are trying to combine datasets in ', 
             martBM(mart), ' and ', martBM(martL))
    }
    
    invalid = !(attributes %in% listAttributes(mart, what="name"))
    if(any(invalid))
        stop(paste("Invalid attribute(s):", paste(attributes[invalid], collapse=", "),
                   "\nPlease use the function 'listAttributes' to get valid attribute names"))
    
    invalid = !(attributesL %in% listAttributes(martL, what="name"))
    if(any(invalid))
        stop(paste("Invalid attribute(s):", paste(attributesL[invalid], collapse=", "),
                   "\nPlease use the function 'listAttributes' to get valid attribute names"))
    
    if(filters[1] != ""){
        invalid = !(filters %in% listFilters(mart, what="name"))
        if(any(invalid))
            stop(paste("Invalid filters(s):", paste(filters[invalid], collapse=", "),
                       "\nPlease use the function 'listFilters' to get valid filter names"))
    }
    if(filtersL[1] != ""){
        invalid = !(filtersL %in% listFilters(martL, what="name"))
        if(any(invalid))
            stop(paste("Invalid filters(s):", paste(filtersL[invalid], collapse=", "),
                       "\nPlease use the function 'listFilters' to get valid filter names"))
    }
    
    xmlQuery = paste("<?xml version='1.0' encoding='UTF-8'?><!DOCTYPE Query><Query  virtualSchemaName = 'default' uniqueRows = '",as.numeric(uniqueRows),"' count = '0' datasetConfigVersion = '0.6' header='",as.numeric(bmHeader),"' formatter = 'TSV' requestid= 'biomaRt'> <Dataset name = '",martDataset(mart),"'>",sep="")
    attributeXML = paste("<Attribute name = '", attributes, "'/>", collapse="", sep="")
    if(length(filters) > 1){
        if(class(values)!= "list")
            stop("If using multiple filters, the 'value' has to be a list.\nFor example, a valid list for 'value' could be: list(affyid=c('1939_at','1000_at'), chromosome= '16')\nHere we select on affyid and chromosome, only results that pass both filters will be returned");
        filterXML = NULL
        for(i in seq(along=filters)){
            if(filterType(filters[i],mart) == 'boolean' || filterType(filters[i],mart) == 'boolean_list'){
                if(!is.logical(values[[i]])) stop(paste("biomaRt error: ",filters[i]," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="") 
                if(!values[[i]]){
                    values[[i]] = 1
                }
                else{
                    values[[i]] = 0 
                }
                filterXML = paste(filterXML,paste("<Filter name = '",filters[i],"' excluded = \"",values[[i]],"\" />", collapse="",sep=""),sep="")
            }
            else{
                valuesString = paste(values[[i]],"",collapse=",",sep="")
                filterXML = paste(filterXML,paste("<Filter name = '",filters[i],"' value = '",valuesString,"' />", collapse="",sep=""),sep="")
            }
        }
    }
    else{
        if(filters != ""){       
            if(filterType(filters,mart) == 'boolean' || filterType(filters,mart) == 'boolean_list'){
                if(!is.logical(values)) stop(paste("biomaRt error: ",filters," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="") 
                if(!values){
                    values = 1
                }
                else{
                    values = 0 
                }
                
                filterXML = paste("<Filter name = '",filters,"' excluded = \"",values,"\" />", collapse="",sep="")
            }
            else{
                valuesString = paste(values,"",collapse=",",sep="")
                filterXML = paste("<Filter name = '",filters,"' value = '",valuesString,"' />", collapse="",sep="")
            }
        }
        else{
            filterXML=""
        }
    }
    
    xmlQuery = paste(xmlQuery, attributeXML, filterXML,"</Dataset>",sep="")
    xmlQuery = paste(xmlQuery, "<Dataset name = '",martDataset(martL),"' >", sep="")
    linkedAttributeXML =  paste("<Attribute name = '", attributesL, "'/>", collapse="", sep="")
    
    if(length(filtersL) > 1){
        if(class(valuesL)!= "list")
            stop("If using multiple filters, the 'value' has to be a list.\nFor example, a valid list for 'value' could be: list(affyid=c('1939_at','1000_at'), chromosome= '16')\nHere we select on affyid and chromosome, only results that pass both filters will be returned");
        linkedFilterXML = NULL
        for(i in seq(along=filtersL)){
            if(filterType(filtersL,martL) == 'boolean' || filterType(filtersL,martL) == 'boolean_list'){
                if(!is.logical(valuesL[[i]])) stop(paste("biomaRt error: ",filtersL[i]," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="") 
                if(!valuesL[[i]]){
                    valuesL[[i]] = 1
                }
                else{
                    valuesL[[i]] = 0 
                } 
                linkedFilterXML = paste(linkedFilterXML,paste("<Filter name = '",filtersL[i],"' excluded = \"",valuesL[[i]],"\" />", collapse="",sep=""),sep="")
            }
            else{
                valuesString = paste(valuesL[[i]],"",collapse=",",sep="")
                linkedFilterXML = paste(linkedFilterXML,paste("<Filter name = '",filtersL[i],"' value = '",valuesString,"' />", collapse="",sep=""),sep="")
            }
        }
    }
    else{
        if(filtersL != ""){
            if(filterType(filtersL,martL) == 'boolean' || filterType(filtersL,martL) == 'boolean_list'){
                if(!is.logical(valuesL)) stop(paste("biomaRt error: ",filtersL," is a boolean filter and needs a corresponding logical value of TRUE or FALSE to indicate if the query should retrieve all data that fulfill the boolean or alternatively that all data that not fulfill the requirement should be retrieved."), sep="") 
                if(!valuesL){
                    valuesL = 1
                }
                else{
                    valuesL = 0 
                }
                
                linkedFilterXML = paste("<Filter name = '",filtersL,"' excluded = \"",valuesL,"\" />", collapse="",sep="")
            }
            else{
                valuesString = paste(valuesL,"",collapse=",",sep="")
                linkedFilterXML = paste("<Filter name = '",filtersL,"' value = '",valuesString,"' />", collapse="",sep="")
            }
        }
        else{
            linkedFilterXML=""
        }
    }
    
    xmlQuery = paste(xmlQuery, linkedAttributeXML, linkedFilterXML,"</Dataset></Query>",sep="")
    
    if(verbose){
        cat(paste(xmlQuery,"\n", sep=""))
    }
    #postRes = postForm(paste(martHost(mart),"?",sep=""),"query"=xmlQuery)
    
    ## we choose a separator based on whether '?redirect=no' is present
    sep <- ifelse(grepl(x = martHost(mart), pattern = ".+\\?.+"), "&", "?")
    ## POST query
    postRes <- .submitQueryXML(host = paste0(martHost(mart), sep),
                               query = xmlQuery)
    

    if(length(grep("^Query ERROR", postRes))>0L)
        stop(postRes)  

    if(postRes != ""){
        con = textConnection(postRes)
        result = read.table(con, sep="\t", header=bmHeader, quote = "\"", comment.char = "", as.is=TRUE, check.names = TRUE)
        close(con)
        
        if(nrow(result) > 0 && all(is.na(result[,ncol(result)])))
            result = result[,-ncol(result),drop=FALSE]

        res_attributes <- c(attributes,attributesL)
        if(!(is(result, "data.frame") && (ncol(result)==length(res_attributes)))) {
            print(head(result))
            stop("The query to the BioMart webservice returned an invalid result: ", 
            "the number of columns in the result table does not equal the number of attributes in the query. \n",
            "Please report this on the support site at http://support.bioconductor.org")
        } 
        if(!bmHeader){  #assumes order of results same as order of attibutes in input
            colnames(result) = res_attributes

        }
    } else {
        warning("getLDS returns NULL.")
        result=NULL
    }
    return(result)
} 

######################
#getBMlist
######################

getBMlist <- function(attributes, filters = "", values = "", mart, list.names = NULL, na.value = NA, verbose=FALSE, giveWarning=TRUE){
    if(giveWarning) writeLines("Performing your query using getBM is preferred as getBMlist perfoms a separate getBM query for each of the values one gives.  This is ok for a short list but will definitely fail when used with longer lists.  Ideally one does a batch query with getBM and then iterates over that result.")
    out <- vector("list", length(attributes))
    if(is.null(list.names))
        names(out) <- attributes
    else
        names(out) <- list.names
    for(j in seq(along = attributes)){
        tmp2 <- vector("list", length(values))
        names(tmp2) <- values
        for(k in seq(along = tmp2)){
            tst <- getBM(attributes = attributes[j], filters=filters, values = values[k], mart = mart, verbose = verbose)
            
            if(class(tst) == "data.frame"){
                tmp <- unlist(unique(tst[!is.na(tst)]), use.names = FALSE)
                if(length(tmp) > 0)
                    tmp2[[k]] <- tmp
                else
                    tmp2[[k]] <- na.value
            }else{
                tmp2[[k]] <- na.value
            }
            out[[j]] <- tmp2
        }
    }
    return(out)
}


###############################
#                             #
#Ensembl specific functions   #
###############################


getGene <- function( id, type, mart){
    martCheck(mart,"ensembl") 
    checkWrapperArgs(id, type, mart)
    symbolAttrib = switch(strsplit(martDataset(mart), "_", fixed = TRUE, useBytes = TRUE)[[1]][1],hsapiens = "hgnc_symbol",mmusculus = "mgi_symbol","external_gene_id")
    typeAttrib = switch(type,affy_hg_u133a_2 = "affy_hg_u133a_v2",type)
    attrib = c(typeAttrib,symbolAttrib,"description","chromosome_name","band","strand","start_position","end_position","ensembl_gene_id")
    table = getBM(attributes = attrib,filters = type, values = id, mart=mart)
    return(table)
}

getSequence <- function(chromosome, start, end, id, type, seqType, upstream, downstream, mart, verbose=FALSE){
    martCheck(mart,c("ensembl","ENSEMBL_MART_ENSEMBL"))
    if(missing(seqType) || !seqType %in% c("cdna","peptide","3utr","5utr", "gene_exon", "transcript_exon",
                                           "transcript_exon_intron", "gene_exon_intron","coding","coding_transcript_flank",
                                           "coding_gene_flank","transcript_flank","gene_flank")) {
        stop("Please specify the type of sequence that needs to be retrieved when using biomaRt in web service mode.  Choose either gene_exon, transcript_exon,transcript_exon_intron, gene_exon_intron, cdna, coding,coding_transcript_flank,coding_gene_flank,transcript_flank,gene_flank,peptide, 3utr or 5utr")
    }
    if(missing(type))
        stop("Please specify the type argument.  If you use chromosomal coordinates to retrieve sequences, then the type argument will specify the type of gene indentifiers that you will retrieve with the sequences. ", 
             "If you use a vector of identifiers to retrieve the sequences, the type argument specifies the type of identifiers you are using.")
    if(missing(id) && missing(chromosome) && !missing(type))
        stop("No vector of identifiers given. Please use the id argument to give a vector of identifiers for which you want to retrieve the sequences.")
    if(!missing(chromosome) && !missing(id))
        stop("The getSequence function retrieves sequences given a vector of identifiers specified with the id argument of a type specified by the type argument.  Or alternatively getSequence retrieves sequences given a chromosome, a start and a stop position on the chromosome.  As you specified both a vector of identifiers and chromsomal coordinates. Your query won't be processed.")
    
    if(!missing(chromosome)){
        if(!missing(start) && missing(end))
            stop("You specified a chromosomal start position but no end position.  Please also specify a chromosomal end position.")
        if(!missing(end) && missing(start))
            stop("You specified a chromosomal end position but no start position.  Please also specify a chromosomal start position.")
        if(!missing(start)){ start = as.integer(start)
        end = as.integer(end)
        }
        if(missing(upstream) && missing(downstream)){
            sequence = getBM(c(seqType,type), 
                             filters = c("chromosome_name","start","end"), 
                             values = list(chromosome, start, end), 
                             mart = mart, 
                             checkFilters = FALSE, 
                             verbose=verbose)
        }
        else{
            if(!missing(upstream) && missing(downstream)){
                sequence = getBM(c(seqType,type), 
                                 filters = c("chromosome_name","start","end","upstream_flank"), 
                                 values = list(chromosome, start, end, upstream), 
                                 mart = mart, 
                                 checkFilters = FALSE, 
                                 verbose=verbose)
            }
            if(!missing(downstream) && missing(upstream)){
                sequence = getBM(c(seqType,type), 
                                 filters = c("chromosome_name","start","end","downstream_flank"), 
                                 values = list(chromosome, start, end, downstream), 
                                 mart = mart, 
                                 checkFilters = FALSE, 
                                 verbose = verbose)
            }
            if(!missing(downstream) && !missing(upstream)){
                stop("Currently getSequence only allows the user to specify either an upstream of a downstream argument but not both.")
            }
        }
    }
    
    if(!missing(id)){
        if(missing(type)) {
            stop("Type argument is missing. ",
            "This will be used to retrieve an identifier along with the sequence so one knows which gene it is from. ", 
            "Use the listFilters() function to select a valid type argument.")
        }
        if(!type %in% listFilters(mart, what="name")) {
            stop("Invalid type argument.  Use the listFilters() function to select a valid type argument.")
        }
        
        valuesString = paste(id,"",collapse=",",sep="")
        if(missing(upstream) && missing(downstream)){
            sequence = getBM(c(seqType,type), filters = type, values = id, mart = mart, verbose=verbose)
        }
        else{
            if(!missing(upstream) && missing(downstream)){
                sequence = getBM(c(seqType,type), filters = c(type, "upstream_flank"), values = list(id, upstream), mart = mart, checkFilters = FALSE, verbose=verbose)
            }
            if(!missing(downstream) && missing(upstream)){
                sequence = getBM(c(seqType,type), filters = c(type, "downstream_flank"), values = list(id, downstream), mart = mart, checkFilters = FALSE, verbose=verbose)
            }
            if(!missing(downstream) && !missing(upstream)){
                stop("Currently getSequence only allows the user to specify either an upstream of a downstream argument but not both.")
            }
        }
    }
    return(sequence)
}

####################
#export FASTA      #
####################

exportFASTA <- function( sequences, file ) {
    if( missing( sequences ) || class( sequences ) != "data.frame"){
        stop("No data.frame given to write FASTA.  The data.frame should be the output of the getSequence function.");
    }
    if( missing(file)){
        stop("Please provide filename to write to");
    }
    if(length(sequences[1,]) == 2){
        for(i in seq(along = sequences[,2])){
            cat(paste(">",sequences[i,2],"\n",sep=""),file = file, append=TRUE);
            cat(as.character(sequences[i,1]),file = file, append = TRUE);
            cat("\n\n", file = file, append = TRUE);
        }
    }
    else{
        for(i in seq(along = sequences[,2])){
            cat(paste(">chromosome_",sequences[i,1],"_start_",sequences[i,2],"_end_",sequences[i,3],"\n",sep=""),file = file, append=TRUE);
            cat(as.character(sequences[i,4]),file = file, append = TRUE);
            cat("\n\n", file = file, append = TRUE);
        }
    }  
}

###################
#Nature Protocol
###################

NP2009code <- function(){
    edit(file=system.file('scripts', 'Integration-NP.R', package = 'biomaRt'))
}

Try the biomaRt package in your browser

Any scripts or data that you put into this service are public.

biomaRt documentation built on Feb. 11, 2021, 2:01 a.m.