R/utilityFunctions.R

Defines functions listFilterValues searchFilterValues searchFilters searchAttributes searchDatasets .searchInternal .processResults .fetchHTMLresults .submitQueryXML .createErrorMessage .cleanHostURL .generateFilterXML .createFilterXMLchunk .splitValues .setResultColNames

Documented in listFilterValues searchAttributes searchDatasets searchFilters searchFilterValues

## sometimes results can be returned by getMB() in a different order to we 
## asked for them, which messes up the column names.  Here we try to match
## results to known attribute names and rename accordingly.
.setResultColNames <- function(result, mart, attributes, bmHeader = FALSE) {
    
    ## get all avaialble sttributes and 
    ## filter only for the ones we've actually asked for
    att <- listAttributes(mart, what = c("name", "description"))
    att <- att[which(att[,'name'] %in% attributes), ]
    if(length(which(duplicated(att[,'description']))) > 
       length(which(duplicated(att)))) {
        warning("Cannot unambiguously match attribute names
                Ignoring bmHeader argument and using biomart 
                description field")
        return(result)
    }
    
    resultNames = colnames(result)
    ## match the returned column names with the attribute names
    matches <- match(resultNames, att[,2], NA)
    if(any(is.na(matches))) {
        warning("Problems assigning column names.",
                "Currently using the biomart description field.", 
                "You may wish to set these manually.")
        return(result)
    }
    ## if we want to use the attribute names we specified, do this, 
    ## otherwise we use the header returned with the query
    if(!bmHeader) {
        colnames(result) = att[matches, 1]
    }
    ## now put things in the order we actually asked for the attributes in
    result <- result[, match(att[matches,1], attributes), drop=FALSE]
    
    return(result)
}

## BioMart doesn't work well if the list of values provided to a filter is 
## longer than 500 values.  It returns only a subset of the requested data
## and does so silently!  This function is designed to take a list of provided
## filters, and split any longer than 'maxChunkSize'.  It operates recursively
## incase there are multiple filters that need splitting, and should ensure
## all possible groupings of filters are retained.
.splitValues <- function(valuesList, maxChunkSize = 500) {
    
    vLength <- vapply(valuesList[[1]], FUN = length, FUN.VALUE = integer(1))
    
    if(all(vLength <= maxChunkSize)) {
        return(valuesList)
    } else {
        ## pick the next filter to split
        vIdx <- min(which(vLength > maxChunkSize))
        
        nchunks <- (vLength[vIdx] %/% maxChunkSize) + 1
        splitIdx <- rep(1:nchunks, each = ceiling(vLength[vIdx] / nchunks))[ 1:vLength[vIdx] ]
        
        ## a new list we will populate with the chunks
        tmpList <- list()
        for(i in 1:nchunks) {
            for( j in 1:length(valuesList) ) {
                listIdx <- ((i - 1) * length(valuesList)) + j
                tmpList[[ listIdx ]] <- valuesList[[j]]
                tmpList[[ listIdx ]][[ vIdx ]] <- tmpList[[ listIdx ]][[ vIdx ]][which(splitIdx == i)]
            }
        }
        ## recursively call the function to process next filter
        valuesList <- .splitValues(tmpList, maxChunkSize = maxChunkSize)
    }
    return(valuesList)
}

## Creating the filter XML for a single chunk of values.  Returns a character
## vector containing the XML lines for all specified filters & their 
## attributes spliced together into a single string.
.createFilterXMLchunk <- function(filterChunk, mart) {
    
    individualFilters <- vapply(names(filterChunk), 
        FUN = function(filter, values, mart) {
            
            ## if the filter exists and is boolean we do this
            if(filter %in% listFilters(mart, what = "name") && 
               grepl('boolean', filterType(filter = filter, mart = mart)) ) {
                if(!is.logical(values[[filter]])) 
                    stop("'", filter, 
                         "' is a boolean filter and needs a ",
                         "corresponding logical value of TRUE or FALSE to ",
                         "indicate if the query should retrieve all data that ",
                         "fulfill the boolean or alternatively that all data ", 
                         "that not fulfill the requirement should be retrieved.", 
                         call. = FALSE)
                val <- ifelse(values[[filter]], yes = 0, no = 1)
                val <- paste0("\" excluded = \"", val, "\" ")
                
            } else { 
                ## otherwise the filter isn't boolean, or doesn't exist
                
                if(is.numeric(values[[filter]])) 
                    values[[filter]] <- as.integer(values[[filter]])
                val <- paste0(values[[filter]], collapse = ",")
                ## convert " to ' to avoid truncating the query string
                val <- gsub(x = val, pattern = "\"", replacement = "'", fixed = TRUE)
                val <- paste0('" value = "', val, '" ')
            }
            filterXML <- paste0("<Filter name = \"", filter, val, "/>")
            return(filterXML)
        }, FUN.VALUE = character(1), 
        filterChunk, mart,
        USE.NAMES = FALSE)
    
    filterXML <- paste0(individualFilters, collapse = "")
    return(filterXML)
}

.generateFilterXML <- function(filters = "", values, mart) {
    
    ## return empty string if no filter specified & this isn't ensembl
    ## specifying no filter is generally bad, as it will get 'everything'
    ## and we might encounter the time out problem
    if(filters[1] == "") {
        return("")
    }
    ## if we have multiple filters, the values must be specified as a list.
    if(length(filters) > 1 && class(values) != "list") {
        stop("If using multiple filters, the 'value' has to be a list.",
        "\nFor example, a valid list for 'value' could be: list(affyid=c('1939_at','1000_at'), chromosome= '16')",
        "\nHere we select on Affymetrix identifier and chromosome, only results that pass both filters will be returned");
    } 
    ## it's easy to not realise you're passing a data frame here, so check
    if(is.data.frame(values) && ncol(values == 1)) {
        values <- values[,1]
    }

    if(!is.list(values)){
        values <- list(values)
    }
    names(values) <- filters
    
    values <- .splitValues(list(values), maxChunkSize = 5000)
    
    filterXML_list <- lapply(values, .createFilterXMLchunk, mart)
    
    return(filterXML_list)
}

# it seems like pretty common practice for users to copy and paste the host
# name from a browser if they're not accessing Ensembl.  Typically this will
# include the "http://" and maybe a trailing "/" and this messes up our
# paste the complete URL strategy and produces something invalid.  
# This function tidies that up to catch common variants.
.cleanHostURL <- function(host) {
    
    ## strip trailing slash
    host <- gsub(pattern = "/$", replacement = "", x = host)
    
    ## just supplying 'ensembl.org' is no longer handled correctly
    ## stick 'www' infront if we see this
    if( grepl(pattern = "^ensembl\\.org$", x = host) ) {
        host = "www.ensembl.org"
    }
    
    ## only prepend http if needed 
    if(!grepl(pattern = "^http://|^https://", x = host)) {
        host <- paste0("http://", host)
    }
    
    return(host)
}

.createErrorMessage <- function( error_code, host = "" ) {
  
  ## if we encounter internal server error, suggest using a mirror
  if( error_code == 500) {
    err_msg <- 'biomaRt has encountered an unexpected server error.'
  } else if ( error_code == 509) {
    err_msg <- 'biomaRt has exceeded the bandwidth allowance with this server.'
  }
  
  if( grepl("ensembl", x = host) ) {
      err_msg <- c(err_msg, '\nConsider trying one of the Ensembl mirrors (for more details look at ?useEnsembl)')
  }

  return(err_msg)
}

# ensembl redirection doesn't seem to be working properly as of 12-12-2017
# This is a wrapper function to catch POSTS that are redirected and fail
# The new host is captured from the header and used in a re-submission
.submitQueryXML <- function(host, query) {
    res <- httr::POST(url = host,
                      body = list('query' = query),
                      #set_cookies(.cookies = c(redirect_mirror = 'no')),
                      timeout(300))

    if( httr::http_error(res) ) {
        err_msg <- .createErrorMessage( error_code = status_code(res), host = host )
        stop(err_msg, call. = FALSE)
    }
    
    ## now we set the redirection cookie, this code should never be executed
    if(status_code(res) == 302) {
        host <- stringr::str_match(string = res$all_headers[[1]]$headers$location,
                               pattern = "//([a-zA-Z./]+)\\??;?redirectsrc")[,2]
        res <- httr::POST(url = host,
                          body = list('query' = query),
                          config = list(timeout(600)))
    }
    ## content() prints a message about encoding not being supplied 
    ## for ensembl.org - no default, so we suppress it
    return( suppressMessages(content(res)) )
}

# if parsing of TSV results fails, try this
.fetchHTMLresults <- function(host, query) {
    query = gsub(x = query, pattern = "TSV", replacement = "HTML", fixed = TRUE)
    html_res <- .submitQueryXML(host, query)
    XML::readHTMLTable(html_res, stringsAsFactors = FALSE)[[1]]
}

.processResults <- function(postRes, mart, sep, fullXmlQuery, verbose, callHeader, quote, attributes) {

    if(verbose){
      writeLines("#################\nResults from server:")
      print(postRes)
    }
    if(!(is.character(postRes) && (length(postRes)==1L))) {
      stop("The query to the BioMart webservice returned an invalid result\n",
      "biomaRt expected a character string of length 1.\n",
      "Please report this on the support site at http://support.bioconductor.org")
    }
    
    if(gsub("\n", "", postRes, fixed = TRUE, useBytes = TRUE) == "") { # meaning an empty result
      
      result = as.data.frame(matrix(ncol=length(attributes), nrow=0))
      
    } else {
      
      if(length(grep("^Query ERROR", postRes))>0L)
        stop(postRes)
      
      ## convert the serialized table into a dataframe
      con = textConnection(postRes)
      on.exit(close(con))
      result <- tryCatch(read.table(con, sep="\t", header=callHeader, quote = quote, 
                                    comment.char = "", stringsAsFactors = FALSE, check.names = FALSE),
                         error = function(e) {
                           ## if the error relates to number of element, try reading HTML version
                           if(grepl(x = e, pattern = "line [0-9]+ did not have [0-9]+ elements"))
                             .fetchHTMLresults(host = paste0(martHost(mart), sep), query = fullXmlQuery)
                           else
                             stop(e)
                         }
      )
      if(verbose){
        writeLines("#################\nParsed results:")
        print(result)
      }
      
      if(!(is(result, "data.frame") && (ncol(result)==length(attributes)))) {
        print(head(result))
        stop("The query to the BioMart webservice returned an invalid result.\n", 
        "The number of columns in the result table does not equal the number of attributes in the query.\n",
        "Please report this on the support site at http://support.bioconductor.org")
      }
    }
    return(result)
}

##############################################
## searching Attributes, Filters, and Datasets
##############################################

# given a data.frame, searches every column for
# the value in 'pattern'
# returns index of rows containing a match
.searchInternal <- function(pattern, data) {
    colIdx <- vapply(data, 
                     FUN = stringr::str_detect, 
                     FUN.VALUE = logical(length = nrow(data)), 
                     pattern = pattern)
    rowIdx <- apply(colIdx, 1, any)
    
    ## return either the matching rows, or NULL
    if(any(rowIdx)) {
        return(data[rowIdx,])
    } else {
        message('No matching datasets found')
        return(NULL)
    }
}

 
searchDatasets <- function(mart, pattern) {
    
    if(missing(mart))
        stop("Argument 'mart' must be specified")
    if(missing(pattern)) 
        pattern = ".*"
    
    datasets <- listDatasets(mart)
    res <- .searchInternal(pattern = pattern, data = datasets)
    
    if(is.null(res))
        invisible(res)
    else
        res
}


searchAttributes <- function(mart, pattern) {
    
    if(missing(mart))
        stop("Argument 'mart' must be specified")
    if(missing(pattern)) 
        pattern = ".*"
    
    attributes <- listAttributes(mart)
    res <- .searchInternal(pattern = pattern, data = attributes)
    
    if(is.null(res))
        invisible(res)
    else
        res
}

searchFilters <- function(mart, pattern) {
    
    if(missing(mart))
        stop("Argument 'mart' must be specified")
    if(missing(pattern)) 
        pattern = ".*"
    
    filters <- listFilters(mart)
    res <- .searchInternal(pattern = pattern, data = filters)
    
    if(is.null(res))
       invisible(res)
    else
        res
}


## Some filters have a predefined list of options that can be selected.
## This function lets us search those values, given a specified filter.
searchFilterValues <- function(mart, filter, pattern) {
  
  if(missing(mart))
    stop("Argument 'mart' must be specified")
  if(missing(filter))
    stop("Argument 'filter' must be specified")
  if(missing(pattern)) 
    pattern = ".*"
  
  ## first get all filters & their options, then reduce to what's requested
  filters <- listFilters(mart, what = c("name", "options"))
  filters <- filters[ filters$name == filter, ]
  if(nrow(filters) == 0) { 
    stop("Filter '", filter, "' not found.")
  }
  options <- gsub(filters$options, pattern = "^\\[|\\]$", replacement = "")
  options <- strsplit(options, split = ",", fixed = TRUE)[[1]]
  
  res <- grep(x = options, pattern = pattern, 
              ignore.case = TRUE, value = TRUE)
  
  if(length(res) == 0)
    message('No matching values found')
  else
    res
}


listFilterValues <- function(mart, filter) {
    searchFilterValues(mart = mart, filter = filter)
}

Try the biomaRt package in your browser

Any scripts or data that you put into this service are public.

biomaRt documentation built on Feb. 11, 2021, 2:01 a.m.