R/organismFilters.R

Defines functions organismFilters

Documented in organismFilters

#' @title Retrieve Ensembl Biomart filters for a query organism
#' @description In addition to the \code{\link{organismBM}} and
#' \code{\link{organismAttributes}} functions, this function
#' returns all available filters that can be accessed through different marts
#' and datasets for a given query organism.
#' @param organism a character string specifying the scientific name of a
#' query organism.
#' @param update a logical value specifying whether or not the local
#' listMarts.txt, listDatasets.txt, and listFilters_organism.txt files shall be
#' updated by remote access to BioMart.
#' @param topic a character string specifying a topic (category) of filters,
#' e.g. \code{topic} = \code{"id"}.
#' @author Hajk-Georg Drost
#' @return a data.frame storing corresponding filter names, description,
#' datasets, and marts.
#' @details
#' For a given query organism, this function retrieves all available
#' filters that can be accessed through different marts and datasets.
#'
#' Sometimes the same filter names correspond to different datasets and
#' marts causing problems when using \code{\link{getMarts}}.
#' The approach introduced by this function provides (again) an
#' organism-centric way of accessing organism-specific filters.
#'
#' The \code{topic} argument allows the user to search for specific filter
#' topics/categories for faster selection.
#' @note
#' When you run this function for the first time, the data retrieval procedure
#' will take some time, due to the remote access to BioMart. The corresponding
#' results are then saved as *.txt files within the \code{\link{tempdir}}
#' directory, named "_biomart/listMarts.txt", "_biomart/listDatasets.txt", and
#' "_biomart/listFilters_organism.txt", allowing subsequent queries to perform
#' much faster.
#' @examples
#' \dontrun{
#' # search for filter topic "id"
#' head(organismFilters("Homo sapiens", topic = "id"), 20)
#' }
#' @references
#' \url{http://biomart.org/}
#'
#' Mapping identifiers for the integration of genomic datasets with the
#' R/Bioconductor package biomaRt. Steffen Durinck, Paul T. Spellman, Ewan
#' Birney and Wolfgang Huber, Nature Protocols 4, 1184-1191 (2009).
#'
#' BioMart and Bioconductor: a powerful link between biological databases and
#' microarray data analysis. Steffen Durinck, Yves Moreau, Arek Kasprzyk, Sean
#' Davis, Bart De Moor, Alvis Brazma and Wolfgang Huber, Bioinformatics 21,
#' 3439-3440 (2005).
#' @family biomaRt
#' @export
organismFilters <- function(organism,
                            update = FALSE,
                            topic = NULL) {
    # Overview of all marts and datasets that are available for the organism.
    orgBM <- organismBM(organism = organism, update = update)
    message("\n")
    message(
        "Starting retrieval of all available BioMart filters for ",
        organism,
        " ..."
    )

    orgMarts <- names(table(orgBM$mart))

    # Split the overview into one data.frame per mart. Plain indexing is used
    # on purpose: inside dplyr::filter() an expression such as `mart == mart`
    # resolves both sides to the `mart` column and therefore keeps every row,
    # which duplicated all datasets for each mart and mislabelled them.
    martList <- lapply(orgMarts, function(martName) {
        as.data.frame(orgBM[orgBM$mart %in% martName, , drop = FALSE])
    })

    # Location of the per-organism filter cache inside the session tempdir.
    filtersTXT <-
        paste0("listFilters_", stringr::str_replace(organism, " ", "_"))
    cacheDir <- file.path(tempdir(), "_biomart")
    cacheFile <- file.path(cacheDir, paste0(filtersTXT, ".txt"))

    if (!file.exists(cacheDir)) {
        dir.create(cacheDir)
    }

    # Query BioMart when no cache exists yet or when the caller explicitly
    # asked for a refresh via `update = TRUE`.
    if (isTRUE(update) || !file.exists(cacheFile)) {
        filtersList <- lapply(martList, function(martInfo) {
            perDataset <- lapply(seq_len(nrow(martInfo)), function(i) {
                datasetName <- martInfo$dataset[i]
                martName <- martInfo$mart[i]

                # The structural variation datasets are deliberately skipped.
                orgPrefix <-
                    unlist(stringr::str_split(datasetName, "_"))[1]
                skipped <-
                    paste0(orgPrefix, c("_structvar_som", "_structvar"))
                if (is.element(datasetName, skipped)) {
                    return(NULL)
                }

                message(
                    "Processing mart ", martName,
                    " and dataset ", datasetName, " ..."
                )

                # A failing dataset must not abort the whole retrieval:
                # report it and contribute no rows instead.
                tryCatch({
                    filters_tbl <- getFilters(dataset = datasetName,
                                              mart    = martName)
                    datasetVec <- rep(datasetName, nrow(filters_tbl))
                    dplyr::mutate(filters_tbl, dataset = datasetVec)
                }, error = function(e) {
                    message("No entries found ...")
                    NULL
                })
            })

            mart_tbl <- do.call(rbind, perDataset)

            # rbind() over only NULL entries yields NULL (every dataset of
            # this mart was skipped or failed); there is nothing to label.
            if (is.null(mart_tbl)) {
                return(NULL)
            }

            martVec <- rep(martInfo$mart[1], nrow(mart_tbl))
            dplyr::mutate(mart_tbl, mart = martVec)
        })

        allFilters <- do.call(rbind, filtersList)

        if (is.null(allFilters)) {
            stop(
                "No BioMart filters could be retrieved for '", organism, "'.",
                call. = FALSE
            )
        }

        utils::write.table(
            allFilters,
            cacheFile,
            sep       = "\t",
            quote     = FALSE,
            col.names = TRUE,
            row.names = FALSE
        )
    }

    # Always return what is stored in the cache so that fresh and cached
    # calls yield the same object type and column types.
    filterTable <-
        readr::read_tsv(
            cacheFile,
            col_names = TRUE,
            col_types = readr::cols(
                "name" = readr::col_character(),
                "description" = readr::col_character(),
                "dataset" = readr::col_character(),
                "mart" = readr::col_character()
            )
        )

    if (is.null(topic)) {
        return(filterTable)
    }

    # Keep only filters whose name matches the requested topic pattern;
    # which() drops NA matches arising from missing filter names.
    findTopic <- which(stringr::str_detect(filterTable$name, topic))

    if (length(findTopic) == 0)
        stop("Unfortunately the topic '", topic , "' could not be found.")

    filterTable[findTopic, ]
}
ropensci/biomartr documentation built on Dec. 11, 2023, 5:37 a.m.