R/XenaHub-class.R

Defines functions `%||%` .collapse_list XenaDataUpdate XenaHub xena_default_hosts

Documented in XenaDataUpdate xena_default_hosts XenaHub

# FUN: Set Class & operate XenaHub object directly

##' Class XenaHub
##' @description  a S4 class to represent UCSC Xena Data Hubs
##' @slot hosts hosts of data hubs
##' @slot cohorts cohorts of data hubs
##' @slot datasets datasets of data hubs
##' @importFrom methods new
##' @export
.XenaHub <- setClass(
  "XenaHub",
  representation = representation(
    hosts = "character",
    cohorts = "character",
    datasets = "character"
  )
)

setMethod("show", "XenaHub", function(object) {
  showsome <- function(label, x) {
    len <- length(x)
    if (len > 6) {
      x <- c(head(x, 3), "...", tail(x, 2))
    }
    cat(label,
      "() (",
      len,
      " total):",
      "\n  ",
      paste0(x, collapse = "\n  "),
      "\n",
      sep = ""
    )
  }
  cat("class:", class(object), "\n")
  cat("hosts():",
    "\n  ", paste0(hosts(object), collapse = "\n  "),
    "\n",
    sep = ""
  )
  showsome("cohorts", cohorts(object))
  showsome("datasets", datasets(object))
})


##' @title UCSC Xena Default Hosts
##' @description Return Xena default hosts
##' @return A character vector include current defalut hosts
##' @author Shixiang Wang <w_shixiang@163.com>
##' @seealso [UCSCXenaTools::XenaHub()]
##' @export
xena_default_hosts <- function() {
  c(
    "https://ucscpublic.xenahubs.net",
    "https://tcga.xenahubs.net",
    "https://gdc.xenahubs.net",
    "https://icgc.xenahubs.net",
    "https://toil.xenahubs.net",
    "https://pancanatlas.xenahubs.net",
    "https://xena.treehouse.gi.ucsc.edu:443",
    "https://pcawg.xenahubs.net",
    "https://atacseq.xenahubs.net",
    "https://singlecellnew.xenahubs.net",
    "https://kidsfirst.xenahubs.net"
    #"https://tdi.xenahubs.net"
  )
}

.xena_hosts <- c(
  "publicHub",
  "tcgaHub",
  "gdcHub",
  "icgcHub",
  "toilHub",
  "pancanAtlasHub",
  "treehouseHub",
  "pcawgHub",
  "atacseqHub",
  "singlecellHub",
  "kidsfirstHub"
  #"tdiHub"
)

names(.xena_hosts) <- xena_default_hosts()

# Add Hiplot mirror url
# Still use UCSC Xena URL if it is not available
.xena_hosts_hiplot <- .xena_hosts
names(.xena_hosts_hiplot) <- c(
  "https://xena-ucscpublic.hiplot.com.cn",
  "https://xena-tcga.hiplot.com.cn",
  "https://xena-gdc.hiplot.com.cn",
  "https://xena-icgc.hiplot.com.cn",
  "https://xena-toil.hiplot.com.cn",
  "https://xena-pancanatlas.hiplot.com.cn",
  "https://xena.treehouse.gi.ucsc.edu:443", #!
  "https://xena-pcawg.hiplot.com.cn",
  "https://xena-atacseq.hiplot.com.cn",
  "https://singlecellnew.xenahubs.net", #!
  "https://kidsfirst.xenahubs.net" #!
  #"https://tdi.xenahubs.net" #!
)
# Map hiplot to ucsc
.xena_mirror_map <- names(.xena_hosts)
names(.xena_mirror_map) <- names(.xena_hosts_hiplot)
# Map ucsc to hiplot
.xena_mirror_map_rv <- names(.xena_hosts_hiplot)
names(.xena_mirror_map_rv) <- names(.xena_hosts)

##' Generate a XenaHub Object
##'
##' It is used to generate original
##' `XenaHub` object according to hosts, cohorts, datasets or hostName.
##' If these arguments not specified, all hosts and corresponding datasets
##' will be returned as a `XenaHub` object. All datasets can be found
##' at <https://xenabrowser.net/datapages/>.
##'
##'
##' @param hosts a character vector specify UCSC Xena hosts, all available hosts can be
##' found by `xena_default_hosts()` function. `hostName` is a more recommend option.
##' @param cohorts default is empty character vector, all cohorts will be returned.
##' @param datasets default is empty character vector, all datasets will be returned.
##' @param hostName name of host, available options can be accessed by `.xena_hosts`
##'  This is an easier option for user than `hosts` option. Note, this option
##'  will overlap `hosts`.
##' @return a [XenaHub] object
##' @author Shixiang Wang <w_shixiang@163.com>
##' @export
##' @importFrom httr stop_for_status POST content
##' @importFrom utils head tail
##' @examples
##' \dontrun{
##' #1 query all hosts, cohorts and datasets
##' xe = XenaHub()
##' xe
##' #2 query only TCGA hosts
##' xe = XenaHub(hostName = "tcgaHub")
##' xe
##' hosts(xe)     # get hosts
##' cohorts(xe)   # get cohorts
##' datasets(xe)  # get datasets
##' samples(xe)   # get samples
##' }
XenaHub <- function(hosts = xena_default_hosts(),
                    cohorts = character(),
                    datasets = character(),
                    hostName = c(
                      "publicHub",
                      "tcgaHub",
                      "gdcHub",
                      "icgcHub",
                      "toilHub",
                      "pancanAtlasHub",
                      "treehouseHub",
                      "pcawgHub",
                      "atacseqHub",
                      "singlecellHub",
                      "kidsfirstHub"
                      #"tdiHub"
                    )) {
  stopifnot(
    is.character(hosts),
    is.character(cohorts),
    is.character(datasets)
  )

  hostName <- unique(hostName)

  if (length(hostName) != length(.xena_hosts) &
    all(
      hostName %in% .xena_hosts
    )) {
    .temp <- names(.xena_hosts)
    names(.temp) <- .xena_hosts
    hostNames <- .temp %>%
      as.data.frame() %>%
      t() %>%
      as.data.frame()
    rm(.temp)

    hosts <- as.character(hostNames[, hostName])
  } else if (!all(hostName %in% .xena_hosts)) {
    stop("Bad hostName, please check")
  }

  if (is.null(names(hosts))) {
    names(hosts) <- hosts
  }

  hosts0 <- hosts
  hosts <- Filter(.host_is_alive, hosts)
  if (length(hosts) == 0L) { # nocov start
    stop(
      "\n  no hosts responding:",
      "\n    ",
      paste0(hosts0, collapse = "\n  ")
    )
  } # nocov end

  all_cohorts <- unlist(.host_cohorts(hosts), use.names = FALSE)
  if (length(cohorts) == 0L) {
    cohorts <- all_cohorts
  } else {
    hosts <- hosts[.cohort_datasets_count(hosts, cohorts) != 0L]
  }

  all_datasets <- unlist(.cohort_datasets(hosts, cohorts),
    use.names = FALSE
  )
  if (length(datasets) == 0L) {
    datasets <- all_datasets
  } else {
    if (!all(datasets %in% all_datasets)) { # nocov start
      bad_dataset <- datasets[!datasets %in% all_datasets]
      message("Following datasets are not in datasets of hosts, ignore them...")
      message(bad_dataset)
    } # nocov end
    datasets <- all_datasets[all_datasets %in% datasets]
  }


  .XenaHub(
    hosts = hosts,
    cohorts = cohorts,
    datasets = datasets
  )
}

##' Get or Update Newest Data Information of UCSC Xena Data Hubs
##' @param saveTolocal logical. Whether save to local R package data directory for permanent use
##' or Not.
##' @return a `data.frame` contains all datasets information of Xena.
##' @author Shixiang Wang <w_shixiang@163.com>
##' @export
##' @examples
##' \dontrun{
##' XenaDataUpdate()
##' XenaDataUpdate(saveTolocal = TRUE)
##' }
XenaDataUpdate <- function(saveTolocal = TRUE) { # nocov start
  # .p_all_cohorts(list(unique(XenaData$XenaHosts)[10]), exclude = list(NULL))
  # .p_dataset_list(list(XenaData$XenaHosts[1]), list(XenaData$XenaCohorts[1]))
  message("Disable hiplot firstly.")
  options(use_hiplot = FALSE)
  try_query = function(h, max_try = 3L) {
    Sys.sleep(0.1)
    tryCatch(
      {
        message("==> Trying #", abs(max_try - 4L))
        .p_all_cohorts(list(h), exclude = list(NULL))
      },
      error = function(e) {
        if (max_try == 1) {
          warning("Tried 3 times but failed, this hub may down or please check URL or your internet connection!", immediate. = TRUE)
          return(NULL)
        } else {
          try_query(h, max_try - 1L)
        }
      }
    )
  }

  query_host = function(h) {
      message("==> Searching cohorts for host ", h, "...")

      chs <- try_query(h, max_try = 3L)
      if (is.null(chs)) {
          return(NULL)
      }
      chs <- setdiff(chs, "(unassigned)")
      message("===> #", length(chs), " cohorts found.")
      message("===> Querying datasets info...")
      zz <- lapply(chs, function(x, h) {
          .p_dataset_list(list(h), list(x))
      }, h = h) %>%
          stats::setNames(chs) %>%
          dplyr::bind_rows(.id = "XenaCohorts")
      message("===> #", nrow(zz), " datasets found.")
      message("==> Done for host ", h, "...")
      zz
  }

  message("=> Obtaining info from UCSC Xena hubs...")
  XenaInfo <- lapply(names(.xena_hosts), query_host) %>%
    stats::setNames(names(.xena_hosts)) %>%
    dplyr::bind_rows(.id = "XenaHosts")

  message("=> Done for obtaining.")
  message("=> Parsing datasets metadata...")

  XenaInfo <- XenaInfo %>%
    dplyr::rename(
      XenaDatasets = .data$name,
      SampleCount = .data$count,
      DataSubtype = .data$datasubtype,
      Type = .data$type,
      LongTitle = .data$longtitle,
      ProbeMap = .data$probemap
    ) %>%
    dplyr::mutate(XenaHostNames = .xena_hosts[.data$XenaHosts])

  j_data <- lapply(XenaInfo$text, function(x) {
    # decode metadata from json format
    # note json data may have different elements for
    #                different cohort datasets
    # more work need to be done here
    #
    # tt$text contains metadata for dataset
    # tt$pmtext contains metadata for probemap
    json_df <- jsonlite::parse_json(x)
    dplyr::tibble(
      Citation = json_df[["citation"]] %||% NA,
      Label = json_df[["label"]] %||% NA,
      Tags = .collapse_list(json_df[["tags"]]) %||% NA,
      AnatomicalOrigin = .collapse_list(json_df[["anatomical_origin"]]) %||% NA,
      SampleType = .collapse_list(json_df[["sample_type"]]) %||% NA,
      Version = json_df[["version"]] %||% NA,
      PrimaryDisease = json_df[["primary_disease"]] %||% NA,
      Platform = json_df[["platform"]] %||% NA,
      Unit = json_df[["unit"]] %||% NA
    )
  })

  message("=> Done for parsing. Tidying...")

  tidy_data <- dplyr::bind_rows(j_data)
  XenaData <- dplyr::bind_cols(XenaInfo, tidy_data)
  XenaData <- dplyr::as_tibble(XenaData)

  XenaData <- XenaData %>%
    dplyr::select(
      c(
        "XenaHosts", "XenaHostNames", "XenaCohorts", "XenaDatasets", "SampleCount",
        "DataSubtype", "Label", "Type", "AnatomicalOrigin", "SampleType",
        "Tags", "ProbeMap", "LongTitle", "Citation", "Version",
        "Unit", "Platform"
      )
    )

  message("=> Tidying done.")

  if (saveTolocal) {
    message("=> Saving...")
    data_dir <- base::system.file("data", package = "UCSCXenaTools")
    if (dir.exists(data_dir)) {
      save(XenaData, file = file.path(data_dir, "XenaData.rda"))
    } else {
      message("There is no data directory ", data_dir)
      message("Please check it.")
    }
  }
  message("=> Done.")
  XenaData
} # nocov end

.collapse_list <- function(x) {
  sapply(x, function(x) x) %>% paste0(collapse = ",")
}

`%||%` <- function(x, y) {
  # ifelse(is.null(x), y, x)
  if (is.null(x)) {
    y
  } else {
    x
  }
}

utils::globalVariables(c(
  ".p_dataset_metadata",
  ".p_all_cohorts",
  ".p_dataset_list"
))
ropensci/UCSCXenaTools documentation built on Jan. 27, 2024, 6:30 a.m.