R/download_data.R

Defines functions clear_iobr_cache list_github_datasets reset_iobr_cache_dir set_iobr_cache_dir get_iobr_cache_dir reset_iobr_mirrors list_iobr_mirrors add_iobr_mirror get_default_mirrors download_iobr_data

Documented in add_iobr_mirror clear_iobr_cache download_iobr_data get_default_mirrors get_iobr_cache_dir list_github_datasets list_iobr_mirrors reset_iobr_cache_dir reset_iobr_mirrors set_iobr_cache_dir

#' Download IOBR Data from GitHub with Mirror Support
#'
#' @description
#' Downloads large datasets from GitHub releases to avoid CRAN size limits.
#' Supports multiple download mirrors for users in different regions.
#' Data is cached locally after first download. Cache directory can be
#' customized via `options(IOBR.cache_dir = "your/path")`.
#'
#' @param name Character string. Name of the dataset to download.
#' @param force Logical. Whether to force re-download even if cached. Default: FALSE.
#' @param verbose Logical. Whether to print progress messages. Default: TRUE.
#' @param mirrors Character vector. URLs of mirrors to try. Default uses
#'   get_default_mirrors().
#' @param cache_dir Character string. Custom cache directory. If NULL, uses
#'   the option `IOBR.cache_dir` or the default system cache location.
#'
#' @return The requested dataset.
#' @export
#'
#' @examples
#' \dontrun{
#' tcga_sig <- download_iobr_data("tcga_stad_sig")
#' }
download_iobr_data <- function(name, force = FALSE, verbose = TRUE,
                               mirrors = get_default_mirrors(),
                               cache_dir = NULL) {
  # Get all available GitHub datasets
  github_data <- list_github_datasets()

  if (!name %in% github_data) {
    stop(sprintf(
      "Dataset '%s' not available for download. Available: %s",
      name, paste(github_data, collapse = ", ")
    ))
  }

  # Set up cache directory (priority: argument > option > default)
  cache_dir <- get_iobr_cache_dir(cache_dir)
  if (!dir.exists(cache_dir)) {
    dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
  }

  cache_file <- file.path(cache_dir, sprintf("%s.rda", name))

  # Check if already cached
  if (file.exists(cache_file) && !force) {
    if (verbose) cli::cli_alert_info("Loading cached data: {.val {name}}")
    env <- new.env()
    load(cache_file, envir = env)
    # Return the object (might be named 'data' or the actual name)
    obj_names <- ls(env)
    if (length(obj_names) == 1) {
      return(env[[obj_names[1]]])
    } else {
      return(env[[name]])
    }
  }

  # Check for internet connection if no local file exists
  if (!file.exists(cache_file)) {
    has_internet <- function() {
      # Test both Google and Baidu, return TRUE if either works
      # Using sequential approach to avoid fork() issues on macOS
      urls <- c("https://www.google.com", "https://www.baidu.com")
      for (url_str in urls) {
        result <- tryCatch({
          con <- url(url_str)
          tryCatch({
            readLines(con, n = 1)
            TRUE
          }, error = function(e) FALSE,
            finally = {
              tryCatch(close(con), error = function(e) NULL)
            })
        }, error = function(e) FALSE)
        if (result) return(TRUE)
      }
      FALSE
    }

    if (!has_internet()) {
      cli::cli_alert_warning("No internet connection available and dataset {.val {name}} is not in cache.")
      cli::cli_alert_info("To comply with CRAN policies, returning {.code NULL} instead of an error.")
      return(NULL)
    }
  }

  # Try each mirror until success
  last_error <- NULL
  for (i in seq_along(mirrors)) {
    mirror <- mirrors[i]

    # Construct URL based on mirror type
    if (grepl("github\\.com", mirror)) {
      # GitHub or GitHub proxy
      url <- sprintf(
        "%s/IOBR/IOBR/releases/download/data-v1.0/%s.rda",
        mirror, name
      )
    } else {
      # Standalone mirror (e.g., biotree.top)
      # Ensure there's a slash between mirror and name
      url <- sprintf("%s/%s.rda", sub("/$", "", mirror), name)
    }

    if (verbose) {
      cli::cli_alert_info("Trying mirror {i}/{length(mirrors)}: {.url {mirror}}")
    }

    tryCatch(
      {
        utils::download.file(url, cache_file, mode = "wb", quiet = !verbose)

        if (file.exists(cache_file) && file.size(cache_file) > 0) {
          if (verbose) cli::cli_alert_success("Download complete: {.val {name}}")

          env <- new.env()
          load(cache_file, envir = env)
          obj_names <- ls(env)
          if (length(obj_names) == 1) {
            return(env[[obj_names[1]]])
          } else {
            return(env[[name]])
          }
        }
      },
      error = function(e) {
        last_error <<- e
        if (verbose) cli::cli_alert_warning("Mirror {i} failed: {e$message}")
        # Clean up partial download
        if (file.exists(cache_file)) file.remove(cache_file)
      }
    )
  }

  # All mirrors failed
  cache_dir <- get_iobr_cache_dir()
  manual_url <- sprintf(
    "https://github.com/IOBR/IOBR/releases/download/data-v1.0/%s.rda", name
  )
  cli::cli_alert_danger("All download mirrors failed for dataset: {.val {name}}")
  cli::cli_alert_info("Please try the following manual download steps:")
  cli::cli_ul(c(
    "1. Download the file from: {.url {manual_url}}",
    sprintf("2. Save it to: {.path %s}", file.path(cache_dir, paste0(name, ".rda"))),
    "3. Run your code again - the data will be loaded from cache"
  ))
  cli::cli_alert_info("To comply with CRAN policies, returning {.code NULL} instead of an error.")
  return(NULL)
}

#' Get Default Download Mirrors
#'
#' @description Returns the default list of download mirrors.
#' @return Character vector of mirror URLs.
#' @keywords internal
get_default_mirrors <- function() {
  c(
    # Original GitHub (default)
    "https://github.com",
    # Chinese mirrors (Verified working)
    "https://gh-proxy.com/https://github.com",
    "https://ghproxy.net/https://github.com",
    "https://moeyy.cn/gh-proxy/https://github.com",
    "https://github.akams.cn/https://github.com",
    "http://toolwa.com/github/https://github.com",
    "https://v6.gh-proxy.org/https://github.com",
    "https://gh-proxy.org/https://github.com",
    "https://ghfast.top/https://github.com",
    "https://download.githubcdn.com?url=https://github.com",
    "https://proxy.gitwarp.top/https://github.com",
    # Biotree mirror (Fallback)
    "http://biotree.top:49004/d/IOBR_data"
  )
}

#' Add Custom Download Mirror
#'
#' @description
#' Adds a custom mirror URL to the default mirrors for the current session.
#' The mirror URL should be a base URL. If it contains 'github.com', it will
#' be treated as a GitHub proxy and the relative path to IOBR releases will
#' be appended. Otherwise, it will be treated as a direct repository path.
#'
#' @param url Character string. The mirror URL to add.
#' @param position Character. Where to add the mirror: "first", "last", or
#'   "before_github". Default: "first".
#'
#' @return Invisibly returns the updated mirror list.
#' @export
#'
#' @examples
#' \dontrun{
#' add_iobr_mirror("https://my-mirror.com/https://github.com")
#' }
add_iobr_mirror <- function(url, position = c("first", "last", "before_github")) {
  position <- match.arg(position)

  # Validate URL
  if (!grepl("^https?://", url)) {
    stop("Invalid URL. Must start with http:// or https://")
  }

  # Get current mirrors from option or default
  current_mirrors <- getOption("IOBR.download_mirrors", get_default_mirrors())

  # Remove trailing slash if present
  url <- sub("/$", "", url)

  # Add URL if not already present
  if (url %in% current_mirrors) {
    cli::cli_alert_info("Mirror {.url {url}} already exists in the list")
    return(invisible(current_mirrors))
  }

  # Add at specified position
  new_mirrors <- switch(position,
    "first" = c(url, current_mirrors),
    "last" = c(current_mirrors, url),
    "before_github" = {
      # Insert before the default GitHub URL
      github_idx <- which(current_mirrors == "https://github.com")
      if (length(github_idx) == 0) {
        c(current_mirrors, url)
      } else {
        c(current_mirrors[1:(github_idx - 1)], url, current_mirrors[github_idx:length(current_mirrors)])
      }
    }
  )

  # Store in options
  options(IOBR.download_mirrors = new_mirrors)

  cli::cli_alert_success("Added mirror {.url {url}} to position: {.val {position}}")
  cli::cli_alert_info("Current mirrors: {.val {length(new_mirrors)}} total")

  invisible(new_mirrors)
}

#' List Current Download Mirrors
#'
#' @description Returns the current list of download mirrors.
#' @return Character vector of mirror URLs.
#' @export
#'
#' @examples
#' list_iobr_mirrors()
list_iobr_mirrors <- function() {
  mirrors <- getOption("IOBR.download_mirrors", get_default_mirrors())

  cat("Current IOBR download mirrors:\n")
  cat("==============================\n")
  for (i in seq_along(mirrors)) {
    cat(sprintf("%d. %s\n", i, mirrors[i]))
  }
  cat("==============================\n")
  cat(sprintf("Total: %d mirrors\n", length(mirrors)))

  invisible(mirrors)
}

#' Reset Download Mirrors to Default
#'
#' @description Resets the download mirror list to the default values.
#' @return Invisibly returns the default mirror list.
#' @export
#'
#' @examples
#' reset_iobr_mirrors()
reset_iobr_mirrors <- function() {
  options(IOBR.download_mirrors = NULL)
  cli::cli_alert_success("Download mirrors reset to default")
  invisible(get_default_mirrors())
}

#' Get IOBR Cache Directory
#'
#' @description Returns the current cache directory for IOBR downloaded data.
#' To comply with CRAN policies, the default cache directory is a session-specific
#' temporary directory. Users can opt-in to a persistent cache by setting
#' `options(IOBR.cache_dir = "your/path")` or using `set_iobr_cache_dir()`.
#'
#' The cache directory is determined in the following priority order:
#' 1. Function argument `cache_dir` (if provided)
#' 2. Option `IOBR.cache_dir` (if set via `options()`)
#' 3. Default: A session-specific temporary directory (`file.path(tempdir(), "IOBR_cache")`)
#'
#' @param cache_dir Optional character string to override the current setting.
#' @return Character string with the cache directory path.
#' @export
#'
#' @examples
#' # Get current cache directory (defaults to tempdir)
#' get_iobr_cache_dir()
#'
#' # Set custom cache directory via options (use tempdir() for examples)
#' options(IOBR.cache_dir = tempdir())
#' get_iobr_cache_dir()
get_iobr_cache_dir <- function(cache_dir = NULL) {
  # Priority: argument > option > default
  if (!is.null(cache_dir)) {
    if (!is.character(cache_dir) || length(cache_dir) != 1) {
      stop("'cache_dir' must be a single character string")
    }
    return(path.expand(cache_dir))
  }

  option_cache <- getOption("IOBR.cache_dir")
  if (!is.null(option_cache)) {
    return(path.expand(option_cache))
  }

  # Default to temporary directory to satisfy CRAN policies
  file.path(tempdir(), "IOBR_cache")
}

#' Set IOBR Cache Directory
#'
#' @description Sets a custom cache directory for IOBR downloaded data.
#' This is useful when you want to store cached data in a specific location,
#' such as a shared network drive or a custom directory.
#'
#' To use the standard system cache location (persistent across sessions),
#' you can use: `set_iobr_cache_dir(tools::R_user_dir("IOBR", which = "cache"))`.
#'
#' @param path Character string. The path to the cache directory.
#' @param create Logical. Whether to create the directory if it doesn't exist.
#'   Default: TRUE.
#'
#' @return Invisibly returns the cache directory path.
#' @export
#'
#' @examples
#' \donttest{
#' # Set a custom cache directory (use tempdir() for examples)
#' set_iobr_cache_dir(tempdir())
#'
#' # Use standard system cache (persistent)
#' # set_iobr_cache_dir(tools::R_user_dir("IOBR", which = "cache"))
#'
#' # Check the current cache directory
#' get_iobr_cache_dir()
#'
#' # Download data will now use the custom cache
#' data <- download_iobr_data("lm22")
#' }
set_iobr_cache_dir <- function(path, create = TRUE) {
  if (!is.character(path) || length(path) != 1) {
    stop("'path' must be a single character string")
  }

  path <- path.expand(path)

  if (create && !dir.exists(path)) {
    dir.create(path, recursive = TRUE, showWarnings = FALSE)
    if (!dir.exists(path)) {
      stop(sprintf("Failed to create cache directory: %s", path))
    }
    cli::cli_alert_success("Created cache directory: {.path {path}}")
  }

  options(IOBR.cache_dir = path)
  cli::cli_alert_success("IOBR cache directory set to: {.path {path}}")

  invisible(path)
}

#' Reset IOBR Cache Directory to Default
#'
#' @description Resets the cache directory to the default session-specific
#' temporary directory.
#' @return Invisibly returns the default cache directory path.
#' @export
#'
#' @examples
#' reset_iobr_cache_dir()
reset_iobr_cache_dir <- function() {
  options(IOBR.cache_dir = NULL)
  default_cache <- file.path(tempdir(), "IOBR_cache")
  cli::cli_alert_success("Cache directory reset to session-specific temporary directory: {.path {default_cache}}")
  invisible(default_cache)
}

#' List Available GitHub Datasets
#'
#' @return Character vector of available dataset names.
#' @export
#'
#' @examples
#' list_github_datasets()
list_github_datasets <- function() {
  c(
    # From data/ directory - Reference matrices
    "BRef", "TRef", "lm22",

    # From data/ directory - Annotation files
    "anno_gc_vm32", "anno_grch38", "anno_hug133plus2", "anno_illumina", "anno_rnaseq",

    # From data/ directory - Example datasets
    "tcga_stad_sig", "imvigor210_sig", "eset_stad", "sig_stad",
    "eset_gse62254", "eset_tme_stad", "eset_blca", "deg",

    # From sysdata - Gene sets
    "hallmark", "kegg", "go_bp", "go_cc", "go_mf", "reactome",
    "msig_immune", "msig_sc",

    # From sysdata - Reference data
    "xCell.data", "quantiseq_data",

    # From sysdata - Cell markers and gene lists
    "cancer_type_genes", "cellmarkers", "common_genes", "immuneCuratedData",
    "ips_gene_set", "length_ensembl", "mRNA_cell_default",
    "mus_human_gene_symbol", "onco_sig", "PurityDataAffy", "SI_geneset",

    # From sysdata - Signatures
    "signature_collection_citation", "signature_metabolism",
    "signature_sc", "signature_tumor",

    # From sysdata - Example datasets
    "imvigor210_eset", "melanoma_data",
    "pdata_acrg", "pdata_GSE63557", "pdata_sig_tme",
    "pdata_sig_tme_binary", "pdata_tme_binary", "tcga_stad_var"
  )
}

#' Clear IOBR Data Cache
#'
#' @description Removes all cached data files downloaded from GitHub.
#' @param cache_dir Character string. Custom cache directory. If NULL, uses
#'   the option `IOBR.cache_dir` or the default system cache location.
#' @return Invisible NULL. Called for side effects of clearing the cache.
#' @export
#'
#' @examples
#' clear_iobr_cache()
clear_iobr_cache <- function(cache_dir = NULL) {
  cache_dir <- get_iobr_cache_dir(cache_dir)
  if (dir.exists(cache_dir)) {
    files <- list.files(cache_dir, full.names = TRUE)
    if (length(files) > 0) {
      file.remove(files)
      cli::cli_alert_success("Cache cleared: {.val {length(files)}} file(s) removed from {.path {cache_dir}}")
    } else {
      cli::cli_alert_info("Cache is already empty: {.path {cache_dir}}")
    }
  } else {
    cli::cli_alert_info("Cache directory does not exist: {.path {cache_dir}}")
  }
  invisible(NULL)
}

Try the IOBR package in your browser

Any scripts or data that you put into this service are public.

IOBR documentation built on May 30, 2026, 5:07 p.m.