R/SRA_helper.R

Defines functions extract_run_id find_url_ebi_safe find_url_ebi download.ebi rename.SRA.files download.SRA install.sratoolkit

Documented in download.ebi download.SRA extract_run_id find_url_ebi find_url_ebi_safe install.sratoolkit rename.SRA.files

#' Download sra toolkit
#'
#' Currently supported for Linux (64 bit centos and ubunutu is tested to work)
#' and Mac-OS(64 bit). If other linux distro, centos binaries will be used.
#' @param folder default folder, "~/bin"
#' @param version a string, default "2.11.3"
#' @return path to fastq-dump in sratoolkit
#' @importFrom utils untar
#' @references https://ncbi.github.io/sra-tools/fastq-dump.html
#' @family sra
#' @export
#' @examples
#' # install.sratoolkit()
#' ## Custom folder and version (not adviced)
#' folder <- "/I/WANT/IT/HERE/"
#' # install.sratoolkit(folder, version = "2.10.9")
#'
install.sratoolkit <- function(folder = "~/bin", version = "2.11.3") {
  if (.Platform$OS.type != "unix")
    stop("sratoolkit is not currently supported for windows by ORFik,
         download manually or use WSL (windows subsystem linux)")
  folder <- path.expand(folder)
  is_linux <- Sys.info()[1] == "Linux" # else it is mac
  os <- if(!is_linux) {
    "mac64"
    } else {
     is_ubuntu <- grepl("^Ubuntu", osVersion, ignore.case = TRUE)
     is_centos <- grepl("^cent", osVersion, ignore.case = TRUE)
     found_exact_match <- sum(c(is_ubuntu, is_centos)) == 1
     if (!found_exact_match | is_centos) {
       "centos_linux64"
     } else "ubuntu64"
    }
  # TODO; Check if ubuntu compliation is needed for safer download ->
  #length(grep("Ubuntu", system("cat /etc/*release", intern = TRUE)[1])) == 1

  path.final <- paste0(folder, "/sratoolkit.", version, "-", os)
  path.final <- paste0(path.final, "/bin/fastq-dump")
  if (file.exists(path.final)) {
    message(paste("Using fastq-dump at location:",
                  path.final))
    return(path.final)
  }
  message("Downloading and configuring SRA-toolkit for you,
          this is done only once!")

  url <- paste0("https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/", version, "/")
  url <- paste0(url, "sratoolkit.", version, "-", os, ".tar.gz")

  path <- paste0(folder, "/sratoolkit.tar.gz")

  dir.create(folder, showWarnings = FALSE, recursive = TRUE)

  utils::download.file(url, destfile = path)
  untar(path, exdir = folder)

  # Update access rights
  system(paste("chmod a+x", path.final))
  # Make config file, will give ignorable seqmentation faul warning
  message("Ignore the following config warning: SIGNAL - Segmentation fault ")
  conf <- suppressWarnings(system(paste0(dirname(path.final), "/vdb-config -i"),
                                  intern = TRUE))

  return(path.final)
}

#' Download read libraries from SRA
#'
#' Multicore version download, see documentation for SRA toolkit for more information.
#' @param info character vector of only SRR numbers or
#' a data.frame with SRA metadata information including the SRR numbers in a column called
#' "Run" or "SRR". Can be SRR, ERR or DRR numbers.
#' If only SRR numbers can not rename, since no additional information is given.
#' @param outdir directory to store runs,
#' files are named by default (rename = TRUE) by information from SRA metadata table,
#' if (rename = FALSE) named according to SRR numbers.
#' @param rename logical or character, default TRUE (Auto guess new names). False: Skip
#' renaming. A character vector of equal size as files wanted can also be given.
#' Priority of renaming from
#' the metadata is to check for unique names in the LibraryName column,
#' then the sample_title column if no valid names in LibraryName.
#' If new names found and still duplicates, will
#' add "_rep1", "_rep2" to make them unique. If no valid names, will not
#' rename, that is keep the SRR numbers, you then can manually rename files
#' to something more meaningful.
#' @param fastq.dump.path path to fastq-dump binary, default: path returned
#' from install.sratoolkit()
#' @param settings a string of arguments for fastq-dump,
#' default: paste("--gzip", "--skip-technical", "--split-files")
#' @param subset an integer or NULL, default NULL (no subset). If defined as
#' a integer will download only the first n reads specified by subset. If subset is
#' defined, will force to use fastq-dump which is slower than ebi download.
#' @param compress logical, default TRUE. Download compressed files ".gz".
#' @param use.ebi.ftp logical, default: is.null(subset). Use ORFiks much faster download
#' function that only works when subset is null,
#' if subset is defined, it uses fastqdump, it is slower but supports subsetting.
#' Force it to use fastqdump by setting this to FALSE.
#' @param ebiDLMethod character, default "auto". Which download protocol
#' to use in download.file when using ebi ftp download. Sometimes "curl"
#' is might not work (the default auto usually), in those cases use wget.
#' See "method" argument of ?download.file, for more info.
#' @param timeout 5000, how many seconds before killing download if
#' still active? Will overwrite global option until R session is closed.
#' Increase value if you are on a very slow connection or downloading a large dataset.
#' @param BPPARAM how many cores/threads to use? default: bpparam().
#' To see number of threads used, do \code{bpparam()$workers}
#' @return a character vector of download files filepaths
#' @references https://ncbi.github.io/sra-tools/fastq-dump.html
#' @family sra
#' @export
#' @examples
#' SRR <- c("SRR453566") # Can be more than one
#' \donttest{
#' ## Simple single SRR run of YEAST
#' outdir <- tempdir() # Specify output directory
#' # Download, get 5 first reads
#' #download.SRA(SRR, outdir, rename = FALSE, subset = 5)
#'
#' ## Using metadata column to get SRR numbers and to be able to rename samples
#' outdir <- tempdir() # Specify output directory
#' info <- download.SRA.metadata("SRP226389", outdir) # By study id
#' ## Download, 5 first reads of each library and rename
#' #files <- download.SRA(info, outdir, subset = 5)
#' #Biostrings::readDNAStringSet(files[1], format = "fastq")
#'
#' ## Download full libraries of experiment
#' ## (note, this will take some time to download!)
#' #download.SRA(info, outdir)
#' }
download.SRA <- function(info, outdir, rename = TRUE,
                         fastq.dump.path = install.sratoolkit(),
                         settings =  paste("--skip-technical", "--split-files"),
                         subset = NULL,
                         compress = TRUE,
                         use.ebi.ftp = is.null(subset),
                         ebiDLMethod = "auto",
                         timeout = 5000,
                         BPPARAM = bpparam()) {

  # If character presume SRR, if not check for column Run or SRR
  SRR <- if (is.character(info)) { # if character
    info
  } else { # else metadata
    if (is.null(info$Run)) { # If not called Run
      info$SRR
    } else  { # If called Run
      info$Run
    }
  }
  if (is.null(SRR) | (length(SRR) == 0))
    stop("Could not find SRR numbers in 'info'")

  dir.create(outdir, showWarnings = FALSE, recursive = TRUE)

  settings <- paste("--outdir", outdir, settings)
  if (!is.null(subset)) {
    if(!is.numeric(subset)) stop("subset must be numeric if not NULL")
    subset <- as.integer(subset)
    settings <- paste(settings, "-X", subset)
  } else if (use.ebi.ftp){
    files <- download.ebi(info, outdir, rename, ebiDLMethod, timeout, BPPARAM)
    if (length(files) > 0) return(files)
    message("Checking for fastq files using fastq-dump")
  }
  if (compress) {
    settings <- paste(settings, "--gzip")
  }
  fastq.dump <- fastq.dump.path
  message("Starting download of SRA runs:")
  BiocParallel::bplapply(SRR, function(i, fastq.dump, settings) {
    message(i)
    system(command = paste(fastq.dump, i, settings),
           wait = TRUE)
  }, fastq.dump = fastq.dump, settings = settings, BPPARAM = BPPARAM)

  search_it <- ifelse(compress, "\\.fastq\\.gz$", "\\.fastq$")
  files <- unlist(lapply(SRR, function(S)
    dir(outdir, paste0(S, ".*", search_it), full.names = TRUE))
  )

  valid <- TRUE
  if (length(files) == 0) valid <- FALSE
  any.paired <- length(grep("_[2]\\.fastq\\.gz", files))
  # TODO: validate that this will work in download of mixed
  paired <- ifelse(any.paired,
                   length(grep("_[1-2]\\.fastq\\.gz", files)),
                   0)

  if (length(SRR) != (paired/2 + length(files) - paired))
    valid <- FALSE
  if (!valid) {
    warning("Some of the files specified was not downloaded,",
            " are you behind a strict firewall?")
    message("If only few files remaining, subset to those SRR numbers and run again")
  }

  if (is.logical(rename)) { # Renaming
    # Set to false if no metadata
    if (is.character(info) & rename) {
      rename <- FALSE
      warning("rename = TRUE, but no metadata given. Can not rename!")
    } else if (rename) files <- rename.SRA.files(files, info)
  } else { # else names were assigned manually
    files <- rename.SRA.files(files, rename)
  }
  return(files)
}

#' Rename SRA files from metadata
#'
#' @param files a character vector, with full path to all the files
#' @param new_names a character vector of new names or
#' a data.table with metadata to use to rename (usually from SRA metadata).
#' Priority of renaming from
#' the metadata is to check for unique names in the LibraryName column,
#' then the sample_title column if no valid names in LibraryName.
#' If found and still duplicates, will
#' add "_rep1", "_rep2" to make them unique. Paired end data will get a extension
#' of _p1 and _p2. If no valid names, will not
#' rename, that is keep the SRR numbers, you then can manually rename files
#' to something more meaningful.
#' @return a character vector of new file names
#' @family sra
#' @keywords internal
rename.SRA.files <- function(files, new_names) {
  info <- NULL # Set to default
  if (!is.character(new_names)) { # Then auto-guess from meta data
    message("Auto-guessing new names from metadata, check that they are valid")
    info <- new_names
    new_names <- NULL

    valid_libraryName_column <- !is.null(info$LibraryName) &
      !any(is.na(info$LibraryName)) & !any("" %in% info$LibraryName)
    if (valid_libraryName_column) {
        new_names <- info$LibraryName
    }
    not_defined_yet <- is.null(new_names)
    valid_sample_column <- !is.null(info$sample_title) &
      !any(is.na(info$sample_title)) & !any("" %in% new_names)
    if (not_defined_yet & valid_sample_column) {
      new_names <- info$sample_title
      new_names <- gsub(".*: ", "", new_names)
      new_names <- gsub(";.*", "", new_names)
    }
    libStrat <- info$LibraryStrategy
    libSelect <- info$LibrarySelection
    libStrat_usable <- !is.null(libStrat) &
      !any(is.na(libStrat)) & !all(c("") %in% libStrat) &
      !all(c("OTHER") %in% libStrat) & !all(c("other") %in% libStrat) &
      !all(c("unspecified") %in% libStrat)

    libSelect_usable <- !is.null(libSelect) &
      !any(is.na(libSelect)) & !all(c("") %in% libSelect) &
      !all(c("OTHER") %in% libSelect) & !all(c("other") %in% libSelect) &
      !all(c("unspecified") %in% libSelect)

    if (!is.null(new_names)) {
      new_names <- paste0(toupper(substr(new_names, 1, 1)),
                           substr(new_names, 2, nchar(new_names)))
    }
  }

  if (any(duplicated(new_names))) {
    new_names <- make.unique(new_names, sep = "_rep")
  }

  if (!is.null(new_names)) {
    message("Renaming files:")
    if (!is.null(info)) { # If metadata given, update if paired end
      if (any("PAIRED" %in% info$LibraryLayout)) {
        new_names <- lapply(seq_along(info$LibraryLayout),
                            function(x) if(info$LibraryLayout[x] == "PAIRED") {
                              c(paste0(new_names[x], "_p1"),
                                paste0(new_names[x],"_p2"))
                            } else new_names[x])
        new_names <- unlist(new_names)
      }
    }

    if (length(new_names) != length(files))
      stop("Length of files and new_names to rename by is not equal!",
           " If manual assign of paired end name, repeat each element twice!")

    new_names <- gsub(" |\\(|\\)", "_", new_names)
    new_names <- gsub("__", "_", new_names)
    new_names <- gsub("/", "", new_names)
    is_gzipped <- grep("\\.fastq\\.gz", files)

    new_names <- paste0(dirname(files), "/", basename(new_names), ".fastq")

    new_names[is_gzipped] <- paste0(new_names, ".gz")
    for (i in seq(length(files))) {
      file.rename(files[i], new_names[i])
    }
  } else {
    warning("Did not find a way for valid renaming, returning without renaming!")
    return(files)
  }
  names(new_names) <- basename(files)
  return(new_names)
}

#' Faster download of fastq files
#'
#' Uses ftp download from vol1 drive on EBI ftp server,
#'  for faster download of ERR, SRR or DRR files.
#' But does not support subsetting or custom settings of files!
#' @inheritParams download.SRA
#' @return character, full filepath of downloaded  files
#' @family sra
#' @keywords internal
download.ebi <- function(info, outdir, rename = TRUE,
                         ebiDLMethod = "auto", timeout = 5000,
                         BPPARAM = bpparam()) {

  study <- NULL
  # If character presume SRR, if not check for column Run or SRR
  SRR <- if (is.character(info)) { # if character
    info
  } else { # else metadata
    # Check if study is specified
    if (length(unique(info$BioProject)) == 1)
      study <- info$BioProject[1]
    if (is.null(info$Run)) { # If not called Run
      info$SRR
    } else  { # If called Run
      info$Run
    }
  }
  if (is.null(SRR) | (length(SRR) == 0))
    stop("Could not find SRR numbers in 'info'")
  dir.create(outdir, showWarnings = FALSE, recursive = TRUE)

  urls <- find_url_ebi(SRR, study = study)
  if (length(urls) == 0) {
    message("None of the Fastq files specified found on ebi")
    return(NULL)
  } else if (length(urls) < length(SRR)) {
    message("Not all fastq files specified found on ebi")
    return(NULL)
  }

  files <- file.path(outdir, basename(urls))
  message("Starting download of EBI runs:")
  method <- ebiDLMethod
  withr::local_options(timeout = timeout)
  BiocParallel::bplapply(urls, function(i, outdir, method) {
    message(i)
    download.file(i, destfile = file.path(outdir, basename(i)),
                  method = method, quiet = TRUE)
  }, outdir = outdir, method = method, BPPARAM = BPPARAM)

  if (is.logical(rename)) {
    # Set to false if not metadata
    if (is.character(info) & rename) {
      rename <- FALSE
      warning("rename = TRUE, but no metadata given. Can not rename!")
    } else if (rename) files <- rename.SRA.files(files, info)
  } else { # else manual assign names
    files <- rename.SRA.files(files, rename)
  }
  return(files)
}

#' Locates and check if fastq files exists in ebi
#'
#' Look for files in ebi following url: ftp://ftp.sra.ebi.ac.uk/vol1/fastq
#' Paired end and single end fastq files.\cr
#' EBI uses 3 ways to organize data inside vol1/fastq:\cr
#' - 1: Most common: SRR(3 first)/0(2 last)/whole\cr
#' - 2: less common: SRR(3 first)/00(1 last)/whole\cr
#' - 3: least common SRR(3 first)/whole
#' @param SRR character, SRR, ERR or DRR numbers.
#' @param stop.on.error logical FALSE, if TRUE will stop
#'  if all files are not found. If FALSE returns empty character vector if error
#'  is catched.
#' @param study default NULL, optional PRJ (study id) to speed up search
#' for URLs.
#' @return full url to fastq files, same length as input
#' (2 urls for paired end data). Returns empty character() if all
#' files not found.
#' @examples
#' # Test the 3 ways to get fastq files from EBI
#' # Both single end and paired end data
#'
#' # Most common: SRR(3 first)/0(2 last)/whole
#' # Single
#' ORFik:::find_url_ebi("SRR10503056")
#' # Paired
#' ORFik:::find_url_ebi("SRR10500056")
#'
#' # less common: SRR(3 first)/00(1 last)/whole
#' # Single
#' #ORFik:::find_url_ebi("SRR1562873")
#' # Paired
#' #ORFik:::find_url_ebi("SRR1560083")
#' # least common SRR(3 first)/whole
#' # Single
#' #ORFik:::find_url_ebi("SRR105687")
#' # Paired
#' #ORFik:::find_url_ebi("SRR105788")
find_url_ebi <- function(SRR, stop.on.error = FALSE, study = NULL) {
  message("Finding optimal download urls from ebi...")
  final.path <- if (!is.null(study)) {
    find_url_ebi_safe(study, SRR, stop.on.error = stop.on.error)
  } else find_url_ebi_safe(SRR, stop.on.error = stop.on.error)
  return(final.path)
  # TODO: remove when not needed
  ebi_server <- "ftp://ftp.sra.ebi.ac.uk"
  # Check that we can connect to ebi
  exists.ftp.dir.fast(ebi_server, report.error = TRUE)

  # Create candidate directories
  SRR_first_3 <- substring(SRR, 1, 6)
  SRR_last_3 <- paste0("0", reverse(substring(reverse(SRR), 1, 2)))
  SRR_last_1 <- paste0("00", reverse(substring(reverse(SRR), 1, 1)))
  SRR_default <- file.path(ebi_server, "vol1/fastq", SRR_first_3)

  SRR_fastq <- paste0(SRR, ".fastq.gz")
  SRR_fastq_paired <- c(paste0(SRR, c("_1"), ".fastq.gz"),
                        paste0(SRR, c("_2"), ".fastq.gz"))
  # method 1
  SRR_paths <- file.path(SRR_default, SRR_last_3, SRR, SRR_fastq)
  SRR_paths_paired <- file.path(SRR_default, SRR_last_3, SRR, SRR_fastq_paired)
  # method 2:
  SRR_paths_spec2 <- file.path(SRR_default, SRR_last_1, SRR, SRR_fastq)
  SRR_paths_paired_spec2 <- file.path(SRR_default, SRR_last_1, SRR, SRR_fastq_paired)
  # Method 3: Special location
  SRR_paths_spec <- file.path(SRR_default, SRR, SRR_fastq)
  SRR_paths_spec_paired <- file.path(SRR_default, SRR, SRR_fastq_paired)
  # Check what format the files are found in (3 types: 2 each)
  lib_counter <- 0
  SRR_possibilities <- list(SRR_paths, SRR_paths_paired, SRR_paths_spec2,
                            SRR_paths_paired_spec2, SRR_paths_spec,
                            SRR_paths_spec_paired)
  url.exists <- c()
  for (i in seq_along(SRR_possibilities)) { # For each url area
    if (lib_counter == length(SRR)) break
    url.temp <-  sapply(SRR_possibilities[[i]], function(x)
      exists.ftp.file.fast(x))
    url.temp <- url.temp[url.temp]
    if (i %% 2 == 1) {
      lib_counter <- lib_counter + length(url.temp)
    } else {
      lib_counter <- lib_counter + (length(url.temp)/2)
    }
    url.exists <- c(url.exists, url.temp)
  }

  final.path.temp <- names(url.exists[url.exists])
  # Sort them correctly as input
  final.path <- unlist(sapply(c(SRR, "asdasd"), function(x, final.path.temp) {
    sort(final.path.temp[grepl(x, final.path.temp)])
  }, final.path.temp = final.path.temp), use.names = FALSE)

  valid <- TRUE
  if (lib_counter != length(SRR)) valid <- FALSE
  paired <- length(grep("_[1-2]\\.fastq\\.gz",final.path))
  if (length(SRR) != (paired/2 + length(final.path) - paired))
    valid <- FALSE
  if (!valid & stop.on.error) stop("Did not find all fastq files on ENA",
                                   "set use.ebi.ftp to FALSE,
                                   to use fastq-dump instead")
  if (!valid) final.path <- character()

  return(final.path)
}

#' Find URL for EBI fastq files
#'
#' Safer version
#' @param accession character: (PRJ, SRP, ERP, DRP, SRX, SRR, ERR,..). For studies or samples,
#' it returns all runs per study or sample.
#' @param SRR character, which SRR numbers to subset by (can also be ERR or DRR numbers)
#' @param stop.on.error logical FALSE, if TRUE will stop
#'  if all files are not found. If FALSE returns empty character vector if error
#'  is catched.
#' @return character (1 element per SRR number)
#' @keywords internal
find_url_ebi_safe <- function(accession, SRR = NULL, stop.on.error = FALSE) {
  a <- data.table()
  for (i in accession) {
    search_url <- paste0("http://www.ebi.ac.uk/ena/portal/api/filereport?accession=",
                         i, "&result=read_run&fields=run_accession,fastq_ftp")
    temp <- suppressWarnings(temp <- fread(search_url, header = TRUE))
    a <- rbindlist(list(a, temp))
  }
  if (!is.null(SRR)) {
    if (!all(SRR %in% a$run_accession)) {
      if (stop.on.error) stop("Study does not contain some of the SRR numbers given!")
      return(character())
    }
    a <- a[run_accession %in% SRR,]
  }

  return(unlist(strsplit(a$fastq_ftp, ";")))
}

#' Extract SRR/ERR/DRR run IDs from string
#'
#' @param x character vector to search through.
#' @param search the regex search, default: \code{"(SRR[0-9]+|DRR[0-9]+|ERR[0-9]+)"}
#' @param only_valid logical, default FALSE. If TRUE, return only the hits.
#' @return a character vector of run accepted run ids according to search,
#' if only_valid named character vector for which indices are returned
#' @examples
#' search <- c("SRR1230123_absdb", "SRR1241204124_asdasd", "asd_ERR1231230213",
#'  "DRR12412412_asdqwe", "ASDASD_ASDASD", "SRRASDASD")
#' ORFik:::extract_run_id(search)
#' ORFik:::extract_run_id(search, only_valid = TRUE)
extract_run_id <- function(x, search = "(SRR[0-9]+|DRR[0-9]+|ERR[0-9]+)", only_valid = FALSE) {
  hits <- gsub(paste0(".*", search), "\\1", gsub(paste0(search, ".*"), "\\1", x))
  match <- grep(search, hits)
  if (only_valid) {
    hits <- hits[match]
    names(hits) <- match
  } else hits[!(seq_along(hits) %in% match)] <- ""
  return(hits)
}
Roleren/ORFik documentation built on Oct. 19, 2024, 7:37 a.m.