R/commands.R

# Some helpers for NCBI-related commands like fastq-dump or ascp.

# download fastq files for the SRR's in the given run info table.
fastq_dump <- function(run_info,
                       sample_name_col = "Library_Name",
                       fastq_dump = "fastq-dump",
                       outdir = ".",
                       dump_args=NULL) {
  if (! dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE)
  }
  for (run in run_info$Run) {
    args <- c(run, "--outdir", outdir, dump_args)
    system2(fastq_dump, args)
  }
}

# Generate a shell command for an ASCP file transfer in a list format usable
# with do.call(system2, cmd).  If you're feeling adventurous, call it that way.
# Note that evidently ascp will follow symlinks for the transfer rather than
# transfer symlinks, so you can set up a contrived directory of symlinks ahead
# of time.
# md: data frame with any number of filename, filename1, ... columns
# email: email address matching NCBI account
# suffix: user-specific suffix used on the upload directory, after the email
# dir_path: optional path to prefix each source data file with.
# address and underscore.  "Aspera command line upload instructions" will give
# this.
# command: path to the ascp command.
# fp_key: path to the Aspera transfer SSH private key.
# dp_dest: upload sub-directory.  by default uses current date in YYYYMMDD
# format.
ascp_make_command <- function(md,
                              email,
                              suffix,
                              dir_path = NULL,
                              command = "ascp",
                              fp_key = "~/.ssh/id_rsa-ncbi",
                              dp_dest = format(Sys.time(), "%Y%m%d")) {
  colidx <- grep("^filename", colnames(md))
  src <- unname(unlist(lapply(md[, colidx], as.character)))
  if (! is.null(dir_path)) {
    src <- file.path(dir_path, src)
  }
  dest <- paste0("subasp@upload.ncbi.nlm.nih.gov:",
                 "uploads/", email, "_", suffix, "/", dp_dest)
  list(command = command,
       args = c("-i",
                path.expand(fp_key),
                "-QT",
                "-l100m",
                "-k1",
                "-d",
                src, dest))
}

# list files in a subdirectory on the NCBI upload server.
# Note that -k1 above means ascp will try to resume where possible when
# uploading, so you don't need to manually track progress.
ascp_list_files <- function(email, suffix,
                            command = "ssh",
                            fp_key = "~/.ssh/id_rsa-ncbi",
                            dp_dest = format(Sys.time(), "%Y%m%d")) {
  dest <- file.path("uploads",
                            paste(email, suffix, sep = "_"),
                            dp_dest)
  cmd_ls <- paste("ls", dest)
  cmd <- list(command = command,
              input = cmd_ls,
              stdout = TRUE,
              args = c("-T",
                       "-i",
                       path.expand(fp_key),
                       "subasp@upload.ncbi.nlm.nih.gov"))
  files <- do.call(system2, cmd)
  # There must be a better way to keep the command prompt out of this.
  files <- sub("^aspsh> ", "", files)
  files <- files[files != ""]
  files
}
ressy/srahelper documentation built on May 29, 2019, 11:03 p.m.