R/TcgaDccDownloader.R

# https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/tcga4yeo/tumor/ov/cgcc/broad.mit.edu/genome_wide_snp_6/snp/broad.mit.edu_OV.Genome_Wide_SNP_6.12.5.0/MANIFEST.txt
# http://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/

setConstructorS3("TcgaDccDownloader", function(urlPath=NULL, rootUrl="tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/", protocol=c("auto", "https","http"), username=NULL, password=NULL, ...) {
  # Argument 'protocol':
  protocol <- match.arg(protocol);

  extend(Object(), "TcgaDccDownloader",
    .protocol = protocol,
    .username = username,
    .password = password,
    .rootUrl = rootUrl,
    .urlPath = urlPath
  )
})

setMethodS3("as.character", "TcgaDccDownloader", function(x, ...) {
  # To please R CMD check 
  this <- x;

  s <- sprintf("%s:", class(this)[1]);
  s <- c(s, sprintf("Protocol: %s", getProtocol(this)));
  s <- c(s, sprintf("Root URL: %s", getRootUrl(this)));
  s <- c(s, sprintf("URL path: %s", getUrlPath(this)));
  s <- c(s, sprintf("Root path: %s", getRootPath(this)));

  class(s) <- "GenericSummary"; 
  s;
})

setMethodS3("getRootPath", "TcgaDccDownloader", function(this, ...) {
  "tcgaData";
})

setMethodS3("getProtocol", "TcgaDccDownloader", function(this, ...) {
  protocol <- this$.protocol;
  if (protocol == "auto") {
    path <- getUrlPath(this);
    path <- strsplit(path, split="/", fixed=TRUE)[[1]];
    path <- path[1];
    if (path %in% c("tcga4yeo")) {
      protocol <- "https";
    } else {
      protocol <- "http";
    }
  }
  protocol;
})

setMethodS3("getUsername", "TcgaDccDownloader", function(this, ...) {
  this$.username;
})

setMethodS3("setUsername", "TcgaDccDownloader", function(this, value, ...) {
  this$.username <- value;
  this;
})

setMethodS3("getPassword", "TcgaDccDownloader", function(this, ...) {
  this$.password;
})

setMethodS3("setPassword", "TcgaDccDownloader", function(this, value, ...) {
  this$.password <- value;
  this;
})

setMethodS3("setLogin", "TcgaDccDownloader", function(this, username, password, ...) {
  setUsername(this, username);
  setPassword(this, password);
})

setMethodS3("getRootUrl", "TcgaDccDownloader", function(this, ...) {
  this$.rootUrl;
})

setMethodS3("getUrlPath", "TcgaDccDownloader", function(this, ...) {
  this$.urlPath;
})

setMethodS3("getUrlPathname", "TcgaDccDownloader", function(this, filename, ...) {
  filename <- Arguments$getCharacter(filename);
  url <- paste(c(getRootUrl(this), getUrlPath(this), filename), collapse="/");
  url <- gsub("//", "/", url);
  url;
})

setMethodS3("getUrl", "TcgaDccDownloader", function(this, ...) {
  protocol <- getProtocol(this);
  pathname <- getUrlPathname(this, ...);
  url <- sprintf("%s://%s", protocol, pathname);
  url;
})

setMethodS3("downloadUrl", "TcgaDccDownloader", function(this, url, filename=NULL, path=NULL, ..., skip=TRUE, overwrite=!skip, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'url':
  url <- Arguments$getCharacter(url);

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose);
  if (verbose) {
    pushState(verbose);
    on.exit(popState(verbose));
  }


  # Argument 'filename' & 'path':
  if (is.null(filename)) {
    filename <- basename(url);
  }
  if (is.null(path)) {
    path <- file.path(getRootPath(this), getUrlPath(this));
  }
  pathname <- Arguments$getWritablePathname(filename, path=path,
                                                      mustNotExist=FALSE);

  verbose && enter(verbose, "Downloading URL");
  verbose && cat(verbose, "URL: ", url);

  if (isFile(pathname)) {
    fi <- file.info(pathname);
    if (!is.na(fi$size) && fi$size == 0) {
      file.remove(pathname);
      verbose && cat(verbose, "Removed empty preexisting file: ", pathname);
    } else if (skip) {
      verbose && cat(verbose, "Already downloaded. Skipping.");
      verbose && exit(verbose);
      return(pathname);
    }
  }

  # Download to a temporary pathname
  pathnameT <- sprintf("%s.tmp", pathname);
  pathnameT <- Arguments$getWritablePathname(pathnameT, mustNotExist=TRUE);
  on.exit({
    if (isFile(pathnameT)) {
      file.remove(pathnameT);
    }
  }, add=TRUE);
  
  protocol <- getProtocol(this);
  if (protocol == "https") {
    usr <- getUsername(this);
    pwd <- getPassword(this);
    opts <- sprintf("--http-user=%s --http-passwd=%s", usr, pwd);
  } else {
    opts <- "";
  }
  fmtstr <- "wget --output-document=\"%s\" --no-check-certificate %s %s";
  cmd <- sprintf(fmtstr, pathnameT, opts, url);

  # Download
  verbose && enter(verbose, "Downloading");
  verbose && cat(verbose, "Command: ", cmd);
  res <- system(cmd);
  verbose && cat(verbose, "Downloading finished\n");
  verbose && cat(verbose, "Download result:", res);
  verbose && exit(verbose);

  # Remove failed or "empty" downloads
  fi <- file.info(pathnameT);
  verbose && str(verbose, fi);
  if (res != 0 || is.na(fi$size) || fi$size == 0) {
    file.remove(pathnameT);
    verbose && cat(verbose, "Removed downloaded file because it was empty: ", pathnameT);
    pathname <- NULL;
  } else {
    # Rename temporary file
    file.rename(pathnameT, pathname);
    if (!isFile(pathname)) {
      throw("Failed to rename temporary filename: ", pathnameT, " -> ", pathname);
    }
    if (isFile(pathnameT)) {
      throw("Failed to remove temporary filename: ", pathnameT);
    }
  }

  verbose && exit(verbose);
  
  invisible(pathname);
})


setMethodS3("downloadFile", "TcgaDccDownloader", function(this, filename, ..., onError=c("error", "warning", "quiet"), force=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'filename':
  filename <- Arguments$getCharacter(filename);

  # Argument 'onError':
  onError <- match.arg(onError);

  # Argument 'force':
  force <- Arguments$getLogical(force);

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose);
  if (verbose) {
    pushState(verbose);
    on.exit(popState(verbose));
  }


  verbose && enter(verbose, "Downloading URL");

  pathname <- NULL;
  if (force || filename == "MANIFEST.txt" || hasFile(this, filename)) {
    url <- getUrl(this, filename);
    verbose && cat(verbose, "Filename: ", filename);
    verbose && cat(verbose, "URL: ", url);
    tryCatch({
      pathname <- downloadUrl(this, url, ..., verbose=verbose);
    }, error = function(ex) {
      msg <- ex$message;
      verbose && print(verbose, msg);
      if (onError == "error") {
        throw(ex);
      } else if (onError == "warning") {
        warning(msg);
      } else if (onError == "quiet") {
      }
    });
  }

  verbose && cat(verbose, "Downloaded file: ", pathname);

  verbose && exit(verbose);


  pathname;
})

setMethodS3("downloadFiles", "TcgaDccDownloader", function(this, ...) {
  downloadFilesByPattern(this, ...);
}, deprecated=TRUE);

setMethodS3("downloadFilesByPattern", "TcgaDccDownloader", function(this, pattern=NULL, ..., order=c("ascending", "descending", "random"), verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'pattern':
  if (!is.null(pattern)) {
    pattern <- Arguments$getRegularExpression(pattern);
  }

  # Argument 'order':
  order <- match.arg(order);

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose);
  if (verbose) {
    pushState(verbose);
    on.exit(popState(verbose));
  }


  verbose && enter(verbose, "Downloading files matching pattern");

  verbose && enter(verbose, "Identifying filenames to be downloading");
  verbose && cat(verbose, "Pattern: ", pattern);
  filenames <- listFiles(this, pattern=pattern, ..., verbose=less(verbose, 5));
  verbose && str(verbose, filenames);
  verbose && exit(verbose);

  # Reorder?
  if (order == "ascending") {
    o <- order(filenames, decreasing=FALSE);
  } else if (order == "ascending") {
    o <- order(filenames, decreasing=TRUE);
  } else if (order == "random") {
    o <- sample(length(filenames), replace=FALSE);
  }
  filenames <- filenames[o];
  verbose && str(verbose, filenames);

  verbose && enter(verbose, "Downloading files");
  verbose && cat(verbose, "Number of files: ", length(filenames));
  pathnames <- sapply(filenames, FUN=function(filename) {
    verbose && print(verbose, filename);
    pathname <- downloadFile(this, filename, ...);
    verbose && print(verbose, pathname);
    pathname;
  });
  verbose && str(verbose, pathnames);
  verbose && exit(verbose);

  verbose && exit(verbose);

  invisible(pathnames);
})


setMethodS3("readLines", "TcgaDccDownloader", function(con, ...) {
  # To please R CMD check
  this <- con;

  pathname <- downloadFile(this, ...);
  if (length(pathname) == 0)
    return(NULL);
  
  lines <- readLines(pathname);
  attr(lines, "pathname") <- pathname;

  lines;
})

setMethodS3("readTextFile", "TcgaDccDownloader", function(this, filename, ...) {
  field <- sprintf(".%s", filename);
  value <- this[[field]];
  if (is.null(value)) {
    value <- readLines(this, filename);
    this[[field]] <- value;
  }
  value;
})

setMethodS3("readManifest", "TcgaDccDownloader", function(this, ...) {
  readTextFile(this, "MANIFEST.txt");
})

setMethodS3("readReadme", "TcgaDccDownloader", function(this, ...) {
  readTextFile(this, "README.txt");
})

setMethodS3("readIdf", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  filename <- listFiles(this, pattern="[.]idf[.]txt", ...);
  if (length(filename) == 0) {
    throw("Cannot read IDF.  Failed to located *.idf.txt: ");
  }
  if (length(filename) > 2) {
    throw("Cannot read IDF.  Failed to located a unique *.idf.txt: ", 
                                       paste(filename, collapse=", "));
  }
  readTextFile(this, filename);
})

setMethodS3("readSdrf", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  filename <- listFiles(this, pattern="[.]sdrf[.]txt", ...);
  if (length(filename) == 0) {
    throw("Cannot read IDF.  Failed to located *.idf.txt: ");
  }
  if (length(filename) > 2) {
    throw("Cannot read IDF.  Failed to located a unique *.idf.txt: ", 
                                       paste(filename, collapse=", "));
  }
  readTextFile(this, filename);
})

# Not required
setMethodS3("readDescription", "TcgaDccDownloader", function(this, ...) {
  readTextFile(this, "DESCRIPTION.txt");
})

# Not required
setMethodS3("readSampleInfo", "TcgaDccDownloader", function(this, ...) {
  readTextFile(this, "SampleInfo.txt");
})

setMethodS3("downloadCoreTextFile", "TcgaDccDownloader", function(this, ...) {
  # Must exists
  manifest <- readManifest(this);
  readme <- readReadme(this);

  # Semi-optional
  tryCatch({
    idf <- readIdf(this);
  }, error = function(ex) {});
  tryCatch({
    sdrf <- readSdrf(this);
  }, error = function(ex) {});

  # Optional
  si <- readSampleInfo(this);
  desc <- readDescription(this);
})

setMethodS3("listFiles", "TcgaDccDownloader", function(this, pattern=NULL, ignore.case=TRUE, ...) {
  if (!is.null(pattern)) {
    pattern <- Arguments$getRegularExpression(pattern);
  }
  manifest <- readManifest(this, ...);
  # Check if there are checksums
  pattern2 <- "^([0-9abcdefABCDEF]+)[ ]+(.*)$";
  hasChecksums <- (regexpr(pattern2, manifest) != -1);
  # Sanity check
  stopifnot(length(unique(hasChecksums)) == 1);
  hasChecksums <- hasChecksums[1];
  if (hasChecksums) {
    filenames <- gsub(pattern2, "\\2", manifest);
  } else {
    filenames <- manifest;
  }

  if (!is.null(pattern)) {
    filenames <- grep(pattern=pattern, filenames, 
                      ignore.case=ignore.case, value=TRUE);
  }
  filenames;
})


setMethodS3("hasFiles", "TcgaDccDownloader", function(this, filenames, ...) {
  # Argument 'filenames':
  filenames <- Arguments$getCharacter(filenames);

  availableFilenames <- listFiles(this, ...);
  is.element(filenames, availableFilenames);
})


setMethodS3("hasFile", "TcgaDccDownloader", function(this, filename, ...) {
  # Argument 'filename':
  filename <- Arguments$getCharacter(filename);

  hasFiles(this, filenames=filename, ...);
})


setMethodS3("getDataSet", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
#  path <- getParent(path, depth=0);
  name <- basename(path);
  name;
})

setMethodS3("getDataSetName", "TcgaDccDownloader", function(this, ...) {
  fullname <- getDataSet(this, ...);
  pattern <- TcgaDccData$getDataSetPatterns()$dataset;
  gsub(pattern, "\\1_\\2.\\3", fullname);
})


setMethodS3("getArchive", "TcgaDccDownloader", function(this, ...) {
  fullname <- getDataSet(this, ...);
  pattern <- TcgaDccData$getDataSetPatterns()$dataset;
  gsub(pattern, "\\4", fullname);
})


setMethodS3("getDataType", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  path <- getParent(path, depth=1);
  name <- basename(path);
  name;
})

setMethodS3("getPlatform", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  path <- getParent(path, depth=2);
  name <- basename(path);
  name;
})

setMethodS3("getCenter", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  path <- getParent(path, depth=3);
  name <- basename(path);
  name;
})

setMethodS3("getCenterType", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  path <- getParent(path, depth=4);
  name <- basename(path);
  name;
})

setMethodS3("getTumorType", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  path <- getParent(path, depth=5);
  name <- basename(path);
  name;
})

setMethodS3("getAccessRoot", "TcgaDccDownloader", function(this, ...) {
  path <- getUrlPath(this);
  path <- getParent(path, depth=6);
  name <- basename(path);
  name;
})


setMethodS3("getKnownFilePatterns", "TcgaDccDownloader", function(this, types=c("coreFiles", "level1", "level2", "level3"), ...) {
  # Argument 'types':
  types <- Arguments$getCharacters(types);

  center <- getCenter(this);
  platform <- getPlatform(this);
  dataType <- getDataType(this);


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Default patterns
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  patterns <- list(
    coreFiles = c(
      "^MANIFEST.txt$",
      "^README.txt$",
      "[.]idf[.]txt$",
      "[.]sdrf[.]txt$"
    ),
    level1 = c(),
    level2 = c(),
    level3 = c()
  );


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (center == "hudsonalpha.org" && 
      is.element(platform, c("humanhap550", "human1mduo"))) {
    patterns$level1 <- c(
      "[.]idat$",
      "[.]XandYintensity[.]txt$"
    );

    patterns$level2 <- c(
      "[.]B_Allele_Freq[.]txt$",
      "[.]Delta_B_Allele_Freq[.]txt$",
      "[.]Genotypes[.]txt$",
      "[.]Normal_LogR[.]txt$",
      "[.]Paired_LogR[.]txt$",
      "[.]Unpaired_LogR[.]txt$"
    );

    patterns$level3 <- c(
      "[.]seg[.]txt$",
      "[.]segnormal[.]txt$",
      "[.]loh[.]txt$"
    );
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  else if (center == "broad.mit.edu" && 
           platform == "genome_wide_snp_6") {
    patterns$coreFiles <- c(patterns$coreFiles, 
      "^DESCRIPTION.txt$",
      "^SampleInfo.txt$"
    );
    patterns$level1 <- c(
      "[.](cel|CEL)$"
    );
    patterns$level2 <- c(
         "[.]after_5NN[.]copynumber[.]data[.]txt$", 
          "[.]copynumber[.]byallele[.]data[.]txt$", 
               "^[^.]+[.]copynumber[.]data[.]txt$", 
                      "[.]ismpolish[.]data[.]txt$", 
        "[.]no_outlier[.]copynumber[.]data[.]txt$"
    );
    patterns$level3 <- c(
        "[.]birdseed[.]data[.]txt$",
        "[.]seg[.]data[.]txt$"
    );
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  else if (center == "hms.harvard.edu" && 
           is.element(platform, c("hg-cgh-244a", "hg-cgh-415k_g4124a"))) {
    patterns$level1 <- c(
      "_BioSizing[.]tsv$", 
      "_QA[.]tsv$", 
      "[.]jpg$", 
      "_lowess_normalized_smoothed[.]png$", 
      "[.]pdf$",
      "^TCGA-.*[.]txt$"
    );

    if (platform == "hg-cgh-244a") {
      patterns$level1 <- c(patterns$level1,
        "_Nanodrop[.]tsv$", 
        "_lowess_normalized[.]tsv$",
        "[.]tif$"
      );
    };

    patterns$level2 <- c(
      "[.]tsv$",
      "[.]data[.]txt$"
    );
    patterns$level3 <- c(
      "_Segment[.]tsv$"
    );
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  else if (center == "mskcc.org" && 
           is.element(platform, c("hg-cgh-244a", "cgh-1x1m_g4447a"))) {
    patterns$level1 <- c(
      "^MSK_.*[.]txt$"
    );

    patterns$level2 <- c(
      "[.]data[.]txt$"
    );
    patterns$level3 <- c(
      "[.]CBS[.]txt$"
    );
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  else if (center == "jhu-usc.edu" && 
           platform == "humanmethylation27") {
    patterns$level1 <- c(
      "lvl-1[.].*[.]txt$"
    );

    patterns$level2 <- c(
      "lvl-2[.].*[.]txt$"
    );
    patterns$level3 <- c(
      "lvl-3[.].*[.]txt$"
    );
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  else if (center == "jhu-usc.edu" && 
           is.element(platform, c("illuminadnamethylation_oma002_cpi", "illuminadnamethylation_oma003_cpi"))) {
    patterns$level1 <- c(
      "cy3-cy5-value[.]txt$"
    );

    patterns$level2 <- c(
      "beta-value[.]txt$",
      "detection-p-value[.]txt$"
    );
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  else if (center == "unc.edu" && 
           is.element(platform, c("agilentg4502a_07_1", "agilentg4502a_07_2", "agilentg4502a_07_3"))) {
    patterns$level1 <- c(
      "_([a-Z]{3}|[0-9]{2})[0-9]{2}[.]txt$"
    );
    
    patterns$level2 <- c(
      "lmean[.]out[.]logratio[.]probe[.]tcga_level2[.]data[.]txt$"
    );
    
    patterns$level3 <- c(
      "lmean[.]out[.]logratio[.]gene[.]tcga_level3[.]data[.]txt$"
    );

  }
  
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  else if (center == "unc.edu" && 
           is.element(platform, c("h-mirna_8x15k", "h-mirna_8x15kv2"))) {
    patterns$level1 <- c(
        "_[a-Z]{3}[0-9]{2}_[0-9]_[0-9][.]txt$"
    );
    
    patterns$level2 <- c(
      "probe[.]tcga_level2[.]data[.]txt$"
    );
    
    patterns$level3 <- c(
      "gene[.]tcga_level3[.]data[.]txt$"
    );

  }

  else {
    throw("Unknown center & platform: ", center, ", ", platform);
  }


  patterns <- patterns[types];

  patterns;
})


############################################################################
# HISTORY:
# 2010-04-05
# o Added file patterns from jhu-usc.edu and unc.edu to getKnownFilePatterns().
# 2010-01-17
# o Updated listFiles() of TcgaDccDownloader to also handle the new DCC v3
#   MANIFEST.txt files that contains MD5 checksums.
# 2010-01-04
# o ROBUSTNESS: Updated getKnownFilePatterns() of TcgaDccDownloader to
#   "^[^.]+[.]copynumber[.]data[.]txt$" in order to avoid getting 
#   "[.]no_outlier[.]copynumber[.]data[.]txt$" files too.
# 2009-11-01
# o Added (mskcc.org, cgh-1x1m_g4447a) to getKnownFilePatterns().
# 2009-10-29
# o Added default argument 'ignore.case=TRUE' to listFiles(...) for
#   TcgaDccDownloader.
# 2009-10-22
# o Added argument 'onError' to downloadFile().
# o Added argument 'order' to downloadFilesByPattern().
# o Renamed downloadFiles() to downloadFilesByPattern().
# o Now downloadUrl() downloads to a temporary file and renames it after
#   download is complete.  This lower the risk for incomplete downloads.
# o Now downloadFile() only download files available in the MANIFEST file.
# o Added hasFiles() and hasFile() to TcgaDccDownloader.
# 2009-04-18
# o BUG FIX: Now the output directory passed to wget is put within double
#   quotes.
# o Now downloadUrl() removes pre-existing empty files by default.
# o Added accessor functions for path attributes.
# 2009-04-16
# o Added downloadFiles().
# o Added setLogin().
# 2009-04-13
# o Updated readIdf() and readSdrf() to locate the files via the manifest 
#   file.
# 2009-04-07
# o Created.
############################################################################
HenrikBengtsson/aroma.tcga documentation built on May 7, 2019, 2:51 a.m.