#############################################################################
# TCGA-Assembler : An open-source R program for downloading, processing and analyzing public TCGA data.
# Copyright (C) <2014> <Yitan Zhu>
# This file is part of TCGA-Assembler.
# TCGA-Assembler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# TCGA-Assembler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with TCGA-Assembler. If not, see <http://www.gnu.org/licenses/>.
############################################################################
##############################################################################
# TCGA-Assembler Version 1.0.3 Module A
##############################################################################
library(RCurl);
library(httr);
library(stringr);
######################### Main Functions of Module A #####################################################
# Crawl the TCGA DCC data directories breadth-first, starting from entryPoint,
# and store the URLs of every file and directory found in an .rda file.
#
# Arguments:
#   entryPoint - URL of the directory where the traversal starts.
#   fileLabel  - prefix of the output file; the result is saved as
#                "<fileLabel>_<Mon>-<day>-<year>.rda".
#
# Saved objects: file_url (sorted), dir_url, and upper_file_url (upper-case
# copy of file_url). Progress messages are printed along the way.
#
# Returns NULL invisibly.
TraverseAllDirectories <- function(entryPoint, fileLabel)
{
  # Suppress warnings for this call only; the caller's setting is put back on
  # exit, even when the traversal stops with an error.
  oldOptions = options(warn = -1);
  on.exit(options(oldOptions), add = TRUE);
  time1 = proc.time();
  # get all the files/directories under the entryPoint
  writeLines("**********************************************************************************");
  writeLines("");
  writeLines("Traverse data directories of TCGA DCC to gather URLs of TCGA data files");
  URLcontent = getTCGA_URL(entryPoint);
  file_url = URLcontent$file_url;
  dir_url = URLcontent$dir_url;
  # dir_url doubles as the work queue: directories discovered while visiting
  # entry `counter` are appended to it and visited in later iterations.
  counter = 1;
  while (counter <= length(dir_url))
  {
    URLcontent = getTCGA_URL(dir_url[counter]);
    file_url = c(file_url, URLcontent$file_url);
    dir_url = c(dir_url, URLcontent$dir_url);
    if ((counter %% 500) == 0)
    {
      writeLines(paste("Identified ", counter, " directories and ", length(file_url), " files.", sep = ""));
    }
    counter = counter + 1;
  }
  writeLines(paste("IN TOTAL, identified ", length(dir_url), " directories and ", length(file_url), " files.", sep = ""));
  # generate the filename to store results
  # date() pads single-digit days with an extra blank ("Jan  5"), so split on
  # runs of blanks; otherwise the day field is empty and the time of day ends
  # up in the file name instead of the year.
  dateFields = strsplit(date(), split = " +")[[1]];
  date_string = paste(dateFields[2], dateFields[3], dateFields[5], sep = "-");
  filename = paste(fileLabel, '_', date_string, '.rda', sep = "");
  file_url = file_url[order(file_url)];
  upper_file_url = toupper(file_url);
  save(file_url, dir_url, upper_file_url, file = filename);
  time = proc.time() - time1;
  writeLines(paste("Total elapsed time is ", round(time[3]/3600, digits = 3), " hours.", sep = ""));
  writeLines("");
  writeLines("**********************************************************************************");
  writeLines("\n");
  invisible(NULL);
}
# Download the Level 2 clinical data files of one cancer type.
#
# Arguments:
#   traverseResultFile - .rda file providing file_url and upper_file_url (the
#                        objects saved by TraverseAllDirectories).
#   saveFolderName     - folder the files are saved in; created recursively.
#   cancerType         - cancer type used in the "/<cancerType>/bcr/" part of
#                        the file URLs.
#   clinicalDataType   - character vector; every file whose URL contains
#                        "clinical_<type>" is downloaded.
#   outputFileName     - optional prefix; when non-empty, "<prefix>__" is put
#                        in front of each saved file name. "" or NULL means
#                        no prefix.
#
# Returns NULL invisibly.
DownloadClinicalData <- function(traverseResultFile, saveFolderName, cancerType, clinicalDataType, outputFileName = "")
{
  # Suppress warnings for this call only; the caller's setting is restored on
  # exit, also when a download fails with an error.
  oldOptions = options(warn = -1);
  on.exit(options(oldOptions), add = TRUE);
  writeLines("**********************************************************************************");
  writeLines("");
  writeLines(paste("Download clinical information of ", cancerType, " patients.", sep = ""));
  writeLines("Load information of TCGA data files.");
  load(traverseResultFile);
  Platform = "bio/clin";
  Institution = "nationwidechildrens.org";
  dir.create(path = saveFolderName, recursive = TRUE);
  # Test for NULL first and short-circuit: NULL != "" is logical(0), which
  # makes `if` fail when combined with the element-wise `&`.
  if (!is.null(outputFileName) && (outputFileName != ""))
  {
    outputFileName = paste(outputFileName, "__", sep = "");
  }
  for (IDIn in seq_along(Institution))
  {
    for (IDPl in seq_along(Platform))
    {
      DirURL = paste("/", cancerType, "/bcr/", Institution[IDIn], "/", Platform[IDPl], "/", sep = "");
      ID_DirURL = grep(pattern = toupper(DirURL), x = upper_file_url, ignore.case = FALSE);
      ID_DirURL = ID_DirURL[grep(pattern = toupper("Level_2"), x = upper_file_url[ID_DirURL], ignore.case = FALSE)];
      for (IDFile in seq_along(clinicalDataType))
      {
        FileName = paste("clinical_", clinicalDataType[IDFile], sep = "");
        ind = ID_DirURL[grep(pattern = toupper(FileName), x = upper_file_url[ID_DirURL], ignore.case = FALSE)];
        # Distinct file names (last path component) among the matching URLs.
        FileName = unique(vapply(strsplit(file_url[ind], split = "/"), function(x) x[length(x)], character(1)));
        for (FileNameIndex in seq_along(FileName))
        {
          ind = ID_DirURL[grepEnd(pattern = toupper(FileName[FileNameIndex]), x = upper_file_url[ID_DirURL], ignore.case = FALSE)];
          if (length(ind) == 0)
          {
            next;
          }
          # The same file name can occur under several URLs; GetNewestURL
          # chooses the one to download.
          if (length(ind) > 1)
          {
            URL = GetNewestURL(AllURL = file_url[ind]);
          }else{
            URL = file_url[ind];
          }
          SaveFileName = paste(saveFolderName, "/", outputFileName, FileName[FileNameIndex], sep = "");
          downloadFile(url = URL, saveFileName = SaveFileName);
        }
      }
    }
  }
  writeLines("");
  writeLines("**********************************************************************************");
  writeLines("\n");
  invisible(NULL);
}
# Download the Level 2 biospecimen data files of one cancer type.
#
# Arguments:
#   traverseResultFile  - .rda file providing file_url and upper_file_url (the
#                         objects saved by TraverseAllDirectories).
#   saveFolderName      - folder the files are saved in; created recursively.
#   cancerType          - cancer type used in the "/<cancerType>/bcr/" part of
#                         the file URLs.
#   biospecimenDataType - character vector; every file whose URL contains
#                         "biospecimen_<type>" is downloaded.
#   outputFileName      - optional prefix; when non-empty, "<prefix>__" is put
#                         in front of each saved file name. "" or NULL means
#                         no prefix.
#
# Returns NULL invisibly.
DownloadBiospecimenData <- function(traverseResultFile, saveFolderName, cancerType, biospecimenDataType, outputFileName = "")
{
  # Suppress warnings for this call only; the caller's setting is restored on
  # exit, also when a download fails with an error.
  oldOptions = options(warn = -1);
  on.exit(options(oldOptions), add = TRUE);
  writeLines("**********************************************************************************");
  writeLines("");
  writeLines(paste("Download biospecimen information of ", cancerType, " patients.", sep = ""));
  writeLines("Load information of TCGA data files.");
  load(traverseResultFile);
  Platform = "bio/clin";
  Institution = "nationwidechildrens.org";
  dir.create(path = saveFolderName, recursive = TRUE);
  # Test for NULL first and short-circuit: NULL != "" is logical(0), which
  # makes `if` fail when combined with the element-wise `&`.
  if (!is.null(outputFileName) && (outputFileName != ""))
  {
    outputFileName = paste(outputFileName, "__", sep = "");
  }
  for (IDIn in seq_along(Institution))
  {
    for (IDPl in seq_along(Platform))
    {
      DirURL = paste("/", cancerType, "/bcr/", Institution[IDIn], "/", Platform[IDPl], "/", sep = "");
      ID_DirURL = grep(pattern = toupper(DirURL), x = upper_file_url, ignore.case = FALSE);
      ID_DirURL = ID_DirURL[grep(pattern = toupper("Level_2"), x = upper_file_url[ID_DirURL], ignore.case = FALSE)];
      for (IDFile in seq_along(biospecimenDataType))
      {
        FileName = paste("biospecimen_", biospecimenDataType[IDFile], sep = "");
        ind = ID_DirURL[grep(pattern = toupper(FileName), x = upper_file_url[ID_DirURL], ignore.case = FALSE)];
        # Distinct file names (last path component) among the matching URLs.
        FileName = unique(vapply(strsplit(file_url[ind], split = "/"), function(x) x[length(x)], character(1)));
        for (FileNameIndex in seq_along(FileName))
        {
          ind = ID_DirURL[grepEnd(pattern = toupper(FileName[FileNameIndex]), x = upper_file_url[ID_DirURL], ignore.case = FALSE)];
          if (length(ind) == 0)
          {
            next;
          }
          # The same file name can occur under several URLs; GetNewestURL
          # chooses the one to download.
          if (length(ind) > 1)
          {
            URL = GetNewestURL(AllURL = file_url[ind]);
          }else{
            URL = file_url[ind];
          }
          SaveFileName = paste(saveFolderName, "/", outputFileName, FileName[FileNameIndex], sep = "");
          downloadFile(url = URL, saveFileName = SaveFileName);
        }
      }
    }
  }
  writeLines("");
  writeLines("**********************************************************************************");
  writeLines("\n");
  invisible(NULL);
}
# Download the Level 3 copy number segment files of one cancer type from the
# broad.mit.edu archive and combine them per file type.
#
# Arguments:
#   traverseResultFile - .rda file providing file_url and upper_file_url (the
#                        objects saved by TraverseAllDirectories); the part of
#                        its name after the last "_" is reused in the output
#                        file names.
#   saveFolderName     - folder the output files are written to; created
#                        recursively.
#   cancerType         - cancer type used in the "/<cancerType>/cgcc/" part of
#                        the file URLs.
#   assayPlatform      - platform directory name.
#   tissueType         - optional vector of sample type options ("TP", "NT",
#                        ...); they are translated to two-digit codes and
#                        compared with characters 14-15 of the barcode.
#   inputPatientIDs    - optional barcodes; "." is replaced by "-" and the
#                        result upper-cased before matching against the SDRF
#                        barcodes with grepBeginning.
#   outputFileName     - optional prefix; when non-empty, "<prefix>__" is put
#                        in front of each output file name. "" or NULL means
#                        no prefix.
#
# Returns a list of four matrices (hg18, hg19, nocnv_hg18, nocnv_hg19, each
# name prefixed with "<cancerType>__broad.mit.edu__<assayPlatform>__") whose
# first row holds the column names, or NULL when the function stops early
# (missing or unreadable SDRF file, no Level 3 data, no matching samples).
# Each matrix is also written to a tab-delimited text file.
DownloadCNAData <- function(traverseResultFile, saveFolderName, cancerType, assayPlatform = "genome_wide_snp_6", tissueType = NULL, inputPatientIDs = NULL, outputFileName = "")
{
  # Suppress warnings for this call only. Restoring through on.exit covers the
  # early returns below, which would otherwise leave warnings switched off.
  oldOptions = options(warn = -1);
  on.exit(options(oldOptions), add = TRUE);
  writeLines("**********************************************************************************");
  writeLines("");
  writeLines(paste("Download copy number data of ", cancerType, " patients.", sep = ""));
  if (!is.null(inputPatientIDs))
  {
    inputPatientIDs = toupper(gsub(pattern = "\\.", replacement = "-", x = inputPatientIDs));
  }
  # Test for NULL first and short-circuit: NULL != "" is logical(0), which
  # makes `if` fail when combined with the element-wise `&`.
  if (!is.null(outputFileName) && (outputFileName != ""))
  {
    outputFileName = paste(outputFileName, "__", sep = "");
  }
  # load directory traverse result and create folder to store output files
  writeLines("Load information of TCGA data files.");
  load(traverseResultFile);
  dir.create(path = saveFolderName, recursive = TRUE);
  SpecificID = grep(pattern = toupper(paste("/", cancerType, "/cgcc/broad\\.mit\\.edu/", assayPlatform, "/", sep = "")), x = upper_file_url, ignore.case = FALSE);
  IDLevel_3InFile_Url = SpecificID[grep(pattern = toupper("Level_3"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
  SpecificName = paste(cancerType, "__broad.mit.edu__", assayPlatform, sep = "");
  # download and process Sample and Data Relationship Format (SDRF) file
  ind = SpecificID[grepEnd(pattern = toupper("\\.sdrf\\.txt"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
  if (length(ind) == 0)
  {
    writeLines("Program existed due to missing SDRF file.");
    return();
  }
  if (length(ind) > 1)
  {
    URL = GetNewestURL(AllURL = file_url[ind]);
  }else{
    URL = file_url[ind];
  }
  downloadResult = urlReadTable(url = URL);
  if (downloadResult$errorFlag != 0)
  {
    writeLines("Error in downloading SDRF file.");
    return();
  }
  # The whole SDRF table is upper-cased, so every later comparison against it
  # uses upper-case values.
  sdrf = toupper(downloadResult$data);
  level_3_filename_column = which(sdrf[1, ] == toupper("Derived Array Data File"));
  DataLevelColID = which(sdrf[1, ] == toupper("Comment [TCGA Data Level]"));
  # Keep only the last data-level columns, as many as there are file name columns.
  DataLevelColID = DataLevelColID[(length(DataLevelColID)-length(level_3_filename_column)+1):length(DataLevelColID)];
  BarcodeColID = which(sdrf[1, ] == toupper("Comment [TCGA Barcode]"));
  colnames(sdrf) = sdrf[1, ];
  sdrf = unique(sdrf[2:dim(sdrf)[1], , drop = FALSE]);
  # Keep the rows that are marked as Level 3 in every data-level column.
  Level3_ID = 1:dim(sdrf)[1];
  for (DataLevelColIDIndex in 1:length(DataLevelColID))
  {
    Level3_ID = sort(intersect(Level3_ID, union(which(sdrf[, DataLevelColID[DataLevelColIDIndex]] == "LEVEL_3"),
                                                 which(sdrf[, DataLevelColID[DataLevelColIDIndex]] == "LEVEL 3"))), decreasing = FALSE);
  }
  if (length(Level3_ID) == 0)
  {
    writeLines("Error: there are no Level 3 data");
    return();
  }
  # The code below reads column 1 as the barcode and every even column as a
  # data file name, i.e. it assumes the barcode column precedes the
  # (file name, data level) column pairs in the SDRF.
  sdrf = unique(sdrf[Level3_ID, sort(c(BarcodeColID, level_3_filename_column, DataLevelColID), decreasing = FALSE), drop = FALSE]);
  # If specific patient TCGA barcodes are inputted, only download data of the specified samples.
  if (!is.null(inputPatientIDs))
  {
    indInputPatientID = c();
    for (i in seq_along(inputPatientIDs))
    {
      indInputPatientID = c(indInputPatientID, grepBeginning(pattern = toupper(inputPatientIDs[i]), x = sdrf[, 1], ignore.case = FALSE));
    }
    if (length(indInputPatientID) == 0)
    {
      writeLines("No Level 3 data for the inputted TCGA barcodes.");
      return();
    }else{
      sdrf = sdrf[indInputPatientID, , drop = FALSE];
    }
  }
  # Download data of specified tissue
  if (!is.null(tissueType))
  {
    SampleType = cbind(Options = c("TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM"),
                       Code = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"));
    sdrf = sdrf[substr(sdrf[, 1], 14, 15) %in% SampleType[SampleType[, "Options"] %in% tissueType, "Code"], , drop = FALSE];
  }
  if (dim(sdrf)[1] == 0)
  {
    writeLines("No available data.");
    return();
  }
  # One accumulator per file type; rows of all samples are appended to it.
  CopyNumberData = vector("list", 4);
  names(CopyNumberData) = c("hg18", "hg19", "nocnv_hg18", "nocnv_hg19");
  for (i in 1:length(CopyNumberData))
  {
    CopyNumberData[[i]] = matrix("", 0, 6);
  }
  for (i in 1:dim(sdrf)[1])
  {
    time1 = proc.time();
    sample_TCGA_id = sdrf[i, 1];
    for (DataFileID in 1:floor(dim(sdrf)[2]/2))
    {
      DataFileName_ID = sdrf[i, DataFileID*2];
      # The file type is the third-from-last dot-separated field of the file name.
      DataFileType_ID = strsplit(DataFileName_ID, split = "\\.")[[1]];
      DataFileType_ID = DataFileType_ID[length(DataFileType_ID)-2];
      CellID = which(toupper(names(CopyNumberData)) == DataFileType_ID);
      ind = IDLevel_3InFile_Url[grepEnd(pattern = DataFileName_ID, x = upper_file_url[IDLevel_3InFile_Url], ignore.case = FALSE)];
      if (length(ind) == 0)
      {
        next;
      }
      # A file of an unrecognised type has no accumulator to go into; skip it
      # instead of failing on CopyNumberData[[CellID]] after the download.
      if (length(CellID) != 1)
      {
        next;
      }
      if (length(ind) > 1)
      {
        URL = GetNewestURL(AllURL = file_url[ind]);
      }else{
        URL = file_url[ind];
      }
      downloadResult = urlReadTable(url = URL);
      if (downloadResult$errorFlag != 0)
      {
        next;
      }
      s = downloadResult$data;
      colnames(s) = s[1, ];
      s = s[2:dim(s)[1], , drop = FALSE];
      ChromosomeID = grep(pattern = "Chromosome", x = colnames(s), ignore.case = TRUE);
      StartID = grep(pattern = "Start", x = colnames(s), ignore.case = TRUE);
      EndID = grep(pattern = "End", x = colnames(s), ignore.case = TRUE);
      NumProbesID = grep(pattern = "Num_Probes", x = colnames(s), ignore.case = TRUE);
      SegmentMeanID = grep(pattern = "Segment_Mean", x = colnames(s), ignore.case = TRUE);
      # Recode the sex chromosomes as 23 (X) and 24 (Y).
      IDX = which(toupper(s[, ChromosomeID]) == "X");
      s[IDX, ChromosomeID] = "23";
      IDY = which(toupper(s[, ChromosomeID]) == "Y");
      s[IDY, ChromosomeID] = "24";
      CopyNumberData[[CellID]] = rbind(CopyNumberData[[CellID]], cbind(Sample = rep(sample_TCGA_id, dim(s)[1]),
                                                                      s[, c(ChromosomeID, StartID, EndID, NumProbesID, SegmentMeanID), drop = FALSE]));
    }
    time = proc.time() - time1;
    writeLines(paste("Downloaded - ", SpecificName, " - Sample ", i, " out of ", dim(sdrf)[1], ". ", round(time[3], digits = 1), " seconds elapsed.", sep = ""));
  }
  writeLines("Save data to local disk.");
  # Position of the last "_" in the traverse result file name; the text after
  # it (without the 4-character extension) is appended to the output names.
  ID = str_locate_all(traverseResultFile, "_")[[1]];
  ID = ID[dim(ID)[1], 2];
  for (FileID in 1:length(CopyNumberData))
  {
    filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__", names(CopyNumberData)[FileID], "__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
    write.table(CopyNumberData[[FileID]], file = filename, quote = FALSE, sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
    # For returning downloaded data
    CopyNumberData[[FileID]] = rbind(colnames(CopyNumberData[[FileID]]), CopyNumberData[[FileID]]);
    colnames(CopyNumberData[[FileID]]) = NULL;
    rownames(CopyNumberData[[FileID]]) = NULL;
  }
  writeLines("");
  writeLines("**********************************************************************************");
  writeLines("\n");
  # Return downloaded data
  names(CopyNumberData) = paste(SpecificName, names(CopyNumberData), sep = "__");
  return(CopyNumberData);
}
# Download the Level 3 DNA methylation files of one cancer type from the
# jhu-usc.edu archive and merge them into one probe-by-sample table.
#
# Arguments:
#   traverseResultFile - .rda file providing file_url and upper_file_url (the
#                        objects saved by TraverseAllDirectories); the part of
#                        its name after the last "_" is reused in the output
#                        file name.
#   saveFolderName     - folder the output file is written to; created
#                        recursively.
#   cancerType         - cancer type used in the "/<cancerType>/cgcc/" part of
#                        the file URLs.
#   assayPlatform      - platform directory name.
#   tissueType         - optional vector of sample type options ("TP", "NT",
#                        ...); they are translated to two-digit codes and
#                        compared with characters 14-15 of the barcode.
#   inputPatientIDs    - optional barcodes; "." is replaced by "-" and the
#                        result upper-cased before matching against the SDRF
#                        barcodes with grepBeginning.
#   outputFileName     - optional prefix; when non-empty, "<prefix>__" is put
#                        in front of the output file name. "" or NULL means
#                        no prefix.
#
# Returns a character matrix without dimnames: a header row, then one row per
# probe with four annotation columns followed by one column per sample. The
# same table is written to a tab-delimited text file. Returns NULL when the
# function stops early (missing or unreadable SDRF file, no Level 3 data, no
# matching samples, or no data file could be downloaded).
DownloadMethylationData <- function(traverseResultFile, saveFolderName, cancerType, assayPlatform, tissueType = NULL, inputPatientIDs = NULL, outputFileName = "")
{
  # Suppress warnings for this call only. Restoring through on.exit covers the
  # early returns below, which would otherwise leave warnings switched off.
  oldOptions = options(warn = -1);
  on.exit(options(oldOptions), add = TRUE);
  writeLines("**********************************************************************************");
  writeLines("");
  writeLines(paste("Download DNA methylation data of ", cancerType, " patients.", sep = ""));
  # Check whether specific TCGA patient IDs are inputted.
  if (!is.null(inputPatientIDs))
  {
    inputPatientIDs = toupper(gsub(pattern = "\\.", replacement = "-", x = inputPatientIDs));
  }
  # Test for NULL first and short-circuit: NULL != "" is logical(0), which
  # makes `if` fail when combined with the element-wise `&`.
  if (!is.null(outputFileName) && (outputFileName != ""))
  {
    outputFileName = paste(outputFileName, "__", sep = "");
  }
  # load directory traverse result and create folder to store output files
  writeLines("Load information of TCGA data files.");
  load(traverseResultFile);
  dir.create(path = saveFolderName, recursive = TRUE);
  SpecificID = grep(pattern = toupper(paste("/", cancerType, "/cgcc/jhu-usc\\.edu/", assayPlatform, "/", sep = "")), x = upper_file_url, ignore.case = FALSE);
  MeLevel3ID = SpecificID[grep(pattern = toupper("Level_3"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
  # search for the Sample and Data Relationship Format (SDRF) file of the specified platform and cancer type
  ind = SpecificID[grepEnd(pattern = toupper("\\.sdrf\\.txt"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
  if (length(ind) == 0)
  {
    writeLines("Program existed due to missing SDRF file.");
    return();
  }
  if (length(ind) > 1)
  {
    URL = GetNewestURL(AllURL = file_url[ind]);
  }else{
    URL = file_url[ind];
  }
  downloadResult = urlReadTable(url = URL);
  if (downloadResult$errorFlag != 0)
  {
    writeLines("Error in downloading SDRF file.");
    return();
  }
  # The whole SDRF table is upper-cased, so every later comparison against it
  # uses upper-case values.
  sdrf = toupper(downloadResult$data);
  # Process SDRF file, identify the columns of level 3 data file name and TCGA sample barcodes.
  level_3_filename_column = max(grep(pattern = "Data Matrix File", x = sdrf[1, ], ignore.case = TRUE));
  DataLevelColID = max(grep(pattern = "TCGA Data Level", x = sdrf[1, ], ignore.case = TRUE));
  TCGABarcodeID = min(grep(pattern = "TCGA Barcode", x = sdrf[1, ], ignore.case = TRUE));
  colnames(sdrf) = sdrf[1, ];
  sdrf = unique(sdrf[2:dim(sdrf)[1], , drop = FALSE]);
  # Reduce to three columns: 1 = barcode, 2 = data level, 3 = data file name.
  sdrf = sdrf[!duplicated(sdrf[, level_3_filename_column]), c(TCGABarcodeID, DataLevelColID, level_3_filename_column), drop = FALSE];
  SDRFID = sort(union(which(sdrf[, 2] == "LEVEL_3"), which(sdrf[, 2] == "LEVEL 3")), decreasing = FALSE);
  if (length(SDRFID) == 0)
  {
    writeLines("Error: there are no Level 3 data");
    return();
  }
  sdrf = sdrf[SDRFID, , drop = FALSE];
  # If specific patient TCGA barcodes are inputted, only download the specified samples.
  if (!is.null(inputPatientIDs))
  {
    indInputPatientID = c();
    for (i in seq_along(inputPatientIDs))
    {
      indInputPatientID = c(indInputPatientID, grepBeginning(pattern = inputPatientIDs[i], x = sdrf[, 1], ignore.case = FALSE));
    }
    if (length(indInputPatientID) == 0)
    {
      writeLines("No Level 3 data for the inputted TCGA barcodes.");
      return();
    }else{
      sdrf = sdrf[indInputPatientID, , drop = FALSE];
    }
  }
  # Download data of specified tissue
  if (!is.null(tissueType))
  {
    SampleType = cbind(Options = c("TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM"),
                       Code = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"));
    sdrf = sdrf[substr(sdrf[, 1], 14, 15) %in% SampleType[SampleType[, "Options"] %in% tissueType, "Code"], , drop = FALSE];
  }
  if (dim(sdrf)[1] == 0)
  {
    writeLines("No available data.");
    return();
  }
  # Download data files of all samples.
  left_columns = NULL;
  AllPosition = NULL;
  exp_names = NULL;
  data = NULL;
  for (i in 1:dim(sdrf)[1])
  {
    time1 = proc.time();
    sample_TCGA_id = sdrf[i, 1];
    ind = MeLevel3ID[grepEnd(pattern = toupper(sdrf[i, 3]), x = upper_file_url[MeLevel3ID], ignore.case = FALSE)];
    if (length(ind) == 0)
    {
      next;
    }
    if (length(ind) > 1)
    {
      URL = GetNewestURL(AllURL = file_url[ind]);
    }else{
      URL = file_url[ind];
    }
    downloadResult = urlReadTable(url = URL);
    if (downloadResult$errorFlag != 0)
    {
      next;
    }
    s = downloadResult$data;
    # Drop the first line of the file; the next line stays as row 1 and is
    # later used as the header of the annotation columns.
    s = s[2:dim(s)[1], , drop = FALSE];
    # Numeric chromosome code taken from column 4: 1-22, X = 23, Y = 24, and
    # 0 for anything else (including the header row).
    chr = rep(0, dim(s)[1]);
    for (j in 1:22)
    {
      IDj = which(s[, 4] == as.character(j));
      chr[IDj] = j;
    }
    IDj = which(toupper(s[, 4]) == "X");
    chr[IDj] = 23;
    IDj = which(toupper(s[, 4]) == "Y");
    chr[IDj] = 24;
    position = rep(0, dim(s)[1]);
    position[2:length(position)] = as.numeric(s[2:dim(s)[1], 5]);
    # Single sort key combining chromosome code and the column 5 coordinate.
    Yj = chr*(10e+10) + position;
    orderIDj = order(Yj, decreasing = FALSE);
    Yj = Yj[orderIDj];
    s = s[orderIDj, , drop = FALSE];
    # Need to check whether every data file has the same methylation probes
    if (is.null(left_columns))
    {
      left_columns = s[, c(1, 3, 4, 5), drop = FALSE];
      AllPosition = Yj;
      exp_names = sample_TCGA_id;
      data = s[2:dim(s)[1], 2, drop = FALSE];
    }else{
      # identical() also copes with NA coordinates and with files of a
      # different length, where an element-wise comparison inside `if` fails.
      if (!identical(AllPosition, Yj))
      {
        next;
      }
      exp_names = c(exp_names, sample_TCGA_id);
      data = cbind(data, s[2:dim(s)[1], 2, drop = FALSE]);
    }
    time = proc.time() - time1;
    writeLines(paste("Downloaded - ", cancerType, "__jhu-usc.edu__", assayPlatform, " - sample ", i, " out of ", dim(sdrf)[1], ". ", round(time[3], digits = 1), " seconds elapsed.", sep = ""));
  }
  # Nothing could be downloaded: stop here instead of failing while building
  # the output table from NULL.
  if (is.null(left_columns))
  {
    writeLines("No available data.");
    return();
  }
  writeLines("Save data to local disk.");
  # Position of the last "_" in the traverse result file name; the text after
  # it (without the 4-character extension) is appended to the output name.
  ID = str_locate_all(traverseResultFile, "_")[[1]];
  ID = ID[dim(ID)[1], 2];
  filename = paste(saveFolderName, "/", outputFileName, cancerType, "__jhu-usc.edu__", assayPlatform, "__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
  header = c(left_columns[1, ], exp_names);
  left_columns = left_columns[2:dim(left_columns)[1], , drop = FALSE];
  # Drop the probes whose third annotation column (taken from column 4 of the
  # data file) starts with "NA".
  ID = grepBeginning(pattern = "NA", x = left_columns[, 3], ignore.case = TRUE);
  ID = sort(setdiff(1:dim(left_columns)[1], ID), decreasing = FALSE);
  left_columns = left_columns[ID, , drop = FALSE];
  data = data[ID, , drop = FALSE];
  # Build the table once; it is both written to disk and returned.
  downloadedData = rbind(header, cbind(left_columns, data));
  rownames(downloadedData) = NULL;
  colnames(downloadedData) = NULL;
  write.table(downloadedData, file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
  writeLines("");
  writeLines("**********************************************************************************");
  writeLines("\n");
  # Return downloaded data
  downloadedData;
}
DownloadRNASeqData <-function (traverseResultFile, saveFolderName, cancerType, assayPlatform, dataType = "", tissueType = NULL, inputPatientIDs = NULL, outputFileName = "")
{
options(warn=-1);
writeLines("**********************************************************************************");
writeLines("");
writeLines(paste("Download Gene Expression data of ", cancerType, " patients.", sep = ""));
# Check whether specific TCGA patient IDs are inputted.
if (!is.null(inputPatientIDs))
{
inputPatientIDs = toupper(gsub(pattern = "\\.", replacement = "-", x = inputPatientIDs));
}
if ((outputFileName != "") & (!is.null(outputFileName)))
{
outputFileName = paste(outputFileName, "__", sep = "");
}
# load directory traverse result and create folder to store output files
writeLines("Load information of TCGA data files.");
load(traverseResultFile);
dir.create(path = saveFolderName, recursive = TRUE);
if (assayPlatform == "RNASeqV1")
{
platform = c("illuminaga_rnaseq", "illuminahiseq_rnaseq");
Institution = c("unc.edu", "bcgsc.ca");
}
if (assayPlatform == "RNASeqV2")
{
platform = c("illuminaga_rnaseqv2", "illuminahiseq_rnaseqv2");
Institution = c("unc.edu");
}
if (assayPlatform == "Microarray")
{
platform = c("agilentg4502a_07_3", "ht_hg-u133a", "agilentg4502a_07_1", "agilentg4502a_07_2", "hg-u133_plus_2");
Institution = c("unc.edu", "broad.mit.edu", "genome.wustl.edu");
dataType = "";
}
# For returning downloaded data
downloadedData = vector("list", 0);
dataIndex = 0;
# download RNASeqV2 data
if (assayPlatform == "RNASeqV2")
{
for (IDin in 1:length(Institution))
{
for (IDpl in 1:length(platform))
{
SpecificName = paste(cancerType, "__", Institution[IDin], "__", platform[IDpl], sep = "");
SpecificID = grep(pattern = toupper(paste("/", cancerType, "/cgcc/", Institution[IDin], "/", platform[IDpl], "/", sep = "")), x = upper_file_url, ignore.case = FALSE);
RNALevel3ID = SpecificID[grep(pattern = toupper("Level_3"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
ind = SpecificID[grepEnd(pattern = toupper("sdrf\\.txt"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
if (length(ind) == 0)
{
next;
}
if (length(ind) > 1)
{
URL = GetNewestURL(AllURL = file_url[ind]);
}else{
URL = file_url[ind];
}
downloadResult = urlReadTable(url = URL);
if (downloadResult$errorFlag != 0)
{
next;
}
sdrf = toupper(downloadResult$data);
level_3_filename_column = max(grep(pattern = "Derived Data File", x = sdrf[1, ], ignore.case = TRUE));
DataLevelColID = max(grep(pattern = "Comment \\[TCGA Data Level]", x = sdrf[1, ], ignore.case = TRUE));
ExtractNameColID = grep(pattern = "Comment \\[TCGA Barcode]", x = sdrf[1, ], ignore.case = TRUE);
RefGenomeColID = grep(pattern = "Comment \\[Genome reference]", x = sdrf[1, ], ignore.case = TRUE);
if (length(ExtractNameColID) == 0)
{
ExtractNameColID = min(grep(pattern = "Extract Name", x = sdrf[1, ], ignore.case = TRUE));
}
colnames(sdrf) = sdrf[1, ];
sdrf = unique(sdrf[2:dim(sdrf)[1], , drop = FALSE]);
Level3_ID = sort(union(which(sdrf[, DataLevelColID] == "LEVEL_3"), which(sdrf[, DataLevelColID] == "LEVEL 3")), decreasing = FALSE);
if (length(Level3_ID) == 0)
{
next;
}
sdrf = sdrf[Level3_ID, c(ExtractNameColID, level_3_filename_column, RefGenomeColID), drop = FALSE];
sdrf = sdrf[!duplicated(sdrf[, 2]), , drop = FALSE];
# Only keep the file information for the data types that should be downloaded.
keepID = c();
for (keep_i in 1:length(dataType))
{
keepID = c(keepID, grep(pattern = dataType[keep_i], x = sdrf[, 2], ignore.case = TRUE))
}
sdrf = sdrf[sort(unique(keepID), decreasing = FALSE), , drop = FALSE];
# If specific patient TCGA barcodes are inputted, only download the specified samples.
if (!is.null(inputPatientIDs))
{
indInputPatientID = c();
for (i in 1:length(inputPatientIDs))
{
indInputPatientID = c(indInputPatientID, grepBeginning(pattern = toupper(inputPatientIDs[i]), x = sdrf[, 1], ignore.case = FALSE));
}
if (length(indInputPatientID) == 0)
{
next;
}else{
sdrf = sdrf[indInputPatientID, , drop = FALSE];
}
}
# Download data of specified tissue
if (!is.null(tissueType))
{
SampleType = cbind(Options = c("TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM"),
Code = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"));
sdrf = sdrf[substr(sdrf[, 1], 14, 15) %in% SampleType[SampleType[, "Options"] %in% tissueType, "Code"], , drop = FALSE];
}
if (dim(sdrf)[1] == 0)
{
next;
}
# Start to download data files
exp_gene = NULL;
column_gene = NULL;
gene_left_column = NULL;
gene_data = NULL;
exp_gene_normalized = NULL;
column_gene_normalized = NULL;
gene_normalized_left_column = NULL;
gene_normalized_data = NULL;
exp_isoform = NULL;
column_isoform = NULL;
isoform_left_column = NULL;
isoform_data = NULL;
exp_isoform_normalized = NULL;
column_isoform_normalized = NULL;
isoform_normalized_left_column = NULL;
isoform_normalized_data = NULL;
exp_exon = NULL;
column_exon = NULL;
exon_left_column = NULL;
exon_data=NULL;
exp_junction = NULL;
column_junction = NULL;
junction_left_column = NULL;
junction_data=NULL;
for (i in 1:dim(sdrf)[1])
{
time1 = proc.time();
sample_TCGA_id = sdrf[i, 1];
ind = RNALevel3ID[grepEnd(pattern = sdrf[i, 2], x = upper_file_url[RNALevel3ID], ignore.case = FALSE)];
if (length(ind) == 0)
{
next;
}
if (length(ind) > 1)
{
URL = GetNewestURL(AllURL = file_url[ind]);
}else{
URL = file_url[ind];
}
downloadResult = urlReadTable(url = URL, dataType = "GE");
if (downloadResult$errorFlag != 0)
{
next;
}
s = downloadResult$data;
# read gene expression data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("rsem.genes.results"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(gene_left_column))
{
gene_left_column = s[, c(1, 4), drop = FALSE];
gene_data = s[, c(2, 3), drop = FALSE];
exp_gene = c(sample_TCGA_id, sample_TCGA_id);
column_gene = c("raw_count", "scaled_estimate");
}else{
if (sum(gene_left_column[, 1] != s[, 1]) > 0)
{
next;
}
gene_data = cbind(gene_data, s[, c(2, 3), drop = FALSE]);
exp_gene = c(exp_gene, sample_TCGA_id, sample_TCGA_id);
column_gene = c(column_gene, "raw_count", "scaled_estimate");
}
}
}
# read gene normalized data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("rsem.genes.normalized_results"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(gene_normalized_left_column))
{
gene_normalized_left_column = s[, 1, drop = FALSE];
gene_normalized_data = s[, 2, drop = FALSE];
exp_gene_normalized = sample_TCGA_id;
column_gene_normalized = "normalized_count";
}else{
if (sum(gene_normalized_left_column[, 1] != s[, 1]) > 0)
{
next;
}
gene_normalized_data = cbind(gene_normalized_data, s[, 2, drop = FALSE]);
exp_gene_normalized = c(exp_gene_normalized, sample_TCGA_id);
column_gene_normalized = c(column_gene_normalized, "normalized_count");
}
}
}
# read isoform data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("rsem.isoforms.results"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(isoform_left_column))
{
isoform_left_column = s[, 1, drop = FALSE];
isoform_data = s[, c(2, 3), drop = FALSE];
exp_isoform = c(sample_TCGA_id, sample_TCGA_id);
column_isoform = c("raw_count", "scaled_estimate");
}else{
if (sum(isoform_left_column[, 1] != s[, 1]) > 0)
{
next;
}
isoform_data = cbind(isoform_data, s[, c(2, 3), drop = FALSE]);
exp_isoform = c(exp_isoform, sample_TCGA_id, sample_TCGA_id);
column_isoform = c(column_isoform, "raw_count", "scaled_estimate");
}
}
}
# read isoform normalized data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("rsem.isoforms.normalized_results"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(isoform_normalized_left_column))
{
isoform_normalized_left_column = s[, 1, drop = FALSE];
isoform_normalized_data = s[, 2, drop = FALSE];
exp_isoform_normalized = sample_TCGA_id;
column_isoform_normalized = "normalized_count";
}else{
if (sum(isoform_normalized_left_column[, 1] != s[, 1]) > 0)
{
next;
}
isoform_normalized_data = cbind(isoform_normalized_data, s[, 2, drop = FALSE]);
exp_isoform_normalized = c(exp_isoform_normalized, sample_TCGA_id);
column_isoform_normalized = c(column_isoform_normalized, "normalized_count");
}
}
}
# read exon data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("exon_quantification"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(exon_left_column))
{
exon_left_column = s[, 1, drop = FALSE];
exon_data = s[, 2:4, drop = FALSE];
exp_exon = c(sample_TCGA_id, sample_TCGA_id, sample_TCGA_id);
column_exon = c("raw_counts", "median_length_normalized", "RPKM");
}else{
if (sum(exon_left_column[, 1] != s[, 1]) > 0)
{
next;
}
exon_data = cbind(exon_data, s[, 2:4, drop = FALSE]);
exp_exon = c(exp_exon, sample_TCGA_id, sample_TCGA_id, sample_TCGA_id);
column_exon = c(column_exon, "raw_counts", "median_length_normalized", "RPKM");
}
}
}
# read junction data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("junction_quantification"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(junction_left_column))
{
junction_left_column = s[, 1, drop = FALSE];
junction_data = s[, 2, drop = FALSE];
exp_junction = sample_TCGA_id;
column_junction = "raw_counts";
}else{
if (sum(junction_left_column[, 1] != s[, 1]) > 0)
{
next;
}
junction_data = cbind(junction_data, s[, 2, drop = FALSE]);
exp_junction = c(exp_junction, sample_TCGA_id);
column_junction = c(column_junction, "raw_counts");
}
}
}
time = proc.time() - time1;
writeLines(paste("Downloaded - ", SpecificName, " - file ", i, " out of ", dim(sdrf)[1], ". ", round(time[3], digits = 1), " seconds elapsed.", sep = ""));
}
writeLines("Save data to local disk.");
ID = str_locate_all(traverseResultFile, "_")[[1]];
ID = ID[dim(ID)[1], 2];
if (length(grep(pattern = "rsem.genes.results", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__rsem.genes.results__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", "Hybridization REF", exp_gene), c("gene_id", "transcript_id", column_gene), cbind(gene_left_column, gene_data)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", "Hybridization REF", exp_gene), c("gene_id", "transcript_id", column_gene), cbind(gene_left_column, gene_data));
names(downloadedData)[dataIndex] = paste(SpecificName, "rsem.genes.results", sep = "__");
}
if (length(grep(pattern = "rsem.genes.normalized_results", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__rsem.genes.normalized_results__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_gene_normalized), c("gene_id", column_gene_normalized), cbind(gene_normalized_left_column, gene_normalized_data)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_gene_normalized), c("gene_id", column_gene_normalized), cbind(gene_normalized_left_column, gene_normalized_data));
names(downloadedData)[dataIndex] = paste(SpecificName, "rsem.genes.normalized_results", sep = "__");
}
if (length(grep(pattern = "rsem.isoforms.results", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__rsem.isoforms.results__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_isoform), c("isoform_id", column_isoform), cbind(isoform_left_column, isoform_data)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_isoform), c("isoform_id", column_isoform), cbind(isoform_left_column, isoform_data));
names(downloadedData)[dataIndex] = paste(SpecificName, "rsem.isoforms.results", sep = "__");
}
if (length(grep(pattern = "rsem.isoforms.normalized_results", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__rsem.isoforms.normalized_results__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_isoform_normalized), c("isoform_id", column_isoform_normalized), cbind(isoform_normalized_left_column, isoform_normalized_data)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_isoform_normalized), c("isoform_id", column_isoform_normalized), cbind(isoform_normalized_left_column, isoform_normalized_data));
names(downloadedData)[dataIndex] = paste(SpecificName, "rsem.isoforms.normalized_results", sep = "__");
}
if (length(grep(pattern = "exon_quantification", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__exon_quantification__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_exon), c("exon", column_exon), cbind(exon_left_column, exon_data)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_exon), c("exon", column_exon), cbind(exon_left_column, exon_data));
names(downloadedData)[dataIndex] = paste(SpecificName, "exon_quantification", sep = "__");
}
if (length(grep(pattern = "junction_quantification", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__junction_quantification__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_junction), c("junction", column_junction), cbind(junction_left_column, junction_data)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_junction), c("junction", column_junction), cbind(junction_left_column, junction_data));
names(downloadedData)[dataIndex] = paste(SpecificName, "junction_quantification", sep = "__");
}
}
}
}
# download RNASeqV1 data
if (assayPlatform == "RNASeqV1")
{
for (IDin in 1:length(Institution))
{
for (IDpl in 1:length(platform))
{
SpecificName = paste(cancerType, "__", Institution[IDin], "__", platform[IDpl], sep = "");
SpecificID = grep(pattern = toupper(paste("/", cancerType, "/cgcc/", Institution[IDin], "/", platform[IDpl], "/", sep = "")), x = upper_file_url, ignore.case = FALSE);
RNALevel3ID = SpecificID[grep(pattern = toupper("Level_3"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
ind = SpecificID[grepEnd(pattern = toupper("sdrf\\.txt"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
if (length(ind) == 0)
{
next;
}
if (length(ind) > 1)
{
URL = GetNewestURL(AllURL = file_url[ind]);
}else{
URL = file_url[ind];
}
downloadResult = urlReadTable(url = URL);
if (downloadResult$errorFlag != 0)
{
next;
}
sdrf = toupper(downloadResult$data);
level_3_filename_column = max(grep(pattern = "Derived Data File", x = sdrf[1, ], ignore.case = TRUE));
DataLevelColID = max(grep(pattern = "Comment \\[TCGA Data Level]", x = sdrf[1, ], ignore.case = TRUE));
ExtractNameColID = grep(pattern = "Comment \\[TCGA Barcode]", x = sdrf[1, ], ignore.case = TRUE);
RefGenomeColID = grep(pattern = "Comment \\[Genome reference]", x = sdrf[1, ], ignore.case = TRUE);
if (length(ExtractNameColID) == 0)
{
ExtractNameColID = min(grep(pattern = "Extract Name", x = sdrf[1, ], ignore.case = TRUE));
}
colnames(sdrf) = sdrf[1, ];
sdrf = unique(sdrf[2:dim(sdrf)[1], , drop = FALSE]);
Level3_ID = sort(union(which(sdrf[, DataLevelColID] == "LEVEL_3"), which(sdrf[, DataLevelColID] == "LEVEL 3")), decreasing = FALSE);
if (length(Level3_ID) == 0)
{
next;
}
sdrf = sdrf[Level3_ID, c(ExtractNameColID, level_3_filename_column, RefGenomeColID), drop = FALSE];
sdrf = sdrf[!duplicated(sdrf[, 2]), , drop = FALSE];
# Only keep the file information for the data types that should be downloaded.
keepID = c();
for (keep_i in 1:length(dataType))
{
keepID = c(keepID, grep(pattern = dataType[keep_i], x = sdrf[, 2], ignore.case = TRUE))
}
sdrf = sdrf[sort(unique(keepID), decreasing = FALSE), , drop = FALSE];
# If specific patient TCGA barcodes are inputted, only download the specified samples.
if (!is.null(inputPatientIDs))
{
indInputPatientID = c();
for (i in 1:length(inputPatientIDs))
{
indInputPatientID = c(indInputPatientID, grepBeginning(pattern = toupper(inputPatientIDs[i]), x = sdrf[, 1], ignore.case = FALSE));
}
if (length(indInputPatientID) == 0)
{
next;
}else{
sdrf = sdrf[indInputPatientID, , drop = FALSE];
}
}
# Download data of specified tissue
if (!is.null(tissueType))
{
SampleType = cbind(Options = c("TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM"),
Code = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"));
sdrf = sdrf[substr(sdrf[, 1], 14, 15) %in% SampleType[SampleType[, "Options"] %in% tissueType, "Code"], , drop = FALSE];
}
if (dim(sdrf)[1] == 0)
{
next;
}
exp_names_gene = NULL;
column_gene = NULL;
gene_left_column = NULL;
gene_RPKM = NULL;
exp_names_exon = NULL;
column_exon = NULL;
exon_left_column = NULL;
exon_RPKM = NULL;
junction_count = NULL;
exp_names_junction = NULL;
column_junction = NULL;
junction_left_column = NULL;
for (i in 1:dim(sdrf)[1])
{
time1 = proc.time();
sample_TCGA_id = sdrf[i, 1];
ind = RNALevel3ID[grepEnd(pattern = sdrf[i, 2], x = upper_file_url[RNALevel3ID], ignore.case = FALSE)];
if (length(ind) == 0)
{
next;
}
if (length(ind) > 1)
{
URL = GetNewestURL(AllURL = file_url[ind]);
}else{
URL = file_url[ind];
}
downloadResult = urlReadTable(url = URL, dataType = "GE");
if (downloadResult$errorFlag != 0)
{
next;
}
s = downloadResult$data;
# read gene expression data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("gene.quantification"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(gene_left_column))
{
gene_left_column = s[, 1, drop = FALSE];
gene_RPKM = s[, 2:4, drop = FALSE];
exp_names_gene = c(sample_TCGA_id, sample_TCGA_id, sample_TCGA_id);
column_gene = c("raw_counts", "median_length_normalized", "RPKM");
}else{
if (sum(gene_left_column[, 1] != s[, 1]) > 0)
{
next;
}
gene_RPKM = cbind(gene_RPKM, s[, 2:4, drop = FALSE]);
exp_names_gene = c(exp_names_gene, sample_TCGA_id, sample_TCGA_id, sample_TCGA_id);
column_gene = c(column_gene, "raw_counts", "median_length_normalized", "RPKM");
}
}
}
# read exon expression data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("exon.quantification"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(exon_left_column))
{
exon_left_column = s[, 1, drop = FALSE];
exon_RPKM = s[, 2:4, drop = FALSE];
exp_names_exon = c(sample_TCGA_id, sample_TCGA_id, sample_TCGA_id);
column_exon = c("raw_counts", "median_length_normalized", "RPKM");
}else{
if (sum(exon_left_column[, 1] != s[, 1]) > 0)
{
next;
}
exon_RPKM = cbind(exon_RPKM, s[, 2:4, drop = FALSE]);
exp_names_exon = c(exp_names_exon, sample_TCGA_id, sample_TCGA_id, sample_TCGA_id);
column_exon = c(column_exon, "raw_counts", "median_length_normalized", "RPKM");
}
}
}
# read junction expression data
for (jj in 1:1)
{
if (length(grep(pattern = toupper("spljxn.quantification"), x = sdrf[i, 2], ignore.case = FALSE)) > 0)
{
s = s[2:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(junction_left_column))
{
junction_left_column = s[, 1, drop = FALSE];
junction_count = s[, 2, drop = FALSE];
exp_names_junction = sample_TCGA_id;
column_junction = "raw_counts";
}else{
if (sum(junction_left_column[, 1] != s[, 1]) > 0)
{
next;
}
junction_count = cbind(junction_count, s[, 2, drop = FALSE]);
exp_names_junction = c(exp_names_junction, sample_TCGA_id);
column_junction = c(column_junction, "raw_counts");
}
}
}
time = proc.time() - time1;
writeLines(paste("Downloaded - ", SpecificName, " - file ", i, " out of ", dim(sdrf)[1], ". ", round(time[3], digits = 1), " seconds elapsed.", sep = ""));
}
writeLines("Save data to local disk.");
ID = str_locate_all(traverseResultFile, "_")[[1]];
ID = ID[dim(ID)[1], 2];
if (length(grep(pattern = "gene.quantification", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__gene.quantification__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_names_gene), c("gene", column_gene), cbind(gene_left_column, gene_RPKM)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1;
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_names_gene), c("gene", column_gene), cbind(gene_left_column, gene_RPKM));
names(downloadedData)[dataIndex] = paste(SpecificName, "gene.quantification", sep = "__");
}
if (length(grep(pattern = "exon.quantification", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__exon.quantification__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_names_exon), c("exon", column_exon), cbind(exon_left_column, exon_RPKM)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1;
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_names_exon), c("exon", column_exon), cbind(exon_left_column, exon_RPKM));
names(downloadedData)[dataIndex] = paste(SpecificName, "exon.quantification", sep = "__");
}
if (length(grep(pattern = "spljxn.quantification", x = dataType, ignore.case = TRUE)) > 0)
{
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__spljxn.quantification__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_names_junction), c("junction", column_junction), cbind(junction_left_column, junction_count)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1;
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_names_junction), c("junction", column_junction), cbind(junction_left_column, junction_count));
names(downloadedData)[dataIndex] = paste(SpecificName, "spljxn.quantification", sep = "__");
}
}
}
}
# download Microarray data
if (assayPlatform == "Microarray")
{
for (IDin in 1:length(Institution))
{
for (IDpl in 1:length(platform))
{
SpecificName = paste(cancerType, "__", Institution[IDin], "__", platform[IDpl], sep = "");
SpecificID = grep(pattern = toupper(paste("/", cancerType, "/cgcc/", Institution[IDin], "/", platform[IDpl], "/", sep = "")), x = upper_file_url, ignore.case = FALSE);
RNALevel3ID = SpecificID[grep(pattern = toupper("Level_3"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
ind = SpecificID[grepEnd(pattern = toupper("sdrf\\.txt"), x = upper_file_url[SpecificID], ignore.case = FALSE)];
if (length(ind) == 0)
{
next;
}
if (length(ind) > 1)
{
URL = GetNewestURL(AllURL = file_url[ind]);
}else{
URL = file_url[ind];
}
downloadResult = urlReadTable(url = URL);
if (downloadResult$errorFlag != 0)
{
next;
}
sdrf = toupper(downloadResult$data);
# Derived Array Data Matrix File
level_3_filename_column = max(grep(pattern = "Derived Array Data Matrix File", x = sdrf[1, ], ignore.case = TRUE));
DataLevelColID = max(grep(pattern = "Comment \\[TCGA Data Level]", x = sdrf[1, ], ignore.case = TRUE));
ExtractNameColID = grep(pattern = "Comment \\[TCGA Barcode]", x = sdrf[1, ], ignore.case = TRUE);
if (length(ExtractNameColID) == 0)
{
ExtractNameColID = min(grep(pattern = "Extract Name", x = sdrf[1, ], ignore.case = TRUE));
}
colnames(sdrf) = sdrf[1, ];
sdrf = unique(sdrf[2:dim(sdrf)[1], , drop = FALSE]);
Level3_ID = sort(union(which(sdrf[, DataLevelColID] == "LEVEL_3"), which(sdrf[, DataLevelColID] == "LEVEL 3")), decreasing = FALSE);
if (length(Level3_ID) == 0)
{
next;
}
sdrf = sdrf[Level3_ID, c(ExtractNameColID, level_3_filename_column), drop = FALSE];
sdrf = sdrf[!duplicated(sdrf[, 2]), , drop = FALSE];
# If specific patient TCGA barcodes are inputted, only download the specified samples.
if (!is.null(inputPatientIDs))
{
indInputPatientID = c();
for (i in 1:length(inputPatientIDs))
{
indInputPatientID = c(indInputPatientID, grepBeginning(pattern = toupper(inputPatientIDs[i]), x = sdrf[, 1], ignore.case = FALSE));
}
if (length(indInputPatientID) == 0)
{
next;
}else{
sdrf = sdrf[indInputPatientID, , drop = FALSE];
}
}
# Download data of specified tissue
if (!is.null(tissueType))
{
SampleType = cbind(Options = c("TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM"),
Code = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"));
sdrf = sdrf[substr(sdrf[, 1], 14, 15) %in% SampleType[SampleType[, "Options"] %in% tissueType, "Code"], , drop = FALSE];
}
if (dim(sdrf)[1] == 0)
{
next;
}
exp_names_gene = NULL;
column_gene = NULL;
gene_left_column = NULL;
gene_RPKM = NULL;
for (i in 1:dim(sdrf)[1])
{
time1 = proc.time();
sample_TCGA_id = sdrf[i, 1];
ind = RNALevel3ID[grepEnd(pattern = sdrf[i, 2], x = upper_file_url[RNALevel3ID], ignore.case = FALSE)];
if (length(ind) == 0)
{
next;
}
if (length(ind) > 1)
{
URL = GetNewestURL(AllURL = file_url[ind]);
}else{
URL = file_url[ind];
}
downloadResult = urlReadTable(url = URL, dataType = "GE");
if (downloadResult$errorFlag != 0)
{
next;
}
s = downloadResult$data;
column_gene = c(column_gene, s[2, 2]);
s = s[3:dim(s)[1], , drop = FALSE];
I_order_probes = order(s[, 1], decreasing = FALSE);
s = s[I_order_probes, , drop = FALSE];
if (is.null(gene_left_column))
{
gene_left_column = s[, 1, drop = FALSE];
gene_RPKM = s[, 2, drop = FALSE];
exp_names_gene = c(sample_TCGA_id);
}else{
if (sum(gene_left_column[, 1] != s[, 1]) > 0)
{
next;
}
gene_RPKM = cbind(gene_RPKM, s[, 2, drop = FALSE]);
exp_names_gene = c(exp_names_gene, sample_TCGA_id);
}
time = proc.time() - time1;
writeLines(paste("Downloaded - ", SpecificName, " - file ", i, " out of ", dim(sdrf)[1], ". ", round(time[3], digits = 1), " seconds elapsed.", sep = ""));
}
writeLines("Save data to local disk.");
ID = str_locate_all(traverseResultFile, "_")[[1]];
ID = ID[dim(ID)[1], 2];
filename = paste(saveFolderName, "/", outputFileName, SpecificName, "__gene.quantification__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "");
write.table(rbind(c("Hybridization REF", exp_names_gene), c("Composite Element REF", column_gene), cbind(gene_left_column, gene_RPKM)),
file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE);
# For returning downloaded data
dataIndex = dataIndex + 1;
downloadedData[[dataIndex]] = rbind(c("Hybridization REF", exp_names_gene), c("Composite Element REF", column_gene), cbind(gene_left_column, gene_RPKM));
names(downloadedData)[dataIndex] = paste(SpecificName, "gene.quantification", sep = "__");
}
}
}
writeLines("");
writeLines("**********************************************************************************");
writeLines("\n");
options(warn=0);
# Return downloaded data
if (length(downloadedData) > 0)
{
for (i in 1:length(downloadedData))
{
rownames(downloadedData[[i]]) = NULL;
colnames(downloadedData[[i]]) = NULL;
}
}
downloadedData;
}
# DownloadRPPAData downloads the Level 3 RPPA protein expression files of one
# cancer type from the mdanderson.org directory, combines them into one table
# (antibodies in rows, samples in columns), writes the table to a tab-delimited
# text file in saveFolderName and returns it.
# Input arguments:
# traverseResultFile: path of the file given to load(). It must provide the
#   variables file_url and upper_file_url used below (presumably the saved
#   result of TraverseAllDirectories - confirm). The text between its last "_"
#   and its 4-character extension is appended to the output file name.
# saveFolderName: folder that receives the output file (created if needed).
# cancerType: cancer type abbreviation used in the URL patterns and file name.
# assayPlatform: platform directory name, "mda_rppa_core" by default.
# tissueType: NULL, or a vector of sample type options (e.g. "TP", "NT"); only
#   samples whose barcode characters 14-15 hold the corresponding code are kept.
# inputPatientIDs: NULL, or TCGA barcodes (dots are turned into dashes and the
#   text is upper-cased); samples are selected through grepBeginning
#   (presumably a match at the start of the barcode - confirm).
# outputFileName: optional prefix of the output file name; "" or NULL for none.
# Output argument:
# A matrix whose first row holds "Composite.Element.REF" followed by the sample
#   names, without dimnames. NULL is returned when the function stops early
#   (missing annotation/SDRF file, no Level 3 data, nothing downloaded).
DownloadRPPAData <- function(traverseResultFile, saveFolderName, cancerType, assayPlatform = "mda_rppa_core", tissueType = NULL, inputPatientIDs = NULL, outputFileName = "")
{
  # Silence warnings only while this function runs. on.exit() restores the
  # caller's setting on every exit path, including the early returns below.
  previousOptions <- options(warn = -1)
  on.exit(options(previousOptions), add = TRUE)

  writeLines("**********************************************************************************")
  writeLines("")
  writeLines(paste("Download RPPA protein expression data of ", cancerType, " patients.", sep = ""))

  # Check whether specific TCGA patient IDs are inputted.
  if (!is.null(inputPatientIDs))
  {
    inputPatientIDs <- toupper(gsub(pattern = "\\.", replacement = "-", x = inputPatientIDs))
  }
  # The NULL test must come first and short-circuit: NULL != "" has length zero.
  if (!is.null(outputFileName) && (outputFileName != ""))
  {
    outputFileName <- paste(outputFileName, "__", sep = "")
  }

  # load directory traverse result and create folder to store output files
  writeLines("Load information of TCGA data files.")
  load(traverseResultFile)
  dir.create(path = saveFolderName, recursive = TRUE)

  # When several files match, GetNewestURL chooses among them; otherwise the
  # single match is used directly.
  pickURL <- function(urlID)
  {
    if (length(urlID) > 1)
    {
      GetNewestURL(AllURL = file_url[urlID])
    } else {
      file_url[urlID]
    }
  }

  DirURL_ID <- grep(pattern = toupper(paste("/", cancerType, "/cgcc/mdanderson\\.org/", assayPlatform, "/", sep = "")), x = upper_file_url, ignore.case = FALSE)
  IDLevel_3InFile_Url <- DirURL_ID[grep(pattern = toupper("Level_3"), x = upper_file_url[DirURL_ID], ignore.case = FALSE)]

  # download protein antibody annotation file
  FileEndURL <- paste(cancerType, "\\.MDA_RPPA_Core\\.antibody_annotation\\.txt", sep = "")
  ind <- DirURL_ID[grepEnd(pattern = toupper(FileEndURL), x = upper_file_url[DirURL_ID], ignore.case = FALSE)]
  if (length(ind) == 0)
  {
    writeLines("Error: no antibody annotation file.")
    return()
  }
  downloadResult <- urlReadTable(url = pickURL(ind))
  if (downloadResult$errorFlag != 0)
  {
    writeLines("Error in downloading antibody annotation file.")
    return()
  }
  Annotation <- downloadResult$data
  # Strip double quotes from every cell.
  for (j in seq_len(dim(Annotation)[2]))
  {
    Annotation[, j] <- gsub(pattern = "\"", replacement = "", x = Annotation[, j], ignore.case = TRUE)
  }
  colnames(Annotation) <- Annotation[1, ]
  Annotation <- Annotation[2:dim(Annotation)[1], , drop = FALSE]
  GeneNameColID <- grepBeginning(pattern = toupper("Gene Name"), x = colnames(Annotation), ignore.case = TRUE)
  REFColID <- grepBeginning(pattern = toupper("Composite Element REF"), x = colnames(Annotation), ignore.case = TRUE)
  # From here on column 1 is the gene name and column 2 the antibody name.
  Annotation <- unique(Annotation[, c(GeneNameColID, REFColID), drop = FALSE])
  # Repair gene symbols that are concatenated or padded in the annotation file.
  replaceTable <- cbind(wrongSymbol = c("ACACAACACB", "AKT1AKT2 AKT3", "GSK3AGSK3B", "MAPK1MAPK3", "RAB11ARAB11B", "BIRC2 "),
                        correctSymbol = c("ACACA ACACB", "AKT1 AKT2 AKT3", "GSK3A GSK3B", "MAPK1 MAPK3", "RAB11A RAB11B", "BIRC2"))
  for (iReplace in seq_len(dim(replaceTable)[1]))
  {
    jReplace <- which(Annotation[, 1] == replaceTable[iReplace, 1])
    Annotation[jReplace, 1] <- replaceTable[iReplace, 2]
  }
  idTmp <- grepBeginning(pattern = "CDK1-", x = Annotation[, 2], ignore.case = TRUE)
  Annotation[idTmp, 1] <- "CDK1"

  # download Sample and Data Relationship Format (SDRF) file
  writeLines("Download and process Sample and Data Relationship Format (SDRF) file.")
  FileEndURL <- paste(cancerType, "\\.MDA_RPPA_Core\\.sdrf\\.txt", sep = "")
  ind <- DirURL_ID[grepEnd(pattern = toupper(FileEndURL), x = upper_file_url[DirURL_ID], ignore.case = FALSE)]
  if (length(ind) == 0)
  {
    writeLines("Error: no SDRF file.")
    return()
  }
  downloadResult <- urlReadTable(url = pickURL(ind))
  if (downloadResult$errorFlag != 0)
  {
    writeLines("Error in downloading SDRF file.")
    return()
  }
  sdrf <- toupper(downloadResult$data)
  level_3_filename_column <- max(grep(pattern = "Derived Array Data Matrix File", x = sdrf[1, ], ignore.case = TRUE))
  LevelInfoColID <- max(grep(pattern = "Comment \\[TCGA Data Level]", x = sdrf[1, ], ignore.case = TRUE))
  SampleNameColID <- min(grep(pattern = "Sample Name", x = sdrf[1, ], ignore.case = TRUE))
  ExtractNameColID <- min(grep(pattern = "Extract Name", x = sdrf[1, ], ignore.case = TRUE))
  colnames(sdrf) <- sdrf[1, ]
  sdrf <- unique(sdrf[2:dim(sdrf)[1], , drop = FALSE])
  sdrf <- sdrf[!duplicated(sdrf[, level_3_filename_column]), , drop = FALSE]
  SDRFID <- sort(union(which(sdrf[, LevelInfoColID] == "LEVEL_3"), which(sdrf[, LevelInfoColID] == "LEVEL 3")), decreasing = FALSE)
  if (length(SDRFID) == 0)
  {
    writeLines("Error: there are no Level 3 data")
    return()
  }
  # Keep three columns: 1 = extract name (barcode), 2 = sample name, 3 = Level 3 file name.
  sdrf <- sdrf[SDRFID, c(ExtractNameColID, SampleNameColID, level_3_filename_column), drop = FALSE]
  sdrf <- sdrf[grep(pattern = "TCGA", x = sdrf[, 1], ignore.case = TRUE), , drop = FALSE]

  # If specific patient TCGA barcodes are inputted, only download data of the specified samples.
  if (!is.null(inputPatientIDs))
  {
    # unique() prevents a sample matched by several inputted barcodes from
    # being downloaded and reported more than once.
    indInputPatientID <- unique(unlist(lapply(inputPatientIDs, function(patientID)
    {
      grepBeginning(pattern = toupper(patientID), x = sdrf[, 1], ignore.case = FALSE)
    })))
    if (length(indInputPatientID) == 0)
    {
      writeLines("No Level 3 data for the inputted TCGA barcodes.")
      return()
    }
    sdrf <- sdrf[indInputPatientID, , drop = FALSE]
  }

  # Download data of specified tissue
  if (!is.null(tissueType))
  {
    SampleType <- cbind(Options = c("TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM"),
                        Code = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"))
    sdrf <- sdrf[substr(sdrf[, 1], 14, 15) %in% SampleType[SampleType[, "Options"] %in% tissueType, "Code"], , drop = FALSE]
  }
  if (dim(sdrf)[1] == 0)
  {
    writeLines("No available data.")
    return()
  }

  # Download every sample file. Results are collected in pre-allocated
  # containers and bound together once after the loop.
  nSample <- dim(sdrf)[1]
  left_columns <- NULL
  sampleIDs <- character(nSample)
  sampleColumns <- vector("list", nSample)
  kept <- logical(nSample)
  for (i in seq_len(nSample))
  {
    time1 <- proc.time()
    sample_TCGA_id <- strsplit(sdrf[i, 1], split = "\\.")[[1]][1]
    ind <- IDLevel_3InFile_Url[grepEnd(pattern = toupper(sdrf[i, 3]), x = upper_file_url[IDLevel_3InFile_Url], ignore.case = FALSE)]
    if (length(ind) == 0)
    {
      next
    }
    downloadResult <- urlReadTable(url = pickURL(ind))
    if (downloadResult$errorFlag != 0)
    {
      next
    }
    # The first two rows of a data file are header rows.
    s <- downloadResult$data[3:dim(downloadResult$data)[1], , drop = FALSE]
    s <- s[order(s[, 1], decreasing = FALSE), , drop = FALSE]
    if (is.null(left_columns))
    {
      # The first successfully downloaded sample defines the antibody list.
      left_columns <- s[, 1]
    } else if ((length(left_columns) != dim(s)[1]) || (sum(left_columns != s[, 1]) > 0))
    {
      # Skip samples whose antibody list differs; the length test avoids a
      # recycled (and therefore meaningless) element-wise comparison.
      next
    }
    sampleIDs[i] <- sample_TCGA_id
    sampleColumns[[i]] <- s[, 2, drop = FALSE]
    kept[i] <- TRUE
    time <- proc.time() - time1
    writeLines(paste("Downloaded - ", cancerType, "__mdanderson.org__", assayPlatform, " - sample ", i, " out of ", nSample, ". ", round(time[3], digits = 1), " seconds elapsed.", sep = ""))
  }
  if (!any(kept))
  {
    # Nothing could be downloaded, so there is no table to save or return.
    writeLines("No Level 3 data file could be downloaded.")
    return()
  }
  data <- do.call(cbind, sampleColumns[kept])
  colnames(data) <- sampleIDs[kept]

  # Label each antibody as "GeneSymbol|Antibody" using the first annotation
  # row whose antibody name matches (case-insensitive).
  annotationID <- match(toupper(left_columns), toupper(Annotation[, 2]))
  notAnnotated <- is.na(annotationID)
  if (any(notAnnotated))
  {
    writeLines("Downloaded data have antibodies not included in the antibody annotation file.")
  }
  rowLabels <- character(length(left_columns))
  rowLabels[!notAnnotated] <- paste(Annotation[annotationID[!notAnnotated], 1], "|", Annotation[annotationID[!notAnnotated], 2], sep = "")
  rowLabels[notAnnotated] <- paste("GeneSymbolNotFound|", left_columns[notAnnotated], sep = "")
  left_columns <- rowLabels

  writeLines("Save data to local disk.")
  # Position of the last "_" in traverseResultFile; the text after it (minus
  # the 4-character extension) becomes the suffix of the output file name.
  ID <- str_locate_all(traverseResultFile, "_")[[1]]
  ID <- ID[dim(ID)[1], 2]
  filename <- paste(saveFolderName, "/", outputFileName, cancerType, "__mdanderson.org__", assayPlatform, "__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "")
  outputTable <- cbind(Composite.Element.REF = left_columns, data)
  write.table(outputTable, file = filename, quote = FALSE, sep = "\t", na = "", col.names = TRUE, row.names = FALSE)
  writeLines("")
  writeLines("**********************************************************************************")
  writeLines("\n")

  # Return downloaded data: the column names become the first row.
  downloadedData <- rbind(colnames(outputTable), outputTable)
  rownames(downloadedData) <- NULL
  colnames(downloadedData) <- NULL
  downloadedData
}
# DownloadmiRNASeqData downloads Level 3 miRNA-seq quantification files of one
# cancer type from the bcgsc.ca directories. For every platform and reference
# genome (NCBI36, GRCh37) with data, the samples are merged into one table that
# is written to a tab-delimited text file in saveFolderName.
# Input arguments:
# traverseResultFile: path of the file given to load(). It must provide the
#   variables file_url and upper_file_url used below (presumably the saved
#   result of TraverseAllDirectories - confirm). The text between its last "_"
#   and its 4-character extension is appended to the output file names.
# saveFolderName: folder that receives the output files (created if needed).
# cancerType: cancer type abbreviation used in the URL patterns and file names.
# assayPlatform: "miRNASeq" selects the illuminaga_mirnaseq and
#   illuminahiseq_mirnaseq platforms.
# tissueType: NULL, or a vector of sample type options (e.g. "TP", "NT"); only
#   samples whose barcode characters 14-15 hold the corresponding code are kept.
# inputPatientIDs: NULL, or TCGA barcodes (dots are turned into dashes and the
#   text is upper-cased); samples are selected through grepBeginning
#   (presumably a match at the start of the barcode - confirm).
# outputFileName: optional prefix of the output file names; "" or NULL for none.
# Output argument:
# A list with one matrix per saved file, named "<cancerType>__bcgsc.ca__
#   <platform>__<genome>". Row 1 holds the sample barcodes, row 2 the measure
#   names, the remaining rows the miRNA IDs and values. The list is empty when
#   nothing was downloaded.
DownloadmiRNASeqData <- function(traverseResultFile, saveFolderName, cancerType, assayPlatform = "miRNASeq", tissueType = NULL, inputPatientIDs = NULL, outputFileName = "")
{
  # Silence warnings only while this function runs. on.exit() restores the
  # caller's setting even when an error interrupts the download.
  previousOptions <- options(warn = -1)
  on.exit(options(previousOptions), add = TRUE)

  writeLines("**********************************************************************************")
  writeLines("")
  writeLines(paste("Download miRNA-seq data of ", cancerType, " patients.", sep = ""))

  # Check whether specific TCGA patient IDs are inputted.
  if (!is.null(inputPatientIDs))
  {
    inputPatientIDs <- toupper(gsub(pattern = "\\.", replacement = "-", x = inputPatientIDs))
  }
  # The NULL test must come first and short-circuit: NULL != "" has length zero.
  if (!is.null(outputFileName) && (outputFileName != ""))
  {
    outputFileName <- paste(outputFileName, "__", sep = "")
  }

  # For returning downloaded data
  downloadedData <- vector("list", 0)
  dataIndex <- 0

  # load directory traverse result and create folder to store output files
  writeLines("Load information of TCGA data files.")
  load(traverseResultFile)
  # NOTE(review): platform is only assigned for assayPlatform == "miRNASeq";
  # for any other value it must already exist (e.g. come from the loaded file),
  # otherwise the loop below fails - confirm the intended behaviour.
  if (assayPlatform == "miRNASeq")
  {
    platform <- c("illuminaga_mirnaseq", "illuminahiseq_mirnaseq")
  }
  dir.create(path = saveFolderName, recursive = TRUE)

  # When several files match, GetNewestURL chooses among them; otherwise the
  # single match is used directly.
  pickURL <- function(urlID)
  {
    if (length(urlID) > 1)
    {
      GetNewestURL(AllURL = file_url[urlID])
    } else {
      file_url[urlID]
    }
  }

  # Reference genome labels as they appear in file names and progress messages;
  # they are upper-cased before being matched against the (upper-cased) SDRF.
  refGenomes <- c("NCBI36", "GRCh37")

  miRNALevel3ID <- grep(pattern = toupper("miRNASeq\\.Level_3"), x = upper_file_url, ignore.case = FALSE)
  for (IDpl in seq_along(platform))
  {
    SpecificName <- paste(cancerType, "__", "bcgsc.ca", "__", platform[IDpl], sep = "")
    SpecificID <- grep(pattern = toupper(paste("/", cancerType, "/cgcc/bcgsc\\.ca/", platform[IDpl], sep = "")), x = upper_file_url, ignore.case = FALSE)
    ind <- SpecificID[grepEnd(pattern = toupper("sdrf\\.txt"), x = upper_file_url[SpecificID], ignore.case = FALSE)]
    if (length(ind) == 0)
    {
      next
    }
    downloadResult <- urlReadTable(url = pickURL(ind))
    if (downloadResult$errorFlag != 0)
    {
      next
    }
    sdrf <- toupper(downloadResult$data)

    # Process SDRF file, identify the columns of level 3 data file name and TCGA sample barcodes.
    level_3_filename_column <- max(grep(pattern = "Derived Data File", x = sdrf[1, ], ignore.case = TRUE))
    DataLevelColID <- max(grep(pattern = "Comment \\[TCGA Data Level]", x = sdrf[1, ], ignore.case = TRUE))
    RefGenomeColID <- grep(pattern = "Comment \\[Genome reference]", x = sdrf[1, ], ignore.case = TRUE)
    # Test the grep result BEFORE reducing it with min(): min() always has
    # length one, so checking afterwards could never trigger the fallback.
    ExtractNameColID <- grep(pattern = "Comment \\[TCGA Barcode]", x = sdrf[1, ], ignore.case = TRUE)
    if (length(ExtractNameColID) == 0)
    {
      ExtractNameColID <- grep(pattern = "Extract Name", x = sdrf[1, ], ignore.case = TRUE)
    }
    ExtractNameColID <- min(ExtractNameColID)
    colnames(sdrf) <- sdrf[1, ]
    sdrf <- unique(sdrf[2:dim(sdrf)[1], , drop = FALSE])
    sdrf <- sdrf[!duplicated(sdrf[, level_3_filename_column]), , drop = FALSE]
    Level3_ID <- sort(union(which(sdrf[, DataLevelColID] == "LEVEL_3"), which(sdrf[, DataLevelColID] == "LEVEL 3")), decreasing = FALSE)
    Level3_ID <- sort(intersect(Level3_ID, grep(pattern = toupper("mirna\\.quantification"),
                                                x = sdrf[, level_3_filename_column], ignore.case = FALSE)), decreasing = FALSE)
    if (length(Level3_ID) == 0)
    {
      next
    }
    # Keep three columns: 1 = barcode, 2 = Level 3 file name, 3 = genome reference.
    sdrf <- sdrf[Level3_ID, c(ExtractNameColID, level_3_filename_column, RefGenomeColID), drop = FALSE]

    # If specific patient TCGA barcodes are inputted, only download the specified samples.
    if (!is.null(inputPatientIDs))
    {
      # unique() prevents a sample matched by several inputted barcodes from
      # being downloaded and reported more than once.
      indInputPatientID <- unique(unlist(lapply(inputPatientIDs, function(patientID)
      {
        grepBeginning(pattern = toupper(patientID), x = sdrf[, 1], ignore.case = FALSE)
      })))
      if (length(indInputPatientID) == 0)
      {
        next
      }
      sdrf <- sdrf[indInputPatientID, , drop = FALSE]
    }

    # Download data of specified tissue
    if (!is.null(tissueType))
    {
      SampleType <- cbind(Options = c("TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM"),
                          Code = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"))
      sdrf <- sdrf[substr(sdrf[, 1], 14, 15) %in% SampleType[SampleType[, "Options"] %in% tissueType, "Code"], , drop = FALSE]
    }
    if (dim(sdrf)[1] == 0)
    {
      next
    }

    # Samples aligned to different reference genomes go to separate tables.
    sdrfO <- sdrf
    for (refGenome in refGenomes)
    {
      sdrf <- sdrfO[grep(pattern = toupper(refGenome), x = sdrfO[, 3], ignore.case = FALSE), , drop = FALSE]
      nSample <- dim(sdrf)[1]
      if (nSample == 0)
      {
        next
      }

      # Results are collected in pre-allocated containers and bound together
      # once after the loop.
      gene_left_column <- NULL
      sampleIDs <- character(nSample)
      sampleColumns <- vector("list", nSample)
      kept <- logical(nSample)
      for (i in seq_len(nSample))
      {
        time1 <- proc.time()
        sample_TCGA_id <- sdrf[i, 1]
        ind <- intersect(miRNALevel3ID, SpecificID[grepEnd(pattern = sdrf[i, 2], x = upper_file_url[SpecificID], ignore.case = FALSE)])
        if (length(ind) == 0)
        {
          next
        }
        downloadResult <- urlReadTable(url = pickURL(ind))
        if (downloadResult$errorFlag != 0)
        {
          next
        }
        # The first row of a data file is the header row.
        s <- downloadResult$data[2:dim(downloadResult$data)[1], , drop = FALSE]
        s <- s[order(s[, 1], decreasing = FALSE), , drop = FALSE]
        if (is.null(gene_left_column))
        {
          # The first successfully downloaded sample defines the miRNA ID list.
          gene_left_column <- s[, 1, drop = FALSE]
        } else if ((dim(gene_left_column)[1] != dim(s)[1]) || (sum(gene_left_column[, 1] != s[, 1]) > 0))
        {
          # Skip samples whose miRNA ID list differs; the length test avoids a
          # recycled (and therefore meaningless) element-wise comparison.
          next
        }
        sampleIDs[i] <- sample_TCGA_id
        # Columns 2 and 3 hold read_count and reads_per_million_miRNA_mapped.
        sampleColumns[[i]] <- s[, 2:3, drop = FALSE]
        kept[i] <- TRUE
        time <- proc.time() - time1
        writeLines(paste("Downloaded - ", SpecificName, " - ", refGenome, " - sample ", i, " out of ", nSample, ". ", round(time[3], digits = 1), " seconds elapsed.", sep = ""))
      }

      if (any(kept))
      {
        writeLines("Save data to local disk.")
        # Every sample contributes two columns, so its barcode appears twice.
        exp_names <- rep(sampleIDs[kept], each = 2)
        column_gene <- rep(c("read_count", "reads_per_million_miRNA_mapped"), times = sum(kept))
        gene_RPM <- do.call(cbind, sampleColumns[kept])
        outputTable <- rbind(c("Hybridization REF", exp_names), c("miRNA_ID", column_gene), cbind(gene_left_column, gene_RPM))
        rownames(outputTable) <- NULL
        colnames(outputTable) <- NULL

        # Position of the last "_" in traverseResultFile; the text after it
        # (minus the 4-character extension) becomes the file name suffix.
        ID <- str_locate_all(traverseResultFile, "_")[[1]]
        ID <- ID[dim(ID)[1], 2]
        filename <- paste(saveFolderName, "/", outputFileName, SpecificName, "__", refGenome, "__", substr(traverseResultFile, ID+1, nchar(traverseResultFile)-4), ".txt", sep = "")
        write.table(outputTable, file = filename, quote = FALSE, sep = "\t", na = "", col.names = FALSE, row.names = FALSE)

        # For returning downloaded data
        dataIndex <- dataIndex + 1
        downloadedData[[dataIndex]] <- outputTable
        names(downloadedData)[dataIndex] <- paste(SpecificName, "__", refGenome, sep = "")
      }
    }
  }
  writeLines("")
  writeLines("**********************************************************************************")
  writeLines("\n")
  downloadedData
}
######################### Auxiliary Functions of Module A #####################################################
# content2 is a function to replace the content function in the httr package,
# which can not be correctly executed on Windows computers.
# Input arguments:
# urlGet: object returned by the GET function in the httr package. Only its
#         element `content` (a raw vector holding the response body) is read.
# as: in what type you want the content to be. Currently only "text" is allowed.
# Output arguments:
# urlCont: content from the URL as a single character string. If `as` is not
#          "text", a message is printed and urlGet is returned unchanged.
content2 <- function(urlGet, as = "text")
{
  if (as != "text")
  {
    writeLines("content2 function does not allow types other than text")
    return(urlGet)
  }
  bytes <- urlGet$content
  # An empty (or missing) body decodes to the empty string.
  if (length(bytes) == 0)
  {
    return("")
  }
  # rawToChar refuses embedded nul bytes, so drop them before decoding.
  # Decoding the whole vector at once (instead of byte by byte into a
  # preallocated string of spaces) keeps multi-byte characters intact and
  # never leaves padding spaces behind.
  bytes <- bytes[bytes != as.raw(0)]
  rawToChar(bytes)
}
# urlReadTable is the function to read a tab-delimited data table from a URL
# and pass it to a variable.
# Input arguments:
# url: URL of the website from which the data table will be obtained.
# dataType: label of the data type being read. Only when it is "GE", the URL
#           has at least 261 characters and the operating system is Windows,
#           the raw response bytes are decoded directly instead of calling the
#           content function of the httr package.
# Output argument: a list with two elements
# data: on success, the matrix produced by as.matrix(read.table(...)) from the
#       downloaded text; on failure, the try-error object (errorFlag 1) or the
#       downloaded text (errorFlag 2).
# errorFlag: 0 = table was read; 1 = the web request raised an error;
#            2 = the downloaded text contains both "404" and "Not Found"
#            (case-insensitive).
urlReadTable <- function(url, dataType = "otherDataType")
{
  # && short-circuits, so the OS lookup is only done for long "GE" URLs;
  # isTRUE guards against platforms where Sys.info() gives no sysname.
  useRawBytes <- (nchar(url) >= 261) && (dataType == "GE") &&
    isTRUE(toupper(Sys.info()[["sysname"]]) == "WINDOWS")
  if (useRawBytes)
  {
    data <- try(GET(url), silent = TRUE)
    if (inherits(data, "try-error"))
    {
      return(list(data = data, errorFlag = 1))
    }
    bytes <- data$content
    # Decode the body in one call; nul bytes are dropped because rawToChar
    # does not accept them.
    data <- if (length(bytes) == 0) "" else rawToChar(bytes[bytes != as.raw(0)])
  } else {
    data <- try(content(GET(url), as = "text"), silent = TRUE)
    if (inherits(data, "try-error"))
    {
      return(list(data = data, errorFlag = 1))
    }
  }
  # The text is treated as an error page when it mentions both "404" and "Not Found".
  if (length(intersect(grep(pattern = "404", x = data, ignore.case = TRUE), grep(pattern = "Not Found", x = data, ignore.case = TRUE))) > 0)
  {
    return(list(data = data, errorFlag = 2))
  }
  # Strip carriage returns so Windows line endings do not end up in the last column.
  data <- gsub(pattern = "\r", replacement = "", x = data)
  data <- as.matrix(read.table(text = data, sep = "\t", fill = TRUE, quote = NULL, check.names = FALSE))
  return(list(data = data, errorFlag = 0))
}
# downloadFile fetches the content of a URL as text and stores it in a local file.
# Input arguments:
# url: URL of the website whose content is to be obtained.
# saveFileName: path and name of the file to store the web content.
# Output argument:
# 0 when the content was written to saveFileName; 1 when the web request raised
# an error; 2 when the content matches the "HTTP 404: Page Not Found" page.
# Nothing is written in cases 1 and 2.
downloadFile <- function(url, saveFileName)
{
  notFoundPage <- "HTTP 404: Page Not Found\n\nThe page you requested was not found."
  webText <- try(content(GET(url), as = "text"), silent = TRUE)
  if (inherits(webText, "try-error"))
  {
    status <- 1
  } else if (any(grepl(pattern = notFoundPage, x = webText, ignore.case = TRUE))) {
    status <- 2
  } else {
    write(x = webText, file = saveFileName)
    status <- 0
  }
  return(status)
}
# grepEnd is a function similar to grep but identifies the strings with pattern at the end of the strings.
# Input arguments:
# pattern: regular expression, interpreted as by grep.
# x: character vector to be searched.
# ignore.case: if TRUE, matching is case-insensitive.
# Output argument:
# integer vector with the indices of the elements of x that end with a match
# of pattern (integer(0) if there is none).
grepEnd <- function(pattern, x, ignore.case = FALSE)
{
  # Anchor the pattern at the end of the string. The group keeps alternations
  # such as "a|b" intact. Letting grep handle ignore.case avoids upper-casing
  # the pattern itself, which would corrupt escapes like \\d, and the anchor
  # also finds suffixes that overlap an earlier match ("aa" in "aaa").
  # Note: the added group shifts the numbering of back-references in pattern.
  anchored <- paste("(", pattern, ")$", sep = "")
  grep(pattern = anchored, x = x, ignore.case = ignore.case)
}
# grepBeginning is a function similar to grep but identifies the strings with pattern at the beginning of the strings.
# Input arguments:
# pattern: regular expression, interpreted as by grep.
# x: character vector to be searched.
# ignore.case: if TRUE, matching is case-insensitive.
# Output argument:
# integer vector with the indices of the elements of x that begin with a match
# of pattern (integer(0) if there is none).
grepBeginning <- function(pattern, x, ignore.case = FALSE)
{
  # Anchor the pattern at the start of the string. The group keeps alternations
  # such as "a|b" intact. Letting grep handle ignore.case avoids upper-casing
  # the pattern itself, which would corrupt escapes like \\d.
  # Note: the added group shifts the numbering of back-references in pattern.
  anchored <- paste("^(", pattern, ")", sep = "")
  grep(pattern = anchored, x = x, ignore.case = ignore.case)
}
# This function analyzes a TCGA webpage (a directory listing) and identifies
# all files and directories on it.
# The URLs of files and directories are returned using two character vectors.
# Input arguments:
# TCGA_link: a string of URL for the TCGA webpage to be analyzed. A trailing
#            "/" is appended (with a message) if it is missing.
# Output arguments: a list with two elements
# file_url: a character vector, including the URLs of all files on the webpage.
# dir_url: a character vector, including the URLs of all directories on the
#          webpage (links whose target ends with "/").
# Both elements are NULL when the page cannot be fetched, is the 404 page,
# has no "Parent Directory" entry, or lists no usable links.
# An error is raised if the numbers of opening and closing link tags differ.
getTCGA_URL <- function(TCGA_link)
{
  emptyResult <- list(file_url = c(), dir_url = c())
  if (substr(TCGA_link, nchar(TCGA_link), nchar(TCGA_link)) != "/")
  {
    writeLines(paste("Add / to the end of TCGA_link for ", TCGA_link, sep = ""))
    TCGA_link <- paste(TCGA_link, "/", sep = "")
  }
  s <- try(content(GET(TCGA_link), as = "text"), silent = TRUE)
  if (inherits(s, "try-error"))
  {
    return(emptyResult)
  }
  if (length(grep(pattern = "HTTP 404: Page Not Found\n\nThe page you requested was not found.", x = s, ignore.case = TRUE)) > 0)
  {
    return(emptyResult)
  }
  # Anything that is not a single text string cannot be a directory listing.
  if (!is.character(s) || length(s) != 1 || is.na(s))
  {
    return(emptyResult)
  }
  # Only the part of the page after the "Parent Directory" entry is parsed.
  # Without that entry there is nothing to parse (and NA positions would
  # otherwise leak into the returned URLs).
  markerEnd <- str_locate(string = s, pattern = "Parent Directory")[1, 2]
  if (is.na(markerEnd))
  {
    return(emptyResult)
  }
  # NOTE(review): the offset of 5 presumably skips the remainder of the parent
  # directory link markup - confirm against an actual listing page.
  s <- substr(s, markerEnd + 5, nchar(s))
  upperS <- toupper(s)
  start_points <- str_locate_all(upperS, "<A HREF")[[1]][, 1]
  end_points <- str_locate_all(upperS, "</A>")[[1]][, 1]
  if (length(start_points) != length(end_points))
  {
    stop(paste("Error in parsing URL", TCGA_link, sep = ", "))
  }
  if (length(end_points) == 0)
  {
    return(emptyResult)
  }
  # The link target is the text between the first two double quotes of each anchor.
  anchors <- substring(s, start_points + 1, end_points)
  hrefs <- vapply(anchors, function(anchor)
  {
    quotePos <- str_locate_all(string = anchor, pattern = "\"")[[1]][, 1]
    substr(anchor, quotePos[1] + 1, quotePos[2] - 1)
  }, character(1), USE.NAMES = FALSE)
  # Anchors without a quoted target yield NA; drop them instead of returning NA URLs.
  hrefs <- hrefs[!is.na(hrefs)]
  if (length(hrefs) == 0)
  {
    return(emptyResult)
  }
  is_dir <- substr(hrefs, nchar(hrefs), nchar(hrefs)) == "/"
  # Targets that do not contain "HTTP" are taken as relative to TCGA_link.
  # NOTE(review): this matches "HTTP" anywhere in the target, not only as a
  # scheme prefix.
  is_relative <- !grepl("HTTP", toupper(hrefs), fixed = TRUE)
  hrefs[is_relative] <- paste(TCGA_link, hrefs[is_relative], sep = "")
  return(list(file_url = hrefs[!is_dir], dir_url = hrefs[is_dir]))
}
# This function selects the newest file URL from all the input URLs by considering the
# last folder name tail numbers. The number format is *.*.*
# Input arguments:
# AllURL: character vector of URLs. The last folder name of each URL (the text
#         between its last two "/") must contain at least three fields
#         separated by ".".
# Output argument:
# the element of AllURL whose last three fields form the largest number after
# zero-padding each field to a common width and concatenating them (the first
# one in case of ties). An empty AllURL is returned unchanged.
GetNewestURL <- function(AllURL)
{
  if (length(AllURL) == 0)
  {
    return(AllURL)
  }
  SeriesNum <- matrix("", nrow = length(AllURL), ncol = 3)
  for (i in seq_along(AllURL))
  {
    slashPos <- gregexpr("/", AllURL[i], fixed = TRUE)[[1]]
    # gregexpr gives a single -1 when there is no "/", so fewer than two
    # positions means no folder name is enclosed by slashes.
    if (length(slashPos) < 2)
    {
      stop(paste("Cannot find the last folder name in URL", AllURL[i], sep = ", "))
    }
    slashPos <- slashPos[(length(slashPos) - 1):length(slashPos)]
    folder <- substr(AllURL[i], slashPos[1] + 1, slashPos[2] - 1)
    fields <- strsplit(folder, split = ".", fixed = TRUE)[[1]]
    if (length(fields) < 3)
    {
      stop(paste("Cannot find three version fields in URL", AllURL[i], sep = ", "))
    }
    SeriesNum[i, ] <- fields[(length(fields) - 2):length(fields)]
  }
  # Left-pad every field with zeros to the widest entry of its column, so that
  # e.g. 1.10.0 ranks above 1.2.0 once the fields are concatenated.
  for (j in 1:3)
  {
    shortBy <- max(nchar(SeriesNum[, j])) - nchar(SeriesNum[, j])
    zeros <- vapply(shortBy, function(k) paste(rep("0", k), collapse = ""), character(1))
    SeriesNum[, j] <- paste(zeros, SeriesNum[, j], sep = "")
  }
  SN <- paste(SeriesNum[, 1], SeriesNum[, 2], SeriesNum[, 3], sep = "")
  AllURL[which.max(as.numeric(SN))]
}
######################### Check whether this is the most updated version of TCGA-Assembler #####################################################
# Best-effort check, run when this file is sourced, for a newer release of
# TCGA-Assembler. It reads the software page, follows the link that comes after
# the version-check marker, extracts the version number published there, and
# prints a notice if it differs from the version of this file ("1.0.3").
# Everything runs inside local(), so no variables are left in the workspace,
# and any download or parsing problem ends the check silently instead of
# aborting the loading of this module.
local({
  # Download a page as text; NULL if the request fails or yields no text.
  fetchText <- function(url)
  {
    page <- try(content(GET(url), as = "text"), silent = TRUE)
    if (inherits(page, "try-error") || !is.character(page) || length(page) != 1 || is.na(page))
    {
      return(NULL)
    }
    page
  }
  # Text that follows the first match of marker; NULL if marker is absent.
  textAfter <- function(text, marker)
  {
    loc <- str_locate(string = text, pattern = marker)
    if (is.na(loc[1, "end"]))
    {
      return(NULL)
    }
    substr(text, loc[1, "end"] + 1, nchar(text))
  }
  # Text between the end of the first match of opening and the start of the
  # first match of closing; NULL if either one is absent.
  textBetween <- function(text, opening, closing)
  {
    openLoc <- str_locate(string = text, pattern = opening)
    closeLoc <- str_locate(string = text, pattern = closing)
    if (is.na(openLoc[1, "end"]) || is.na(closeLoc[1, "start"]))
    {
      return(NULL)
    }
    substr(text, openLoc[1, "end"] + 1, closeLoc[1, "start"] - 1)
  }
  checkVersion <- function()
  {
    homePage <- fetchText("http://health.bsd.uchicago.edu/yji/soft.html")
    if (is.null(homePage))
    {
      return(invisible(NULL))
    }
    linkPart <- textAfter(homePage, "LinkToCheckTCGA-AssebmlerVersionNumber")
    if (is.null(linkPart))
    {
      return(invisible(NULL))
    }
    versionURL <- textBetween(linkPart, "href=\"", "\">")
    if (is.null(versionURL))
    {
      return(invisible(NULL))
    }
    versionPage <- fetchText(versionURL)
    if (is.null(versionPage))
    {
      return(invisible(NULL))
    }
    versionPart <- textAfter(versionPage, "CheckVersionNumber1")
    if (is.null(versionPart))
    {
      return(invisible(NULL))
    }
    newestVersion <- textBetween(versionPart, "\">", "</span>")
    if (is.null(newestVersion))
    {
      return(invisible(NULL))
    }
    if (newestVersion != "1.0.3")
    {
      writeLines("\n")
      writeLines("***************************************************************")
      writeLines("A new version of TCGA-Assembler is available!")
      writeLines(paste("Please download version ", newestVersion, " at ", versionURL, sep = ""))
      writeLines("***************************************************************")
      writeLines("\n")
    }
    invisible(NULL)
  }
  # The version check must never prevent the module from loading.
  try(checkVersion(), silent = TRUE)
  invisible(NULL)
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.