
# Defines functions download_gdc

# Documented in download_gdc

#' Download data from GDC Data Portal and GDC Legacy Archive
#'
#' \code{download_gdc} is a function designed to download methylation, mutation,
#' clinical data, protein expression, MAGETAB, gene expression, isoform
#' expression, miRNA expression and clinical images data from GDC Data Portal
#' and GDC Legacy Archive.
#' @param data_type Type of data. It could be \code{"methylation", "mutation",
#'    "clinical_supplement", "biospecimen", "gene", or "clinical"(biotab)}.
#'    \itemize{ \item{Only present
#'    in "Legacy" database:}{\code{"protein", "Exon quantification", "miRNA gene
#'    quantification", "miRNA isoform quantification", "isoform", and "image"}.}
#'    \item{Only present in "GDC" database:}{\code{"miRNA
#'    Expression Quantification", and "Isoform Expression Quantification"
#'    (miRNA)}.}}
#' @param tumor A character string containing one of the 33 tumors available
#'    in the TCGA project. For instance, \code{"BRCA"} stands for breast
#'    cancer.
#' @param data_base A character string specifying \code{"GDC"} for GDC Data
#'    Portal or \code{"legacy"} for GDC Legacy Archive.
#' @param htseq A character string indicating which HTSeq workflow data should
#'    be downloaded: \code{"Counts", "FPKM", or "all"}. The default is
#'    \code{"all"}.
#' @param work_dir A character string specifying the path to work directory.
#' @param all_files A logical value. Set \code{FALSE} to avoid downloading
#'    unused data (e.g. quantification files) and reduce the download size.
#'    The default is \code{FALSE}.
#' @param platform A character string indicating the platform name for
#'    methylation, exon quantification, miRNA, and mutation data. \itemize{
#'    \item{For mutation and exon quantification data:}{\code{"Illumina GA",
#'    "Illumina HiSeq" or "all"}.} \item{For methylation data:}{\code{"Illumina
#'    Human Methylation 450", "Illumina Human Methylation 27" or "all"}.}
#'    \item{For miRNA data:}{\code{"Illumina GA", "Illumina HiSeq",
#'    "H-miRNA_8x15K" (for GBM tumor), "H-miRNA_8x15Kv2" (for OV tumor), or
#'    "all"}.} } The default for all data_type cited is \code{"all"} (when
#'    downloading data).
#' @return The downloaded files are stored inside the designated folders on
#'    the user's machine.
#' @import AnnotationDbi clusterProfiler devtools DOSE ggbiplot ggplot2 methods
#'    stringi survminer yarrr
#' @export
#' @importFrom curl curl
#' @importFrom httr content
#' @importFrom httr GET
#' @importFrom jsonlite fromJSON
#' @importFrom tools md5sum
#' @importFrom grDevices dev.off hsv png svg
#' @importFrom graphics abline axis hist image layout legend lines matplot
#'    mtext par plot.new points rect rug text title
#' @importFrom stats TukeyHSD anova aov as.dendrogram as.dist chisq.test coef
#'    confint cor density dist formula hclust kruskal.test median model.matrix
#'    na.exclude na.omit order.dendrogram p.adjust pairwise.wilcox.test prcomp
#'    reorder residuals sd shapiro.test summary.aov summary.lm
#' @importFrom utils combn read.csv read.delim read.table setTxtProgressBar
#'   txtProgressBar untar write.csv write.table
#' @examples
#' library(DOAGDC)
#' # Downloading gene expression data from GDC Legacy Archive
#' download_gdc("gene", "CHOL", "legacy", work_dir = "~/Desktop")
download_gdc <- function(data_type = "gene",
                        data_base = "legacy",
                        htseq = "",
                        all_files = FALSE,
                        platform = "all") {

    # local functions ####
    download_httr <- function(url, destfile) {
        first <- httr::GET(url = url)
        second <- httr::content(x = first, as = "raw")
        writeBin(object = second, con = destfile)

    size_par <- function(tumor, type_of_data, db) {
        if (db == "legacy") {
            first_part <- "https://api.gdc.cancer.gov/legacy/projects/TCGA-"
        } else {
            first_part <- "https://api.gdc.cancer.gov/projects/TCGA-"
        url <- paste0(
            first_part, toupper(tumor),
        jason <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
        jason <- jason$data
        jason <- jason$summary
        jason <- jason$data_categories
        jason$data_category <- tolower(jason$data_category)
        size <- as.numeric(subset(
            x = jason,
            subset = data_category == tolower(type_of_data),

    # old api url https://gdc-api.nci.nih.gov/
    # selecting the right API

    # code ####
    db_bool <- tolower(data_base) == "legacy"

    if (db_bool) {
        inicio <- "https://api.gdc.cancer.gov/legacy/data/"
        url_inicio <- "https://api.gdc.cancer.gov/legacy/files/"
        status <- "https://api.gdc.cancer.gov/legacy/status"
        folder_name <- paste0(tolower(data_type), "_data")
    } else if (tolower(data_base) == "gdc") {
        inicio <- "https://api.gdc.cancer.gov/data/"
        url_inicio <- "https://api.gdc.cancer.gov/files/"
        status <- "https://api.gdc.cancer.gov/status"
        folder_name <- paste(tolower(data_base), tolower(data_type),
            sep = "_"
    } else {
        stop("Please insert a data base name!")

    message("Please wait, accessing GDC server...")
    tryCatch(tmp <- read.csv(status),
        error = function(e) {
                "GDC server or your internet conection is",
                " off. \n Please try again later!"

    dir.create(path = file.path(work_dir, "DOAGDC"), showWarnings = FALSE)
        path = file.path(work_dir, "DOAGDC", toupper(tumor)),
        showWarnings = FALSE

    # legacy ####
    if (db_bool) {
        # NOTE gene and isoform ####
        if ("gene" %in% tolower(data_type) || "isoform" %in% tolower(data_type)) {
            size_par_rsem <- function(tumor) {
                url <- paste0(
                    "projects/TCGA-", toupper(tumor),
                jason <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
                jason <- jason$data
                jason <- jason$summary
                jason <- jason$data_categories
                size <- subset(
                    x = jason,
                    subset = data_category == "Gene expression",

            size <- size_par_rsem(tumor = tumor)

            if ("gene" %in% tolower(data_type)) {
                    path = file.path(
                        work_dir, "DOAGDC",
                        toupper(tumor), "gene_data"
                    showWarnings = FALSE
                direc <- file.path(work_dir, "DOAGDC", toupper(tumor),

                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    size, "&filters=%7B%22op%22:%22and%22,%22",
                    "%5B%22TCGA-", toupper(tumor),
            } else if ("isoform" %in% tolower(data_type)) {
                    path = file.path(
                        work_dir, "DOAGDC", toupper(tumor),
                    showWarnings = FALSE
                direc <- file.path(
                    work_dir, "DOAGDC", toupper(tumor),

                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    size, "&filters=%7B%22op%22:%22and%22,%22",
                    "%5B%22TCGA-", toupper(tumor),

            message("\n\nDownloading manifest...\n")
            json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)

            manifest_df <- json$data$hits

            manipular <- manifest_df[, "cases"]
            # 16(submitter_id) will be cases
            manifest_df[, c(
                "center", "acl", "state_comment", "cases",
            )] <- NULL

            cases <- matrix(nrow = nrow(manifest_df), ncol = 1)
            for (index in seq_len(length(manipular))) {
                patient_code <- as.character(unlist(manipular[[index]][1]))
                tmp <- patient_code[grep("TCGA", patient_code)]
                if (tmp == paste0('TCGA-', toupper(tumor))) {
                    patient_code <- as.character(unlist(manipular[[index]][2]))
                    cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
                } else {
                    cases[index, 1] <- tmp

            manifest_df$submitter_id <- cases

            if (!all_files) {
                # no need to download these files
                manifest_df <- manifest_df[!grepl(
                ), ]
                manifest_df <- manifest_df[!grepl(
                ), ]
                manifest_df <- manifest_df[!grepl(
                ), ]
                manifest_df <- manifest_df[!grepl(
                ), ]
                manifest_df <- manifest_df[!grepl(
                ), ]

                x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
                quote = FALSE, row.names = FALSE, sep = "\t"

            manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]

            colnames(manifest_df) <- c("filename", "md5", "id")

            id_matrix <- manifest_df[, "id"]
    } else if (tolower(data_base) == "gdc") {
        # NOTE gene GDC ####
        if ("gene" %in% tolower(data_type)) {
                path = file.path(
                    work_dir, "DOAGDC", toupper(tumor),
                showWarnings = FALSE
            direc <- file.path(work_dir, "DOAGDC", toupper(tumor),

            size_par <- function(tumor) {
                url <- paste0(
                jason <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
                jason <- jason$data
                jason <- jason$summary
                jason <- jason$data_categories
                tmp <- jason$data_category == "Transcriptome Profiling"
                size <- subset(x = jason, subset = tmp, "file_count")

            size <- size_par(tumor = tumor)
            # selecting which HTSeq data to download
            url <- paste0(
                url_inicio, "?pretty=true&expand=cases.samples.",
                size, "&filters=%7B%22op%22:%22and%22,%22",
                "%22value%22:%5B%22TCGA-", toupper(tumor),

            message("\n\nDownloading manifest...\n")
            sw <- function(x) {
            sw(json <- jsonlite::fromJSON(readLines(curl::curl(url))))

            manifest_df <- json$data$hits

            manipular <- manifest_df[, "cases"]
            # 16(submitter_id) will be cases
            manifest_df[, c("analysis", "acl")] <- NULL

            cases <- matrix(nrow = nrow(manifest_df), ncol = 1)
            for (index in seq_len(length(manipular))) {
                patient_code <- as.character(unlist(manipular[[index]][1]))
                tmp <- patient_code[grep("TCGA", patient_code)]
                if (tmp == paste0('TCGA-', toupper(tumor))) {
                    patient_code <- as.character(unlist(manipular[[index]][2]))
                    cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
                } else {
                    cases[index, 1] <- tmp

            manifest_df$cases <- cases

            manifest_df[, "cases"] <- as.character(manifest_df[, "cases"])

                x = manifest_df, file = file.path(
                    direc, "manifest.sdrf"
                quote = FALSE, row.names = FALSE, sep = "\t"

            seletor <- unname(sapply(
                function(w) {
                    paste0(unlist(strsplit(w, "_"))[2])

            manifest_df <- manifest_df[
                grep(tolower(htseq), seletor),
                c("file_name", "md5sum", "file_id")

            colnames(manifest_df) <- c("filename", "md5", "id")

            id_matrix <- manifest_df[, "id"]
        } else if ("isoform" %in% tolower(data_type)) {
                "\nThere is no isoform data in GDC data",
                " base! (unfortunately...)\n"

    # NOTE mutation ####
    if ("mutation" %in% tolower(data_type)) {
            path = file.path(
                work_dir, "DOAGDC", toupper(tumor),
            showWarnings = FALSE
        direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)

        if (db_bool) {
            # legacy exclusive
            if (tolower(platform) %in% "all") {
                tmpform <- "Illumina GA|Illumina HiSeq"
            } else if (tolower(platform) %in% "illumina ga") {
                tmpform <- "Illumina GA"
            } else if (tolower(platform) %in% "illumina hiseq") {
                tmpform <- "Illumina HiSeq"

            platform <- paste0(

            url <- paste0(
                url_inicio, "?pretty=true&expand=cases.samples.",
                "%22:%7B%22field%22:%22files.platform%22,", platform,
                toupper(tumor), "%22%5D%7D%7D%5D%7D&format=JSON"
        } else if (tolower(data_base) == "gdc") {
            url <- paste0(
                url_inicio, "?pretty=true&expand=cases.samples.",
                "%22,%22value%22:%5B%22TCGA-", toupper(tumor),

        json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)

        manifest_df <- json$data$hits

        # checkin' if there are available data to download (e.g. LAML)
        if (length(manifest_df) == 0) {
            stop("There're not data to be downloaded for this cancer type!")

        # center only available at legacy db
        if (!is.null(manifest_df$center)) {
            center <- manifest_df$center
            manifest_df[, c("tags", "acl", "center")] <- NULL
            manifest_df <- cbind(center, manifest_df)
        } else {
            analysis <- manifest_df$analysis
            manifest_df[, c("analysis", "acl")] <- NULL
            manifest_df <- cbind(analysis, manifest_df)

        manipular <- manifest_df[, "cases"]
        manifest_df[, "cases"] <- NULL

        pre_cases <- matrix(data = "",
            nrow = nrow(manifest_df),
            ncol = ncol(manipular[[1]]["project"][[1]][1, ])
        colnames(pre_cases) <- colnames(manipular[[1]]["project"][[1]][1, ])

        for (i in seq_len(length(manipular))) {
            cases <- unlist(manipular[[i]]["samples"])
            cases <- as.character(cases[grep("TCGA", cases)])
            cases <- unname(sapply(
                function(w) {
                    paste(unlist(strsplit(w, "-"))[1:4], collapse = "-")
            manifest_df[i, "cases"] <- paste(cases, collapse = "/")

            pre_cases[i, ] <- as.matrix(manipular[[i]]["project"][[1]][1, ])

        manifest_df <- cbind(manifest_df, pre_cases)

            x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
            quote = FALSE, row.names = FALSE, sep = "\t"

        if (db_bool) {
            manifest_df <- manifest_df[grep(tmpform, manifest_df$platform), ]

        manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]

        colnames(manifest_df) <- c("filename", "md5", "id")

        id_matrix <- manifest_df[, "id"]

    # NOTE methylation ####
    if ("methylation" %in% tolower(data_type)) {
            path = file.path(
                work_dir, "DOAGDC", toupper(tumor),
            showWarnings = FALSE
        direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)

        if (tolower(platform) %in% "all") {
            platform <- paste0(
            tmpform <- paste0(
                "Illumina Human Methylation 450|Illumina ",
                "Human Methylation 27"
        } else if (tolower(platform) %in% "illumina human methylation 450") {
            platform <- paste0(
            tmpform <- "Illumina Human Methylation 450"
        } else if (tolower(platform) %in% "illumina human methylation 27") {
            platform <- paste0(
            tmpform <- "Illumina Human Methylation 27"

        size <- size_par(
            type_of_data = "DNA Methylation", tumor = tumor,
            db = tolower(data_base)

        if (db_bool) {
            url <- paste0(
                url_inicio, "?pretty=true&expand=cases.samples.",
                "analysis&size=", size,
                ".platform%22,", platform,
        } else if (tolower(data_base) == "gdc") {
            url <- paste0(
                url_inicio, "?pretty=true&expand=cases.samples.",
                size, "&filters=%7B%22op%22:%22and%22,%22content%22",
                "%7B%22field%22:%22files.platform%22,", platform,
                "value%22:%5B%22TCGA-", toupper(tumor),
        message("\n\nDownloading manifest...\n")

        json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)$data$hits

        manifest_df <- json

        # checkin' if there are available data to download (e.g. LAML)
        if (length(manifest_df) == 0) {
            stop("There're not data to be downloaded for this cancer type!")

        # center only available at legacy db
        if (!is.null(manifest_df$center)) {
            center <- manifest_df$center
            manifest_df[, c("tags", "acl", "center")] <- NULL
            manifest_df <- cbind(center, manifest_df)
        } else {
            analysis <- manifest_df$analysis
            manifest_df[, c("analysis", "acl")] <- NULL
            manifest_df <- cbind(analysis, manifest_df)

        manipular <- manifest_df[, "cases"]

        cases <- matrix(nrow = nrow(manifest_df), ncol = 1)

        for (index in seq_len(length(manipular))) {
            # patient_code <- manipular[[index]][ifelse(db_bool, 2, 1)]
            patient_code <- as.character(unlist(manipular[[index]][1]))
            tmp <- patient_code[grep("TCGA", patient_code)]
            if (tmp == paste0('TCGA-', toupper(tumor))) {
                patient_code <- as.character(unlist(manipular[[index]][2]))
                cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
            } else {
                cases[index, 1] <- tmp

        manifest_df$cases <- cases

            x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
            quote = FALSE, row.names = FALSE, sep = "\t"

        manifest_df <- manifest_df[
            grep(tmpform, manifest_df$platform),
            c("file_name", "md5sum", "file_id")

        colnames(manifest_df) <- c("filename", "md5", "id")

        id_matrix <- manifest_df[, "id"]

    # NOTE clinical and image ####
    tmp <- c("clinical", "biospecimen", "clinical_supplement", "image")
    if (tolower(data_type) %in% tmp) {
            path = file.path(
                work_dir, "DOAGDC", toupper(tumor),
            showWarnings = FALSE
        direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)

        if (db_bool) {
            size <- size_par(
                tumor = tumor, type_of_data = "Clinical",
                db = "legacy"
            if ("image" == tolower(data_type)) {
                    "A lot of data are going to be downloaded,",
                    " please wait..."
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    toupper(tumor), "%22%5D%7D%7D,%7B%22op%22:%22",
            } else if ("clinical_supplement" == tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "center,analysis&size=", size, "&filters=",
            } else if ("clinical" == tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "center,analysis&size=", size, "&filters=%7B%",
                    "value%22:%5B%22TCGA-", toupper(tumor),
            } else if ("biospecimen" == tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "center,analysis&size=", size, "&filters=%7B%22",
                    "22:%5B%22TCGA-", toupper(tumor), "%22%5D%7D%",
        } else if (tolower(data_base) == "gdc") {
            size <- size_par(
                tumor = tumor, type_of_data = "Clinical",
                db = "gdc"

            if ("clinical_supplement" == tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "center,analysis&size=", size,
                    "%22%3A%5B%22TCGA-", toupper(tumor), "%22%5D%",
            } else if ("clinical" == tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "center,analysis&size=", size, "&filters=",
                    "%22TCGA-", toupper(tumor), "%22%5D%7D%7D%2C",
            } else if ("biospecimen" == tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "ct,center,analysis&size=", size,
                    "22%3A%5B%22TCGA-", toupper(tumor),

        message("\n\nDownloading manifest...\n")
        json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)

        manifest_df <- json$data$hits

        # checkin' if there are available data to download (e.g. LAML)
        if (length(manifest_df) == 0) {
            stop("There're not data to be downloaded for this cancer type!")

        # center only available at legacy db
        if (!is.null(manifest_df$center)) {
            center <- manifest_df$center
            manifest_df[, c("tags", "acl", "center", "cases")] <- NULL
            manifest_df <- cbind(center, manifest_df)
        } else {
            manifest_df[, c("tags", "acl", "cases")] <- NULL

            x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
            quote = FALSE, row.names = FALSE, sep = "\t"

        manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]
        colnames(manifest_df) <- c("filename", "md5", "id")
        id_matrix <- manifest_df[, "id"]

    # NOTE protein ####
    if ("protein" == tolower(data_type)) {
        if (tolower(data_base) == "gdc") {
                "\nThrere is no protein expression data in",
                " GDC data base!!",
                "\nPlease use 'legacy' data base"
            path = file.path(
                work_dir, "DOAGDC", toupper(tumor),
            showWarnings = FALSE
        direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)

        size <- size_par(
            tumor = tumor, type_of_data = "Protein expression",
            db = "legacy"

        message("Cheking if size is different of zero...")

        url <- paste0(
            url_inicio, "?pretty=true&expand=cases.samples.",
            "analysis&size=", size, "&filters=%7B%22op%22",
            toupper(tumor), "%22%5D%7D%7D%5D%7D&format=JSON"

        message("\n\nDownloading manifest...\n")
        json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)

        manifest_df <- json$data$hits

        # checkin' if there are available data to download (e.g. LAML)
        if (length(manifest_df) == 0) {
            stop("There're not data to be downloaded for this cancer type!")

        center <- manifest_df$center
        manifest_df[, c("center", "acl", "cases")] <- NULL

        manifest_df <- cbind(center, manifest_df)

        # for some reason sometimes- 16(submitter_id) will be cases
        patient_code <- manifest_df$submitter_id

            x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
            quote = FALSE, row.names = FALSE, sep = "\t"

        manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]

        colnames(manifest_df) <- c("filename", "md5", "id")

        id_matrix <- manifest_df[, "id"]

    # NOTE mage ####
    if ("mage" == tolower(data_type)) {
            path = file.path(
                work_dir, "DOAGDC", toupper(tumor),
            showWarnings = FALSE
        direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)

        url <- paste0(
            url_inicio, "?pretty=true&expand=cases.samples.",

        message("\n\nDownloading manifest...\n")
        json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)

        manifest_df <- json$data$hits

        # checkin' if there are available data to download (e.g. LAML)
        if (nrow(manifest_df) == 0) {
            stop("There're not data to be downloaded for this cancer type!")

        # 16(submitter_id) will be cases
        manifest_df[, c("acl", "cases")] <- NULL

        manifest_df <- manifest_df[grepl(toupper(tumor), manifest_df$file_name,
            ignore.case = FALSE, perl = TRUE
        ), ]

            x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
            quote = FALSE, row.names = FALSE, sep = "\t"

        manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]

        colnames(manifest_df) <- c("filename", "md5", "id")

        id_matrix <- manifest_df[, "id"]

    # NOTE mirna ####
    is_mirna <- "mirna" %in% strsplit(tolower(data_type), split = " ")[[1]][1]
    is_isoform <- "isoform expression quantification" == tolower(data_type)
    if (is_mirna || is_isoform) {
            path = file.path(
                work_dir, "DOAGDC", toupper(tumor),
            showWarnings = FALSE
        direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)

        if (tolower(platform) %in% "all") {
            platform <- ""
            tmp <- "Illumina HiSeq|Illumina GA|H-miRNA_8x15Kv2|H-miRNA_8x15Kv"
        } else if (tolower(platform) %in% "illumina hiseq") {
            platform <- paste0(
            tmp <- "Illumina HiSeq"
        } else if (tolower(platform) %in% "illumina ga") {
            platform <- paste0(
            tmp <- "Illumina GA"
        } else if (tolower(platform) %in% "h-mirna_8x15kv2") {
            platform <- paste0(
            tmp <- "H-miRNA_8x15Kv2"
        } else if (tolower(platform) %in% "h-mirna_8x15kv") {
            platform <- paste0(
            tmp <- "H-miRNA_8x15Kv"

        if (db_bool) {
            size <- size_par(
                type_of_data = "Gene expression", tumor = tumor,
                db = tolower(data_base)
            if ("mirna gene quantification" %in% tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "center,analysis&size=", size,
                    "value%22:%5B%22TCGA-", toupper(tumor),
                    "%22%5D%7D%7D", platform, "%5D%7D&format=JSON"
            } else if ("mirna isoform quantification" %in% tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    "center,analysis&size=", size,
                    "%22,%22value%22:%5B%22TCGA-", toupper(tumor),
                    "%22%5D%7D%7D", platform, "%5D%7D&format=JSON"
        } else if (tolower(data_base) == "gdc") {
            size <- size_par(
                type_of_data = "Transcriptome Profiling",
                tumor = tumor, db = tolower(data_base)
            if ("mirna expression quantification" %in% tolower(data_type)) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    size, "&filters=%7B%22op%22%3A%22and%22%2C%",
                    "5B%22TCGA-", toupper(tumor),
            } else if (is_isoform) {
                url <- paste0(
                    url_inicio, "?pretty=true&expand=cases.samples.",
                    size, "&filters=%7B%22op%22%3A%22and%22%2C%22",
        message("\n\nDownloading manifest...\n")

        json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
        manifest_df <- json$data$hits

        # checkin' if there are available data to download (e.g. LAML)
        if (length(manifest_df) == 0) {
            stop("There're not data to be downloaded for this cancer type!")

        # center only available at legacy db
        if (!is.null(manifest_df$center)) {
            center <- manifest_df$center
            manifest_df[, c("tags", "acl", "center")] <- NULL
            manifest_df <- cbind(center, manifest_df)
        } else {
            analysis <- manifest_df$analysis
            manifest_df[, c("analysis", "acl")] <- NULL
            manifest_df <- cbind(analysis, manifest_df)

        manipular <- manifest_df[, "cases"]

        cases <- matrix(nrow = nrow(manifest_df), ncol = 1)

        for (index in seq_len(length(manipular))) {
            patient_code <- as.character(unlist(manipular[[index]][1]))
            tmp <- patient_code[grep("TCGA", patient_code)]
            if (tmp == paste0('TCGA-', toupper(tumor))) {
                patient_code <- as.character(unlist(manipular[[index]][2]))
                cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
            } else {
                cases[index, 1] <- tmp

        manifest_df$cases <- cases

            x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
            quote = FALSE, row.names = FALSE, sep = "\t"

        if (db_bool) {
            manifest_df <- manifest_df[grep(tmp, manifest_df$platform), ]

        manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]

        colnames(manifest_df) <- c("filename", "md5", "id")
        id_matrix <- manifest_df[, "id"]

    # NOTE Exon quantification ####
    if ("exon" == strsplit(tolower(data_type), split = " ")[[1]][1]) {
        if (tolower(data_base) == "gdc") {
                "\nThrere is no Exon quantification ",
                "data in GDC data base!!",
                "\nPlease use 'legacy' data base"
            path = file.path(
                work_dir, "DOAGDC", toupper(tumor),
            showWarnings = FALSE
        direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)

        if (tolower(platform) %in% "all") {
            tmpform <- "Illumina GA|Illumina HiSeq"
        } else if (tolower(platform) %in% "illumina ga") {
            tmpform <- "Illumina GA"
        } else if (tolower(platform) %in% "illumina hiseq") {
            tmpform <- "Illumina HiSeq"

        platform <- paste0(

        size <- size_par(
            tumor = tumor, type_of_data = "Gene expression",
            db = "legacy"

        url <- paste0(
            url_inicio, "?pretty=true&expand=cases.samples.",
            "analysis&size=", size, "&filters=",
            platform, ",%7B%22op%22:",
            ":%5B%22TCGA-", toupper(tumor),

        message("\n\nDownloading manifest...\n")

        json <- tryCatch(jsonlite::fromJSON(url, simplifyDataFrame = TRUE),
            error = function(e) stop(e),
            finally = message(
                "The tumor ", toupper(tumor),
                " does not have ", tolower(data_type),

        manifest_df <- json$data$hits

        # checkin' if there are available data to download (e.g. LAML)
        if (length(manifest_df) == 0) {
            stop("There're not data to be downloaded for this cancer type!")

        center <- manifest_df$center
        manifest_df[, c("center", "acl", "cases", "tags")] <- NULL

        manifest_df <- cbind(center, manifest_df)

        # for some reason sometimes- 16(submitter_id) will be cases
        patient_code <- manifest_df$submitter_id # same protein problem

            x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
            quote = FALSE, row.names = FALSE, sep = "\t"

        manifest_df <- manifest_df[
            grep(tmpform, manifest_df$platform),
            c("file_name", "md5sum", "file_id")

        colnames(manifest_df) <- c("filename", "md5", "id")

        id_matrix <- manifest_df[, "id"]

    # NOTE Download PPD ####
    if (length(dir(direc)) > 1) {
        # verifying if the data is already downloaded
        pattern <- paste(".sdrf", "Data_access_time.txt", sep = "|")
        already_downloaded <- dir(
            path = direc, include.dirs = FALSE,
            recursive = FALSE,
            full.names = TRUE
            dir(path = direc)
        message("Checking md5 from downloaded files\n")
        already_downloaded_md5 <- as.vector(tools::md5sum(already_downloaded))
        selector <- manifest_df[, "md5"] %in% already_downloaded_md5
        id_matrix <- manifest_df[!selector, "id"]

    if (length(id_matrix) != 0) {
        ### download data
        url <- paste0(inicio, id_matrix)
        pb <- txtProgressBar(min = 0, max = length(url), style = 3)
        cont <- 0
        for (id in url) {
            cont <- cont + 1
            message(paste("\nDownloading", tumor, data_type, cont, "of",
                sep = " "
            setTxtProgressBar(pb, cont)
            tmp <- manifest_df[manifest_df$id == id_matrix[cont], "filename"]
            download_httr(url = id, destfile = paste0(direc, "/", tmp))
            md5 <- tools::md5sum(dir(
                path = direc, pattern = tmp,
                full.names = TRUE
            tmp_md5 <- manifest_df[manifest_df$id == id_matrix[cont], "md5"]
            while (md5[[1]] != tmp_md5) {
                    "The md5 of file '", tmp,
                    "' is wrong. Downloading again...\n"
                download_httr(url = id, destfile = paste0(direc, "/", tmp))
                md5 <- tools::md5sum(dir(direc, tmp, full.names = TRUE))
        # from python
        # file_endpt = 'https://api.gdc.cancer.gov/files/'
        # file_uuid = 'd853e541-f16a-4345-9f00-88e03c2dc0bc'
        # response = requests.get(file_endpt + file_uuid)
        # message(sprintf(
        # "On %s I realized %s was...\n%s by the street", Sys.Date(), person,
        # action))
        message("\n\nDownload is done!\n\n")
    } else {
            "There is nothing to download for ", tumor,
            ". You already have all data available",
            " in the selected data base!"

    # saving accession data
    write.table(Sys.time(), paste0(direc, "/Data_access_time.txt"),
        quote = FALSE,
        row.names = FALSE, col.names = FALSE
