R/download_MTBLS242.R

Defines functions curl_download_retry download_MTBLS242

Documented in download_MTBLS242

#' Download MTBLS242
#'
#' Downloads the [MTBLS242](https://www.ebi.ac.uk/metabolights/MTBLS242/protocols)
#' dataset from Gralka et al., 2015. DOI: \doi{10.3945/ajcn.115.110536}.
#' 
#' Besides the destination directory, this function
#' includes three logical parameters to limit the amount of downloaded/saved data.
#' To run the tutorial workflow:
#' - only the "preop" and "three months" timepoints are used,
#' - only subjects measured in *both* preop and three months time points are used
#' - only the CPMG samples are used.
#' 
#' If you want to run the tutorial, you can set those filters to `TRUE`. Then, roughly
#' 800MB will be downloaded, and 77MB of disk space will be used, since for each
#' downloaded sample we remove all the data but the CPMG.
#' 
#' If you set those filters to `FALSE`, roughly 1.8GB of data will be
#' downloaded (since we have more timepoints to download) and 1.8GB
#' of disk space will be used.
#' 
#' Note that we have experienced some sporadic timeouts from Metabolights, 
#' when downloading the dataset. If you get those timeouts simply re-run the
#' download function and it will restart from where it stopped.
#' 
#' Note as well, that we observed several files to have incorrect data:
#' - Obs4_0346s.zip is not present in the FTP server
#' - Obs0_0110s.zip and Obs1_0256s.zip incorrectly contain sample Obs1_0010s
#' 
#' This function removes all three samples from the samples annotations and
#' doesn't download their data.
#' 
#' 
#' @param dest_dir Directory where the dataset should be saved
#' @param force Logical. If `TRUE` we do not re-download files if they exist. The function does not check whether cached versions were
#' downloaded with different `keep_only_*` arguments, so please use `force = TRUE` if you change the `keep_only_*` settings.
#' @param keep_only_CPMG_1r If `TRUE`, remove all other data beyond the CPMG real spectrum, which is enough for the tutorial
#' @param keep_only_preop_and_3months If `TRUE`, keep only the preoperatory and the "three months after surgery" time points, enough for the tutorial
#' @param keep_only_complete_time_points If `TRUE`, remove samples that do not appear on all timepoints. Useful for the tutorial.
#'
#' @return Invisibly, the annotations. See the example for how to download the
#'  annotations and create a dataset from the downloaded files.
#' @export
#'
#' @examples
#' \dontrun{
#'   download_MTBLS242("./MTBLS242")
#'   annot <- readr::read_tsv(annotations_destfile)
#'   
#'   dataset <- nmr_read_samples(annot$filename)
#'   dataset <- nmr_meta_add(dataset, annot)
#'   dataset
#' }
download_MTBLS242 <- function(
        dest_dir = "MTBLS242", force = FALSE,
        keep_only_CPMG_1r = TRUE,
        keep_only_preop_and_3months = TRUE,
        keep_only_complete_time_points = TRUE
    ) {
    require_pkgs(pkg = c("curl", "zip"))
    url <- "ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS242/"

    dir.create(dest_dir, recursive = TRUE, showWarnings = FALSE)

    # Download metadata file s_mtbls242.txt
    meta_file <- "s_mtbls242.txt"
    meta_url <- file.path(url, meta_file)

    # meta_dst <- file.path(dest_dir, meta_file)
    # utils::download.file(meta_url, method = "auto", destfile = meta_dst, mode = "wb")
    annotations_destfile <- file.path(dest_dir, "sample_annotations.tsv")
    annotations_orig_destfile <- file.path(dest_dir, "s_mtbls242.txt")
    dst_rootdir <- file.path(dest_dir, "samples")
    if (!file.exists(annotations_orig_destfile) || force) {
        rlang::inform(c("i" = "Downloading sample annotations..."))
        curl_download_retry(url = meta_url, destfile = annotations_orig_destfile)
    }
    if (!file.exists(annotations_destfile) || force) {
        sample_annot <- tibble::as_tibble(
            utils::read.table(
                annotations_orig_destfile,
                sep = "\t",
                header = TRUE,
                check.names = FALSE
            ),
            .name_repair = "minimal"
        )

        # Keep sample name and time point:
        sample_annot <- dplyr::select(
            sample_annot,
            c("NMRExperiment" = "Sample Name", "TimePoint" = "Factor Value[time point]")
        )
        if (keep_only_preop_and_3months) {
            sample_annot <- dplyr::filter(sample_annot, .data$TimePoint %in% c("preop", "3 months after surgery"))
        }
        sample_annot$NMRExperiment <- gsub(pattern = "-", replacement = "_", sample_annot$NMRExperiment, fixed = TRUE)

        sample_annot <- tidyr::separate(
            sample_annot,
            col = "NMRExperiment",
            into = c("timepoint", "SampleID", "S"),
            sep = "_",
            remove = FALSE
        )
        sample_annot <- dplyr::select(sample_annot, -"S")
        sample_annot$filename <- paste0("Obs", sample_annot$timepoint, "_", sample_annot$SampleID, "s")

        sample_annot$NMRExperiment <- sample_annot$filename
        sample_annot <- dplyr::select(sample_annot, -"timepoint", -"filename")
        
        # File Obs0_0110s.zip incorrectly contains Obs0_0010s. Remove that ID
        sample_annot <- dplyr::filter(sample_annot, .data$NMRExperiment != "Obs0_0110s")
        # File Obs1_0256s.zip incorrectly contains Obs1_0010s. Remove that ID
        sample_annot <- dplyr::filter(sample_annot, .data$NMRExperiment != "Obs1_0256s")
        # File Obs4_0346s.zip does not exist in the FTP server, remove that entry:
        sample_annot <- dplyr::filter(sample_annot, .data$NMRExperiment != "Obs4_0346s")

        
        # Keep samples matched in the two timepoints under study:
        num_timepoints <- length(unique(sample_annot$TimePoint))
        if (keep_only_complete_time_points) {
            sample_annot <- dplyr::group_by(sample_annot, .data$SampleID)
            sample_annot <- dplyr::filter(sample_annot, dplyr::n() == !!num_timepoints)
            sample_annot <- dplyr::ungroup(sample_annot)
        }
        
        # filename.zip!/path/to/sample/in/zip
        if (keep_only_CPMG_1r) {
            sample_annot$filename <- paste0(
                dst_rootdir, "/", sample_annot$NMRExperiment, ".zip",
                "!",
                "/", sample_annot$NMRExperiment
            )
        } else {
            sample_annot$filename <- paste0(
                dst_rootdir, "/", sample_annot$NMRExperiment, ".zip",
                "!",
                "/", sample_annot$NMRExperiment, "/3"
            )
            
        }
        utils::write.table(sample_annot, file = annotations_destfile, sep = "\t", row.names = FALSE)
    } else {
        rlang::inform(c("i" = glue("Annotations were previously saved. Loading {annotations_destfile}")))
        sample_annot <- utils::read.csv(annotations_destfile, header = TRUE, sep = "\t")
    }
    dir.create(dst_rootdir, recursive = TRUE, showWarnings = FALSE)
    report_skipped_downloads <- FALSE
    purrr::walk(
        sample_annot$NMRExperiment,
        function(filename_base, url, dst_rootdir, keep_only_CPMG_1r) {
            filename <- paste0(filename_base, ".zip")
            src_url <- file.path(url, filename)
            final_dst_file <- file.path(dst_rootdir, filename)
            intermediate_dst_file <- file.path(dst_rootdir, paste0(filename, "intermediate.zip"))
            if (file.exists(final_dst_file) && !force) {
                if (!report_skipped_downloads) {
                    rlang::inform(c("i" = "Skipping re-download of previously downloaded samples."))
                    report_skipped_downloads <<- TRUE
                }
                return()
            }
            curl_download_retry(url = src_url, destfile = intermediate_dst_file)
            if (!keep_only_CPMG_1r) {
                file.rename(intermediate_dst_file, final_dst_file)
            } else {
                filenames_in_zip <- zip::zip_list(intermediate_dst_file)[["filename"]]
                prefix_to_keep <- file.path(filename_base, "3", "") # subdirectory 3/ contains the CPMG sample
                filenames_in_zip <- filenames_in_zip[startsWith(filenames_in_zip, prefix_to_keep)]
                # extract 3/ to dst_rootdir:
                zip::unzip(zipfile = intermediate_dst_file, exdir = dst_rootdir, files = filenames_in_zip)
                unlink(intermediate_dst_file)
                file.rename(
                    file.path(dst_rootdir, filename_base, "3"),
                    file.path(dst_rootdir, filename_base, filename_base)
                )
                # Remove files not needed:
                unlink(file.path(dst_rootdir, filename_base, filename_base, "fid"))
                unlink(file.path(dst_rootdir, filename_base, filename_base, "pdata", "1", "1i"))
                zipfile <- file.path(normalizePath(dst_rootdir, mustWork = TRUE), filename)
                zip::zip(
                    zipfile = zipfile,
                    files = filename_base,
                    root = file.path(dst_rootdir, filename_base)
                )
                # And once you have the zip file, remove the directory:
                unlink(file.path(dst_rootdir, filename_base), recursive = TRUE)
            }
        },
        url = url,
        dst_rootdir = dst_rootdir,
        keep_only_CPMG_1r = keep_only_CPMG_1r,
        .progress = "Downloading and preparing samples..."
    )
    invisible(sample_annot)
}

curl_download_retry <- function(url, destfile, ..., timeout_retries = 3) {
    attempts <- 1
    seconds_between_attempts <- 3
    while (attempts <= timeout_retries) {
        tryCatch({
            return(curl::curl_download(url = url, destfile = destfile, ...))
        }, error = function(e) {
            msg <- conditionMessage(e)
            if (grepl("curltmp", msg) || grepl(destfile, msg)) {
                stop(e)
            }
            ctrl_c_error <- "Operation was aborted by an application callback"
            if (msg == ctrl_c_error) {
                cli::cli_abort("Aborting download of {url} (interruption requested by user)", parent=e)
            }
            warn_msg <- c(
                "!" = "Failed to download {url}.",
                "i" = "Attempt {attempts} out of {timeout_retries}."
            )
            if (attempts < timeout_retries) {
              warn_msg <- c(
                  warn_msg,
                  "i" = "Underlying error message: {msg}",
                  "i" = "Next attempt in {seconds_between_attempts} seconds"
              )
              cli::cli_warn(warn_msg)
            } else {
              cli::cli_abort(warn_msg, parent=e)
            }
        })
        attempts <- attempts + 1
    }
    stop("Download failed too many times. Retry later or fix the URL")
}
sipss/AlpsNMR documentation built on May 11, 2024, 11:17 a.m.