R/ID-translation.R

Defines functions filenameToBarcode .matchSort barcodeToUUID UUIDtoUUID UUIDtoBarcode .cleanExpand .buildIDframe .findBarcodeLimit .barcode_cases .subword_id .barcode_files

Documented in barcodeToUUID filenameToBarcode UUIDtoBarcode UUIDtoUUID

## function to figure out exact endpoint based on TCGA barcode
.barcode_files <- function(startPoint = "cases", submitter_id = TRUE) {
    keywords <- c("cases", "samples", "portions", "analytes", "aliquots")
    last <- match.arg(startPoint, keywords)
    indx <- seq_len(which(keywords == last))
    sub_id <- if (submitter_id) "submitter_id" else NULL
    paste(c(keywords[indx], sub_id), collapse = ".")
}

.subword_id <- function(keyword) {
    ret <- paste0(keyword, "_ids")
    setNames(paste0("submitter_", ret), ret)
}

.barcode_cases <- function(bcodeType = "case") {
    if (identical(bcodeType, "case"))
        setNames("submitter_id", "case_id")
    else
        .subword_id(bcodeType)
}

.findBarcodeLimit <- function(barcode) {
    .checkBarcodes(barcode)
    filler <- .uniqueDelim(barcode)
    splitCodes <- strsplit(barcode, filler)
    obsIdx <- unique(lengths(splitCodes))

    if (obsIdx < 3L)
        stop("Minimum barcode fields required: ", 3L,
            "; first three are 'project-TSS-participant'")

    key <- c(rep("case", 3L), "sample", "analyte", "aliquot", "aliquot")[obsIdx]
    if (identical(key, "analyte")) {
        analyte_chars <- unique(
            vapply(splitCodes, function(x) nchar(x[[obsIdx]]), integer(1L))
        )
        if (!S4Vectors::isSingleInteger(analyte_chars))
            stop("Inconsistent '", key, "' barcodes")
        if (analyte_chars < 3)
            key <- "portion"
    } else if (identical(key, "aliquot")) {
        if (identical(obsIdx, 6L)) {
            ali_chars <- vapply(splitCodes, function(x)
                nchar(x[c(obsIdx-1L, obsIdx)]), integer(2L))
            if (identical(ali_chars, c(2L, 3L)))
                key <- "slide"
        }
    }
    key
}

.buildIDframe <- function(info, id_list) {
    barcodes_per_file <- lengths(id_list)
    # And build the data.frame
    data.frame(
        id = rep(ids(info), barcodes_per_file),
        barcode = if (!length(ids(info))) character(0L) else unlist(id_list),
        row.names = NULL,
        stringsAsFactors = FALSE
    )
}

.cleanExpand <- function(result, ids) {
    samps <- result[["samples"]]
    usamps <- unlist(samps)
    splitsamps <- split(unname(usamps), gsub("[0-9]*$", "", names(usamps)))
    splits <- strsplit(names(splitsamps), "\\.")
    cnames <- unique(vapply(splits, function(x) {
        paste0(x[-1], collapse = ".") }, character(1)))
    first <- unlist(splitsamps[c(TRUE, FALSE)])
    second <- unlist(splitsamps[c(FALSE, TRUE)])
    pos <- match(ids, first)
    resframe <- cbind.data.frame(first[pos], second[pos], row.names = NULL,
        stringsAsFactors = FALSE)
    names(resframe) <- cnames
    resframe
}

#' @name ID-translation
#'
#' @title Translate study identifiers from barcode to UUID and vice versa
#'
#' @description These functions allow the user to enter a character vector of
#' identifiers and use the GDC API to translate from TCGA barcodes to
#' Universally Unique Identifiers (UUID) and vice versa. These relationships
#' are not one-to-one. Therefore, a \code{data.frame} is returned for all
#' inputs. The UUID to TCGA barcode translation only applies to file and case
#' UUIDs. Two-way UUID translation is available from 'file_id' to 'case_id'
#' and vice versa. Please double check any results before using these
#' features for analysis. Case / submitter identifiers are translated by
#' default, see the \code{from_type} argument for details. All identifiers are
#' converted to lower case.
#'
#' @details
#' Based on the file UUID supplied, the appropriate entity_id (TCGA barcode) is
#' returned. In previous versions of the package, the 'end_point' parameter
#' would require the user to specify what type of barcode needed. This is no
#' longer supported as `entity_id` returns the appropriate one.
#'
#' @param id_vector A \code{character} vector of UUIDs corresponding to
#' either files or cases (default assumes case_ids)
#' @param from_type Either \code{case_id} or \code{file_id} indicating the type of
#' \code{id_vector} entered (default "case_id")
#' @param legacy (logical default FALSE) whether to search the legacy archives
#'
#' @return A \code{data.frame} of TCGA barcode identifiers and UUIDs
#'
#' @examples
#' ## Translate UUIDs >> TCGA Barcode
#'
#' uuids <- c("0001801b-54b0-4551-8d7a-d66fb59429bf",
#' "002c67f2-ff52-4246-9d65-a3f69df6789e",
#' "003143c8-bbbf-46b9-a96f-f58530f4bb82")
#'
#' UUIDtoBarcode(uuids, from_type = "file_id")
#'
#' UUIDtoBarcode("ae55b2d3-62a1-419e-9f9a-5ddfac356db4", from_type = "case_id")
#'
#' UUIDtoBarcode("d85d8a17-8aea-49d3-8a03-8f13141c163b", "aliquot_ids")
#'
#' @author Sean Davis, M. Ramos
#'
#' @export UUIDtoBarcode
UUIDtoBarcode <-  function(id_vector,
    from_type = c("case_id", "file_id", "aliquot_ids"), legacy = FALSE) {
    from_type <- match.arg(from_type)
    targetElement <- APIendpoint <- "submitter_id"
    if (identical(from_type, "file_id")) {
        APIendpoint <- "associated_entities.entity_submitter_id"
        targetElement <- "associated_entities"
    } else if (identical(from_type, "aliquot_ids")) {
        APIendpoint <- "samples.portions.analytes.aliquots.submitter_id"
        targetElement <- "samples"
    }
    selector <- switch(from_type, case_id = identity, aliquot_ids =
        function(x) select(x = x, fields =
            c(APIendpoint, "samples.portions.analytes.aliquots.aliquot_id")),
        function(x) select(x = x, fields = APIendpoint))

    funcRes <- switch(from_type,
        file_id = files(legacy = legacy),
        case_id = cases(legacy = legacy),
        aliquot_ids = cases(legacy = legacy))
    info <- results_all(
        selector(
            filter(funcRes, as.formula(
                paste("~ ", from_type, "%in% id_vector")
            ))
        )
    )
    if (!length(info))
        stop("No barcodes found, only case and file UUIDs are supported.")

    rframe <-
    if (identical(from_type, "case_id"))
        data.frame(info[[from_type]], info[[targetElement]],
            stringsAsFactors = FALSE)
    else if (identical(from_type, "file_id"))
        data.frame(names(info[[targetElement]]),
            unlist(info[[targetElement]], use.names = FALSE),
            stringsAsFactors = FALSE)
    else
        return(.cleanExpand(info, id_vector))

    names(rframe) <- c(from_type, APIendpoint)
    rframe[[from_type]] <- factor(rframe[[from_type]], levels = id_vector)
    rframe[order(rframe[[from_type]]), , drop = FALSE]
}

#' @rdname ID-translation
#'
#' @param to_type The desired UUID type to obtain, can either be "case_id" or
#' "file_id"
#'
#' @examples
#' ## Translate file UUIDs >> case UUIDs
#'
#' uuids <- c("0001801b-54b0-4551-8d7a-d66fb59429bf",
#' "002c67f2-ff52-4246-9d65-a3f69df6789e",
#' "003143c8-bbbf-46b9-a96f-f58530f4bb82")
#'
#' UUIDtoUUID(uuids)
#'
#' @export UUIDtoUUID
UUIDtoUUID <- function(id_vector, to_type = c("case_id", "file_id"),
    legacy = FALSE) {
    id_vector <- tolower(id_vector)
    type_ops <- c("case_id", "file_id")
    to_type <- match.arg(to_type)
    from_type <- type_ops[!type_ops %in% to_type]
    if (!length(from_type))
        stop("Provide a valid UUID type")

    endpoint <- switch(to_type,
        case_id = "cases.case_id",
        file_id = "files.file_id")
    apifun <- switch(to_type,
        file_id = cases(legacy = legacy),
        case_id = files(legacy = legacy))
    info <- results_all(
        select(filter(apifun, as.formula(
            paste("~ ", from_type, "%in% id_vector")
            )),
        endpoint)
    )
    targetElement <- gsub("(\\w+).*", "\\1", endpoint)
    id_list <- lapply(info[[targetElement]], function(x) {x[[1]]})

    rframe <- .buildIDframe(info, id_list)
    names(rframe) <- c(from_type, endpoint)
    rframe
}

#' @rdname ID-translation
#'
#' @param barcodes A \code{character} vector of TCGA barcodes
#'
#' @examples
#' ## Translate TCGA Barcode >> UUIDs
#'
#' fullBarcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
#' "TCGA-B0-5094-11A-01D-1421-08",
#' "TCGA-E9-A295-10A-01D-A16D-09")
#'
#' sample_ids <- TCGAbarcode(fullBarcodes, sample = TRUE)
#'
#' barcodeToUUID(sample_ids)
#'
#' participant_ids <- c("TCGA-CK-4948", "TCGA-D1-A17N",
#' "TCGA-4V-A9QX", "TCGA-4V-A9QM")
#'
#' barcodeToUUID(participant_ids)
#'
#' @export barcodeToUUID
barcodeToUUID <-
    function(barcodes, legacy = FALSE)
{
    .checkBarcodes(barcodes)
    bend <- .findBarcodeLimit(barcodes)
    endtargets <- .barcode_cases(bend)
    expander <- gsub("cases\\.", "", .barcode_files(bend, FALSE))

    pand <- switch(expander, cases = identity,
        function(x) expand(x = x, expand = expander))
    info <- results_all(
        pand(x = filter(cases(), as.formula(
            paste("~ ", endtargets, "%in% barcodes")
        )))
    )
    if (identical(expander, "cases")) {
        rframe <- as.data.frame(info[c(endtargets, names(endtargets))],
            stringsAsFactors = FALSE)
    } else {
        idnames <- lapply(ids(info), function(ident) {
            info[["samples"]][[ident]]
        })
        if (!identical(expander, "samples")) {
            exFUN <- switch(expander,
                samples.portions =
                    function(x, i) x[["portions"]],
                samples.portions.analytes =
                    function(x, i) unlist(lapply(
                        x[["portions"]], `[[`, "analytes"), recursive = FALSE),
                samples.portions.analytes.aliquots =
                    function(x, i) unlist(lapply(
                        unlist(
                            lapply(x[["portions"]], `[[`, "analytes"),
                            recursive = FALSE), `[[`, "aliquots"),
                        recursive = FALSE)
                )
            idnames <- unlist(lapply(seq_along(idnames), function(i)
                exFUN(x = idnames[[i]], i = i)
            ), recursive = FALSE)
            idnames <- Filter(function(g) length(g) >= 2L, idnames)
        }
        rescols <- lapply(idnames, `[`,
            c("submitter_id", gsub("s$", "", names(endtargets))))
        rframe <- do.call(rbind, c(rescols, stringsAsFactors = FALSE))
        names(rframe) <- c(endtargets, names(endtargets))
    }
    rframe[na.omit(match(barcodes, rframe[[endtargets]])), , drop = FALSE]
}

.matchSort <- function(major, minor) {
    hits <- S4Vectors::findMatches(major, minor)
    order(S4Vectors::subjectHits(hits))
}

#' @rdname ID-translation
#'
#' @param filenames A \code{character} vector of filenames usually obtained
#' from the GenomicDataCommons
#'
#' @examples
#' library(GenomicDataCommons)
#'
#' fquery <- files() %>%
#'     filter(~ cases.project.project_id == "TCGA-COAD" &
#'         data_category == "Copy Number Variation" &
#'         data_type == "Copy Number Segment")
#'
#' fnames <- results(fquery)$file_name[1:6]
#'
#' filenameToBarcode(fnames)
#'
#' @export filenameToBarcode
filenameToBarcode <- function(filenames, legacy = FALSE) {
    filesres <- files(legacy = legacy)
    info <- results_all(
        select(filter(filesres, ~ file_name %in% filenames),
            c("file_name",
            "cases.samples.portions.analytes.aliquots.submitter_id"))
    )

    reps <- lengths(lapply(info[["cases"]], unlist))
    res <- data.frame(file_name = rep(info[["file_name"]], reps),
        file_id = rep(info[["file_id"]], reps),
        aliquots.submitter_id = unname(unlist(info[["cases"]])),
        row.names = NULL,
        stringsAsFactors = FALSE)
    idx <- .matchSort(res[["file_name"]], filenames)
    res[idx, ]
}

Try the TCGAutils package in your browser

Any scripts or data that you put into this service are public.

TCGAutils documentation built on April 17, 2021, 6:04 p.m.