R/Cellosaurus.R

Defines functions .splitNestedCol .splitCol .processEntry .importCelloFromTxt .formatSynonyms .formatStrProfileData .formatSecondaryAccession .formatRefIds .formatHierarchy .formatDiseases .formatDate .formatCrossRefs .formatComments .formatAgeAtSampling .extractComment .extractCrossRef .addTaxonomy .addSangerModelId .addSamplingSite .addPopulation .addOncotree .addNcitDisease .addMsiStatus .addMisspellings .addIsProblematic .addIsContaminated .addIsCancer .addDepmapId .addAtccId

#' Cellosaurus table
#'
#' @name Cellosaurus
#' @note Updated 2024-03-11.
#'
#' @return `Cellosaurus`.
#'
#' @seealso
#' - https://www.cellosaurus.org/
#' - ftp://ftp.expasy.org/databases/cellosaurus/
#' - https://github.com/calipho-sib/cellosaurus/
#'
#' @examples
#' object <- Cellosaurus()
#' print(object)
NULL



#' Add `atccId` column
#'
#' @note Updated 2023-09-01.
#' @noRd
.addAtccId <- function(object) {
    object[["atccId"]] <- .extractCrossRef(
        object = object,
        keyName = "ATCC"
    )
    object
}



#' Add `depmapId` column
#'
#' @details
#' Note that some cell lines, such as "CVCL_0041", map to multiple DepMap
#' identifiers, which is confusing.
#'
#' @note Updated 2023-09-01.
#' @noRd
.addDepmapId <- function(object) {
    object[["depmapId"]] <- .extractCrossRef(
        object = object,
        keyName = "DepMap"
    )
    object
}



#' Add `isCancer` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.addIsCancer <- function(object) {
    assert(
        is(object, "DFrame"),
        isCharacter(object[["category"]])
    )
    lgl <- object[["category"]] == "Cancer cell line"
    object[["isCancer"]] <- lgl
    object
}



#' Add `isContaminated` column
#'
#' @note Updated 2023-08-24.
#' @noRd
.addIsContaminated <- function(object) {
    assert(
        is(object, "DFrame"),
        is(object[["comments"]], "SimpleList")
    )
    lgl <- vapply(
        X = object[["comments"]],
        FUN = function(x) {
            if (!"Problematic cell line" %in% names(x)) {
                return(FALSE)
            }
            x <- x[["Problematic cell line"]]
            any(grepl(pattern = "^Contaminated.", x = x))
        },
        FUN.VALUE = logical(1L)
    )
    object[["isContaminated"]] <- lgl
    object
}



#' Add `isProblematic` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.addIsProblematic <- function(object) {
    assert(
        is(object, "DFrame"),
        is(object[["comments"]], "SimpleList")
    )
    lgl <- vapply(
        X = object[["comments"]],
        FUN = function(x) {
            "Problematic cell line" %in% names(x)
        },
        FUN.VALUE = logical(1L)
    )
    object[["isProblematic"]] <- lgl
    object
}



#' Add `misspellings` column
#'
#' @note Updated 2023-09-01.
#' @noRd
.addMisspellings <- function(object) {
    vals <- .extractComment(object = object, keyName = "Misspelling")
    vals <- sub(pattern = "^([^;]+);.+$", replacement = "\\1", x = vals)
    object[["misspellings"]] <- vals
    object
}



#' Add `msiStatus` column
#'
#' @note Updated 2023-09-01.
#' @noRd
.addMsiStatus <- function(object) {
    object[["msiStatus"]] <- .extractComment(
        object = object,
        keyName = "Microsatellite instability"
    )
    object
}



#' Extract NCI thesaurus disease identifiers
#'
#' @details
#' Some cell lines rarely map to multiple identifiers, such as "CVCL_0028".
#'
#' @note Updated 2023-09-22.
#' @noRd
.addNcitDisease <- function(object) {
    assert(
        is(object, "DFrame"),
        is(object[["diseases"]], "SimpleList")
    )
    spl <- mclapply(
        X = object[["diseases"]],
        FUN = function(x) {
            x <- x[["NCIt"]]
            if (is.null(x)) {
                return(NULL)
            }
            x <- strSplit(x = x, split = "; ")
            x
        }
    )
    id <- CharacterList(lapply(
        X = spl,
        FUN = function(x) {
            x[, 1L]
        }
    ))
    name <- CharacterList(lapply(
        X = spl,
        FUN = function(x) {
            x[, 2L]
        }
    ))
    object[["ncitDiseaseId"]] <- id
    object[["ncitDiseaseName"]] <- name
    object
}



#' Add OntoTree metadata columns
#'
#' @note Updated 2023-08-26.
#' @noRd
.addOncotree <- function(object) {
    assert(
        is(object, "DFrame"),
        is(ncit2Oncotree, "DFrame"),
        is(oncotree, "DFrame"),
        isSubset(c("accession", "ncitDiseaseId"), colnames(object)),
        is(object[["ncitDiseaseId"]], "CharacterList"),
        identical(
            x = colnames(ncit2Oncotree),
            y = c("ncit", "oncotree")
        ),
        identical(
            x = colnames(oncotree),
            y = c(
                "code",
                "color",
                "name",
                "mainType",
                "externalReferences",
                "tissue",
                "parent",
                "history",
                "level",
                "revocations",
                "precursors"
            )
        )
    )
    ## Build our data frame that we will use for the left join.
    x <- object[, c("accession", "ncitDiseaseId")]
    ## Ignore any cell lines with multi-mapped NCIt terms.
    keep <- bapply(
        X = x[["ncitDiseaseId"]],
        FUN = function(x) {
            identical(length(x), 1L)
        }
    )
    x <- x[keep, , drop = FALSE]
    x[["ncitDiseaseId"]] <- unlist(
        x = x[["ncitDiseaseId"]],
        recursive = FALSE,
        use.names = FALSE
    )
    ## Join the NCIt-to-OncoTree mappings.
    y <- ncit2Oncotree
    colnames(y) <- c("ncitDiseaseId", "oncotreeCode")
    x <- leftJoin(x = x, y = y, by = "ncitDiseaseId")
    # Join the OncoTree metadata.
    y <- oncotree
    rownames(y) <- NULL
    y <- y[
        ,
        setdiff(
            x = colnames(y),
            y = c(
                "color",
                "externalReferences",
                "history",
                "precursors",
                "revocations"
            )
        )
    ]
    colnames(y) <- camelCase(paste("oncotree", colnames(y)))
    x <- leftJoin(x = x, y = y, by = "oncotreeCode")
    x[["ncitDiseaseId"]] <- NULL
    out <- leftJoin(x = object, y = x, by = "accession")
    out
}



#' Add `population` column
#'
#' @note Updated 2023-09-01.
#' @noRd
.addPopulation <- function(object) {
    object[["population"]] <- .extractComment(
        object = object,
        keyName = "Population"
    )
    object
}



#' Add `samplingSite` column
#'
#' @details
#' Note that some comments have additional information on the cell type, that
#' we don't want to include here (e.g. CVCL_0003, CVCL_0008).
#'
#' @note Updated 2023-09-01.
#' @noRd
.addSamplingSite <- function(object) {
    object[["samplingSite"]] <- .extractComment(
        object = object,
        keyName = "Derived from site"
    )
    object
}



#' Add `sangerModelId` column
#'
#' @note Updated 2023-09-01.
#' @noRd
.addSangerModelId <- function(object) {
    object[["sangerModelId"]] <- .extractCrossRef(
        object = object,
        keyName = "Cell_Model_Passport"
    )
    object
}



#' Add `ncbiTaxonomyId` and `organism` columns
#'
#' @note Updated 2023-09-22.
#' @noRd
#'
#' @details
#' Note that hybrid lines contain multiple taxonomies, so we want to keep this
#' as a list instead of attempting to coerce to a vector.
#'
#' Examples: CVCL_0464, CVCL_0E28, CVCL_0E29.
.addTaxonomy <- function(object) {
    assert(
        is(object, "DFrame"),
        is(object[["speciesOfOrigin"]], "CharacterList")
    )
    spl <- mclapply(
        X = object[["speciesOfOrigin"]],
        FUN = strSplit,
        split = "; ! "
    )
    taxId <- IntegerList(lapply(
        X = spl,
        FUN = function(x) {
            sub(
                pattern = "NCBI_TaxID=",
                replacement = "",
                x = x[, 1L]
            )
        }
    ))
    organism <- CharacterList(lapply(
        X = spl,
        FUN = function(x) {
            x[, 2L]
        }
    ))
    ## Standardize organism to full Latin name. For example, this step converts
    ## "Homo sapiens (Human)" to simply "Homo sapiens".
    organism <- sub(
        pattern = "\\s\\(.+\\)$",
        replacement = "",
        x = organism
    )
    object[["ncbiTaxonomyId"]] <- taxId
    object[["organism"]] <- organism
    object[["speciesOfOrigin"]] <- NULL
    object
}



#' Extract and assign identifier column from `crossReferences`
#'
#' @details
#' Note that this intentionally picks only the last matching identifier.
#'
#' This helps avoid issues with discontinued identifiers, such as CVCL_0455,
#' which has multiple DepMap identifiers: ACH-000474 - Discontinued; ACH-001075.
#'
#' @note Updated 2023-09-01.
#' @noRd
.extractCrossRef <- function(object, keyName) {
    assert(
        is(object, "DFrame"),
        is(object[["crossReferences"]], "SimpleList"),
        isString(keyName)
    )
    x <- vapply(
        X = object[["crossReferences"]],
        keyName = keyName,
        FUN = function(x, keyName) {
            ids <- x[[keyName]]
            if (is.null(ids)) {
                return(NA_character_)
            }
            ids[[1L]]
        },
        FUN.VALUE = character(1L)
    )
    x
}



#' Extract a key value pair from comments
#'
#' @note Updated 2023-09-01.
#' @noRd
.extractComment <- function(object, keyName) {
    assert(
        is(object, "DFrame"),
        is(object[["comments"]], "SimpleList"),
        isString(keyName)
    )
    x <- object[["comments"]]
    x <- CharacterList(lapply(
        X = x,
        FUN = function(x) {
            if (is.null(x[[keyName]])) {
                return(NULL)
            }
            x[[keyName]]
        }
    ))
    x
}



#' Format the `ageAtSampling` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.formatAgeAtSampling <- function(object) {
    assert(
        is(object, "DFrame"),
        is.character(object[["ageAtSampling"]])
    )
    idx <- which(object[["ageAtSampling"]] == "Age unspecified")
    object[["ageAtSampling"]][idx] <- NA_character_
    object
}



#' Format the `comments` column
#'
#' @note Updated 2023-09-22.
#' @noRd
.formatComments <- function(object) {
    assert(
        is(object, "DFrame"),
        is(object[["comments"]], "CharacterList")
    )
    object[["comments"]] <- unique(object[["comments"]])
    object[["comments"]] <- sub(
        pattern = "\\.$",
        replacement = "",
        x = object[["comments"]]
    )
    .splitNestedCol(
        object = object,
        colName = "comments",
        split = ": "
    )
}



#' Format the `crossReferences` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.formatCrossRefs <- function(object) {
    .splitNestedCol(
        object = object,
        colName = "crossReferences",
        split = "; "
    )
}



#' Format the `date` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.formatDate <- function(object) {
    object <- .splitCol(
        object = object,
        colName = "date",
        split = "; "
    )
    object <- .splitNestedCol(
        object = object,
        colName = "date",
        split = ": "
    )
    object
}



#' Format the `diseases` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.formatDiseases <- function(object) {
    .splitNestedCol(
        object = object,
        colName = "diseases",
        split = "; "
    )
}



#' Format the `hierarchy` column
#'
#' @details
#' Some entries have multiple elements, such as "CVCL_0464".
#'
#' @note Updated 2023-09-22.
#' @noRd
.formatHierarchy <- function(object) {
    assert(
        is(object, "DFrame"),
        is(object[["hierarchy"]], "CharacterList")
    )
    lst <- mclapply(
        X = object[["hierarchy"]],
        FUN = function(x) {
            if (identical(x, character())) {
                return(character())
            }
            spl <- strSplit(x, split = " ! ")
            out <- spl[, 1L]
            out
        }
    )
    lst <- CharacterList(lst)
    object[["hierarchy"]] <- lst
    object
}



#' Format the `referencesIdentifiers` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.formatRefIds <- function(object) {
    assert(
        is(object, "DFrame"),
        is(object[["referencesIdentifiers"]], "CharacterList")
    )
    object[["referencesIdentifiers"]] <- sub(
        pattern = ";$",
        replacement = "",
        x = object[["referencesIdentifiers"]]
    )
    .splitNestedCol(
        object = object,
        colName = "referencesIdentifiers",
        split = "="
    )
}



#' Format the `secondaryAccession` column
#'
#' @note Updated 2023-09-22.
#' @noRd
.formatSecondaryAccession <- function(object) {
    .splitCol(
        object = object,
        colName = "secondaryAccession",
        split = "; "
    )
}



#' Format the `strProfileData` column
#'
#' @note Updated 2023-01-31.
#' @noRd
.formatStrProfileData <- function(object) {
    .splitNestedCol(
        object = object,
        colName = "strProfileData",
        split = ": "
    )
}



#' Format the `synonyms` column
#'
#' @note Updated 2023-01-24.
#' @noRd
.formatSynonyms <- function(object) {
    .splitCol(
        object = object,
        colName = "synonyms",
        split = "; "
    )
}



## ---------  ---------------------------     ----------------------
## Line code  Content                         Occurrence in an entry
## ---------  ---------------------------     ----------------------
## ID         Identifier (cell line name)     Once; starts an entry
## AC         Accession (CVCL_xxxx)           Once
## AS         Secondary accession number(s)   Optional; once
## SY         Synonyms                        Optional; once
## DR         Cross-references                Optional; once or more
## RX         References identifiers          Optional: once or more
## WW         Web pages                       Optional; once or more
## CC         Comments                        Optional; once or more
## ST         STR profile data                Optional; twice or more
## DI         Diseases                        Optional; once or more
## OX         Species of origin               Once or more
## HI         Hierarchy                       Optional; once or more
## OI         Originate from same individual  Optional; once or more
## SX         Sex of cell                     Optional; once
## AG         Age of donor at sampling        Optional; once
## CA         Category                        Once
## DT         Date (entry history)            Once
## //         Terminator                      Once; ends an entry



#' Import Cellosaurus data frame from TXT file
#'
#' @note Updated 2024-03-11.
#' @noRd
#'
#' @seealso
#' - https://github.com/calipho-sib/cellosaurus/
#' - https://ftp.expasy.org/databases/cellosaurus/
.importCelloFromTxt <- function() {
    url <- pasteUrl(
        "ftp.expasy.org",
        "databases",
        "cellosaurus",
        "cellosaurus.txt",
        protocol = "https"
    )
    con <- cacheUrl(url = url, pkg = .pkgName, update = TRUE)
    lines <- import(con = con, format = "lines")
    ## Extract the Cellosaurus data version (release) from the top of the file.
    dataVersion <- strsplit(
        x = grep(
            pattern = "^ Version:",
            x = lines[1L:10L],
            value = TRUE
        ),
        split = ": ",
        fixed = TRUE
    )[[1L]][[2L]]
    assert(isString(dataVersion))
    dataVersion <- as.numeric_version(dataVersion)
    alert(sprintf(
        "Detected Cellosaurus release %s.",
        as.character(dataVersion)
    ))
    alert(sprintf("Mapping lines into {.cls %s} of entries.", "list"))
    x <- Map(
        f = function(lines, i, j) {
            lines[i:(j - 1L)]
        },
        i = grep(pattern = "^ID\\s+", x = lines, value = FALSE),
        j = grep(pattern = "^//$", x = lines, value = FALSE),
        MoreArgs = list("lines" = lines),
        USE.NAMES = FALSE
    )
    requiredKeys <- c("AC", "CA", "DT", "ID")
    nestedKeys <- c("CC", "DI", "DR", "HI", "OI", "OX", "RX", "ST", "WW")
    optionalKeys <- c("AG", "AS", "SX", "SY")
    alert("Processing entries.")
    x <- mclapply(
        X = x,
        FUN = .processEntry,
        nestedKeys = nestedKeys,
        optionalKeys = optionalKeys
    )
    alert(sprintf(
        "Coercing entries {.cls %s} to {.cls %s} with {.pkg %s}::{.fun %s}.",
        "list", "DFrame", "AcidPlyr", "rbindToDataFrame"
    ))
    df <- rbindToDataFrame(x)
    assert(areSetEqual(
        x = colnames(df),
        y = c(requiredKeys, optionalKeys, nestedKeys)
    ))
    for (key in requiredKeys) {
        assert(isCharacter(df[[key]]))
    }
    for (key in optionalKeys) {
        assert(is.character(df[[key]]))
    }
    for (key in nestedKeys) {
        assert(is.list(df[[key]]))
        df[[key]] <- CharacterList(df[[key]])
    }
    colnames(df)[colnames(df) == "AC"] <- "accession"
    colnames(df)[colnames(df) == "AG"] <- "ageAtSampling"
    colnames(df)[colnames(df) == "AS"] <- "secondaryAccession"
    colnames(df)[colnames(df) == "CA"] <- "category"
    colnames(df)[colnames(df) == "CC"] <- "comments"
    colnames(df)[colnames(df) == "DI"] <- "diseases"
    colnames(df)[colnames(df) == "DR"] <- "crossReferences"
    colnames(df)[colnames(df) == "DT"] <- "date"
    colnames(df)[colnames(df) == "HI"] <- "hierarchy"
    colnames(df)[colnames(df) == "ID"] <- "cellLineName"
    colnames(df)[colnames(df) == "OI"] <- "originateFromSameIndividual"
    colnames(df)[colnames(df) == "OX"] <- "speciesOfOrigin"
    colnames(df)[colnames(df) == "RX"] <- "referencesIdentifiers"
    colnames(df)[colnames(df) == "ST"] <- "strProfileData"
    colnames(df)[colnames(df) == "SX"] <- "sexOfCell"
    colnames(df)[colnames(df) == "SY"] <- "synonyms"
    colnames(df)[colnames(df) == "WW"] <- "webPages"
    assert(
        allAreMatchingRegex(x = df[["accession"]], pattern = "^CVCL_"),
        isCharacter(df[["cellLineName"]])
    )
    rownames(df) <- df[["accession"]]
    df <- df[order(rownames(df)), , drop = FALSE]
    metadata(df) <- list(
        "dataVersion" = dataVersion,
        "date" = Sys.Date(),
        "packageVersion" = .pkgVersion
    )
    df
}



#' Process lines per entry
#'
#' @note Updated 2023-09-22.
#' @noRd
.processEntry <- function(x, nestedKeys, optionalKeys) {
    x <- strSplit(x, split = "   ")
    x <- split(x[, 2L], f = x[, 1L])
    for (key in optionalKeys) {
        if (is.null(x[[key]])) {
            x[[key]] <- NA_character_
        }
    }
    for (key in nestedKeys) {
        x[[key]] <- unique(x[[key]])
    }
    ## Filter out discontinued identifiers from DR (e.g. "CVCL_0455").
    discontinued <- grep(
        pattern = "Discontinued:",
        x = x[["CC"]],
        fixed = TRUE,
        value = TRUE
    )
    if (isTRUE(length(discontinued) > 0L)) {
        discontinued <- sub(
            pattern = "^Discontinued: (.+);.+$",
            replacement = "\\1",
            x = discontinued
        )
        x[["DR"]] <- setdiff(x = x[["DR"]], y = discontinued)
    }
    x
}



#' Split a column into a character list
#'
#' @note Updated 2023-09-22.
#' @noRd
.splitCol <- function(object, colName, split = "; ") {
    assert(
        is(object, "DFrame"),
        isString(colName),
        isString(split)
    )
    spl <- strsplit(
        x = object[[colName]],
        split = split,
        fixed = TRUE
    )
    cl <- CharacterList(spl)
    object[[colName]] <- cl
    object
}



#' Split a nested column by key
#'
#' Don't format key names into camel case -- too CPU intensive.
#'
#' @note Updated 2023-09-22.
#' @noRd
.splitNestedCol <- function(object, colName, split) {
    assert(
        is(object, "DFrame"),
        is(object[[colName]], "CharacterList"),
        isString(split)
    )
    lst <- mclapply(
        X = object[[colName]],
        split = split,
        FUN = function(x, split) {
            if (identical(x, character())) {
                return(list())
            }
            x <- strSplit(x = x, split = split, n = 2L)
            ## Formatting into camel case takes too long.
            ## > x[, 1L] <- camelCase(x[, 1L])
            x <- split(x = x[, 2L], f = x[, 1L])
            x
        }
    )
    lst <- SimpleList(lst)
    object[[colName]] <- lst
    object
}



## Updated 2023-09-22.

#' @rdname Cellosaurus
#' @export
Cellosaurus <- # nolint
    function() {
        object <- .importCelloFromTxt()
        alert("Formatting annotations.")
        object <- .formatAgeAtSampling(object)
        object <- .formatComments(object)
        object <- .formatCrossRefs(object)
        object <- .formatDate(object)
        object <- .formatDiseases(object)
        object <- .formatHierarchy(object)
        object <- .formatRefIds(object)
        object <- .formatSecondaryAccession(object)
        object <- .formatStrProfileData(object)
        object <- .formatSynonyms(object)
        alert("Adding annotations.")
        object <- .addAtccId(object)
        object <- .addDepmapId(object)
        object <- .addIsCancer(object)
        object <- .addIsContaminated(object)
        object <- .addIsProblematic(object)
        object <- .addMisspellings(object)
        object <- .addMsiStatus(object)
        object <- .addNcitDisease(object)
        object <- .addOncotree(object)
        object <- .addPopulation(object)
        object <- .addSamplingSite(object)
        object <- .addSangerModelId(object)
        object <- .addTaxonomy(object)
        alert("Encoding metadata.")
        object <- encode(object)
        object <- object[, sort(colnames(object)), drop = FALSE]
        new("Cellosaurus", object)
    }
acidgenomics/Cellosaurus documentation built on March 15, 2024, 1:28 a.m.