## R/Hgnc.R
##
## Defines functions: .splitToCharacterList

#' Import Human Genome Organization (HUGO) Gene Nomenclature Committee (HGNC)
#' metadata
#'
#' @export
#' @note Updated 2023-11-21.
#'
#' @details
#' Retrieves the HGNC complete set (`hgnc_complete_set.txt`) from
#' `ftp.ebi.ac.uk` using the internal `.cacheIt()` helper, and then processes
#' the table as follows:
#'
#' - Column names are converted to strict camel case.
#' - The `agr` and `gencc` columns are dropped.
#' - The `"HGNC:"` prefix is removed from `hgncId`, which is then coerced to
#'   integer. Rows are ordered by this identifier, and it is also assigned as
#'   the row names.
#' - Any column containing the `"|"` delimiter is split per element into a
#'   `CharacterList`.
#' - `entrezId`, `name`, and `symbol` are renamed to `ncbiGeneId`,
#'   `description`, and `geneName`, respectively.
#' - The `date*` columns are coerced to `Date`, and `ncbiGeneId` to integer.
#'
#' The import date, organism, package version, and source URL are stored in
#' the object metadata.
#'
#' @return `Hgnc`.
#'
#' @seealso
#' - https://www.genenames.org/
#' - https://www.genenames.org/download/statistics-and-files/
#'
#' @examples
#' object <- Hgnc()
#' print(object)
Hgnc <- # nolint
    function() {
        alert("Importing HGNC complete set.")
        url <- pasteUrl(
            "ftp.ebi.ac.uk",
            "pub",
            "databases",
            "genenames",
            "new",
            "tsv",
            "hgnc_complete_set.txt",
            protocol = "https"
        )
        file <- .cacheIt(url)
        ## Read the file as raw lines first, so they can be passed through
        ## `fillLines()` before being parsed as TSV.
        lines <- import(con = file, format = "lines")
        lines <- fillLines(lines, format = "tsv")
        con <- textConnection(lines)
        ## Register the cleanup immediately, so the connection is released
        ## even when the TSV import below throws an error.
        on.exit(close(con), add = TRUE)
        df <- import(con = con, format = "tsv")
        df <- as(df, "DFrame")
        colnames(df) <- camelCase(colnames(df), strict = TRUE)
        df[["agr"]] <- NULL
        df[["gencc"]] <- NULL
        assert(
            isSubset(
                x = c(
                    "dateApprovedReserved",
                    "dateModified",
                    "dateNameChanged",
                    "dateSymbolChanged",
                    "entrezId",
                    "hgncId",
                    "name",
                    "symbol"
                ),
                y = colnames(df)
            ),
            hasNoDuplicates(df[["hgncId"]])
        )
        ## Strip the "HGNC:" prefix, so identifiers sort numerically.
        df[["hgncId"]] <- as.integer(sub(
            pattern = "^HGNC\\:",
            replacement = "",
            x = df[["hgncId"]]
        ))
        df <- df[order(df[["hgncId"]]), , drop = FALSE]
        rownames(df) <- df[["hgncId"]]
        ## Detect columns holding pipe-delimited values. Fixed matching is
        ## required here because "|" is a regular expression metacharacter.
        isNested <- bapply(
            X = df,
            FUN = function(x) {
                any(grepl(pattern = "|", x = x, fixed = TRUE))
            }
        )
        if (any(isNested)) {
            vars <- names(isNested)[isNested]
            df <- mutateAt(
                object = df,
                vars = vars,
                fun = .splitToCharacterList,
                split = "|"
            )
        }
        colnames(df)[colnames(df) == "entrezId"] <- "ncbiGeneId"
        colnames(df)[colnames(df) == "name"] <- "description"
        colnames(df)[colnames(df) == "symbol"] <- "geneName"
        dateCols <- c(
            "dateApprovedReserved",
            "dateModified",
            "dateNameChanged",
            "dateSymbolChanged"
        )
        for (dateCol in dateCols) {
            df[[dateCol]] <- as.Date(df[[dateCol]])
        }
        df[["ncbiGeneId"]] <- as.integer(df[["ncbiGeneId"]])
        metadata(df) <- list(
            "date" = Sys.Date(),
            "organism" = "Homo sapiens",
            "packageVersion" = .pkgVersion,
            "url" = url
        )
        new("Hgnc", df)
    }



## Updated 2021-03-19.
##
## Split each element of a delimited vector into a `CharacterList`.
##
## Args:
## - x: Vector to split.
## - split: Delimiter, matched literally (not as a regular expression).
##
## Returns `x` unmodified when every element is `NA` (this includes
## zero-length input); otherwise a `CharacterList` with one element per input
## value. Per-element order and duplicates are kept as-is; `sort(unique(x))`
## is not applied.
.splitToCharacterList <- function(x, split = "|") {
    if (!all(is.na(x))) {
        pieces <- strsplit(x, split = split, fixed = TRUE)
        x <- CharacterList(pieces)
    }
    x
}
## acidgenomics/AcidGenomes documentation built on Dec. 10, 2023, 10:35 p.m.