From ExperimentHub to gypsum

# Set document-wide knitr chunk defaults: error=FALSE and message=FALSE.
# NOTE(review): per knitr's chunk-option docs, error=FALSE should halt rendering
# on the first error and message=FALSE should hide messages -- confirm.
knitr::opts_chunk$set(error=FALSE, message=FALSE)

Pulling assets from ExperimentHub.

library(ExperimentHub)
hub <- ExperimentHub()
prefix <- "celldex/blueprint_encode/"

# Look up the expression matrix record by its exact resource path.
norm.hub <- hub[hub$rdatapath==paste0(prefix, "1.0.0/logcounts.rds")]
# Count matching records with length(): nrow() is only meaningful for objects
# with dimensions, and if it returns NULL the comparison below would be
# zero-length, which stopifnot() accepts without checking anything.
stopifnot(length(norm.hub) == 1L)
norm <- norm.hub[[1]]
str(norm)

# Look up the sample annotation record; note the different version in the path.
coldata.hub <- hub[hub$rdatapath==paste0(prefix, "1.2.0/coldata.rds")] # fixed astrocyte misassignment
stopifnot(length(coldata.hub) == 1L)
coldata <- coldata.hub[[1]]
coldata

Removing rows that contain any missing (NA) values.

library(matrixStats)
# Flag the rows that are free of NAs, then subset; drop=FALSE keeps the result
# two-dimensional even if only one row survives.
keep <- !rowAnyNAs(norm)
norm <- norm[keep, , drop=FALSE]
dim(norm)

Attaching ontology mappings.

# Read the headerless mapping table shipped inside the celldex package.
# mustWork=TRUE turns a missing file into an immediate error.
path <- system.file("mapping", "blueprint_encode.tsv", package="celldex", mustWork=TRUE)
src <- read.delim(path, header=FALSE, stringsAsFactors=FALSE)

# Find each sample's fine label in the first column of the table.
m <- match(coldata$label.fine, src[[1]])
stopifnot(!anyNA(m)) # sanity check

# Take the corresponding second-column entries; empty strings are recorded as
# missing rather than as blank terms.
matched <- src[[2]][m]
matched[matched==""] <- NA_character_
coldata$label.ont <- matched
coldata

Assembling some metadata.

# Free-text description, one sentence per element; joined with newlines below.
desc.lines <- c(
    "Normalized expression values for 259 bulk RNA-seq samples generated by Blueprint and ENCODE from pure populations of stroma and immune cells (Martens and Stunnenberg, 2013; The ENCODE Consortium, 2012).",
    "The samples were processed and normalized as described in Aran, Looney and Liu et al. (2019), i.e., the raw RNA-seq counts were downloaded from Blueprint and ENCODE in 2016 and normalized via edgeR to log~2~-TPMs.",
    "Blueprint Epigenomics contains 144 RNA-seq pure immune samples annotated to 28 cell types.",
    "ENCODE contains 115 RNA-seq pure stroma and immune samples annotated to 17 cell types.",
    "All together, this reference contains 259 samples with 43 cell types (`label.fine`), manually aggregated into 24 broad classes (`label.main`).",
    "The fine labels have also been mapped to the Cell Ontology (`label.ont`)."
)

# Provenance records: the PubMed entries first, then the GitHub commit and the
# two ExperimentHub resources retrieved earlier in this document.
pubmed.ids <- c("22955616", "24091925", "30643263")
sources <- lapply(pubmed.ids, function(id) list(provider="PubMed", id=id))
sources <- c(sources, list(
    list(provider="GitHub", id="dviraran/SingleR", version="adc4a0e4d5cfa79db18f3821f51a02cbd6484710"),
    list(provider="ExperimentHub", id=norm.hub$ah_id),
    list(provider="ExperimentHub", id=coldata.hub$ah_id)
))

# Assemble the metadata list passed along when the reference is saved.
meta <- list(
    title="Human bulk RNA-seq data from Blueprint and ENCODE",
    description=paste(desc.lines, collapse="\n"),
    taxonomy_id="9606",
    genome=character(0), # dunno, it doesn't say.
    sources=sources,
    maintainer_name="Friederike Dündar",
    maintainer_email="frd2007@med.cornell.edu"
)

Saving it to disk.

# Write the expression matrix, sample annotations and metadata to a dated
# output directory.
# NOTE(review): saveReference() is presumably provided by celldex, loaded here;
# its exact on-disk layout is not visible from this script -- confirm.
library(celldex)
path <- "2024-02-26_output" # reuses (overwrites) the `path` variable set earlier
saveReference(norm, coldata, path, meta)


LTLA/CellTypeReferences documentation built on June 1, 2024, 12:12 p.m.