In LTLA/celldex: Index of Reference Cell Type Datasets

# Vignette setup: BiocStyle supplies the Biocpkg() helper used in the text.
library(BiocStyle)

# Global chunk options: hide messages and warnings in the rendered output,
# and (error = FALSE) halt rendering on a chunk error rather than embedding it.
knitr::opts_chunk$set(
    error = FALSE,
    message = FALSE,
    warning = FALSE
)

Overview

This RNA microarray dataset was downloaded from GEO (accession GSE24759). Values were already RMA normalized. The only additional processing was to collapse probes to gene-level values by keeping, for each gene, the probe with the highest median signal across all samples. 'Main' and 'fine' labels were manually assigned to each sample based on the cell type specified in the GEO repository.

Data retrieval and processing

First, we'll retrieve the normalized data from the GEO repository.

library(GEOquery)

# Download the series from GEO; the first element of the returned object
# holds the expression set used throughout the rest of this script.
gse24759 <- getGEO(GEO = "GSE24759")
eset <- gse24759[[1]]

# Pull out the expression matrix and report its dimensions.
logcounts <- exprs(eset)
dim(logcounts)

The rows of the expression matrix are named by probe identifier rather than gene symbol, so we need to look up the gene symbol for each probe.

# Look up the gene symbol for every row of the expression matrix.
# The probe identifiers are taken from the feature annotation table (not from
# the matrix itself), so that match() really aligns the annotation to the
# matrix rows instead of matching the matrix row names against themselves.
features <- fData(gse24759[[1]])
probes <- rownames(features)
# Probes absent from the annotation come back as NA.
genes <- features$`Gene Symbol`[match(rownames(logcounts), probes)]

Some probes don't have gene symbols, so we'll remove those rows. We then set the gene symbols as the row names.

# Flag rows whose probe has no usable gene symbol (missing or empty string).
discard <- is.na(genes) | genes == ""
summary(discard)

# Keep only the annotated rows and rename them by gene symbol.
keep <- !discard
logcounts <- logcounts[keep, ]
rownames(logcounts) <- genes[keep]

Some genes have multiple probes, so we will collapse them to use the probe with the highest median signal across samples.

library(matrixStats)

# Per-row median signal across all samples.
meds <- rowMedians(logcounts)

# Sort rows by gene symbol and, within a gene, by decreasing median, so that
# the strongest probe of each gene comes first.
row_order <- order(rownames(logcounts), -meds)
logcounts <- logcounts[row_order, ]

# Keep only the first (highest-median) row for each gene symbol.
is_first <- !duplicated(rownames(logcounts))
logcounts <- logcounts[is_first, ]
dim(logcounts)

Sample labelling

We can now apply human-readable labels to each sample. This requires some translation to make the main labels.

# Start from the cell type annotation recorded for each sample in GEO.
fine <- pData(gse24759[[1]])[["cell type:ch1"]]

# Replace the hyphenated forms with spaces.
fine <- sub("B-cell", "B cell", fine)
fine <- sub("T-cell", "T cell", fine)

# Pluralize: "cell" at the end of the label or before a space/underscore,
# then the remaining singular endings.
fine <- sub("cell$", "cells", fine)
fine <- sub("cell([ _])", "cells\\1", fine)
fine <- sub("Cell$", "Cells", fine)
fine <- sub("progenitor$", "progenitors", fine)
fine <- sub("cyte$", "cytes", fine)

# Pluralize a leading "Granulocyte " (only when followed by a space, so
# "Granulocyte/monocyte ..." is left alone). This substitution was previously
# applied twice; the second application could never match and was removed.
fine <- sub("^Granulocyte ", "Granulocytes ", fine)

# One-off corrections that the patterns above do not cover.
fine[fine=="Megakaryocyte/ erythroid progenitors"] <- "Megakaryocyte/erythroid progenitors"
fine[fine=="Granulocytes (Neutrophilic Metamyelocyte)"] <- "Granulocytes (Neutrophilic Metamyelocytes)"
fine[fine=="Granulocytes (Neutrophil)"] <- "Granulocytes (Neutrophils)"
fine[fine=="Eosinophill"] <- "Eosinophils"
fine[fine=="NKT"] <- "NK T cells"

# Lookup table from each cleaned-up fine label (name) to its main label (value).
dictionary <- c(
    "Basophils" = "Basophils",
    "CD4+ Central Memory" = "CD4+ T cells",
    "CD4+ Effector Memory" = "CD4+ T cells",
    "CD8+ Central Memory" = "CD8+ T cells",
    "CD8+ Effector Memory" = "CD8+ T cells",
    "CD8+ Effector Memory RA" = "CD8+ T cells",
    "Colony Forming Unit-Granulocytes" = "Granulocytes",
    "Colony Forming Unit-Megakaryocytic" = "Megakaryocytes",
    "Colony Forming Unit-Monocytes" = "Monocytes",
    "Common myeloid progenitors" = "CMPs",
    "Early B cells" = "B cells",
    "Eosinophils" = "Eosinophils",
    "Erythroid_CD34- CD71- GlyA+" = "Erythroid cells",
    "Erythroid_CD34- CD71+ GlyA-" = "Erythroid cells",
    "Erythroid_CD34- CD71+ GlyA+" = "Erythroid cells",
    "Erythroid_CD34- CD71lo GlyA+" = "Erythroid cells",
    "Erythroid_CD34+ CD71+ GlyA-" = "Erythroid cells",
    "Granulocytes (Neutrophilic Metamyelocytes)" = "Granulocytes",
    "Granulocytes (Neutrophils)" = "Granulocytes",
    "Granulocyte/monocyte progenitors" = "GMPs",
    "Hematopoietic stem cells_CD38- CD34+" = "HSCs",
    "Hematopoietic stem cells_CD133+ CD34dim" = "HSCs",
    "Mature B cells" = "B cells",
    "Mature B cells class able to switch" = "B cells",
    "Mature B cells class switched" = "B cells",
    "Mature NK cells_CD56- CD16- CD3-" = "NK cells",
    "Mature NK cells_CD56- CD16+ CD3-" = "NK cells",
    "Mature NK cells_CD56+ CD16+ CD3-" = "NK cells",
    "Megakaryocyte/erythroid progenitors" = "MEPs",
    "Megakaryocytes" = "Megakaryocytes",
    "Monocytes" = "Monocytes",
    "Myeloid Dendritic Cells" = "Dendritic cells",
    "Naïve B cells" = "B cells",
    "Naive CD4+ T cells" = "CD4+ T cells",
    "Naive CD8+ T cells" = "CD8+ T cells",
    "NK T cells" = "NK T cells",
    "Plasmacytoid Dendritic Cells" = "Dendritic cells",
    "Pro B cells" = "B cells"
)

# Translate every fine label into its main label via the lookup table.
main <- dictionary[fine]

# Every fine label must have a main label. Fail loudly and name the labels
# that are missing from the lookup table, so a change in the upstream
# annotation can be diagnosed directly from the error message.
unmapped <- unique(fine[is.na(main)])
if (length(unmapped) > 0L) {
    stop("no main label defined for fine label(s): ",
        paste(unmapped, collapse = "; "), call. = FALSE)
}

# Per-sample metadata, with one row per column of the expression matrix.
library(S4Vectors)
coldata <- DataFrame(row.names = colnames(logcounts),
    label.main = main, label.fine = fine)

We also replace the "ï" in the fine labels with a plain "i", as the non-ASCII character causes encoding problems on Windows.

# Replace the first "ï" in each fine label with a plain ASCII "i".
fine_labels <- coldata[["label.fine"]]
coldata[["label.fine"]] <- sub(pattern = "ï", replacement = "i", x = fine_labels)

Saving to file

Now the expression matrix and the sample metadata can be saved for upload to `r Biocpkg("ExperimentHub")`.

# Output directory for this dataset and version; created if it does not exist.
path <- file.path("celldex", "dmap", "1.2.0")
dir.create(path, recursive = TRUE, showWarnings = FALSE)

# Expression matrix (one row per gene, one column per sample).
logcounts_file <- file.path(path, "logcounts.rds")
saveRDS(logcounts, file = logcounts_file)

# Sample metadata holding the main and fine labels.
coldata_file <- file.path(path, "coldata.rds")
saveRDS(coldata, file = coldata_file)