library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of the mouse nervous system from @zeisel2018molecular. Counts for endogenous genes are available from http://mousebrain.org/downloads.html as a loom file. We download and cache it using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
loom.path <- bfcrpath(bfc, "https://storage.googleapis.com/linnarsson-lab-loom/l5_all.loom")

We load this into our R session using the r Biocpkg("LoomExperiment") package.

library(LoomExperiment)
scle <- import(loom.path, type="SingleCellLoomExperiment")
scle 

Then it's just a matter of peeling apart the bits that we need. We do need to clean up the rowData:

rd <- rowData(scle)
colnames(rd) <- sub("^X_", "", colnames(rd))
rownames(rd) <- rd$Accession
rd

We also need to clean up the colData. This is, sadly, a gargantuan effort - so much for loom being a ready-to-use file format!

cd <- colData(scle)

# Useless pieces of information, or pieces that are obvious from the experimental design.
cd$Bucket <- NULL
cd$Species <- NULL
cd$AnalysisProject <- NULL
cd$Transcriptome <- NULL
cd$TimepointPool <- NULL
cd$NGI_PlateWell <- NULL
cd$PlugDate <- NULL
cd$Plug_Date <- NULL
cd$Project <- NULL

# Redundant fields.
cd$Cell_Conc <- NULL
cd$ngperul_cDNA <- NULL
cd$cDNA_Lib_Ok <- NULL
cd$Target_Num_Cells <- NULL
cd$Date_Captured <- NULL
cd$Num_Pooled_Animals <- NULL
cd$PCR_Cycles <- NULL
cd$Sample_Index <- NULL
cd$Seq_Comment <- NULL
cd$Seq_Lib_Date <- NULL
cd$Seq_Lib_Ok <- NULL

# Converting various character fields into numbers, where applicable.
# On occassion, we have to get rid of some nonsense fields with quotation marks;
# something probably got corrupted when the file was saved.
options(warn=2)
for (i in colnames(cd)) {
    current <- cd[[i]]

    if (is.character(current)) {
        converted <- current
        is.bad <- converted=="" | grepl('"', converted)
        converted[is.bad] <- NA
        changed <- TRUE

        if (any(has.pct <- grepl("%$", current))) {
            converted[!has.pct] <- NA
            converted <- sub("%$", "", converted)
            converted <- as.numeric(converted)/100
            cd[[i]] <- converted
        } else if (any(has.comma <- grepl(",[0-9]{3}", current))) {
            converted[!has.comma] <- NA # these are probably corruptions of some sort.
            converted <- gsub(",([0-9]{3})", "\\1", converted)
            converted <- as.numeric(converted)
            cd[[i]] <- converted
        } else if (any(grepl("^[0-9]+,[0-9]+$", current))){
            converted <- sub(",", ".", converted)
            converted <- as.numeric(converted)
            cd[[i]] <- converted
        } else if (any(grepl("^[0-9]+(\\.[0-9]+)?$", current))) {
            cd[[i]] <- as.numeric(converted)
        } else {
            changed <- FALSE
        }

#        # For debugging purposes, to check that the corruptions are purged correctly.
#        if (changed) {
#            print(i)
#            print(names(table(current)))
#            print(names(table(converted)))
#        }
    }
}
options(warn=1)

# Replacing the 'nan's with NA's.
for (i in colnames(cd)) {
    current <- cd[[i]]
    if (is.character(current) && any(lost <- current=="nan")) {
        current[lost] <- NA_character_
        cd[[i]] <- current
    }    
}

# Stripping out the class probabilities into a nested matrix.
clcols <- grep("ClassProbability", colnames(cd))
clprobs <- cd[,clcols]
clprobs <- as.matrix(clprobs)
colnames(clprobs) <- sub("ClassProbability_", "", colnames(clprobs))
cd <- cd[,-clcols]
cd$ClassProbability <- clprobs

# Hacking out the reduced dimensions.
pca.cols <- c("X_PC1", "X_PC2")
tsne.cols <- c("X_tSNE1", "X_tSNE2")
other.cols <- c("X_X", "X_Y")
reddim <- list(
    PCA=as.matrix(cd[,pca.cols]),
    tSNE=as.matrix(cd[,tsne.cols]),
    unnamed=as.matrix(cd[,other.cols])
)
cd <- cd[,!colnames(cd) %in% c(pca.cols, tsne.cols, other.cols)]

# Why the prefixing with X_? Well, whatever.
colnames(cd) <- sub("X_", "", colnames(cd))

cd

Saving to file

We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "zeisel-nervous", "2.6.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(rd, file=file.path(path, "rowdata.rds"))
saveRDS(cd, file=file.path(path, "coldata.rds"))
saveRDS(reddim, file=file.path(path, "reddims.rds"))

The column pairs are a bit tricky, but we just coerce them into SelfHits objects and save that.

colpairs <- lapply(colGraphs(scle), function(x) as(x, "SelfHits"))
saveRDS(colpairs, file=file.path(path, "colpairs.rds"))

The trickiest part is resaving the HDF5 file as a sparse matrix.

saveRDS(as(assay(scle), 'dgCMatrix'), file=file.path(path, "counts.rds"))

Session information

sessionInfo()

References



drisso/scRNAseq documentation built on Feb. 16, 2021, 1:18 a.m.