knitr::opts_chunk$set(echo = TRUE)
options(width = 100)

Loading required packages

suppressPackageStartupMessages({
    library(readr)
    library(SummarizedExperiment)
    library(SingleCellExperiment)
    library(rtracklayer)
    library(BiocFileCache)
})

Downloading the csv files

The csv files with counts and metadata were downloaded from the Gene Expression Omnibus (accession number GSE132040) on September 29, 2020.

bfc <- BiocFileCache("bulk-raw-data", ask = FALSE)
count_data <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132040/suppl/GSE132040_190214_A00111_0269_AHH3J3DSXX_190214_A00111_0270_BHHMFWDSXX.csv.gz")
meta_data <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132040/suppl/GSE132040_MACA_Bulk_metadata.csv.gz")
gencode_gtf <- bfcrpath(bfc, "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M19/gencode.vM19.annotation.gtf.gz")

Loading the data and preprocessing

Next, we load the count matrix and the metadata. The count matrix contains also some summaries from HTSeq; we'll move those to the metadata.

## Read count matrix
counts <- as.data.frame(readr::read_csv(
    count_data, 
    col_names = TRUE,
    col_types = cols(.default = col_double(), gene = col_character())
))
dim(counts)
head(counts[, 1:5])
tail(counts[, 1:5])  ## need to remove the last few rows; not genes

rownames(counts) <- counts$gene
counts$gene <- NULL
colnames(counts) <- gsub("\\.gencode\\.vM19$", "", colnames(counts))

htseq_stats <- counts[grep("^__", rownames(counts)), ]
dim(htseq_stats)
counts <- counts[grep("^__", rownames(counts), invert = TRUE), ]
dim(counts)

counts <- as.matrix(counts)

## Read metadata
meta <- readr::read_csv(
    meta_data, 
    col_names = TRUE
)
dim(meta)
head(as.data.frame(meta))

## Add HTSeq stats to metadata
stopifnot(all(meta$`Sample name` %in% colnames(htseq_stats)))
htseq_stats <- t(htseq_stats[, match(meta$`Sample name`, colnames(htseq_stats))])
stopifnot(all(meta$`Sample name` == rownames(htseq_stats)))
meta <- cbind(meta, htseq_stats)

## Extract the organ information
meta$organ <- gsub("_*[0-9]+$", "", meta$`source name`)

Generating the SingleCellExperiment object

stopifnot(all(meta$`Sample name` %in% colnames(counts)))
counts <- counts[, match(meta$`Sample name`, colnames(counts))]
stopifnot(all(meta$`Sample name` == colnames(counts)))

se <- SingleCellExperiment(
    assays = list(counts = counts),
    colData = meta
)

Adding information from Gencode vM19 to rowData

gtf <- rtracklayer::import(gencode_gtf)
gtf <- subset(gtf, type == "gene")
stopifnot(all(rownames(se) %in% gtf$gene_name))
gtf <- gtf[match(rownames(se), gtf$gene_name)]
mcols(gtf) <- mcols(gtf)[, c("source", "type", "gene_id", 
                             "gene_type", "gene_name", 
                             "level", "havana_gene", "tag")]
names(gtf) <- gtf$gene_name
rowRanges(se) <- gtf

Saving the components of the SingleCellExperiment object

outpath <- file.path("TabulaMurisSenisData", "tabula-muris-senis-bulk")
dir.create(outpath, showWarnings = FALSE, recursive = TRUE)
saveRDS(rowRanges(se), file = file.path(outpath, "rowranges.rds"))
saveRDS(colData(se), file = file.path(outpath, "coldata.rds"))
saveRDS(assay(se, "counts"), file = file.path(outpath, "counts.rds"))

Session info {-}

sessionInfo()


fmicompbio/TabulaMurisSenisData documentation built on Nov. 5, 2024, 8:45 a.m.