knitr::opts_chunk$set(echo = TRUE) options(width = 100)
suppressPackageStartupMessages({ library(readr) library(SummarizedExperiment) library(SingleCellExperiment) library(rtracklayer) library(BiocFileCache) })
The csv files with counts and metadata were downloaded from the Gene Expression Omnibus (accession number GSE132040) on September 29, 2020.
bfc <- BiocFileCache("bulk-raw-data", ask = FALSE) count_data <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132040/suppl/GSE132040_190214_A00111_0269_AHH3J3DSXX_190214_A00111_0270_BHHMFWDSXX.csv.gz") meta_data <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE132nnn/GSE132040/suppl/GSE132040_MACA_Bulk_metadata.csv.gz") gencode_gtf <- bfcrpath(bfc, "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M19/gencode.vM19.annotation.gtf.gz")
Next, we load the count matrix and the metadata. The count matrix contains also some summaries from HTSeq; we'll move those to the metadata.
## Read count matrix counts <- as.data.frame(readr::read_csv( count_data, col_names = TRUE, col_types = cols(.default = col_double(), gene = col_character()) )) dim(counts) head(counts[, 1:5]) tail(counts[, 1:5]) ## need to remove the last few rows; not genes rownames(counts) <- counts$gene counts$gene <- NULL colnames(counts) <- gsub("\\.gencode\\.vM19$", "", colnames(counts)) htseq_stats <- counts[grep("^__", rownames(counts)), ] dim(htseq_stats) counts <- counts[grep("^__", rownames(counts), invert = TRUE), ] dim(counts) counts <- as.matrix(counts) ## Read metadata meta <- readr::read_csv( meta_data, col_names = TRUE ) dim(meta) head(as.data.frame(meta)) ## Add HTSeq stats to metadata stopifnot(all(meta$`Sample name` %in% colnames(htseq_stats))) htseq_stats <- t(htseq_stats[, match(meta$`Sample name`, colnames(htseq_stats))]) stopifnot(all(meta$`Sample name` == rownames(htseq_stats))) meta <- cbind(meta, htseq_stats) ## Extract the organ information meta$organ <- gsub("_*[0-9]+$", "", meta$`source name`)
SingleCellExperiment
objectstopifnot(all(meta$`Sample name` %in% colnames(counts))) counts <- counts[, match(meta$`Sample name`, colnames(counts))] stopifnot(all(meta$`Sample name` == colnames(counts))) se <- SingleCellExperiment( assays = list(counts = counts), colData = meta )
rowData
gtf <- rtracklayer::import(gencode_gtf) gtf <- subset(gtf, type == "gene") stopifnot(all(rownames(se) %in% gtf$gene_name)) gtf <- gtf[match(rownames(se), gtf$gene_name)] mcols(gtf) <- mcols(gtf)[, c("source", "type", "gene_id", "gene_type", "gene_name", "level", "havana_gene", "tag")] names(gtf) <- gtf$gene_name rowRanges(se) <- gtf
SingleCellExperiment
objectoutpath <- file.path("TabulaMurisSenisData", "tabula-muris-senis-bulk") dir.create(outpath, showWarnings = FALSE, recursive = TRUE) saveRDS(rowRanges(se), file = file.path(outpath, "rowranges.rds")) saveRDS(colData(se), file = file.path(outpath, "coldata.rds")) saveRDS(assay(se, "counts"), file = file.path(outpath, "counts.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.