# from the dockerfile
# docker run --user rstudio -it ricard_ingest_peaks R

library(BiocStyle)

knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Overview

Here we will get the data for Argelaguet et al.'s multiome dataset. As I haven't been involved in the work around this dataset, for information on their methods please see their paper.

Preparing the RNA data

We obtain the processed count data through the r Biocpkg("BiocFileCache") framework. This caches the data locally upon the first download, avoiding the need to repeat the download on subsequent analyses. We access the data from the FTP server provided on Github.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask=FALSE)
rna_sce <- bfcrpath(bfc,
    "ftp://ftpusr92:5FqIACU9@ftp1.babraham.ac.uk/data/processed/rna/SingleCellExperiment.rds")

atac_tss_se <- bfcrpath(bfc,
    "ftp://ftpusr92:5FqIACU9@ftp1.babraham.ac.uk/data/processed/atac/archR/Matrices/GeneScoreMatrix_TSS_summarized_experiment.rds")
atac_peaks_se <- bfcrpath(bfc,
    "ftp://ftpusr92:5FqIACU9@ftp1.babraham.ac.uk/data/processed/atac/archR/Matrices/PeakMatrix_summarized_experiment.rds")

We load in the RNA data from Ricard's SCE object and number the samples:

library(SingleCellExperiment)
rna <- readRDS(rna_sce)

sample_map = data.frame(sample_name=unique(rna$sample),
    sample = seq_along(unique(rna$sample)))
rna$sample_name = rna$sample
rna$sample = sample_map$sample[match(rna$sample_name, sample_map$sample_name)]

celltype information is only available in the peak objects, so we copy that into the RNA data here.

cd_peak = colData(readRDS(atac_peaks_se))
cd_peak$celltype.mapped = gsub("_", " ", cd_peak$celltype.mapped)
cd_peak$celltype.mapped = gsub("Forebrain Midbrain Hindbrain", "Forebrain/Midbrain/Hindbrain", cd_peak$celltype.mapped)

rna$celltype.mapped = cd_peak$celltype.mapped[match(colnames(rna), rownames(cd_peak))]

We add the rowData from Ensembl 92, to which these data were aligned.

gtf_file = bfcrpath(bfc,
    "ftp://ftp.ensembl.org:/pub/release-92/gtf/mus_musculus/Mus_musculus.GRCm38.92.gtf.gz")
gtf <- read.table(gtf_file, header=FALSE, sep="\t")
gtf <- gtf[gtf$V3 == "gene",]
get <- function(vec, id){
    sub <- vec[min(which(grepl(id, vec)))]
    gsub(paste0(id, " "), "", sub)
}
gdf <- do.call(rbind, lapply(strsplit(gtf$V9, "; "), function(x){
    data.frame(ENSEMBL=get(x, "gene_id"), SYMBOL=get(x, "gene_name"))
}))

We drop r sum(!rownames(rna) %in% gdf$SYMBOL) of the r nrow(rna) genes from Ricard's data because they lack an Ensembl gene ID.

add_rowdata <- function(sce, genes=gdf, ensemblise=FALSE){
    rd = genes[match(rownames(sce), genes$SYMBOL),]
    if(ensemblise){
        drop = is.na(rd[,1])
        sce = sce[!drop,]
        rowData(sce) = rd[!drop,]
        rownames(sce) = rowData(sce)$ENSEMBL
    } else {
        rd$SYMBOL = rownames(sce)
        rowData(sce) = rd
        rownames(sce) = rd$SYMBOL
    }
    sce
}
rna = add_rowdata(rna)

We load the dimensionality reduction data. This was provided directly from the authors.

#handle file inconsistency
process_df = function(f){
    df = read.delim(f, sep = ",", header=TRUE)
    rownames(df) = df$cell
    df = df[,c("V1", "V2")]
}
load_dimred = function(dir){
    files = dir(dir, recursive = TRUE, pattern = "txt.gz$", full.names = TRUE)
    all_umap = process_df(files[grepl("all", files)])
    stage_umap = do.call(rbind, lapply(files[!grepl("all", files)], process_df))
    list(umap = all_umap, umap.perstage = stage_umap)
}
types=c("rna", "atac", "rna_atac")
dimreds = lapply(types, function(x)
    load_dimred(file.path("dimensionality_reduction", x)))
names(dimreds) = types
dimreds = do.call(c, dimreds)

attach_dimred = function(sce, lst=dimreds){
    new = lapply(lst, function(x) x[match(colnames(sce), rownames(x)),])
    for(i in seq_along(new)) rownames(new[[i]]) = colnames(sce)
    reducedDims(sce) = new
    sce
}
rna = attach_dimred(rna)

Correct alignment of column data/row data is shown by plotting the UMAP as follows:

with(reducedDim(rna, "rna.umap"),
    plot(V1, V2, col = MouseGastrulationData::EmbryoCelltypeColours[rna$celltype.mapped]))

We restrict column data columns to a reduced set on request from the dataset author.

allowed_columns = c(
    "barcode",
    "sample",
    "sample_name",
    "stage",
    "genotype",
    "celltype.mapped",
    "nFeature_RNA",
    "nCount_RNA",
    "mitochondrial_percent_RNA",
    "ribosomal_percent_RNA",
    "nFrags_atac",
    "TSSEnrichment_atac",
    "doublet_score",
    "doublet_call",
    "genotype",
    "nFrags_atac",
    "TSSEnrichment_atac",
    "doublet_score",
    "doublet_call"
)

tidy_coldata = function(cd, cols = allowed_columns){
    cd = cd[,names(cd) %in% allowed_columns]
    cd = cd[,order(match(names(cd), allowed_columns))]
    #rename celltype
    names(cd)[names(cd) == "celltype.mapped"] = "celltype"
    cd
}

colData(rna) = tidy_coldata(colData(rna))

We save the data, split by sample

save_files = function(base, sce, assay_name = "counts-processed", assay_id = "counts"){
    dir.create(base, recursive=TRUE, showWarnings=FALSE)
    saveRDS(rowData(sce), file=paste0(base, "/rowdata.rds"))
    for(samp in unique(sce$sample)){
        sub = sce[, sce$sample == samp]
        saveRDS(assay(sub, assay_id), 
            file=paste0(base, "/", assay_name, "-sample", samp, ".rds"))
        saveRDS(colData(sub), 
            file=paste0(base, "/coldata-sample", samp, ".rds"))
        saveRDS(sizeFactors(sub), 
            file=paste0(base, "/sizefac-sample", samp, ".rds"))
        saveRDS(reducedDims(sub), 
            file=paste0(base, "/reduced-dims-sample", samp, ".rds"))
    }
    invisible(0)
}

path <- file.path("MouseGastrulationData", "RA_rna", "1.12.0")
save_files(path, rna)
#memory is precious
rm(rna); gc()

Preparing the TSS ATAC peak data

Next, we access the ATAC-seq peaks. We will upload this more or less exactly as it arrived (i.e., without doing anything fancy around rowRanges). We can rapidly apply the same processes now using the functions we defined earlier. The rowData is an exception and is processed differently below

tss_assay_name = "counts"
tss = readRDS(atac_tss_se)
tss = as(tss, "SingleCellExperiment")

tss$sample_name = tss$sample
tss$sample = sample_map$sample[match(tss$sample_name, sample_map$sample_name)]
rowData(tss)$ENSEMBL = gdf$ENSEMBL[match(rowData(tss)$name, gdf$SYMBOL)]
names(rowData(tss))[match("name", names(rowData(tss)))] = "SYMBOL"
tss = attach_dimred(tss)
names(assays(tss)) = tss_assay_name

colData(tss) = tidy_coldata(colData(tss))

path <- file.path("MouseGastrulationData", "RA_atac_tss", "1.12.0")
save_files(path, tss)
#memory is precious
rm(tss); gc()

Preparing the whole-genome ATAC peak data

Similarly we can reuse a range of functions from before.

peaks_assay_name = "counts"
peaks = readRDS(atac_peaks_se)
peaks = as(peaks, "SingleCellExperiment")

peaks$sample_name = peaks$sample
peaks$sample = sample_map$sample[match(peaks$sample_name, sample_map$sample_name)]
peaks = attach_dimred(peaks)
names(assays(peaks)) = peaks_assay_name

colData(peaks) = tidy_coldata(colData(peaks))

path <- file.path("MouseGastrulationData", "RA_atac_peaks", "1.12.0")
save_files(path, peaks)
#memory is precious
rm(peaks); gc()

Make file metadata

We now make the metadata for ExperimentHub so the files can be made properly available.

make_df = function(
    component = "RNA",
    assay_desc = "Processed counts",
    assay_name = "counts-processed",
    path_component = "RA_rna",
    source_file = "/data/processed/rna/SingleCellExperiment.rds",
    samples = unique(sample_map$sample)){
    data.frame(
    Title = sprintf("RA_multiome %s %s", component,
        c(sprintf("%s (sample %i)", tolower(assay_desc), samples),
            "rowData",
            sprintf("colData (sample %i)", samples),
            sprintf("size factors (sample %i)", samples),
            sprintf("reduced dimensions (sample %i)", samples))
    ),
    Description = sprintf("%s for the RNA component of the Argelaguet et al. mouse embryo multome dataset", 
        c(sprintf("%s for sample %i", assay_desc, samples),
            "Per-gene metadata for all samples",
            sprintf("Per-cell metadata for sample %i", samples),
            sprintf("Size factors for sample %i", samples),
            sprintf("Reduced dimensions for sample %i", samples))
    ),
    RDataPath = c(
        file.path("MouseGastrulationData", path_component, "1.12.0", 
            c(sprintf("%s-sample%i.rds", assay_name, samples),
                "rowdata.rds",
                sprintf("coldata-sample%i.rds", samples),
                sprintf("sizefac-sample%i.rds", samples),
                sprintf("reduced-dims-sample%i.rds", samples)))
    ),
    BiocVersion="3.16",
    Genome="mm10",
    SourceType="RDS",
    SourceUrl="ftp://ftpusr92:5FqIACU9@ftp1.babraham.ac.uk",
    SourceVersion=source_file,
    Species="Mus musculus",
    TaxonomyId="10090",
    Coordinate_1_based=FALSE,
    DataProvider="Jonathan Griffiths",
    Maintainer="Jonathan Griffiths <jonathan.griffiths.94@gmail.com>",
    RDataClass="character",
    DispatchClass="RDS",
    stringsAsFactors = FALSE
  )
}

d1 = make_df()
d2 = make_df(
    component = "ATAC at TSS",
    assay_desc = "ATAC gene score at TSS",
    path_component = "RA_atac_tss",
    source_file = "/data/processed/atac/archR/Matrices/GeneScoreMatrix_TSS_summarized_experiment.rds"
)
d3 = make_df(
    component = "ATAC peaks",
    assay_desc = "ATAC peaks",
    path_component = "RA_atac_peaks",
    source_file = "/data/processed/atac/archR/Matrices/PeakMatrix_summarized_experiment.rds"
)
info = rbind(d1, d2, d3)

write.csv(file="../../extdata/metadata-ra-multiome.csv", info, row.names=FALSE)

Session information

sessionInfo()


MarioniLab/MouseGastrulationData documentation built on Jan. 31, 2024, 11:01 a.m.