knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE )
MOCHA is intended for use after clustering and cell labeling, since all functions of MOCHA operate on per-sample basis within each cell population. MOCHA can be used with an ArchR Project as input, as well as with a generic input. This tutorial has two sections: demonstrating MOCHA with generic inputs - Importing from Signac and Importing from SnapATAC.
This vignette will follow the Signac tutorial to generate an Signac object and label cell types within it. If you already have a Signac object with cell types labelled, skip to section Extract Fragments from Signac
library(Signac) library(Seurat) library(SeuratDisk) library(EnsDb.Hsapiens.v86) library(BSgenome.Hsapiens.UCSC.hg38) set.seed(1234)
# Download files system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5') system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz') system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi') # Load in ATAC and RNA data counts <- Read10X_h5("pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5") fragpath <- "pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz"
# create a Seurat object containing the RNA adata pbmc <- CreateSeuratObject( counts = counts$`Gene Expression`, assay = "RNA" ) # create ATAC assay and add it to the object pbmc[["ATAC"]] <- CreateChromatinAssay( counts = counts$Peaks, sep = c(":", "-"), fragments = fragpath, annotation = annotation ) DefaultAssay(pbmc) <- "ATAC" pbmc <- NucleosomeSignal(pbmc) pbmc <- TSSEnrichment(pbmc) pbmc <- subset( x = pbmc, subset = nCount_ATAC < 100000 & nCount_RNA < 25000 & nCount_ATAC > 1000 & nCount_RNA > 1000 & nucleosome_signal < 2 & TSS.enrichment > 1 ) # Transform RNA DefaultAssay(pbmc) <- "RNA" pbmc <- SCTransform(pbmc) pbmc <- RunPCA(pbmc) pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
Label cell types:
# load PBMC reference system('wget https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat') reference <- LoadH5Seurat("pbmc_multimodal.h5seurat")
DefaultAssay(pbmc) <- "SCT" # transfer cell type labels from reference to query transfer_anchors <- FindTransferAnchors( reference = reference, query = pbmc, normalization.method = "SCT", reference.reduction = "spca", recompute.residuals = FALSE, dims = 1:50 ) predictions <- TransferData( anchorset = transfer_anchors, refdata = reference$celltype.l2, weight.reduction = pbmc[['pca']], dims = 1:50 ) pbmc <- AddMetaData( object = pbmc, metadata = predictions )
# set the cell identities to the cell type predictions Idents(pbmc) <- "predicted.id" # set a reasonable order for cell types to be displayed when plotting levels(pbmc) <- c("CD4 Naive", "CD4 TCM", "CD4 CTL", "CD4 TEM", "CD4 Proliferating", "CD8 Naive", "dnT", "CD8 TEM", "CD8 TCM", "CD8 Proliferating", "MAIT", "NK", "NK_CD56bright", "NK Proliferating", "gdT", "Treg", "B naive", "B intermediate", "B memory", "Plasmablast", "CD14 Mono", "CD16 Mono", "cDC1", "cDC2", "pDC", "HSPC", "Eryth", "ASDC", "ILC", "Platelet") saveRDS(pbmc, 'pbmc_Signac_tutorial.rds')
pbmc <- readRDS('pbmc_Signac_tutorial.rds') DefaultAssay(pbmc) <- 'ATAC' fragObj <- Fragments(pbmc)
Signac's training dataset only has one sample, so to simulate multiple samples, we will duplicate the data here. This should not be necessary with a large dataset.
full_pbmc <- merge( pbmc, y = c(pbmc, pbmc), add.cell.ids = c('Sample1', 'Sample2', 'Sample3'), project = 'DuplicateData' )
For a larger dataset, you would interact over the fragObj list and extract each fragments.tsv.gz file.
Instead, we're just duplicating data for the sake of this tutorial.
More importantly, you do need to modify the barcode in the fragment file so it matches the barcodes in Seurat's metadata.
fragList <- parallel::mclapply(1:3, function(x){ frags <- read.table(GetFragmentData(fragObj[[1]])) names(frags) <- c('chr', 'start', 'end', 'barcode', 'val') frags$barcode <- paste("Sample",x,"_", frags$barcode, sep ='') frags }, mc.cores = 3) names(fragList) <- c('Sample1', 'Sample2', 'Sample3')
Generate your sample/cell type list by finding all combinations of Samples and Cell Populations
Here we must also rename the GRangesList to have names in the format CellPopulation#Sample
.
celltype_sample_list <- apply(expand.grid(unique(pbmc@meta.data$predicted.id), names(fragList)), 1, paste, collapse = "#") # Extract metadata, add the Sample column, as well as CellBarcode. fullMeta <- full_pbmc@meta.data fullMeta$Sample = gsub("_.*","", rownames(full_pbmc@meta.data)) fullMeta$CellBarcode = gsub(".*_","", rownames(full_pbmc@meta.data)) # Change the CellPopulation_Sample format to CellPopulation#Sample for MOCHA rownames(fullMeta) <- gsub("_","#", rownames(fullMeta)) CellType_GRanges <- pbapply::pblapply(cl = 30, celltype_sample_list, function(x){ celltype <- gsub("#.*","", x) sample <- gsub(".*#","", x) sortedMeta <- dplyr::filter(fullMeta, predicted.id == celltype, Sample == sample) sortedFrags <- dplyr::filter(fragList[[sample]], barcode %in% rownames(sortedMeta)) makeGRangesFromDataFrame(sortedFrags, keep.extra.columns = TRUE) }) names(CellType_GRanges) <- celltype_sample_list
Calculate the study signal (the median number of fragments per cell)
avg_reads <- lapply(fragList, function(x){ filtFrag <- dplyr::filter(as.data.frame(x), barcode %in% rownames(fullMeta)) as.vector(table(filtFrag$barcode)) }) studySignal <- median(unlist(avg_reads))
# Our blacklist comes included with Signac blackList <- blacklist_hg38_unified # Call Open Tiles tileResults <- MOCHA::callOpenTiles(ATACFragments = CellType_GRanges, cellColData = fullMeta, blackList = blackList, genome = 'BSgenome.Hsapiens.UCSC.hg38', cellPopLabel = 'predicted.id', cellPopulations = fullMeta$predicted.id, studySignal = studySignal, cellCol = 'barcode', TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene", Org = "org.Hs.eg.db", outDir = paste(getwd(),'/MOCHA_Out', sep = ''), numCores= 35) TSAM <- MOCHA::getSampleTileMatrix(tileResults, threshold = 0.2, numCores = 3, verbose = TRUE)
In this example, we follow the SnapATAC 10X PBMC tutorial through the clustering step before extracting the fragments and cell metadata necessary for MOCHA.
If you have a fully-formed Snap file for your analysis with clustering results and sample information added to the metadata, skip to section Formatting Snap File Metadata. If following the vignette, we STRONGLY recommending installing this patched version of SnapATAC from this repository with devtools::install_github("imran-aifi/SnapATAC")
.
Download the all the SnapATAC Tutorial Data (linked above) to your working directory, and load it:
# CLI tool for installing from Google Drive share links pip install gdown # https://drive.google.com/file/d/1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE/view?usp=share_link gdown https://drive.google.com/uc?id=1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE # https://drive.google.com/file/d/1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5/view?usp=share_link gdown https://drive.google.com/uc?id=1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5 # https://drive.google.com/file/d/1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd/view?usp=share_link gdown https://drive.google.com/uc?id=1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd # https://drive.google.com/file/d/1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6/view?usp=share_link gdown https://drive.google.com/uc?id=1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6 # https://drive.google.com/file/d/1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379/view?usp=share_link gdown https://drive.google.com/uc?id=1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379 # https://drive.google.com/file/d/1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp/view?usp=share_link gdown https://drive.google.com/uc?id=1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp
library(SnapATAC); snap.files = c( "atac_pbmc_5k_nextgem.snap", "atac_pbmc_10k_nextgem.snap" ); sample.names = c( "PBMC 5K", "PBMC 10K" ); barcode.files = c( "atac_pbmc_5k_nextgem_singlecell.csv", "atac_pbmc_10k_nextgem_singlecell.csv" ); x.sp.ls = lapply(seq(snap.files), function(i){ createSnap( file=snap.files[i], sample=sample.names[i] ); }) names(x.sp.ls) = sample.names; barcode.ls = lapply(seq(snap.files), function(i){ barcodes = read.csv( barcode.files[i], head=TRUE ); barcodes = barcodes[2:nrow(barcodes),]; barcodes$logUMI = log10(barcodes$passed_filters + 1); barcodes$promoter_ratio = (barcodes$promoter_region_fragments+1) / (barcodes$passed_filters + 1); barcodes }) x.sp.ls
# for both datasets, we identify usable barcodes using [3.5-5] for log10(UMI) and [0.4-0.8] for promoter ratio as cutoff. cutoff.logUMI.low = c(3.5, 3.5); cutoff.logUMI.high = c(5, 5); cutoff.FRIP.low = c(0.4, 0.4); cutoff.FRIP.high = c(0.8, 0.8); barcode.ls = lapply(seq(snap.files), function(i){ barcodes = barcode.ls[[i]]; idx = which( barcodes$logUMI >= cutoff.logUMI.low[i] & barcodes$logUMI <= cutoff.logUMI.high[i] & barcodes$promoter_ratio >= cutoff.FRIP.low[i] & barcodes$promoter_ratio <= cutoff.FRIP.high[i] ); barcodes[idx,] }); x.sp.ls = lapply(seq(snap.files), function(i){ barcodes = barcode.ls[[i]]; x.sp = x.sp.ls[[i]]; barcode.shared = intersect(x.sp@barcode, barcodes$barcode); x.sp = x.sp[match(barcode.shared, x.sp@barcode),]; barcodes = barcodes[match(barcode.shared, barcodes$barcode),]; x.sp@metaData = barcodes; x.sp }) names(x.sp.ls) = sample.names; x.sp.ls
# combine two snap object x.sp = Reduce(snapRbind, x.sp.ls); x.sp@metaData["Sample"] = x.sp@sample; print(table(x.sp@sample)) x.sp
# Step 2. Add cell-by-bin matrix x.sp = addBmatToSnap(x.sp, bin.size=5000); # Step 3. Matrix binarization x.sp = makeBinary(x.sp, mat="bmat"); # Step 4. Bin filtering library(GenomicRanges); black_list = read.table("hg19.blacklist.bed.gz"); black_list.gr = GRanges( black_list[,1], IRanges(black_list[,2], black_list[,3]) ); idy = queryHits( findOverlaps(x.sp@feature, black_list.gr) ); if(length(idy) > 0){ x.sp = x.sp[,-idy, mat="bmat"]; }; x.sp # Remove unwanted chromosomes chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))]; idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature); if(length(idy) > 0){ x.sp = x.sp[,-idy, mat="bmat"] }; x.sp
# The coverage of bins roughly obeys a log normal distribution. We remove the top 5% bins that overlap with invariant features such as the house keeping gene promoters. bin.cov = log10(Matrix::colSums(x.sp@bmat)+1); hist( bin.cov[bin.cov > 0], xlab="log10(bin cov)", main="log10(Bin Cov)", col="lightblue", xlim=c(0, 5) ); bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95); idy = which(bin.cov <= bin.cutoff & bin.cov > 0); x.sp = x.sp[, idy, mat="bmat"]; x.sp # We will further remove any cells of bin coverage less than 1,000. The rational behind this is that some cells may have high number of unique fragments but end up with low bin coverage after filtering. This step is optional but highly recommended. idx = which(Matrix::rowSums(x.sp@bmat) > 1000); x.sp = x.sp[idx,]; x.sp
# Step 5. Dimensionality reduction row.covs.dens <- density( x = x.sp@metaData[,"logUMI"], bw = 'nrd', adjust = 1 ); sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[,"logUMI"])$y + .Machine$double.eps); set.seed(1); idx.landmark.ds <- base::sort(sample(x = seq(nrow(x.sp)), size = 10000, prob = sampling_prob)); x.landmark.sp = x.sp[idx.landmark.ds,]; x.query.sp = x.sp[-idx.landmark.ds,]; x.landmark.sp = runDiffusionMaps( obj= x.landmark.sp, input.mat="bmat", num.eigs=50 ); x.landmark.sp@metaData$landmark = 1; x.query.sp = runDiffusionMapsExtension( obj1=x.landmark.sp, obj2=x.query.sp, input.mat="bmat" ); x.query.sp@metaData$landmark = 0; x.sp = snapRbind(x.landmark.sp, x.query.sp); x.sp = x.sp[order(x.sp@metaData["sample"])]; x.sp = runKNN( obj=x.sp, eigs.dims=1:20, k=15 ); x.sp=runCluster( obj=x.sp, tmp.folder=tempdir(), louvain.lib="R-igraph", #"leiden" preferred, but may cause issues. Requires 'library(leiden)'. seed.use=10, resolution=0.7 )
Add the computed clusters to the Snap object metadata.
The Snap object contains two samples, "PBMC 5K" and "PBMC 10K". Let's add a "Sample" column to the metadata. Let's also add a column "files" pointing to the original .snap files from which each cell came.
# Add clusters (from SnapATAC::runCluster) to metadata x.sp@metaData$cluster = x.sp@cluster # Add Sample name to metadata (if not done previously) x.sp@metaData$Sample = x.sp@sample # Add files to metadata, indicating the original snap file each cell belongs to. snap.files <- c( "atac_pbmc_5k_nextgem.snap", "atac_pbmc_10k_nextgem.snap" ) fileList <- unlist(lapply(x.sp@metaData$Sample, function(x){ ifelse(x == "PBMC 5K", snap.files[[1]],snap.files[[2]]) })) x.sp@metaData$files <- fileList # SAVE this metadata to disk write.csv(x.sp@metaData, "./snapMetadataforMOCHA.csv")
Now we have a Snap object with metadata containing barcodes, unique cell ids (column cell_id), sample names, and cell populations (cluster). We also have our HG19 blackList, black_list.gr
.
Note: Following the tutorial from SnapATAC can often result in a segfault when extracting fragments with
SnapATAC::extractReads
. We recommend running on a machine with large RAM and avoiding parallelization.
Next we extract reads by sample and cell population, ensuring our final GRanges list is named following the pattern CellPopulation#Sample
.
snapMetadata <- read.csv("./snapMetadataforMOCHA.csv") cellCol <- "barcode" cellPopLabel <- "cluster" cellPopulations <- unique(snapMetadata$cluster) allSamples <- unique(snapMetadata$Sample) fragmentsGRangesList <- unlist(lapply(allSamples, function(sample){ barcodesList <- lapply(cellPopulations, function(cellPop) { snapMetadata[snapMetadata$Sample == sample,] # Extract barcodes for a single cell population cellPopIdx <- snapMetadata$cluster == cellPop cellPopBarcodes <- snapMetadata[cellPopIdx,]$barcode # Build the file list for the selected cell barcode files <- snapMetadata[cellPopIdx,]$files # Extract fragments cellPopFrags <- SnapATAC::extractReads(cellPopBarcodes, files, do.par = FALSE) }) names(barcodesList) <- paste(cellPopulations, sample, sep="#") barcodesList }))
Calculate the study signal (the median number of fragments per cell)
avg_reads <- lapply(fragmentsGRangesList, function(x){ filtFrag <- dplyr::filter(as.data.frame(x), barcode %in% snapMetadata$barcode) as.vector(table(filtFrag$barcode)) }) studySignal <- median(unlist(avg_reads))
# Call Open Tiles tileResults <- MOCHA::callOpenTiles( ATACFragments = fragmentsGRangesList, cellColData = snapMetadata, blackList = black_list.gr, genome = "BSgenome.Hsapiens.UCSC.hg38", cellPopLabel = cellPopLabel, cellPopulations = cellPopulations, studySignal = studySignal, cellCol = cellCol, TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene", Org = "org.Hs.eg.db", outDir = paste(getwd(),'/MOCHA_Out', sep = ''), numCores = 5 ) TSAM <- MOCHA::getSampleTileMatrix(tileResults, threshold = 0.2, numCores = 3, verbose = TRUE)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.