SnapATAC: Single Nucleus Analysis Package for ATAC-Seq

In this example, we will be integrating two datasets from adult mouse brain generated using 10X and snATAC-seq.

Step 0. Download data
Step 1. Create snap object
Step 2. Select barcode
Step 3. Add cell-by-bin matrix
Step 4. Combine snap objects
Step 5. Filter bins
Step 6. Dimensionality reduction
Step 7. Determine significant components
Step 8. Remove batch effect
Step 9. Graph-based cluster
Step 10. Visualization

Step 0. Download data

We will download the snap file(See here).

Step 1. Create snap object In this example, we will create a list of snap objects that contains two datasets.

> library(SnapATAC);
> file.list = c("CEMBA180305_2B.snap", "atac_v1_adult_brain_fresh_5k.snap");
> sample.list = c("snATAC", "10X");
> x.sp.ls = lapply(seq(file.list), function(i){
    x.sp = createSnap(file=file.list[i], sample=sample.list[i]);
    x.sp
  })
> names(x.sp.ls) = sample.list;
> sample.list

## $snATAC
## number of barcodes: 15136
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
## 
## $`10X`
## number of barcodes: 20000
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0

Step 2. Select barcode Next, we read the barcode list which contains the high-quality barcodes. See here for barcode selection.

> barcode.file.list = c("CEMBA180305_2B.barcode.txt", "atac_v1_adult_brain_fresh_5k.barcode.txt");
> barcode.list = lapply(barcode.file.list, function(file){
    read.table(file)[,1];
  })
> x.sp.list = lapply(seq(x.sp.ls), function(i){
    x.sp = x.sp.ls[[i]];
    x.sp  = x.sp[x.sp@barcode %in% barcode.list[[i]],];
  })
> names(x.sp.list) = sample.list;

## $snATAC
## number of barcodes: 9646
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
## 
## $`10X`
## number of barcodes: 4100
## number of bins: 0
## number of genes: 0
## number of peaks: 0
## number of motifs: 0

Step 3. Add cell-by-bin matrix

> x.sp.list = lapply(seq(x.sp.list), function(i){
    x.sp = addBmatToSnap(x.sp.list[[i]], bin.size=5000);
    x.sp
  })
> x.sp.list

## $snATAC
## number of barcodes: 9646
## number of bins: 545118
## number of genes: 0
## number of peaks: 0
## number of motifs: 0
## 
## $`10X`
## number of barcodes: 4100
## number of bins: 546206
## number of genes: 0
## number of peaks: 0
## number of motifs: 0

These two snap objects have different number of bins because they were generated seperately using slightly different reference genome.

Step 4. Combine snap objects To combine these two snap objects, common bins are selected.

> bin.shared = Reduce(intersect, lapply(x.sp.list, function(x.sp) x.sp@feature$name));
> x.sp.list <- lapply(x.sp.list, function(x.sp){
    idy = match(bin.shared, x.sp@feature$name);
    x.sp[,idy, mat="bmat"];
  })
> x.sp = Reduce(snapRbind, x.sp.list);
> rm(x.sp.list); # free memory
> gc();
> table(x.sp@sample);

##   10X snATAC
##  4100   9646

Step 5. Binarize matrix

> x.sp = makeBinary(x.sp, mat="bmat");

Step 6. Filter bins First, we filter out any bins overlapping with the ENCODE blacklist to prevent from potential artifacts.

> system("wget http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/mm10-mouse/mm10.blacklist.bed.gz");
> library(GenomicRanges);
> black_list = read.table("mm10.blacklist.bed.gz");
> black_list.gr = GRanges(
    black_list[,1], 
    IRanges(black_list[,2], black_list[,3])
  );
> idy = queryHits(findOverlaps(x.sp@feature, black_list.gr));
> if(length(idy) > 0){x.sp = x.sp[,-idy, mat="bmat"]};
> x.sp

## number of barcodes: 13746
## number of bins: 545015
## number of genes: 0
## number of peaks: 0
## number of motifs: 0

Second, we remove unwanted chromosomes.

> chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))];
> idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature);
> if(length(idy) > 0){x.sp = x.sp[,-idy, mat="bmat"]};
> x.sp

## number of barcodes: 13746
## number of bins: 545011
## number of genes: 0
## number of peaks: 0
## number of motifs: 0

Third, the bin coverage roughly obeys a log normal distribution. We remove the top 5% bins that overlap with invariant features such as promoters of the house keeping genes.

> bin.cov = log10(Matrix::colSums(x.sp@bmat)+1);
> bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95);
> idy = which(bin.cov <= bin.cutoff & bin.cov > 0);
> x.sp = x.sp[, idy, mat="bmat"];
> x.sp

## number of barcodes: 13746
## number of bins: 479127
## number of genes: 0
## number of peaks: 0
## number of motifs: 0

Step 7. Reduce dimensionality We compute landmark diffusion maps for dimentionality reduction. We first randomly sample 10,000 cells as landmarks and project the remaining cells (query) onto the diffusion maps embedding.

> row.covs = log10(Matrix::rowSums(x.sp@bmat)+1);
> row.covs.dens = density(
    x = row.covs, 
    bw = 'nrd', adjust = 1
  );
> sampling_prob = 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = row.covs)$y + .Machine$double.eps); 
> set.seed(1);
> idx.landmark.ds = sort(sample(x = seq(nrow(x.sp)), size = 10000, prob = sampling_prob));
> x.landmark.sp = x.sp[idx.landmark.ds,];
> x.query.sp = x.sp[-idx.landmark.ds,];
> x.landmark.sp = runDiffusionMaps(
    obj= x.landmark.sp,
    input.mat="bmat", 
    num.eigs=50
  );
> x.query.sp = runDiffusionMapsExtension(
    obj1=x.landmark.sp, 
    obj2=x.query.sp,
    input.mat="bmat"
  );
> x.landmark.sp@metaData$landmark = 1;
> x.query.sp@metaData$landmark = 0;
> x.sp = snapRbind(x.landmark.sp, x.query.sp);
## combine landmarks and query cells;
> x.sp = x.sp[order(x.sp@sample),]; # IMPORTANT
> rm(x.landmark.sp, x.query.sp); # free memory

Step 8. Determine significant components

> plotDimReductPW(
    obj=x.sp, 
    eigs.dims=1:50,
    point.size=0.3,
    point.color="grey",
    point.shape=19,
    point.alpha=0.6,
    down.sample=5000,
    pdf.file.name=NULL, 
    pdf.height=7, 
    pdf.width=7
  );

Step 9. Remove batch effect

> library(harmony);
> x.after.sp = runHarmony(
    obj=x.sp, 
    eigs.dims=1:22, 
    meta_data=x.sp@sample # sample index
  );

Step 10. Graph-based cluster

> x.after.sp = runKNN(
    obj= x.after.sp,
    eigs.dim=1:22,
    k=15
  );
> x.after.sp = runCluster(
     obj=x.after.sp,
     tmp.folder=tempdir(),
     louvain.lib="R-igraph",
     path.to.snaptools=NULL,
     seed.use=10
  );
> x.after.sp@metaData$cluster = x.after.sp@cluster;

Step 11. Visualization

> x.sp = runViz(
    obj=x.sp, 
    tmp.folder=tempdir(),
    dims=2,
    eigs.dims=1:22, 
    method="Rtsne",
    seed.use=10
  );
> x.after.sp = runViz(
    obj=x.after.sp, 
    tmp.folder=tempdir(),
    dims=2,
    eigs.dims=1:22, 
    method="Rtsne",
    seed.use=10
  );
> par(mfrow = c(2, 3));
> plotViz(
    obj=x.sp,
    method="tsne", 
    main="Before Harmony",
    point.color=x.sp@sample, 
    point.size=0.1, 
    text.add= FALSE,
    down.sample=10000,
    legend.add=TRUE
  );
> plotViz(
    obj=x.after.sp,
    method="tsne", 
    main="After Harmony",
    point.color=x.sp@sample, 
    point.size=0.1, 
    text.add=FALSE,
    down.sample=10000,
    legend.add=TRUE
  );
> plotViz(
    obj=x.after.sp,
    method="tsne", 
    main="Cluster",
    point.color=x.after.sp@cluster, 
    point.size=0.1, 
    text.add=TRUE,
    text.size=1,
    text.color="black",
    text.halo.add=TRUE,
    text.halo.color="white",
    text.halo.width=0.2,
    down.sample=10000,
    legend.add=FALSE
  );