knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "60%",
  out.height = "60%"
)

scATACutils: an R/Bioconductor package for working with 10x scATACseq data

The goal of scATACutils is to provide functions to work with 10x scATACseq data. It includes functions to calculate qualtiy control metrics such as banding score, Frip score, and TSS enrichment score etc. Some convinient functions are provided for visulizations as well. Plot the scatter plot of the Frip and read depth. Plot scATACseq coverage tracks by group etc.

Installation

You can install the released version of scATACutils from github with:

devtools::install_github("crazyhottommy/scATACutils")

Example

Demonstration of some useful functions

Download the public 5k pbmc data at https://support.10xgenomics.com/single-cell-atac/datasets/1.1.0/atac_pbmc_5k_v1

library(scATACutils)
library(dplyr)
library(readr)
library(ggplot2)


## this takes 5 mins
#frip<- CalculateFripScore("~/5k_pbmc_atac/fragments.tsv.gz",
#                          "~/5k_pbmc_atac/peaks.bed")

frip<- read_tsv("~/5k_pbmc_atac/frip.txt", col_names = T)

# a tibble with 3 columns, cell-barcode, depth and a Frip score
head(frip)

## read in 5k pbmc atac data valid barcdoe
barcodes<- read_tsv("~/5k_pbmc_atac/pbmc_5k_atac_barcodes.tsv", col_names = F)

# the insert size distribution from https://github.com/crazyhottommy/scATACtools/blob/master/python/get_insert_size_distribution_per_cell.py

insert<- read_tsv("~/5k_pbmc_atac/pbmc_5k_insert_size.txt", col_names = T)

head(insert)

## this takes ~5mins
banding<- CalculateBandingScore(insert, barcodeList = NULL)

## distribution of the banding score after log10 transformation
ggplot(banding, aes(sample = log10(banding_score))) + 
  stat_qq() + 
  stat_qq_line(color = "red") +
  theme_bw(base_size = 14)

TSS enrichment score

This can take ~2hours using 10 CPUs for 5000 cells.

library(TxDb.Hsapiens.UCSC.hg19.knownGene)

tss_scores<- TssEnrichmentFromFrags("~/5k_pbmc_atac/fragments.tsv.gz",
                                    txs = TxDb.Hsapiens.UCSC.hg19.knownGene,
                                    workers = 10,
                                    barcodeList = barcodes$X1)

depth vs Frip and TSS score

read in a pre-computed tss score for the 5k pbmc atac dataset.

tss_scores<- readRDS("~/5k_pbmc_atac/5k_pbmc_atac_tss_scores.rds")
head(tss_scores)
head(frip)

frip_tss<- inner_join(frip, tss_scores)

head(frip_tss)

TSS score

PlotScatter(frip_tss, y = "tss_score", vline = 3, hline = 6)

Plot PC correlation with the sequencing depth

It is known that the first PC is correlated with the sequencing depth, and it is usually discarded for downstream clustering. Let's check that.

Also see Assessment of computational methods for the analysis of single-cell ATAC-seq data for discussing this as well.

library(Seurat)

peaks <- Read10X_h5(filename = "~/5k_pbmc_atac/atac_pbmc_5k_v1_filtered_peak_bc_matrix.h5")

# binarize the matrix
# [email protected][[email protected] >0]<- 1 

## create a seurat object
pbmc_seurat <- CreateSeuratObject(counts = peaks, assay = 'ATAC', project = '5k_pbmc')

pbmc_seurat@meta.data %>% head()

## do TF-IDF transformation and run PCA for dimension reduction/Latent Semantic Index
pbmc_seurat<- RunLSI(pbmc_seurat, n = 50)

## convert to SingleCellExperiment
pbmc_se<- as.SingleCellExperiment(pbmc_seurat)

PlotPCcorrelation(pbmc_seurat, reduction = "lsi")

Note the name of the reduction is different for SingleCellExperiment object.

PlotPCcorrelation(pbmc_se, reduction = "LSI")

Plot ATACseq tracks for each cluster of cells

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)

PlotCoverageByGroup(gene_name = "MS4A1", downstream = 8000, 
                    yaxis_cex = 1,fragment = "~/5k_pbmc_atac/atac_viz/10k_pbmc/atac_v1_pbmc_10k_fragments.tsv.gz",
                     grouping = "~/5k_pbmc_atac/atac_viz/grouping.txt", tick_label_cex = 1, tick.dist = 5000,
                     track_col = "red", 
                     label_cex = 1,
                     minor.tick.dist = 1000, label.margin = -0.6)

plot raw signal for each cell

barcodes<- read_tsv("~/5k_pbmc_atac/pbmc_5k_atac_barcodes.tsv", col_names = FALSE)

PlotCoverageByCell(gene_name = "MS4A1",
                   upstream = 2000, 
                   downstream = 8000,
                   fragment= "~/5k_pbmc_atac/fragments.tsv.gz",
                   barcodeList=barcodes$X1,
                   genome = "hg19", 
                   txdb = TxDb.Hsapiens.UCSC.hg19.knownGene,
                   eg.db = org.Hs.eg.db, cutSite = FALSE,
                   col_fun = c("white", "blue","red"))

You might want to concatenate the raw signal with the coverage plot by celltype using inkscape or adobe illustrator. It is possible to export the karyplot object as a grob and combine with the ComplexHeatmap object using grid (see https://github.com/bernatgel/karyoploteR/issues/51). Currently, it is not implemented.

Transcription factor motif footprint

library(BSgenome.Hsapiens.UCSC.hg19)
library(TFBSTools)
library(motifmatchr)
library(JASPAR2018)
library(rtracklayer)
library(EnrichedHeatmap)

opts<- list()
opts[["species"]] <- "Homo sapiens"
## let's plot GATA2 motif footprint
opts[["name"]] <- "GATA2"
opts[["type"]] <- "ChIP-seq"
opts[["all_versions"]] <- TRUE
PFMatrixList <- getMatrixSet(JASPAR2018, opts)

PFMatrix<- PFMatrixList[[1]]
PWM <- toPWM(PFMatrix, pseudocounts = 0.8)

peaks<- import("~/5k_pbmc_atac/peaks.bed")

For footprint, it is important to read the fragments as cut sites, otherwise you will not observe a dip in the motif where TF binds and protects DNA from being cut.

insertions<- ReadFragments("~/5k_pbmc_atac/fragments.tsv.gz", cutSite = TRUE)
cvg<- GenomicRanges::coverage(insertions)

plots<- PlotMotifFootPrint(PWM = PWM, peaks = peaks, cvg = cvg, extend = 100)
plots$heatmap
plots$lineplot

correct for cutting bias Tn5 has a cutting bias. For more sophisticated bias correcting methods, check:

Acknowlegements



crazyhottommy/scATACutils documentation built on Jan. 13, 2020, 4:40 a.m.