knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "60%", out.height = "60%" )
The goal of scATACutils is to provide functions to work with 10x scATACseq data. It includes functions to calculate qualtiy control metrics such as banding score, Frip score, and TSS enrichment score etc. Some convinient functions are provided for visulizations as well. Plot the scatter plot of the Frip and read depth. Plot scATACseq coverage tracks by group etc.
You can install the released version of scATACutils from github with:
devtools::install_github("crazyhottommy/scATACutils")
Demonstration of some useful functions
Download the public 5k pbmc data at https://support.10xgenomics.com/single-cell-atac/datasets/1.1.0/atac_pbmc_5k_v1
library(scATACutils) library(dplyr) library(readr) library(ggplot2) ## this takes 5 mins #frip<- CalculateFripScore("~/5k_pbmc_atac/fragments.tsv.gz", # "~/5k_pbmc_atac/peaks.bed") frip<- read_tsv("~/5k_pbmc_atac/frip.txt", col_names = T) # a tibble with 3 columns, cell-barcode, depth and a Frip score head(frip) ## read in 5k pbmc atac data valid barcdoe barcodes<- read_tsv("~/5k_pbmc_atac/pbmc_5k_atac_barcodes.tsv", col_names = F) # the insert size distribution from https://github.com/crazyhottommy/scATACtools/blob/master/python/get_insert_size_distribution_per_cell.py insert<- read_tsv("~/5k_pbmc_atac/pbmc_5k_insert_size.txt", col_names = T) head(insert) ## this takes ~5mins banding<- CalculateBandingScore(insert, barcodeList = NULL) ## distribution of the banding score after log10 transformation ggplot(banding, aes(sample = log10(banding_score))) + stat_qq() + stat_qq_line(color = "red") + theme_bw(base_size = 14)
This can take ~2hours using 10 CPUs for 5000 cells.
library(TxDb.Hsapiens.UCSC.hg19.knownGene) tss_scores<- TssEnrichmentFromFrags("~/5k_pbmc_atac/fragments.tsv.gz", txs = TxDb.Hsapiens.UCSC.hg19.knownGene, workers = 10, barcodeList = barcodes$X1)
read in a pre-computed tss score for the 5k pbmc atac dataset.
tss_scores<- readRDS("~/5k_pbmc_atac/5k_pbmc_atac_tss_scores.rds") head(tss_scores) head(frip) frip_tss<- inner_join(frip, tss_scores) head(frip_tss)
TSS score
PlotScatter(frip_tss, y = "tss_score", vline = 3, hline = 6)
It is known that the first PC is correlated with the sequencing depth, and it is usually discarded for downstream clustering. Let's check that.
Also see Assessment of computational methods for the analysis of single-cell ATAC-seq data for discussing this as well.
library(Seurat) peaks <- Read10X_h5(filename = "~/5k_pbmc_atac/atac_pbmc_5k_v1_filtered_peak_bc_matrix.h5") # binarize the matrix # peaks@x[peaks@x >0]<- 1 ## create a seurat object pbmc_seurat <- CreateSeuratObject(counts = peaks, assay = 'ATAC', project = '5k_pbmc') pbmc_seurat@meta.data %>% head() ## do TF-IDF transformation and run PCA for dimension reduction/Latent Semantic Index pbmc_seurat<- RunLSI(pbmc_seurat, n = 50) ## convert to SingleCellExperiment pbmc_se<- as.SingleCellExperiment(pbmc_seurat) PlotPCcorrelation(pbmc_seurat, reduction = "lsi")
Note the name of the reduction is different for SingleCellExperiment object.
PlotPCcorrelation(pbmc_se, reduction = "LSI")
More examples can be found at https://rpubs.com/crazyhottommy/scATAC_tracks
library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) PlotCoverageByGroup(gene_name = "MS4A1", downstream = 8000, yaxis_cex = 1,fragment = "~/5k_pbmc_atac/atac_viz/10k_pbmc/atac_v1_pbmc_10k_fragments.tsv.gz", grouping = "~/5k_pbmc_atac/atac_viz/grouping.txt", tick_label_cex = 1, tick.dist = 5000, track_col = "red", label_cex = 1, minor.tick.dist = 1000, label.margin = -0.6)
barcodes<- read_tsv("~/5k_pbmc_atac/pbmc_5k_atac_barcodes.tsv", col_names = FALSE) p<- PlotCoverageByCell(gene_name = "MS4A1", upstream = 2000, downstream = 8000, fragment= "~/5k_pbmc_atac/fragments.tsv.gz", barcodeList=barcodes$X1, genome = "hg19", txdb = TxDb.Hsapiens.UCSC.hg19.knownGene, eg.db = org.Hs.eg.db, cutSite = FALSE, col_fun = c("white", "blue","red")) p
You might want to concatenate the raw signal with the coverage plot by celltype using inkscape
or adobe illustrator
.
It is possible to export the karyplot object as a grob and combine with the ComplexHeatmap object using grid (see
https://github.com/bernatgel/karyoploteR/issues/51). Currently, it is not implemented.
library(BSgenome.Hsapiens.UCSC.hg19) library(TFBSTools) library(motifmatchr) library(JASPAR2018) library(rtracklayer) library(EnrichedHeatmap) opts<- list() opts[["species"]] <- "Homo sapiens" ## let's plot GATA2 motif footprint opts[["name"]] <- "GATA2" opts[["type"]] <- "ChIP-seq" opts[["all_versions"]] <- TRUE PFMatrixList <- getMatrixSet(JASPAR2018, opts) PFMatrix<- PFMatrixList[[1]] PWM <- toPWM(PFMatrix, pseudocounts = 0.8) peaks<- import("~/5k_pbmc_atac/peaks.bed")
For footprint, it is important to read the fragments as cut sites, otherwise you will not observe a dip in the motif where TF binds and protects DNA from being cut.
insertions<- ReadFragments("~/5k_pbmc_atac/fragments.tsv.gz", cutSite = TRUE) cvg<- GenomicRanges::coverage(insertions) plots<- PlotMotifFootPrint(PWM = PWM, peaks = peaks, cvg = cvg, extend = 100) plots$heatmap plots$lineplot
correct for cutting bias Tn5 has a cutting bias. For more sophisticated bias correcting methods, check:
many of the functions are being integrated to our MAESTRO single-cell RNAseq and ATACseq analysis pipeline developed in Shirley Liu's lab. That being said, I plan to implement more new features in MAESTRO
and will leave scATACutils
relatively quiet :).
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.