knitr::opts_chunk$set(eval = FALSE)
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Workflow human 10x scRNA-seq dataset (5')

In this tutorial, we are going to utilize 5' scRNA-seq data on epigenetically de-repressed cancer cell lines to quantify transposable element (TE) expression levels at single-cell and locus resolution. Following the workflow, you'll learn the specifics of Repsc to adapt it to your single-cell dataset.

Getting started

We start the workflow by loading Repsc and the human hg38 BSgenome object into our R environment:

Sys.time()
devtools::load_all('/net/mraid14/export/tgdata/users/davidbr/src/Repsc/')
devtools::load_all('/net/mraid14/export/tgdata/users/davidbr/src/Reputils/')
devtools::load_all('/net/mraid14/export/tgdata/users/davidbr/src/Repdata/')

library(BSgenome.Mmusculus.UCSC.mm10)

Deduplicate Reads (parallel)

# path to BAM/SAM files containing mapped reads                         
bam_paths <-  dir("~/tgdata/data/nowotschin_2019/aligned", full.names = TRUE, pattern = 'bam.bam$')

# split BAM by chromosome
for (bam in bam_paths)
{
  Reputils::splitBAM(bam)
}

# deduplicate split BAMs
bam_paths <- dir("~/tgdata/data/nowotschin_2019/aligned", full.names = TRUE, pattern = 'chr[0-9, X, Y, MT]*.bam$')
future::plan(future.batchtools::batchtools_sge(resources = list(queue = "all.q", threads = 3, memory = 25), workers = Inf))
res <- listenv::listenv()
for (bam in bam_paths)
{
  print(bam)
  res[[bam]] %<-% Reputils::deduplicateBAM(bam, paired = FALSE, ncores = 3, align_dist = 1e4) %packages% "data.table"
}
as.list(res)

Create scSet

We then import our gene and TE annotation files as GRanges objects followed by Repsc-specific curation and formatting using the curateGenes and curateTEs functions.

# path to Gencode gtf file (provided)
gene_path <- system.file(package = 'Repdata', 
                        'extdata', 
                        'mm10',
                        'genes',
                        'gencode.vM22.annotation.gtf.gz')

# path to RepeatMasker mm9 repeat annotation (provided)
rmsk_path <- system.file(package = 'Repdata', 
                         'extdata', 
                         'mm10',
                         'tes',
                         'mm10.fa.out.gz')

# creating the scSet
sc <- createScSet(genome   = Mmusculus,
                  protocol = 'threeprime',
                  tes      = rmsk_path,
                  genes    = gene_path)

Create the input data.frame

# path to bam files containing mapped reads                         
bam_paths <- dir("~/tgdata/data/nowotschin_2019/aligned/", pattern = 'deduplicated.bam$', full.names = TRUE)

#hdf5_paths <- dir("~/tgdata/data/nowotschin_2019/cellranger/", pattern = 'filtered_feature_bc_matrix.h5', full.names = TRUE, recursive = TRUE)

# create a data.frame specifying import parameters                 
input_df    <- data.frame(paths       = bam_paths,
                          paired      = FALSE,       # use FALSE for single-end libraries
                          barcode     = 'CB',
                          meta        = substring(bam_paths, 52, 65),
                          chunk       = chunkFiles(bam_paths, 20),
                          stringsAsFactors = FALSE)

checkInput(input_df)                      

After we have imported and curated the data, we can procede to generate the actual read/UMI count matrix using the addCounts function.

sc <- addCounts(sc,
                bams         = input_df,
                use_gcluster = TRUE)

Call cells

plotCells(sc)
sc_f <- selectCells(sc, max_mito = 0.05, min_size = 5000)
plotCells(sc_f)

Mapping

plotMapping(sc_f)

Call peaks

sc_f <- selectPeaks(sc_f)
plotPeaks(sc_f)

Feature selection

sc_f <- selectFeatures(sc_f)
plotFeatures(sc_f)

References

[1]

Session information

sessionInfo()


tanaylab/repsc documentation built on Jan. 9, 2021, 9:39 a.m.