knitr::opts_chunk$set(eval = FALSE)
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Workflow human 10x scRNA-seq dataset (5')

In this tutorial, we are going to utilize 5' scRNA-seq data on epigenetically de-repressed cancer cell lines to quantify transposable element (TE) expression levels at single-cell and locus resolution. Following the workflow, you'll learn the specifics of Repsc to adapt it to your single-cell dataset.

Getting started

We start the workflow by loading Repsc and the human hg38 BSgenome object into our R environment:

Sys.time()
library(Repsc)
library(BSgenome.Hsapiens.UCSC.hg38)

# Repdata contains gene and TE annotation files, you can download and define those files manually (see below)
devtools::load_all('/net/mraid14/export/tgdata/users/davidbr/src/Repdata/')

Deduplicate Reads (parallel)

We remove duplicated reads by creating artificial contigs along the chromosomes followed by deduplication with UMI tools.

# path to BAM/SAM files containing mapped reads                         
bam_paths <-  dir("~/tgdata/data/epitherapy_reanalyzed/aligned", full.names = TRUE, pattern = 'bam.bam$')

# split BAM by chromosome
for (bam in bam_paths)
{
  Reputils::splitBAM(bam)
}

# deduplicate split BAMs
bam_paths <- dir("~/tgdata/data/epitherapy_reanalyzed/aligned", full.names = TRUE, pattern = 'chr[0-9, X, Y, MT]*.bam$')
future::plan(future.batchtools::batchtools_sge(resources = list(queue = "all.q", threads = 3, memory = 25), workers = Inf))
res <- listenv::listenv()
for (bam in bam_paths)
{
  print(bam)
  res[[bam]] %<-% Reputils::deduplicateBAM(bam, paired = TRUE, ncores = 3, align_dist = 1e3) %packages% "data.table"
}
as.list(res)

Create scSet

We then import our gene and TE annotation files as GRanges objects followed by Repsc-specific curation and formatting using the curateGenes and curateTEs functions.

# path to Gencode gtf file (provided)
gene_path <- system.file(package = 'Repdata', 
                         'extdata',
                         'hg38',
                         'genes',                         
                         'gencode.v29.annotation.gtf.gz')

# path to RepeatMasker hg38 repeat annotation (provided)
rmsk_path <- system.file(package = 'Repdata', 
                         'extdata',
                         'hg38',
                         'tes',                         
                         'hg38.fa.out.gz')

# creating the scSet
sc <- createScSet(genome   = Hsapiens,
                  protocol = 'fiveprime',
                  tes      = rmsk_path,
                  genes    = gene_path)

Create the input data.frame

Here we define the input data

# path to bam files containing mapped reads                         
bam_paths <-  dir("~/tgdata/data/epitherapy_reanalyzed/aligned/", 
                  pattern = '_deduplicated.bam$', 
                  recursive = TRUE,
                  full.names = TRUE)

hdf5_paths <- c(
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/hct116/10x/dmso/hct116_DMSO/outs/filtered_gene_bc_matrices_h5.h5',
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/hct116/10x/dacsb/hct116_DACSB/outs/filtered_gene_bc_matrices_h5.h5',
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/h1299/10x/dmso/h1299_DMSO/outs/filtered_gene_bc_matrices_h5.h5',
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/h1299/10x/dacsb/h1299_DACSB/outs/filtered_gene_bc_matrices_h5.h5'
                )             

# create a data.frame specifying import parameters                 
input_df    <- data.frame(paths   = bam_paths,
                        paired  = TRUE,       # use FALSE for single-end libraries
                        mate    = 'first',    # only imports the first mate of properly aligned read pairs, set to NA when using single-end libraries
                        barcode = 'CB',       # 10x barcode included in BAM flag
                        chunk   = Reputils:::chunkFiles(bam_paths, n_chunks = 12),
                        #hdf5    = rep(hdf5_paths, each = 25),
                        meta    = rep(c('h1299_dacsb', 'h1299_dmso', 'hct116_dacsb', 'hct116_dmso'), each = 25),
                        stringsAsFactors = FALSE)

checkInput(input_df)                      
sc <- addCounts(sc,
                bams     = input_df,
                bin_size = 25,
                use_gcluster = TRUE)

Call cells

To distinguish real cells from empty droplets, we utilize the emptyDrops function from the DropletUtils package[1].

plotCells(sc)
sc_f <- selectCells(sc, min_size = 5000, max_mito = 0.05)
plotCells(sc_f)

Mapping

plotMapping(sc_f)

Call peaks

sc_f <- selectPeaks(sc_f)
plotPeaks(sc_f)

Feature selection

sc_f <- selectFeatures(sc_f, min_expr_third = 5)
plotFeatures(sc_f)

Export results

export(sc_f, outdir = tempdir())

References

[1]

Session information

sessionInfo()


tanaylab/repsc documentation built on Jan. 9, 2021, 9:39 a.m.