knitr::opts_chunk$set(eval = FALSE)
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Workflow human 10x scRNA-seq dataset (5')

In this tutorial, we are going to utilize 5' scRNA-seq data on epigenetically de-repressed cancer cell lines to quantify transposable element (TE) expression levels at single-cell and locus resolution. Following the workflow, you'll learn the specifics of Repsc to adapt it to your single-cell dataset.

Getting started

We start the workflow by loading Repsc and the human hg38 BSgenome object into our R environment:

library(Repsc)
devtools::load_all('/net/mraid14/export/tgdata/users/davidbr/src/Repdata/')

# adjust to your genome of interest (e.g. BSgenome.Mmusculus.UCSC.mm10)
library(BSgenome.Hsapiens.UCSC.hg38)

Deduplicate Reads (parallel)

# path to BAM/SAM files containing mapped reads                         
bam_paths <-  dir("~/tgdata/data/epitherapy_reanalyzed/aligned", full.names = TRUE, pattern = 'bam.bam$')

# split BAM by chromosome
for (bam in bam_paths)
{
  Reputils::splitBAM(bam)
}

# deduplicate split BAMs
bam_paths <- dir("~/tgdata/data/epitherapy_reanalyzed/aligned", full.names = TRUE, pattern = 'chr[0-9, X, Y, MT]*.bam$')
future::plan(future.batchtools::batchtools_sge(resources = list(queue = "all.q", threads = 3, memory = 25), workers = Inf))
res <- listenv::listenv()
for (bam in bam_paths)
{
  print(bam)
  res[[bam]] %<-% Reputils::deduplicateBAM(bam, paired = TRUE, ncores = 3, align_dist = 1e3) %packages% "data.table"
}
as.list(res)

Create scSet

We then import our gene and TE annotation files as GRanges objects followed by Repsc-specific curation and formatting using the curateGenes and curateTEs functions.

# path to Gencode gtf file (provided)
gene_path <- system.file(package = 'Repdata', 
                         'extdata',
                         'hg38',
                         'genes',                         
                         'gencode.v29.annotation.gtf.gz')

# path to RepeatMasker hg38 repeat annotation (provided)
rmsk_path <- system.file(package = 'Repdata', 
                         'extdata',
                         'hg38',
                         'tes',                         
                         'hg38.fa.out.gz')

# creating the scSet
sc <- createScSet(genome   = Hsapiens,
                  protocol = 'fiveprime',
                  tes      = rmsk_path,
                  genes    = gene_path)

Compute multiple sequence alignments

Repsc computes the read/UMI coverage along genes and the consensus model of TE families. This can be useful to sanity check 5'/3' enrichment (depending on the protocol) and to identify putative TE consensus TSSs (5' protocols), polyA-sites (3' protocols), and to distinguish true de-repression from spurious background signal (e.g. intronic TE read mis-assignment, broad-scale genomic background transcription, etc.). As a rough estimate, we can utilize the consensus mapping information from Repeatmasker or DFAM output files for that purpose. This will usually provide reasonable results for highly conserved families. To increase accuracy, Repsc can also compute family-wise multiple sequence alignments to improve mapping of individual loci onto a de novo alignment. When time and computational ressources are no limitation, we recommend this step by running:

# path to bam files containing mapped reads                         
bam_paths <-  dir("~/tgdata/data/epitherapy_reanalyzed/aligned/", 
                  pattern = '_deduplicated.bam$', 
                  recursive = TRUE,
                  full.names = TRUE)

hdf5_paths <- c(
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/hct116/10x/dmso/hct116_DMSO/outs/filtered_gene_bc_matrices_h5.h5',
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/hct116/10x/dacsb/hct116_DACSB/outs/filtered_gene_bc_matrices_h5.h5',
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/h1299/10x/dmso/h1299_DMSO/outs/filtered_gene_bc_matrices_h5.h5',
                '/net/mraid14/export/data/users/davidbr/proj/epitherapy/data/h1299/10x/dacsb/h1299_DACSB/outs/filtered_gene_bc_matrices_h5.h5'
                )             

# create a data.frame specifying import parameters                 
input_df    <- data.frame(paths   = bam_paths,
                        paired  = TRUE,       # use FALSE for single-end libraries
                        mate    = 'first',    # only imports the first mate of properly aligned read pairs, set to NA when using single-end libraries
                        barcode = 'CB',       # 10x barcode included in BAM flag
                        chunk   = chunkFiles(bam_paths, n_chunks = 12),
                        #hdf5    = rep(hdf5_paths, each = 25),
                        meta    = rep(c('h1299_dacsb', 'h1299_dmso', 'hct116_dacsb', 'hct116_dmso'), each = 25),
                        stringsAsFactors = FALSE)

checkInput(input_df)                      
sc <- addCounts(sc,
                bams     = input_df,
                bin_size = 25,
                msa_dir  = NULL,
                use_gcluster = TRUE)

resolve Overlaps

Call cells

To distinguish real cells from empty droplets, we utilize the emptyDrops function from the DropletUtils package[1].

plotCells(sc)
sc_f <- selectCells(sc, min_size = 5e3, min_ribo = 0.01, max_mito = 0.4)
plotCells(sc_f)

Mapping

plotMapping(sc_f)

Call peaks

sc_f <- selectPeaks(sc_f)
plotPeaks(sc_f)

Feature selection

sc_f <- selectFeatures(sc_f, min_expr_third = 5)
plotFeatures(sc_f)

Export

export(sc_f, outdir = '~/tgdata/data/tmp/')

References

[1]

Session information

sessionInfo()


tanaylab/repsc documentation built on Jan. 9, 2021, 9:39 a.m.