In charles-plessy/smallCAGEqc: ad-hoc QC functions for shallow-sequenced test CAGE libraries

knitr::opts_chunk$set(cache  = TRUE)
knitr::opts_knit$set(verbose = TRUE)
# knitr::opts_chunk$set(dev=c('svg', 'png'))
options(width = 100)
ggplot2::theme_set(ggplot2::theme_bw())

Metadata that varies from run to run:

LIBRARY      <- "<Insert library ID>"  # For example, the sequencing run ID.
ZENBU_COLLAB <- "<Insert Zenbu collaboration ID>"
WORKFLOW     <- "<Insert workflow ID>" # For example "OP-WORKFLOW-CAGEscan-short-reads-v2.0"
MOIRAI_STAMP <- "<Insert Moirai run stamp>" # For example "20150803121351"
MOIRAI_PROJ  <- "<Insert Moirai user name>" # For example "Charles"
MOIRAI_USER  <- "<Insert Moirai user name>" # For example "nano-fluidigm"
ASSEMBLY     <- "<Insert name of genome assembly>" # For example "hg38"
BS_GENOME    <- "<Insert name of a BSGenome>" # For example "BSgenome.Hsapiens.UCSC.hg38"

# # In case of a pair of Fludigm C1 CAGE runs, uncomment the lines below and fill
# # insert missing information.  Otherwise, delete or leave as it is.
# RunA  <- "<Insert ID of Run A>"
# RunB  <- "<Insert ID of Run B>"
# ctrls <- list( RunA=list(posi="<Insert well ID>", nega="<Insert well ID>")
#              , RunB=list(posi="<Insert well ID>", nega="<Insert well ID>"))

Metadata inferred from the above:

BASEDIR      <- "/osc-fs_home/scratch/moirai"
MOIRAI_RUN   <- paste(LIBRARY, WORKFLOW, MOIRAI_STAMP, sep=".")
MOIRAI_BASE  <- file.path(BASEDIR, MOIRAI_USER, "project", MOIRAI_PROJ, MOIRAI_RUN)
MOIRAI_FRAGS <- file.path(MOIRAI_BASE, "CAGEscan_fragments")
MOIRAI_BAM   <- file.path(MOIRAI_BASE, "genome_mapped")
MOIRAI_URL   <- paste0("http://moirai.gsc.riken.jp/", MOIRAI_BASE, "/", MOIRAI_RUN, ".html")

Sequenced on MiSeq (r LIBRARY)

Jump directly to the Analysis section if you are not interested in the details of the data processing.

Data load and QC in R

library("AnnotationHub")
library(BS_GENOME, character.only = T)
library("CAGEr")
library("data.table")
library("ggplot2")
library("magrittr")
library("oscR")
library("plyr")
library("smallCAGEqc")
stopifnot(
    packageVersion("oscR") >= "0.2.0"
  , packageVersion("smallCAGEqc") >= "1"
)
library("SummarizedExperiment")
library("reshape")
library("vegan")

MOIRAI metadata

libs <- loadMoiraiStats(processed_data = MOIRAI_BASE)
# Pipe (%>%) to llPostProcess("nano-fluidigm") or
# llPostProcess("C1 96") if appropriate; see help(llPostProcess).
libs %>% summary

Intranet link: r MOIRAI_RUN

Transcript counts (properly paired)

pathsToInputFiles <- list.files(MOIRAI_FRAGS, "*.bed", full.names = TRUE) %>%
  Filter(f = function(file) file.size(file) > 0)
samplenames <- basename(pathsToInputFiles) %>% sub(".bed", "", .)
myCAGEset <- new( "CAGEset"
                , genomeName = BS_GENOME
                , inputFiles = pathsToInputFiles
                , inputFilesType = "bed"
                , sampleLabels = samplenames)
getCTSS(myCAGEset)
normalizeTagCount(myCAGEset, method = "simpleTpm")

Raw reads per molecule (BED12 data)

bed <- loadBED12(pathsToInputFiles)
levels(bed$library) <- samplenames

CAGEr functions

Here are functions for making CTSS tables, that I hope can be merged in CAGEr. https://github.com/Bioconductor-mirror/CAGEr/pull/2

CTSScoordinatesGR <- function (object){
        ctssCoord <- object@CTSScoordinates
        ctssCoord <- GRanges( ctssCoord$chr
                            , IRanges(ctssCoord$pos, ctssCoord$pos)
                            , ctssCoord$strand)
        genome(ctssCoord) <- object@genomeName
        ctssCoord
}

CTSStagCountSE <- function (object){
        colData <- data.frame( row.names   = object@sampleLabels
                             , samplename  = object@sampleLabels
                             , samplecolor = names(object@sampleLabels))
        SummarizedExperiment( assays  = as.matrix(object@tagCountMatrix)
                            , rowData = CTSScoordinatesGR(object)
                            , colData = colData)
}

l1  <- CTSStagCountSE(myCAGEset)
colnames(l1) <- samplenames
colData(l1) <- libs %>% DataFrame
l1$counts <- colSums(assay(l1))
l1$l1     <- colSums(assay(l1) > 0)

Annotation with GENCODE

Gene symbols

ah <- AnnotationHub()
query(ah, c("Gencode", "gff", "human"))
ah["AH49556"]
gff <- ah[["AH49556"]]
gff

rowRanges(l1)$genes  <- ranges2genes(rowRanges(l1), gff)
genes <- rowsum(assay(l1), rowRanges(l1)$genes) %>% data.frame
l1$genes <- colSums(genes > 0)
l1$geneSymbols <- countSymbols(genes)

Promoter, Exon, Intron, ...

rowRanges(l1)$annotation <- ranges2annot(rowRanges(l1), gff)

colData(l1) %<>% cbind(rowsum(assay(l1), rowRanges(l1)$annotation) %>% t %>% DataFrame)

Cell pictures

# read.fluo <- function(RUN) read.delim( paste0("../imageJ/", RUN, ".txt")
#                                      , row.names="cell_id"
#                                      , stringsAsFactors = FALSE)
# fluo <- rbind(read.fluo(RunA), read.fluo(RunB))
# libs <- cbind(libs, subset(fluo,,c("mean_ch2", "bg_mean_ch2", "mean_ch3", "bg_mean_ch3", "Error", "Comment")))
# libs$Error <- factor( libs$Error
#                     , levels=c("0-No Error", "1-No cell", "2-Debris", "3-OutOfFocus", "4-MultipleCells", "5-Control", "6-Dead"))

A hardocoded treshold of 2.5 is used to identify dead cells. The histogram below is to check if this value makes sense in this dataset.

# hist( libs$mean_ch3 - libs$bg_mean_ch3
#     , br   = 100
#     , main = "Current threshold for identifying dead cells"
#     , xlab = "mean_ch3 - bg_mean_ch3" )
# deadThresh <- 2.5
# abline(v=deadThresh, col="red")
# 
# libs[libs$mean_ch3 - libs$bg_mean_ch3 > deadThresh, "Error"] <- "6-Dead"

Controls

Some samples with errors were repalced by the positive and negative controls.

l1$Error[l1$Well %in% ctrls[["RunA"]] & l1$Run == RunA] <- "5-Control"
l1$Error[l1$Well %in% ctrls[["RunB"]] & l1$Run == RunB] <- "5-Control"
l1$Comment[l1$Well == ctrls$RunA$posi & l1$Run == RunA] <- "Positive control"
l1$Comment[l1$Well == ctrls$RunB$posi & l1$Run == RunB] <- "Positive control"
l1$Comment[l1$Well == ctrls$RunA$nega & l1$Run == RunA] <- "Negative control"
l1$Comment[l1$Well == ctrls$RunB$nega & l1$Run == RunB] <- "Negative control"

cDNA concentration.

read.pg <- function(RUN)
  paste0("../cDNA_yields/", RUN, ".picogreen.xlsx") %>%
    fldgmPicoGreen("PN 100-6160") %>%
    extract(,"concentration")
l1$Concentration <- c(read.pg(RunA), read.pg(RunB))
fldgmConcentrationPlot(colData(l1))

Combined analysis of fluorescence and cDNA concentration.

Array heatmaps.

# fldgmArrayQCplot <- function(RUN) fldgmArrayQC(libs[libs$Run==RUN,], RUN)

# fldgmArrayQCplot(RunA)

# fldgmArrayQCplot(RunB)

Live / dead stain and DNA concentration.

Both runs are plotted together; this explains why there may be two groups, when cDNA yields differed strongly.

# with(libs, plotNameColor(Concentration, mean_ch2 - bg_mean_ch2, Error, Well))
# with(libs, plotNameColor(Concentration, mean_ch3 - bg_mean_ch3, Error, Well))

Richness

l1$r100l1 <- rarefy(t(assay(l1)),100)
l1$r100l1[l1$counts < 100] <- NA

Analysis

# with(libs, plotNameColor(Concentration, r100l1, Error, Well))

# with(subset(libs, Error == "0-No Error"), plotNameColor(Concentration, r100l1, Run, Well))

QC barplots

plotAnnot(colData(l1), 'qc', LIBRARY, l1$Group)

Annotation

plotAnnot(colData(l1), 'counts', LIBRARY, l1$Group)

Correlation between runs

NMF::aheatmap( cor(genes[-1, ])
             , annCol=list(Run=l1$Run))

# Uncomment if you have a Error column
#NMF::aheatmap( cor(genes[-1, libs$Error == "0-No Error"])
#             , annCol=list(Run=libs[libs$Error == "0-No Error", "Run"]))

Gene counts and TSS discovery

Gene count by error code.

dotsize <- 50
ggplot(colData(l1) %>% data.frame, aes(x=Group, y=genes)) +
  stat_summary(fun.y=mean, fun.ymin=mean, fun.ymax=mean, geom="crossbar", color="gray") +
  geom_dotplot(aes(fill=Group), binaxis='y', binwidth=1, dotsize=dotsize, stackdir='center') +
  coord_flip() + facet_wrap(~Run)

Gene count per transcript count.

with(libs, plotNameColor(genes, counts, Run, Well))

Gene counts per C1 run.

#boxplot(data=subset(libs, Error == "0-No Error"), genes ~ Run, ylab="Number of genes detexted (aprox)", main="Comparison of gene detection by run.")
#t.test(data=subset(libs, Error == "0-No Error"), genes ~ Run)

Rarefaction (hanabi plot).

The purpose of these plots is to ease the comparison of the libraries given that they do not have exactly the same sequencing depth, and to evaluate if extra sequencing depth would be useful.

The data is rarefied with enough sampling points to give a smooth appearance to the curves. If it takes too much time, the output can be stored in an object saved on the hard drive. This way, if the commands have been run through knitr, the long computations here can be skipped if needed again in an interactive session.

For example:

rar1 <- hanabi(assay(l1), from = 0)
saveRDS(rar1, "rar1.rds")
# And later...
rar1 <- readRDS("rar1.rds")

Transcript discovery (with raw reads)

Would we detect more transcripts if we sequenced more raw reads ?

````r hanabiPlot( tapply(bed$score, bed$library, hanabi, from = 0) %>% structure(class = "hanabi") , ylab = 'number of molecules detected' , xlab = 'number of properly mapped reads' , main = paste("Transcript discovery for", LIBRARY) , GROUP = bed$library %>% levels %>% sub("_.*", "", .))

### Gene discovery

Would we detect more genes if we detected more transcripts ?

```r
hanabiPlot( hanabi(genes, from = 0)
          , ylab  = 'number of genes detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("Gene discovery for", LIBRARY)
          , GROUP = colData(l1)$group)

TSS discovery

Would we detect more TSS if we detected more transcripts ?

hanabiPlot( hanabi(assay(l1), from = 0)
          , ylab  = 'number of TSS detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("TSS discovery for", LIBRARY)
          , GROUP = colData(l1)$group)

Comparison between the runs

(only "No Error" cells)

#hanabiPlot(rarg[libs$Error=="0-No Error",], sampleSizeCounts, ylab='number of genes detected', xlab='number of unique molecule counts', main=paste("Gene discovery for", LIBRARY), GROUP=libs[libs$Error=="0-No Error", "Run"])

#legend('topleft',legend=levels(libs[libs$Error=="0-No Error", "Run"]), col=1:length(levels(libs[libs$Error=="0-No Error", "Run"])), pch=1)

Note that the horizontal axis is not the same in every plot:

For TSS or Gene discovery, the horizontal axis is unique molecule counts, that is, transcripts. The purpose of these plots is to show how easy it would be to detect more TSS or genes if we would sequence more transcripts.
For Transcript discovery, the horizontal axis is number of raw reads. The purpose of this plot is to show how possible it is to sequence more transcripts by increasing the number of raw reads.

Data load on Zenbu.

FIXME: will do later FIXME: make idempotent

Accessory functions

Uploading to Zenbu collaboration r ZENBU_COLLAB.

Ad-hoc wrapper to the shell command zenbu_upload

zenbuUpload <- function ( ...
                        , URL="http://fantom.gsc.riken.jp/zenbu"
                        , verbose=FALSE
                        , echo=FALSE
                        , stdout=TRUE) {
  zenbu <- 'zenbu_upload'
  url <- c('-url', URL)
  args <- sapply(c(url, ...), shQuote)
  if (verbose == TRUE) print(paste(c(zenbu, args), collapse=' '))
  if (echo    == FALSE) {
      system2(zenbu, args, stdout=stdout)
  } else {
      system2('echo', c(zenbu, args), stdout=stdout)
  }
}

Helper functions

bedToSample <- function(BED)
  BED %>%
    sub("RunA", RunA, .) %>%
    sub("RunB", RunB, .) %>%
    sub(".bed", "", .)

# To produce a string of keywords that will uniquely identify a sample.
# TODO: move keywords to the top.

sampleDescription <- function(BED)
  paste( MOIRAI_FRAGS
       , bedToSample(BED)
       , "what kind of data" # For innstance "CAGEscan_fragments"
       , "a keyword for easy retrieval" # For instance "KnitrUpload"
         # add more keywords as needed
       )

CSfragmentLocation <- function(BED)
  paste0(MOIRAI_FRAGS, "/", BED, ".bed")

CSuploadCommand <- function(BED)
  paste( CSfragmentLocation(BED)
       , bedToSample(BED)
       , sampleDescription(BED)
       , ""
       , sep = "\t")

levels(bed$library) %>%
  CSuploadCommand %>%
  writeLines(paste0(LIBRARY, ".zUpload.tsv"))

To upload CAGEscan fragments, only if they have not yet been uploaded.

zenbuUpload( "-assembly", ASSEMBLY
           , "-filelist", paste0(LIBRARY, ".zUpload.tsv")
           , "-collab_uuid", ZENBU_COLLAB
           , "-singletag_exp")

Functions to add metadata tags.

zenbuTag <- function (filter, key, value, mode='add', ...) {
  args <- c('-mdedit', filter, mode, key, value)
  zenbuUpload (args, ...)
}

tagError <- function(BED)
  zenbuTag( sampleDescription(BED)
          , 'cellomics'
          , libs[bedToSample(BED), "Error"] %>% as.character)

tagMeta <- function(BED, TAG)
  zenbuTag( sampleDescription(BED)
          , TAG
          , libs[bedToSample(BED), TAG] %>% as.character)

tagComment <- function (Sample, comment, ...) {
  filter <- paste(MOIRAI_FRAGS, Sample, "C1_CAGEscan_fragments", "KnitrUpload")
  zenbuTag(filter, 'cellomics_comment', comment, ...)
}

#apply(libs[,c('Well', 'Error')],   1, function(X) tagError  (X[1], X[2]))
#apply(libs[,c('Well', 'Comment')], 1, function(X) tagComment(X[1], X[2]))

#keywords need to be stringent: "D12"" matches BED12 !!!

Zenbu uploads and tagging

samplesToUpload <- subset(libs, properpairs > 0, "samplename", drop=T) %>%
  sub(RunA, "RunA", .) %>%
  sub(RunB, "RunB", .) %>%
  sub("$", ".bed", .)

sapply(samplesToUpload, tagError)
sapply(samplesToUpload, tagMeta, TAG="Group")
sapply(samplesToUpload, tagMeta, TAG="Run")
sapply(samplesToUpload, tagMeta, TAG="row")
sapply(samplesToUpload, tagMeta, TAG="column")

charles-plessy/smallCAGEqc documentation built on May 13, 2019, 3:31 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

charles-plessy/smallCAGEqc
ad-hoc QC functions for shallow-sequenced test CAGE libraries

In charles-plessy/smallCAGEqc: ad-hoc QC functions for shallow-sequenced test CAGE libraries

Data load and QC in R

MOIRAI metadata

Transcript counts (properly paired)

Raw reads per molecule (BED12 data)

CAGEr functions

Annotation with GENCODE

Gene symbols

Promoter, Exon, Intron, ...

Cell pictures

Controls

cDNA concentration.

Combined analysis of fluorescence and cDNA concentration.

Array heatmaps.

Live / dead stain and DNA concentration.

Richness

Analysis

QC barplots

Annotation

Correlation between runs

Gene counts and TSS discovery

Gene count by error code.

Gene count per transcript count.

Gene counts per C1 run.

Rarefaction (hanabi plot).

Transcript discovery (with raw reads)

TSS discovery

Comparison between the runs

Data load on Zenbu.

Accessory functions

Zenbu uploads and tagging

R Package Documentation

Browse R Packages

We want your feedback!

charles-plessy/smallCAGEqc ad-hoc QC functions for shallow-sequenced test CAGE libraries

In charles-plessy/smallCAGEqc: ad-hoc QC functions for shallow-sequenced test CAGE libraries

Data load and QC in R

MOIRAI metadata

Transcript counts (properly paired)

Raw reads per molecule (BED12 data)

CAGEr functions

Annotation with GENCODE

Gene symbols

Promoter, Exon, Intron, ...

Cell pictures

Controls

cDNA concentration.

Combined analysis of fluorescence and cDNA concentration.

Array heatmaps.

Live / dead stain and DNA concentration.

Richness

Analysis

QC barplots

Annotation

Correlation between runs

Gene counts and TSS discovery

Gene count by error code.

Gene count per transcript count.

Gene counts per C1 run.

Rarefaction (hanabi plot).

Transcript discovery (with raw reads)

TSS discovery

Comparison between the runs

Data load on Zenbu.

Accessory functions

Zenbu uploads and tagging

R Package Documentation

Browse R Packages

We want your feedback!

charles-plessy/smallCAGEqc
ad-hoc QC functions for shallow-sequenced test CAGE libraries