knitr::opts_chunk$set(cache  = TRUE)
knitr::opts_knit$set(verbose = TRUE)
# knitr::opts_chunk$set(dev=c('svg', 'png'))
options(width = 100)
ggplot2::theme_set(ggplot2::theme_bw())

Metadata that varies from run to run:

LIBRARY      <- "<Insert library ID>"  # For example, the sequencing run ID.
ZENBU_COLLAB <- "<Insert Zenbu collaboration ID>"
WORKFLOW     <- "<Insert workflow ID>" # For example "OP-WORKFLOW-CAGEscan-short-reads-v2.0"
MOIRAI_STAMP <- "<Insert Moirai run stamp>" # For example "20150803121351"
MOIRAI_PROJ  <- "<Insert Moirai user name>" # For example "Charles"
MOIRAI_USER  <- "<Insert Moirai user name>" # For example "nano-fluidigm"
ASSEMBLY     <- "<Insert name of genome assembly>" # For example "hg38"
BS_GENOME    <- "<Insert name of a BSGenome>" # For example "BSgenome.Hsapiens.UCSC.hg38"

# # In case of a pair of Fludigm C1 CAGE runs, uncomment the lines below and fill
# # insert missing information.  Otherwise, delete or leave as it is.
# RunA  <- "<Insert ID of Run A>"
# RunB  <- "<Insert ID of Run B>"
# ctrls <- list( RunA=list(posi="<Insert well ID>", nega="<Insert well ID>")
#              , RunB=list(posi="<Insert well ID>", nega="<Insert well ID>"))

Metadata inferred from the above:

BASEDIR      <- "/osc-fs_home/scratch/moirai"
MOIRAI_RUN   <- paste(LIBRARY, WORKFLOW, MOIRAI_STAMP, sep=".")
MOIRAI_BASE  <- file.path(BASEDIR, MOIRAI_USER, "project", MOIRAI_PROJ, MOIRAI_RUN)
MOIRAI_FRAGS <- file.path(MOIRAI_BASE, "CAGEscan_fragments")
MOIRAI_BAM   <- file.path(MOIRAI_BASE, "genome_mapped")
MOIRAI_URL   <- paste0("http://moirai.gsc.riken.jp/", MOIRAI_BASE, "/", MOIRAI_RUN, ".html")

Jump directly to the Analysis section if you are not interested in the details of the data processing.

Data load and QC in R

library("AnnotationHub")
library(BS_GENOME, character.only = T)
library("CAGEr")
library("data.table")
library("ggplot2")
library("magrittr")
library("oscR")
library("plyr")
library("smallCAGEqc")
stopifnot(
    packageVersion("oscR") >= "0.2.0"
  , packageVersion("smallCAGEqc") >= "1"
)
library("SummarizedExperiment")
library("reshape")
library("vegan")

MOIRAI metadata

libs <- loadMoiraiStats(processed_data = MOIRAI_BASE)
# Pipe (%>%) to llPostProcess("nano-fluidigm") or
# llPostProcess("C1 96") if appropriate; see help(llPostProcess).
libs %>% summary

Intranet link: r MOIRAI_RUN

Transcript counts (properly paired)

pathsToInputFiles <- list.files(MOIRAI_FRAGS, "*.bed", full.names = TRUE) %>%
  Filter(f = function(file) file.size(file) > 0)
samplenames <- basename(pathsToInputFiles) %>% sub(".bed", "", .)
myCAGEset <- new( "CAGEset"
                , genomeName = BS_GENOME
                , inputFiles = pathsToInputFiles
                , inputFilesType = "bed"
                , sampleLabels = samplenames)
getCTSS(myCAGEset)
normalizeTagCount(myCAGEset, method = "simpleTpm")

Raw reads per molecule (BED12 data)

bed <- loadBED12(pathsToInputFiles)
levels(bed$library) <- samplenames

CAGEr functions

Here are functions for making CTSS tables, that I hope can be merged in CAGEr. https://github.com/Bioconductor-mirror/CAGEr/pull/2

CTSScoordinatesGR <- function (object){
        ctssCoord <- object@CTSScoordinates
        ctssCoord <- GRanges( ctssCoord$chr
                            , IRanges(ctssCoord$pos, ctssCoord$pos)
                            , ctssCoord$strand)
        genome(ctssCoord) <- object@genomeName
        ctssCoord
}

CTSStagCountSE <- function (object){
        colData <- data.frame( row.names   = object@sampleLabels
                             , samplename  = object@sampleLabels
                             , samplecolor = names(object@sampleLabels))
        SummarizedExperiment( assays  = as.matrix(object@tagCountMatrix)
                            , rowData = CTSScoordinatesGR(object)
                            , colData = colData)
}
l1  <- CTSStagCountSE(myCAGEset)
colnames(l1) <- samplenames
colData(l1) <- libs %>% DataFrame
l1$counts <- colSums(assay(l1))
l1$l1     <- colSums(assay(l1) > 0)

Annotation with GENCODE

Gene symbols

ah <- AnnotationHub()
query(ah, c("Gencode", "gff", "human"))
ah["AH49556"]
gff <- ah[["AH49556"]]
gff

rowRanges(l1)$genes  <- ranges2genes(rowRanges(l1), gff)
genes <- rowsum(assay(l1), rowRanges(l1)$genes) %>% data.frame
l1$genes <- colSums(genes > 0)
l1$geneSymbols <- countSymbols(genes)

Promoter, Exon, Intron, ...

rowRanges(l1)$annotation <- ranges2annot(rowRanges(l1), gff)

colData(l1) %<>% cbind(rowsum(assay(l1), rowRanges(l1)$annotation) %>% t %>% DataFrame)

Cell pictures

# read.fluo <- function(RUN) read.delim( paste0("../imageJ/", RUN, ".txt")
#                                      , row.names="cell_id"
#                                      , stringsAsFactors = FALSE)
# fluo <- rbind(read.fluo(RunA), read.fluo(RunB))
# libs <- cbind(libs, subset(fluo,,c("mean_ch2", "bg_mean_ch2", "mean_ch3", "bg_mean_ch3", "Error", "Comment")))
# libs$Error <- factor( libs$Error
#                     , levels=c("0-No Error", "1-No cell", "2-Debris", "3-OutOfFocus", "4-MultipleCells", "5-Control", "6-Dead"))

A hardocoded treshold of 2.5 is used to identify dead cells. The histogram below is to check if this value makes sense in this dataset.

# hist( libs$mean_ch3 - libs$bg_mean_ch3
#     , br   = 100
#     , main = "Current threshold for identifying dead cells"
#     , xlab = "mean_ch3 - bg_mean_ch3" )
# deadThresh <- 2.5
# abline(v=deadThresh, col="red")
# 
# libs[libs$mean_ch3 - libs$bg_mean_ch3 > deadThresh, "Error"] <- "6-Dead"

Controls

Some samples with errors were repalced by the positive and negative controls.

l1$Error[l1$Well %in% ctrls[["RunA"]] & l1$Run == RunA] <- "5-Control"
l1$Error[l1$Well %in% ctrls[["RunB"]] & l1$Run == RunB] <- "5-Control"
l1$Comment[l1$Well == ctrls$RunA$posi & l1$Run == RunA] <- "Positive control"
l1$Comment[l1$Well == ctrls$RunB$posi & l1$Run == RunB] <- "Positive control"
l1$Comment[l1$Well == ctrls$RunA$nega & l1$Run == RunA] <- "Negative control"
l1$Comment[l1$Well == ctrls$RunB$nega & l1$Run == RunB] <- "Negative control"

cDNA concentration.

read.pg <- function(RUN)
  paste0("../cDNA_yields/", RUN, ".picogreen.xlsx") %>%
    fldgmPicoGreen("PN 100-6160") %>%
    extract(,"concentration")
l1$Concentration <- c(read.pg(RunA), read.pg(RunB))
fldgmConcentrationPlot(colData(l1))

Combined analysis of fluorescence and cDNA concentration.

Array heatmaps.

# fldgmArrayQCplot <- function(RUN) fldgmArrayQC(libs[libs$Run==RUN,], RUN)
# fldgmArrayQCplot(RunA)
# fldgmArrayQCplot(RunB)

Live / dead stain and DNA concentration.

Both runs are plotted together; this explains why there may be two groups, when cDNA yields differed strongly.

# with(libs, plotNameColor(Concentration, mean_ch2 - bg_mean_ch2, Error, Well))
# with(libs, plotNameColor(Concentration, mean_ch3 - bg_mean_ch3, Error, Well))

Richness

l1$r100l1 <- rarefy(t(assay(l1)),100)
l1$r100l1[l1$counts < 100] <- NA

Analysis

# with(libs, plotNameColor(Concentration, r100l1, Error, Well))
# with(subset(libs, Error == "0-No Error"), plotNameColor(Concentration, r100l1, Run, Well))

QC barplots

plotAnnot(colData(l1), 'qc', LIBRARY, l1$Group)

Annotation

plotAnnot(colData(l1), 'counts', LIBRARY, l1$Group)

Correlation between runs

NMF::aheatmap( cor(genes[-1, ])
             , annCol=list(Run=l1$Run))
# Uncomment if you have a Error column
#NMF::aheatmap( cor(genes[-1, libs$Error == "0-No Error"])
#             , annCol=list(Run=libs[libs$Error == "0-No Error", "Run"]))

Gene counts and TSS discovery

Gene count by error code.

dotsize <- 50
ggplot(colData(l1) %>% data.frame, aes(x=Group, y=genes)) +
  stat_summary(fun.y=mean, fun.ymin=mean, fun.ymax=mean, geom="crossbar", color="gray") +
  geom_dotplot(aes(fill=Group), binaxis='y', binwidth=1, dotsize=dotsize, stackdir='center') +
  coord_flip() + facet_wrap(~Run)

Gene count per transcript count.

with(libs, plotNameColor(genes, counts, Run, Well))

Gene counts per C1 run.

#boxplot(data=subset(libs, Error == "0-No Error"), genes ~ Run, ylab="Number of genes detexted (aprox)", main="Comparison of gene detection by run.")
#t.test(data=subset(libs, Error == "0-No Error"), genes ~ Run)

Rarefaction (hanabi plot).

The purpose of these plots is to ease the comparison of the libraries given that they do not have exactly the same sequencing depth, and to evaluate if extra sequencing depth would be useful.

The data is rarefied with enough sampling points to give a smooth appearance to the curves. If it takes too much time, the output can be stored in an object saved on the hard drive. This way, if the commands have been run through knitr, the long computations here can be skipped if needed again in an interactive session.

For example:

rar1 <- hanabi(assay(l1), from = 0)
saveRDS(rar1, "rar1.rds")
# And later...
rar1 <- readRDS("rar1.rds")

Transcript discovery (with raw reads)

Would we detect more transcripts if we sequenced more raw reads ?

````r hanabiPlot( tapply(bed$score, bed$library, hanabi, from = 0) %>% structure(class = "hanabi") , ylab = 'number of molecules detected' , xlab = 'number of properly mapped reads' , main = paste("Transcript discovery for", LIBRARY) , GROUP = bed$library %>% levels %>% sub("_.*", "", .))

### Gene discovery

Would we detect more genes if we detected more transcripts ?

```r
hanabiPlot( hanabi(genes, from = 0)
          , ylab  = 'number of genes detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("Gene discovery for", LIBRARY)
          , GROUP = colData(l1)$group)

TSS discovery

Would we detect more TSS if we detected more transcripts ?

hanabiPlot( hanabi(assay(l1), from = 0)
          , ylab  = 'number of TSS detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("TSS discovery for", LIBRARY)
          , GROUP = colData(l1)$group)

Comparison between the runs

(only "No Error" cells)

#hanabiPlot(rarg[libs$Error=="0-No Error",], sampleSizeCounts, ylab='number of genes detected', xlab='number of unique molecule counts', main=paste("Gene discovery for", LIBRARY), GROUP=libs[libs$Error=="0-No Error", "Run"])

#legend('topleft',legend=levels(libs[libs$Error=="0-No Error", "Run"]), col=1:length(levels(libs[libs$Error=="0-No Error", "Run"])), pch=1)

Note that the horizontal axis is not the same in every plot:

Data load on Zenbu.

FIXME: will do later FIXME: make idempotent

Accessory functions

Uploading to Zenbu collaboration r ZENBU_COLLAB.

Ad-hoc wrapper to the shell command zenbu_upload

zenbuUpload <- function ( ...
                        , URL="http://fantom.gsc.riken.jp/zenbu"
                        , verbose=FALSE
                        , echo=FALSE
                        , stdout=TRUE) {
  zenbu <- 'zenbu_upload'
  url <- c('-url', URL)
  args <- sapply(c(url, ...), shQuote)
  if (verbose == TRUE) print(paste(c(zenbu, args), collapse=' '))
  if (echo    == FALSE) {
      system2(zenbu, args, stdout=stdout)
  } else {
      system2('echo', c(zenbu, args), stdout=stdout)
  }
}

Helper functions

bedToSample <- function(BED)
  BED %>%
    sub("RunA", RunA, .) %>%
    sub("RunB", RunB, .) %>%
    sub(".bed", "", .)

# To produce a string of keywords that will uniquely identify a sample.
# TODO: move keywords to the top.

sampleDescription <- function(BED)
  paste( MOIRAI_FRAGS
       , bedToSample(BED)
       , "what kind of data" # For innstance "CAGEscan_fragments"
       , "a keyword for easy retrieval" # For instance "KnitrUpload"
         # add more keywords as needed
       )

CSfragmentLocation <- function(BED)
  paste0(MOIRAI_FRAGS, "/", BED, ".bed")

CSuploadCommand <- function(BED)
  paste( CSfragmentLocation(BED)
       , bedToSample(BED)
       , sampleDescription(BED)
       , ""
       , sep = "\t")
levels(bed$library) %>%
  CSuploadCommand %>%
  writeLines(paste0(LIBRARY, ".zUpload.tsv"))

To upload CAGEscan fragments, only if they have not yet been uploaded.

zenbuUpload( "-assembly", ASSEMBLY
           , "-filelist", paste0(LIBRARY, ".zUpload.tsv")
           , "-collab_uuid", ZENBU_COLLAB
           , "-singletag_exp")

Functions to add metadata tags.

zenbuTag <- function (filter, key, value, mode='add', ...) {
  args <- c('-mdedit', filter, mode, key, value)
  zenbuUpload (args, ...)
}

tagError <- function(BED)
  zenbuTag( sampleDescription(BED)
          , 'cellomics'
          , libs[bedToSample(BED), "Error"] %>% as.character)

tagMeta <- function(BED, TAG)
  zenbuTag( sampleDescription(BED)
          , TAG
          , libs[bedToSample(BED), TAG] %>% as.character)

tagComment <- function (Sample, comment, ...) {
  filter <- paste(MOIRAI_FRAGS, Sample, "C1_CAGEscan_fragments", "KnitrUpload")
  zenbuTag(filter, 'cellomics_comment', comment, ...)
}

#apply(libs[,c('Well', 'Error')],   1, function(X) tagError  (X[1], X[2]))
#apply(libs[,c('Well', 'Comment')], 1, function(X) tagComment(X[1], X[2]))

#keywords need to be stringent: "D12"" matches BED12 !!!

Zenbu uploads and tagging

samplesToUpload <- subset(libs, properpairs > 0, "samplename", drop=T) %>%
  sub(RunA, "RunA", .) %>%
  sub(RunB, "RunB", .) %>%
  sub("$", ".bed", .)

sapply(samplesToUpload, tagError)
sapply(samplesToUpload, tagMeta, TAG="Group")
sapply(samplesToUpload, tagMeta, TAG="Run")
sapply(samplesToUpload, tagMeta, TAG="row")
sapply(samplesToUpload, tagMeta, TAG="column")


charles-plessy/smallCAGEqc documentation built on May 13, 2019, 3:31 p.m.