```r
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_knit$set(verbose = TRUE)
# knitr::opts_chunk$set(dev = c('svg', 'png'))
options(width = 100)
ggplot2::theme_set(ggplot2::theme_bw())
```
Metadata that varies from run to run:
```r
LIBRARY      <- "<Insert library ID>"               # For example, the sequencing run ID.
ZENBU_COLLAB <- "<Insert Zenbu collaboration ID>"
WORKFLOW     <- "<Insert workflow ID>"              # For example "OP-WORKFLOW-CAGEscan-short-reads-v2.0"
MOIRAI_STAMP <- "<Insert Moirai run stamp>"         # For example "20150803121351"
MOIRAI_PROJ  <- "<Insert Moirai project name>"      # For example "Charles"
MOIRAI_USER  <- "<Insert Moirai user name>"         # For example "nano-fluidigm"
ASSEMBLY     <- "<Insert name of genome assembly>"  # For example "hg38"
BS_GENOME    <- "<Insert name of a BSgenome>"       # For example "BSgenome.Hsapiens.UCSC.hg38"

# # In case of a pair of Fluidigm C1 CAGE runs, uncomment the lines below and
# # fill in the missing information.  Otherwise, delete them or leave as is.
# RunA  <- "<Insert ID of Run A>"
# RunB  <- "<Insert ID of Run B>"
# ctrls <- list( RunA = list(posi = "<Insert well ID>", nega = "<Insert well ID>")
#              , RunB = list(posi = "<Insert well ID>", nega = "<Insert well ID>"))
```
Metadata inferred from the above:
```r
BASEDIR      <- "/osc-fs_home/scratch/moirai"
MOIRAI_RUN   <- paste(LIBRARY, WORKFLOW, MOIRAI_STAMP, sep = ".")
MOIRAI_BASE  <- file.path(BASEDIR, MOIRAI_USER, "project", MOIRAI_PROJ, MOIRAI_RUN)
MOIRAI_FRAGS <- file.path(MOIRAI_BASE, "CAGEscan_fragments")
MOIRAI_BAM   <- file.path(MOIRAI_BASE, "genome_mapped")
MOIRAI_URL   <- paste0("http://moirai.gsc.riken.jp/", MOIRAI_BASE, "/", MOIRAI_RUN, ".html")
```
`r LIBRARY`
Jump directly to the Analysis section if you are not interested in the details of the data processing.
library("AnnotationHub") library(BS_GENOME, character.only = T) library("CAGEr") library("data.table") library("ggplot2") library("magrittr") library("oscR") library("plyr") library("smallCAGEqc") stopifnot( packageVersion("oscR") >= "0.2.0" , packageVersion("smallCAGEqc") >= "1" ) library("SummarizedExperiment") library("reshape") library("vegan")
```r
libs <- loadMoiraiStats(processed_data = MOIRAI_BASE)
# Pipe (%>%) to llPostProcess("nano-fluidigm") or
# llPostProcess("C1 96") if appropriate; see help(llPostProcess).
libs %>% summary
```
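For example, if this library comes from the nano-fluidigm workflow, the statistics can be post-processed directly in the pipe (a sketch based on the comment above; use `"C1 96"` instead if that matches the run):

```r
# Sketch: load and post-process the Moirai statistics in one pipe
# (assumes a nano-fluidigm run; adjust the argument to the run type).
libs <- loadMoiraiStats(processed_data = MOIRAI_BASE) %>%
  llPostProcess("nano-fluidigm")
```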
Intranet link: [`r MOIRAI_RUN`](`r MOIRAI_URL`)
```r
pathsToInputFiles <- list.files(MOIRAI_FRAGS, "*.bed", full.names = TRUE) %>%
  Filter(f = function(file) file.size(file) > 0)
samplenames <- basename(pathsToInputFiles) %>% sub(".bed", "", .)
myCAGEset <- new( "CAGEset"
                , genomeName     = BS_GENOME
                , inputFiles     = pathsToInputFiles
                , inputFilesType = "bed"
                , sampleLabels   = samplenames)
getCTSS(myCAGEset)
normalizeTagCount(myCAGEset, method = "simpleTpm")
```
```r
bed <- loadBED12(pathsToInputFiles)
levels(bed$library) <- samplenames
```
Here are functions for making CTSS tables, which I hope can be merged into CAGEr (see https://github.com/Bioconductor-mirror/CAGEr/pull/2).
```r
CTSScoordinatesGR <- function (object) {
  ctssCoord <- object@CTSScoordinates
  ctssCoord <- GRanges( ctssCoord$chr
                      , IRanges(ctssCoord$pos, ctssCoord$pos)
                      , ctssCoord$strand)
  genome(ctssCoord) <- object@genomeName
  ctssCoord
}

CTSStagCountSE <- function (object) {
  colData <- data.frame( row.names   = object@sampleLabels
                       , samplename  = object@sampleLabels
                       , samplecolor = names(object@sampleLabels))
  SummarizedExperiment( assays  = as.matrix(object@tagCountMatrix)
                      , rowData = CTSScoordinatesGR(object)
                      , colData = colData)
}
```
```r
l1 <- CTSStagCountSE(myCAGEset)
colnames(l1) <- samplenames
colData(l1)  <- libs %>% DataFrame
l1$counts    <- colSums(assay(l1))
l1$l1        <- colSums(assay(l1) > 0)
```
```r
ah <- AnnotationHub()
query(ah, c("Gencode", "gff", "human"))
ah["AH49556"]
gff <- ah[["AH49556"]]
gff
rowRanges(l1)$genes <- ranges2genes(rowRanges(l1), gff)
genes <- rowsum(assay(l1), rowRanges(l1)$genes) %>% data.frame
l1$genes       <- colSums(genes > 0)
l1$geneSymbols <- countSymbols(genes)
```
```r
rowRanges(l1)$annotation <- ranges2annot(rowRanges(l1), gff)
colData(l1) %<>% cbind(rowsum(assay(l1), rowRanges(l1)$annotation) %>% t %>% DataFrame)
```
```r
# read.fluo <- function(RUN) read.delim( paste0("../imageJ/", RUN, ".txt")
#                                      , row.names = "cell_id"
#                                      , stringsAsFactors = FALSE)
# fluo <- rbind(read.fluo(RunA), read.fluo(RunB))
# libs <- cbind(libs, subset(fluo, , c("mean_ch2", "bg_mean_ch2", "mean_ch3", "bg_mean_ch3", "Error", "Comment")))
# libs$Error <- factor( libs$Error
#                     , levels = c( "0-No Error", "1-No cell", "2-Debris", "3-OutOfFocus"
#                                 , "4-MultipleCells", "5-Control", "6-Dead"))
```
A hardcoded threshold of 2.5 is used to identify dead cells. The histogram below checks whether this value makes sense for this dataset.
```r
# hist( libs$mean_ch3 - libs$bg_mean_ch3
#     , br   = 100
#     , main = "Current threshold for identifying dead cells"
#     , xlab = "mean_ch3 - bg_mean_ch3")
# deadThresh <- 2.5
# abline(v = deadThresh, col = "red")
#
# libs[libs$mean_ch3 - libs$bg_mean_ch3 > deadThresh, "Error"] <- "6-Dead"
```
Some samples with errors were replaced by the positive and negative controls.
```r
l1$Error[l1$Well %in% ctrls[["RunA"]] & l1$Run == RunA]  <- "5-Control"
l1$Error[l1$Well %in% ctrls[["RunB"]] & l1$Run == RunB]  <- "5-Control"
l1$Comment[l1$Well == ctrls$RunA$posi & l1$Run == RunA]  <- "Positive control"
l1$Comment[l1$Well == ctrls$RunB$posi & l1$Run == RunB]  <- "Positive control"
l1$Comment[l1$Well == ctrls$RunA$nega & l1$Run == RunA]  <- "Negative control"
l1$Comment[l1$Well == ctrls$RunB$nega & l1$Run == RunB]  <- "Negative control"
```
```r
read.pg <- function(RUN)
  paste0("../cDNA_yields/", RUN, ".picogreen.xlsx") %>%
    fldgmPicoGreen("PN 100-6160") %>%
    extract(, "concentration")
l1$Concentration <- c(read.pg(RunA), read.pg(RunB))
fldgmConcentrationPlot(colData(l1))
```
# fldgmArrayQCplot <- function(RUN) fldgmArrayQC(libs[libs$Run==RUN,], RUN)
# fldgmArrayQCplot(RunA)
# fldgmArrayQCplot(RunB)
Both runs are plotted together, which explains why two groups may appear when the cDNA yields differed strongly between the runs.
```r
# with(libs, plotNameColor(Concentration, mean_ch2 - bg_mean_ch2, Error, Well))
# with(libs, plotNameColor(Concentration, mean_ch3 - bg_mean_ch3, Error, Well))
```
```r
l1$r100l1 <- rarefy(t(assay(l1)), 100)
l1$r100l1[l1$counts < 100] <- NA
```
# with(libs, plotNameColor(Concentration, r100l1, Error, Well))
# with(subset(libs, Error == "0-No Error"), plotNameColor(Concentration, r100l1, Run, Well))
plotAnnot(colData(l1), 'qc', LIBRARY, l1$Group)
plotAnnot(colData(l1), 'counts', LIBRARY, l1$Group)
NMF::aheatmap( cor(genes[-1, ]) , annCol=list(Run=l1$Run))
```r
# Uncomment if you have an Error column.
# NMF::aheatmap( cor(genes[-1, libs$Error == "0-No Error"])
#              , annCol = list(Run = libs[libs$Error == "0-No Error", "Run"]))
```
```r
dotsize <- 50
ggplot(colData(l1) %>% data.frame, aes(x = Group, y = genes)) +
  stat_summary(fun.y = mean, fun.ymin = mean, fun.ymax = mean, geom = "crossbar", color = "gray") +
  geom_dotplot(aes(fill = Group), binaxis = 'y', binwidth = 1, dotsize = dotsize, stackdir = 'center') +
  coord_flip() +
  facet_wrap(~Run)
```
with(libs, plotNameColor(genes, counts, Run, Well))
```r
# boxplot( data = subset(libs, Error == "0-No Error"), genes ~ Run
#        , ylab = "Number of genes detected (approx.)"
#        , main = "Comparison of gene detection by run.")
# t.test(data = subset(libs, Error == "0-No Error"), genes ~ Run)
```
The purpose of these plots is to ease the comparison of the libraries, given that they do not have exactly the same sequencing depth, and to evaluate whether extra sequencing depth would be useful.
The data are rarefied with enough sampling points to give the curves a smooth appearance. If this takes too much time, the output can be saved to a file on disk; that way, once the commands have been run through knitr, the long computations can be skipped when the results are needed again in an interactive session.
For example:
```r
rar1 <- hanabi(assay(l1), from = 0)
saveRDS(rar1, "rar1.rds")
# And later...
rar1 <- readRDS("rar1.rds")
```
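A minimal sketch of such a cache, assuming the results are kept in an arbitrarily named file `rar1.rds` next to the report:

```r
# Recompute the rarefaction curves only when no cached copy exists on disk.
if (file.exists("rar1.rds")) {
  rar1 <- readRDS("rar1.rds")
} else {
  rar1 <- hanabi(assay(l1), from = 0)
  saveRDS(rar1, "rar1.rds")
}
```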
### Transcript discovery

Would we detect more transcripts if we sequenced more raw reads?
```r
hanabiPlot( tapply(bed$score, bed$library, hanabi, from = 0) %>%
              structure(class = "hanabi")
          , ylab  = 'number of molecules detected'
          , xlab  = 'number of properly mapped reads'
          , main  = paste("Transcript discovery for", LIBRARY)
          , GROUP = bed$library %>% levels %>% sub("_.*", "", .))
```
### Gene discovery

Would we detect more genes if we detected more transcripts?

```r
hanabiPlot( hanabi(genes, from = 0)
          , ylab  = 'number of genes detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("Gene discovery for", LIBRARY)
          , GROUP = colData(l1)$group)
```
### TSS discovery

Would we detect more TSS if we detected more transcripts?
```r
hanabiPlot( hanabi(assay(l1), from = 0)
          , ylab  = 'number of TSS detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("TSS discovery for", LIBRARY)
          , GROUP = colData(l1)$group)
```
(only "No Error" cells)
```r
# hanabiPlot( rarg[libs$Error == "0-No Error", ], sampleSizeCounts
#           , ylab  = 'number of genes detected'
#           , xlab  = 'number of unique molecule counts'
#           , main  = paste("Gene discovery for", LIBRARY)
#           , GROUP = libs[libs$Error == "0-No Error", "Run"])
# legend( 'topleft'
#       , legend = levels(libs[libs$Error == "0-No Error", "Run"])
#       , col    = 1:length(levels(libs[libs$Error == "0-No Error", "Run"]))
#       , pch    = 1)
```
Note that the horizontal axis is not the same in every plot:
For TSS or Gene discovery, the horizontal axis is the number of unique molecules counted, that is, transcripts. The purpose of these plots is to show how easily more TSS or genes would be detected if more transcripts were sequenced.

For Transcript discovery, the horizontal axis is the number of raw reads. The purpose of this plot is to show how feasible it is to sequence more transcripts by increasing the number of raw reads.
FIXME: will do later.
FIXME: make idempotent.
Uploading to the Zenbu collaboration `r ZENBU_COLLAB`.
An ad hoc wrapper for the shell command `zenbu_upload`:
```r
zenbuUpload <- function( ...
                       , URL     = "http://fantom.gsc.riken.jp/zenbu"
                       , verbose = FALSE
                       , echo    = FALSE
                       , stdout  = TRUE) {
  zenbu <- 'zenbu_upload'
  url   <- c('-url', URL)
  args  <- sapply(c(url, ...), shQuote)
  if (verbose == TRUE)
    print(paste(c(zenbu, args), collapse = ' '))
  if (echo == FALSE) {
    system2(zenbu, args, stdout = stdout)
  } else {
    system2('echo', c(zenbu, args), stdout = stdout)
  }
}
```
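The wrapper can be tested without contacting the server by setting `echo = TRUE`, which prints the command line instead of running it; for instance, to preview the upload command used further below:

```r
# Dry run: print the zenbu_upload command line instead of executing it.
zenbuUpload( "-assembly", ASSEMBLY
           , "-filelist", paste0(LIBRARY, ".zUpload.tsv")
           , "-collab_uuid", ZENBU_COLLAB
           , "-singletag_exp"
           , echo = TRUE)
```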
Helper functions
```r
bedToSample <- function(BED)
  BED %>% sub("RunA", RunA, .) %>% sub("RunB", RunB, .) %>% sub(".bed", "", .)

# To produce a string of keywords that will uniquely identify a sample.
# TODO: move keywords to the top.
sampleDescription <- function(BED)
  paste( MOIRAI_FRAGS
       , bedToSample(BED)
       , "what kind of data"             # For instance "CAGEscan_fragments"
       , "a keyword for easy retrieval"  # For instance "KnitrUpload"
       # add more keywords as needed
       )

CSfragmentLocation <- function(BED) paste0(MOIRAI_FRAGS, "/", BED, ".bed")

CSuploadCommand <- function(BED)
  paste( CSfragmentLocation(BED)
       , bedToSample(BED)
       , sampleDescription(BED)
       , ""
       , sep = "\t")
```
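As a sanity check, the upload line generated for a single sample can be inspected before writing the whole file list (a sketch; `"RunA_A01"` is a hypothetical sample name used only for illustration):

```r
# Print the tab-separated upload line that would be generated for one sample.
CSuploadCommand("RunA_A01") %>% writeLines
```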
```r
levels(bed$library) %>% CSuploadCommand %>% writeLines(paste0(LIBRARY, ".zUpload.tsv"))
```
Upload the CAGEscan fragments, but only if they have not yet been uploaded.
zenbuUpload( "-assembly", ASSEMBLY , "-filelist", paste0(LIBRARY, ".zUpload.tsv") , "-collab_uuid", ZENBU_COLLAB , "-singletag_exp")
Functions to add metadata tags.
```r
zenbuTag <- function (filter, key, value, mode = 'add', ...) {
  args <- c('-mdedit', filter, mode, key, value)
  zenbuUpload(args, ...)
}

tagError <- function(BED)
  zenbuTag( sampleDescription(BED)
          , 'cellomics'
          , libs[bedToSample(BED), "Error"] %>% as.character)

tagMeta <- function(BED, TAG)
  zenbuTag( sampleDescription(BED)
          , TAG
          , libs[bedToSample(BED), TAG] %>% as.character)

tagComment <- function (Sample, comment, ...) {
  filter <- paste(MOIRAI_FRAGS, Sample, "C1_CAGEscan_fragments", "KnitrUpload")
  zenbuTag(filter, 'cellomics_comment', comment, ...)
}

# apply(libs[, c('Well', 'Error')],   1, function(X) tagError (X[1], X[2]))
# apply(libs[, c('Well', 'Comment')], 1, function(X) tagComment(X[1], X[2]))
# Keywords need to be stringent: "D12" also matches "BED12"!
```
```r
samplesToUpload <- subset(libs, properpairs > 0, "samplename", drop = TRUE) %>%
  sub(RunA, "RunA", .) %>%
  sub(RunB, "RunB", .) %>%
  sub("$", ".bed", .)
sapply(samplesToUpload, tagError)
sapply(samplesToUpload, tagMeta, TAG = "Group")
sapply(samplesToUpload, tagMeta, TAG = "Run")
sapply(samplesToUpload, tagMeta, TAG = "row")
sapply(samplesToUpload, tagMeta, TAG = "column")
```