knitr::opts_chunk$set(echo = TRUE) require(knitr) require(kableExtra) require(plotly) require(seqLogo) require(ezRun) output <- readRDS("output.rds") param <- readRDS("param.rds") input <- readRDS("input.rds") rawScreenResult <- readRDS("rawScreenResult.rds") procScreenResult <- readRDS("procScreenResult.rds") virusResult <- readRDS("virusResult.rds") if(file.exists('refseqResult.rds')){ refseqResult = readRDS("refseqResult.rds") } else { refseqResult = NULL } krakenResult <- readRDS("krakenResult.rds") rRNAstrandResult <- readRDS("rRNAstrandResult.rds") PWMresult <- readRDS("PWMs.rds") dataset = input$meta samples = input$getNames() debug <- FALSE
Started on r format(Sys.time(), "%Y-%m-%d %H:%M:%S")
The raw reads are checked for adapter content. A local alignment using Bowtie2 is performed. Reads occuring from adapter dimers and adapter contamination at the 3' end can be discovered.
m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) # more margin space percentMapped = sapply(rawScreenResult, function(x){100 - x$"%Unmapped"}) # MappingRateAdapters p <- plot_ly(x=samples, y=percentMapped, type="bar") %>% layout(title = "MappingRate to Adapters - Bowtie2, local Alignment", yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m) p
A Bowtie2 end-to-end alignment is performed to estimate the overall rRNA content of the libraries. SILVA is used as the reference database. The read orientation can be used to check if the sequenced libraries preserve the strand information.
# rRNA Mapping rRnaPercentMapped = rRNAstrandResult[ , c("Sense", "Antisense")] / rRNAstrandResult$`Read Count` * 100 rRnaPercentMapped$sample = rownames(rRnaPercentMapped) p <- plot_ly(rRnaPercentMapped, x=~sample, y=~Sense, type="bar", name="sense") %>% add_trace(y = ~Antisense, name = "antisense") %>% layout(title = "rRNA Silva Mapping - Bowtie2, end-to-end Alignment", yaxis = list(title = "rRNA-Mapping-Rate in %"), xaxis = list(title = ""), barmode = 'stack', margin = m) p
Each sample is mapped against multiple references using FastqScreen. This includes the most common model organisms, ribosomal RNA references (large and small subunit) based on SILVA on several taxonomic levels and typical biological and technical contaminants in NGS libraries (tRNAs, PhIX, Mycoplasma, UniVec).
for (nm in samples){ par(mar=c(10.1, 4.1, 4.1, 2.1)) x = procScreenResult[[nm]][ , c("%One_hit_one_genome", "%Multiple_hits_one_genome", "%One_hit_multiple_genomes", "%Multiple_hits_multiple_genomes")] if (nrow(x) > 0){ x = x[ , c("%One_hit_one_genome", "%Multiple_hits_one_genome", "%One_hit_multiple_genomes", "%Multiple_hits_multiple_genomes")] bplt = barplot(t(x), las=2, ylim=c(0,100), legend.text=T, ylab="Mapped Reads in %", main=nm, names.arg=rep('', nrow(x))) text(x = bplt, y = par("usr")[3] - 2, srt = 45, adj = 1, labels = rownames(x), xpd = TRUE) } else { plot(1,1, type="n", axes=FALSE, main=nm, xlab="", ylab="", frame=TRUE) text(1,1, "no hits found") } }
A mapping against all mRNAs in RefSeq can be useful to estimate the fraction reads which is specific for exonic regions. Furthermore contaminations with other species e.g. due to xenograft experiments can be discovered.
refseqSamples <- names(refseqResult) names(refseqSamples) <- refseqSamples if(length(refseqSamples) > 1){ readFractionPerSample <- data.frame(sampleName = rep(refseqSamples[1], nrow(refseqResult[[refseqSamples[1]]])), species = rownames(refseqResult[[refseqSamples[1]]]), UniqueSpeciesPercent = refseqResult[[refseqSamples[1]]]$UniqueSpeciesPercent, MultipleSpeciesPercent = refseqResult[[refseqSamples[1]]]$MultipleSpeciesPercent ) for (j in 2:length(refseqSamples)){ tmp <- data.frame(sampleName = rep(refseqSamples[j], nrow(refseqResult[[refseqSamples[j]]])), species = rownames(refseqResult[[refseqSamples[j]]]), UniqueSpeciesPercent = refseqResult[[refseqSamples[j]]]$UniqueSpeciesPercent, MultipleSpeciesPercent = refseqResult[[refseqSamples[j]]]$MultipleSpeciesPercent) readFractionPerSample <- rbind(readFractionPerSample, tmp) } }
if(length(refseqSamples) > 1){ m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) topSpecies <- sort(tapply(readFractionPerSample$UniqueSpeciesPercent, readFractionPerSample$species, sum), decreasing = TRUE) if (length(topSpecies) > 0){ i = 1 data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],] p <- plot_ly(x=data$sampleName, y=data$UniqueSpeciesPercent + data$MultipleSpeciesPercent, type="bar") %>% layout(title = paste0('TopHit', i,':', names(topSpecies)[i], ' (Unique + MultiHits)'), yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m) p } }
if(length(refseqSamples) > 1){ m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) if (length(topSpecies) > 1){ i = 2 data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],] p <- plot_ly(x=data$sampleName, y=data$UniqueSpeciesPercent + data$MultipleSpeciesPercent, type="bar") %>% layout(title = paste0('TopHit', i,':', names(topSpecies)[i], ' (Unique + MultiHits)'), yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m) p } }
if(length(refseqSamples) > 2){ m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) if (length(topSpecies) > 2){ i = 3 data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],] p <- plot_ly(x=data$sampleName, y=data$UniqueSpeciesPercent + data$MultipleSpeciesPercent, type="bar") %>% layout(title = paste0('TopHit', i,':', names(topSpecies)[i], ' (Unique + MultiHits)'), yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m) p } }
if(length(refseqSamples) > 100){ message('Skipped: > 100 Samples') } else { for (nm in refseqSamples){ par(mar=c(10.1, 4.1, 4.1, 2.1)) x = refseqResult[[nm]] if (is.null(x)) x = matrix(0, 2, 1, dimnames=list(c('UniqueSpeciesHits','MultipleSpeciesHits'),'Misc')) bplot = barplot(t(x), col=c("royalblue3", "lightblue"), las=2, ylim=c(0,100), legend.text=T, ylab="Mapped Reads in %", main=nm, names.arg=rep('',nrow(x)) ) text(y=t(x)[ 1,] + 5, x=bplot, font = 2, labels=t(x)[ 1, ], cex=1.1, col='black') text(x = bplot, y = par("usr")[3] - 2, srt = 45, adj = 1, labels = rownames(x), xpd = TRUE) } }
Kraken2 is a tool to perform a taxonomic classification based on k-mer matches. The reference includes Refseq bacteria, archaea, viral libraries and the GRCh38 human genome. It's especially useful to discover bacterial contamination (e.g. Mycoplasma in cultivated cells.)
krakenSamples <- names(krakenResult) names(krakenSamples) <- krakenSamples if(length(krakenSamples) > 1){ skipMultiSampleKraken <- FALSE readFractionPerSample <- data.frame(sampleName = rep(krakenSamples[1], nrow(krakenResult[[krakenSamples[1]]])), species = krakenResult[[krakenSamples[1]]]$name, readPercentage = krakenResult[[krakenSamples[1]]]$readPercentage ) for (j in 2:length(krakenSamples)){ tmp <- data.frame(sampleName = rep(names(krakenSamples)[j], nrow(krakenResult[[krakenSamples[j]]])), species = krakenResult[[krakenSamples[j]]]$name, readPercentage = krakenResult[[krakenSamples[j]]]$readPercentage ) readFractionPerSample <- rbind(readFractionPerSample, tmp) } } else { skipMultiSampleKraken <- TRUE }
if(!skipMultiSampleKraken){ m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) topSpecies <- sort(tapply(readFractionPerSample$readPercentage, readFractionPerSample$species, sum), decreasing = TRUE) if (length(topSpecies) > 0){ i = 1 data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],] p <- plot_ly(x=data$sampleName, y=data$readPercentage, type="bar") %>% layout(title = paste0('TopHit', i,':', names(topSpecies)[i]), yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m) p } }
if(!skipMultiSampleKraken){ m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) if (length(topSpecies) > 1){ i = 2 data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],] p <- plot_ly(x=data$sampleName, y=data$readPercentage, type="bar") %>% layout(title = paste0('TopHit', i,':', names(topSpecies)[i]), yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m) p } }
if(!skipMultiSampleKraken){ m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) if (length(topSpecies) > 2){ i = 3 data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],] p <- plot_ly(x=data$sampleName, y=data$readPercentage, type="bar") %>% layout(title = paste0('TopHit', i,':', names(topSpecies)[i]), yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m) p } }
if(length(krakenSamples) > 100){ message('Skipped: > 100 Samples') } else if(length(krakenSamples) > 1){ for (nm in krakenSamples){ x = readFractionPerSample[readFractionPerSample$sampleName == nm, 'readPercentage'] names(x) = readFractionPerSample[readFractionPerSample$sampleName == nm, 'species'] par(mar=c(15.1, 6.1, 4.1, 2.1)) if (length(x[x > 0]) > 0){ bplot <- barplot(x[x > 0], names.arg = rep('',length(x[x>0])), col = 'royalblue3', main = nm, ylab = 'Mapped Reads in %', las = 2, ylim=c(0, 5)) text(x = bplot, y = par("usr")[3] - 0.3, srt = 45, adj = 1, labels = names(x)[x > 0], xpd = TRUE) } } }
This check includes a Bowtie2 mapping and is currently only performed for human samples. It uses all unmapped reads of the FastqScreen mapping as input and maps these reads against several common human pathogens (Bacteria and Viruses).
if(param[['virusCheck']]){ for (nm in samples){ par(mar=c(18.1, 7.1, 2.1, 2.1)) x = virusResult[[nm]] if (is.null(x)) x = matrix(0, 2, 1, dimnames=list(c('UniqueSpeciesHits','MultipleSpeciesHits'),'Misc')) bplot = barplot(t(x), col=c("royalblue3", "lightblue"), las = 2, ylim = c(0,100), legend.text=T, ylab="Mapped Reads in %", main=nm, names.arg=rep('',nrow(x)) ) text(y=t(x)[ 1,] + 5, x=bplot, font = 2, labels=t(x)[ 1, ], cex = 1.1, col = 'black') text(x = bplot, y = par("usr")[3] - 2, srt = 60, adj = 1, labels = rownames(x), xpd = TRUE) } } else { cat("not run") }
PWMs <- PWMresult[['R1']] for (nm in names(PWMs)){ cat(nm, "\n \n") seqLogo(PWMs[[nm]]) cat("\n \n") }
PWMs <- PWMresult[['R2']] for (nm in names(PWMs)){ cat(nm, "\n \n") seqLogo(PWMs[[nm]]) cat("\n \n") }
getAppVer <- function(appName) { sub("^.+/([^/]+)$", "\\1", Sys.getenv(appName)) } settings = character() settings["Configuration File:"] = param$confFile settings["RefSeq mRNA Reference:"] = REFSEQ_mRNA_REF settings["FastqScreen Version:"] = getAppVer("FastQScreen") settings["Bowtie2 Version:"] = getAppVer("Bowtie2") settings["Bowtie2 Parameters:"] = param$cmdOptions settings["Minimum AlignmentScore:"] = param$minAlignmentScore settings["TopSpecies:"] = param$nTopSpecies kable(as.data.frame(settings), col.names=NA, row.names=TRUE, format="html") %>% kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
ezInteractiveTableRmd(values=dataset)
ezSessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.