knitr::opts_chunk$set(echo = TRUE)
require(knitr)
require(kableExtra)
require(plotly)
require(seqLogo)
require(ezRun)

output <- readRDS("output.rds")
param <- readRDS("param.rds")
input <- readRDS("input.rds")
rawScreenResult <- readRDS("rawScreenResult.rds")
procScreenResult <- readRDS("procScreenResult.rds")
virusResult <- readRDS("virusResult.rds")
if(file.exists('refseqResult.rds')){
    refseqResult = readRDS("refseqResult.rds")
} else {
    refseqResult = NULL
}
krakenResult <- readRDS("krakenResult.rds")
rRNAstrandResult <- readRDS("rRNAstrandResult.rds")
PWMresult <- readRDS("PWMs.rds")
dataset = input$meta
samples = input$getNames()

debug <- FALSE

Started on r format(Sys.time(), "%Y-%m-%d %H:%M:%S")

FastqScreen_Result {.tabset}

FastqScreen Mapping Rates to Adapters and rRNA

The raw reads are checked for adapter content. A local alignment using Bowtie2 is performed. Reads occuring from adapter dimers and adapter contamination at the 3' end can be discovered.

m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0) # more margin space

percentMapped = sapply(rawScreenResult, function(x){100 - x$"%Unmapped"})

# MappingRateAdapters
p <- plot_ly(x=samples,
             y=percentMapped,
             type="bar") %>%
  layout(title = "MappingRate to Adapters - Bowtie2, local Alignment",
         yaxis = list(title = "MappedReads in %", range = c(0,100)),
         margin = m)
p

A Bowtie2 end-to-end alignment is performed to estimate the overall rRNA content of the libraries. SILVA is used as the reference database. The read orientation can be used to check if the sequenced libraries preserve the strand information.

# rRNA Mapping
rRnaPercentMapped = rRNAstrandResult[ , c("Sense", "Antisense")] / rRNAstrandResult$`Read Count` * 100
rRnaPercentMapped$sample = rownames(rRnaPercentMapped)
p <- plot_ly(rRnaPercentMapped, x=~sample, y=~Sense, 
             type="bar", name="sense") %>%
    add_trace(y = ~Antisense, name = "antisense") %>%
  layout(title = "rRNA Silva Mapping - Bowtie2, end-to-end Alignment",
         yaxis = list(title = "rRNA-Mapping-Rate in %"),
         xaxis = list(title = ""),
         barmode = 'stack', margin = m)
p

FastqScreen Mapping Per Sample

Each sample is mapped against multiple references using FastqScreen. This includes the most common model organisms, ribosomal RNA references (large and small subunit) based on SILVA on several taxonomic levels and typical biological and technical contaminants in NGS libraries (tRNAs, PhIX, Mycoplasma, UniVec).

for (nm in samples){
  par(mar=c(10.1, 4.1, 4.1, 2.1))
  x = procScreenResult[[nm]][ , c("%One_hit_one_genome", "%Multiple_hits_one_genome", 
                                  "%One_hit_multiple_genomes", "%Multiple_hits_multiple_genomes")]
  if (nrow(x) > 0){
    x = x[ , c("%One_hit_one_genome", "%Multiple_hits_one_genome", 
                                  "%One_hit_multiple_genomes", "%Multiple_hits_multiple_genomes")]
    bplt = barplot(t(x), las=2, ylim=c(0,100), 
                   legend.text=T, ylab="Mapped Reads in %", main=nm, names.arg=rep('', nrow(x)))
      text(x = bplt, y = par("usr")[3] - 2, srt = 45, adj = 1, labels = rownames(x), xpd = TRUE)
  } else {
    plot(1,1, type="n", axes=FALSE, main=nm, xlab="", ylab="", frame=TRUE)
    text(1,1, "no hits found")
  }
}

Mapping to RefSeq mRNA Per Sample

A mapping against all mRNAs in RefSeq can be useful to estimate the fraction reads which is specific for exonic regions. Furthermore contaminations with other species e.g. due to xenograft experiments can be discovered.

refseqSamples <- names(refseqResult)
names(refseqSamples) <- refseqSamples

if(length(refseqSamples) > 1){
  readFractionPerSample <- data.frame(sampleName = rep(refseqSamples[1], nrow(refseqResult[[refseqSamples[1]]])), 
                                           species = rownames(refseqResult[[refseqSamples[1]]]), 
                                           UniqueSpeciesPercent = refseqResult[[refseqSamples[1]]]$UniqueSpeciesPercent,
                                           MultipleSpeciesPercent = refseqResult[[refseqSamples[1]]]$MultipleSpeciesPercent
                                           )
  for (j in 2:length(refseqSamples)){
      tmp <- data.frame(sampleName = rep(refseqSamples[j], nrow(refseqResult[[refseqSamples[j]]])), species = rownames(refseqResult[[refseqSamples[j]]]), 
                        UniqueSpeciesPercent = refseqResult[[refseqSamples[j]]]$UniqueSpeciesPercent,
                        MultipleSpeciesPercent = refseqResult[[refseqSamples[j]]]$MultipleSpeciesPercent)
      readFractionPerSample <- rbind(readFractionPerSample, tmp)
  }
}

1. MultiSample Barplots for top hits

if(length(refseqSamples) > 1){
  m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0)
  topSpecies <- sort(tapply(readFractionPerSample$UniqueSpeciesPercent, readFractionPerSample$species, sum), decreasing = TRUE)

    if (length(topSpecies) > 0){
    i = 1
    data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],]

    p <- plot_ly(x=data$sampleName,
             y=data$UniqueSpeciesPercent + data$MultipleSpeciesPercent,
             type="bar") %>%
    layout(title = paste0('TopHit', i,':', names(topSpecies)[i], ' (Unique + MultiHits)'),
           yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m)
    p
  }
}
if(length(refseqSamples) > 1){
  m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0)
  if (length(topSpecies) > 1){
    i = 2
    data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],]

    p <- plot_ly(x=data$sampleName,
             y=data$UniqueSpeciesPercent + data$MultipleSpeciesPercent,
             type="bar") %>%
    layout(title = paste0('TopHit', i,':', names(topSpecies)[i], ' (Unique + MultiHits)'),
           yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m)
    p
  }
}
if(length(refseqSamples) > 2){
  m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0)
  if (length(topSpecies) > 2){
    i = 3
   data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],]

    p <- plot_ly(x=data$sampleName,
             y=data$UniqueSpeciesPercent + data$MultipleSpeciesPercent,
             type="bar") %>%
    layout(title = paste0('TopHit', i,':', names(topSpecies)[i], ' (Unique + MultiHits)'),
           yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m)
    p
  }
}

2. Per sample barplots

if(length(refseqSamples) > 100){
  message('Skipped: > 100 Samples')
} else {
  for (nm in refseqSamples){
        par(mar=c(10.1, 4.1, 4.1, 2.1))
        x = refseqResult[[nm]]
        if (is.null(x)) x = matrix(0, 2, 1, dimnames=list(c('UniqueSpeciesHits','MultipleSpeciesHits'),'Misc'))
        bplot = barplot(t(x), col=c("royalblue3", "lightblue"), las=2, ylim=c(0,100),
                        legend.text=T, ylab="Mapped Reads in %", main=nm, names.arg=rep('',nrow(x)) )
        text(y=t(x)[ 1,] + 5, x=bplot, font = 2, labels=t(x)[ 1, ], cex=1.1, col='black')
        text(x = bplot, y = par("usr")[3] - 2, srt = 45, adj = 1, 
             labels = rownames(x), xpd = TRUE)
    }
  }

Mapping to Genomes with Kraken2

Kraken2 is a tool to perform a taxonomic classification based on k-mer matches. The reference includes Refseq bacteria, archaea, viral libraries and the GRCh38 human genome. It's especially useful to discover bacterial contamination (e.g. Mycoplasma in cultivated cells.)

1. MultiSample Barplots for top hits

krakenSamples <- names(krakenResult)
names(krakenSamples) <- krakenSamples
if(length(krakenSamples) > 1){
  skipMultiSampleKraken <- FALSE

  readFractionPerSample <- data.frame(sampleName = rep(krakenSamples[1], nrow(krakenResult[[krakenSamples[1]]])), 
                                           species = krakenResult[[krakenSamples[1]]]$name, 
                                           readPercentage = krakenResult[[krakenSamples[1]]]$readPercentage
                                           )
  for (j in 2:length(krakenSamples)){
      tmp <- data.frame(sampleName = rep(names(krakenSamples)[j], nrow(krakenResult[[krakenSamples[j]]])), 
                                           species = krakenResult[[krakenSamples[j]]]$name, 
                                           readPercentage = krakenResult[[krakenSamples[j]]]$readPercentage
                                           )
      readFractionPerSample <- rbind(readFractionPerSample, tmp)
  }
} else {
    skipMultiSampleKraken <- TRUE    
}
if(!skipMultiSampleKraken){
  m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0)
  topSpecies <- sort(tapply(readFractionPerSample$readPercentage, readFractionPerSample$species, sum), decreasing = TRUE)

    if (length(topSpecies) > 0){
    i = 1
    data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],]

    p <- plot_ly(x=data$sampleName,
             y=data$readPercentage,
             type="bar") %>%
    layout(title = paste0('TopHit', i,':', names(topSpecies)[i]),
           yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m)
    p
  }
}
if(!skipMultiSampleKraken){
  m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0)
    if (length(topSpecies) > 1){
    i = 2
    data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],]

    p <- plot_ly(x=data$sampleName,
             y=data$readPercentage,
             type="bar") %>%
    layout(title = paste0('TopHit', i,':', names(topSpecies)[i]),
           yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m)
    p
  }
}
if(!skipMultiSampleKraken){
  m <- list(l = 80, r = 80, b = 200, t = 100, pad = 0)
    if (length(topSpecies) > 2){
    i = 3
    data <- readFractionPerSample[readFractionPerSample$species == names(topSpecies)[i],]

    p <- plot_ly(x=data$sampleName,
             y=data$readPercentage,
             type="bar") %>%
    layout(title = paste0('TopHit', i,':', names(topSpecies)[i]),
           yaxis = list(title = "MappedReads in %", range = c(0,100)), margin = m)
    p
  }
}

2. Per sample barplots

if(length(krakenSamples) > 100){
  message('Skipped: > 100 Samples')
} else if(length(krakenSamples) > 1){
  for (nm in krakenSamples){
      x = readFractionPerSample[readFractionPerSample$sampleName == nm, 'readPercentage']
      names(x) = readFractionPerSample[readFractionPerSample$sampleName == nm, 'species']
      par(mar=c(15.1, 6.1, 4.1, 2.1))
      if (length(x[x > 0]) > 0){
          bplot <- barplot(x[x > 0], names.arg = rep('',length(x[x>0])), col = 'royalblue3', 
                           main = nm, ylab = 'Mapped Reads in %', las = 2, ylim=c(0, 5))
          text(x = bplot, y = par("usr")[3] - 0.3, srt = 45, adj = 1, 
               labels = names(x)[x > 0], xpd = TRUE)
      }
  }
}

Human pathogens check

This check includes a Bowtie2 mapping and is currently only performed for human samples. It uses all unmapped reads of the FastqScreen mapping as input and maps these reads against several common human pathogens (Bacteria and Viruses).

if(param[['virusCheck']]){
  for (nm in samples){ 
    par(mar=c(18.1, 7.1, 2.1, 2.1))
    x = virusResult[[nm]]
    if (is.null(x)) x = matrix(0, 2, 1, dimnames=list(c('UniqueSpeciesHits','MultipleSpeciesHits'),'Misc'))
    bplot = barplot(t(x), col=c("royalblue3", "lightblue"), las = 2, ylim = c(0,100),
                    legend.text=T, ylab="Mapped Reads in %", main=nm, names.arg=rep('',nrow(x)) )
    text(y=t(x)[ 1,] + 5, x=bplot, font = 2, labels=t(x)[ 1, ], cex = 1.1, col = 'black')
    text(x = bplot, y = par("usr")[3] - 2, srt = 60, adj = 1, 
         labels = rownames(x), xpd = TRUE)
  }
} else {
  cat("not run")
}

Read Patterns R1

PWMs <- PWMresult[['R1']]
for (nm in names(PWMs)){
        cat(nm, "\n \n")
        seqLogo(PWMs[[nm]])
        cat("\n \n")
}

Read Patterns R2

PWMs <- PWMresult[['R2']]

for (nm in names(PWMs)){
        cat(nm, "\n \n")
        seqLogo(PWMs[[nm]])
        cat("\n \n")
}

Settings

getAppVer <- function(appName) { sub("^.+/([^/]+)$", "\\1", Sys.getenv(appName)) }
settings = character()
settings["Configuration File:"] = param$confFile
settings["RefSeq mRNA Reference:"] = REFSEQ_mRNA_REF
settings["FastqScreen Version:"] = getAppVer("FastQScreen")
settings["Bowtie2 Version:"] = getAppVer("Bowtie2")
settings["Bowtie2 Parameters:"] = param$cmdOptions
settings["Minimum AlignmentScore:"] = param$minAlignmentScore
settings["TopSpecies:"] = param$nTopSpecies
kable(as.data.frame(settings), col.names=NA, row.names=TRUE, format="html") %>%
  kable_styling(bootstrap_options = "striped", full_width = F,
                position = "left")

Input Dataset

ezInteractiveTableRmd(values=dataset)

SessionInfo

ezSessionInfo()


uzh/ezRun documentation built on May 4, 2024, 3:23 p.m.