knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(kableExtra)
library(SummarizedExperiment)
library(WGCNA)
library(plotly)
library(matrixStats)
library(reshape2)
library(tidyverse)
library(ezRun)
library(writexl)
library(ggplot2)
library(ggiraph)
if(!exists("dataset")){
  dataset <- readRDS("dataset.rds")
}
if(!exists("param")){
  param <- readRDS("param.rds")
}
if(!exists("resultList")){
  resultList <- readRDS("resultList.rds")
}


conds = ezConditionsFromDataset(dataset, param=param)
samples = rownames(dataset)
sampleColors = getSampleColors(conds, samples)
bamFiles = dataset$BAM
if (param$writeIgvSessionLink){
  if (length(bamFiles) > 4){
    idx = which(!duplicated(conds))
    idx = idx[1:min(4, length(idx))]
  } else {
    idx = 1:length(bamFiles)
  }
  for (each in idx){
    writeIgvSession(genome=getIgvGenome(param), refBuild=param$ezRef["refBuild"],
                    file=basename(sub(".bam", "-igv.xml", bamFiles[each])),
                    bamUrls=paste(PROJECT_BASE_URL, bamFiles[each], sep="/"))
    writeIgvJnlp(jnlpFile=basename(sub(".bam", "-igv.jnlp", bamFiles[each])),
                 projectId=param$projectId,
                 sessionUrl=paste(PROJECT_BASE_URL, sub(".bam", "-igv.xml", 
                                                        bamFiles[each]), 
                                  sep="/"))
  }
}

RNA_BAM_Statistics {.tabset}

Multi-Matching Reported in Bam File

The plot holds for each sample the number of reads in Millions that have X matches in the target and are reported in the file.

mmValues = integer()
for (sm in samples){
  mmValues = union(mmValues, as.integer(names(resultList[[sm]]$multiMatchInFileTable)))
}
mmCounts = ezMatrix(0, rows=samples, cols=sort(mmValues))
for (sm in samples){
  mm = resultList[[sm]]$multiMatchInFileTable
  mmCounts[sm, names(mm)] = mm
}
alignmentCountBarPlot(mmCounts, relative=FALSE)
alignmentCountBarPlot(mmCounts, relative=TRUE)
for (nm in c("multiMatchTargetTypeCounts", "uniqueMatchTargetTypeCounts")){
    if (!is.null(resultList[[1]][[nm]])){
      readSet = switch(nm,
                       multiMatchTargetTypeCounts="Uniquely and multi-matching reads:",
                       uniqueMatchTargetTypeCounts="Uniquely matching reads:")
      cat("\n###", paste(readSet, "Match Count Percentages"), "\n")
      tct = getTypeCountTable(resultList, nm)
      ezWrite.table(tct, file=paste0(nm, ".txt"), digits=4)
      tpt = as.matrix(tct)
      for (cn in colnames(tpt)){
        tpt[ ,cn] = tct[ ,cn]/ tct["total", cn] * 100
      }
      minPercentage = 1
      rowsUse = setdiff(rownames(tpt)[apply(tpt, 1, max) > minPercentage], "total")
      tptUse = tpt[rowsUse, , drop=FALSE]
      if (nrow(tptUse) >= 2 && ncol(tptUse) >= 2){
        tptUseRel = log2(tptUse)
        meanVals = apply(tptUseRel, 1, function(x){mean(x[is.finite(x)])}) ## ignore zero counts
        tptUseRel = tptUseRel - meanVals
        plotCmd = expression({
          ezHeatmap(tptUseRel, margins=c(10, 12), lim=c(-2, 2),
                    Rowv=FALSE, Colv=FALSE, main="Relative Prevalence [log2]")
        })
        eval(plotCmd)
        k <- kable(signif(tptUse, digits=3),
              row.names=TRUE, format = "html",
              caption="Match Count Percentages") %>%
          kable_styling(bootstrap_options = "striped",
                        full_width = F, position = "left") %>%
          footnote(general="Percentage value.")
        print(k)
      }else{
        cat("The plot and table are shown when there are at least two samples!",
            "\n")
      }

      cat("###", paste(readSet, "Read Starts per Base"), "\n")
      tct = as.matrix(getTypeCoverageTable(resultList, nm))
      if (nrow(tct) >= 2 && ncol(tct) >= 2){
        tctRel = log2(sweep(tct, 2, tct["total", ], FUN="/"))
        tctRel = tctRel[rowsUse, , drop=FALSE]
        plotCmd = expression({
          ezHeatmap(tctRel, margins=c(10, 12), lim=c(-5, 5),
                    Rowv=FALSE, Colv=FALSE, main="Coverage Enrichment")
        })
        eval(plotCmd)
        k <- kable(signif(tct[rowsUse, ], digits=4),
                   row.names=TRUE, format = "html",
                   caption="Read Starts per Base") %>%
          kable_styling(bootstrap_options = "striped",
                        full_width = F, position = "left") %>%
          footnote(general="Read Starts per Base is equivalent to Coverage divided by Read length.")
        print(k)
      }else{
        cat("The plot and table are shown when there are at least two samples!",
            "\n")
      }
    }
}
if (length(resultList[[1]][["genebody_coverage"]]) != 0){ ## TODO this could be done better by searching alls results for a valid genebody_coverage element
  cat("###", "Coverage plot\n")
  cat("####", "Genebody coverage plots\n")
  covValues = ezMatrix(NA, cols=0:100, rows=samples)
  for (sm in samples){
    y = resultList[[sm]][["genebody_coverage"]][["1200 to 2400nt"]][["medium expressed"]]
    if (!is.null(y)){
      covValues[sm, ] = y/sum(y)
    }
  }
  xx <- reshape2::melt(covValues, varnames=c("Sample", "Percentile"))
  xx$Sample = as.factor(xx$Sample)
  gbcp <- ggplot(data=xx, aes(x=Percentile, y=value))
  gbcp <-  gbcp + geom_line_interactive(aes(color=Sample,tooltip = Sample, data_id = Sample)) 
  gbcp <-  gbcp + scale_colour_manual(values=sampleColors) + labs(x="percentile of gene (5'->3')", y="relative coverage") + ggtitle("genebody coverage")


  print(ggplot(data=xx, 
       aes(x=Percentile, y=Sample, fill=value)) + geom_tile() + scale_fill_gradientn(colours=getBlueRedScale()) +
       labs(x="percentile of gene (5'->3')", y="relative coverage") + ggtitle("genebody coverage"))
}

tlc = resultList[[1]][["TranscriptsCovered"]]
if (!is.null(tlc)){
  cat("\n\n####", "The fraction of isoform length covered\n")
  geneFraction = ezMatrix(NA, cols=names(tlc), rows=samples)
  for (sm in samples){
    tlc = resultList[[sm]][["TranscriptsCovered"]]
    geneFraction[sm, ] = tlc /sum(tlc)
  }
  xx <- reshape2::melt(geneFraction, varnames=c("Sample", "Percentile"))
  xx$Sample = as.factor(xx$Sample)
  ggplot(xx, aes(x = Sample, y = value, fill = Percentile)) + 
    geom_col(position = "stack") + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    labs(x="", y="fraction of isoforms")
}
if(exists('gbcp'))
  girafe(ggobj = gbcp, options = list(
    opts_hover_inv(css = "opacity:0.1;"),
    opts_hover(css = "stroke-width:2;"), opts_sizing(rescale = TRUE, width = .5)))
if(!is.null(resultList[[1]][["Junction"]])){
  cat("###", "Junction saturation plot for all samples\n")
  junctionMaxVal = numeric()
  for (nm in names(resultList[[1]][["Junction"]])){
    junctionMaxVal[nm] = 0
    for (sm in samples){
      junctionMaxVal[nm] = max(junctionMaxVal[nm], unlist(resultList[[sm]][["Junction"]][[nm]]))
    }
  }
  junctionPlots <- list()
  for (nm in names(resultList[[1]][["Junction"]][["junctionSaturation"]])){
        junctionDF <- c()
        for (sm in samples){
          junctionDF <- rbind(junctionDF, cbind(resultList[[sm]][["Junction"]][["junctionSaturation"]][[nm]], sm))
        }
        junctionDF <- data.frame(readFraction = rownames(junctionDF), Junctions <- as.numeric(junctionDF[,1]), Sample = as.factor(junctionDF[,2]))
        junctionPlots[[nm]] <- ggplot(data=junctionDF, aes(x=readFraction, y=Junctions, group = Sample))
        junctionPlots[[nm]] <- junctionPlots[[nm]] + geom_line_interactive(aes(color=Sample,tooltip = Sample, data_id = Sample)) + geom_point(aes(col=Sample), size = 1)
        junctionPlots[[nm]] <- junctionPlots[[nm]] + labs(x="percent of total reads", y="Number of splicing junctions (K)") + ggtitle(nm)
        junctionPlots[[nm]] <- girafe(ggobj = junctionPlots[[nm]], options = list(opts_hover_inv(css = "opacity:0.1;"), opts_hover(css = "stroke-width:2;"), opts_sizing(rescale = TRUE, width = .6)))

  }
}
    if(exists('junctionPlots')){
        htmltools::tagList(setNames(junctionPlots, NULL))
        }
if (!is.null(resultList[[1]][["Junction"]])){
  cat("###", "Junction plots\n")
  for (sm in samples){
    cat("\n")
    cat("####", sm, "\n")
    for (nm in names(resultList[[sm]][["Junction"]])){
        junctionPlot = resultList[[sm]][["Junction"]][[nm]]
        plotCmd = expression({
          if (nm %in% c("splice_events", "splice_junction")){
            pie(junctionPlot, col=c(2,3,4), init.angle=30, angle=c(60,120,150),
                density=c(70,70,70),main=nm, 
                labels=paste(names(junctionPlot), paste0(round(junctionPlot), "%")))
          } else if (nm =="junctionSaturation"){
            x = as.numeric(names(junctionPlot[[1]])) * 100
            plot(1,1,xlab="percent of total reads", ylab='Number of splicing junctions (x1000)',type='o',
                 ylim=c(0, junctionMaxVal[nm]/1000), xlim=range(x))
            saturationColors = c("all junctions"="blue", "known junctions"="red", "novel junctions"="green")
            for (item in names(junctionPlot)){
              lines(x, junctionPlot[[item]]/1000, col=saturationColors[item], type="o")
            }
            legend("topleft", legend=names(saturationColors), col=saturationColors,lwd=1,pch=1)
          }
        })
        eval(plotCmd)
    }
    cat("\n")
  }
}
if(!is.null(resultList[[1]]$fragSizeHist)){
  cat("###", "Length distribution of fragments for paired reads\n")
  for (sm in samples){
    fsh = resultList[[sm]]$fragSizeHist
    ezWrite.table(cbind(Length=fsh$mids, Count=fsh$counts),
                  file=paste0(sm, "-fragSizeHist.txt"), row.names=FALSE)
    plotCmd = expression({
        plot(fsh, xlab="fragment size", main=paste(sm, "-- Length Histogram"), 
             ylim=c(0, max(fsh$counts[-length(fsh$counts)]))) ## don't use the longest fragment size
    })
    try({
      eval(plotCmd)
    })
  }
}
if (!is.null(resultList[[1]][["ErrorRates"]])){
  cat("###", "Read position specific error rate\n")
  for (sm in samples){
    for (nm in names(resultList[[sm]][["ErrorRates"]])){
      errorRate = resultList[[sm]][["ErrorRates"]][[nm]]
      if (!is.null(errorRate)){
        plotPosSpecificErrorRate(errorRate, main=paste(sm, nm))
      }
    }
  }
}
if(!is.null(resultList[[1]][["dupRate"]])){
  cat("###", "Duplication rate quality control\n")
  cat("\n")
  cat("The number of reads per base assigned to a gene in an ideal RNA-Seq data set is expected to be proportional to the abundance of its transcripts in the sample. For lowly expressed genes we expect read duplication to happen rarely by chance, while for highly expressed genes - depending on the total sequencing depth - we expect read duplication to happen often.", "\n")
  cat("\n")
  require(dupRadar, quietly = TRUE)
  dupResult <- data.frame(Intercept = 0, Slope = 0, Sample = samples)
  rownames(dupResult) <- samples
  for(sm in samples){
    dupData <- duprateExpFit(DupMat=resultList[[sm]][["dupRate"]])
    dupResult[sm,] <- c(dupData$intercept, dupData$slope, sm)
    duprateExpDensPlot(DupMat=resultList[[sm]][["dupRate"]])
    title(paste(sm, "-- 2D density scatter plot"))
  }
  dupResult$Intercept = as.numeric(dupResult$Intercept)
  dupResult$Slope = as.numeric(dupResult$Slope)
  dupPlot <- ggplot(data = dupResult, mapping = aes(x = Slope, y = Intercept, tooltip = Sample, data_id = Sample))
  dupPlot <- dupPlot + geom_point_interactive(aes(col=Sample), size = 2, hover_nearest = TRUE) + ggtitle('MultiSample Plot')
  dupPlot <- girafe(ggobj = dupPlot, options = list(opts_sizing(rescale = TRUE, width = .7)))
}
    if(exists('dupPlot'))
        dupPlot

Input Dataset

ezInteractiveTableRmd(dataset)


uzh/ezRun documentation built on May 9, 2024, 6:16 p.m.