In BIMSBbioinfo/RCAS: RNA Centric Annotation System

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, fig.height = 4.5, fig.width = 8)
knitr::opts_knit$set(progress = FALSE, root.dir = params$workdir)

suppressWarnings(suppressMessages(library(RCAS)))
# Connect to sqlite database
mydb <- RSQLite::dbConnect(RSQLite::SQLite(), params$dbPath)

# Read sample table and check format
sampleTable <- read.table(params$sampleTablePath, header = TRUE,stringsAsFactors = FALSE)
if(sum(c('sampleName', 'sampleGroup') %in% colnames(sampleTable)) != 2) {
  stop("The sample table must consist of at least 
       two columns (separated by white space(s)) 
       with headers 'sampleName' and 'sampleGroup'. 
       See ",params$sampleTable)
}

if(nrow(sampleTable) < 2) {
  stop("The sample table must contain at least one entry. 
       See ",sampleTable)
}

#set up which chunks to run depending on which tables are available in mydb
#1. geneOverlaps table
evalGeneOverlaps <- FALSE
if(RSQLite::dbExistsTable(mydb, 'geneOverlaps')){
  geneOverlaps <- RSQLite::dbReadTable(mydb, 'geneOverlaps')
  samplesNotFound <- setdiff(sampleTable$sampleName, colnames(geneOverlaps))
  if(length(samplesNotFound) > 0){
    warning("Samples with names '",paste0(samplesNotFound, collapse = ', '),
            "' do not exist in 'geneOverlaps' table. Skipping this table")
    } else {
      evalGeneOverlaps <- TRUE
      geneOverlaps <- subset(geneOverlaps, select = sampleTable$sampleName)
    }
  } 

#2. annotationSummaries table
evalAnnotationSummaries <- FALSE
if(RSQLite::dbExistsTable(mydb, 'annotationSummaries')){
  AS <-  RSQLite::dbReadTable(mydb, 'annotationSummaries')
  annotationSummaries <- as.matrix(AS[,c(-1)])
  rownames(annotationSummaries) <- AS$sampleName
  samplesNotFound <- setdiff(sampleTable$sampleName, rownames(annotationSummaries))

  if(length(samplesNotFound) > 0) {
    warning("Samples with names '",paste0(samplesNotFound, collapse = ', '),
          "' do not exist in 'annotationSummaries' table. Skipping this table")
    } else {
      evalAnnotationSummaries <- TRUE
      annotationSummaries <- annotationSummaries[sampleTable$sampleName,]
    }
} 

#3. discoveredMotifs table 
evalMotifDiscovery <- FALSE
if(RSQLite::dbExistsTable(mydb, 'discoveredMotifs')) {
  motifData <- RSQLite::dbReadTable(mydb, 'discoveredMotifs')
  samplesNotFound <- setdiff(sampleTable$sampleName, motifData$sampleName)
  if(length(samplesNotFound) > 0) {
    warning("Samples with names '",paste0(samplesNotFound, collapse = ', '),
            "' do not exist in 'discoveredMotifs' table. Skipping this table")
    } else {
      evalMotifDiscovery <- TRUE
      motifData <- motifData[motifData$sampleName %in% sampleTable$sampleName,]
    }
} 

#4. featureBoundaryCoverageProfiles table
evalCoverageProfiles <- FALSE 
if(RSQLite::dbExistsTable(mydb, 'featureBoundaryCoverageProfiles')) {
  cvg <-  RSQLite::dbReadTable(mydb, 'featureBoundaryCoverageProfiles')
  samplesNotFound <- setdiff(sampleTable$sampleName, cvg$sampleName)
  if(length(samplesNotFound) > 0) {
    warning("Samples with names '",paste0(samplesNotFound, collapse = ', '),
            "' do not exist in 'featureBoundaryCoverageProfiles' table. Skipping this table")
    } else {
      evalCoverageProfiles <- TRUE
      cvg <- cvg[cvg$sampleName %in% sampleTable$sampleName,]
      cvg$sampleGroup <- sampleTable[match(cvg$sampleName, sampleTable$sampleName),]$sampleGroup
    }
}

#initialize figure and table counts for captions
figureCount <- 1
tableCount <- 1

cat("# Distance of samples by various metrics\n")

cat('## Jaccard distance based on shared target genes\n')

cat("**Figure",figureCount,":** Jaccard distance based on shared target genes. 
Here we plot the distance matrix between samples based on the Jaccard index computed for each pairwise sample 
based on how many genes the compared pair of samples co-overlap. \n")

SM <- as.matrix(proxy::dist(x = as.matrix(geneOverlaps), method = 'Jaccard', by_rows = FALSE, diag = TRUE, upper = TRUE))
pheatmap::pheatmap(SM, display_numbers = FALSE)
figureCount <- figureCount + 1

cat("## Clustering of samples based on overlaps with transcript features\n")
cat("**Figure",figureCount,":** Clustering of samples based on overlaps with transcript features\n")

annDF <- data.frame('sampleGroup' = sampleTable[match(rownames(annotationSummaries), sampleTable$sampleName),]$sampleGroup)
rownames(annDF) <- rownames(annotationSummaries)
pheatmap::pheatmap(t(annotationSummaries), scale = 'column', 
         annotation_col = annDF)
figureCount <- figureCount + 1

cat("# Motif Discovery\n")
cat("**Table",tableCount,":** Feature specific motifs discovered for each sample\n")

DT::datatable(motifData, 
          extensions = c('Buttons', 'FixedColumns'), 
          options = list(fixedColumns = TRUE, 
                         scrollX = TRUE,
                         dom = 'Bfrtip',
                         buttons = c('copy', 'print', 'csv','excel', 'pdf')),
          filter = 'bottom'
          )
tableCount <- tableCount + 1

cat("# Feature Boundary Coverage Profiles\n")

cat("## Smoothed line plots\n")
cat("**Figure",figureCount,":** Feature boundary coverage profiles for each sample\n")

#scale meanCoverage values by sampleName and feature
cvgScaled <- do.call(rbind, 
                     lapply(split(cvg, cvg$sampleName),
                            FUN = function(df) {
                              df$scaledCoverage <- scale(df$meanCoverage)
                              return(df)
                              }))

p <- plotly::ggplotly(ggplot2::ggplot(cvgScaled, aes(x = bases, y = scaledCoverage)) + 
                        geom_vline(xintercept = 0, color = 'gray') +
                        facet_grid(feature ~ boundary) + 
                        geom_smooth(aes(color = sampleGroup, alpha = scaledCoverage)) + theme_minimal())
layout(p)
figureCount <- figureCount + 1

plotPCA <- function(cv, title = '') {
  cvM <- dcast(cv, sampleName ~ boundary + bases, value.var = 'meanCoverage')
  M <- as.matrix(cvM[,-1])
  rownames(M) <- cvM[,1]
  pca1 <- prcomp(M)
  scores = as.data.frame(pca1$x)
  scores$sampleName <- cv[match(rownames(scores), cv$sampleName),]$sampleName
  scores$sampleGroup <- cv[match(rownames(scores), cv$sampleName),]$sampleGroup

  p <- ggplot2::ggplot(data = scores, aes(x = PC1, y = PC2, label = sampleName)) +
    geom_point(aes(color = sampleGroup), size = 4) +
    geom_hline(yintercept = 0, colour = "gray65") +
    geom_vline(xintercept = 0, colour = "gray65") +
    labs(title = title) + theme_minimal(base_size = 18) %+replace% 
    theme(legend.position = 'bottom', legend.title = element_blank())
  return(p)
}

pcaPlots <- lapply(unique(cvg$feature), function(f) {
  plotPCA(cv = cvg[cvg$feature == f,], title = f)
})
names(pcaPlots) <- unique(cvg$feature)

cat("## PCA plots {.tabset}\n")
for (i in 1:length(pcaPlots)) {
  cat("### ",names(pcaPlots)[i],"\n")
  print(pcaPlots[[i]])
  cat('\n\n')
}
figureCount <- figureCount + 1

RSQLite::dbDisconnect(mydb)

Acknowledgements

RCAS is developed in the group of Altuna Akalin (head of the Scientific Bioinformatics Platform) by Bora Uyar (Bioinformatics Scientist), Dilmurat Yusuf (Bioinformatics Scientist) and Ricardo Wurmus (System Administrator) at the Berlin Institute of Medical Systems Biology (BIMSB) at the Max-Delbrueck-Center for Molecular Medicine (MDC) in Berlin.

RCAS is developed as a bioinformatics service as part of the RNA Bioinformatics Center, which is one of the eight centers of the German Network for Bioinformatics Infrastructure (de.NBI).