In BIMSBbioinfo/RCAS: RNA Centric Annotation System

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, fig.height = 4.5, fig.width = 8)
knitr::opts_knit$set(progress = FALSE, root.dir = params$workdir)

annotationSummary <- params$annotationSummary
goAnalysis <- params$goAnalysis
motifAnalysis <- params$motifAnalysis
provideMotifSummary <- FALSE #if params$motifAnalysis is TRUE and any motif is found, this will set to TRUE
goSummary <- FALSE

library(RCAS)

Introduction

RCAS is an automated system that provides dynamic genome annotations for custom input files that contain transcriptomic regions. Such transcriptomic regions could be, for instance, peak regions detected by CLIP-Seq analysis that detect protein-RNA interactions, RNA modifications (alias the epitranscriptome), CAGE-tag locations, or any other collection of target regions at the level of the transcriptome.

RCAS is designed as a reporting tool for the functional analysis of RNA-binding sites detected by high-throughput experiments. It takes as input a BED format file containing the genomic coordinates of the RNA binding sites and a GTF file that contains the genomic annotation features usually provided by publicly available databases such as Ensembl and UCSC. RCAS performs overlap operations between the genomic coordinates of the RNA binding sites and the genomic annotation features and produces in-depth annotation summaries such as the distribution of binding sites with respect to gene features (exons, introns, 5’/3’ UTR regions, exon-intron boundaries, promoter regions, and whole transcripts) along with functionally enriched term for targeted genes. Moreover, RCAS can darry out discriminative motif discovery in the query regions. The final report of RCAS consists of high-quality dynamic figures and tables, which are readily applicable for publications or other academic usage.

figureCount <- 1
tableCount <- 1

Input Settings

inputParameterDesc <- c('Query BED file', 
                     'Target GTF file',
                     'Annotation Summary Module',
                     'GO Analysis Module',
                     'Motif Analysis Module',
                     'Genome Version',
                     'Species',
                     'Print Processed Tables?',
                     'Randomly sample query regions down to (N)',
                     'Working Directory')
inputParameterValues <- c(params$query, 
                          params$gff,
                          params$annotationSummary,
                          params$goAnalysis,
                          params$motifAnalysis,
                          params$genomeVersion,
                          params$species,
                          params$printProcessedTables,
                          params$sampleN,
                          params$workdir)

inputSettings <- data.frame(parameters = inputParameterDesc,
                            values = inputParameterValues, 
                            stringsAsFactors = FALSE)

DT::datatable(data = inputSettings,
              extensions = 'FixedColumns',
              options = list(fixedColumns = TRUE, 
                         scrollX = TRUE, 
                         pageLength = 20,
                         dom = 't'))

if(params$query == 'testdata') {
  data(queryRegions) 
} else {
  cat('importing BED from filepath',params$query,'\n')
  queryRegions <- importBed(filePath = params$query, sampleN = params$sampleN)
}

if(params$gff == 'testdata') {
  data(gff)         
} else {
  cat('importing GTF from filepath',params$gff,'\n')
  gff <- importGtf(filePath = params$gff)
}

overlaps <- queryGff(queryRegions = queryRegions, gff = gff)
#data.table is used to do quick summary operations
overlaps.dt <- data.table(GenomicRanges::as.data.frame(overlaps)) 

#get all genes from the GTF data 
backgroundGenes <- unique(gff$gene_id)
#get genes that overlap query regions
targetedGenes <- unique(overlaps$gene_id)

txdbFeatures <- getTxdbFeaturesFromGRanges(gffData = gff)

cat('# Annotation Summary for Query Regions \n')

cat('## Distribution of query regions across gene features\n')

cat("**Figure",figureCount,":** The number of query regions that overlap different kinds of gene features are counted. The 'y' axis denotes the types of gene features included in the analysis and the 'x' axis denotes the percentage of query regions (out of total number of query regions denoted with 'n') that overlap at least one genomic interval that host the corresponding feature. Notice that the sum of the percentage values for different features don't add up to 100%, because some query regions may overlap multiple kinds of features \n")

summary <- summarizeQueryRegions(queryRegions = queryRegions, 
                                 txdbFeatures = txdbFeatures)

df <- data.frame(summary)
df$percent <- round((df$count / length(queryRegions)), 3) * 100
p <- plotly::plot_ly( data = df, 
              x = rownames(df), 
              y = df$percent, 
              type = 'bar',
              text = paste("count:", df$count), 
              color = rownames(df)
              )
plotly::layout(p = p, 
       xaxis = list(title = 'features'),
       yaxis = list(title = paste("% query,", 
                                  "n =", length(queryRegions)
                                  )
                    ), 
       margin = list(b = 150, r = 50),
       font = list(size = 14)
       )

if(params$printProcessedTables == TRUE) {
  write.table(x = df, file=paste0('Figure',figureCount,'.summarizeQueryRegions.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
} 

figureCount <- figureCount + 1

cat("## Interactive table of genes that overlap query regions\n")
cat("**Table",tableCount,":** Interactive table of top 100 genes that overlap query regions, grouped by gene features such as introns, exons, UTRs, etc.\n")

dt <- getTargetedGenesTable(queryRegions = queryRegions, 
                           txdbFeatures = txdbFeatures)
dt <- dt[order(transcripts, decreasing = TRUE)]

DT::datatable(dt[1:100], 
          extensions = c('Buttons', 'FixedColumns'), 
          options = list(fixedColumns = TRUE, 
                         scrollX = TRUE,
                         dom = 'Bfrtip',
                         buttons = c('copy', 'print', 'csv','excel', 'pdf')),
          filter = 'bottom'
          )
if(params$printProcessedTables == TRUE) {
  write.table(x = dt, file=paste0('Table',tableCount,'.getTargetedGenesTable.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
} 
tableCount <- tableCount + 1

cat("## Distribution of query regions in the genome grouped by gene types\n")
cat("**Figure",figureCount,":** The number of query regions that overlap different kinds of gene types are counted. The 'x' axis denotes the types of genes included in the analysis and the 'y' axis denotes the percentage of query regions (out of total number of query regions denoted with 'n') that overlap at least one genomic interval that host the corresponding gene type. If the query regions don't overlap any known genes, they are classified as 'Unknown'.\n")

biotype_col <- grep('gene_biotype', colnames(overlaps.dt), value = T)[1]
if(!is.na(biotype_col)) {
  message("Using ",biotype_col," field in the GTF file for gene biotype plot")
  df <- overlaps.dt[,length(unique(queryIndex)), by = biotype_col]
  colnames(df) <- c("feature", "count")
  df$percent <- round(df$count / length(queryRegions) * 100, 1)
  df <- df[order(count, decreasing = TRUE)]
  p <- plotly::plot_ly(data = df, 
               type = "bar",
               x = df$feature,
               y = df$percent,
               text = paste("count:", df$count), color=df$feature)
  plotly::layout(p = p, 
         margin = list(l=100, r=100, b=150), 
         xaxis = list(showticklabels = TRUE,  tickangle = 90), 
         yaxis = list(title = paste("% query,", 
                                    "n =", length(queryRegions))),
         font = list(size = 14))
  if(params$printProcessedTables == TRUE) {
    write.table(x = df, file=paste0('Figure',figureCount,'.query_gene_types.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
  } 
  figureCount <- figureCount + 1
} else {
  warning("Skipping gene biotype plot because no such field exists in input GTF file.")
}

cat("## Coverage profile of query regions at/around Transcription Start/End Sites\n")
cat("**Figure",figureCount,":** The depth of coverage of query regions at and around Transcription Start/End Sites\n")

cvgF <- getFeatureBoundaryCoverage(queryRegions = queryRegions, featureCoords = txdbFeatures$transcripts, flankSize = 1000, boundaryType = 'fiveprime', sampleN = 10000)
cvgT <- getFeatureBoundaryCoverage(queryRegions = queryRegions, featureCoords = txdbFeatures$transcripts, flankSize = 1000, boundaryType = 'threeprime', sampleN = 10000)

plotFeatureBoundaryCoverage(cvgF = cvgF, cvgT = cvgT, featureName = 'transcripts')

if(params$printProcessedTables == TRUE) {
  write.table(x = cvgF, file=paste0('Figure',figureCount,'.transcriptBoundaryCoverage.fiveprime.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
  write.table(x = cvgT, file=paste0('Figure',figureCount,'.transcriptBoundaryCoverage.threeprime.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
} 

figureCount <- figureCount + 1

cat("## Coverage profile of query regions at Exon - Intron Boundaries\n")
cat("**Figure",figureCount,":** The depth of coverage of query regions at exon - intron junctions\n")

#split the exons into corresponding transcripts
exons <- txdbFeatures$exons
exonCounts <- lengths(rtracklayer::split(exons, names(exons)))

#To find the internal exons, for each transcript, remove the first and last exons from the GRanges object
#thus we can make sure that each exon in the end we are looking at is adjacent to an intron
internalExons <- exons[exonCounts[names(exons)] != exons$exon_rank,]
internalExons <- internalExons[internalExons$exon_rank > 1,]

cvgF <- getFeatureBoundaryCoverage(queryRegions = queryRegions, featureCoords = internalExons, boundaryType = 'fiveprime', flankSize = 1000, sampleN = 10000)
cvgT <- getFeatureBoundaryCoverage(queryRegions = queryRegions, featureCoords = internalExons, boundaryType = 'threeprime', flankSize = 1000, sampleN = 10000)

plotFeatureBoundaryCoverage(cvgF = cvgF, cvgT = cvgT, featureName = 'Internal Exons')

if(params$printProcessedTables == TRUE) {
  write.table(x = cvgF, file=paste0('Figure',figureCount,'.exonIntronBoundaryCoverage.fiveprime.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
  write.table(x = cvgT, file=paste0('Figure',figureCount,'.exonIntronBoundaryCoverage.threeprime.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
} 
figureCount <- figureCount + 1

cat("## Coverage profile of query regions across the length of different gene features\n")
cat("**Figure",figureCount,":** The query regions are overlaid with the genomic coordinates of features. Each entry corresponding to a feature is divided into 100 bins of equal length and for each bin the number of query regions that cover the corresponding bin is counted. Features shorter than 100bp are excluded. Thus, a coverage profile is obtained based on the distribution of the query regions. Mean coverage score for each bin is represented with ribbons where the thickness of the ribbon indicates the 95% confidence interval (mean +- standard error of the mean x 1.96). The strandedness of the features are taken into account. The coverage profile is plotted in the 5' to 3' direction.\n")

cvgList <- calculateCoverageProfileList(queryRegions = queryRegions, 
                                       targetRegionsList = txdbFeatures, 
                                       sampleN = 10000)

p <- plotly::plot_ly(data = cvgList, type = 'scatter', mode = 'lines')
for (f in unique(cvgList$feature)){
  data <- cvgList[cvgList$feature == f,]
  p <- plotly::add_trace(p = p, data = data, x = ~bins, y = ~meanCoverage, 
                 legendgroup = f, showlegend = FALSE, opacity = 1, color = f)
  p <- plotly::add_ribbons(p = p, data = data, x = ~bins, 
                   ymin = data$meanCoverage - data$standardError*1.96,
                   ymax = data$meanCoverage + data$standardError*1.96, 
                   legendgroup = f, 
                   name = f, color = f
                   )
}
plotly::layout(p, font = list(size = 14))

if(params$printProcessedTables == TRUE) {
  write.table(x = cvgList, file=paste0('Figure',figureCount,'.coverageprofilelist.data.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
} 
figureCount <- figureCount + 1

cat('# Motif Discovery Results \n')

message("Started motif discovery module...")
motifData <- discoverFeatureSpecificMotifs(queryRegions = queryRegions,
                                           txdbFeatures = txdbFeatures, 
                                           resizeN = 15, 
                                           sampleN = 10000,
                                           genomeVersion = params$genomeVersion,
                                           motifN = 1, 
                                           nCores = 1)
motifStats <- lapply(motifData, function(x) {
  df <- subset(getMotifSummaryTable(x), select = c('queryFraction', 'controlFraction'))
  df$pattern <- rownames(df)
  return(reshape2::melt(df, id.vars = 'pattern'))
})

cat("## Summary stats for top discovered motifs\n")
cat("**Table",tableCount,":** Top discriminative motif discovered for each transcript feature")  
# drop 'patterns' column, keep rownames with feature types
dt <- data.table(do.call(rbind, lapply(names(motifData), function(f) { 
  s <- getMotifSummaryTable(motifData[[f]])
  s$Feature <- f
  return(s)
})))

# reorder columns
dt <- dt[,c('Feature', 'patterns', 'queryFraction', 'controlFraction', 'oddsRatio', 'pvalue', 
            'querySeqs', 'controlSeqs', 'queryHits', 'controlHits')]
DT::datatable(dt, 
          extensions = c('Buttons', 'FixedColumns'), 
          options = list(fixedColumns = TRUE, 
                         scrollX = TRUE,
                         dom = 'Bfrtip',
                         buttons = c('copy', 'print', 'csv','excel', 'pdf')),
          filter = 'bottom'
          )
if(params$printProcessedTables == TRUE) {
  write.table(x = dt, file=paste0('Table',tableCount,'.motifSummaryTable.tsv'), quote = FALSE, sep = '\t', row.names = TRUE)
} 
tableCount <- tableCount + 1

cat('## Top motifs discovered in the sequences of the query regions {.tabset}\n')
for (i in 1:length(motifData)) {
  cat("### ",names(motifData)[i],"\n")
  print(seqLogo::seqLogo(getPWM(motifData[[i]]$matches_query[[1]])))
  cat('\n\n')
}
figureCount <- figureCount + 1

cat("# Functional Term Enrichment Analysis Results\n")
GP <- findEnrichedFunctions(targetedGenes, params$species)
#order by p-value
#GP <- GP[order(GP$p.value),]

cat("**Table",tableCount,":** Significant terms (FDR < 0.1) 
    enriched for genes that overlap query regions\n")  

#only display top GO terms in the HTML report.
max <- ifelse(nrow(GP) > 1000, 1000, nrow(GP))

DT::datatable(GP[1:max,], 
        extensions = c('Buttons', 'FixedColumns', 'Scroller'), 
        options = list(fixedColumns = TRUE, 
                       scrollY = 400,
                       scrollX = TRUE,
                       scroller = TRUE,
                       dom = 'Bfrtip',
                       buttons = c('colvis', 'copy', 'print', 'csv','excel', 'pdf'), 
                       columnDefs = list(
                         list(targets = c(0,1,2,5,9,11,13,14), visible = FALSE)
                       )
                       ),
        filter = 'bottom'
        )

tableCount <- tableCount + 1

Acknowledgements

RCAS is developed in the group of Altuna Akalin (head of the Scientific Bioinformatics Platform) by Bora Uyar (Bioinformatics Scientist), Dilmurat Yusuf (Bioinformatics Scientist) and Ricardo Wurmus (System Administrator) at the Berlin Institute of Medical Systems Biology (BIMSB) at the Max-Delbrueck-Center for Molecular Medicine (MDC) in Berlin.

RCAS is developed as a bioinformatics service as part of the RNA Bioinformatics Center, which is one of the eight centers of the German Network for Bioinformatics Infrastructure (de.NBI).