options(error=quote({dump.frames(include.GlobalEnv = TRUE)})) knitr::opts_chunk$set(echo = TRUE) library(RiboseQC) # RiboseQC package (contains report functions) library(knitr) # R Markdown to HTML wrapper library(DT) # interactive JavaScript-based tables library(ggplot2) # plotting library(reshape2) # data formatting for ggplot2 library(gridExtra) # multiple plots per figure library(ggpubr) library(viridis) # nice coloring scheme library(Biostrings) # codon usage
# get input from params input_files <- params$input_files # RData file paths to one or more samples (one RData file per sample) input_sample_names <- params$input_sample_names # user-defined sample names to be displayed in report (short(!) names) output_fig_path <- params$output_fig_path names(input_files) <- input_sample_names # load all RData files to R rdata_list <- generate_rdata_list(input_files)
Sample names and data file paths visualized in this report:
cat("\n\n") for (i in 1:length(input_files)){ cat("**", names(input_files[i]), "**: \n", sep="") cat(input_files[i], "\n\n\n") } cat("\n\n")
Per sample, the distribution of reads across different originating compartment (e.g. cytoplasmic and organellar footprints) and biotypes (e.g. CDS regions of protein coding genes) is shown.
# plot barplot for each sample in separate tabs for (i in c(1:length(rdata_list))){ cat("### ", names(rdata_list)[i], " {.tabset} \n \n") res_all <- rdata_list[[names(rdata_list)[i]]] plot_read_biotype_dist_1(res_all$read_stats$positions, names(rdata_list)[i], paste0(output_fig_path, "rds/")) cat("\n\n") }
# plot several stacked barplots per sample in one figure plot_read_biotype_dist_2(rdata_list, paste0(output_fig_path, "rds/")) cat("\n\n")
Per sample, the distribution of read lengths is shown per originating compartment.
# plot distribution for each sample in separate tabs for (i in c(1:length(rdata_list))){ cat("## ", names(rdata_list)[i], " {.tabset} \n \n") res_all <- rdata_list[[names(rdata_list)[i]]] plot_read_length_dist(res_all$read_stats$rld, names(rdata_list)[i], paste0(output_fig_path, "rds/")) cat("\n\n") }
Per sample and originating compartment, read length and location distributions are shown.
For each sample, absolute number of reads and normalized read length distributions are shown.
Read count shows absolute read numbers; in Read count fraction the number of reads for each biotype sums up to 1.
# plot read length distribution per biotype for each sample in separate tabs for (i in c(1:length(rdata_list))){ cat("### ", names(rdata_list)[i], " {.tabset} \n \n") res_all <- rdata_list[[names(rdata_list)[i]]] plot_read_length_dist_by_biotype(res_all$read_stats$reads_summary, names(rdata_list)[i], paste0(output_fig_path, "rds/")) cat("\n\n") }
Per read length, the read distribution for different biotypes is shown (stacked barplot). Read count shows absolute numbers; in Read count fraction, the number of reads for each read length sums up to 1.
# plot read location distribution per read length for each sample in separate tabs for (i in c(1:length(rdata_list))){ cat("### ", names(rdata_list)[i], " {.tabset} \n \n") res_all <- rdata_list[[names(rdata_list)[i]]] plot_read_biotype_dist_by_length(res_all$read_stats$reads_summary, names(rdata_list)[i], paste0(output_fig_path, "rds/")) cat("\n\n") }
Profiles of 5' ends are displayed over a metagene plot aggregating signal over all covered transcripts. 5'end profiles are calculated with sub-codon resolution, and using binned transcript regions.
Different scaling methods can be applied to the calculated profiles. Profiles for individual read lengths (without scaling) can also be visualized.
Disclaimer:When comparing between samples, you might find differences in read lengths displayed, since read lengths are chosen for each sample individually.
for (i in c(1:length(rdata_list))){ cat("\n\n") cat("### ", names(rdata_list)[i], " {.tabset} \n\n") res_all <- rdata_list[[names(rdata_list)[i]]] plot_metagene_hm_rmd(res_all, "profiles_fivepr", names(rdata_list)[i], paste0(output_fig_path, "rds/")) cat("\n\n") }
Read lengths, as well as their individual offsets, are selected according to the parameters specified in the Ribo-seQC run.
Note: Not all samples and originating organelles might be displayed here. Please check the parameters used in the Ribo-seQC run.
The fraction of 5'ends (from Section 4.1) falling on the three possible frames is displayed, for each read length and organelle. Each data point represents one transcripts.
for (i in 1:length(rdata_list)){ cat("####", names(rdata_list)[i], " {.tabset} \n\n") res_all <- rdata_list[[names(rdata_list)[i]]] plot_frame_dist_boxplot_rmd(res_all$selection_cutoffs$analysis_frame_cutoff, names(rdata_list)[i], paste0(output_fig_path, "rds/")) cat("\n\n") }
Cutoffs and frame statistics are shown for selected read lengths:
datasets <- NULL for (i in c(1:length(rdata_list))) { res_all <- rdata_list[[names(rdata_list)[i]]] sample <- names(rdata_list)[[i]] comps <- names(res_all$selection_cutoffs$results_choice) for (comp in comps){ d <- as.data.frame(res_all$selection_cutoffs$results_choice[[comp]]$data)[c(3, 1, 2, 4, 5)] names(d) <- c("read length", "cutoff", "frame preference (%)", "gain_frame_codons", "gain_frame_new_codons") datasets[[sample]][[comp]] <- d } }
out <- NULL for (i in c(1:length(datasets))) { sample <- names(datasets)[[i]] knit_expanded <- paste0("\n\n```r\n\n cat(\"#### ", sample," {.tabset} \n \n\")\n\n ```") out = c(out, knit_expanded) for (comp in names(datasets[[sample]])){ knit_expanded <- paste0("\n\n```r\n\n cat(\"##### ", comp, "\n \n\")\n\n datatable(datasets[[\"", sample, "\"]][[\"", comp, "\"]], rownames=FALSE, options=list(dom='t')) %>% formatRound(c(3,4,5), 2)\n\n ```") out = c(out, knit_expanded) } }
r paste(knit(text = out), collapse = '\n')
Based on the parameters indicated in the Ribo-seQC run, the following read lengths (with their offsets) were selected to infer P-sites positions.
datasets <- NULL for (i in 1:length(rdata_list)){ res_all <- rdata_list[[names(rdata_list)[i]]] sample <- names(rdata_list)[[i]] comps <- names(res_all$selection_cutoffs$results_choice) rl_choices_default <- NULL for (j in comps){ data <- res_all$selection_cutoffs$results_choice[[j]]$data data_default <- data[data[["max_coverage"]],][["read_length"]] # choose only read lengths with max_coverage=TRUE (default choice) rl_choices_default <- rbind(rl_choices_default, c(compartment=j, `read lengths of choice (ordered by frame preference)`=paste(data_default, collapse = ", "))) } datasets[[sample]] <- rl_choices_default }
out = NULL for (i in c(1:length(datasets))) { sample <- names(datasets)[[i]] knit_expanded <- paste0("\n```r\n\n cat(\"#### ", sample," {.tabset} \n \n\")\n\n datatable(datasets[[\"", sample, "\"]], rownames=FALSE, options=list(dom='t'))\n\n ```") out = c(out, knit_expanded) }
r paste(knit(text = out), collapse = '\n')
Read coverage in form of P-site profiles is here displayed, with the same visualization options available in Section 4.2.
Note: Not all samples and originating organelles might be displayed here. Please check the parameters used in the Ribo-seQC run.
for (i in c(1:length(rdata_list))){ res_all <- rdata_list[[names(rdata_list)[i]]] # display only those samples that have P-site profiles if(length(res_all$profiles_P_sites$P_sites_subcodon)!=0){ cat("\n\n") cat("### ", names(rdata_list)[i], " {.tabset} \n\n") plot_metagene_hm_rmd(res_all, "profiles_P_sites", names(rdata_list)[i], paste0(output_fig_path, "rds/")) } cat("\n\n") }
In order to reveal possible contaminating sequences, the top 50 mapping positions (using 5'ends) are listed, together with genomic feature annotation and nucleotide sequences.
datasets <- list() for (i in c(1:length(rdata_list))){ res_all <- rdata_list[[names(rdata_list)[i]]] data <- as.data.frame(res_all$sequence_analysis) data <- data[,c("score", "pct", "seqnames", "region", "gene_biotype", "gene", "seq")] names(data) <- c("read count", "library fraction (%)", "seqnames", "region", "gene biotype", "gene", "sequence") datasets[[i]] <- data }
out = NULL for (i in c(1:length(rdata_list))) { knit_expanded <- paste0("\n```r\n\n cat(\"## \", names(rdata_list)[", i,"], \"\n \n\")\n\n datatable(datasets[[", i, "]], rownames=FALSE)\n\n ```") out = c(out, knit_expanded) }
r paste(knit(text = out), collapse = '\n')
The 50 genes with the highest read counts are listed below for (i) CDS regions of protein coding genes and for (ii) all genes.
datasets <- list() for (i in c(1:length(rdata_list))){ res_all <- rdata_list[[names(rdata_list)[i]]] rc_cds <- as.data.frame(res_all$read_stats$counts_cds_genes) rc_cds <- rc_cds[with(rc_cds, order(-reads, chromosome)), ][1:100,] datasets[[i]] <- rc_cds }
out = NULL for (k in c(1:length(rdata_list))) { knit_expanded <- paste0("\n```r\n\n cat(\"### \", names(rdata_list)[",k ,"], \"\n \n\")\n\n datatable(datasets[[", k, "]])\n\n ```") out = c(out, knit_expanded) }
r paste(knit(text = out), collapse = '\n')
datasets <- list() for (i in c(1:length(rdata_list))){ res_all <- rdata_list[[names(rdata_list)[i]]] rc_cds <- as.data.frame(res_all$read_stats$counts_all_genes) rc_cds <- rc_cds[with(rc_cds, order(-reads, chromosome)), ][1:100,] datasets[[i]] <- rc_cds }
out = NULL for (k in c(1:length(rdata_list))) { knit_expanded <- paste0("\n```r\n\n cat(\"### \", names(rdata_list)[",k ,"], \"\n \n\")\n\n datatable(datasets[[", k, "]])\n\n ```") out = c(out, knit_expanded) }
r paste(knit(text = out), collapse = '\n')
Based on the P-site positions (Section 4.3), codon usage within CDS regions of protein coding genes is here shown. In addition, position-specific values are calculated for the first 11 codons of the CDS, 11 codons from the middle of the CDS, and for the last 11 codons of the CDS - those regions are referred to as start, middle, and stop, respectively.
Codon usage can be accessed with positional information, or summed up over all positions (Section 8 Bulk codon usage).
Codon counts shows codon occurences per each position; P-sites counts shows number of P-sites position mapping to each codon and position; P-sites per codon simply shows the ratio of P-sites counts over Codon counts. Same calculations are performed using A-sites (shifting P-sites +3nt) and E-site (shifting P-sites -3nt) positions. Such values are calculated for all read lengths, and also for individual read lengths (available in the full report). Different scaling methods are also available.
Note: The genetic code, which assigns amino acids to codons, can differ between organelles, species and originating genomes. Different scales are used for ATG/stop codons and other codons.
Note: Codon usage calculation is dependent on successful P-sites calculation.
for (i in c(1:length(rdata_list))){ res_all <- rdata_list[[names(rdata_list)[i]]] if(length(res_all$profiles_P_sites$P_sites_percodon)!=0){ cat("\n\n") cat("## ", names(rdata_list)[i], " {.tabset} \n\n") plot_codon_usage_positional_rmd(res_all, names(rdata_list)[i], paste0(output_fig_path, "rds/")) } cat("\n\n") }
Codon usage (see Section 7) is here shown summed up over all CDS positions.
Note: Codons, as well as corresponding amino acids, are listed. The genetic code, which assigns amino acids to codons, can differ between organelles, species and originating genomes.
Note: Codon usage calculation is dependent on successful P-sites calculation.
for (i in c(1:length(rdata_list))){ res_all <- rdata_list[[names(rdata_list)[i]]] if(length(res_all$profiles_P_sites$P_sites_percodon)!=0){ cat("\n\n") cat("## ", names(rdata_list)[i], " {.tabset} \n\n") plot_codon_usage_bulk_rmd(res_all, names(rdata_list)[i], paste0(output_fig_path, "rds/")) } cat("\n\n") }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.