knitr::opts_chunk$set(echo = TRUE)
Welcome to the ORFik
package.
ORFik
is an R package for analysis of transcript and translation features through manipulation of sequence data and NGS data.
This vignette will preview a simple Ribo-seq pipeline using ORFik. It is important you read all the other vignettes before this one, since functions will not be explained here in detail.
This pipeline will shows steps needed to analyse Ribo-seq from:
The following steps are done:
```r
library(ORFik)
conf <- config.exper(experiment = "Tsvetkov_Yeast", assembly = "Yeast_SacCer3", type = c("Ribo-seq", "RNA-seq"))
study <- download.SRA.metadata("PRJNA644594", auto.detect = TRUE)
study.rfp <- study[LIBRARYTYPE == "RFP",] study.rna <- study[LIBRARYTYPE == "RNA",]
download.SRA(study.rfp, conf["fastq Ribo-seq"], rename = study.rfp$sample_title, subset = 2000000) download.SRA(study.rna, conf["fastq RNA-seq"], rename = study.rna$sample_title, subset = 2000000)
organism <- study$ScientificName[1] # Usually you find organism here, else set it yourself
annotation <- getGenomeAndAnnotation( organism = organism, genome = TRUE, GTF = TRUE, phix = TRUE, ncRNA = TRUE, tRNA = TRUE, rRNA = TRUE, output.dir = conf["ref"], optimize = TRUE, gene_symbols = TRUE, pseudo_5UTRS_if_needed = 100 # If species have not 5' UTR (leader) definitions, make 100nt pseudo leaders. )
index <- STAR.index(annotation, max.ram = 20, SAsparse = 2)
list.genomes()
paired.end.rfp <- study.rfp$LibraryLayout == "PAIRED" paired.end.rna <- study.rna$LibraryLayout == "PAIRED"
STAR.align.folder(conf["fastq Ribo-seq"], conf["bam Ribo-seq"], index, paired.end = paired.end.rfp, steps = "tr-ge", # (trim needed: adapters found, then genome) adapter.sequence = "TCGTATGCCGTC", # Adapters are not auto detected by fastp trim.front = 0, min.length = 20)
STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index, paired.end = paired.end.rna, steps = "tr-ge", # (trim needed: adapters found, then genome) adapter.sequence = "TCGTATGCCGTC", # Adapters are not auto detected by fastp trim.front = 0, min.length = 20)
library(ORFik) create.experiment(file.path(conf["bam Ribo-seq"], "aligned/"), exper = conf["exp Ribo-seq"], fa = annotation["genome"], txdb = paste0(annotation["gtf"], ".db"), organism = organism, pairedEndBam = paired.end.rfp, rep = study.rfp$REPLICATE, condition = study.rfp$CONDITION, runIDs = study.rfp$Run)
create.experiment(file.path(conf["bam RNA-seq"], "aligned/"), exper = conf["exp RNA-seq"], fa = annotation["genome"], txdb = paste0(annotation["gtf"], ".db"), organism = organism, pairedEndBam = paired.end.rna, rep = study.rna$REPLICATE, condition = study.rna$CONDITION, runIDs = study.rna$Run)
library(ORFik)
list.experiments(validate = FALSE) df.rfp <- read.experiment("Tsvetkov_Yeast_Ribo-seq") df.rna <- read.experiment("Tsvetkov_Yeast_RNA-seq")
ORFikQC(df.rfp) ORFikQC(df.rna)
fpkm_table <- cbind(countTable(df.rfp, type = "fpkm"), countTable(df.rna, type = "fpkm")) pcaPlot(fpkm_table) # The samples seperate well between library types
shiftFootprintsByExperiment(df.rfp, accepted.lengths = c(20:21, 27:30))
shiftPlots(df.rfp, output = "auto", downstream = 30) # Barplots, better details shiftPlots(df.rfp, output = "auto", downstream = 30, type = "heatmap") # Heatmaps, better overview
remove.experiments(df.rfp) # Remove loaded data (it is not pshifted) RiboQC.plot(df.rfp, BPPARAM = BiocParallel::SerialParam(progressbar = TRUE))
remove.experiments(df.rfp)
heatMapRegion(df.rfp, region = c("TIS"), shifting = "5prime", type = "ofst", outdir = file.path(QCfolder(df.rfp), "heatmaps/pre-pshift/")) remove.experiments(df.rfp)
heatMapRegion(df.rfp, region = c("TIS"), shifting = "5prime", type = "pshifted", outdir = file.path(QCfolder(df.rfp), "heatmaps/pshifted/"))
countTable_regions(df.rfp, lib.type = "pshifted", rel.dir = "pshifted")
countsRFP <- countTable(df.rfp, region = "cds", type = "fpkm", collapse = FALSE, count.folder = "pshifted") countsRNA <- countTable(df.rna, region = "mrna", type = "fpkm", collapse = FALSE) countsTE <- (countsRFP + 1) / (countsRNA + 1) # with pseudo count summary(countsTE) # Good stability of TE, no strong ribosome abundance regulation
countsRFP <- countTable(df.rfp, region = "cds", type = "fpkm", collapse = TRUE, count.folder = "pshifted") countsRNA <- countTable(df.rna, region = "mrna", type = "fpkm", collapse = TRUE) countsTE <- (countsRFP + 1) / (countsRNA + 1) # with pseudo count summary(countsTE) # Quite similar abundance over groups
countsRFP <- countTable(df.rfp, region = "cds", type = "fpkm", collapse = "all", count.folder = "pshifted")[[1]] countsRNA <- countTable(df.rna, region = "mrna", type = "fpkm", collapse = "all")[[1]] countsTE <- (countsRFP + 1) / (countsRNA + 1) # with pseudo count summary(countsTE[countsRFP > 10 & countsRNA > 10]) # Gene with biggest normalized ratio is 8 ribosomes per mrna fragment
design(df.rfp, multi.factor = FALSE)
res <- DTEG.analysis(df.rfp, df.rna)
symbols <- symbols(df.rfp) # Let's fetch the gene symbols table we made earlier HSP90_tx_id <- symbols[grep("HSC82", external_gene_name, ignore.case = T)]$ensembl_tx_name res[id == HSP90_tx_id]
res[id == HSP90_tx_id]$Regulation # By mRNA abundance (No change in subset) significant_genes <- res[Regulation != "No change",]
res <- DTEG.analysis(df.rfp, df.rna, design = "condition", RFP_counts = countTable(df.rfp, region = "cds", type = "summarized", count.folder = "pshifted"))
peaks <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_WT_r1, type = "max")
ORFik::windowCoveragePlot(peaks, type = "cds", scoring = "transcriptNormalized")
"YMR186W_mRNA" %in% peaks$gene_id # FALSE
peaks_HSR <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_HSR_r1, type = "max")
"YMR186W_mRNA" %in% peaks$gene_id # FALSE
codon_table <- codon_usage_exp(df.rfp[df.rfp$rep == 1,], outputLibs(df.rfp[df.rfp$rep == 1,], type = "pshifted", output.mode = "list"), cds = loadRegion(df.rfp, "cds", filterTranscripts(df.rfp, minThreeUTR = NULL))) codon_usage_plot(codon_table) # There is an increased dwell time on (R:CGC) of A-sites in both conditions codon_usage_plot(codon_table, ignore_start_stop_codons = TRUE)
cds <- loadRegion(df.rfp, "cds") cds <- ORFik:::removeMetaCols(cds) # Dont need them cds <- cds[filterTranscripts(df.rfp, minThreeUTR = NULL)] # Filter to sane transcripts (annotation is not perfect) dt <- computeFeatures(cds, RFP = fimport(filepath(df.rfp[6,], "pshifted")), RNA = fimport(filepath(df.rna[6,], "ofst")), Gtf = df.rfp, grl.is.sorted = TRUE, faFile = df.rfp, weight.RFP = "score", weight.RNA = "score", riboStart = 21, uorfFeatures = FALSE)
dt[names(cds) %in% significant_genes$id,]
dt[ORFScores > 5,]
table(dt$StartCodons) # 5 Genes with ATA start codons ?
dt[startCodonCoverage > 5,]
devtools::install_github("m-swirski/RiboCrypt", dependencies = TRUE) # Restart R if you already had RiboCrypt installed library(RiboCrypt) cds <- loadRegion(df.rfp, "cds") mrna <- loadRegion(df.rfp, "mrna") RiboCrypt::multiOmicsPlot_list(mrna[HSP90_tx_id], cds[HSP90_tx_id], reference_sequence = findFa(df.rfp@fafile), reads = list(fimport(filepath(df.rna[6,], "ofst")), fimport(filepath(df.rfp[6,], "pshifted"))), ylabels = c("RNA", "RFP"), withFrames = c(F, T), frames_type = "columns")
prediction_output_folder <- file.path(libFolder(df.rfp), "predicted_orfs") tx_subset <- symbols[grep("^HSP|^HSC", external_gene_name)]$ensembl_tx_name # Predict on all HSP/HSC genes
ORFik::detect_ribo_orfs(df.rfp[1:2,], prediction_output_folder, c("uORF", "uoORF", "annotated", "NTE", "NTT", "doORF", "dORF"), startCodon = "ATG|CTG|TTG|GTG", mrna = loadRegion(df.rfp, "mrna", tx_subset), cds = loadRegion(df.rfp, "cds", tx_subset)) # Human also has a lot of ACG uORFs btw table <- riboORFs(df.rfp[1:2,], type = "table", prediction_output_folder)
print(table(table[predicted == TRUE,]$type)) # 16 N-terminal extension of CDS predicted. table[ensembl_tx_name == HSP90_tx_id & predicted == TRUE,]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.