ORFik_Ribo-seq_pipeline.R
In ORFik: Open Reading Frames in Genomics

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Ribo-seq HEK293 (2020) Investigative analysis of quality of new Ribo-seq data
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Article: https://f1000research.com/articles/9-174/v2#ref-5
#  # Design: Wild type (WT) vs codon optimized (CO) (gene F9)
#  library(ORFik)
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Config
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Specify paths wanted for NGS data, genome, annotation and STAR index
#  # If you use local files, make a conf variable with existing directories
#  conf <- config.exper(experiment = "Alexaki_Human",
#                       assembly = "Homo_sapiens_GRCh38_101",
#                       type = c("Ribo-seq", "RNA-seq"))
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Download fastq files for experiment and rename (Skip if you have the files already)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # SRA Meta data download (work for ERA and DRA too)
#  study <- download.SRA.metadata("PRJNA591214", outdir = conf["fastq Ribo-seq"])
#  # Subset
#  study.rfp <- study[grep("mRNA-Seq", sample_title, invert = TRUE),]
#  study.rna <- study[grep("Ribo-Seq", sample_title, invert = TRUE),]
#  # Download fastq files (uses SRR numbers (RUN column) from study))
#  download.SRA(study.rfp, conf["fastq Ribo-seq"],
#               rename = study.rfp$sample_title, subset = 2000000)
#  download.SRA(study.rna, conf["fastq RNA-seq"],
#               rename = study.rna$sample_title, subset = 2000000)
#  
#  # Which organism is this, scientific name, like "Homo sapiens" or "Danio rerio"
#  organism <- study$ScientificName[1] # Usually you find organism here, else set it yourself
#  paired.end.rfp <- study.rfp$LibraryLayout == "PAIRED"
#  paired.end.rna <- study.rna$LibraryLayout == "PAIRED"
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Annotation
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  
#  annotation <- getGenomeAndAnnotation(
#    organism = organism,
#    genome = TRUE, GTF = TRUE,
#    phix = TRUE, ncRNA = TRUE, tRNA = TRUE, rRNA = TRUE,
#    output.dir = conf["ref"],
#    assembly_type = "primary_assembly"
#  )
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # STAR index
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Remove max.ram = 20 and SAsparse = 2, if you have more than 64GB ram
#  index <- STAR.index(annotation, wait = TRUE, max.ram = 20, SAsparse = 2)
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Alignment (with depletion of phix, rRNA, ncRNA and tRNAs) & (with MultiQC of final STAR alignment)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  
#  STAR.align.folder(conf["fastq Ribo-seq"], conf["bam Ribo-seq"], index,
#                    paired.end = paired.end.rfp,
#                    steps = "tr-co-ge", # (trim needed: adapters found, then genome)
#                    adapter.sequence = "auto", # Adapters are auto detected
#                    trim.front = 0, min.length = 20)
#  
#  STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index,
#                    paired.end = paired.end.rna,
#                    steps = "tr-co-ge", # (trim needed: adapters found, then genome)
#                    adapter.sequence = "auto", # Adapters are auto detected
#                    trim.front = 0, min.length = 20)
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Create experiment (Starting point if alignment is finished)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  library(ORFik)
#  create.experiment(paste0(conf["bam Ribo-seq"], "/aligned/"),
#                    exper = conf["exp Ribo-seq"],
#                    fa = annotation["genome"],
#                    txdb = paste0(annotation["gtf"], ".db"),
#                    organism = organism,
#                    pairedEndBam = paired.end.rfp,
#                    rep = c(1,2,3,1,2,3),
#                    condition = c(rep("CO", 3), rep("WT", 3)),
#                    viewTemplate = FALSE)
#  create.experiment(paste0(conf["bam RNA-seq"], "/aligned/"),
#                    exper = conf["exp RNA-seq"],
#                    fa = annotation["genome"],
#                    txdb = paste0(annotation["gtf"], ".db"),
#                    organism = organism,
#                    pairedEndBam = paired.end.rna,
#                    rep = c(1,2,3,1,2,3),
#                    condition = c(rep("CO", 3), rep("WT", 3)),
#                    viewTemplate = FALSE)
#  
#  library(ORFik)
#  df.rfp <- read.experiment("Alexaki_Human_Ribo-seq")
#  df.rna <- read.experiment("Alexaki_Human_RNA-seq")
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Convert files and run Annotation vs alignment QC
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # General QC
#  ORFikQC(df.rfp)
#  ORFikQC(df.rna)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # P-shifting of Ribo-seq reads:
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # From ORFikQC it looks like 20, 21, 27 and 28 are candidates for Ribosomal footprints
#  shiftFootprintsByExperiment(df.rfp, accepted.lengths = c(20:21, 27:28))
#  # Ribo-seq specific QC
#  remove.experiments(df.rfp) # Remove loaded data (it is not pshifted)
#  ORFik:::RiboQC.plot(df.rfp, BPPARAM = BiocParallel::SerialParam(progressbar = T))
#  remove.experiments(df.rfp)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Create heatmaps (Ribo-seq)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Pre-pshifting
#  heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "ofst",
#                outdir = paste0(dirname(df.rfp$filepath[1]), "/QC_STATS/heatmaps/pre-pshift/"))
#  remove.experiments(df.rfp)
#  # After pshifting
#  heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "pshifted",
#                outdir = paste0(dirname(df.rfp$filepath[1]), "/QC_STATS/heatmaps/pshifted/"))
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Differential translation analysis (condition: WT vs CO)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # We here get 11 unique DTEG genes
#  # Now let's check if the CO group overexpress the F9 Gene (ENSG00000101981):
#  res <- DTEG.analysis(df.rfp, df.rna, design = "condition")
#  significant_genes <- res[Regulation != "No change",]
#  gene_names <- txNamesToGeneNames(significant_genes$id, df.rfp)
#  "ENSG00000101981" %in% unique(gene_names) # TRUE
#  # It does, good good.
#  # How is it regulated ?
#  significant_genes[which(gene_names == "ENSG00000101981"),] # By mRNA abundance
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Peak detection (strong peaks in CDS)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  
#  peaks <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_WT_r1, type = "max")
#  ORFik::windowCoveragePlot(peaks, type = "cds", scoring = "transcriptNormalized")
#  # The gene does not have a strong max peak in WT rep1
#  "ENSG00000101981" %in% peaks$gene_id # FALSE
#  
#  peaks_CO <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_CO_r1, type = "max")
#  # The gene does not have a strong max peak in CO rep1 either
#  "ENSG00000101981" %in% peaks_CO$gene_id # FALSE
#  
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # Feature table (From WT rep 1)
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  cds <- loadRegion(df.rfp, "cds")
#  cds <- ORFik:::removeMetaCols(cds) # Dont need them
#  cds <- cds[filterTranscripts(df.rfp)] # Filter to sane transcripts (annotation is not perfect)
#  dt <- computeFeatures(cds,
#                  RFP = fimport(filepath(df.rfp[6,], "pshifted")),
#                  RNA = fimport(filepath(df.rna[6,], "ofst")), Gtf = df.rfp,
#                  grl.is.sorted = TRUE, faFile = df.rfp,
#                  weight.RFP = "score", weight.RNA = "score",
#                  riboStart = 21, uorfFeatures = FALSE)
#  # The 8 remaining significant DTEGs.
#  dt[names(cds) %in%  significant_genes$id,]
#  # All genes with strong 3nt periodicity of Ribo-seq
#  dt[ORFScores > 5,]
#  # Not all genes start with ATG, possible errors in annotation
#  table(dt$StartCodons)
#  # All genes with strong start codon peak
#  dt[startCodonCoverage > 5,]
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  # uORF analysis (advanced) (using the extension package to ORFik: uORFomePipe)
#  # Will create a mysql database + files with results
#  #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
#  devtools::install_github("Roleren/uORFomePipe", dependencies = TRUE)
#  library(uORFomePipe)
#  find_uORFome("/media/roler/S/data/Bio_data/projects/Alexaki_uORFome/",
#               df.rfp = df.rfp, df.rna = df.rna, df.cage = NULL, biomart = NULL,
#               startCodons = "ATG|CTG|TTG|GTG", BPPARAM = BiocParallel::MulticoreParam(2))
#  
#  grl <- getUorfsInDb()
#  pred <- readTable("finalPredWithProb")$prediction
#  cov <- readTable("startCodonCoverage")
#  grl[pred == 1 & rowSums(cov) > 5]
#