ORFikOverview.R
In ORFik: Open Reading Frames in Genomics

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
library(ORFik)                        # This package
library(GenomicFeatures)              # For basic transcript operations
library(data.table)                   # For fast table operations
library(BSgenome.Hsapiens.UCSC.hg19)  # Human genome

## ----eval = TRUE, echo = TRUE-------------------------------------------------
txdbFile <- system.file("extdata", "hg19_knownGene_sample.sqlite", 
                        package = "GenomicFeatures")

## ----eval = TRUE, echo = TRUE-------------------------------------------------
txdb <- loadTxdb(txdbFile)
fiveUTRs <- loadRegion(txdb, "leaders")
fiveUTRs[1]

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
# Extract sequences of fiveUTRs.
tx_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, 
                                 fiveUTRs) 
tx_seqs[1]

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs, groupByTx = FALSE)
fiveUTR_ORFs[1:2]

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
txNames(fiveUTR_ORFs[1:2]) # <- Which transcript

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
names(fiveUTR_ORFs[1:2]) # <- Which ORF

## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
#  saveRDS(fiveUTR_ORFs[1:2], "save/path/uorfs.rds")

## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
#  export.bed12(fiveUTR_ORFs[1:2], "save/path/uorfs.bed12")

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
orf_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens,
                                  fiveUTR_ORFs[1:3])
orf_seqs

## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
#  writeXStringSet(orf_seqs, filepath = "uorfs.fasta")

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
orf_aa_seq <- Biostrings::translate(orf_seqs)
orf_aa_seq

## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
#  writeXStringSet(orf_aa_seq, filepath = "uorfs_AA.fasta")

## ----eval = TRUE, echo = TRUE-------------------------------------------------
unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
test_ranges <- groupGRangesBy(unlisted_ranges) # <- defualt is tx grouping by names()
test_ranges[1]

## ----eval = TRUE, echo = TRUE-------------------------------------------------
unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
test_ranges <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
test_ranges[1:2]

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
  ORFs <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
  length(ORFs) # length means how many ORFs left in set

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  ORFs <- ORFs[widthPerGroup(ORFs) >= 60]
  length(ORFs)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  ORFs <- ORFs[numExonsPerGroup(ORFs) > 1]
  length(ORFs)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  
  ORFs <- ORFs[strandPerGroup(ORFs) == "+"]
  # all remaining ORFs where on positive strand, so no change
  length(ORFs)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
startSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)
stopSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
starts <- startCodons(fiveUTR_ORFs, is.sorted = TRUE)
stops <- stopCodons(fiveUTR_ORFs, is.sorted = TRUE)
starts[1:2]

## ----eval = TRUE, echo = TRUE-------------------------------------------------
txSeqsFromFa(starts, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE)
"Stop codons"
txSeqsFromFa(stops, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  library(Biostrings)
  # strand with ORFs in both directions
  seqs <- DNAStringSet("ATGAAATGAAGTAAATCAAAACAT")
  ######################>######################< (< > is direction of ORF)
  
  # positive strands
  pos <- findORFs(seqs, startCodon = "ATG", minimumLength = 0)
  # negative strands
  neg <- findORFs(reverseComplement(seqs),
                  startCodon = "ATG", minimumLength = 0)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  pos <- GRanges(pos, strand = "+")
  neg <- GRanges(neg, strand = "-")
  # as GRanges
  res <- c(pos, neg)
  # or merge together and make GRangesList
  res <- split(res, seq.int(1, length(pos) + length(neg)))
  res[1:2]

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  res[strandBool(res)] # Keep only + stranded ORFs

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
# path to example CageSeq data from hg19 heart sample
cageData <- system.file("extdata", "cage-seq-heart.bed.bgz", 
                        package = "ORFik")
# get new Transcription Start Sites using CageSeq dataset
newFiveUTRs <- reassignTSSbyCage(fiveUTRs, cageData)
newFiveUTRs

## ----eval = TRUE, echo = TRUE-------------------------------------------------
# Find path to a bam file
bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
footprints <- readBam(bam_file)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
table(readWidths(footprints))

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
gtf_file <- system.file("extdata", "annotations.gtf", package = "ORFik")
txdb <- loadTxdb(gtf_file)
tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
cds <- cdsBy(txdb, by = "tx", use.names = TRUE)
trailers <- threeUTRsByTranscript(txdb, use.names = TRUE)
cds[1]

## ----eval = FALSE, echo = TRUE, warning = FALSE, message = FALSE--------------
#  loadRegions(gtf_file, parts = c("tx", "cds", "trailers"))

## ----eval = TRUE, echo = TRUE-------------------------------------------------
footprintsGR <- convertToOneBasedRanges(footprints, addSizeColumn = TRUE)
footprintsGR

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  hitMap <- windowPerReadLength(cds, tx,  footprintsGR, pShifted = FALSE)
  coverageHeatMap(hitMap, scoring = "transcriptNormalized")

## ----eval = TRUE, echo = TRUE-------------------------------------------------
footprints <- footprints[readWidths(footprints) == 29]
footprintsGR <- footprintsGR[readWidths(footprintsGR) == 29]
footprints

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
txNames <- filterTranscripts(txdb) # <- get only transcripts that pass filter
tx <- tx[txNames]; cds <- cds[txNames]; trailers <- trailers[txNames];
windowsStart <- startRegion(cds, tx, TRUE, upstream = 30, 29)
windowsStop <- startRegion(trailers, tx, TRUE, upstream = 30, 29)
windowsStart

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
hitMapStart <- metaWindow(footprintsGR, windowsStart, withFrames = TRUE)
hitMapStop <- metaWindow(footprintsGR, windowsStop, withFrames = TRUE)

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
  pSitePlot(hitMapStart)

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
  pSitePlot(hitMapStop, region = "stop")

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
shifts <- detectRibosomeShifts(footprints, txdb, stop = TRUE)
shifts

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
shiftedFootprints <- shiftFootprints(footprints, shifts)
shiftedFootprints

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
fiveUTRs <- fiveUTRs[1:10]
faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens
tx_seqs <- extractTranscriptSeqs(faFile, fiveUTRs)

fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs, groupByTx = FALSE)

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
starts <- unlist(ORFik:::firstExonPerGroup(fiveUTR_ORFs), use.names = FALSE)
RFP <- promoters(starts, upstream = 0, downstream = 1)
score(RFP) <- rep(29, length(RFP)) # the original read widths
# set RNA-seq seq to duplicate transcripts
RNA <- unlist(exonsBy(txdb, by = "tx", use.names = TRUE), use.names = TRUE)

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
# transcript database
txdb <- loadTxdb(txdbFile)
dt <- computeFeatures(fiveUTR_ORFs[1:4], RFP, RNA, txdb, faFile, 
                      sequenceFeatures = TRUE)
dt

## ----eval = TRUE, echo = TRUE-------------------------------------------------
cds <- cdsBy(txdb, by = "tx", use.names = TRUE)[1:10]
tx <- exonsBy(txdb, by = "tx", use.names = TRUE)[names(cds)]
faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens

kozakSequenceScore(cds, tx, faFile, species = "human")



## ----eval = TRUE, echo = TRUE-------------------------------------------------
pfm <- t(matrix(as.integer(c(29,26,28,26,22,35,62,39,28,24,27,17,
                             21,26,24,16,28,32,5,23,35,12,42,21,
                             25,24,22,33,22,19,28,17,27,47,16,34,
                             25,24,26,25,28,14,5,21,10,17,15,28)),
                ncol = 4))

kozakSequenceScore(cds, tx, faFile, species = pfm)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
seqs <- startRegionString(cds, tx, faFile, upstream = 5, downstream = 5)
rate <- fpkm(cds, RFP)
ORFik:::kozakHeatmap(seqs, rate)


## ----eval = TRUE, echo = TRUE-------------------------------------------------
  # First make some toy example
  cds <- GRanges("chr1", IRanges(c(1, 10, 20, 30, 40, 50, 60, 70, 80),
                                 c(5, 15, 25, 35, 45, 55, 65, 75, 85)),
                 "+")
  names(cds) <- c(rep("tx1", 3), rep("tx2", 3), rep("tx3", 3))
  cds <- groupGRangesBy(cds)
  ribo <- GRanges("chr1", c(1, rep.int(23, 4), 30, 34, 34, 43, 60, 64, 71, 74),
                  "+")
  # We could do a simplification and use the ORFik entropy function
  entropy(cds, ribo) # <- spread of reads

## ----eval = TRUE, echo = TRUE-------------------------------------------------
tile <- tile1(cds, FALSE, FALSE) # tile them to 1 based positions
tails <- tails(tile, 9) # get 9 last bases per cds
stopOverlap <- countOverlaps(tails, ribo)
allOverlap <- countOverlaps(cds, ribo)
fractions <- (stopOverlap + 1) / (allOverlap + 1) # pseudocount 1
cdsToRemove <- fractions > 1 / 2 # filter with pseudocounts (1+1)/(3+1) 
cdsToRemove

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
# Get the annotation
txdb <- loadTxdb(gtf_file)
# Ribo-seq
bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
reads <- readGAlignments(bam_file)
shiftedReads <- shiftFootprints(reads, detectRibosomeShifts(reads, txdb))  

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
# Lets take all valid transcripts, with size restrictions:
# leader > 100 bases, cds > 100 bases, trailer > 100 bases
txNames <- filterTranscripts(txdb, 100, 100, 100) # valid transcripts
loadRegions(txdb, parts = c("leaders", "cds", "trailers", "tx"), 
            names.keep = txNames)

# Create meta coverage per part of transcript
leaderCov <- metaWindow(shiftedReads, leaders, scoring = NULL, 
                        feature = "leaders")

cdsCov <- metaWindow(shiftedReads, cds, scoring = NULL, 
                     feature = "cds")

trailerCov <- metaWindow(shiftedReads, trailers, scoring = NULL, 
                         feature = "trailers")

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
dt <- rbindlist(list(leaderCov, cdsCov, trailerCov))
dt[, `:=` (fraction = "Ribo-seq")] # Set info column
# zscore gives shape, a good starting plot
windowCoveragePlot(dt, scoring = "zscore", title = "Ribo-seq metaplot") 

## ----eval = TRUE, echo = TRUE-------------------------------------------------
windowCoveragePlot(dt, scoring = "median", title = "Ribo-seq metaplot") 

## ----eval = TRUE, echo = TRUE-------------------------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # size 100 window: 50 upstream, 49 downstream of TIS
  windowsStart <- startRegion(cds, tx, TRUE, upstream = 50, 49)
  hitMapStart <- metaWindow(shiftedReads, windowsStart, withFrames = TRUE)
  pSitePlot(hitMapStart, length = "meta coverage")
}

## ----eval = TRUE, echo = TRUE, message = FALSE, fig.height=8------------------
hitMap <- windowPerReadLength(cds, tx,  shiftedReads)
coverageHeatMap(hitMap, addFracPlot = TRUE)

## ----eval = TRUE, echo = TRUE, message=FALSE----------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # Load more files like above (Here I make sampled data from earlier Ribo-seq)
  dt2 <- copy(dt)
  dt2[, `:=` (fraction = "Ribo-seq2")]
  dt2$score <- dt2$score + sample(seq(-40, 40), nrow(dt2), replace = TRUE)
  
  dtl <- rbindlist(list(dt, dt2))
  windowCoveragePlot(dtl, scoring = "median", title = "Ribo-seq metaplots") 
}