#' Ribosome P-sites position within reads.
#'
#' This function identifies the exact position of the ribosome P-site within
#' each read, determined by the localisation of its first nucleotide (see
#' \code{Details}). It returns a data table containing, for all samples and read
#' lengths: i) the percentage of reads in the whole dataset, ii) the percentage
#' of reads aligning on the start codon (if any); iii) the distance of the
#' P-site from the two extremities of the reads before and after the correction
#' step; iv) the name of the sample. Optionally, this function plots a
#' collection of read length-specific occupancy metaprofiles displaying the
#' P-site offsets computed through the process.
#'
#' @param data Either list of data tables or GRangesList object from
#' \code{\link{bamtolist}}, \code{\link{bedtolist}},
#' \code{\link{duplicates_filter}} or \code{\link{length_filter}}.
#' @param flanking Integer value specifying for the selected reads the minimum
#' number of nucleotides that must flank the reference codon in both
#' directions. Default is 6.
#' @param start Logical value whether to use the translation initiation site as
#' reference codon. Default is TRUE. If FALSE, the second to last codon is
#' used instead.
#' @param extremity Either "5end", "3end" or "auto". It specifies if the
#' correction step should be based on 5' extremities ("5end") or 3'
#' extremities ("3end"). Default is "auto" i.e. the optimal extremity is
#' automatically selected.
#' @param plot Logical value whether to plot the occupancy metaprofiles
#' displaying the P-site offsets computed in both steps of the algorithm.
#' Default is FALSE.
#' @param plot_dir Character string specifying the directory where read
#' length-specific occupancy metaprofiles shuold be stored. If the specified
#' folder doesn't exist, it is automatically created. If NULL (the default),
#' the metaprofiles are stored in a new subfolder of the working directory,
#' called \emph{offset_plot}. This parameter is considered only if \code{plot}
#' is TRUE.
#' @param plot_format Either "png" (the default) or "pdf". This parameter
#' specifies the file format storing the length-specific occupancy
#' metaprofiles. It is considered only if \code{plot} is TRUE.
#' @param cl Integer value in [1,100] specifying a confidence level for
#' generating occupancy metaprofiles for to a sub-range of read lengths i.e.
#' for the cl% of read lengths associated to the highest signals. Default is
#' 99. This parameter is considered only if \code{plot} is TRUE.
#' @param txt Logical value whether to write in a txt file the extremity used
#' for the correction step and the best offset for each sample. Similar
#' information are displayed by default in the console. Default is FALSE.
#' @param txt_file Character string specifying the path, name and extension
#' (e.g. "PATH/NAME.extension") of the plain text file where the extremity
#' used for the correction step and the best offset for each sample shuold be
#' written. If the specified folder doesn't exist, it is automatically
#' created. If NULL (the default), the information are written in
#' \emph{"best_offset.txt"} and saved in the working directory. This parameter
#' is considered only if \code{txt} is TRUE.
#' @return A data table.
#' @details The P-site offset (PO) is defined as the distance between the
#' extremities of a read and the first nucleotide of the P-site itself. The
#' function processes all samples separately starting from reads mapping on
#' the reference codon (either the start codon or the second to last codon,
#' see \code{start}) of any annotated coding sequences. Read lengths-specific
#' POs are inferred in two steps. First, reads mapping on the reference codon
#' are grouped according to their length, each group corresponding to a bin.
#' Reads whose extremities are too close to the reference codon are discarded
#' (see \code{flanking}). For each bin temporary 5' and 3' POs are defined as
#' the distances between the first nucleotide of the reference codon and the
#' nucleotide corresponding to the global maximum found in the profiles of the
#' 5' and the 3' end at the left and at the right of the reference codon,
#' respectively. After the identification of the P-site for all reads aligning
#' on the reference codon, the POs corresponding to each length are assigned
#' to each read of the dataset. Second, the most frequent temporary POs
#' associated to the optimal extremity (see \code{extremity}) and the
#' predominant bins are exploited as reference values for correcting the
#' temporary POs of smaller bins. Briefly, the correction step defines for
#' each length bin a new PO based on the local maximum, whose distance from
#' the reference codon is the closest to the most frequent temporary POs. For
#' further details please refer to the \strong{riboWaltz} article (available
#' \href{https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006169}{here}).
#' @examples
#' data(reads_list)
#'
#' ## Compute the P-site offset automatically selecting the optimal read
#' ## extremity for the correction step and not plotting any metaprofile:
#' psite(reads_list, flanking = 6, extremity="auto")
#' @import data.table
#' @import ggplot2
#' @export
psite <- function(data, flanking = 6, start = TRUE, extremity = "auto",
plot = FALSE, plot_dir = NULL, plot_format = "png", cl = 99,
txt = FALSE, txt_file = NULL) {
if (txt == T | txt == TRUE) {
options(warn=-1)
if (length(txt_file) == 0) {
dir <- getwd()
txt_file <- paste0(dir, "/best_offset.txt")
} else {
txt_file_split <- strsplit(txt_file, "/")[[1]]
txt_dir <- paste(txt_file_split[-length(txt_file_split)], collapse = "/")
if (!dir.exists(txt_dir)) {
dir.create(txt_dir, recursive = TRUE)
}
}
options(warn=0)
cat("sample\textremity\toffset(nts)\n", file = txt_file)
}
names <- names(data)
offset <- NULL
for (n in names) {
cat(sprintf("processing %s\n", n))
dt <- data[[n]]
if(class(dt)[1] == "GRanges"){
dt <- as.data.table(dt)[, c("width", "strand") := NULL
][, seqnames := as.character(seqnames)]
setnames(dt, c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
}
lev <- sort(unique(dt$length))
if(start == T | start == TRUE){
base <- 0
dt[, site_dist_end5 := end5 - cds_start]
dt[, site_dist_end3 := end3 - cds_start]
} else {
base <- -5
dt[, site_dist_end5 := end5 - cds_stop - base]
dt[, site_dist_end3 := end3 - cds_stop - base]
}
site_sub <- dt[site_dist_end5 <= -flanking & site_dist_end3 >= flanking - 1]
minlen <- min(site_sub$length)
maxlen <- max(site_sub$length)
t <- table(factor(site_sub$length, levels = lev))
# offset
offset_temp <- data.table(length = as.numeric(as.character(names(t))), percentage = (as.vector(t)/sum(as.vector(t))) * 100)
offset_temp[, around_site := "T"
][percentage == 0, around_site := "F"]
tempoff <- function(v_dist){
ttable <- sort(table(v_dist), decreasing = T)
ttable_sr <- ttable[as.character(as.numeric(names(ttable))+1)]
ttable_sl <- ttable[as.character(as.numeric(names(ttable))-1)]
tsel <- rowSums(cbind(ttable > ttable_sr, ttable > ttable_sl), na.rm = T)
return(as.numeric(names(tsel[tsel == 2][1])))
}
offset_temp5 <- site_sub[, list(offset_from_5 = tempoff(.SD$site_dist_end5)), by = length]
offset_temp3 <- site_sub[, list(offset_from_3 = tempoff(.SD$site_dist_end3)), by = length]
merge_allx <- function(x, y) merge(x, y, all.x = TRUE, by = "length")
offset_temp <- Reduce(merge_allx, list(offset_temp, offset_temp5, offset_temp3))
# adjusted offset
adj_off <- function(dt_site, dist_site, add, bestoff){
temp_v <- dt_site[[dist_site]]
t <- table(factor(temp_v, levels = seq(min(temp_v) - 2, max(temp_v) + add)))
t[1:2] <- t[3] + 1
locmax <- as.numeric(as.character(names(t[which(diff(sign(diff(t))) == -2)]))) + 1
adjoff <- locmax[which.min(abs(locmax - bestoff))]
ifelse(length(adjoff) != 0, adjoff, bestoff)
}
best_from5_tab <- offset_temp[!is.na(offset_from_5), list(perc = sum(percentage)), by = offset_from_5
][perc == max(perc)]
best_from3_tab <- offset_temp[!is.na(offset_from_5), list(perc = sum(percentage)), by = offset_from_3
][perc == max(perc)]
if((extremity == "auto" &
((best_from3_tab[, perc] > best_from5_tab[, perc] &
as.numeric(best_from3_tab[, offset_from_3]) <= minlen - 2) |
(best_from3_tab[, perc] <= best_from5_tab[, perc] &
as.numeric(best_from5_tab[, offset_from_5]) > minlen - 1))) |
extremity == "3end"){
best_offset <- as.numeric(best_from3_tab[, offset_from_3])
line_plot <- "3end"
adj_tab <- site_sub[, list(corrected_offset_from_3 = adj_off(.SD, "site_dist_end3", 0, best_offset)), by = length]
offset_temp <- merge(offset_temp, adj_tab, all.x = TRUE, by = "length")
offset_temp[is.na(corrected_offset_from_3), corrected_offset_from_3 := best_offset
][, corrected_offset_from_5 := -corrected_offset_from_3 + length - 1]
} else {
if((extremity == "auto" &
((best_from3_tab[, perc] <= best_from5_tab[, perc] &
as.numeric(best_from5_tab[, offset_from_5]) <= minlen - 1) |
(best_from3_tab[, perc] > best_from5_tab[, perc] &
as.numeric(best_from3_tab[, offset_from_3]) > minlen - 2))) |
extremity == "5end"){
best_offset <- as.numeric(best_from5_tab[, offset_from_5])
line_plot <- "5end"
adj_tab <- site_sub[, list(corrected_offset_from_5 = adj_off(.SD, "site_dist_end5", 1, best_offset)), by = length]
offset_temp <- merge(offset_temp, adj_tab, all.x = TRUE, by = "length")
offset_temp[is.na(corrected_offset_from_5), corrected_offset_from_5 := best_offset
][, corrected_offset_from_5 := abs(corrected_offset_from_5)
][, corrected_offset_from_3 := abs(corrected_offset_from_5 - length + 1)]
}
}
cat(sprintf("best offset: %i nts from the %s\n", abs(best_offset), gsub("end", "' end", line_plot)))
if(txt == T | txt == TRUE){
cat(sprintf("%s\t%s\t%i\n", n, gsub("end", "'end", line_plot), abs(best_offset)), file = txt_file, append = TRUE)
}
t <- table(factor(dt$length, levels = lev))
offset_temp[!is.na(offset_from_5), offset_from_5 := abs(offset_from_5)
][, total_percentage := as.numeric(format(round((as.vector(t)/sum(as.vector(t))) * 100, 3), nsmall=4))
][, percentage := as.numeric(format(round(percentage, 3), nsmall=4))
][, sample := n]
setcolorder(offset_temp, c("length", "total_percentage", "percentage", "around_site", "offset_from_5", "offset_from_3", "corrected_offset_from_5", "corrected_offset_from_3", "sample"))
if(start == TRUE | start == T){
setnames(offset_temp, c("length", "total_percentage", "start_percentage", "around_start", "offset_from_5", "offset_from_3", "corrected_offset_from_5", "corrected_offset_from_3", "sample"))
xlab_plot<-"Distance from start (nt)"
} else {
setnames(offset_temp, c("length", "total_percentage", "stop_percentage", "around_stop", "offset_from_5", "offset_from_3", "corrected_offset_from_5", "corrected_offset_from_3", "sample"))
xlab_plot<-"Distance from stop (nt)"
}
# plot
if (plot == T | plot == TRUE) {
options(warn=-1)
if (length(plot_dir) == 0) {
dir <- getwd()
plot_dir <- paste(dir, "/offset_plot", sep = "")
}
if (!dir.exists(plot_dir)) {
dir.create(plot_dir)
}
minlen <- ceiling(quantile(site_sub$length, (1 - cl/100)/2))
maxlen <- ceiling(quantile(site_sub$length, 1 - (1 - cl/100)/2))
for (len in minlen:maxlen) {
progress <- ceiling(((len + 1 - minlen)/(maxlen - minlen + 1)) * 25)
cat(sprintf("\rplotting %s\r", paste(paste(rep(c(" ", "<<", "-"),
c(25 - progress, 1, progress)), collapse = ""), " ", as.character(progress*4),
"% ", paste(rep(c("-", ">>", " "), c(progress, 1, 25 - progress)), collapse = ""), sep = "")))
site_temp <- dt[site_dist_end5 %in% seq(-len + 1, 0) & length == len]
site_tab5 <- data.table(table(factor(site_temp$site_dist_end5, levels = (-len + 1) : (len))))
site_temp <- dt[site_dist_end3 %in% seq(0, len - 2) & length == len]
site_tab3 <- data.table(table(factor(site_temp$site_dist_end3, levels = (-len) : (len - 2))))
setnames(site_tab5, c("distance", "reads"))
setnames(site_tab3, c("distance", "reads"))
site_tab5[, distance := as.numeric(as.character(site_tab5$distance))
][, extremity := "5' end"]
site_tab3[, distance := as.numeric(as.character(site_tab3$distance))
][, extremity := "3' end"]
final_tab <- rbind(site_tab5[distance <= 0], site_tab3[distance >= 0])
final_tab[, extremity := factor(extremity, levels = c("5' end", "3' end"))]
p <- ggplot(final_tab, aes(distance, reads, color = extremity)) +
geom_line() +
geom_vline(xintercept = seq(floor(min(final_tab$distance)/3) * 3, floor(max(final_tab$distance)/3) * 3, 3), linetype = 2, color = "gray90") +
geom_vline(xintercept = 0, color = "gray50") +
geom_vline(xintercept = - offset_temp[length == len, offset_from_5], color = "#D55E00", linetype = 2, size = 1.1) +
geom_vline(xintercept = offset_temp[length == len, offset_from_3], color = "#56B4E9", linetype = 2, size = 1.1) +
geom_vline(xintercept = - offset_temp[length == len, corrected_offset_from_5], color = "#D55E00", size = 1.1) +
geom_vline(xintercept = offset_temp[length == len, corrected_offset_from_3], color = "#56B4E9", size = 1.1) +
annotate("rect", ymin = -Inf, ymax = Inf, xmin = flanking - len, xmax = -flanking , fill = "#D55E00", alpha = 0.1) +
annotate("rect", ymin = -Inf, ymax = Inf, xmin = flanking - 1 , xmax = len - flanking - 1, fill = "#56B4E9", alpha = 0.1) +
labs(x = xlab_plot, y = "Number of read extremities", title = paste(n, " - length=", len, " nts", sep = ""), color= "Extremity") +
theme_bw(base_size = 20) +
scale_fill_discrete("") +
theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), strip.placement = "outside") +
theme(plot.title = element_text(hjust = 0.5))
if(line_plot == "3end"){
p <- p + geom_vline(xintercept = best_offset, color = "black", linetype = 3, size = 1.1) +
geom_vline(xintercept = best_offset - len + 1, color = "black", linetype = 3, size = 1.1)
} else {
p <- p + geom_vline(xintercept = best_offset, color = "black", linetype = 3, size = 1.1) +
geom_vline(xintercept = best_offset + len - 1, color = "black", linetype = 3, size = 1.1)
}
p <- p +
scale_x_continuous(limits = c(min(final_tab$distance), max(final_tab$distance)),
breaks = seq(floor(min(final_tab$distance)/5) * 5, floor(max(final_tab$distance)/5) * 5, 5),
labels = as.character(seq(floor(min(final_tab$distance)/5) * 5, floor(max(final_tab$distance)/5) * 5, 5) + base))
subplot_dir <- paste(plot_dir, n, sep = "/")
dir.create(subplot_dir)
ggsave(paste(subplot_dir, "/", len, ".", plot_format, sep = ""), plot = p, width = 15, height = 5, units = "in")
}
cat(sprintf("\rplotting %s\n",
paste(paste(rep(c(" ", "<<", "-"), c(25 - progress, 1, progress)), collapse = ""), " ",
as.character(progress*4), "% ",
paste(rep(c("-", ">>", " "), c(progress, 1, 25 - progress)), collapse = ""), sep = "")))
options(warn=0)
}
dt[, c("site_dist_end5", "site_dist_end3") := NULL]
offset <- rbind(offset, offset_temp)
}
return(offset)
}
#' Update reads information according to the inferred P-sites.
#'
#' This function provides additional reads information according to the position
#' of the P-site identfied by \code{\link{psite}}. It attaches to each data
#' table in a list four columns reporting i) the P-site position with respect to
#' the 1st nucleotide of the transcript, ii) the P-site position with respect to
#' the start and the stop codon of the annotated coding sequence (if any) and
#' iii) the region of the transcript (5' UTR, CDS, 3' UTR) that includes the
#' P-site. Please note: 1) for transcripts not associated to any annotated CDS
#' the P-site position with respect to the start and the stop codon is
#' set to NA; 2) P-sites of short reads (<20 nts) might be located very close to
#' the 5' or 3' extremity, with no biological meaning and causing potential
#' downstream issues; for these reasons, all read lengths showing this feature
#' will be removed. Optionally, additional columns reporting the three
#' nucleotides covered by the P-site, the A-site and the E-site are attached,
#' based on FASTA files or BSgenome data packages containing the transcript
#' nucleotide sequences.
#'
#' @param data Either list of data tables or GRangesList object from
#' \code{\link{bamtolist}}, \code{\link{bedtolist}},
#' \code{\link{duplicates_filter}} or \code{\link{length_filter}}.
#' @param offset Data table from \code{\link{psite}}.
#' @param site Either "psite, "asite", "esite" or a combination of these
#' strings. It specifies if additional column(s) reporting the three
#' nucleotides covered by the ribosome P-site ("psite"), A-site ("asite") and
#' E-site ("esite") should be added. Note: either \code{fastapath} or
#' \code{bsgenome} is required for this purpose. Default is NULL.
#' @param fastapath fastapath Character string specifying the FASTA file used in
#' the alignment step, including its path, name and extension. This file can
#' contain reference nucleotide sequences either of genome asseblies
#' (chromosome sequences) or of transcripts (see \code{Details} and
#' \code{fasta_genome}). Please make sure the sequences derive from the same
#' release of the annotation file used in the \code{\link{create_annotation}}
#' function. Note: either \code{fastapath} or \code{bsgenome} is required to
#' generate additional column(s) specified by \code{site}. Default is NULL.
#' @param fasta_genome Logical value whether the FASTA file specified by
#' \code{fastapath} contains nucleotide sequences of genome asseblies
#' (chromosome sequences). If TRUE (the default), an annotation object is
#' required (see \code{gtfpath} and \code{txdb}). FALSE implies nucleotide
#' sequences of transcripts are provided instead.
#' @param refseq_sep Character specifying the separator between reference
#' sequences' name and additional information to discard, stored in the
#' headers of the FASTA file specified by \code{fastapath} (if any). It might
#' be required for matching the reference sequences' identifiers reported in
#' the input list of data tables. All characters before the first occurrence
#' of the specified separator are kept. Default is NULL i.e. no string
#' splitting is performed.
#' @param bsgenome Character string specifying the BSgenome data package with
#' the genome sequences to be loaded. If not already present in the system, it
#' is automatically installed through the biocLite.R script (check the list of
#' available BSgenome data packages by running the
#' \code{\link[BSgenome]{available.genomes}} function of the BSgenome
#' package). This parameter must be coupled with an annotation object (see
#' \code{gtfpath} and \code{txdb}). Please make sure the sequences included in
#' the specified BSgenome data pakage are in agreement with the sequences used
#' in the alignment step. Note: either \code{fastapath} or \code{bsgenome} is
#' required to generate additional column(s) specified by \code{site}. Default
#' is NULL.
#' @param gtfpath Character string specifying the location of a GTF file,
#' including its path, name and extension. Please make sure the GTF file and
#' the sequences specified by \code{fastapath} or \code{bsgenome} derive from
#' the same release. Note that either \code{gtfpath} or \code{txdb} is
#' required if and only if nucleotide sequences of genome assemblies
#' (chromosome sequences) are provided (see \code{fastapath} or
#' \code{bsgenome}). Default is NULL.
#' @param txdb Character string specifying the TxDb annotation package to be
#' loaded. If not already present in the system, it is automatically installed
#' through the biocLite.R script (check
#' \href{http://bioconductor.org/packages/release/BiocViews.html#___TxDb}{here}
#' the list of available TxDb annotation packages). Please make sure the TxDb
#' annotation package and the sequences specified by \code{fastapath} or
#' \code{bsgenome} derive from the same release. Note that either
#' \code{gtfpath} or \code{txdb} is required if and only if nucleotide
#' sequences of genome assemblies (chromosome sequences) are provided (see
#' \code{fastapath} or \code{bsgenome}). Default is NULL.
#' @param dataSource Optional character string describing the origin of the GTF
#' data file. This parameter is considered only if \code{gtfpath} is
#' specified. For more information about this parameter please refer to the
#' description of \emph{dataSource} of the
#' \code{\link[GenomicFeatures]{makeTxDbFromGFF}} function included in the
#' \code{GenomicFeatures} package.
#' @param organism Optional character string reporting the genus and species of
#' the organism of the GTF data file. This parameter is considered only if
#' \code{gtfpath} is specified. For more information about this parameter
#' please refer to the description of \emph{organism} of the
#' \code{\link[GenomicFeatures]{makeTxDbFromGFF}} function included in the
#' \code{GenomicFeatures} package.
#' @param output_class Either "datatable" or "granges". It specifies the format
#' of the output i.e. a list of data tables or a GRangesList object. Default
#' is "datatable".
#' @return A list of data tables or a GRangesList object.
#' @details \strong{riboWaltz} only works for read alignments based on
#' transcript coordinates. This choice is due to the main purpose of RiboSeq
#' assays to study translational events through the isolation and sequencing
#' of ribosome protected fragments. Most reads from RiboSeq are supposed to
#' map on mRNAs and not on introns and intergenic regions. BAM based on
#' transcript coordinates can be generated in two ways: i) aligning directly
#' against transcript sequences; ii) aligning against sequences of genome
#' assemblies i.e. standard chromosome sequences, thus requiring the outputs
#' to be translated in transcript coordinates. The first option can be easily
#' handled by many aligners (e.g. Bowtie), given a reference FASTA file where
#' each sequence represents a transcript, from the beginning of the 5' UTR to
#' the end of the 3' UTR. The second procedure is based on reference FASTA
#' files where each sequence represents a chromosome, usually coupled with
#' comprehensive gene annotation files (GTF or GFF). The STAR aligner, with
#' its option --quantMode TranscriptomeSAM (see Chapter 6 of its
#' \href{http://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STAR.posix/doc/STARmanual.pdf}{manual}),
#' is an example of tool providing such a feature.
#' @examples
#' data(reads_list)
#' data(psite_offset)
#' data(mm81cdna)
#'
#' reads_psite_list <- psite_info(reads_list, psite_offset)
#' @import data.table
#' @export
psite_info <- function(data, offset, site = NULL, fastapath = NULL,
fasta_genome = TRUE, refseq_sep = NULL, bsgenome = NULL,
gtfpath = NULL, txdb = NULL, dataSource = NA,
organism = NA, output_class = "datatable") {
if(!(all(site %in% c("psite", "asite", "esite"))) & length(site) != 0){
cat("\n")
stop("parameter site must be either NULL, \"psite\", \"asite\", \"esite\" or a combination of the three strings \n\n")
} else {
if(length(site) != 0 & length(fastapath) == 0 & length(bsgenome) == 0){
cat("\n")
stop("parameter site is specified but both fastapath and bsgenome are missing \n\n")
}
}
if(length(site) != 0){
if(((length(fastapath) != 0 & (fasta_genome == TRUE | fasta_genome == T)) |
length(bsgenome) != 0) &
length(gtfpath) == 0 & length(txdb) == 0){
cat("\n")
stop("genome annotation file not specified (both GTF path and TxDb object are missing)\n\n")
}
if(length(fastapath) != 0 & length(bsgenome) != 0){
cat("\n")
warning("both fastapath and bsgenome are specified. Only fastapath will be considered\n")
bsgenome = NULL
}
if(length(gtfpath) != 0 & length(txdb) != 0){
cat("\n")
warning("both gtfpath and txdb are specified. Only gtfpath will be considered\n")
txdb = NULL
}
if((length(gtfpath) != 0 | length(txdb) != 0) &
((length(fastapath) == 0 & length(bsgenome) == 0) |
(length(fastapath) != 0 & (fasta_genome == FALSE | fasta_genome == F)))){
cat("\n")
warning("a genome annotation file is specified but no sequences from genome assembly are provided\n")
}
if(length(gtfpath) != 0 | length(txdb) != 0){
if(length(gtfpath) != 0){
path_to_gtf <- gtfpath
txdbanno <- GenomicFeatures::makeTxDbFromGFF(file = path_to_gtf, format = "gtf", dataSource = dataSource, organism = organism)
} else {
if(txdb %in% rownames(installed.packages())){
library(txdb, character.only = TRUE)
} else {
source("https://bioconductor.org/biocLite.R")
biocLite(txdb, suppressUpdates = TRUE)
library(txdb, character.only = TRUE)
}
txdbanno <- get(txdb)
}
}
if(length(fastapath) != 0 | length(bsgenome) != 0){
if(length(fastapath) != 0) {
if(fasta_genome == TRUE | fasta_genome == T){
temp_sequences <- Biostrings::readDNAStringSet(fastapath, format = "fasta", use.names = TRUE)
if(length(refseq_sep) != 0){
names(temp_sequences) <- tstrsplit(names(temp_sequences), refseq_sep, fixed = TRUE, keep = 1)[[1]]
}
exon <- suppressWarnings(GenomicFeatures::exonsBy(txdbanno, by = "tx", use.names = TRUE))
exon <- as.data.table(exon[unique(names(exon))])
sub_exon_plus <- exon[as.character(seqnames) %in% names(temp_sequences) & strand == "+"]
sub_exon_minus <- exon[as.character(seqnames) %in% names(temp_sequences) & strand == "-"
][, new_end := Biostrings::width(temp_sequences[as.character(seqnames)]) - start + 1
][, new_start := Biostrings::width(temp_sequences[as.character(seqnames)]) - end + 1]
seq_dt_plus <- sub_exon_plus[, nt_seq := "emp"
][, nt_seq := as.character(Biostrings::subseq(temp_sequences[as.character(seqnames)],
start = start,
end = end))
][, list(seq = paste(nt_seq, collapse = "")), by = group_name]
revcompl_temp_sequences <- Biostrings::reverseComplement(temp_sequences)
seq_dt_minus <- sub_exon_minus[, nt_seq := "emp"
][, nt_seq := as.character(Biostrings::subseq(revcompl_temp_sequences[as.character(seqnames)],
start = new_start,
end = new_end))
][, list(seq = paste(nt_seq, collapse = "")), by = group_name]
sequences <- Biostrings::DNAStringSet(c(seq_dt_plus$seq, seq_dt_minus$seq))
names(sequences) <- c(unique(sub_exon_plus$group_name), unique(sub_exon_minus$group_name))
} else {
sequences <- Biostrings::readDNAStringSet(fastapath, format = "fasta", use.names = TRUE)
if(length(refseq_sep) != 0){
names(sequences) <- tstrsplit(names(sequences), refseq_sep, fixed = TRUE, keep = 1)[[1]]
}
}
} else {
if(bsgenome %in% installed.genomes()){
library(bsgenome, character.only = TRUE)
} else {
source("http://www.bioconductor.org/biocLite.R")
biocLite(bsgenome, suppressUpdates = TRUE)
library(bsgenome, character.only = TRUE)
}
sequences <- GenomicFeatures::extractTranscriptSeqs(get(bsgenome), txdbanno, use.names=T)
}
}
}
names <- names(data)
for (n in names) {
cat(sprintf("processing %s\n", n))
dt <- data[[n]]
if(class(dt)[1] == "GRanges"){
dt <- as.data.table(dt)[, c("width", "strand") := NULL
][, seqnames := as.character(seqnames)]
setnames(dt, c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
}
suboff <- offset[sample == n]
# check position of P-sites wrt reads extremities
suboff_rm <- suboff[corrected_offset_from_5 < 3 | (length - corrected_offset_from_3) < 4]
if(nrow(suboff_rm) != 0){
cat(sprintf("P-site is too close to read extremities for read length(s): %s;
All reads of such length(s) (%s %% of total amount) will be removed\n",
paste(suboff_rm$length, collapse = ", "), sum(suboff_rm$total_percentage)))
}
dt <- dt[!(length %in% c(suboff_rm$length))]
suboff <- suboff[!(length %in% c(suboff_rm$length))
][, .(length, corrected_offset_from_3)]
cat("1. adding p-site position\n")
dt <- merge.data.table(dt, suboff, by = "length")
setnames(dt, "corrected_offset_from_3", "psite")
dt[, psite := end3 - psite]
setcolorder(dt,c("transcript", "end5", "psite", "end3", "length", "cds_start", "cds_stop"))
dt[, psite_from_start := psite - cds_start
][cds_stop == 0, psite_from_start := 0]
dt[, psite_from_stop := psite - cds_stop
][cds_stop == 0, psite_from_stop := 0]
cat("2. adding transcript region\n")
dt[, psite_region := "5utr"
][psite_from_start >= 0 & psite_from_stop <= 0, psite_region := "cds"
][psite_from_stop > 0, psite_region := "3utr"
][cds_stop == 0, psite_region := NA]
if(length(site) != 0){
cat("3. adding nucleotide sequence(s)\n")
if("psite" %in% site){
dt[, p_site_codon := as.character(Biostrings::subseq(sequences[as.character(dt$transcript)],
start = dt$psite,
end = dt$psite + 2))]
}
if("asite" %in% site){
dt[, a_site_codon := as.character(Biostrings::subseq(sequences[as.character(dt$transcript)],
start = dt$psite + 3,
end = dt$psite + 5))]
}
if("esite" %in% site){
dt[, e_site_codon := as.character(Biostrings::subseq(sequences[as.character(dt$transcript)],
start = dt$psite - 3,
end = dt$psite - 1))]
}
}
setorder(dt, transcript, end5, end3)
if(output_class == "granges"){
dt <- GenomicRanges::makeGRangesFromDataFrame(dt,
keep.extra.columns = TRUE,
ignore.strand = TRUE,
seqnames.field = c("transcript"),
start.field = "end5",
end.field = "end3",
strand.field = "strand",
starts.in.df.are.0based = FALSE)
GenomicRanges::strand(dt) <- "+"
}
data[[n]] <- dt
}
if(output_class == "granges"){
data <- GenomicRanges::GRangesList(data)
}
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.