#' Load a transcriptome lengths file
#' @rdname load_anno
#'
#' @description
#' `load_lengths` reads in a transcriptome lengths file
#'
#' @param length_fname character; file path to transcriptome lengths file
#'
#' @returns A data frame containing transcript names, 5' UTR lengths, CDS
#' lengths, and 3' UTR lengths
load_lengths <- function(lengths_fname) {
transcript_lengths <- read.table(lengths_fname, stringsAsFactors=F,
col.names=c("transcript", "utr5_length",
"cds_length", "utr3_length"))
return(transcript_lengths)
}
#' Read in a GFF file as transcriptome lengths
#' @rdname load_anno
#'
#' @description
#' `gff_to_lengths` reads in a GFF-formatted file and parse for 5' UTR, CDS,
#' and 3' UTR lengths. First column must be the transcript name; third column
#' must be one of `UTR5`, `CDS`, or `UTR3`.
#'
#' @param gff_fname character; file path to GFF annotation file
#'
#' @returns A data frame containing transcript names, 5' UTR lengths, CDS
gff_to_lengths <- function(gff_fname) {
gff <- read.table(gff_fname, col.names=c("seqid", "source", "type", "start", "end",
"score", "strand", "phase", "attributes"),
stringsAsFactors=F)
transcripts <- unique(gff$seqid)
transcript_lengths <- data.frame(transcript = transcripts,
utr5_length = sapply(transcripts,
function(x) {
dat <- subset(gff, seqid==x & type=="UTR5")
return(dat$end - dat$start + 1)
}),
cds_length = sapply(transcripts,
function(x) {
dat <- subset(gff, seqid==x & type=="CDS")
return(dat$end - dat$start + 1)
}),
utr3_length = sapply(transcripts,
function(x) {
dat <- subset(gff, seqid==x & type=="UTR3")
return(dat$end - dat$start + 1)
}),
row.names=NULL, stringsAsFactors=F)
return(transcript_lengths)
}
#' Read in a FASTA file
#' @rdname load_anno
#'
#' @description
#' `load_fasta` reads in a FASTA-formatted file and generates a character vector
#' of transcript sequences
#'
#' @param transcript_fa_fname character; file path to transcriptome .fasta file
#'
#' @returns A named numeric vector of transcript sequences
load_fasta <- function(transcript_fa_fname) {
transcript_sequences <- Biostrings::readDNAStringSet(transcript_fa_fname)
transcript_sequences <- as.character(transcript_sequences)
names(transcript_sequences) <- sapply(names(transcript_sequences),
function(x) {
strsplit(x, split=" ")[[1]][1]
})
return(transcript_sequences)
}
#' Write transcriptome sequence to disk
#'
#' @description
#' Write a character vector of transcript sequences to a .fasta file. Names of
#' `transcript_seq` will become FASTA headers.
#'
#' @param transcript_seq named character vector; file path to output .fasta file
write_fasta <- function(transcript_seq, fa_fname) {
transcript_seq <- Biostrings::DNAStringSet(transcript_seq)
Biostrings::writeXStringSet(transcript_seq, fa_fname)
}
#' Load A site offset rules
#' @rdname load_anno
#'
#' @description
#' `load_offsets` reads in a .txt file containing A-site offset values per
#' RPF length and frame. Row names correspond to RPF length and column names
#' correspond to RPF frame (named `frame_0`, `frame_1`, and `frame_2`).
#'
#' @param offsets_fname character; file path to A site assignment rules .txt file
#'
#' @returns A data frame in long format of A site offset values per RPF length and frame
load_offsets <- function(offsets_fname) {
offsets <- read.table(offsets_fname, header=T)
offsets <- data.frame(frame=as.vector(mapply(rep, 0:2, nrow(offsets))),
length=rep(as.numeric(rownames(offsets)), 3),
offset=c(offsets$frame_0, offsets$frame_1, offsets$frame_2))
return(offsets)
}
#' Read in transcript sequences as codons
#' @rdname load_anno
#'
#' @description
#' `read_fasta_as_codons` reads in a FASTA-formatted file and generates a list
#' of character vectors where transcript sequences have been split into codons.
#'
#' @param transcript_fa_fname character; file path to transcriptome .fasta file
#' @param transcript_length_fname character; file path to transcriptome lengths file
#'
#' @returns A named list of character vectors of transcript sequences as codons
read_fasta_as_codons <- function(transcript_fa_fname, transcript_length_fname) {
transcript_seq <- load_fasta(transcript_fa_fname)
transcript_lengths <- load_lengths(transcript_length_fname)
codon_sequences <- sapply(seq_along(transcript_seq),
function(x) {
tmp_seq <- transcript_seq[x]
utr5_length <- subset(transcript_lengths,
transcript == names(transcript_seq)[x])$utr5_length
num_codons <- floor(nchar(tmp_seq) / 3)
sequence_offset <- utr5_length %% 3
codon_sequence <- substring(tmp_seq,
first=3*(seq(num_codons)-1)+1+sequence_offset,
last=3*seq(num_codons)+sequence_offset)
names(codon_sequence) <- as.character(seq.int(from=-floor(utr5_length/3),
length.out=num_codons)) # start codon is 0
return(codon_sequence)
})
names(codon_sequences) <- names(transcript_seq)
return(codon_sequences)
}
#' Retrieve codons in A-, P-, and E-site positions
#'
#' @description
#' A function to pull codons in the A-, P-, and E-site positions of a ribosome
#' at a position within a transcript.
#'
#' @param transcript_seq named character vector; transcript (+ 5' and 3' UTR regions) sequences
#' @param transcript_name character; one of `names(transcript_seq)`
#' @param cod_idx integer; codon index for A-site codon (1-based)
#' @param utr5_length integer; length of 5' UTR (in nucleotides)
#'
#' @returns A named character vector
get_codons <- function(transcript_name, cod_idx, utr5_length, transcript_seq) {
A_start <- utr5_length + 3*(cod_idx-1) + 1
A_end <- utr5_length + 3*cod_idx
A_codon <- substr(transcript_seq[transcript_name], A_start, A_end)
P_start <- utr5_length + 3*(cod_idx-2) + 1
P_end <- utr5_length + 3*(cod_idx-1)
P_codon <- substr(transcript_seq[transcript_name], P_start, P_end)
E_start <- utr5_length + 3*(cod_idx-3) + 1
E_end <- utr5_length + 3*(cod_idx-2)
E_codon <- substr(transcript_seq[transcript_name], E_start, E_end)
codons <- c(A_codon, P_codon, E_codon)
names(codons) <- c("A", "P", "E")
return(codons)
}
#' Retrieve nucleotides at RPF ends
#'
#' @description
#' A function to pull the nucleotides at the ends of a ribosome-protected
#' fragment, defined by ribosome position and size.
#'
#' @details
#' If `read_type` is `monosome`, then A site codon indices should be named
#' `cod_idx` in `dat`. If `read_type` is `disome`, then A site codon indices
#' should be named `cod_idx_lagging` and `cod_idx_leading` in `dat`.
#'
#' @param dat data frame containing columns `transcript`, `d5`, `d3`, `utr5_length`, and codon indices.
#' @param transcript_seq character; output from `load_fasta`
#' @param bias_region character; one of `f5` or `f3` corresponding to 5' or 3' bias sequence
#' @param bias_length integer; number of nucleotides to include in bias sequence
#' @param read_type character; one of `monosome` or `disome`
#'
#' @returns A character vector corresponding to rows of `dat`
get_bias_seq <- function(dat, transcript_seq, bias_region, bias_length=3,
read_type="monosome") {
### TODO: check that read_type is valid
if(bias_region=="f5") {
if(read_type=="monosome") {
seq_start <- with(dat, utr5_length + 3*(cod_idx-1)+1 - d5)
} else {
seq_start <- with(dat, utr5_length + 3*(cod_idx_lagging-1)+1 - d5)
}
seq_end <- seq_start + bias_length - 1
} else {
if(read_type=="monosome") {
seq_end <- with(dat, utr5_length + 3*cod_idx + d3)
} else {
seq_end <- with(dat, utr5_length + 3*cod_idx_leading + d3)
}
seq_start <- seq_end - bias_length + 1
}
bias_seq <- mapply(substr, transcript_seq[as.character(dat$transcript)],
seq_start, seq_end)
return(bias_seq)
}
#' Convert from DNA to amino acid sequence
#'
#' @description
#' This function extracts CDS sequence from a FASTA-formatted file,
#' translates DNA into amino acid sequence, and writes amino acid sequences to disk.
#'
#' @param transcript_fa_fname character; file path to transcriptome .fasta (DNA) file
#' @param transcript_length_fname character; file path to transcriptome lengths file
#' @param output_fname character; file path for output amino acid sequences
convert_to_aa <- function(transcript_fa_fname, transcript_length_fname, output_fname) {
# 1. load in transcript DNA sequence
transcript_seq <- load_fasta(transcript_fa_fname)
transcript_lengths <- load_lengths(transcript_length_fname)
dna_seq <- sapply(seq_along(transcript_seq),
function(x) {
tmp_seq <- transcript_seq[x]
tmp_name <- names(transcript_seq)[x]
tmp_utr5 <- subset(transcript_lengths, transcript==tmp_name)$utr5_length
tmp_utr3 <- subset(transcript_lengths, transcript==tmp_name)$utr3_length
cds_seq <- substr(tmp_seq, tmp_utr5+1, nchar(tmp_seq)-tmp_utr3)
return(cds_seq)
})
names(dna_seq) <- names(transcript_seq)
dna_seq <- Biostrings::DNAStringSet(dna_seq)
# 2. translate to amino acid sequence
aa_seq <- Biostrings::translate(dna_seq)
# 3. write amino acid sequence to fasta file
Biostrings::writeXStringSet(aa_seq, output_fname)
}
#' Get indices for corresponding rows in two data frames
#'
#' @description
#' Return row indices of (first) matches of `x` in `y`
#'
#' @param x data frame containing rows to find in `y`
#' @param y data frame
#' @param x_col character vector of columns to be used; `NULL` to use all columns
#' @param y_col character vector of columns corresponding to `x_col`; `NULL` to use `x_col`
#'
#' @returns Numeric vector corresponding to rows in `x`
match_rows <- function(x, y, x_col=NULL, y_col=NULL) {
if(is.null(x_col)) {
x_col <- colnames(x)
}
if(is.null(y_col)) {
y_col <- x_col
}
x_terms <- do.call(paste, c(x[, x_col], sep="\r"))
y_terms <- do.call(paste, c(y[, y_col], sep="\r"))
match(x_terms, y_terms)
}
#' Annotate regression coefficients
#'
#' @description
#' Extract regression coefficients from a negative binomial object and annotate
#' coefficient types for downstream analyses.
#'
#' @param nb_fit `negbin` object from `glm.nb`
#'
#' @returns A data frame containing regression coefficients and corresponding annotations.
parse_coefs <- function(nb_fit) {
regression_terms <- attributes(nb_fit$terms)$term.labels
# 1. pull coefficients from nb_fit object
fit_coefs <- data.frame(rownames(summary(nb_fit)$coefficient),
summary(nb_fit)$coefficient,
row.names=NULL)
colnames(fit_coefs) <- c("name", "estimate", "std_error", "t", "p")
# 2. add reference levels
fit_coefs <- rbind(fit_coefs,
data.frame(name=sapply(seq_along(names(nb_fit$xlevels)),
function(x) {
paste0(names(nb_fit$xlevels)[x], nb_fit$xlevels[[x]][1])
}),
estimate=0, std_error=NA, t=NA, p=NA,
row.names=NULL))
# 3. add annotations for individual terms
for(tmp_coef in grep(":", regression_terms, invert=T, value=T)) {
tmp_index <- grepl(paste0("^", tmp_coef), fit_coefs$name) & !grepl(":", fit_coefs$name)
fit_coefs$group[tmp_index] <- tmp_coef
fit_coefs$term[tmp_index] <- sub(paste0("^", tmp_coef), "", fit_coefs$name[tmp_index])
}
# 4. add annotations for interaction terms
for(tmp_coef in grep(":", regression_terms, value=T)) {
term_1 <- sub(":.*$", "", tmp_coef)
term_2 <- sub("^.*:", "", tmp_coef)
tmp_index <- grepl(paste0(term_1, ".*:", term_2), fit_coefs$name)
fit_coefs$group[tmp_index] <- tmp_coef
# transcript:(dataset) interaction terms
## term: transcript
## group_1: "transcript"
## group_2: (dataset)
if("transcript" %in% c(term_1, term_2)) {
if(term_1 == "transcript") { # first term is transcript
fit_coefs$term[tmp_index] <- sub("transcript", "", sub(":.*", "", fit_coefs$name[tmp_index]))
} else { # first term is (dataset)
fit_coefs$term[tmp_index] <- sub(".*:transcript", "", fit_coefs$name[tmp_index])
}
fit_coefs$group_1[tmp_index] <- "transcript"
fit_coefs$group_2[tmp_index] <- ifelse(term_1 == "transcript", term_2, term_1)
}
# A/P/E:(dataset) interaction terms
## term column: codon
## group_1: A/P/E
## group_2: (dataset)
if((term_1 %in% c("A", "P", "E")) | (term_2 %in% c("A", "P", "E"))) {
if(term_1 %in% c("A", "P", "E")) { # first term is codon site
fit_coefs$term[tmp_index] <- substr(sub(":.*$", "", fit_coefs$name[tmp_index]), 2, 4)
fit_coefs$group_1[tmp_index] <- term_1
fit_coefs$group_2[tmp_index] <- term_2
} else { # first term is (dataset)
fit_coefs$term[tmp_index] <- substr(sub("^.*:", "", fit_coefs$name[tmp_index]), 2, 4)
fit_coefs$group_1[tmp_index] <- term_2
fit_coefs$group_2[tmp_index] <- term_1
}
}
# f5:d5 / f3:d3 interaction terms
## term column: bias sequence
## group_1: f5/f3
if(grepl("f(3|5)", tmp_coef) & grepl("d(3|5)", tmp_coef)) {
if(grepl("f", term_1)) { # first term is bias sequence
fit_coefs$term[tmp_index] <- sub("^.*f(3|5)", "", sub(":.*$", "", fit_coefs$name[tmp_index]))
fit_coefs$group_1[tmp_index] <- term_1
fit_coefs$group_2[tmp_index] <- sub(term_2, paste0(term_2, "="),
sub("^.*:", "", fit_coefs$name[tmp_index]))
} else { # first term is digest length
fit_coefs$term[tmp_index] <- sub(".*:.*f(3|5)", "", fit_coefs$name[tmp_index])
fit_coefs$group_1[tmp_index] <- term_2
fit_coefs$group_2[tmp_index] <- sub(term_1, paste0(term_1, "="),
sub(":.*$", "", fit_coefs$name[tmp_index]))
}
}
}
fit_coefs$group <- sub("genome_", "", fit_coefs$group)
fit_coefs$group_1 <- sub("genome_", "", fit_coefs$group_1)
return(fit_coefs)
}
#' Compute GC-content for a ribosome-protected fragment
#'
#' @description
#' This function computes the GC-content of a ribosome-protected fragment,
#' omitting the nucleotides in the A-, P-, and/or E-site codon positions.
#'
#' @param dat data frame containing columns `transcript`, `cod_idx`, `d5`, and `d3`
#' @param omit character; one of `A`, `AP`, or `APE` corresponding to codons to omit
#' @param transcript_fa_fname character; file path to transcriptome fasta file
#' @param transcript_lengths_fname character; file path to transcript lengths file
#' @param read_type character; one of `monosome` or `disome`
#'
#' @returns A numeric vector corresponding to rows of `dat`
compute_rpf_gc <- function(dat, omit="APE", transcript_fa_fname,
transcript_lengths_fname, read_type="monosome") {
### TODO: check whether codon index label matches read_type
transcript_seq <- load_fasta(transcript_fa_fname)
transcript_lengths <- load_lengths(transcript_lengths_fname)
utr5_lengths <- transcript_lengths$utr5_length
names(utr5_lengths) <- transcript_lengths$transcript
num_omit_codons <- ifelse(grepl("E", omit), 2,
ifelse(grepl("P", omit), 1, 0))
dat$transcript <- as.character(dat$transcript)
dat$d5 <- as.numeric(as.character(dat$d5))
dat$d3 <- as.numeric(as.character(dat$d3))
if(read_type=="monosome") {
A_start <- utr5_lengths[dat$transcript] + 1 + 3*(dat$cod_idx-1)
rpf_length <- with(dat, d5+d3+3)
# 1. extract RPF 5' regions
rpf_5_start <- A_start - dat$d5
rpf_5_end <- A_start - 1 - 3*num_omit_codons
rpf_5 <- mapply(substr, transcript_seq[dat$transcript],
rpf_5_start, rpf_5_end)
# 2. extract RPF 3' regions
rpf_3_start <- A_start + 3
rpf_3_end <- A_start + 2 + dat$d3
rpf_3 <- mapply(substr, transcript_seq[dat$transcript],
rpf_3_start, rpf_3_end)
# 3. concatenate regions
rpf_regions <- paste0(rpf_5, rpf_3)
} else {
A_lagging_start <- utr5_lengths[dat$transcript] + 1 +
3*(dat$cod_idx_lagging-1)
A_leading_start <- utr5_lengths[dat$transcript] + 1 +
3*(dat$cod_idx_leading-1)
rpf_length <- with(dat, d5+d3+3*(cod_idx_leading-cod_idx_lagging+1))
# 1. extract region 5' of lagging A site
rpf_5_start <- A_lagging_start - dat$d5
rpf_5_end <- A_lagging_start - 1 - 3*num_omit_codons
rpf_5 <- mapply(substr, transcript_seq[dat$transcript],
rpf_5_start, rpf_5_end)
# 2. extract region between lagging and leading A sites
rpf_inner_start <- A_lagging_start + 3
rpf_inner_end <- A_leading_start - 1 - 3*num_omit_codons
rpf_inner <- mapply(substr, transcript_seq[dat$transcript],
rpf_inner_start, rpf_inner_end)
# 3. extract RPF 3' regions
rpf_3_start <- A_leading_start + 3
rpf_3_end <- A_leading_start + 2 + dat$d3
rpf_3 <- mapply(substr, transcript_seq[dat$transcript],
rpf_3_start, rpf_3_end)
# 4. concatenate regions
rpf_regions <- paste0(rpf_5, rpf_inner, rpf_3)
}
# compute GC content
gc_content <- strsplit(rpf_regions, split="")
gc_content <- sapply(gc_content, function(x) sum(x %in% c("G", "C")))
gc_content <- gc_content / rpf_length
return(gc_content)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.