# getORF
#' Wrapper function for EMBOSS 'getorf' program
#'
#'
#' @param ntFa A transcriptome reference file
#' @param orfFa An ORF output file generated by getorf.
#' Default: "getorf reference.fa refORF.fa -find 1 -noreverse"
#' @return orf.df object. A data.frame describing the ORFs found
#' @keywords RNAframework rf-modcall getorf
#' @examples
#' # Run getorf
#' system('getorf fa/reference.fa fa/refORF.fa -find 1 -noreverse')
#' orf.df = getORF(ntFa = 'reference.fa', orfFa = 'reference_orf.fa')
#' @export
getORF = function(ntFa, orfFa){
# Ensure input ORF file exists
if ( !file.exists(orfFa) ){
stop( paste0('Reference ORF file: ',file,' not found'))
}
# Read the orf.fasta file as a vector of strings
lines <- readLines(orfFa)
headerIndex <- which(substr(lines, 1, 1) == ">")
# Check that at least one entry exists in ORF fasta
if (length(headerIndex) == 0){
stop("No header lines found in the ORF Fasta File")
}
# Full header format
# ">GENENAME_ORFID [START - END] "
#
# Start === NNNNNN_A_TGNNNN
# End === NN_N_TGANNNNNNN
headers <- lines[headerIndex]
parseHeader = function(header){
gene <- strsplit(header, split = ' ')[[1]][1]
orfid <- strsplit(gene, split = "_")[[1]][2]
gene <- strsplit(gene, split = "_")[[1]][1]
gene <- gsub('>','', gene)
start <- strsplit(header, split = ' ')[[1]][2]
start <- gsub('\\[','',start)
end <- strsplit(header, split = ' ')[[1]][4]
end <- gsub(']','',end)
return( c(gene, orfid, start, end) )
}
orf.df <- do.call(rbind, lapply(headers, parseHeader))
# Wrangle into a data.frame
orf.df <- data.frame(orf.df)
colnames(orf.df) <- c('gene','orf','start','end')
orf.df$gene <- factor(orf.df$gene)
orf.df$orf <- factor(orf.df$orf)
orf.df$start <- as.numeric(as.character(orf.df$start))
orf.df$end <- as.numeric(as.character(orf.df$end)) + 1 # correct coordinate system
# Annotate length in nt
orf.df$length <- orf.df$end - orf.df$start
# Annotate the longest ORF per gene
# returns binary vector (longest = T)
# for each entry in orf.df
longestORF <- function( orf.df ){
# name rows by number
rownames(orf.df) = as.character(1:length(orf.df[ ,1]))
# sort orf.df by ORF size
orf.df <- orf.df[ order(orf.df$length, decreasing = T), ]
# highest gene entry is longest
longestEntry <- which(!duplicated(orf.df$gene))
return( as.numeric(rownames(orf.df)[longestEntry]) )
}
longest = longestORF( orf.df )
orf.df$primary <- F
orf.df$primary[longest] <- T
# TODO: Return
# https://r-forge.r-project.org/scm/viewvc.php/pkg/R/read.fasta.R?view=markup&root=seqinr
# Read AA ORF data
return(orf.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.