R/getORF.R

Defines functions getORF

Documented in getORF

# getORF
#' Wrapper function for EMBOSS 'getorf' program
#'
#'
#' @param ntFa A transcriptome reference file
#' @param orfFa An ORF output file generated by getorf.
#'     Default: "getorf reference.fa refORF.fa -find 1 -noreverse"
#' @return orf.df object. A data.frame describing the ORFs found
#' @keywords RNAframework rf-modcall getorf
#' @examples
#' # Run getorf
#' system('getorf fa/reference.fa fa/refORF.fa -find 1 -noreverse')
#' orf.df = getORF(ntFa = 'reference.fa', orfFa = 'reference_orf.fa')
#' @export
getORF = function(ntFa, orfFa){

  # Ensure input ORF file exists
  if ( !file.exists(orfFa) ){
    stop( paste0('Reference ORF file: ',file,' not found'))
  }

  # Read the orf.fasta file as a vector of strings
  lines   <- readLines(orfFa)
  headerIndex <- which(substr(lines, 1, 1) == ">")

  # Check that at least one entry exists in ORF fasta
  if (length(headerIndex) == 0){
    stop("No header lines found in the ORF Fasta File")
  }

  # Full header format
  # ">GENENAME_ORFID [START - END] "
  #
  # Start === NNNNNN_A_TGNNNN
  # End  === NN_N_TGANNNNNNN
  headers <- lines[headerIndex]

  parseHeader = function(header){
    gene  <- strsplit(header, split = ' ')[[1]][1]
    orfid <- strsplit(gene,   split = "_")[[1]][2]
    gene  <- strsplit(gene,   split = "_")[[1]][1]
    gene  <- gsub('>','', gene)

    start <- strsplit(header, split = ' ')[[1]][2]
    start <- gsub('\\[','',start)

    end   <- strsplit(header, split = ' ')[[1]][4]
    end   <- gsub(']','',end)

    return( c(gene, orfid, start, end) )
  }

  orf.df <- do.call(rbind, lapply(headers, parseHeader))

  # Wrangle into a data.frame
  orf.df <- data.frame(orf.df)
    colnames(orf.df) <- c('gene','orf','start','end')
    orf.df$gene <- factor(orf.df$gene)
    orf.df$orf  <- factor(orf.df$orf)
    orf.df$start <- as.numeric(as.character(orf.df$start))
    orf.df$end   <- as.numeric(as.character(orf.df$end)) + 1 # correct coordinate system

  # Annotate length in nt
  orf.df$length <- orf.df$end - orf.df$start

  # Annotate the longest ORF per gene
  # returns binary vector (longest = T)
  # for each entry in orf.df
  longestORF <- function( orf.df ){

    # name rows by number
    rownames(orf.df) = as.character(1:length(orf.df[ ,1]))

    # sort orf.df by ORF size
    orf.df <- orf.df[ order(orf.df$length, decreasing = T), ]

    # highest gene entry is longest
    longestEntry <- which(!duplicated(orf.df$gene))

    return( as.numeric(rownames(orf.df)[longestEntry]) )

  }

  longest = longestORF( orf.df )

  orf.df$primary          <- F
  orf.df$primary[longest] <- T

  # TODO: Return
  # https://r-forge.r-project.org/scm/viewvc.php/pkg/R/read.fasta.R?view=markup&root=seqinr
  # Read AA ORF data

  return(orf.df)

}
ababaian/RRNAframework documentation built on March 18, 2020, 1:29 a.m.