# R/read.orfs.R

# Defines functions read.orfs

# Documented in read.orfs

#' @title Read output of \code{ORFpred}
#' @description This function reads the output of the \code{\link{ORFpred}} function and stores the 
#' sequence id and number of predicted ORFs in a \code{\link{data.frame}} object.
#' @param input.file path (a single character string) to the fasta file generated by \code{\link{ORFpred}}.
#' @author Hajk-Georg Drost
#' @details The file generated by \code{\link{ORFpred}} is parsed by this function
#' and returned as \code{\link{data.frame}} object. The sequence id of a fasta
#' record is the part of its header that precedes the first \code{|}; the number
#' of records sharing the same sequence id is reported in \code{orfs}.
#' @examples 
#' # read an example prediction file generated by ORFpred()
#' ORFPred <- read.orfs(system.file("nt.fa",package = "LTRpred"))
#' 
#' head(ORFPred)
#' @seealso \code{\link{ORFpred}}, \code{\link{LTRpred}}
#' @return 
#' A \code{\link{data.frame}} (tibble) with one row per sequence id and the columns
#' \code{seq.id} (character) and \code{orfs} (numeric; number of predicted ORFs
#' found for that sequence id). If \code{input.file} contains no sequences, a
#' zero-row tibble with the same two columns is returned and a message is emitted.
#' @export
read.orfs <- function(input.file){
  
  # Fail early with a readable message instead of a cryptic `if` condition
  # error when input.file is NULL, a vector of paths, or not a string.
  if (!is.character(input.file) || length(input.file) != 1L) {
    stop("Please provide a single file path (character string) to input.file for read.orfs() ...", call. = FALSE)
  }
  
  if (!file.exists(input.file)) {
    stop("The file '", input.file, "' does not seem to exist. Please provide a valid file path to input.file for read.orfs() ...", call. = FALSE)
  }
  
  ReadSeqFile <- Biostrings::readDNAStringSet(input.file)
  
  if (length(ReadSeqFile) == 0) {
    message("The ORF prediction file was empty ... Therefore, no ORFs are added to the result table.")
    # Keep the same column layout as the non-empty result so that callers
    # can select or join on 'seq.id' / 'orfs' without special-casing.
    return(dplyr::tibble(seq.id = character(0),
                         orfs = numeric(0)))
  }
  
  # Use the public names() accessor rather than reaching into S4 slots.
  fasta.headers <- names(ReadSeqFile)
  
  # The sequence id is everything before the first '|' of each header;
  # vapply() guarantees a character vector regardless of input length.
  seq.ids <- vapply(stringr::str_split(fasta.headers, "[|]"),
                    function(parts) parts[1],
                    character(1))
  
  # Count how many records share each sequence id.
  SeqFile.table <- table(seq.ids)
  
  dplyr::tibble(seq.id = names(SeqFile.table),
                orfs = as.numeric(SeqFile.table))
}
# HajkD/LTRpred documentation built on April 22, 2022, 4:35 p.m.