R/read.prediction.R

Defines functions read.prediction

Documented in read.prediction

#' @title Import the output of LTRharvest or LTRdigest
#' @description This function imports the output files generated by \code{\link{LTRharvest}} or \code{\link{LTRdigest}}.
#' @param gff.file path to GFF file generated by either \code{\link{LTRharvest}}, \code{\link{LTRdigest}},
#' or \code{\link{LTRpred}}.
#' @param tabout.file path to \code{*_tabout.csv} file generated by either \code{\link{LTRdigest}} or
#' or \code{\link{LTRpred}}.
#' @param program prediction program used to generate the corresponding GFF file. 
#' Either \code{program = "LTRharvest"}, \code{program = "LTRdigest"}, or \code{program = "LTRpred"}.
#' @param ltr.fasta fasta file generated by either \code{\link{LTRdigest}} or
#' or \code{\link{LTRpred}} storing the sequences of the predicted LTR retrotransposons (e.g. \code{*-ltrdigest_complete.fas}).
#' @param inner.seq.fasta path to the \code{*_BetweenLTRSeqs.fsa} fasta file returned by \code{\link{LTRharvest}} in caste \code{program = "LTRharvest"} is specified.
#' @param data the \code{\link{data.frame}} returned by either \code{\link{LTRharvest}}, \code{\link{LTRdigest}},
#' or \code{\link{LTRpred}} that can be used to modify for example \code{similarity.bin} and \code{min.sim} within this table.
#' @param similarity.bin resolution of similarity binning. E.g. binning 98\%-100\% into 0.5\% intervals would be \code{similarity.bin = 0.5}.
#' @param min.sim minimum similarity between LTRs that can shall be considered for visualization.
#' @author Hajk-Georg Drost
#' @examples 
#' \dontrun{
#' gff.file <- system.file("TAIR10_chr_all_LTRdigestPrediction.gff", package = "LTRpred")
#' tabout.file <- system.file("TAIR10_chr_all-ltrdigest_tabout.csv" ,package = "LTRpred")
#' LTRfile <- read.prediction(gff.file,tabout.file, program = "LTRdigest")
#' 
#' # look at results
#' str(LTRfile$ltr.retrotransposon)
#' 
#' }
#' @export

read.prediction <- function( gff.file        = NULL,
                             tabout.file,
                             program         = "LTRdigest",
                             ltr.fasta       = NULL,
                             inner.seq.fasta = NULL,
                             data            = NULL,
                             similarity.bin  = 2,
                             min.sim         = NULL){
    
    if (!is.null(gff.file) &
        !is.null(data))
        stop(
            "Please only provide either a stored dataset or the path to the gff file that shall be imported."
        )
    
    if (!is.element(program, c("LTRharvest", "LTRdigest")))
        stop("Please choose a prediction returned by either LTRharvest or LTRdigest.")
    
    if (!file.exists(tabout.file))
        stop("The file '",tabout.file,"' does not exist! Please check the correct path or the correct output of LTRharvest() or LTRdigest().", call. = FALSE)
    
    if (!is.null(gff.file)) {
        if (!file.exists(gff.file))
            stop("The file '", gff.file ,"' does not exist! Please check the correct path or the correct output of LTRharvest() or LTRdigest().", call. = FALSE)
        
        # test if prediction gff file is empty
        if (file.info(gff.file)$size == 0 ||
            is.na(file.info(gff.file)$size == 0)) {
            cat("File ",
                gff.file,
                " is empty and therefore is not being processed.")
            return (NULL)
        }
    }
  
    X4 <- X5 <- X9 <- annotation <- ltr_similarity <- chromosome <- NULL
    
    if (program == "LTRharvest") {
        
        if (!is.null(data))
            AnnotationFile <- data
        
        # determine the width of the predicted intervals
        AnnotationFile <-
            dplyr::mutate(AnnotationFile, width = (X5 - X4) + 1)
        colnames(AnnotationFile) <-
            c(
                "chromosome",
                "pred_tool",
                "annotation",
                "start",
                "end",
                "score",
                "strand",
                "frame",
                "X9",
                "width"
            )
        
        ### Post-Processing of repeat_region
        FilteredAnnotationFile.repeat_region <-
            dplyr::filter(AnnotationFile, annotation == "repeat_region")
        
        if (nrow(FilteredAnnotationFile.repeat_region) > 0) {
            # Extract ID feature in column X9 (predicted by LTRharvest for repeat_region)
            RepeatRegionID <-
                vector("character",
                       nrow(FilteredAnnotationFile.repeat_region))
            RepeatRegionID <-
                sapply(seq_len(nrow(
                    FilteredAnnotationFile.repeat_region
                )), function(location) {
                    unlist(stringr::str_split(FilteredAnnotationFile.repeat_region[location, "X9"], "="))[2]
                    
                })
            
            FilteredAnnotationFile.repeat_region <-
                dplyr::mutate(FilteredAnnotationFile.repeat_region,
                              repeat_region = RepeatRegionID)
            FilteredAnnotationFile.repeat_region <-
                dplyr::select(FilteredAnnotationFile.repeat_region, -X9)
        }
        
        cat("(1/5) Filtering for repeat regions has been finished.")
        cat("\n")
        
        ### Post-Processing of LTR_retrotransposons
        FilteredAnnotationFile.LTR_retrotransposon <-
            dplyr::filter(AnnotationFile, annotation == "LTR_retrotransposon")
        
        if (nrow(FilteredAnnotationFile.LTR_retrotransposon) > 0) {
            # Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
            LTR_retrotransposonPredictionFeatures <-
                as.data.frame(t(sapply(seq_len(nrow(
                    FilteredAnnotationFile.LTR_retrotransposon
                )),
                function(location)
                    unlist(lapply(stringr::str_split(unlist(
                        stringr::str_split(FilteredAnnotationFile.LTR_retrotransposon[location , "X9"], ";")
                    ), "="), function(x)
                        x[2])))),
                colClasses = c(rep("character", 2), rep("numeric", 2)))
            
            LTR_retrotransposonPredictionFeatures[, 3] <-
                as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 3]))
            LTR_retrotransposonPredictionFeatures[, 4] <-
                as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 4]))
            
            colnames(LTR_retrotransposonPredictionFeatures) <-
                c("ID", "repeat_region", "ltr_similarity", "seq_number")
            LTR_retrotransposonPredictionFeatures <-
                cbind(
                    FilteredAnnotationFile.LTR_retrotransposon[, -9],
                    LTR_retrotransposonPredictionFeatures
                )
            
            
            #     for (i in seq_len(length(table(LTR_retrotransposonPredictionFeatures[ , "chromosome"])))){
            #         if (i %in% 1:10)
            #             LTR_retrotransposonPredictionFeatures[which(LTR_retrotransposonPredictionFeatures[ , "chromosome"] == as.character(i)), "chromosome"] <- paste0("Chr",i)
            #     }
            
            min.similarity <- vector("numeric", 1)
            max.similarity <- vector("numeric", 1)
            
            if (is.null(min.sim))
                min.similarity <-
                floor(min(LTR_retrotransposonPredictionFeatures[, "ltr_similarity"]))
            if (!is.null(min.sim))
                min.similarity <- min.sim
            
            #     if ((min.similarity > min.sim) | (max.sim > max.similarity))
            #         stop ("Please enter a similarity interval between [0,100] that is present in the dataset!")
            #    evenCheck <- (100 - min.similarity) %% similarity.bin
            LTR_retrotransposonPredictionFeatures <-
                dplyr::mutate(
                    LTR_retrotransposonPredictionFeatures,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
            
            #LTR_retrotransposonPredictionFeatures[ , "chromosome"] <- factor(LTR_retrotransposonPredictionFeatures[ , "chromosome"], ordered = TRUE, levels = paste0("Chr",1:10))
            FilteredAnnotationFile.repeat_region <-
                suppressWarnings(
                    dplyr::right_join(
                        FilteredAnnotationFile.repeat_region,
                        LTR_retrotransposonPredictionFeatures[, c("repeat_region", "ltr_similarity")],
                        by = "repeat_region"
                    )
                )
            FilteredAnnotationFile.repeat_region <-
                dplyr::mutate(
                    FilteredAnnotationFile.repeat_region,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
        }
        
        
        # rename chromosomes
        ChrNameSplitFunc <-
            function(x)
                as.character(unlist(sapply(x, function(y)
                    unlist(stringr::str_split(y, "_"))[1])))
        
        FilteredAnnotationFile.repeat_region <-
            dplyr::mutate(
                FilteredAnnotationFile.repeat_region,
                chromosome_ltrharvest = chromosome)
        FilteredAnnotationFile.repeat_region <-
            dplyr::mutate(
                FilteredAnnotationFile.repeat_region,
                chromosome = ChrNameSplitFunc(FilteredAnnotationFile.repeat_region$chromosome)
            )
        
        cat("(2/5) Filtering for LTR retrotransposons has been finished.")
        cat("\n")
        
        ### Post-Processing of inverted_repeat
        FilteredAnnotationFile.inverted_repeat <-
            dplyr::filter(AnnotationFile, annotation == "inverted_repeat")
        
        if (nrow(FilteredAnnotationFile.inverted_repeat) > 0) {
            # Extract Parent feature in column X9 (predicted by LTRharvest for inverted_repeat)
            InvertedRepeatParentID <-
                vector("character",
                       nrow(FilteredAnnotationFile.inverted_repeat))
            InvertedRepeatParentID <-
                sapply(seq_len(nrow(
                    FilteredAnnotationFile.inverted_repeat
                )), function(location) {
                    unlist(stringr::str_split(FilteredAnnotationFile.inverted_repeat[location, "X9"], "="))[2]
                    
                })
            
            FilteredAnnotationFile.inverted_repeat <-
                dplyr::mutate(FilteredAnnotationFile.inverted_repeat,
                              repeat_region = InvertedRepeatParentID)
            FilteredAnnotationFile.inverted_repeat <-
                dplyr::select(FilteredAnnotationFile.inverted_repeat, -X9)
        }
        
        cat("(3/5) Filtering for inverted repeats has been finished.")
        cat("\n")
        
        ### Post-Processing of long_terminal_repeat
        FilteredAnnotationFile.long_terminal_repeat <-
            dplyr::filter(AnnotationFile, annotation == "long_terminal_repeat")
        
        if (nrow(FilteredAnnotationFile.long_terminal_repeat) > 0) {
            # Extract Parent feature in column X9 (predicted by LTRharvest for long_terminal_repeat)
            
            LTRParentID <-
                vector("character",
                       nrow(FilteredAnnotationFile.long_terminal_repeat))
            LTRParentID <-
                sapply(seq_len(nrow(
                    FilteredAnnotationFile.long_terminal_repeat
                )), function(location) {
                    unlist(
                        stringr::str_split(
                            FilteredAnnotationFile.long_terminal_repeat[location, "X9"],
                            "="
                        )
                    )[2]
                    
                })
            
            FilteredAnnotationFile.long_terminal_repeat <-
                dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ID = LTRParentID)
            FilteredAnnotationFile.long_terminal_repeat <-
                dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -X9)
            
            FilteredAnnotationFile.long_terminal_repeat <-
                suppressWarnings(
                    dplyr::right_join(
                        FilteredAnnotationFile.long_terminal_repeat,
                        LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
                        by = "ID"
                    )
                )
            FilteredAnnotationFile.long_terminal_repeat <-
                dplyr::mutate(
                    FilteredAnnotationFile.long_terminal_repeat,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
            
            # FilteredAnnotationFile.long_terminal_repeat$ID <- factor(FilteredAnnotationFile.long_terminal_repeat$ID, levels=unique(FilteredAnnotationFile.long_terminal_repeat$ID))
            # FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ltr_order = do.call(rbind,lapply(split(FilteredAnnotationFile.long_terminal_repeat,FilteredAnnotationFile.long_terminal_repeat$ID), function(x) if (x[1, "end"] < x[2, "start"]) return (rbind("left","right")) else return(rbind("right","left")))))
            
        }
        
        cat("(4/5) Filtering for LTRs has been finished.")
        cat("\n")
        
        ### Post-Processing of target_site_duplication
        FilteredAnnotationFile.target_site_duplication <-
            dplyr::filter(AnnotationFile, annotation == "target_site_duplication")
        
        if (nrow(FilteredAnnotationFile.target_site_duplication) > 0) {
            # Extract Parent feature in column X9 (predicted by LTRharvest for target_site_duplication)
            TSDParentID <-
                vector("character",
                       nrow(FilteredAnnotationFile.target_site_duplication))
            TSDParentID <-
                sapply(seq_len(
                    nrow(FilteredAnnotationFile.target_site_duplication)
                ), function(location) {
                    unlist(
                        stringr::str_split(
                            FilteredAnnotationFile.target_site_duplication[location, "X9"],
                            "="
                        )
                    )[2]
                    
                })
            
            FilteredAnnotationFile.target_site_duplication <-
                dplyr::mutate(FilteredAnnotationFile.target_site_duplication,
                              repeat_region = TSDParentID)
            FilteredAnnotationFile.target_site_duplication <-
                dplyr::select(FilteredAnnotationFile.target_site_duplication, -X9)
            
        }
        
        cat("(5/5) Filtering for target site duplication has been finished.")
        cat("\n")
        # rename the chromosome number
        #         FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, seq_number =  chromosome)
        #         FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, chromosome =  sequence)
        #         FilteredAnnotationFile.repeat_region <- dplyr::select(FilteredAnnotationFile.repeat_region, -sequence)
        #
        
        #         LTR_retrotransposonPredictionFeatures <- dplyr::mutate(LTR_retrotransposonPredictionFeatures, seq_number =  chromosome)
        #         LTR_retrotransposonPredictionFeatures <- dplyr::mutate(LTR_retrotransposonPredictionFeatures, chromosome =  sequence)
        #         LTR_retrotransposonPredictionFeatures <- dplyr::select(LTR_retrotransposonPredictionFeatures, -sequence)
        
        
        #         FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, seq_number =  chromosome)
        #         FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, chromosome =  sequence)
        #         FilteredAnnotationFile.long_terminal_repeat <- dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -sequence)
        #
        #
        #         FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, seq_number =  chromosome)
        #         FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, chromosome =  sequence)
        #         FilteredAnnotationFile.inverted_repeat <- dplyr::select(FilteredAnnotationFile.inverted_repeat, -sequence)
        #
        #
        #         FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, seq_number =  chromosome)
        #         FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, chromosome =  sequence)
        #         FilteredAnnotationFile.target_site_duplication <- dplyr::select(FilteredAnnotationFile.target_site_duplication, -sequence)
        #
        if (nrow(LTR_retrotransposonPredictionFeatures) > 0) {
            # replace strand information "?" by "." due to gff file format standard
            FindQ <-
                which(LTR_retrotransposonPredictionFeatures$strand == "?")
            LTR_retrotransposonPredictionFeatures$strand[FindQ] <- "."
        }
        
        return(
            list(
                repeat.region           = FilteredAnnotationFile.repeat_region,
                ltr.retrotransposon     = LTR_retrotransposonPredictionFeatures,
                ltr                     = FilteredAnnotationFile.long_terminal_repeat,
                inverted_repeat         = FilteredAnnotationFile.inverted_repeat,
                target.site.duplication = FilteredAnnotationFile.target_site_duplication
            )
        )
        
    }
    
    
    if (program == "LTRdigest") {
      
        if (!file.exists(gff.file)) {
          message(
            "The file '",
            gff.file ,
            "' does not exist! Please check the correct path or the correct output of LTRharvest() or LTRdigest(). This organism has been omitted!",
            call. = FALSE
          )
          return(NA)
        }
          
        # read gff file content: comment = "###"
        AnnotationFile <-
            readr::read_tsv(gff.file,
                            col_names = FALSE,
                            col_types = readr::cols(
                            "X1" = readr::col_character(),    
                            "X2" = readr::col_character(),
                            "X3" = readr::col_character(),
                            "X4" = readr::col_integer(),
                            "X5" = readr::col_integer(),
                            "X6" = readr::col_character(),
                            "X7" = readr::col_character(),
                            "X8" = readr::col_character(),
                            "X9" = readr::col_character()
                            ),
                            skip = 0,
                            comment = "#")
        cat("\n")
        cat("Input: ", gff.file, " -> Row Number: ", nrow(AnnotationFile))
        cat("\n")
        AnnotationFile <- stats::na.omit(AnnotationFile)
        cat("Remove 'NA' -> New Row Number: ", nrow(AnnotationFile))
        cat("\n")
        
        
        if (!is.null(data))
            AnnotationFile <- data
        
        if (nrow(AnnotationFile) > 0) {
        
        # determine the width of the predicted intervals
        AnnotationFile <-
            dplyr::mutate(AnnotationFile, width = (X5 - X4) + 1)
        colnames(AnnotationFile) <-
            c(
                "chromosome",
                "pred_tool",
                "annotation",
                "start",
                "end",
                "score",
                "strand",
                "frame",
                "X9",
                "width"
            )
        
        ### Post-Processing of repeat_region
        FilteredAnnotationFile.repeat_region <-
            dplyr::filter(AnnotationFile, annotation == "repeat_region")
        
        if (nrow(FilteredAnnotationFile.repeat_region) > 0) {
            # Extract ID feature in column X9 (predicted by LTRharvest for repeat_region)
            RepeatRegionID <-
                vector("character",
                       nrow(FilteredAnnotationFile.repeat_region))
            RepeatRegionID <-
                sapply(seq_len(nrow(
                    FilteredAnnotationFile.repeat_region
                )), function(location) {
                    unlist(stringr::str_split(FilteredAnnotationFile.repeat_region[location, "X9"], "="))[2]
                    
                })
            
            FilteredAnnotationFile.repeat_region <-
                dplyr::mutate(FilteredAnnotationFile.repeat_region,
                              repeat_region = RepeatRegionID)
            FilteredAnnotationFile.repeat_region <-
                dplyr::select(FilteredAnnotationFile.repeat_region, -X9)
            
        }
        
        cat("(1/8) Filtering for repeat regions has been finished.")
        cat("\n")
        
        ### Post-Processing of LTR_retrotransposons
        FilteredAnnotationFile.LTR_retrotransposon <-
            dplyr::filter(AnnotationFile, annotation == "LTR_retrotransposon")
        
        if (nrow(FilteredAnnotationFile.LTR_retrotransposon) > 0) {
            # Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
            LTR_retrotransposonPredictionFeatures <-
                as.data.frame(t(sapply(seq_len(nrow(
                    FilteredAnnotationFile.LTR_retrotransposon
                )),
                function(location)
                    unlist(lapply(stringr::str_split(unlist(
                        stringr::str_split(FilteredAnnotationFile.LTR_retrotransposon[location , "X9"], ";")
                    ), "="), function(x)
                        x[2])))),
                colClasses = c(rep("character", 2), rep("numeric", 2)))
            
            LTR_retrotransposonPredictionFeatures[, 3] <-
                as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 3]))
            LTR_retrotransposonPredictionFeatures[, 4] <-
                as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 4]))
            
            colnames(LTR_retrotransposonPredictionFeatures) <-
                c("ID", "repeat_region", "ltr_similarity", "seq_number")
            LTR_retrotransposonPredictionFeatures <-
                cbind(
                    FilteredAnnotationFile.LTR_retrotransposon[, -9],
                    LTR_retrotransposonPredictionFeatures
                )
            
            
            #     for (i in seq_len(length(table(LTR_retrotransposonPredictionFeatures[ , "chromosome"])))){
            #         if (i %in% 1:10)
            #             LTR_retrotransposonPredictionFeatures[which(LTR_retrotransposonPredictionFeatures[ , "chromosome"] == as.character(i)), "chromosome"] <- paste0("Chr",i)
            #     }
            
            min.similarity <- vector("numeric", 1)
            max.similarity <- vector("numeric", 1)
            
            if (is.null(min.sim))
                min.similarity <-
                floor(min(LTR_retrotransposonPredictionFeatures[, "ltr_similarity"]))
            if (!is.null(min.sim))
                min.similarity <- min.sim
            
            #     if ((min.similarity > min.sim) | (max.sim > max.similarity))
            #         stop ("Please enter a similarity interval between [0,100] that is present in the dataset!")
            #    evenCheck <- (100 - min.similarity) %% similarity.bin
            LTR_retrotransposonPredictionFeatures <-
                dplyr::mutate(
                    LTR_retrotransposonPredictionFeatures,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
            
            #LTR_retrotransposonPredictionFeatures[ , "chromosome"] <- factor(LTR_retrotransposonPredictionFeatures[ , "chromosome"], ordered = TRUE, levels = paste0("Chr",1:10))
            
            FilteredAnnotationFile.repeat_region <-
                suppressWarnings(
                    dplyr::right_join(
                        FilteredAnnotationFile.repeat_region,
                        LTR_retrotransposonPredictionFeatures[, c("repeat_region", "ltr_similarity")],
                        by = "repeat_region"
                    )
                )
            FilteredAnnotationFile.repeat_region <-
                dplyr::mutate(
                    FilteredAnnotationFile.repeat_region,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
            
            LTRdigest.tabout.file <- read.tabout(tabout.file)
            
            if (nrow(LTRdigest.tabout.file) > 2) {
                LTR_retrotransposonPredictionFeatures <-
                    dplyr::inner_join(
                        LTR_retrotransposonPredictionFeatures,
                        LTRdigest.tabout.file,
                        by = c("start" = "element_start", "end" = "element_end")
                    )
            } else {
                warning("The tabout file '",tabout.file,"' could not be processed, because it is empty and thus has been omitted.")
            }
        }
        
        cat("(2/8) Filtering for LTR retrotransposons has been finished.")
        cat("\n")
        
        ### Post-Processing of inverted_repeat
        FilteredAnnotationFile.inverted_repeat <-
            dplyr::filter(AnnotationFile, annotation == "inverted_repeat")
        
        if (nrow(FilteredAnnotationFile.inverted_repeat) > 0) {
            # Extract Parent feature in column X9 (predicted by LTRharvest for inverted_repeat)
            InvertedRepeatParentID <-
                vector("character",
                       nrow(FilteredAnnotationFile.inverted_repeat))
            InvertedRepeatParentID <-
                sapply(seq_len(nrow(
                    FilteredAnnotationFile.inverted_repeat
                )), function(location) {
                    unlist(stringr::str_split(FilteredAnnotationFile.inverted_repeat[location, "X9"], "="))[2]
                    
                })
            
            FilteredAnnotationFile.inverted_repeat <-
                dplyr::mutate(FilteredAnnotationFile.inverted_repeat,
                              repeat_region = InvertedRepeatParentID)
            FilteredAnnotationFile.inverted_repeat <-
                dplyr::select(FilteredAnnotationFile.inverted_repeat, -X9)
        }
        
        cat("(3/8) Filtering for inverted repeats has been finished.")
        cat("\n")
        
        ### Post-Processing of long_terminal_repeat
        FilteredAnnotationFile.long_terminal_repeat <-
            dplyr::filter(AnnotationFile, annotation == "long_terminal_repeat")
        # Extract Parent feature in column X9 (predicted by LTRharvest for long_terminal_repeat)
        
        if (nrow(FilteredAnnotationFile.long_terminal_repeat) > 0) {
            LTRParentID <-
                vector("character",
                       nrow(FilteredAnnotationFile.long_terminal_repeat))
            LTRParentID <-
                sapply(seq_len(nrow(
                    FilteredAnnotationFile.long_terminal_repeat
                )), function(location) {
                    unlist(
                        stringr::str_split(
                            FilteredAnnotationFile.long_terminal_repeat[location, "X9"],
                            "="
                        )
                    )[2]
                    
                })
            
            FilteredAnnotationFile.long_terminal_repeat <-
                dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ID = LTRParentID)
            FilteredAnnotationFile.long_terminal_repeat <-
                dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -X9)
            
            FilteredAnnotationFile.long_terminal_repeat <-
                suppressWarnings(
                    dplyr::inner_join(
                        FilteredAnnotationFile.long_terminal_repeat,
                        LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
                        by = "ID"
                    )
                )
            FilteredAnnotationFile.long_terminal_repeat <-
                dplyr::mutate(
                    FilteredAnnotationFile.long_terminal_repeat,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
            
            # FilteredAnnotationFile.long_terminal_repeat$ID <- factor(FilteredAnnotationFile.long_terminal_repeat$ID, levels = unique(FilteredAnnotationFile.long_terminal_repeat$ID))
            #   print(as.data.frame(FilteredAnnotationFile.long_terminal_repeat))
            #
            #   print(do.call(rbind,lapply(split(FilteredAnnotationFile.long_terminal_repeat,FilteredAnnotationFile.long_terminal_repeat$ID), function(x){
            #
            #     if (x[1 , "end"] < x[2 , "start"]){
            #       return (rbind("left","right"))
            #     } else {
            #       return(rbind("right","left"))
            #     }
            #
            #   })))
            #
            #   FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ltr_order = do.call(rbind,lapply(split(FilteredAnnotationFile.long_terminal_repeat,FilteredAnnotationFile.long_terminal_repeat$ID), function(x){
            #
            #     if (x[1 , "end"] < x[2 , "start"]){
            #       return (rbind("left","right"))
            #     } else {
            #       return(rbind("right","left"))
            #     }
            #
            #   })))
            #
        }
        
        cat("(4/8) Filtering for LTRs has been finished.")
        cat("\n")
        
        ### Post-Processing of target_site_duplication
        FilteredAnnotationFile.target_site_duplication <-
            dplyr::filter(AnnotationFile, annotation == "target_site_duplication")
        # Extract Parent feature in column X9 (predicted by LTRharvest for target_site_duplication)
        
        if (nrow(FilteredAnnotationFile.target_site_duplication) > 0) {
            TSDParentID <-
                vector("character",
                       nrow(FilteredAnnotationFile.target_site_duplication))
            TSDParentID <-
                sapply(seq_len(
                    nrow(FilteredAnnotationFile.target_site_duplication)
                ), function(location) {
                    unlist(
                        stringr::str_split(
                            FilteredAnnotationFile.target_site_duplication[location, "X9"],
                            "="
                        )
                    )[2]
                    
                })
            
            FilteredAnnotationFile.target_site_duplication <-
                dplyr::mutate(FilteredAnnotationFile.target_site_duplication,
                              repeat_region = TSDParentID)
            FilteredAnnotationFile.target_site_duplication <-
                dplyr::select(FilteredAnnotationFile.target_site_duplication, -X9)
        }
        
        cat("(5/8) Filtering for target site duplication has been finished.")
        cat("\n")
        
        
        
        ### Post-Processing of primer_binding_site
        FilteredAnnotationFile.primer_binding_site <-
            dplyr::filter(AnnotationFile, annotation == "primer_binding_site")
        
        if (nrow(FilteredAnnotationFile.primer_binding_site) > 0) {
            # Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
            PBSPredictionFeatures <-
                as.data.frame(t(sapply(seq_len(nrow(
                    FilteredAnnotationFile.primer_binding_site
                )),
                function(location)
                    unlist(lapply(stringr::str_split(unlist(
                        stringr::str_split(FilteredAnnotationFile.primer_binding_site[location , "X9"], ";")
                    ), "="), function(x)
                        x[2])))),
                colClasses = c(rep("character", 2), rep("numeric", 3)))
            
            PBSPredictionFeatures[, 3] <-
                as.numeric(as.vector(PBSPredictionFeatures[, 3]))
            PBSPredictionFeatures[, 4] <-
                as.numeric(as.vector(PBSPredictionFeatures[, 4]))
            PBSPredictionFeatures[, 5] <-
                as.numeric(as.vector(PBSPredictionFeatures[, 5]))
            
            colnames(PBSPredictionFeatures) <-
                c("ID", "trna", "trnaoffset", "pbsoffset", "edist")
            FilteredAnnotationFile.primer_binding_site <-
                cbind(FilteredAnnotationFile.primer_binding_site[, -9],
                      PBSPredictionFeatures)
            
            FilteredAnnotationFile.primer_binding_site <-
                suppressWarnings(
                    dplyr::right_join(
                        FilteredAnnotationFile.primer_binding_site,
                        LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
                        by = "ID"
                    )
                )
            FilteredAnnotationFile.primer_binding_site <-
                dplyr::mutate(
                    FilteredAnnotationFile.primer_binding_site,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
            
        }
        
        cat("(6/8) Filtering for primer binding site has been finished.")
        cat("\n")
        
        
        ### Post-Processing of protein_match
        FilteredAnnotationFile.protein_match <-
            dplyr::filter(AnnotationFile, annotation == "protein_match")
        
        if (nrow(FilteredAnnotationFile.protein_match) > 0) {
            # Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
            ProteinMatchPredictionFeatures <-
                as.data.frame(t(sapply(seq_len(nrow(
                    FilteredAnnotationFile.protein_match
                )),
                function(location)
                    unlist(lapply(stringr::str_split(unlist(
                        stringr::str_split(FilteredAnnotationFile.protein_match[location , "X9"], ";")
                    ), "="), function(x)
                        x[2])))),
                colClasses = c("character", "numeric", "character"))
            
            ProteinMatchPredictionFeatures[, 2] <-
                as.numeric(as.vector(ProteinMatchPredictionFeatures[, 2]))
            
            
            colnames(ProteinMatchPredictionFeatures) <-
                c("ID", "reading_frame", "name")
            FilteredAnnotationFile.protein_match <-
                cbind(FilteredAnnotationFile.protein_match[, -9],
                      ProteinMatchPredictionFeatures)
            
            FilteredAnnotationFile.protein_match <-
                suppressWarnings(
                    dplyr::right_join(
                        FilteredAnnotationFile.protein_match,
                        LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity", "width")],
                        by = "ID"
                    )
                )
            FilteredAnnotationFile.protein_match <-
                dplyr::mutate(
                    FilteredAnnotationFile.protein_match,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
            colnames(FilteredAnnotationFile.protein_match)[9] <-
                "match_width"
            colnames(FilteredAnnotationFile.protein_match)[14] <-
                "width"
        }
        
        cat("(7/8) Filtering for protein match has been finished.")
        cat("\n")
        
        ### Post-Processing of RR_tract
        FilteredAnnotationFile.RR_tract <-
            dplyr::filter(AnnotationFile, annotation == "RR_tract")
        
        if (nrow(FilteredAnnotationFile.RR_tract) > 0) {
            # Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
            RRtractPredictionFeatures <-
                as.data.frame(sapply(seq_len(nrow(
                    FilteredAnnotationFile.RR_tract
                )),
                function(location)
                    unlist(
                        stringr::str_split(FilteredAnnotationFile.RR_tract[location , "X9"], "=")
                    )[2]),
                colClasses = "character")
            
            colnames(RRtractPredictionFeatures) <- "ID"
            FilteredAnnotationFile.RR_tract <-
                cbind(FilteredAnnotationFile.RR_tract[, -9],
                      RRtractPredictionFeatures)
            
            FilteredAnnotationFile.RR_tract <-
                suppressWarnings(
                    dplyr::right_join(
                        FilteredAnnotationFile.RR_tract,
                        LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
                        by = "ID"
                    )
                )
            FilteredAnnotationFile.RR_tract <-
                dplyr::mutate(
                    FilteredAnnotationFile.RR_tract,
                    similarity = cut(
                        ltr_similarity,
                        rev(seq(
                            100, min.similarity, -similarity.bin
                        )),
                        include.lowest = TRUE,
                        right          = TRUE
                    )
                )
        }
        
        cat("(8/8) Filtering for RR tract has been finished.")
        cat("\n")
        
        # rename the chromosome number
        #         FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, seq_number =  chromosome)
        #         FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, chromosome =  sequence)
        #         FilteredAnnotationFile.repeat_region <- dplyr::select(FilteredAnnotationFile.repeat_region, -sequence)
        #
        
        if ((nrow(LTR_retrotransposonPredictionFeatures) > 0))
        {
          
          if (!all(is.element(
            c(
              "chromosome",
              "pred_tool",
              "annotation",
              "start",
              "end",
              "score",
              "strand",
              "frame",
              "width",
              "ID",
              "repeat_region",
              "ltr_similarity",
              "seq_number",
              "similarity",
              "element_length",
              "sequence",
              "lLTR_start",
              "lLTR_end",
              "lLTR_length",
              "rLTR_start",
              "rLTR_end",
              "rLTR_length",
              "lTSD_start",
              "lTSD_end",
              "lTSD_motif",
              "rTSD_start",
              "rTSD_end",
              "rTSD_motif",
              "PPT_start",
              "PPT_end",
              "PPT_motif",
              "PPT_strand",
              "PPT_offset",
              "PBS_start",
              "PBS_end",
              "PBS_strand",
              "tRNA",
              "tRNA_motif",
              "PBS_offset",
              "tRNA_offset",
              "PBS/tRNA_edist",
              "protein_domain"
            ), names(LTR_retrotransposonPredictionFeatures)
          ))) {
            message("LTRdigest prediction file could not be read properly... it seems that there weren't enough predicted loci. Species prediction has been omitted!")
            return(NA)
          }
          
            LTR_retrotransposonPredictionFeatures <-
                dplyr::mutate(LTR_retrotransposonPredictionFeatures, seq_number = chromosome)
            LTR_retrotransposonPredictionFeatures <-
                dplyr::mutate(LTR_retrotransposonPredictionFeatures, chromosome = sequence)
            LTR_retrotransposonPredictionFeatures <-
                dplyr::select(LTR_retrotransposonPredictionFeatures, -sequence)

            # # rename chromosomes
            ChrNameSplitFunc <-
                function(x)
                    as.character(unlist(sapply(x, function(y)
                        unlist(stringr::str_split(y, "_"))[1])))

            LTR_retrotransposonPredictionFeatures <-
                dplyr::mutate(
                    LTR_retrotransposonPredictionFeatures,
                    chromosome_ltrharvest = chromosome)

            LTR_retrotransposonPredictionFeatures <-
                dplyr::mutate(
                    LTR_retrotransposonPredictionFeatures,
                    chromosome = ChrNameSplitFunc(LTR_retrotransposonPredictionFeatures$chromosome)
                )
        }
        
        #         FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, seq_number =  chromosome)
        #         FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, chromosome =  sequence)
        #         FilteredAnnotationFile.long_terminal_repeat <- dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -sequence)
        #
        #
        #         FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, seq_number =  chromosome)
        #         FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, chromosome =  sequence)
        #         FilteredAnnotationFile.inverted_repeat <- dplyr::select(FilteredAnnotationFile.inverted_repeat, -sequence)
        #
        #
        #         FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, seq_number =  chromosome)
        #         FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, chromosome =  sequence)
        #         FilteredAnnotationFile.target_site_duplication <- dplyr::select(FilteredAnnotationFile.target_site_duplication, -sequence)
        #
        #
        #         FilteredAnnotationFile.primer_binding_site <- dplyr::mutate(FilteredAnnotationFile.primer_binding_site, seq_number =  chromosome)
        #         FilteredAnnotationFile.primer_binding_site <- dplyr::mutate(FilteredAnnotationFile.primer_binding_site, chromosome =  sequence)
        #         FilteredAnnotationFile.primer_binding_site <- dplyr::select(FilteredAnnotationFile.primer_binding_site, -sequence)
        #
        #
        #         FilteredAnnotationFile.protein_match <- dplyr::mutate(FilteredAnnotationFile.protein_match, seq_number =  chromosome)
        #         FilteredAnnotationFile.protein_match <- dplyr::mutate(FilteredAnnotationFile.protein_match, chromosome =  sequence)
        #         FilteredAnnotationFile.protein_match <- dplyr::select(FilteredAnnotationFile.protein_match, -sequence)
        #
        #         FilteredAnnotationFile.RR_tract <- dplyr::mutate(FilteredAnnotationFile.RR_tract, seq_number =  chromosome)
        #         FilteredAnnotationFile.RR_tract <- dplyr::mutate(FilteredAnnotationFile.RR_tract, chromosome =  sequence)
        #         FilteredAnnotationFile.RR_tract <- dplyr::select(FilteredAnnotationFile.RR_tract, -sequence)
        #
        if (nrow(LTR_retrotransposonPredictionFeatures) > 0) {
            # replace strand information "?" by "." due to gff file format standard
            FindQ <-
                which(LTR_retrotransposonPredictionFeatures$strand == "?")
            LTR_retrotransposonPredictionFeatures$strand[FindQ] <- "."
        }
        
        return(
            list(
                repeat.region           = FilteredAnnotationFile.repeat_region,
                ltr.retrotransposon     = LTR_retrotransposonPredictionFeatures,
                ltr                     = FilteredAnnotationFile.long_terminal_repeat,
                inverted_repeat         = FilteredAnnotationFile.inverted_repeat,
                target.site.duplication = FilteredAnnotationFile.target_site_duplication,
                pbs                     = FilteredAnnotationFile.primer_binding_site,
                protein.match           = FilteredAnnotationFile.protein_match,
                RR_tract                = FilteredAnnotationFile.RR_tract
            )
        )
        } else {
           warning("The imported annotation file '",gff.file,"' was empty and has therefore been omitted.")
            return(
                list(
                    repeat.region           = NULL,
                    ltr.retrotransposon     = NULL,
                    ltr                     = NULL,
                    inverted_repeat         = NULL,
                    target.site.duplication = NULL,
                    pbs                     = NULL,
                    protein.match           = NULL,
                    RR_tract                = NULL
                )
            )
       }
    }
}
HajkD/LTRpred documentation built on April 22, 2022, 4:35 p.m.