#' @title Import the output of LTRharvest or LTRdigest
#' @description This function imports the output files generated by \code{\link{LTRharvest}} or \code{\link{LTRdigest}}.
#' @param gff.file path to GFF file generated by either \code{\link{LTRharvest}}, \code{\link{LTRdigest}},
#' or \code{\link{LTRpred}}.
#' @param tabout.file path to \code{*_tabout.csv} file generated by either \code{\link{LTRdigest}} or
#' or \code{\link{LTRpred}}.
#' @param program prediction program used to generate the corresponding GFF file.
#' Either \code{program = "LTRharvest"}, \code{program = "LTRdigest"}, or \code{program = "LTRpred"}.
#' @param ltr.fasta fasta file generated by either \code{\link{LTRdigest}} or
#' or \code{\link{LTRpred}} storing the sequences of the predicted LTR retrotransposons (e.g. \code{*-ltrdigest_complete.fas}).
#' @param inner.seq.fasta path to the \code{*_BetweenLTRSeqs.fsa} fasta file returned by \code{\link{LTRharvest}} in caste \code{program = "LTRharvest"} is specified.
#' @param data the \code{\link{data.frame}} returned by either \code{\link{LTRharvest}}, \code{\link{LTRdigest}},
#' or \code{\link{LTRpred}} that can be used to modify for example \code{similarity.bin} and \code{min.sim} within this table.
#' @param similarity.bin resolution of similarity binning. E.g. binning 98\%-100\% into 0.5\% intervals would be \code{similarity.bin = 0.5}.
#' @param min.sim minimum similarity between LTRs that can shall be considered for visualization.
#' @author Hajk-Georg Drost
#' @examples
#' \dontrun{
#' gff.file <- system.file("TAIR10_chr_all_LTRdigestPrediction.gff", package = "LTRpred")
#' tabout.file <- system.file("TAIR10_chr_all-ltrdigest_tabout.csv" ,package = "LTRpred")
#' LTRfile <- read.prediction(gff.file,tabout.file, program = "LTRdigest")
#'
#' # look at results
#' str(LTRfile$ltr.retrotransposon)
#'
#' }
#' @export
read.prediction <- function( gff.file = NULL,
tabout.file,
program = "LTRdigest",
ltr.fasta = NULL,
inner.seq.fasta = NULL,
data = NULL,
similarity.bin = 2,
min.sim = NULL){
if (!is.null(gff.file) &
!is.null(data))
stop(
"Please only provide either a stored dataset or the path to the gff file that shall be imported."
)
if (!is.element(program, c("LTRharvest", "LTRdigest")))
stop("Please choose a prediction returned by either LTRharvest or LTRdigest.")
if (!file.exists(tabout.file))
stop("The file '",tabout.file,"' does not exist! Please check the correct path or the correct output of LTRharvest() or LTRdigest().", call. = FALSE)
if (!is.null(gff.file)) {
if (!file.exists(gff.file))
stop("The file '", gff.file ,"' does not exist! Please check the correct path or the correct output of LTRharvest() or LTRdigest().", call. = FALSE)
# test if prediction gff file is empty
if (file.info(gff.file)$size == 0 ||
is.na(file.info(gff.file)$size == 0)) {
cat("File ",
gff.file,
" is empty and therefore is not being processed.")
return (NULL)
}
}
X4 <- X5 <- X9 <- annotation <- ltr_similarity <- chromosome <- NULL
if (program == "LTRharvest") {
if (!is.null(data))
AnnotationFile <- data
# determine the width of the predicted intervals
AnnotationFile <-
dplyr::mutate(AnnotationFile, width = (X5 - X4) + 1)
colnames(AnnotationFile) <-
c(
"chromosome",
"pred_tool",
"annotation",
"start",
"end",
"score",
"strand",
"frame",
"X9",
"width"
)
### Post-Processing of repeat_region
FilteredAnnotationFile.repeat_region <-
dplyr::filter(AnnotationFile, annotation == "repeat_region")
if (nrow(FilteredAnnotationFile.repeat_region) > 0) {
# Extract ID feature in column X9 (predicted by LTRharvest for repeat_region)
RepeatRegionID <-
vector("character",
nrow(FilteredAnnotationFile.repeat_region))
RepeatRegionID <-
sapply(seq_len(nrow(
FilteredAnnotationFile.repeat_region
)), function(location) {
unlist(stringr::str_split(FilteredAnnotationFile.repeat_region[location, "X9"], "="))[2]
})
FilteredAnnotationFile.repeat_region <-
dplyr::mutate(FilteredAnnotationFile.repeat_region,
repeat_region = RepeatRegionID)
FilteredAnnotationFile.repeat_region <-
dplyr::select(FilteredAnnotationFile.repeat_region, -X9)
}
cat("(1/5) Filtering for repeat regions has been finished.")
cat("\n")
### Post-Processing of LTR_retrotransposons
FilteredAnnotationFile.LTR_retrotransposon <-
dplyr::filter(AnnotationFile, annotation == "LTR_retrotransposon")
if (nrow(FilteredAnnotationFile.LTR_retrotransposon) > 0) {
# Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
LTR_retrotransposonPredictionFeatures <-
as.data.frame(t(sapply(seq_len(nrow(
FilteredAnnotationFile.LTR_retrotransposon
)),
function(location)
unlist(lapply(stringr::str_split(unlist(
stringr::str_split(FilteredAnnotationFile.LTR_retrotransposon[location , "X9"], ";")
), "="), function(x)
x[2])))),
colClasses = c(rep("character", 2), rep("numeric", 2)))
LTR_retrotransposonPredictionFeatures[, 3] <-
as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 3]))
LTR_retrotransposonPredictionFeatures[, 4] <-
as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 4]))
colnames(LTR_retrotransposonPredictionFeatures) <-
c("ID", "repeat_region", "ltr_similarity", "seq_number")
LTR_retrotransposonPredictionFeatures <-
cbind(
FilteredAnnotationFile.LTR_retrotransposon[, -9],
LTR_retrotransposonPredictionFeatures
)
# for (i in seq_len(length(table(LTR_retrotransposonPredictionFeatures[ , "chromosome"])))){
# if (i %in% 1:10)
# LTR_retrotransposonPredictionFeatures[which(LTR_retrotransposonPredictionFeatures[ , "chromosome"] == as.character(i)), "chromosome"] <- paste0("Chr",i)
# }
min.similarity <- vector("numeric", 1)
max.similarity <- vector("numeric", 1)
if (is.null(min.sim))
min.similarity <-
floor(min(LTR_retrotransposonPredictionFeatures[, "ltr_similarity"]))
if (!is.null(min.sim))
min.similarity <- min.sim
# if ((min.similarity > min.sim) | (max.sim > max.similarity))
# stop ("Please enter a similarity interval between [0,100] that is present in the dataset!")
# evenCheck <- (100 - min.similarity) %% similarity.bin
LTR_retrotransposonPredictionFeatures <-
dplyr::mutate(
LTR_retrotransposonPredictionFeatures,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
#LTR_retrotransposonPredictionFeatures[ , "chromosome"] <- factor(LTR_retrotransposonPredictionFeatures[ , "chromosome"], ordered = TRUE, levels = paste0("Chr",1:10))
FilteredAnnotationFile.repeat_region <-
suppressWarnings(
dplyr::right_join(
FilteredAnnotationFile.repeat_region,
LTR_retrotransposonPredictionFeatures[, c("repeat_region", "ltr_similarity")],
by = "repeat_region"
)
)
FilteredAnnotationFile.repeat_region <-
dplyr::mutate(
FilteredAnnotationFile.repeat_region,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
}
# rename chromosomes
ChrNameSplitFunc <-
function(x)
as.character(unlist(sapply(x, function(y)
unlist(stringr::str_split(y, "_"))[1])))
FilteredAnnotationFile.repeat_region <-
dplyr::mutate(
FilteredAnnotationFile.repeat_region,
chromosome_ltrharvest = chromosome)
FilteredAnnotationFile.repeat_region <-
dplyr::mutate(
FilteredAnnotationFile.repeat_region,
chromosome = ChrNameSplitFunc(FilteredAnnotationFile.repeat_region$chromosome)
)
cat("(2/5) Filtering for LTR retrotransposons has been finished.")
cat("\n")
### Post-Processing of inverted_repeat
FilteredAnnotationFile.inverted_repeat <-
dplyr::filter(AnnotationFile, annotation == "inverted_repeat")
if (nrow(FilteredAnnotationFile.inverted_repeat) > 0) {
# Extract Parent feature in column X9 (predicted by LTRharvest for inverted_repeat)
InvertedRepeatParentID <-
vector("character",
nrow(FilteredAnnotationFile.inverted_repeat))
InvertedRepeatParentID <-
sapply(seq_len(nrow(
FilteredAnnotationFile.inverted_repeat
)), function(location) {
unlist(stringr::str_split(FilteredAnnotationFile.inverted_repeat[location, "X9"], "="))[2]
})
FilteredAnnotationFile.inverted_repeat <-
dplyr::mutate(FilteredAnnotationFile.inverted_repeat,
repeat_region = InvertedRepeatParentID)
FilteredAnnotationFile.inverted_repeat <-
dplyr::select(FilteredAnnotationFile.inverted_repeat, -X9)
}
cat("(3/5) Filtering for inverted repeats has been finished.")
cat("\n")
### Post-Processing of long_terminal_repeat
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::filter(AnnotationFile, annotation == "long_terminal_repeat")
if (nrow(FilteredAnnotationFile.long_terminal_repeat) > 0) {
# Extract Parent feature in column X9 (predicted by LTRharvest for long_terminal_repeat)
LTRParentID <-
vector("character",
nrow(FilteredAnnotationFile.long_terminal_repeat))
LTRParentID <-
sapply(seq_len(nrow(
FilteredAnnotationFile.long_terminal_repeat
)), function(location) {
unlist(
stringr::str_split(
FilteredAnnotationFile.long_terminal_repeat[location, "X9"],
"="
)
)[2]
})
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ID = LTRParentID)
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -X9)
FilteredAnnotationFile.long_terminal_repeat <-
suppressWarnings(
dplyr::right_join(
FilteredAnnotationFile.long_terminal_repeat,
LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
by = "ID"
)
)
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::mutate(
FilteredAnnotationFile.long_terminal_repeat,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
# FilteredAnnotationFile.long_terminal_repeat$ID <- factor(FilteredAnnotationFile.long_terminal_repeat$ID, levels=unique(FilteredAnnotationFile.long_terminal_repeat$ID))
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ltr_order = do.call(rbind,lapply(split(FilteredAnnotationFile.long_terminal_repeat,FilteredAnnotationFile.long_terminal_repeat$ID), function(x) if (x[1, "end"] < x[2, "start"]) return (rbind("left","right")) else return(rbind("right","left")))))
}
cat("(4/5) Filtering for LTRs has been finished.")
cat("\n")
### Post-Processing of target_site_duplication
FilteredAnnotationFile.target_site_duplication <-
dplyr::filter(AnnotationFile, annotation == "target_site_duplication")
if (nrow(FilteredAnnotationFile.target_site_duplication) > 0) {
# Extract Parent feature in column X9 (predicted by LTRharvest for target_site_duplication)
TSDParentID <-
vector("character",
nrow(FilteredAnnotationFile.target_site_duplication))
TSDParentID <-
sapply(seq_len(
nrow(FilteredAnnotationFile.target_site_duplication)
), function(location) {
unlist(
stringr::str_split(
FilteredAnnotationFile.target_site_duplication[location, "X9"],
"="
)
)[2]
})
FilteredAnnotationFile.target_site_duplication <-
dplyr::mutate(FilteredAnnotationFile.target_site_duplication,
repeat_region = TSDParentID)
FilteredAnnotationFile.target_site_duplication <-
dplyr::select(FilteredAnnotationFile.target_site_duplication, -X9)
}
cat("(5/5) Filtering for target site duplication has been finished.")
cat("\n")
# rename the chromosome number
# FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, seq_number = chromosome)
# FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, chromosome = sequence)
# FilteredAnnotationFile.repeat_region <- dplyr::select(FilteredAnnotationFile.repeat_region, -sequence)
#
# LTR_retrotransposonPredictionFeatures <- dplyr::mutate(LTR_retrotransposonPredictionFeatures, seq_number = chromosome)
# LTR_retrotransposonPredictionFeatures <- dplyr::mutate(LTR_retrotransposonPredictionFeatures, chromosome = sequence)
# LTR_retrotransposonPredictionFeatures <- dplyr::select(LTR_retrotransposonPredictionFeatures, -sequence)
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, seq_number = chromosome)
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, chromosome = sequence)
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -sequence)
#
#
# FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, seq_number = chromosome)
# FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, chromosome = sequence)
# FilteredAnnotationFile.inverted_repeat <- dplyr::select(FilteredAnnotationFile.inverted_repeat, -sequence)
#
#
# FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, seq_number = chromosome)
# FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, chromosome = sequence)
# FilteredAnnotationFile.target_site_duplication <- dplyr::select(FilteredAnnotationFile.target_site_duplication, -sequence)
#
if (nrow(LTR_retrotransposonPredictionFeatures) > 0) {
# replace strand information "?" by "." due to gff file format standard
FindQ <-
which(LTR_retrotransposonPredictionFeatures$strand == "?")
LTR_retrotransposonPredictionFeatures$strand[FindQ] <- "."
}
return(
list(
repeat.region = FilteredAnnotationFile.repeat_region,
ltr.retrotransposon = LTR_retrotransposonPredictionFeatures,
ltr = FilteredAnnotationFile.long_terminal_repeat,
inverted_repeat = FilteredAnnotationFile.inverted_repeat,
target.site.duplication = FilteredAnnotationFile.target_site_duplication
)
)
}
if (program == "LTRdigest") {
if (!file.exists(gff.file)) {
message(
"The file '",
gff.file ,
"' does not exist! Please check the correct path or the correct output of LTRharvest() or LTRdigest(). This organism has been omitted!",
call. = FALSE
)
return(NA)
}
# read gff file content: comment = "###"
AnnotationFile <-
readr::read_tsv(gff.file,
col_names = FALSE,
col_types = readr::cols(
"X1" = readr::col_character(),
"X2" = readr::col_character(),
"X3" = readr::col_character(),
"X4" = readr::col_integer(),
"X5" = readr::col_integer(),
"X6" = readr::col_character(),
"X7" = readr::col_character(),
"X8" = readr::col_character(),
"X9" = readr::col_character()
),
skip = 0,
comment = "#")
cat("\n")
cat("Input: ", gff.file, " -> Row Number: ", nrow(AnnotationFile))
cat("\n")
AnnotationFile <- stats::na.omit(AnnotationFile)
cat("Remove 'NA' -> New Row Number: ", nrow(AnnotationFile))
cat("\n")
if (!is.null(data))
AnnotationFile <- data
if (nrow(AnnotationFile) > 0) {
# determine the width of the predicted intervals
AnnotationFile <-
dplyr::mutate(AnnotationFile, width = (X5 - X4) + 1)
colnames(AnnotationFile) <-
c(
"chromosome",
"pred_tool",
"annotation",
"start",
"end",
"score",
"strand",
"frame",
"X9",
"width"
)
### Post-Processing of repeat_region
FilteredAnnotationFile.repeat_region <-
dplyr::filter(AnnotationFile, annotation == "repeat_region")
if (nrow(FilteredAnnotationFile.repeat_region) > 0) {
# Extract ID feature in column X9 (predicted by LTRharvest for repeat_region)
RepeatRegionID <-
vector("character",
nrow(FilteredAnnotationFile.repeat_region))
RepeatRegionID <-
sapply(seq_len(nrow(
FilteredAnnotationFile.repeat_region
)), function(location) {
unlist(stringr::str_split(FilteredAnnotationFile.repeat_region[location, "X9"], "="))[2]
})
FilteredAnnotationFile.repeat_region <-
dplyr::mutate(FilteredAnnotationFile.repeat_region,
repeat_region = RepeatRegionID)
FilteredAnnotationFile.repeat_region <-
dplyr::select(FilteredAnnotationFile.repeat_region, -X9)
}
cat("(1/8) Filtering for repeat regions has been finished.")
cat("\n")
### Post-Processing of LTR_retrotransposons
FilteredAnnotationFile.LTR_retrotransposon <-
dplyr::filter(AnnotationFile, annotation == "LTR_retrotransposon")
if (nrow(FilteredAnnotationFile.LTR_retrotransposon) > 0) {
# Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
LTR_retrotransposonPredictionFeatures <-
as.data.frame(t(sapply(seq_len(nrow(
FilteredAnnotationFile.LTR_retrotransposon
)),
function(location)
unlist(lapply(stringr::str_split(unlist(
stringr::str_split(FilteredAnnotationFile.LTR_retrotransposon[location , "X9"], ";")
), "="), function(x)
x[2])))),
colClasses = c(rep("character", 2), rep("numeric", 2)))
LTR_retrotransposonPredictionFeatures[, 3] <-
as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 3]))
LTR_retrotransposonPredictionFeatures[, 4] <-
as.numeric(as.vector(LTR_retrotransposonPredictionFeatures[, 4]))
colnames(LTR_retrotransposonPredictionFeatures) <-
c("ID", "repeat_region", "ltr_similarity", "seq_number")
LTR_retrotransposonPredictionFeatures <-
cbind(
FilteredAnnotationFile.LTR_retrotransposon[, -9],
LTR_retrotransposonPredictionFeatures
)
# for (i in seq_len(length(table(LTR_retrotransposonPredictionFeatures[ , "chromosome"])))){
# if (i %in% 1:10)
# LTR_retrotransposonPredictionFeatures[which(LTR_retrotransposonPredictionFeatures[ , "chromosome"] == as.character(i)), "chromosome"] <- paste0("Chr",i)
# }
min.similarity <- vector("numeric", 1)
max.similarity <- vector("numeric", 1)
if (is.null(min.sim))
min.similarity <-
floor(min(LTR_retrotransposonPredictionFeatures[, "ltr_similarity"]))
if (!is.null(min.sim))
min.similarity <- min.sim
# if ((min.similarity > min.sim) | (max.sim > max.similarity))
# stop ("Please enter a similarity interval between [0,100] that is present in the dataset!")
# evenCheck <- (100 - min.similarity) %% similarity.bin
LTR_retrotransposonPredictionFeatures <-
dplyr::mutate(
LTR_retrotransposonPredictionFeatures,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
#LTR_retrotransposonPredictionFeatures[ , "chromosome"] <- factor(LTR_retrotransposonPredictionFeatures[ , "chromosome"], ordered = TRUE, levels = paste0("Chr",1:10))
FilteredAnnotationFile.repeat_region <-
suppressWarnings(
dplyr::right_join(
FilteredAnnotationFile.repeat_region,
LTR_retrotransposonPredictionFeatures[, c("repeat_region", "ltr_similarity")],
by = "repeat_region"
)
)
FilteredAnnotationFile.repeat_region <-
dplyr::mutate(
FilteredAnnotationFile.repeat_region,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
LTRdigest.tabout.file <- read.tabout(tabout.file)
if (nrow(LTRdigest.tabout.file) > 2) {
LTR_retrotransposonPredictionFeatures <-
dplyr::inner_join(
LTR_retrotransposonPredictionFeatures,
LTRdigest.tabout.file,
by = c("start" = "element_start", "end" = "element_end")
)
} else {
warning("The tabout file '",tabout.file,"' could not be processed, because it is empty and thus has been omitted.")
}
}
cat("(2/8) Filtering for LTR retrotransposons has been finished.")
cat("\n")
### Post-Processing of inverted_repeat
FilteredAnnotationFile.inverted_repeat <-
dplyr::filter(AnnotationFile, annotation == "inverted_repeat")
if (nrow(FilteredAnnotationFile.inverted_repeat) > 0) {
# Extract Parent feature in column X9 (predicted by LTRharvest for inverted_repeat)
InvertedRepeatParentID <-
vector("character",
nrow(FilteredAnnotationFile.inverted_repeat))
InvertedRepeatParentID <-
sapply(seq_len(nrow(
FilteredAnnotationFile.inverted_repeat
)), function(location) {
unlist(stringr::str_split(FilteredAnnotationFile.inverted_repeat[location, "X9"], "="))[2]
})
FilteredAnnotationFile.inverted_repeat <-
dplyr::mutate(FilteredAnnotationFile.inverted_repeat,
repeat_region = InvertedRepeatParentID)
FilteredAnnotationFile.inverted_repeat <-
dplyr::select(FilteredAnnotationFile.inverted_repeat, -X9)
}
cat("(3/8) Filtering for inverted repeats has been finished.")
cat("\n")
### Post-Processing of long_terminal_repeat
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::filter(AnnotationFile, annotation == "long_terminal_repeat")
# Extract Parent feature in column X9 (predicted by LTRharvest for long_terminal_repeat)
if (nrow(FilteredAnnotationFile.long_terminal_repeat) > 0) {
LTRParentID <-
vector("character",
nrow(FilteredAnnotationFile.long_terminal_repeat))
LTRParentID <-
sapply(seq_len(nrow(
FilteredAnnotationFile.long_terminal_repeat
)), function(location) {
unlist(
stringr::str_split(
FilteredAnnotationFile.long_terminal_repeat[location, "X9"],
"="
)
)[2]
})
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ID = LTRParentID)
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -X9)
FilteredAnnotationFile.long_terminal_repeat <-
suppressWarnings(
dplyr::inner_join(
FilteredAnnotationFile.long_terminal_repeat,
LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
by = "ID"
)
)
FilteredAnnotationFile.long_terminal_repeat <-
dplyr::mutate(
FilteredAnnotationFile.long_terminal_repeat,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
# FilteredAnnotationFile.long_terminal_repeat$ID <- factor(FilteredAnnotationFile.long_terminal_repeat$ID, levels = unique(FilteredAnnotationFile.long_terminal_repeat$ID))
# print(as.data.frame(FilteredAnnotationFile.long_terminal_repeat))
#
# print(do.call(rbind,lapply(split(FilteredAnnotationFile.long_terminal_repeat,FilteredAnnotationFile.long_terminal_repeat$ID), function(x){
#
# if (x[1 , "end"] < x[2 , "start"]){
# return (rbind("left","right"))
# } else {
# return(rbind("right","left"))
# }
#
# })))
#
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, ltr_order = do.call(rbind,lapply(split(FilteredAnnotationFile.long_terminal_repeat,FilteredAnnotationFile.long_terminal_repeat$ID), function(x){
#
# if (x[1 , "end"] < x[2 , "start"]){
# return (rbind("left","right"))
# } else {
# return(rbind("right","left"))
# }
#
# })))
#
}
cat("(4/8) Filtering for LTRs has been finished.")
cat("\n")
### Post-Processing of target_site_duplication
FilteredAnnotationFile.target_site_duplication <-
dplyr::filter(AnnotationFile, annotation == "target_site_duplication")
# Extract Parent feature in column X9 (predicted by LTRharvest for target_site_duplication)
if (nrow(FilteredAnnotationFile.target_site_duplication) > 0) {
TSDParentID <-
vector("character",
nrow(FilteredAnnotationFile.target_site_duplication))
TSDParentID <-
sapply(seq_len(
nrow(FilteredAnnotationFile.target_site_duplication)
), function(location) {
unlist(
stringr::str_split(
FilteredAnnotationFile.target_site_duplication[location, "X9"],
"="
)
)[2]
})
FilteredAnnotationFile.target_site_duplication <-
dplyr::mutate(FilteredAnnotationFile.target_site_duplication,
repeat_region = TSDParentID)
FilteredAnnotationFile.target_site_duplication <-
dplyr::select(FilteredAnnotationFile.target_site_duplication, -X9)
}
cat("(5/8) Filtering for target site duplication has been finished.")
cat("\n")
### Post-Processing of primer_binding_site
FilteredAnnotationFile.primer_binding_site <-
dplyr::filter(AnnotationFile, annotation == "primer_binding_site")
if (nrow(FilteredAnnotationFile.primer_binding_site) > 0) {
# Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
PBSPredictionFeatures <-
as.data.frame(t(sapply(seq_len(nrow(
FilteredAnnotationFile.primer_binding_site
)),
function(location)
unlist(lapply(stringr::str_split(unlist(
stringr::str_split(FilteredAnnotationFile.primer_binding_site[location , "X9"], ";")
), "="), function(x)
x[2])))),
colClasses = c(rep("character", 2), rep("numeric", 3)))
PBSPredictionFeatures[, 3] <-
as.numeric(as.vector(PBSPredictionFeatures[, 3]))
PBSPredictionFeatures[, 4] <-
as.numeric(as.vector(PBSPredictionFeatures[, 4]))
PBSPredictionFeatures[, 5] <-
as.numeric(as.vector(PBSPredictionFeatures[, 5]))
colnames(PBSPredictionFeatures) <-
c("ID", "trna", "trnaoffset", "pbsoffset", "edist")
FilteredAnnotationFile.primer_binding_site <-
cbind(FilteredAnnotationFile.primer_binding_site[, -9],
PBSPredictionFeatures)
FilteredAnnotationFile.primer_binding_site <-
suppressWarnings(
dplyr::right_join(
FilteredAnnotationFile.primer_binding_site,
LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
by = "ID"
)
)
FilteredAnnotationFile.primer_binding_site <-
dplyr::mutate(
FilteredAnnotationFile.primer_binding_site,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
}
cat("(6/8) Filtering for primer binding site has been finished.")
cat("\n")
### Post-Processing of protein_match
FilteredAnnotationFile.protein_match <-
dplyr::filter(AnnotationFile, annotation == "protein_match")
if (nrow(FilteredAnnotationFile.protein_match) > 0) {
# Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
ProteinMatchPredictionFeatures <-
as.data.frame(t(sapply(seq_len(nrow(
FilteredAnnotationFile.protein_match
)),
function(location)
unlist(lapply(stringr::str_split(unlist(
stringr::str_split(FilteredAnnotationFile.protein_match[location , "X9"], ";")
), "="), function(x)
x[2])))),
colClasses = c("character", "numeric", "character"))
ProteinMatchPredictionFeatures[, 2] <-
as.numeric(as.vector(ProteinMatchPredictionFeatures[, 2]))
colnames(ProteinMatchPredictionFeatures) <-
c("ID", "reading_frame", "name")
FilteredAnnotationFile.protein_match <-
cbind(FilteredAnnotationFile.protein_match[, -9],
ProteinMatchPredictionFeatures)
FilteredAnnotationFile.protein_match <-
suppressWarnings(
dplyr::right_join(
FilteredAnnotationFile.protein_match,
LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity", "width")],
by = "ID"
)
)
FilteredAnnotationFile.protein_match <-
dplyr::mutate(
FilteredAnnotationFile.protein_match,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
colnames(FilteredAnnotationFile.protein_match)[9] <-
"match_width"
colnames(FilteredAnnotationFile.protein_match)[14] <-
"width"
}
cat("(7/8) Filtering for protein match has been finished.")
cat("\n")
### Post-Processing of RR_tract
FilteredAnnotationFile.RR_tract <-
dplyr::filter(AnnotationFile, annotation == "RR_tract")
if (nrow(FilteredAnnotationFile.RR_tract) > 0) {
# Extract ID, Parent, seq_number, and ltr_similarity features from column X9 (predicted by LTRharvest for LTR_retrotransposons)
RRtractPredictionFeatures <-
as.data.frame(sapply(seq_len(nrow(
FilteredAnnotationFile.RR_tract
)),
function(location)
unlist(
stringr::str_split(FilteredAnnotationFile.RR_tract[location , "X9"], "=")
)[2]),
colClasses = "character")
colnames(RRtractPredictionFeatures) <- "ID"
FilteredAnnotationFile.RR_tract <-
cbind(FilteredAnnotationFile.RR_tract[, -9],
RRtractPredictionFeatures)
FilteredAnnotationFile.RR_tract <-
suppressWarnings(
dplyr::right_join(
FilteredAnnotationFile.RR_tract,
LTR_retrotransposonPredictionFeatures[, c("ID", "ltr_similarity")],
by = "ID"
)
)
FilteredAnnotationFile.RR_tract <-
dplyr::mutate(
FilteredAnnotationFile.RR_tract,
similarity = cut(
ltr_similarity,
rev(seq(
100, min.similarity, -similarity.bin
)),
include.lowest = TRUE,
right = TRUE
)
)
}
cat("(8/8) Filtering for RR tract has been finished.")
cat("\n")
# rename the chromosome number
# FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, seq_number = chromosome)
# FilteredAnnotationFile.repeat_region <- dplyr::mutate(FilteredAnnotationFile.repeat_region, chromosome = sequence)
# FilteredAnnotationFile.repeat_region <- dplyr::select(FilteredAnnotationFile.repeat_region, -sequence)
#
if ((nrow(LTR_retrotransposonPredictionFeatures) > 0))
{
if (!all(is.element(
c(
"chromosome",
"pred_tool",
"annotation",
"start",
"end",
"score",
"strand",
"frame",
"width",
"ID",
"repeat_region",
"ltr_similarity",
"seq_number",
"similarity",
"element_length",
"sequence",
"lLTR_start",
"lLTR_end",
"lLTR_length",
"rLTR_start",
"rLTR_end",
"rLTR_length",
"lTSD_start",
"lTSD_end",
"lTSD_motif",
"rTSD_start",
"rTSD_end",
"rTSD_motif",
"PPT_start",
"PPT_end",
"PPT_motif",
"PPT_strand",
"PPT_offset",
"PBS_start",
"PBS_end",
"PBS_strand",
"tRNA",
"tRNA_motif",
"PBS_offset",
"tRNA_offset",
"PBS/tRNA_edist",
"protein_domain"
), names(LTR_retrotransposonPredictionFeatures)
))) {
message("LTRdigest prediction file could not be read properly... it seems that there weren't enough predicted loci. Species prediction has been omitted!")
return(NA)
}
LTR_retrotransposonPredictionFeatures <-
dplyr::mutate(LTR_retrotransposonPredictionFeatures, seq_number = chromosome)
LTR_retrotransposonPredictionFeatures <-
dplyr::mutate(LTR_retrotransposonPredictionFeatures, chromosome = sequence)
LTR_retrotransposonPredictionFeatures <-
dplyr::select(LTR_retrotransposonPredictionFeatures, -sequence)
# # rename chromosomes
ChrNameSplitFunc <-
function(x)
as.character(unlist(sapply(x, function(y)
unlist(stringr::str_split(y, "_"))[1])))
LTR_retrotransposonPredictionFeatures <-
dplyr::mutate(
LTR_retrotransposonPredictionFeatures,
chromosome_ltrharvest = chromosome)
LTR_retrotransposonPredictionFeatures <-
dplyr::mutate(
LTR_retrotransposonPredictionFeatures,
chromosome = ChrNameSplitFunc(LTR_retrotransposonPredictionFeatures$chromosome)
)
}
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, seq_number = chromosome)
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::mutate(FilteredAnnotationFile.long_terminal_repeat, chromosome = sequence)
# FilteredAnnotationFile.long_terminal_repeat <- dplyr::select(FilteredAnnotationFile.long_terminal_repeat, -sequence)
#
#
# FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, seq_number = chromosome)
# FilteredAnnotationFile.inverted_repeat <- dplyr::mutate(FilteredAnnotationFile.inverted_repeat, chromosome = sequence)
# FilteredAnnotationFile.inverted_repeat <- dplyr::select(FilteredAnnotationFile.inverted_repeat, -sequence)
#
#
# FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, seq_number = chromosome)
# FilteredAnnotationFile.target_site_duplication <- dplyr::mutate(FilteredAnnotationFile.target_site_duplication, chromosome = sequence)
# FilteredAnnotationFile.target_site_duplication <- dplyr::select(FilteredAnnotationFile.target_site_duplication, -sequence)
#
#
# FilteredAnnotationFile.primer_binding_site <- dplyr::mutate(FilteredAnnotationFile.primer_binding_site, seq_number = chromosome)
# FilteredAnnotationFile.primer_binding_site <- dplyr::mutate(FilteredAnnotationFile.primer_binding_site, chromosome = sequence)
# FilteredAnnotationFile.primer_binding_site <- dplyr::select(FilteredAnnotationFile.primer_binding_site, -sequence)
#
#
# FilteredAnnotationFile.protein_match <- dplyr::mutate(FilteredAnnotationFile.protein_match, seq_number = chromosome)
# FilteredAnnotationFile.protein_match <- dplyr::mutate(FilteredAnnotationFile.protein_match, chromosome = sequence)
# FilteredAnnotationFile.protein_match <- dplyr::select(FilteredAnnotationFile.protein_match, -sequence)
#
# FilteredAnnotationFile.RR_tract <- dplyr::mutate(FilteredAnnotationFile.RR_tract, seq_number = chromosome)
# FilteredAnnotationFile.RR_tract <- dplyr::mutate(FilteredAnnotationFile.RR_tract, chromosome = sequence)
# FilteredAnnotationFile.RR_tract <- dplyr::select(FilteredAnnotationFile.RR_tract, -sequence)
#
if (nrow(LTR_retrotransposonPredictionFeatures) > 0) {
# replace strand information "?" by "." due to gff file format standard
FindQ <-
which(LTR_retrotransposonPredictionFeatures$strand == "?")
LTR_retrotransposonPredictionFeatures$strand[FindQ] <- "."
}
return(
list(
repeat.region = FilteredAnnotationFile.repeat_region,
ltr.retrotransposon = LTR_retrotransposonPredictionFeatures,
ltr = FilteredAnnotationFile.long_terminal_repeat,
inverted_repeat = FilteredAnnotationFile.inverted_repeat,
target.site.duplication = FilteredAnnotationFile.target_site_duplication,
pbs = FilteredAnnotationFile.primer_binding_site,
protein.match = FilteredAnnotationFile.protein_match,
RR_tract = FilteredAnnotationFile.RR_tract
)
)
} else {
warning("The imported annotation file '",gff.file,"' was empty and has therefore been omitted.")
return(
list(
repeat.region = NULL,
ltr.retrotransposon = NULL,
ltr = NULL,
inverted_repeat = NULL,
target.site.duplication = NULL,
pbs = NULL,
protein.match = NULL,
RR_tract = NULL
)
)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.