#' @title Copy Number Quantification of predicted LTRs (solo LTR prediction)
#' @description Detect solo LTR copies and genomic locations of predicted LTR transposons using a BLAST search strategy.
#' @param data.sheet path to the \code{*_LTRpred_DataSheet.csv} file generated by \code{\link{LTRpred}}.
#' @param LTR.fasta_3ltr path to fasta file storing sequences of 3 prime LTRs generated by \code{\link{LTRpred}}.
#' @param LTR.fasta_5ltr path to fasta file storing sequences of 5 prime LTRs generated by \code{\link{LTRpred}}.
#' @param genome file path to the reference genome in which solo LTRs shall be found (in \code{fasta} format).
#' @param ltr.similarity similarity threshold for defining LTR similarity.
#' @param scope.cutoff similarity threshold for the scope variable. The scope of a BLAST hit is defined by \code{abs(s_len - alig_length) / s_len}
#' and aims to quantify the scope ('length similarity between hit and query') of the alignment. Default is \code{scope.cutoff = 0.85} meaning that
#' at least 85\% of the length of the query sequence must match with the length of the subject sequence.
#' @param perc.ident.cutoff choose the minimum sequence identity between the query sequence and subject hit sequence that have a scope >= \code{scope.cutoff} (e.g. 0.85).
#' Default is \code{perc.ident.cutoff = 70}. This threshold aims to detect BLAST hits that have similar sequence lengths
#' (controlled by the \code{scope.cutoff} variable; e.g. \code{scope.cutoff = 0.85}) and within this scope at least e.g. 70\% sequence identity (controlled by the \code{perc.ident.cutoff} variable).
#' @param output file name of the BLAST output. If \code{output = NULL} (default) then the BLAST output file will be deleted after the result \code{data.frame} is returned by this function.
#' @param max.hits maximum number of hits that shall be retrieved that still fulfill the e-value criterium.
#' Default is \code{max.hits = 65000}.
#' @param eval e-value threshold for BLAST hit detection. Default is \code{eval = 1E-10}.
#' @param cores number of cores for parallel computations.
#' @author Hajk-Georg Drost
#' @references
#' Altschul, S.F., Gish, W., Miller, W., Myers, E.W. & Lipman, D.J. (1990) "Basic local alignment search tool." J. Mol. Biol. 215:403-410.
#'
#' Gish, W. & States, D.J. (1993) "Identification of protein coding regions by database similarity search." Nature Genet. 3:266-272.
#'
#' Madden, T.L., Tatusov, R.L. & Zhang, J. (1996) "Applications of network BLAST server" Meth. Enzymol. 266:131-141.
#'
#' Altschul, S.F., Madden, T.L., Schaeffer, A.A., Zhang, J., Zhang, Z., Miller, W. & Lipman, D.J. (1997) "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs." Nucleic Acids Res. 25:3389-3402.
#'
#' Zhang Z., Schwartz S., Wagner L., & Miller W. (2000), "A greedy algorithm for aligning DNA sequences" J Comput Biol 2000; 7(1-2):203-14.
#' @export
ltr.cn <- function(data.sheet,
LTR.fasta_3ltr,
LTR.fasta_5ltr,
genome,
ltr.similarity = 70,
scope.cutoff = 0.85,
perc.ident.cutoff = 70,
output = NULL,
max.hits = 500,
eval = 1E-10,
cores = 1){
# ltr_similarity <-
# s_len <- alig_length <- scope <- perc_identity <- NULL
# s_start <-
# s_end <-
# s_start_new <-
# s_end_new <- subject_id <- chromosome <- query_id <- NULL
# q_len <- strand <- bit_score <- evalue <- NULL
#
test_installation_blast()
ltr_similarity <- s_len <- alig_length <- scope <- perc_identity <- s_start <- NULL
s_end <- s_start_new <- s_end_new <- subject_id <- chromosome <- query_id <- NULL
q_len <- strand <- bit_score <- evalue <- NULL
if (!file.exists(data.sheet)) {
stop("ltr.cn: The file '",
data.sheet,
"' has not been found. Please check the path.")
}
if (!file.exists(LTR.fasta_3ltr)) {
stop("ltr.cn: The file '",
LTR.fasta_3ltr,
"' has not been found. Please check the path.")
}
if (!file.exists(LTR.fasta_5ltr)) {
stop("ltr.cn: The file '",
LTR.fasta_5ltr,
"' has not been found. Please check the path.")
}
if (!file.exists(genome)) {
stop("ltr.cn: The file '",
genome,
"' has not been found. Please check the path.")
}
if (is.null(output)) {
output_3ltr <-
file.path(tempdir(), "solo_ltrs_blast_output_3ltr.txt")
output_5ltr <-
file.path(tempdir(), "solo_ltrs_blast_output_5ltr.txt")
}
if (!is.null(output)) {
output_3ltr <- file.path(tempdir(), paste0(output, "_3ltr"))
output_5ltr <- file.path(tempdir(), paste0(output, "_5ltr"))
}
folder.name <- basename(genome)
#folder.name <- stringr::str_replace(folder.name,"_ltrpred","")
#ltrdigest.folder <- paste0(folder.name,"_ltrdigest")
LTRpred.tbl <- read.ltrpred(data.sheet = data.sheet)
if (nrow(LTRpred.tbl) < 2) {
message("Only one element has been found... so LTR copy number estimation is omitted!")
} else {
#LTR.fasta_3ltr <- file.path(LTRpred.folder,ltrdigest.folder,paste0(folder.name,"-ltrdigest_3ltr.fas"))
#LTR.fasta_5ltr <- file.path(LTRpred.folder,ltrdigest.folder,paste0(folder.name,"-ltrdigest_5ltr.fas"))
LTR.filtered.fasta_3ltr <-
file.path(tempdir(),
paste0(folder.name, "-ltrdigest_3ltr_", ltr.similarity, ".fas"))
LTR.filtered.fasta_5ltr <-
file.path(tempdir(),
paste0(folder.name, "-ltrdigest_5ltr_", ltr.similarity, ".fas"))
pred2fasta(
LTRpred.tbl = dplyr::filter(LTRpred.tbl, ltr_similarity >= ltr.similarity),
prediction.file = LTR.fasta_3ltr,
output = LTR.filtered.fasta_3ltr
)
pred2fasta(
LTRpred.tbl = dplyr::filter(LTRpred.tbl, ltr_similarity >= ltr.similarity),
prediction.file = LTR.fasta_5ltr,
output = LTR.filtered.fasta_5ltr
)
if (!file.exists(LTR.filtered.fasta_3ltr) ||
!file.exists(LTR.filtered.fasta_5ltr)) {
stop(
"Something went wrong with the solo LTR prediction! Files '",
LTR.filtered.fasta_3ltr,
"' and/or '",
LTR.filtered.fasta_5ltr,
"' could not be found or gnerated.",
call. = FALSE
)
}
if (stringr::str_detect(genome, "[.]gz")) {
system(paste0("gunzip ", ws.wrap.path(genome)))
genome <- stringr::str_replace(genome, "[.]gz","")
}
# generate a BLASTable database
message("Run makeblastdb of the genome assembly...")
system(paste0(
"makeblastdb -in ",
genome,
" -dbtype nucl -hash_index"
))
message("Perform BLAST searches of 3' prime LTRs against genome assembly...")
# BLAST putative LTRs against genome file for 3 prime LTR
system(
paste0(
"blastn -query ",
ws.wrap.path(LTR.filtered.fasta_3ltr),
" -db ",
ws.wrap.path(genome),
" -out ",
ws.wrap.path(output_3ltr) ,
" ",
"-evalue ",
eval,
" -max_target_seqs ",
max.hits,
" -num_threads ",
cores,
" -dust no -outfmt '6 qseqid sseqid pident nident length mismatch gapopen gaps positive ppos qstart qend qlen sstart send slen evalue bitscore score'"
)
)
message("Perform BLAST searches of 5' prime LTRs against genome assembly...")
# BLAST putative LTRs against genome file for 5 prime LTR
system(
paste0(
"blastn -query ",
ws.wrap.path(LTR.filtered.fasta_5ltr),
" -db ",
ws.wrap.path(genome),
" -out ",
ws.wrap.path(output_5ltr) ,
" ",
"-evalue ",
eval,
" -max_target_seqs ",
max.hits,
" -num_threads ",
cores,
" -dust no -outfmt '6 qseqid sseqid pident nident length mismatch gapopen gaps positive ppos qstart qend qlen sstart send slen evalue bitscore score'"
)
)
# Define the column names of the BLAST output
BLASTColNames <- c(
"query_id",
"subject_id",
"perc_identity",
"num_ident_matches",
"alig_length",
"mismatches",
"gap_openings",
"n_gaps",
"pos_match",
"ppos",
"q_start",
"q_end",
"q_len",
"s_start",
"s_end",
"s_len",
"evalue",
"bit_score",
"score_raw"
)
message("Import BLAST results...")
# Read the BLAST output and define the columns
BLASTOutput_3ltr <-
readr::read_tsv(output_3ltr, col_names = FALSE, col_types = readr::cols(
"X1" = readr::col_character(),
"X2" = readr::col_character(),
"X3" = readr::col_double(),
"X4" = readr::col_integer(),
"X5" = readr::col_integer(),
"X6" = readr::col_integer(),
"X7" = readr::col_integer(),
"X8" = readr::col_integer(),
"X9" = readr::col_integer(),
"X10" = readr::col_double(),
"X11" = readr::col_integer(),
"X12" = readr::col_integer(),
"X13" = readr::col_integer(),
"X14" = readr::col_integer(),
"X15" = readr::col_integer(),
"X16" = readr::col_integer(),
"X17" = readr::col_double(),
"X18" = readr::col_double(),
"X19" = readr::col_double()
))
BLASTOutput_5ltr <-
readr::read_tsv(output_5ltr, col_names = FALSE, col_types = readr::cols(
"X1" = readr::col_character(),
"X2" = readr::col_character(),
"X3" = readr::col_double(),
"X4" = readr::col_integer(),
"X5" = readr::col_integer(),
"X6" = readr::col_integer(),
"X7" = readr::col_integer(),
"X8" = readr::col_integer(),
"X9" = readr::col_integer(),
"X10" = readr::col_double(),
"X11" = readr::col_integer(),
"X12" = readr::col_integer(),
"X13" = readr::col_integer(),
"X14" = readr::col_integer(),
"X15" = readr::col_integer(),
"X16" = readr::col_integer(),
"X17" = readr::col_double(),
"X18" = readr::col_double(),
"X19" = readr::col_double()
))
colnames(BLASTOutput_3ltr) <- BLASTColNames
colnames(BLASTOutput_5ltr) <- BLASTColNames
message("Filter hit results...")
# generate scope variable
BLASTOutput_3ltr <-
dplyr::mutate(BLASTOutput_3ltr, scope = abs(s_len - alig_length) / s_len)
BLASTOutput_5ltr <-
dplyr::mutate(BLASTOutput_5ltr, scope = abs(s_len - alig_length) / s_len)
# filter for potentially (biologically meaningful) solo LTR hits
BLASTOutput_3ltr <-
dplyr::filter(BLASTOutput_3ltr,
scope >= scope.cutoff,
perc_identity >= perc.ident.cutoff)
BLASTOutput_5ltr <-
dplyr::filter(BLASTOutput_5ltr,
scope >= scope.cutoff,
perc_identity >= perc.ident.cutoff)
# assign strand information
BLASTOutput_3ltr <-
dplyr::mutate(BLASTOutput_3ltr, strand = ifelse(s_start < s_end, "+", "-"))
BLASTOutput_5ltr <-
dplyr::mutate(BLASTOutput_5ltr, strand = ifelse(s_start < s_end, "+", "-"))
# swap start and end positions for "-" strand
BLASTOutput_3ltr <-
dplyr::mutate(
BLASTOutput_3ltr,
s_start_new = ifelse(s_start < s_end, s_start, s_end),
s_end_new = ifelse(s_start < s_end, s_end, s_start)
)
BLASTOutput_5ltr <-
dplyr::mutate(
BLASTOutput_5ltr,
s_start_new = ifelse(s_start < s_end, s_start, s_end),
s_end_new = ifelse(s_start < s_end, s_end, s_start)
)
BLASTOutput_3ltr <-
dplyr::mutate(BLASTOutput_3ltr, s_start = s_start_new, s_end = s_end_new)
BLASTOutput_5ltr <-
dplyr::mutate(BLASTOutput_5ltr, s_start = s_start_new, s_end = s_end_new)
BLASTOutput_3ltr <-
dplyr::mutate(BLASTOutput_3ltr, subject_id = as.character(subject_id))
BLASTOutput_5ltr <-
dplyr::mutate(BLASTOutput_5ltr, subject_id = as.character(subject_id))
# here match Chromosomes....
BLASTOutput_3ltr <-
dplyr::mutate(BLASTOutput_3ltr,
subject_id = as.character(subject_id))
BLASTOutput_5ltr <-
dplyr::mutate(BLASTOutput_5ltr,
subject_id = as.character(subject_id))
## remove non-matching chromosomes or mitochondria or chloroplast (sequences)
# test whether or not 3ltr and 5 ltr loci overlap with predicted full ltr transposon locus
full.te.chr <- names(table(LTRpred.tbl$chromosome))
ltr_chr <- names(table(BLASTOutput_3ltr$subject_id))
# setdiff is not symmetrical so setdiff(A,B) != setdiff(B,A)
setdiff.full.te <- Biostrings::setdiff(full.te.chr, ltr_chr)
setdiff.ltr <- Biostrings::setdiff(ltr_chr, full.te.chr)
LTR.fasta_full.te <-
dplyr::filter(LTRpred.tbl, !is.element(chromosome, setdiff.full.te))
BLASTOutput_3ltr <-
dplyr::filter(BLASTOutput_3ltr, !is.element(subject_id, setdiff.ltr))
BLASTOutput_5ltr <-
dplyr::filter(BLASTOutput_5ltr, !is.element(subject_id, setdiff.ltr))
# test again after removal of different chromosomes if chromosomes in both tables match
full.te.chr <- names(table(LTR.fasta_full.te$chromosome))
ltr_chr <- names(table(BLASTOutput_3ltr$subject_id))
if (!identical(full.te.chr, ltr_chr))
stop(
"Chromosome names in full LTR transposon sequence file and LTR element blast file do not match!",
"\n",
"TE name = ",
full.te.chr[1],
" and LTR blast name = ",
ltr_chr[1],
". Maybe there are mitochondira or chloroplasts in your FASTA file. Please fix...",
call. = FALSE
)
BLAST.ir.3ltr.list <- vector("list", length(full.te.chr))
BLAST.ir.5ltr.list <- vector("list", length(full.te.chr))
#nested_3ltr <- vector("list")
message("Estimate CNV for each LTR sequence...")
for (i in seq_len(length(full.te.chr))) {
BLASTOutput_3ltr_chr <-
dplyr::filter(BLASTOutput_3ltr, subject_id == full.te.chr[i])
BLASTOutput_5ltr_chr <-
dplyr::filter(BLASTOutput_5ltr, subject_id == full.te.chr[i])
LTR.fasta_full.te_chr <-
dplyr::filter(LTR.fasta_full.te, chromosome == full.te.chr[i])
ir.3ltr <-
IRanges::IRanges(start = BLASTOutput_3ltr_chr$s_start, end = BLASTOutput_3ltr_chr$s_end)
ir.5ltr <-
IRanges::IRanges(start = BLASTOutput_5ltr_chr$s_start, end = BLASTOutput_5ltr_chr$s_end)
ir.full.te <-
IRanges::IRanges(start = LTR.fasta_full.te_chr$start, end = LTR.fasta_full.te_chr$end)
# overlap between 3ltr locus and full ltr transposon locus
ir.3ltr_ov_ir.full.te <-
IRanges::findOverlaps(ir.3ltr, ir.full.te, type = "any")
# overlap between 5ltr locus and full ltr transposon locus
ir.5ltr_ov_ir.full.te <-
IRanges::findOverlaps(ir.5ltr, ir.full.te, type = "any")
# overlap between 3ltr locus and 5ltr locus (do they report the same solo ltr locus?)
ir.5ltr_ov_ir.3ltr <-
IRanges::findOverlaps(ir.5ltr, ir.3ltr, type = "equal")
# # any overlap with itself: 3ltr locus with 3ltr locus
# ir.3ltr_ov_ir.3ltr <-
# IRanges::findOverlaps(ir.3ltr, ir.3ltr, type = "within")
# # any overlap with itself: 5ltr locus with 5ltr locus
# ir.5ltr_ov_ir.5ltr <-
# IRanges::findOverlaps(ir.5ltr, ir.5ltr, type = "within")
# for (j in names(table(ir.3ltr_ov_ir.3ltr@to))) {
# nested_3ltr_all <- which((ir.3ltr_ov_ir.3ltr@from == j) && (ir.3ltr_ov_ir.3ltr@to == j))
# print(nested_3ltr_all)
# nested_3ltr_subset <- BLASTOutput_3ltr_chr[nested_3ltr_all, ]
# nested_3ltr[j] <- list(nested_3ltr_subset[which.max(unlist(nested_3ltr_subset[ , "bit_score"])), c("query_id","subject_id","s_start","s_end","bit_score")])
# }
#
# nested_3ltr <- do.call(rbind,nested_3ltr)
#
# print(head(nested_3ltr))
# print(nrow(nested_3ltr))
#
# print(length(unique(ir.3ltr_ov_ir.3ltr@to)))
# cat("\n")
# print(length(unique(ir.5ltr_ov_ir.5ltr@to)))
# cat("\n")
if ((length(ir.3ltr_ov_ir.full.te) > 0) &
(length(ir.5ltr_ov_ir.3ltr) > 0) &
(length(ir.5ltr_ov_ir.full.te) > 0)) {
# exclude overlaps between 3ltr locus and full ltr transposon locus and keep equal overlaps between 3ltr locus and 5ltr locus
BLASTOutput_3ltr_chr <-
BLASTOutput_3ltr_chr[-ir.3ltr_ov_ir.full.te@from, ]
# exclude overlaps between 5ltr locus and full ltr transposon locus and remove equal overlaps between 3ltr locus and 5ltr locus
BLASTOutput_5ltr_chr <-
BLASTOutput_5ltr_chr[-c(ir.5ltr_ov_ir.full.te@from,
ir.5ltr_ov_ir.3ltr@from), ]
}
if ((length(ir.3ltr_ov_ir.full.te) > 0) &
(length(ir.5ltr_ov_ir.3ltr) == 0) &
(length(ir.5ltr_ov_ir.full.te) > 0)) {
# exclude overlaps between 3ltr locus and full ltr transposon locus
BLASTOutput_3ltr_chr <-
BLASTOutput_3ltr_chr[-c(ir.3ltr_ov_ir.full.te@from), ]
# exclude overlaps between 5ltr locus and full ltr transposon locus
BLASTOutput_5ltr_chr <-
BLASTOutput_5ltr_chr[-c(ir.5ltr_ov_ir.full.te@from), ]
}
if ((length(ir.3ltr_ov_ir.full.te) == 0) &
(length(ir.5ltr_ov_ir.3ltr) == 0) &
(length(ir.5ltr_ov_ir.full.te) > 0)) {
# exclude overlaps between 5ltr locus and full ltr transposon locus
BLASTOutput_5ltr_chr <-
BLASTOutput_5ltr_chr[-c(ir.5ltr_ov_ir.full.te@from), ]
}
if ((length(ir.3ltr_ov_ir.full.te) > 0) &
(length(ir.5ltr_ov_ir.3ltr) == 0) &
(length(ir.5ltr_ov_ir.full.te) == 0)) {
# exclude overlaps between 3ltr locus and full ltr transposon locus and overlaps between 3ltr locus and 5ltr locus
BLASTOutput_3ltr_chr <-
BLASTOutput_3ltr_chr[-c(ir.3ltr_ov_ir.full.te@from), ]
}
#[ ,c("query_id","q_len","subject_id","s_start","s_end","scope","strand", "alig_length", "perc_identity", "bit_score", "evalue")]
BLAST.ir.3ltr.list[i] <-
list(
dplyr::select(
BLASTOutput_3ltr_chr,
query_id,
q_len,
subject_id,
s_start,
s_end,
scope,
strand,
alig_length,
perc_identity,
bit_score,
evalue
)
)
BLAST.ir.5ltr.list[i] <-
list(
dplyr::select(
BLASTOutput_5ltr_chr,
query_id,
q_len,
subject_id,
s_start,
s_end,
scope,
strand,
alig_length,
perc_identity,
bit_score,
evalue
)
)
}
ir.3ltr.result <- do.call(rbind, BLAST.ir.3ltr.list)
ir.5ltr.result <- do.call(rbind, BLAST.ir.5ltr.list)
res <- vector("list", 2)
res <-
list(pred_3ltr = ir.3ltr.result, pred_5ltr = ir.5ltr.result)
message("Finished LTR CNV estimation!")
return(res)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.