#' @title get coordinates aggregated across kmers
#'
#' @description
#' \code{make_grFromKmers} find all positions with an exact match to any kmers
#' and join them
#'
#' @param dnass DNAStringSet containg the assembly fata
#' @param kmers DNAStringSet containing the kmers to find
#' @param nCores Integer, the number of parallel threads
#'
#' @details finds kmers and reduces overlapping intervals
#'
#' @return a granges object containing any sequences masked by the kmers
#'
#' @examples
#' \dontrun{
#' # coming soon.
#' }
#'
#' @import data.table
#' @importFrom parallel mclapply
#' @importFrom Biostrings PDict matchPDict
#' @importFrom IRanges IRanges
#' @importFrom GenomicRanges GRanges reduce
#' @importFrom BiocGenerics start end
#' @export
find_contigsGapsTelos <- function(dnass,
teloKmers,
minContigGapSize = 100,
maxDistBtwTelo = 20,
minTeloSize = 200,
minTeloDens = 0.75,
minChrSize = 0,
maxDist2end = 1e4,
verbose = TRUE){
##############################################################################
# 1. prep the data.
# -- 1.1 read in the fasta as a DNAStringSet
ss <- dnass
# -- 1.2 subset to chromosomes bigger than minChrSize
nchr <- length(ss)
ss <- ss[width(ss) > minChrSize]
if(verbose & minChrSize > 0)
cat(sprintf(
"Dropping %s of %s chromosomes that are smaller than %sMb\n",
nchr - length(ss), nchr, round(minChrSize/1e6, 2)))
# -- 1.3 re-name chromosomes if necessary
ss <- rename_chrs(ss)
# -- 1.4 get chromosome lengths in the order they are in the fasta
chrv <- width(ss); names(chrv) <- names(ss)
# -- 1.5 print initial scaffold stats
if(verbose)
cat(strwrap(sprintf(
"\tN. scaffolds = %s (N50 = %sMb); total length = %sMb\n",
length(chrv),
round(N50(chrv / 1e6), 2),
round(sum(chrv / 1e6, na.rm = T), 2)),
indent = 8, exdent = 16), sep = "\n")
##############################################################################
# 2. Get the gaps and report contig-level stats
gapGr <- find_runsOfNs(
dnass = ss,
minRunLength = minContigGapSize)
contigGr <- find_gaps(gapGr)
cat(strwrap(sprintf(
"\tN. contigs = %s (N50 = %sMb); total length = %sMb\n",
length(contigGr),
round(N50(width(contigGr) / 1e6), 2),
round(sum(width(contigGr) / 1e6, na.rm = T), 2)),
indent = 8, exdent = 16), sep = "\n")
##############################################################################
# 3. Get the telomere kmer positions and cluster
teloKmers <- c(
reverseComplement(DNAStringSet(teloKmers)),
teloKmers)
teloGr <- find_telomeres(
dnass = ss, kmers = teloKmers, maxDistBtwTelo = maxDistBtwTelo,
minTeloSize = minTeloSize, minTeloDens = minTeloDens,
maxDist2end = maxDist2end)
if(is.null(teloGr)){
cat("\tCould not find any telomeres\n")
}else{
teloMd <- data.table(
chr = as.character(seqnames(teloGr)),
pos = toupper(substr(teloGr@elementMetadata$position, 1, 1)))
teloMd <- teloMd[,list(pos = paste(pos, collapse = "")), by = "chr"]
cat(strwrap(sprintf(
"\tN. telomeres = %s (%skb) on chrs: %s\n",
length(teloGr),
round(N50(width(teloGr) / 1e3), 2),
paste(sprintf("%s%s", teloMd$chr, teloMd$pos), collapse = ", ")),
indent = 8, exdent = 16), sep = "\n")
}
##############################################################################
# 4. Return the results
return(list(gaps = gapGr, contigs = contigGr, telomeres = teloGr))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.