#!/usr/bin/env Rscript
################################################################################
################################################################################
# -- ad hoc functions
find_kmerPos <- function(dnass, kmers, jumpSize, minWidth, kmerProp){
revmap <- mlist <- mergem <- out <- nbp <- propSeq <- chr <- i <- NULL
# -- for each chromosome, find and aggregate all kmers
acrossChrs <- rbindlist(lapply(names(dnass), function(i){
mlist <- lapply(kmers, function(x) vmatchPattern(x, subject = dnass[i])[[1]])
mergem <- reduce(do.call(c, mlist), min.gapwidth = 0, drop.empty.ranges = T)
if(length(mergem) == 0)
return(NULL)
mergem <- reduce(
mergem,
min.gapwidth = jumpSize,
drop.empty.ranges = TRUE,
with.revmap = TRUE)
if(length(mergem) == 0)
return(NULL)
out <- as.data.table(mergem)
if("revmap" %in% colnames(mergem)){
out[,nbp := sapply(revmap, length) * max(nchar(kmers))]
out[,`:=`(propSeq = nbp / width, chr = i)]
out <- subset(out, width >= minWidth & propSeq >= kmerProp)
out <- out[,c("chr", "start", "end")]
}else{
out[,chr := i]
out <- subset(out, width >= minWidth)
out <- out[,c("chr", "start", "end")]
}
return(out)
}))
if(!is.null(acrossChrs)){
if(nrow(acrossChrs) < 1){
acrossChrs <- NULL
}
}
return(acrossChrs)
}
pull_chrLens <- function(dnass){
data.table(
chr = names(dnass),
chrStart = 1,
chrEnd = width(dnass))
}
add_nogapBlocks <- function(chrLens, kmerBed){
# -- combine kmer-blocks with chromosome lengths
tp <- merge(chrLens, kmerBed, by = "chr")
setkey(tp, chr, start, end)
tp[,index := 1:.N, by = "chr"]
# -- get lagged values for the left block
tp[,`:=`(startLeft = c(1, end[-.N]+1),
endLeft = c(start-1)),
by = "chr"]
tpo <- with(tp, data.table(
chr = c(chr, chr),
type = rep(c(tp$type, rep("noGap", nrow(tp)))),
start = c(start, startLeft),
end = c(end, endLeft)))
tpo <- subset(tpo, complete.cases(tpo))
tpe <- tp[,list(start = ifelse(all(is.na(end)), 1L, as.integer(max(end, na.rm = T)+1)),
end = chrEnd[1]), by = "chr"]
tpe[,type := "noGap"]
tp <- rbind(tpo, tpe)
setkey(tp, chr, start, end)
tp[,width := end - start]
setcolorder(tp, colnames(kmerBed))
tmp <- subset(chrLens, !chr %in% tp$chr)
if(nrow(tmp > 0)){
chrMiss <- with(tmp, data.table(
chr = chr, start = chrStart, end = chrEnd, width = chrEnd, type = "noGap"))
tp <- rbind(tp, chrMiss)
}
return(tp)
}
get_teloPos <- function(bed, chrLens, teloBuff){
bed <- merge(chrLens, bed, by = "chr", all.x = T)
bed[,pos :=
ifelse(is.na(start), "none",
ifelse(start < teloBuff, "left",
ifelse(end > (chrEnd - teloBuff), "right", "mid")))]
bedMd <- bed[,list(teloPosition = paste(unique(pos), collapse = ",")),
by = c("chr", "chrEnd")]
setnames(bedMd, "chrEnd", "width")
return(bedMd)
}
rename_chrs <- function(ss){
ns <- as.data.table(tstrsplit(trimws(names(ss)), " "))
hasna <- sapply(ns, function(x) any(is.na(x)))
if(all(hasna)){
return(ss)
}
ns <- ns[,!hasna, with = F]
if(ncol(ns) == 1){
# -- if one column (no whitespace) return names
chrns <- ns[[1]]
}else{
# -- if the first column is all unique and the first entries are "chrx" or numeric, use it
u1 <- ns[[1]]
if(!any(duplicated(u1)) && gsub("chr|chromosome|scaffold|chr0","",tolower(u1)[[1]]) == "1"){
chrns <- ns[[1]]
}else{
# -- if there is a column with "chr" in it and it is all unique use that
wh <- grepl("chr", tolower(unlist(ns[1,])))
us <- apply(ns, 2, function(x) all(!duplicated(x)))
if(sum(wh & us) == 1){
chrns <- as.character(ns[[wh & us]])
}else{
# -- if there is a column with "chr", followed by one totally unique, use that
if(sum(wh) == 1){
index <- which(wh) + 1
if(all(!duplicated(ns[[index]]))){
chrns <- ns[[index]]
}else{
# -- otherwise just return the names
chrns <- names(ss)
}
}else{
chrns <- names(ss)
}
}
}
}
names(ss) <- chrns
return(ss)
}
classify_engine <- function(faFile,
teloKmers,
gapSize,
teloGapSize,
minTeloSize,
teloRepDens,
verbose,
minChrSize){
if(!requireNamespace("Biostrings", quietly = TRUE))
stop("you need to install Biostrings from Bioconductor before running\n")
if(!requireNamespace("data.table", quietly = TRUE))
stop("you need to install data.table before running\n")
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(data.table))
# -- read in the fasta as a DNAStringSet
if(verbose)
cat(sprintf("Reading: %s\n", faFile))
ss <- readDNAStringSet(faFile)
ss <- ss[width(ss) > minChrSize]
ss <- rename_chrs(ss)
chrLens <- pull_chrLens(ss)
chrLens <- subset(chrLens, complete.cases(chrLens))
chrLens[,`:=`(v1 = frank(chr), v2 = frank(as.numeric(gsub("[^0-9.-]", "", chr))))]
setkey(chrLens, v2, v1, chr)
chrLens[,`:=`(v1 = NULL, v2 = NULL)]
chrv <- chrLens$chrEnd; names(chrv) <- chrLens$chr
if(verbose)
cat(strwrap(sprintf(
"\tN. scaffolds = %s (N50 = %sMb); total length = %sMb\n",
nrow(chrLens),
round(N50(chrLens$chrEnd / 1e6), 2),
round(sum(chrLens$chrEnd / 1e6, na.rm = T), 2)),
indent = 8, exdent = 16), sep = "\n")
# -- make a bed with the start and end positions of the gap kner (N)
# then cluster into gap coordinates
gapNs <- find_kmerPos(
dnass = ss,
kmers = "N",
jumpSize = 1,
minWidth = gapSize,
kmerProp = .99)
if(is.null(gapNs)){
contigCoords <- with(chrLens, data.table(
chr = chr, start = chrStart, end = chrEnd, type = "noGap"))
noGap <- data.table(contigCoords)
}else{
gapNs[,type := "gap"]
# -- pull ungapped (contig) coordinates
seqi <- Seqinfo(chrLens$chr, seqlengths = chrLens$chrEnd)
gapr <- with(gapNs, GRanges(
chr, IRanges(start = start, end = end), seqinfo = seqi))
noGap <- gaps(gapr)
noGap <- subset(noGap, strand == "*")
noGap <- with(as.data.table(noGap), data.table(
chr = seqnames, start = start, end = end, type = "noGap"))
contigCoords <- rbind(noGap, gapNs)
}
setkey(contigCoords, chr, start, end)
cat(strwrap(sprintf(
"\tN. contigs = %s (N50 = %sMb); total length = %sMb\n",
nrow(noGap),
round(N50((noGap$end - noGap$start) / 1e6), 2),
round(sum((noGap$end - noGap$start) / 1e6, na.rm = T), 2)),
indent = 8, exdent = 16), sep = "\n")
# -- make a bed with the start and end positions of the telomere kmers
# then cluster into gap coordinates
teloBuff <- 1e5
teloKmers <- unique(
c(teloKmers, as.character(reverseComplement(DNAStringSet(teloKmers)))))
telos <- find_kmerPos(
dnass = ss,
kmers = teloKmers,
jumpSize = teloGapSize,
kmerProp = teloRepDens,
minWidth = minTeloSize)
if(!is.null(telos)){
telos[,type := "telo"]
telos[,chrLen := chrv[chr]]
telos[,pos := ifelse(start <= teloBuff, "L", ifelse((chrLen - end) < teloBuff, "R", "M"))]
telopos <- telos[,list(pos = paste(unique(pos), collapse = "")), by = "chr"]
cat(strwrap(sprintf(
"\tN. telomeres = %s (%skb) on chrs: %s\n",
nrow(telos),
round(N50((telos$end - telos$start) / 1e3), 2),
paste(sprintf("%s%s", telopos$chr, telopos$pos), collapse = ", ")),
indent = 8, exdent = 16), sep = "\n")
telos[,`:=`(chrLen = NULL, pos = NULL)]
out <- rbind(telos, contigCoords)
}else{
out <- data.table(contigCoords)
cat("\tCould not find any telomeres\n")
}
return(out)
}
plot_classes <- function(blockCalls, palette){
mxn <- with(blockCalls, max(tapply(
type, chr, FUN = function(x) sum(x == "noGap"))))
nColors <- floor(ifelse(mxn/3 < 5, 5, mxn / 3))
if(nColors > 50)
nColors <- 50
cols <- palette(nColors)
tp <- data.table(blockCalls)
tp[,genome := factor(genome, levels = unique(genome))]
if(!"genome" %in% colnames(tp))
tp[,genome := "genome"]
tpb <- subset(tp, type == "noGap")
tel <- subset(tp, type == "telo")
tel[,x := ((end + start)/2)/1e6]
tpb[,index := rep(1:nColors, nrow(tp))[1:.N], by = c("genome", "chr")]
tpb[,y1 := as.numeric(factor(paste(genome, chr), levels = unique(paste(genome, chr))))]
tpb[,y2 := y1 - .6]
tpb[,y := (y1 + y2)/2]
tpb[,genome := factor(genome, levels = unique(genome))]
ng <- tpb[,list(n = .N, x = max(end)/1e6), by = c("genome", "chr", "y")]
tmp <- subset(tpb, !duplicated(paste(chr, genome)))
yv <- tmp$y2; names(yv) <- with(tmp, paste(chr, genome))
tel[,y2 := yv[paste(chr, genome)]]
tpl <- subset(tpb, !duplicated(paste(genome, chr)))
p <- ggplot()+
geom_rect(data = tpb,
aes(xmin = start / 1e6, xmax = end / 1e6,
ymin = y2, ymax = y1,
fill = factor(index)))+
scale_y_reverse(
expand = c(0.01, 0.01),
breaks = tpl$y,
labels = tpl$chr,
name = "Chromosome")+
scale_x_continuous(
name = "Physical position (Mb); * indicate telomere sequence")+
geom_text(data = ng,
aes(x = x, y = y, label = n),
hjust = -.25, size = 2)+
geom_point(data = tel, aes(x = x, y = y2), shape = "*", col = "red", size = 4)+
scale_fill_manual(values = cols, guide = "none")+
theme(panel.background = element_rect(fill = "darkgrey"),
panel.grid = element_blank())+
facet_wrap(~genome, scale = "free")+
ggtitle(sprintf(
"Contig positions: each cycle through colors is %s contigs (%s gaps)",
nColors, nColors - 1))
print(p)
}
classify_genomes <- function(faFiles,
toStrip = ".fa$|.fa.gz$",
teloKmers = c("CCCGAAA", "CCCTAAA"),
gapSize = 1e4,
teloGapSize = 1e4,
minTeloSize = 1e3,
teloRepDens = 0.1,
binScaffolds = TRUE,
minChrSize = 1e6,
palette = viridisLite::viridis,
verbose = TRUE){
blockCoords <- rbindlist(lapply(faFiles, function(i){
bc <- classify_engine(
faFile = i,
teloKmers = teloKmers,
gapSize = gapSize,
teloGapSize = teloGapSize,
minTeloSize = minTeloSize,
teloRepDens = teloRepDens,
minChrSize = minChrSize,
verbose = verbose)
bc[,genome := gsub(toStrip, "", basename(i))]
return(bc)
}))
# -- make the plot
blockCoords[,genome := factor(genome, levels = unique(genome))]
blockCoords[,chr := gsub("chr|chromosome|scaffold|contig", "", tolower(chr))]
blockCoords[,`:=`(chri = as.numeric(gsub('\\D+','', chr)),
chrn = frank(chr, ties.method = "dense")), by = "genome"]
setorder(blockCoords, genome, chri, chrn, na.last = T)
plot_classes(
blockCalls = data.table(blockCoords),
palette = palette)
return(blockCoords)
}
################################################################################
if (!require('argparse', quietly = T))
install.packages('argparse')
suppressPackageStartupMessages(library("argparse"))
################################################################################
################################################################################
# --create parser object
parser <- ArgumentParser(
description = "make a maps of contig and telomere positions from fasta assembly files")
# specify our desired options
# by default ArgumentParser will add an help option
parser$add_argument(
"-f", "--faFiles", type = "character", default = "",
help = "comma-separated string of paths to fasta files")
parser$add_argument(
"-k", "--kmers", type = "character", default = "CCCGAAA,CCCTAAA",
help = "comma-separated string of telomere kmers [default = CCCGAAA,CCCTAAA]")
parser$add_argument(
"-t", "--teloParams", type = "character", default = "50,500,0.75",
help = "comma-separated cluster params (max gap, min width, min perc. telo sequence) [default = 50,500,0.75]")
parser$add_argument(
"-p", "--pdfFile", type = "character", default = "contigPlot.pdf",
help = "output pdf file name [default= contigPlot.pdf]")
parser$add_argument(
"-o", "--outFile", type = "character", default = "contigCoords.txt",
help = "output text file with contig coordinates [default= contigCoords.txt]")
parser$add_argument(
"-m", "--minChrLen", type = "integer", default = 1e6,
help = "the smallest chromosome to show [default= 1Mb]")
parser$add_argument(
"-g", "--gapSize", type = "integer", default = 1e3,
help = "window size to average over to merge gaps [default= 1kb]")
parser$add_argument(
"-r", "--regex", type = "character", default = ".fa.gz$|.fa$",
help = "quoted character, a regex to strip from the file name [default= .fa.gz$|.fa$]")
parser$add_argument(
"-q", "--quietly", action = "store_false",
dest = "verbose", help = "Print little output")
parser$add_argument(
"-c", "--copy2stdout", action = "store_true",
dest = "copy2stdout", help = "print the summary file to stdout instead of outfile")
################################################################################
################################################################################
# -- get dependencies in order
if(!requireNamespace("Biostrings", quietly = TRUE))
stop("you need to install Biostrings from Bioconductor before running\n")
if(!requireNamespace("data.table", quietly = TRUE))
stop("you need to install data.table before running\n")
if(!requireNamespace("ggplot2", quietly = TRUE))
stop("you need to install ggplot2 before running\n")
if(!requireNamespace("viridisLite", quietly = TRUE))
stop("you need to install viridisLite before running\n")
if(!requireNamespace("GenomicRanges", quietly = TRUE))
stop("you need to install GenomicRanges before running\n")
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GenomicRanges))
################################################################################
################################################################################
# -- get the fafiles organized
args <- parser$parse_args()
faf <- as.character(args$faFiles)
fs <- NULL
if(faf == "")
faf <- getwd()
if(dir.exists(faf)){
fs <- list.files(path = faf, pattern = ".fa$|.fa.gz$", full.names = T)
}
if(length(fs) == 0){
fs <- strsplit(faf, ",")[[1]]
fs <- fs[file.exists(fs)]
}
if(length(fs) == 0)
stop("could not find faFiles specified\n")
faFilesIn <- fs
# -- get arguments in order
toStripIn <- as.character(args$regex)
teloKmersIn <- as.character(strsplit(args$kmers, ",")[[1]])
gapSizeIn <- as.numeric(args$gapSize)
teloParamIn <- as.numeric(strsplit(args$teloParams, ",")[[1]])
minChrSizeIn <- args$minChrLen
paletteIn <- viridisLite::viridis
verboseIn <- args$verbose
pdfFileIn <- args$pdfFile
# -- write the plot to file
plotx <- ceiling(sqrt(length(faFilesIn))) * 4 + 1.5
ploty <- floor(sqrt(length(faFilesIn))) * 3 + 1
pdf(pdfFileIn, height = ploty, width = plotx)
test <- classify_genomes(
faFiles = faFilesIn,
toStrip = toStripIn,
teloKmers = teloKmersIn,
gapSize = gapSizeIn,
teloGapSize = teloParamIn[1],
minTeloSize = teloParamIn[2],
teloRepDens = teloParamIn[3],
binScaffolds = binScaffoldsIn,
minChrSize = minChrSizeIn,
palette = paletteIn,
verbose = verboseIn)
nope <- dev.off()
if(args$copy2stdout){
fwrite(test, sep = "\t", file = "", quote = FALSE)
}else{
fwrite(test, file = args$outFile, sep = "\t")
}
# -- write the coordinates to the coordinates file
if(args$verbose)
cat("Done!\n")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.