#' @title flag and split syntenic hits
#' @description
#' \code{synteny} from an annotated blast file, assign syntenic (or not) blocks
#' to each hit.
#' @name synteny
#'
#' @param gsParam A list of genespace parameters. This should be created
#' by init_genespace.
#' @param verbose logical, should updates be printed to the console?
#' @param hits data.table of hits, see read_allBlast
#' @param synRad see init_genespace
#' @param onlyOgAnchors see init_genespace
#' @param blkSize see init_genespace
#' @param blkRadius see init_genespace
#' @param tmpDir see init_genespace
#' @param topn1 integer, the number of best scoring hits per gene in genome 1
#' @param topn2 integer, the number of best scoring hits per gene in genome 1
#' @param nGaps see init_genespace
#' @param onlySameChrs logical, should only hits on chromosomes with the same
#' name be permitted?
#' @param MCScanX_hCall see init_genespace
#'
#' \cr
#' If called, \code{synteny} returns its own arguments.
#'
#' @title Flag syntenic hits
#' @description
#' \code{synteny} The main engine to call syntenic blocks and regions
#' @rdname synteny
#' @import data.table
#' @import R.utils
#' @importFrom dbscan dbscan frNN
#' @importFrom parallel mclapply
#' @export
synteny <- function(gsParam, verbose = TRUE){
##############################################################################
# 1. setup
# -- 1.1 get env vars set up
query <- target <- lab <- nRegionHits <- nRegions <- nAnchorHits <- nBlks <-
nSVs <- selfOnly <- queryPloidy <- targetPloidy <- nGlobOgHits <- synHits <-
nTotalHits <- chunk <- inBuffer <- allBlast <- isArrayRep1 <- isArrayRep2 <-
noAnchor <- ord1 <- ord2 <- isAnchor <- note <- genome1 <- genome2 <-
ofID1 <- ofID2 <- nSynBlks <- status <- isSyntenic <- NULL
nCores <- gsParam$params$nCores
runOFINBLK <- gsParam$params$orthofinderInBlk
# -- 1.2 check that files are all ok
if(!"synteny" %in% names(gsParam))
stop("must run set_syntenyParams prior to synteny")
if(!all(file.exists(gsParam$synteny$blast$allBlast)))
stop("some annotated blast files dont exist, run annotate_blast() first\n")
# -- 1.1 split the metadata into chunks
blMd <- data.table(gsParam$synteny$blast)
blMd[,lab := align_charLeft(sprintf("%s v. %s:", query, target))]
blMd[,selfOnly := query == target & queryPloidy == 1 & targetPloidy == 1]
if(!"nGlobOgHits" %in% colnames(blMd))
blMd[,nGlobOgHits := file.size(allBlast)]
if(!"nTotalHits" %in% colnames(blMd))
blMd[,nTotalHits := file.size(allBlast)]
setorder(blMd, selfOnly, -nGlobOgHits, -nTotalHits)
blMd[,chunk := rep(1:.N, each = nCores)[1:.N]]
synMdSpl <- split(blMd, by = "chunk")
##############################################################################
# -- 2. loop through each chunk
blMdOut <- rbindlist(lapply(1:length(synMdSpl), function(chnki){
chnk <- data.table(synMdSpl[[chnki]])
if(nCores > 1)
cat(sprintf(
"\t# Chunk %s / %s (%s) ... \n",
chnki, max(blMd$chunk), format(Sys.time(), "%X")))
############################################################################
# -- loop through each row in each chunk
outChnk <- rbindlist(mclapply(1:nrow(chnk), mc.cores = nCores, function(i){
# -- 2.1 read in the metadata and hits
outMd <- data.table(chnk[i,])
x <- data.table(outMd)
rawHits <- read_allBlast(x$allBlast)
synHits <- subset(rawHits, isArrayRep1 & isArrayRep2 & !noAnchor)
if(x$query == x$target & x$queryPloidy == 1){
outHits <- find_selfSyn(
hits = data.table(rawHits), synRad = x$synRad)
}
if(x$query == x$target & x$queryPloidy > 1){
# -- self synteny (as above)
selfHits <- find_selfSyn(
hits = data.table(rawHits),
synRad = x$synRad)
# -- self synteny + mask (to reduce effects of large tandem dups)
selfHits[,inBuffer := flag_hitsInRadius(
x = ord1, y = ord2, isAnchor = isAnchor, radius = x$synBuff*2),
by = c("chr1", "chr2")]
maskHits <- subset(selfHits, !inBuffer)
# -- run the synteny engine
maskHits <- synteny_engine(
hits = data.table(maskHits),
nGaps = x$nGapsSecond,
blkRadius = x$blkRadiusSecond,
blkSize = x$blkSizeSecond,
synRad = x$synRad,
topn1 = x$targetPloidy - 1,
topn2 = x$queryPloidy - 1,
onlySameChrs = gsParam$params$onlySameChrs,
MCScanX_hCall = gsParam$shellCalls$mcscanx_h,
tmpDir = gsParam$paths$tmp,
onlyOgAnchors = x$onlyOgAnchorsSelf,
verbose = FALSE)
# -- if there is no synteny, return an error
if(sum(maskHits$isAnchor) < x$blkSize){
um <- NA
maskHits <- maskHits[1,]
maskHits[,note := sprintf("WARNING!! no paralogous syntenic hits in %s vs %s. You may need a better outgroup or set onlyOgAnchorsSelf = FALSE", genome1, genome2)]
}else{
um <- with(maskHits, unique(paste(ofID1, ofID2)))
}
outHits <- rbind(maskHits, subset(selfHits, !paste(ofID1, ofID2) %in% um), fill = T)
}
if(x$query != x$target){
##########################################################################
# 2.2. primary hits
outHits <- synteny_engine(
hits = data.table(synHits),
nGaps = x$nGaps,
blkSize = x$blkSize,
blkRadius = x$blkRadius,
topn1 = x$targetPloidy,
topn2 = x$queryPloidy,
synRad = x$synRad,
onlySameChrs = gsParam$params$onlySameChrs,
MCScanX_hCall = gsParam$shellCalls$mcscanx_h,
tmpDir = gsParam$paths$tmp,
onlyOgAnchors = x$onlyOgAnchors,
verbose = FALSE)
}
##########################################################################
# 2.3 secondary hits
if(x$nSecondaryHits > 0){
hitsMask <- data.table(outHits)
hitsMask[,inBuffer := flag_hitsInRadius(
x = ord1, y = ord2, isAnchor = isAnchor, radius = x$synBuff),
by = c("chr1", "chr2")]
hitsPrim <- subset(hitsMask, inBuffer)
hitsMask <- synteny_engine(
hits = data.table(subset(hitsMask, !inBuffer)),
nGaps = x$nGapsSecond,
blkSize = x$blkSizeSecond,
blkRadius = x$blkRadiusSecond,
topn1 = x$targetPloidy,
topn2 = x$queryPloidy,
synRad = x$synRad,
onlySameChrs = gsParam$params$onlySameChrs,
MCScanX_hCall = gsParam$shellCalls$mcscanx_h,
tmpDir = gsParam$paths$tmp,
onlyOgAnchors = x$onlyOgAnchorsSecond)
wh <- which(!is.na(hitsMask$regID))
hitsMask$regID[wh] <- sprintf("%s_secondary", hitsMask$regID[wh])
wh <- which(!is.na(hitsMask$blkID))
hitsMask$blkID[wh] <- sprintf("%s_secondary", hitsMask$blkID[wh])
outHits <- rbind(hitsMask, hitsPrim)
}
##########################################################################
# 2.4 check and compile notes
outHits$isAnchor[is.na(outHits$isAnchor)] <- FALSE
outHits$isSyntenic[is.na(outHits$isSyntenic)] <- FALSE
out <- with(outHits, data.table(
lab = chnk$lab[i],
nTotalHits = length(chr1),
nOgHits = sum(sameOG),
nAnchorHits = sum(isAnchor),
nSyntenicHits = sum(isSyntenic),
nSynRegions = uniqueN(regID, na.rm = T),
nSynBlks = uniqueN(blkID, na.rm = T)))
nsynchr <- with(subset(outHits, isAnchor), uniqueN(paste(chr1, chr2)))
out[,nSVs := nSynBlks - nsynchr]
if("note" %in% colnames(outHits)){
out[,status := ifelse(
any(grepl("WARNING", outHits$note)), "no paralog synteny",
ifelse(any(grepl("ERROR", outHits$note)), "no syntenic hits"))]
}else{
out[,status := "PASS"]
}
out <- data.table(outMd, out)
write_allBlast(x = outHits, filepath = x$allBlast)
write_synHits(x = subset(outHits, isSyntenic), filepath = x$synHits)
return(out)
}), fill = T)
if(any(outChnk$status == "no syntenic hits")){
cat("\n############\nERROR - No synteny found for the following pairs:\n")
print(subset(outChnk, status == "no syntenic hits")[,c("query", "target")])
stop("GENESPACE requires synteny to work ... try new parameters or genomes\n")
}
if(any(outChnk$status == "no paralog synteny")){
cat("\n############\nWARNING - No synteny found for paralogs among the following pairs:\n")
print(subset(outChnk, status == "no paralog synteny")[,c("query", "target")])
cat(strwrap(
"If you have polyploids, you really should use an outgroup (see README).
If this isn't possible, you can relax secondary synteny parameters or set
onlyOgAnchorsSelf to FALSE\n", indent = 8, exdent = 8), sep = "\n")
cat("############\n")
}
with(outChnk, cat(sprintf(
"\t...%s %s hits (%s anchors) in %s blocks (%s SVs, %s regions)\n",
lab, nSyntenicHits, nAnchorHits, nSynBlks, nSVs, nSynRegions)))
return(outChnk)
}))
gsParam$synteny$blast <- blMdOut
return(gsParam)
}
#' @title Flag synteny in self-hits
#' @description
#' \code{find_selfSyn} Self hits where genes are array reps are the anchors.
#' Also pulls all inbuffer hits around the anchors
#' @rdname synteny
#' @import data.table
#' @importFrom dbscan dbscan frNN
#' @export
find_selfSyn <- function(hits, synRad){
##############################################################################
# 1. get the minimum distance between two gene as either ancOrd or ord
ofID1 <- ofID2 <- chr1 <- chr2 <- ord1 <- ord2 <- genome1 <- n <-
blkID <- isAnchor <- isSyntenic <- genome2 <- NULL
hits[,isAnchor := ofID1 == ofID2]
tmp <- subset(hits, chr1 == chr2)
chrs2search <- tmp[,list(n = diff(range(ord1))), by = "chr1"]
chrNosearch <- subset(chrs2search, n <= synRad)
chrs2search <- subset(chrs2search, n > synRad)
spl <- split(subset(tmp, chr1 %in% chrs2search$chr1), by = "chr1")
ofInBuff <- with(rbindlist(lapply(spl, function(x){
nn <- with(x, frNN(x = data.frame(ord1, ord2), eps = synRad))
wh <- unique(c(which(x$isAnchor), unlist(nn$id[x$isAnchor])))
return(subset(x, 1:nrow(x) %in% wh)[,c("ofID1", "ofID2")])
})), paste(ofID1, ofID2))
hits[,isSyntenic := paste(ofID1, ofID2) %in% ofInBuff |
(chr1 == chr2 & chr1 %in% chrNosearch$chr1)]
hits[,blkID := ifelse(isSyntenic, sprintf(
"%s %s selfBlk %s", genome1, genome2, chr1), NA)]
return(data.table(hits))
}
#' @title Find syntenic regions
#' @description
#' \code{synteny_engine} The main engine for synteny discovery. Takes a "hits"
#' data.table and classifies block IDs, whether a gene is an anchor in the block
#' and whether it is within a syntenic buffer of the block anchor hits. The
#' steps are as follows:
#' 1) Filter to initial hits - onlyOgAnchors (if TRUE) and topN hits depending
#' on ploidy. Built global collinear hits by piping these hits into MCScanX.
#' 2) Cluster global collinear hits into large regions searching within synRad
#' then pull all (or onlyOG hits) within synRad of global anchor hits.
#' 3) Re-run MCScanX within large regions, re-cluster and re-cull region hits
#' within synRad. Split overlapping regions. These are the "regions" named in
#' "regID" column
#' 4) For each region, pull all hits (regardless of score, OG etc) and re-run
#' MCScanX. Cluster these 'potential' anchors into blocks with blkRadius search
#' radius and blkSize minimum size. The resulting hits are the anchors, flagged
#' 'isAnchor = TRUE.
#' 5) Split anchors of interleaved blocks, then extract hits within the physical
#' bounds of the blocks and within blkRadius distance of an anchor (within-block
#' distance). These hits are flagged 'isSyntenic = TRUE'.
#' @rdname synteny
#' @import data.table
#' @importFrom dbscan dbscan frNN
#' @export
synteny_engine <- function(hits,
onlyOgAnchors,
blkSize,
blkRadius,
tmpDir,
topn1,
topn2,
nGaps,
synRad,
MCScanX_hCall,
onlySameChrs,
verbose = FALSE){
noAnchor <- isArrayRep1 <- isArrayRep2 <- sameOG <- sr1 <- bitScore <- sr2 <-
ord1 <- ord2 <- isAnchor <- ofID1 <- ofID2 <- chr1 <- chr2 <- blkID <-
nAnchorhits <- genome1 <- genome2 <- start1 <- start2 <- end1 <- end2 <-
i.blkID <- blkID1 <- blkID2 <- tmp <- note <- inBuffer <- need2keep <-
need4anchor <- anyNeed <- hasSameOG1 <- hasSameOG2 <- hasSameOG <- clus <-
nOG <- rl <- clus1 <- clus2 <- regID <- V1 <- V2 <- regID <- isSyntenic <-
NULL
# hits[,regID := NULL]
raw <- data.table(hits)
cull <- subset(hits, !noAnchor & isArrayRep1 & isArrayRep2)
if(onlySameChrs)
cull <- subset(cull, chr1 == chr2)
if(verbose)
cat(sprintf("%s vs %s:\n\tn. total hits = %s\n\tn. rep. hits = %s\n",
hits$genome1[1], hits$genome2[2], nrow(hits), nrow(cull)))
##############################################################################
##############################################################################
# 1. Pull global collinear hits
# The overall goal here is to find the 'seed' anchors. These are hits that
# MCScanX thinks are collinear, based on the initial criteria: (a) only OG
# hits (optional, but default) and (b) the approx top scoring hits based on
# ploidy (nhits). These initial seed hits are re-ranked by order across the
# genome, then fed into MCScanX. Those that MCScanX thinks are collinear are
# flagged
##############################################################################
##############
# -- 1.1 [optionally] subset to only OG hits
h1 <- data.table(cull)
if(onlyOgAnchors)
h1 <- subset(h1, sameOG)
# if no hits are availble, return an note that will be convert to error
if(nrow(h1) < blkSize){
errs <- hits[1,]
errs[,note := "ERROR no array rep hits"]
return(errs)
}
##############
# -- 1.2 Subset to top-scoring hits
# keeping anything within 10 or so, this gives a bit of wiggle in intial
# anchor
h1[,sr1 := frank(-round(bitScore, -1), ties.method = "dense"), by = "ofID1"]
h1[,sr2 := frank(-round(bitScore, -1), ties.method = "dense"), by = "ofID2"]
h1 <- subset(h1, sr1 <= topn1 & sr2 <= topn2)
##############
# -- 1.3 rerank order
h1[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense"))]
##############
# -- 1.4 pull collinear hits via mcscanx
tmp <- h1[,c("ofID1", "ofID2", "chr1", "chr2", "ord1", "ord2", "bitScore")]
setnames(tmp, "bitScore", "score")
tmp <- subset(tmp, complete.cases(tmp))
anchu <- run_mcscanx(
nGaps = 25,
tmpDir = tmpDir,
blkSize = 5,
hits = data.table(tmp),
MCScanX_hCall = MCScanX_hCall)
cull[,isAnchor := paste(ofID1, ofID2) %in% names(anchu)]
if(sum(cull$isAnchor) < blkSize){
errs <- hits[1,]
errs[,note := "ERROR no synteny"]
return(errs)
}
if(verbose)
cat(sprintf("\tn. potenial anchor hits = %s\n\tn. seed anchor hits = %s\n",
nrow(h1), sum(cull$isAnchor)))
##############################################################################
##############################################################################
# 2. Find hits in 'initial' seed regions
# using the seed anchor hits from step 1, we find large syntenic 'initial
# regions'. These are determined by re-ranking the position of the seed hits
# and clustering using the syntenic radius ('synRad') in 2d dbscan. These
# initial regions are used to physically query any nearby hits as follows: (a)
# hits must be within the physical bounds, (b) must be within the synteny
# radius of anchors, except that (c) if the query and target are same
# orthogroup, can extend outside of the regions (still needs to be within the
# synBuff of an initial seed anchor)
##############################################################################
##############
# -- 2.1 cluster into large regions
h2 <- subset(cull, isAnchor)
h2[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense"))]
h2[,blkID := dbscan(frNN(
x = cbind(ord1, ord2),
eps = synRad),
minPts = blkSize)$cluster,
by = c("chr1", "chr2")]
h2 <- subset(h2, blkID != 0)
h2[,blkID := paste(chr1, chr2, blkID)]
bu <- h2$blkID; names(bu) <- with(h2, paste(ofID1, ofID2))
cull[,`:=`(blkID = bu[paste(ofID1, ofID2)],
isAnchor = paste(ofID1, ofID2) %in% names(bu))]
##############
# -- 2.2 pull hits on same chromosomes as anchors and re-rank
chru <- with(subset(h2, !is.na(blkID)), unique(paste(chr1, chr2)))
h2 <- subset(cull, paste(chr1, chr2) %in% chru)
h2[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense"))]
##############
# -- 2.3 pull hits in buffer to the original anchors
h2[,inBuffer := flag_hitsInRadius(
x = ord1, y = ord2,
isAnchor = isAnchor, radius = synRad),
by = c("chr1", "chr2")]
h2 <- subset(h2, inBuffer)
##############
# -- 2.4 flag the block IDs of each hit
h2[,blkID := flag_hitsInBlk(x = ord1, y = ord2, blkID = blkID)]
##############
# -- 2.5 Let the coordinates expand to any genes in the same orthogroup
h2[,need2keep := is.na(blkID) & sameOG & inBuffer]
h2[,need4anchor := !is.na(blkID) & isAnchor & !is.na(blkID)]
h2[,anyNeed := any(need2keep) & any(need4anchor), by = c("chr1", "chr2")]
spl <- split(subset(h2, anyNeed), by = c("chr1", "chr2"))
nnout <- rbindlist(lapply(spl, function(inchr){
hasAnch <- subset(inchr, need4anchor)
out <- subset(inchr, need2keep)
fn <- frNN(
x = hasAnch[,c("ord1", "ord2")],
query = out[,c("ord1", "ord2")],
eps = synRad)
index <- sapply(fn$id, function(x) x[1])
out <- out[!is.na(index)]
index <- index[!is.na(index)]
out[,blkID := hasAnch$blkID[index]]
out <- subset(out, !is.na(blkID))
if(nrow(out) > 0)
return(out)
}))
u <- with(nnout, paste(ofID1, ofID2))
h2 <- rbind(subset(h2, !paste(ofID1, ofID2) %in% u), nnout)
##############
# -- 2.6 ensure we have synteny
h3 <- subset(h2, !is.na(blkID))
h2 <- NULL
if(nrow(h3) < blkSize){
errs <- hits[1,]
errs[,note := "ERROR no synteny"]
return(errs)
}
if(verbose)
cat(sprintf("\tn. inital anchors hits / blocks = %s / %s\n",
nrow(h3), uniqueN(h3$blkID)))
##############################################################################
##############################################################################
# 3. Find Syntenic anchors
# Here, we change our view of synteny from that of the genome wide comparison
# (sections 1-2) to purely within the large inital regions (built in step 2).
# The main logic here is that when considered in isolation, collinearity
# within each syntenic region can be inferred with the same parameters, even
# if they have very different evolutionary histories (this subgenome
# dominance). There are four steps here: (a) potential anchors are defined
# as hits with the top scoring hit for both the query and target gene OR in
# the same orthogroup within each large region, (b) gene rank order position
# is re-calculated within each region, (c) mcscanx is re-run within each
# region, (d) hits within a fixed radius of anchors from (c) are pulled and
# iteratively clustered into blocks using a radius from blkRadius to blkSize.
# These anchors are cleaned up into finalized blocks in the next section.
##############################################################################
##############
# -- 3.1 just to the top scoring or OG hits in each block
h3[,sr1 := frank(-bitScore, ties.method = "first"),
by = c("blkID", "ofID1")]
h3[,sr2 := frank(-bitScore, ties.method = "first"),
by = c("blkID", "ofID2")]
h3 <- subset(h3, (sr1 == 1 & sr2 == 1) | sameOG)
##############
# -- 3.2 drop hits that are top scoring but are not OG and either the
# query or target has an OG hit in the block
h3[,hasSameOG1 := any(sameOG), by = c("blkID", "ofID1")]
h3[,hasSameOG2 := any(sameOG), by = c("blkID", "ofID2")]
h3[,hasSameOG := hasSameOG1 | hasSameOG2]
h3 <- subset(h3, !hasSameOG | (hasSameOG & sameOG))
##############
# -- 3.3 re-rank gene order within blocks
h3[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense")),
by = "blkID"]
##############
# -- 3.4 mcscanx within block to find good proximate hits
spl <- split(h3, by = "blkID")
outu <- rbindlist(lapply(spl, function(h3spl){
tmp <- h3spl[,c("ofID1", "ofID2", "chr1", "chr2", "ord1", "ord2", "bitScore")]
setnames(tmp, "bitScore", "score")
tmp <- subset(tmp, complete.cases(tmp))
anchu <- run_mcscanx(
nGaps = nGaps,
tmpDir = tmpDir,
blkSize = blkSize,
hits = data.table(tmp),
MCScanX_hCall = MCScanX_hCall)
tmp[,isAnchor := paste(ofID1, ofID2) %in% names(anchu)]
return(subset(tmp, isAnchor)[,c("ofID1", "ofID2")])
}))
anchu <- with(outu, paste(ofID1, ofID2))
h3[,isAnchor := paste(ofID1, ofID2) %in% anchu]
##############
# -- 3.5 pull inbuffer hits
h3[,inBuffer := flag_hitsInRadius(
x = ord1, y = ord2,
isAnchor = isAnchor, radius = blkRadius),
by = c("chr1", "chr2", "blkID")]
h3 <- subset(h3, inBuffer)
##############
# -- 3.6 iterative pruning to drop non-collinear hits and form new anchors
steps <- seq(from = blkRadius, to = blkSize)
steps <- floor(steps)
steps <- steps[!duplicated(steps)]
for(i in steps){
h3[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense")), by = "blkID"]
h3[,clus := dbscan(frNN(
x = cbind(ord1, ord2),
eps = i),
minPts = blkSize)$cluster,
by = c("chr1", "chr2", "blkID")]
h3 <- subset(h3, clus > 0)
if(onlyOgAnchors){
h3[,nOG := sum(sameOG), by = c("chr1", "chr2", "blkID", "clus")]
h3 <- subset(h3, nOG > 0)
}
}
##############
# -- 3.7 drop no-OG blocks
setkey(h3, blkID, ord1)
h3[,rl := add_rle(clus, which = "id"), by = "blkID"]
setkey(h3, blkID, ord2)
h3[,rl := add_rle(rl, which = "id"), by = "blkID"]
if(onlyOgAnchors){
h3[,nOG := sum(sameOG), by = c("chr1", "chr2", "blkID", "rl")]
h3 <- subset(h3, nOG > 0)
}
##############
# -- 3.8 final clustering of anchors within regions
h3[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense")), by = "blkID"]
h3[,clus := dbscan(frNN(
x = cbind(ord1, ord2),
eps = i),
minPts = 1)$cluster,
by = c("chr1", "chr2", "blkID")]
h3 <- subset(h3, clus > 0)
if(onlyOgAnchors){
h3[,nOG := sum(sameOG), by = c("chr1", "chr2", "blkID", "clus")]
h3 <- subset(h3, nOG > 0)
}
##############
# -- 3.9 final splitting within regions by RLEs
setkey(h3, blkID, ord1)
h3[,rl := add_rle(clus, which = "id"), by = "blkID"]
setkey(h3, blkID, ord2)
h3[,rl := add_rle(rl, which = "id"), by = "blkID"]
if(onlyOgAnchors){
h3[,nOG := sum(sameOG), by = c("chr1", "chr2", "blkID", "rl")]
h3 <- subset(h3, nOG > 0)
}
h3[,blkID := paste(chr1, chr2, blkID, clus, rl)]
setkey(h3, ord1)
h3[,blkID := as.integer(factor(blkID, levels = unique(blkID)))]
bv <- h3$blkID; names(bv) <- with(h3, paste(ofID1, ofID2))
cull[,blkID := bv[paste(ofID1, ofID2)]]
##############################################################################
##############################################################################
# 4. Final clean up and checking of blocks/regions
#
##############################################################################
##############
# -- 4.1 re-rank gene order for syntenic anchors
h4 <- subset(cull, isAnchor)
h4 <- subset(h4, !is.na(blkID))
h4[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense"))]
##############
# -- 4.2 check for consecutive gaps inside blocks > blkRadius
setkey(h4, ord1)
h4[,clus1 := cumsum(c(1, as.numeric(diff(ord1) > blkRadius))),
by = c("chr1", "blkID")]
setkey(h4, ord2)
h4[,clus2 := cumsum(c(1, as.numeric(diff(ord2) > blkRadius))),
by = c("chr2", "blkID")]
h4[,blkID := paste(blkID, clus1, clus2)]
##############
# -- 4.3 add new block IDs back to the original data
h4[,blkID := as.integer(factor(blkID, levels = unique(blkID)))]
h4[,blkID := sprintf("%s_vs_%s: %s", genome1, genome2, blkID)]
bv <- h4$blkID; names(bv) <- with(h4, paste(ofID1, ofID2))
cull[,blkID := bv[paste(ofID1, ofID2)]]
cull[,isAnchor := !is.na(blkID)]
##############
# -- 4.4 initial cluster blocks into regions
h4 <- subset(cull, isAnchor)
h4[,`:=`(ord1 = frank(ord1, ties.method = "dense"),
ord2 = frank(ord2, ties.method = "dense"))]
h4[,clus := dbscan(frNN(
x = cbind(ord1, ord2),
eps = blkRadius),
minPts = blkSize)$cluster,
by = c("chr1", "chr2")]
h4[,regID := paste(chr1, chr2, clus)]
h4[,regID := as.integer(factor(regID, levels = unique(regID)))]
##############
# -- 4.5 merge regions that have overlapping blocks
regMerge <- h4[,CJ(unique(blkID), unique(blkID)), by = "regID"]
regMerge[,clus := clus_igraph(V1, V2)[V1]]
regMerge <- subset(regMerge, !duplicated(V1))
rv <- regMerge$V2; names(rv) <- regMerge$V1
h4[,regID := rv[blkID]]
h4 <- subset(h4, !is.na(blkID))
##############
# -- 4.6 add regions back to the data
setkey(h4, ord1)
h4 <- subset(h4, !is.na(regID))
h4[,regID := as.integer(factor(regID, levels = unique(regID)))]
h4[,regID := sprintf("%s_vs_%s: %s", genome1, genome2, regID)]
rv <- h4$regID; names(rv) <- with(h4, paste(ofID1, ofID2))
cull[,regID := rv[paste(ofID1, ofID2)]]
################################################################################
# 5. Flag syntenic hits
##############
# -- 5.1 find global hits in buffer
h5 <- data.table(cull)
h5[,inBuffer := flag_hitsInRadius(
x = ord1, y = ord2,
isAnchor = isAnchor, radius = blkRadius),
by = c("chr1", "chr2")]
##############
# -- 5.2 find hits in physical limits of blocks
h5 <- subset(h5, inBuffer)
h5[,regID := flag_hitsInBlk(x = ord1, y = ord2, blkID = regID)]
h5[,blkID := flag_hitsInBlk(x = ord1, y = ord2, blkID = blkID)]
##############
# -- 5.3 assign block IDs to OG hits just outside of block coords
h5[,need2keep := is.na(regID) & sameOG & inBuffer]
h5[,need4anchor := !is.na(regID) & isAnchor & !is.na(blkID)]
h5[,anyNeed := any(need2keep) & any(need4anchor), by = c("chr1", "chr2")]
if(any(h5$anyNeed)){
spl <- split(subset(h5, anyNeed), by = c("chr1", "chr2"))
nnout <- rbindlist(lapply(spl, function(inchr){
hasAnch <- subset(inchr, need4anchor)
out <- subset(inchr, need2keep)
fn <- frNN(
x = hasAnch[,c("ord1", "ord2")],
query = out[,c("ord1", "ord2")],
eps = synRad)
index <- sapply(fn$id, function(x) x[1])
out <- out[!is.na(index)]
index <- index[!is.na(index)]
out[,blkID := hasAnch$blkID[index]]
out[,regID := hasAnch$regID[index]]
out <- subset(out, !is.na(blkID) | !is.na(regID))
if(nrow(out) > 0)
return(out)
}))
u <- with(nnout, paste(ofID1, ofID2))
h5 <- rbind(subset(h5, !paste(ofID1, ofID2) %in% u), nnout)
}
rv <- h5$regID; bv <- h5$blkID
names(rv) <- names(bv) <- with(h5, paste(ofID1, ofID2))
cull[,`:=`(regID = rv[paste(ofID1, ofID2)],
blkID = bv[paste(ofID1, ofID2)])]
cull[,isSyntenic := !is.na(regID) | !is.na(blkID)]
if(verbose)
cat(sprintf("\tn. final anchors = %s \n\tn. final syntenic hits = %s\n",
sum(cull$isAnchor), sum((cull$isSyntenic))))
if(verbose)
cat(sprintf("\tn. final blocks = %s \n\tn. final regions = %s\n",
uniqueN(cull$blkID, na.rm = T),
uniqueN(cull$regID, na.rm = T)))
u <- with(cull, paste(ofID1, ofID2))
out <- subset(raw, !paste(ofID1, ofID2) %in% u)
outSyn <- cull[,colnames(out), with = F]
out <- rbind(outSyn, out)
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.