R/ceScan.R

Defines functions blatCNE cneMergeGRangePairs ceScanAxt ceScanR

Documented in blatCNE

### -----------------------------------------------------------------
### The main function for scanning the axts and get the CNEs
### Not exported!
ceScanR <- function(axts, tFilter=NULL, qFilter=NULL, tSizes, qSizes, 
                    thresholds=c("49_50")){
  winSize <- as.integer(sapply(strsplit(thresholds, "_"), "[", 2))
  minScore <- as.integer(sapply(strsplit(thresholds, "_"), "[", 1))
  resFiles <- tempfile(pattern=paste(minScore, winSize, "ceScan", sep="-"), 
                       tmpdir=tempdir(), fileext="")
  ## How stupid I am...
  if(is.null(tFilter) && is.null(qFilter)){
    .Call2("myCeScan", NULL, NULL, NULL,
           NULL, NULL, NULL,
           as.character(seqnames(qSizes)), as.integer(seqlengths(qSizes)),
           as.character(seqnames(targetRanges(axts))),
           start(targetRanges(axts)), end(targetRanges(axts)),
           as.character(strand(targetRanges(axts))),
           as.character(targetSeqs(axts)),
           as.character(seqnames(queryRanges(axts))),
           start(queryRanges(axts)), end(queryRanges(axts)),
           as.character(strand(queryRanges(axts))),
           as.character(querySeqs(axts)),
           score(axts), symCount(axts), winSize, minScore,
           as.character(resFiles),
           PACKAGE="CNEr")
  }else if(is.null(tFilter) && !is.null(qFilter)){
    .Call2("myCeScan", NULL, NULL, NULL,
           as.character(seqnames(qFilter)), start(qFilter), end(qFilter),
           as.character(seqnames(qSizes)), as.integer(seqlengths(qSizes)),
           as.character(seqnames(targetRanges(axts))),
           start(targetRanges(axts)), end(targetRanges(axts)),
           as.character(strand(targetRanges(axts))),
           as.character(targetSeqs(axts)),
           as.character(seqnames(queryRanges(axts))),
           start(queryRanges(axts)), end(queryRanges(axts)),
           as.character(strand(queryRanges(axts))),
           as.character(querySeqs(axts)),
           score(axts), symCount(axts), winSize, minScore,
           as.character(resFiles),
           PACKAGE="CNEr")
  }else if(!is.null(tFilter) && is.null(qFilter)){
    .Call2("myCeScan", as.character(seqnames(tFilter)), 
           start(tFilter), end(tFilter),
           NULL, NULL, NULL,
           as.character(seqnames(qSizes)), as.integer(seqlengths(qSizes)),
           as.character(seqnames(targetRanges(axts))),
           start(targetRanges(axts)), end(targetRanges(axts)),
           as.character(strand(targetRanges(axts))),
           as.character(targetSeqs(axts)),
           as.character(seqnames(queryRanges(axts))),
           start(queryRanges(axts)), end(queryRanges(axts)),
           as.character(strand(queryRanges(axts))),
           as.character(querySeqs(axts)),
           score(axts), symCount(axts), winSize, minScore,
           as.character(resFiles),
           PACKAGE="CNEr")
  }else{
    .Call2("myCeScan", 
           as.character(seqnames(tFilter)), start(tFilter), end(tFilter),
           as.character(seqnames(qFilter)), start(qFilter), end(qFilter),
           as.character(seqnames(qSizes)), as.integer(seqlengths(qSizes)), 
           as.character(seqnames(targetRanges(axts))), 
           start(targetRanges(axts)), end(targetRanges(axts)), 
           as.character(strand(targetRanges(axts))), 
           as.character(targetSeqs(axts)),
           as.character(seqnames(queryRanges(axts))), 
           start(queryRanges(axts)), end(queryRanges(axts)), 
           as.character(strand(queryRanges(axts))), 
           as.character(querySeqs(axts)),
           score(axts), symCount(axts), winSize, minScore, 
           as.character(resFiles),
           PACKAGE="CNEr")
  }
  CNE <- lapply(resFiles, 
                function(x){
                  res <- try(read.table(x, header=FALSE, sep="\t", as.is=TRUE),
                             silent=TRUE)
                  if(class(res) == "try-error"){
                    return(GRangePairs(first=GRanges(seqinfo=tSizes), 
                                       second=GRanges(seqinfo=qSizes))
                           )
                  }
                  colnames(res) <- c("tName", "tStart", "tEnd", 
                                     "qName", "qStart", "qEnd",
                                     "strand", "score", "cigar")
                  ans <- GRangePairs(first=GRanges(seqnames=res$tName,
                                             ranges=IRanges(start=res$tStart,
                                                            end=res$tEnd),
                                             strand="+",
                                             seqinfo=tSizes),
                                     second=GRanges(seqnames=res$qName,
                                              ranges=IRanges(start=res$qStart,
                                                             end=res$qEnd),
                                              strand=res$strand,
                                              seqinfo=qSizes),
                                     score=res$score,
                                     cigar=res$cigar
                                     )
                  return(ans)
                  }
                )
  names(CNE) <-  paste(minScore, winSize, sep="_")
  unlink(resFiles)
  return(CNE)
}

### -----------------------------------------------------------------
### The function for ceScan: one way for axt object, two way for CNE
### Exported!
setGeneric("ceScan", function(x, tFilter=NULL, qFilter=NULL,
                              tSizes=NULL, qSizes=NULL,
                              window=50L, identity=50L)
  standardGeneric("ceScan"))

setMethod("ceScan", "Axt", function(x, tFilter=NULL, qFilter=NULL,
                                    tSizes=NULL, qSizes=NULL,
                                    window=50L, identity=50L){
  ceScanAxt(x, tFilter=tFilter, qFilter=qFilter, tSizes=tSizes, qSizes=qSizes, 
            window=window, identity=identity)
})

setMethod("ceScan", "CNE", function(x, tFilter=NULL, qFilter=NULL,
                                    tSizes=NULL, qSizes=NULL,
                                    window=50L, identity=50L){
  axt12 <- readAxt(x@axt12Fn)
  axt21 <- readAxt(x@axt21Fn)
  cne12 <- ceScan(axt12, tFilter, qFilter,
                  tSizes=seqinfo(TwoBitFile(x@assembly1Fn)),
                  qSizes=seqinfo(TwoBitFile(x@assembly2Fn)),
                  window=window, identity=identity)
  cne21 <- ceScan(axt21, qFilter, tFilter,
                  tSizes=seqinfo(TwoBitFile(x@assembly2Fn)),
                  qSizes=seqinfo(TwoBitFile(x@assembly1Fn)),
                  window=window, identity=identity)
  ans <- list()
  for(i in 1:length(cne12)){
    ans[[names(cne12)[i]]] <- BiocGenerics:::replaceSlots(x, 
                                                          CNE12=cne12[[i]],
                                                          CNE21=cne21[[i]],
                                                          window=window[i],
                                                          identity=identity[i])
  }
  return(ans)
})

ceScanAxt <- function(axts, tFilter=NULL, qFilter=NULL,
                      tSizes=NULL, qSizes=NULL,
                      window=50L, identity=50L){
  # Prepare tSizes
  if(is.null(tSizes)){
    ## tSizes is NULL
    tSizes <- seqinfo(targetRanges(axts))
    if(any(is.na(seqlengths(tSizes)))){
      stop("tSizes must be provided or seqinfo is available in Axt object!")
    }
  }else if(!is(tSizes, "Seqinfo")){
    ## tSizes is integer vector
    tSizes <- Seqinfo(seqnames=names(tSizes), seqlengths=tSizes)
  }
  ## Check tFilter, tFilter's seqnames must %in% tSizes
  if(!is.null(tFilter)){
    if(!is(tFilter, "GRanges")){
      stop("tFilter must be NULL or a GRanges object!")
    }else{
      if(!all(seqnames(tFilter) %in% seqnames(tSizes))){
        stop("All the chromosomes in tFilter must exist in tSizes!")
      }
    }
  }
  ## Check axts target chromosomes
  if(!all(seqnames(targetRanges(axts)) %in% seqnames(tSizes))){
    stop("All the chromosomes in targetRanges of Axt object must exist in tSizes!")
  }
  
  # Prepare qSizes
  if(is.null(qSizes)){
    ## qSizes is NULL
    qSizes <- seqinfo(queryRanges(axts))
    if(any(is.na(seqlengths(qSizes)))){
      stop("qSizes must be provided or seqinfo is available in Axt object!")
    }
  }else if(!is(qSizes, "Seqinfo")){
    ## qSizes is integer vector
    qSizes <- Seqinfo(seqnames=names(qSizes), seqlengths=qSizes)
  }
  ## Check qFilter, qFilter's seqnames must %in% qSizes
  if(!is.null(qFilter)){
    if(!is(qFilter, "GRanges")){
      stop("qFilter must be NULL or a GRanges object!")
    }else{
      if(!all(seqnames(qFilter) %in% seqnames(qSizes))){
        stop("All the chromosomes in qFilter must exist in qSizes!")
      }
    }
  }
  ## Check axts query chromosomes
  if(!all(seqnames(queryRanges(axts)) %in% seqnames(qSizes))){
    stop("All the chromosomes in queryRanges of Axt object must exist in qSizes!")
  }
  
  # Check identity and window size
  if(any(window < identity)){
    stop("The scanning window size must be equal or larger than identity!")
  }
  if(length(identity) != length(window)){
    stop("The length of identity and window are different!")
  }
  thresholds <- paste(identity, window, sep="_")
  ans <- ceScanR(axts, tFilter=tFilter, qFilter=qFilter, 
                 tSizes=tSizes, qSizes=qSizes, thresholds=thresholds)
}

### -----------------------------------------------------------------
### Merge two side cnes (GRangePairs object), mcols will be discarded.
### strand information is no longer important.
### Exported!
setGeneric("cneMerge", function(cne12, cne21) standardGeneric("cneMerge"))

setMethod("cneMerge", signature(cne12="CNE", cne21="missing"),
          function(cne12, cne21){
            cneMerged <- cneMergeGRangePairs(cne12@CNE12, cne12@CNE21)
            BiocGenerics:::replaceSlots(cne12, CNEMerged=cneMerged)
          })

setMethod("cneMerge", signature(cne12="GRangePairs", cne21="GRangePairs"),
          function(cne12, cne21){
            cneMergeGRangePairs(cne12, cne21)
          })

cneMergeGRangePairs <- function(cne12, cne21){
  if(!is(cne12, "GRangePairs") || !is(cne21, "GRangePairs")){
    stop("cne12 and cne21 must be a GRangePairs object!")
  }
  strand(cne12@first) <- strand(cne12@second) <- strand(cne21@first) <- 
    strand(cne21@second) <- "+"
  cne <- c(cne12, swap(cne21))
  
  # 1. using reduce: the problem: it won't deal with {1,2,3}, {1,2} case
  # firstReduce <- reduce(first(cne), with.revmap=TRUE)
  # lastReduce <- reduce(last(cne), with.revmap=TRUE)
  # overlapFirstLast <- IntegerList(intersect(firstReduce$revmap,
  #                                           lastReduce$revmap))
  # 
  # ## First deal with the merged ranges
  # overlapFirstLast <- overlapFirstLast[elementNROWS(overlapFirstLast) > 1L]
  # firstIndex <- match(paste(overlapFirstLast, collapse="-"),
  #                     paste(firstReduce$revmap, collapse="-"))
  # lastIndex <- match(paste(overlapFirstLast, collapse="-"),
  #                    paste(lastReduce$revmap, collapse="-"))
  # ansFirst <- firstReduce[firstIndex]
  # mcols(ansFirst) <- NULL
  # ansLast <- lastReduce[lastIndex]
  # mcols(ansLast) <- NULL
  # 
  # ## Then deal with the unmerged ranges
  # ansFirst <- c(ansFirst, first(cne)[-sort(unlist(overlapFirstLast))])
  # ansLast <- c(ansLast, last(cne)[-sort(unlist(overlapFirstLast))])
  # return(GRangePairs(first=ansFirst, last=ansLast))
  
  # 2. using the findOverlaps:
  firstHits <- findOverlaps(first(cne), type="within",
                            drop.self=TRUE, drop.redundant=TRUE)
  lastHist <- findOverlaps(second(cne), type="within",
                           drop.self=TRUE, drop.redundant=TRUE)
  redundance <- IRanges::intersect(firstHits, lastHist)
  if(length(redundance) == 0L){
    return(cne)
  }else{
    return(cne[-queryHits(redundance)])
  }
}

### -----------------------------------------------------------------
### blatCNE: 
### Exported!
blatCNE <- function(cne, blatOptions=NULL, cutIdentity=90){
  if(!is(cne, "CNE")){
    stop("CNE must be a CNE class object!")
  }
  if(!file.exists(cne@assembly1Fn)){
    stop("The assembly1 twoBit file must exit!")
  }
  if(!file.exists(cne@assembly2Fn)){
    stop("The assembly2 twoBit file must exit!")
  }
  blatOptionsALL <- list("DEF_BLAT_OPT_WSLO"=
                         "-tileSize=9 -minScore=24 -repMatch=16384",
                         "DEF_BLAT_OPT_WSMID"=
                         "-tileSize=10 -minScore=28 -repMatch=4096",
                         "DEF_BLAT_OPT_WSHI"=
                         "-tileSize=11 -minScore=30 -repMatch=1024")
  winSize <- cne@window
  if(is.null(blatOptions)){
    if(winSize > 45L)
      blatOptions <- blatOptionsALL[["DEF_BLAT_OPT_WSHI"]]
    else if(winSize > 35L)
      blatOptions <- blatOptionsALL[["DEF_BLAT_OPT_WSMID"]]
    else
      blatOptions <- blatOptionsALL[["DEF_BLAT_OPT_WSLO"]]
  }
  if(cutIdentity > 100  || cutIdentity < 0)
    stop("cutIdentity must be between 0 and 100!")
  cutoffs1 <- cne@cutoffs1
  cutoffs2 <- cne@cutoffs2
  
  .run_blat <- function(cne, cutIdentity, whichAssembly=c("first", "last"),
                        blatOptions){
    whichAssembly <- match.arg(whichAssembly)
    temp_cne <- tempfile(pattern="cne-")
    temp_psl <- tempfile(pattern="psl-")
    # For Blat, the start is 0-based and end is 1-based. 
    # So make cne's coordinates to comply with it.
    if(whichAssembly == "first"){
      assemblyTwobit <- cne@assembly1Fn
      cneDataFrame <- paste0(assemblyTwobit, ":", 
                    as.character(seqnames(first(cne@CNEMerged))),
                    ":", format(start(first(cne@CNEMerged))-1,
                                trim=TRUE, scientific=FALSE),
                    "-", format(end(first(cne@CNEMerged)),
                                trim=TRUE, scientific=FALSE))
    }else{
      assemblyTwobit <- cne@assembly2Fn
      cneDataFrame <- paste0(assemblyTwobit, ":", 
                    as.character(seqnames(second(cne@CNEMerged))), 
                    ":", format(start(second(cne@CNEMerged))-1,
                                trim=TRUE, scientific=FALSE),
                    "-", format(end(second(cne@CNEMerged)),
                                trim=TRUE, scientific=FALSE))
    }
    cneDataFrame <- unique(cneDataFrame)
    writeLines(cneDataFrame, con=temp_cne)
    cmd <- paste0(cne@aligner, " ", blatOptions, " ",
                  "-minIdentity=", cutIdentity,
                  " ", assemblyTwobit, " ", temp_cne, " ", temp_psl)
    my.system(cmd)
    unlink(temp_cne)
    return(temp_psl)
  }
  psl1Fn <- .run_blat(cne, cutIdentity, "first", blatOptions)
  psl2Fn <- .run_blat(cne, cutIdentity, "last", blatOptions)
  psl1 <- read.table(psl1Fn, header=FALSE, sep="\t", skip=5, as.is=TRUE)
  psl2 <- read.table(psl2Fn, header=FALSE, sep="\t", skip=5, as.is=TRUE)
  colnames(psl1) <- colnames(psl2) <- 
    c("matches", "misMatches", "repMatches", "nCount", 
      "qNumInsert", "qBaseInsert", "tNumInsert", "tBaseInsert", 
      "strand", "qName", "qSize", "qStart", "qEnd", "tName", 
      "tSize", "tStart", "tEnd", "blockCount", "blockSizes", 
      "qStarts", "tStarts")
  ##psl1 = subset(psl1, matches/qSize >= cutIdentity/100)
  ##for some reason, subset will cause error duing the Bioconductor build
  psl1 <- psl1[psl1$matches / psl1$qSize >= cutIdentity/100, ]
  ##psl2 = subset(psl2, matches/qSize >= cutIdentity/100)
  psl2 <- psl2[psl2$matches / psl2$qSize >= cutIdentity/100, ]
  psl1 <- table(psl1[ , "qName"])
  psl2 <- table(psl2[ , "qName"])
  CNEtNameIndex <- unname(psl1[paste0(as.character(
                                        seqnames(first(cne@CNEMerged))),
                                      ":", format(start(first(cne@CNEMerged))-1,
                                                  trim=TRUE, scientific=FALSE),
                                      "-", format(end(first(cne@CNEMerged)),
                                                  trim=TRUE, scientific=FALSE))
                               ]
                          )
  CNEtNameIndex[is.na(CNEtNameIndex)] <- 0
  CNEqNameIndex <- unname(psl2[paste0(as.character(
                                        seqnames(second(cne@CNEMerged))), 
                                      ":", format(start(second(cne@CNEMerged))-1,
                                                  trim=TRUE, scientific=FALSE),
                                      "-", format(end(second(cne@CNEMerged)),
                                                  trim=TRUE, scientific=FALSE))
                               ]
                          )
  CNEqNameIndex[is.na(CNEqNameIndex)] <- 0
  cneFinal <- cne@CNEMerged[as.integer(CNEtNameIndex) <= cutoffs1 &
                            as.integer(CNEqNameIndex) <= cutoffs2]
  BiocGenerics:::replaceSlots(cne, CNEFinal=cneFinal)
}
ge11232002/CNEr documentation built on Oct. 26, 2022, 7:08 p.m.