R/ExpandDiagonal.R

Defines functions ExpandDiagonal

Documented in ExpandDiagonal

###### -- Expand blocks of predicted pairs ------------------------------------
# author: nicholas cooley
# contact: npc19@pitt.edu / npcooley@gmail.com

ExpandDiagonal <- function(SynExtendObject,
                           DataBase01,
                           InheritConfidence = FALSE,
                           GapTolerance = 100L,
                           DropSingletons = FALSE,
                           UserConfidence = list("PID" = 0.3),
                           Processors = 1,
                           Verbose = FALSE) {
  # start with timing
  if (Verbose) {
    pBar <- txtProgressBar(style = 1)
    TimeStart <- Sys.time()
  }
  # overhead checking
  if (!is(object = SynExtendObject,
          class2 = "PairSummaries")) {
    stop ("SynExtendObject must be an object of class 'PairSummaries'.")
  }
  # check DBPATH first
  if (is.character(DataBase01)) {
    if (!requireNamespace(package = "RSQLite",
                          quietly = TRUE)) {
      stop("Package 'RSQLite' must be installed.")
    }
    if (!("package:RSQLite" %in% search())) {
      print("Eventually character vector access to DECIPHER DBs will be deprecated.")
      requireNamespace(package = "RSQLite",
                       quietly = TRUE)
    }
    dbConn <- dbConnect(dbDriver("SQLite"), DataBase01)
    on.exit(dbDisconnect(dbConn))
  } else {
    dbConn <- DataBase01
    if (!dbIsValid(dbConn)) {
      stop("The connection has expired.")
    }
  }
  # check that user criteria makes sense
  # check the dimensions of the object

  # deal with Processors, this mimics Erik's error checking
  if (!is.null(Processors) && !is.numeric(Processors)) {
    stop("Processors must be a numeric.")
  }
  if (!is.null(Processors) && floor(Processors) != Processors) {
    stop("Processors must be a whole number.")
  }
  if (!is.null(Processors) && Processors < 1) {
    stop("Processors must be at least 1.")
  }
  if (is.null(Processors)) {
    Processors <- DECIPHER:::.detectCores()
  } else {
    Processors <- as.integer(Processors)
  }
  
  AlignmentFun <- attr(x = SynExtendObject,
                       which = "AlignmentFunction")
  if (!(AlignmentFun %in% c("AlignPairs",
                            "AlignProfiles"))) {
    stop ("Unrecognized alignment function in the 'AlignmentFunction' attribute.")
  }
  MAT1 <- get(data("HEC_MI1",
                   package = "DECIPHER",
                   envir = environment()))
  MAT2 <- get(data("HEC_MI2",
                   package = "DECIPHER",
                   envir = environment()))
  if (AlignmentFun == "AlignProfiles") {
    
    structureMatrix <- matrix(c(0.187, -0.8, -0.873,
                                -0.8, 0.561, -0.979,
                                -0.873, -0.979, 0.221),
                              3,
                              3,
                              dimnames=list(c("H", "E", "C"),
                                            c("H", "E", "C")))
    substitutionMatrix <- matrix(c(1.5, -2.134, -0.739, -1.298,
                                   -2.134, 1.832, -2.462, 0.2,
                                   -0.739, -2.462, 1.522, -2.062,
                                   -1.298, 0.2, -2.062, 1.275),
                                 nrow = 4,
                                 dimnames = list(DNA_BASES, DNA_BASES))
  }
  if (InheritConfidence) {
    UserConfidence <- attr(x = SynExtendObject,
                           which = "UserConfidence")
    if (is.null(UserConfidence)) {
      stop ("No 'UserConfidence' attribute present in supplied SynExtendObject.")
    }
  } else {
    if (is.null(UserConfidence)) {
      stop ("When InheritConfidence is FALSE, users must supply 'UserConfidence'.")
    } else {
      UserConfidence <- UserConfidence
    }
  }
  Criteria <- names(UserConfidence)
  Floor <- unname(UserConfidence)
  KmerSize <- attr(x = SynExtendObject,
                   which = "KVal")
  # break PairSummaries object down into a workable format
  # build overhead data in a way that makes sense
  GeneCalls <- attr(x = SynExtendObject,
                    which = "GeneCalls")
  # SuppliedFeatures <- as.integer(FeatureSeqs$IDs)
  GCIDs <- as.integer(names(GeneCalls))
  L <- length(GeneCalls)
  L2 <- (L * (L - 1L)) / 2L
  POIDs <- paste(SynExtendObject$p1,
                 SynExtendObject$p2,
                 sep = "_")
  
  if (is.null(attr(x = SynExtendObject,
                   which = "Retain"))) {
    NewClusterID <- -1L
  } else {
    NewClusterID <- max(as.integer(names(attr(x = SynExtendObject,
                                              which = "Retain")))) + 1L
  }
  
  FeaturesMat <- do.call(rbind,
                         strsplit(x = POIDs,
                                  split = "_",
                                  fixed = TRUE))
  FeaturesMat <- data.frame("g1" = as.integer(FeaturesMat[, 1L]),
                            "i1" = as.integer(FeaturesMat[, 2L]),
                            "f1" = as.integer(FeaturesMat[, 3L]),
                            "g2" = as.integer(FeaturesMat[, 4L]),
                            "i2" = as.integer(FeaturesMat[, 5L]),
                            "f2" = as.integer(FeaturesMat[, 6L]))
  
  GMat <- unique(FeaturesMat[, c(1, 4)])
  rownames(GMat) <- NULL
  
  # change Res to lower case in all places!
  Res <- vector(mode = "list",
                length = nrow(GMat))
  
  # we have to scroll through each occupied cell of the upper triangle, like before
  # now instead of figuring out our edges based on hits, we're seeing whether any
  # 'adjacent' edges along the diagonals fit a user defined criteria for a positive edge
  prev_w1 <- 0L
  prev_w2 <- 0L
  for (m1 in seq(nrow(GMat))) {
    # what is the current comparison
    w1 <- GMat[m1, 1L]
    w2 <- GMat[m1, 2L]
    # wa1 <- match(x = as.character(w1),
    #              table = names(attr(x = Pairs,
    #                                 which = "GeneCalls")))
    # wa2 <- match(x = as.character(w2),
    #              table = names(attr(x = Pairs,
    #                                 which = "GeneCalls")))
    
    if (Verbose) {
      cat(paste0("### Genome pair: ",
                 w1,
                 " - ",
                 w2,
                 " ###\n"))
    }
    
    w3 <- FeaturesMat[, 1L] == w1
    w4 <- FeaturesMat[, 4L] == w2
    CMat <- FeaturesMat[w3 & w4, ]
    rownames(CMat) <- NULL
    # separate index pairs
    IMat <- split(x = CMat,
                  f = list(CMat$i1,
                           CMat$i2),
                  drop = TRUE)
    i1 <- which(GCIDs == w1)
    i2 <- which(GCIDs == w2)
    # i3 <- unique(GeneCalls[[i1]]$Index)
    # i4 <- unique(GeneCalls[[i2]]$Index)
    i3 <- seq_len(max(GeneCalls[[i1]]$Index))
    i4 <- seq_len(max(GeneCalls[[i2]]$Index))
    i5 <- sapply(i3,
                 function(x) {
                   which(GeneCalls[[i1]]$Index == x)
                 },
                 simplify = FALSE)
    i6 <- sapply(i4,
                 function(x) {
                   which(GeneCalls[[i2]]$Index == x)
                 },
                 simplify = FALSE)
    i1lower <- sapply(i5,
                      function(x) {
                        if (length(x) > 0L) {
                          min(x)
                        } else {
                          NA # NA Placeholder - should never be accessed, but necessary to keep index matching
                        }
                      },
                      simplify = TRUE)
    i1upper <- sapply(i5,
                      function(x) {
                        if (length(x) > 0L) {
                          max(x)
                        } else {
                          NA
                        }
                      },
                      simplify = TRUE)
    i2lower <- sapply(i6,
                      function(x) {
                        if (length(x) > 0L) {
                          min(x)
                        } else {
                          NA # NA Placeholder - should never be accessed, but necessary to keep index matching
                        }
                      },
                      simplify = TRUE)
    i2upper <- sapply(i6,
                      function(x) {
                        if (length(x) > 0L) {
                          max(x)
                        } else {
                          NA
                        }
                      },
                      simplify = TRUE)
    
    # pull seqs, this needs to be more efficient in the future
    if (prev_w1 != w1) {
      out1 <- SearchDB(dbFile = dbConn,
                       tblName = "NTs",
                       identifier = as.character(w1),
                       verbose = FALSE,
                       nameBy = "description")
      out2 <- SearchDB(dbFile = dbConn,
                       tblName = "AAs",
                       identifier = as.character(w1),
                       verbose = FALSE,
                       nameBy = "description")
      
      # DBQUERY <- paste("select length, mod, code, cds from NTs where identifier is",
      #                  w1)
      # out3 <- dbGetQuery(conn = dbConn,
      #                    statement = DBQUERY)
      CurrentW1Seqs <- list("DNA" = out1,
                            "AA" = out2,
                            "Struct" = PredictHEC(myAAStringSet = out2,
                                                  type = "probabilities",
                                                  HEC_MI1 = MAT1,
                                                  HEC_MI2 = MAT2),
                            "NTCount" = width(out1),
                            "CodingVal1" = GeneCalls[[match(x = w1,
                                                            table = GCIDs)]]$Coding,
                            "CodingVal2" = width(out1) %% 3L == 0L,
                            "CDSCount" = lengths(GeneCalls[[match(x = w1,
                                                                  table = GCIDs)]]$Range))
    }
    
    if (prev_w2 != w2) {
      out1 <- SearchDB(dbFile = dbConn,
                       tblName = "NTs",
                       identifier = as.character(w2),
                       verbose = FALSE,
                       nameBy = "description")
      out2 <- SearchDB(dbFile = dbConn,
                       tblName = "AAs",
                       identifier = as.character(w2),
                       verbose = FALSE,
                       nameBy = "description")
      # DBQUERY <- paste("select length, mod, code, cds from NTs where identifier is",
      #                  w2)
      # out3 <- dbGetQuery(conn = dbConn,
      #                    statement = DBQUERY)
      CurrentW2Seqs <- list("DNA" = out1,
                            "AA" = out2,
                            "Struct" = PredictHEC(myAAStringSet = out2,
                                                  type = "probabilities",
                                                  HEC_MI1 = MAT1,
                                                  HEC_MI2 = MAT2),
                            "NTCount" = width(out1),
                            "CodingVal1" = GeneCalls[[match(x = w2,
                                                            table = GCIDs)]]$Coding,
                            "CodingVal2" = width(out1) %% 3L == 0L,
                            "CDSCount" = lengths(GeneCalls[[match(x = w2,
                                                                  table = GCIDs)]]$Range))
    }
    
    
    nuc1 <- oligonucleotideFrequency(x = CurrentW1Seqs$DNA,
                                     width = KmerSize,
                                     as.prob = TRUE)
    nuc2 <- oligonucleotideFrequency(x = CurrentW2Seqs$DNA,
                                     width = KmerSize,
                                     as.prob = TRUE)
    Features01Match <- match(x = names(CurrentW1Seqs$DNA),
                             table = names(CurrentW1Seqs$AA))
    Features02Match <- match(x = names(CurrentW2Seqs$DNA),
                             table = names(CurrentW2Seqs$AA))
    
    # for each Index Pair Matrix
    Res[[m1]] <- vector(mode = "list",
                        length = length(IMat))
    for (m2 in seq_along(IMat)) {
      if (Verbose) {
        cat(paste0("Index pair ",
                   IMat[[m2]][1L, 2L],
                   " - ",
                   IMat[[m2]][1L, 5L],
                   "\n"))
      }
      # current offsets
      ci1lower <- i1lower[IMat[[m2]][1L, 2L]]
      ci1upper <- i1upper[IMat[[m2]][1L, 2L]]
      ci2lower <- i2lower[IMat[[m2]][1L, 5L]]
      ci2upper <- i2upper[IMat[[m2]][1L, 5L]]
      if (Verbose) {
        cat(paste0("Feature set ",
                   ci1lower,
                   " - ",
                   ci1upper,
                   " and ",
                   ci2lower,
                   " - ",
                   ci2upper,
                   ":\n"))
      }
      dr1 <- IMat[[m2]][, 3L] - IMat[[m2]][, 6L]
      dr2 <- IMat[[m2]][, 3L] + IMat[[m2]][, 6L]
      IMat[[m2]] <- cbind(IMat[[m2]],
                          "ID" = seq(nrow(IMat[[m2]])),
                          "rank1" = dr1,
                          "rank2" = dr2)
      
      # given the diagonal and anti-diagonal `ranks` that have been assigned
      # build 3 ledgers:
      # one for the diagonal, one for the ant-diagonal, and one for singletons
      # you can be in both the diagonal and the anti-diagonal ledger at the same tiem
      # but you cannot be in the singleton ledger and either diag or anti-diag
      
      # at this point we have 4 points describing each pair
      # 1: g1 feature ID number
      # 2: g2 features ID number
      # 3: diag rank
      # 4: anti-diag rank
      
      dr3 <- unname(split(x = IMat[[m2]],
                          f = dr1,
                          drop = TRUE))
      dr4 <- unname(split(x = IMat[[m2]],
                          f = dr2,
                          drop = TRUE))
      
      for (m3 in seq_along(dr3)) {
        # if the current rank has more than one pair
        if (nrow(dr3[[m3]]) > 1L) {
          # build a dummy vector for the split
          sp1 <- vector(mode = "integer",
                        length = nrow(dr3[[m3]]))
          # extract the feature positions that are always increasing
          sp2 <- dr3[[m3]][, 3L]
          # construct iterators
          it1 <- 1L
          it2 <- sp2[1L]
          # loop through dummy vector, and assign the split iterator
          # updating it when a gap larger than the tolerance appears
          for (m4 in seq_along(sp1)) {
            it3 <- sp2[m4]
            if (it3 - it2 > GapTolerance) {
              it1 <- it1 + 1L
            }
            sp1[m4] <- it1
            it2 <- it3
          } # end loop through split map
          
          if (it1 > 1L) {
            # the split map was updated, split the matrix
            dr3[[m3]] <- unname(split(x = dr3[[m3]],
                                      f = sp1))
          } else {
            dr3[[m3]] <- dr3[m3]
          }
          
        } else {
          dr3[[m3]] <- dr3[m3]
        }
      } # end m3 loop through dr3
      for (m3 in seq_along(dr4)) {
        # if the current rank has more than one pair
        if (nrow(dr4[[m3]]) > 1L) {
          # build a dummy vector for the split
          sp1 <- vector(mode = "integer",
                        length = nrow(dr4[[m3]]))
          # extract the feature positions that are always increasing
          sp2 <- dr4[[m3]][, 3L]
          # construct iterators
          it1 <- 1L
          it2 <- sp2[1L]
          # loop through dummy vector, and assign the split iterator
          # updating it when a gap larger than the tolerance appears
          for (m4 in seq_along(sp1)) {
            it3 <- sp2[m4]
            if (it3 - it2 > GapTolerance) {
              it1 <- it1 + 1L
            }
            sp1[m4] <- it1
            it2 <- it3
          } # end loop through split map
          
          if (it1 > 1L) {
            # the split map was updated, split the matrix
            dr4[[m3]] <- unname(split(x = dr4[[m3]],
                                      f = sp1))
          } else {
            dr4[[m3]] <- dr4[m3]
          }
          
        } else {
          dr4[[m3]] <- dr4[m3]
        }
      } # end m3 loop through dr4
      
      dr3 <- unlist(dr3,
                    recursive = FALSE)
      # dr3 IDs singletons
      dr3a <- unlist(sapply(dr3,
                            function(x) {
                              if (nrow(x) == 1L) {
                                x$ID
                              } else {
                                NA
                              }
                            },
                            simplify = FALSE))
      # dr3 IDs blocks
      dr3b <- unlist(sapply(dr3,
                            function(x) {
                              if (nrow(x) > 1L) {
                                x$ID
                              }
                            },
                            simplify = FALSE))
      dr4 <- unlist(dr4,
                    recursive = FALSE)
      # dr4 IDs singletons
      dr4a <- unlist(sapply(dr4,
                            function(x) {
                              if (nrow(x) == 1L) {
                                x$ID
                              } else {
                                NA
                              }
                            },
                            simplify = FALSE))
      # dr4 IDs blocks
      dr4b <- unlist(sapply(dr4,
                            function(x) {
                              if (nrow(x) > 1L) {
                                x$ID
                              }
                            },
                            simplify = FALSE))
      
      # drop dr3 singleton positions that are present in dr4
      dr3c <- dr3a %in% dr4b
      # drop dr4 singleton positions that are present in dr3
      dr4c <- dr4a %in% dr3b
      # drop singletons from one rank set that are present in a block from the
      # other set
      
      dr3 <- dr3[!dr3c]
      dr4 <- dr4[!dr4c]
      
      dr5 <- c(dr3, dr4)
      dr5 <- unique(dr5)
      if (DropSingletons) {
        checkrows <- sapply(dr5,
                            function(x) nrow(x),
                            simplify = TRUE)
        dr5 <- dr5[checkrows > 1L]
        if (length(dr5) == 0L) {
          # break out of m2 position without assigning Res[[m1]][[m2]] anything
          next
        }
      }
      # loop through dr5
      # if the position is a singleton pair, assign all possible expansions
      # if the position is a blocked set of pairs assign the two expansions
      
      dr6 <- vector(mode = "list",
                    length = length(dr5))
      for (m3 in seq_along(dr6)) {
        f1s <- dr5[[m3]][, 3L]
        f2s <- dr5[[m3]][, 6L]
        if (length(f1s) == 1L) {
          # singleton pair
          dr6[[m3]] <- data.frame("f1" = c(f1s - 1L, f1s - 1L, f1s + 1L, f1s + 1L),
                                  "f2" = c(f2s - 1L, f2s + 1L, f2s + 1L, f2s - 1L),
                                  "direction" = c(1L, 2L, 3L, 4L))
        } else {
          # a contiguous block of pairs
          if (length(unique(dr5[[m3]][, 8L])) == 1L) {
            # the regular diagonal
            f1s <- dr5[[m3]][, 3L]
            f2s <- dr5[[m3]][, 6L]
            f1f <- seq(from = min(f1s) - 1L,
                       to = max(f1s) + 1L,
                       by = 1L)
            f2f <- seq(from = min(f2s) - 1L,
                       to = max(f2s) + 1L,
                       by = 1L)
            f1f <- f1f[!(f1f %in% f1s)]
            f2f <- f2f[!(f2f %in% f2s)]
            
            if (length(f1f) > 2L) {
              # gaps in the block assign as no-expanding checks
              dr6[[m3]] <- data.frame("f1" = f1f,
                                      "f2" = f2f,
                                      "direction" = c(1L,
                                                      rep(0L,
                                                          length(f1f) - 2L),
                                                      3L))
            } else {
              # no gaps in the block
              dr6[[m3]] <- data.frame("f1" = f1f,
                                      "f2" = f2f,
                                      "direction" = c(1L,
                                                      3L))
            }
          } else {
            # the anti diagonal
            f1s <- dr5[[m3]][, 3L]
            f2s <- dr5[[m3]][, 6L]
            f1f <- seq(from = min(f1s) - 1L,
                       to = max(f1s) + 1L,
                       by = 1L)
            f2f <- seq(from = max(f2s) + 1L,
                       to = min(f2s) - 1L,
                       by = -1L)
            f1f <- f1f[!(f1f %in% f1s)]
            f2f <- f2f[!(f2f %in% f2s)]
            
            if (length(f1f) > 2L) {
              # gaps in the block assign as no-expanding checks
              dr6[[m3]] <- data.frame("f1" = f1f,
                                      "f2" = f2f,
                                      "direction" = c(2L,
                                                      rep(0L,
                                                          length(f1f) - 2L),
                                                      4L))
            } else {
              # no gaps in the block
              dr6[[m3]] <- data.frame("f1" = f1f,
                                      "f2" = f2f,
                                      "direction" = c(2L,
                                                      4L))
            }
          }
        } # end row check
      } # end dr6 loop
      
      dr6 <- do.call(rbind,
                     dr6)
      
      dr6 <- dr6[dr6[, 1L] >= ci1lower &
                   dr6[, 1L] <= ci1upper &
                   dr6[, 2L] >= ci2lower &
                   dr6[, 2L] <= ci2upper, , drop = FALSE]
      
      if (nrow(dr6) < 1L) {
        next
      }
      
      dr6 <- dr6[order(dr6[, 1L]), ]
      # return(dr6)
      # for every line in dr6
      # n + ? alignments will be attempted
      # for every n alignments that pass some threshold a new `Pair` is recorded
      # to add to pair summaries
      L <- nrow(dr6)
      VSize <- L * 2L
      
      if (AlignmentFun == "AlignProfiles") {
        
        p1placeholder <- p2placeholder <- p1FeatureLength <- p2FeatureLength <- rep(NA_integer_,
                                                                                    times = VSize)
        PIDVector <- SCOREVector <- NucDist <- rep(NA_real_,
                                                   times = VSize)
        AType <- rep(NA_character_,
                     times = VSize)
        
        Count <- 1L
        Continue <- TRUE
        if (Verbose) {
          pBar <- txtProgressBar(style = 1L)
        }
        
        for (m3 in seq(nrow(dr6))) {
          
          # each line contains feature coordinates for an alignment
          # and the direction to expand, if the alignment passes a threshold
          # with each expansion, check whether the expansion is within bounds
          f1 <- dr6[m3, 1L]
          f2 <- dr6[m3, 2L]
          advanceID <- dr6[m3, 3L]
          
          while (Continue) {
            # first ask if f1 and f2 are within bounds
            # if they are, do not attempt alignment
            # exit the while loop and move to the next position in dr6
            if (f1 > ci1upper |
                f1 < ci1lower |
                f2 > ci2upper |
                f2 < ci2lower) {
              # Continue <- FALSE
              break
            }
            # third ask if these sequences have already been aligned
            # CURRENTLY NOT IMPLEMENTED
            # third ask if both f1 and f2 are both coding
            if (CurrentW1Seqs$CodingVal1[f1] &
                CurrentW1Seqs$CodingVal2[f1] &
                CurrentW2Seqs$CodingVal1[f2] &
                CurrentW2Seqs$CodingVal2[f2]) {
              # both are coding
              
              ali <- AlignProfiles(pattern = CurrentW1Seqs$AA[Features01Match[f1]],
                                   subject = CurrentW2Seqs$AA[Features02Match[f2]],
                                   p.struct = CurrentW1Seqs$Struct[Features01Match[f1]],
                                   s.struct = CurrentW2Seqs$Struct[Features02Match[f2]])
              PID <- 1 - DistanceMatrix(myXStringSet = ali,
                                        type = "matrix",
                                        includeTerminalGaps = TRUE,
                                        verbose = FALSE)[1L, 2L]
              SCORE <- ScoreAlignment(myXStringSet = ali,
                                      structures = PredictHEC(ali,
                                                              type = "probabilities",
                                                              HEC_MI1 = MAT1,
                                                              HEC_MI2 = MAT2),
                                      structureMatrix = structureMatrix)
              
              CType <- "AA"
            } else {
              # at least one is not coding
              ali <- AlignProfiles(pattern = CurrentW1Seqs$DNA[f1],
                                   subject = CurrentW2Seqs$DNA[f2])
              PID <- 1 - DistanceMatrix(myXStringSet = ali,
                                        type = "matrix",
                                        includeTerminalGaps = TRUE,
                                        verbose = FALSE)[1L, 2L]
              SCORE <- ScoreAlignment(myXStringSet = ali,
                                      substitutionMatrix = substitutionMatrix)
              CType <- "NT"
            } # end if else statement for coding / non
            CurrentDist <- sqrt(sum((nuc1[f1, ] - nuc2[f2, ])^2)) / ((sum(nuc1[f1, ]) + sum(nuc2[f2, ])) / 2)
            # assign the dist 
            # update the count
            if (Criteria == "PID") {
              CHECK <- PID
            } else if (Criteria == "Score") {
              CHECK <- SCORE
            }
            if (CHECK < Floor) {
              # the current PID does not meet inclusion criteria
              # exit the while loop and move to the next position in dr6
              Continue <- FALSE
            } else {
              # PID meets inclusion criteria
              # assign integers to result vectors
              p1placeholder[Count] <- f1
              p2placeholder[Count] <- f2
              p1FeatureLength[Count] <- CurrentW1Seqs$NTCount[f1]
              p2FeatureLength[Count] <- CurrentW2Seqs$NTCount[f2]
              PIDVector[Count] <- PID
              SCOREVector[Count] <- SCORE
              AType[Count] <- CType
              NucDist[Count] <- CurrentDist
              
              # update f1 and f2
              if (advanceID == 0L) {
                # interior alignment, no forward movement allowed
                Continue <- FALSE
              } else if (advanceID == 1L) {
                # advancing down the diagonal
                f1 <- f1 - 1L
                f2 <- f2 - 1L
              } else if (advanceID == 2L) {
                # advancing up the anti-diagonal
                f1 <- f1 - 1L
                f2 <- f2 + 1L
              } else if (advanceID == 3L) {
                # advancing up the diagonal
                f1 <- f1 + 1L
                f2 <- f2 + 1L
              } else if (advanceID == 4L) {
                # advancing down the anti-diagonal
                f1 <- f1 + 1L
                f2 <- f2 - 1L
              } # end advancement if elses 
              
              Count <- Count + 1L
              if (Count >= VSize) {
                # if Count exceeds VSize, increase size
                VSize <- VSize * 2L
                p1placeholder <- c(p1placeholder,
                                   rep(NA_integer_,
                                       times = VSize))
                p2placeholder <- c(p2placeholder,
                                   rep(NA_integer_,
                                       times = VSize))
                p1FeatureLength <- c(p1FeatureLength,
                                     rep(NA_integer_,
                                         times = VSize))
                p2FeatureLength <- c(p2FeatureLength,
                                     rep(NA_integer_,
                                         times = VSize))
                PIDVector <- c(PIDVector,
                               rep(NA_real_,
                                   times = VSize))
                SCOREVector <- c(SCOREVector,
                                 rep(NA_real_,
                                     times = VSize))
                AType <- c(AType,
                           rep(NA_character_,
                               times = VSize))
                NucDist <- c(NucDist,
                             rep(NA_real_,
                                 times = VSize))
              } # end count size if statement
              
            } # end PID check
          } # end while loop
          Continue <- TRUE
          
          if (Verbose) {
            setTxtProgressBar(pb = pBar,
                              value = m3 / L)
          }
        } # end of m3 loop through dr6
        
        if (Verbose) {
          close(pBar)
        }
        
        if (any(!is.na(PIDVector))) {
          L2 <- max(which(!is.na(PIDVector)))
          
          p1placeholder <- p1placeholder[seq_len(L2)]
          p2placeholder <- p2placeholder[seq_len(L2)]
          p1FeatureLength <- p1FeatureLength[seq_len(L2)]
          p2FeatureLength <- p2FeatureLength[seq_len(L2)]
          PIDVector <- PIDVector[seq_len(L2)]
          SCOREVector <- SCOREVector[seq_len(L2)]
          AType <- AType[seq_len(L2)]
          NucDist <- NucDist[seq_len(L2)]
          
          Res[[m1]][[m2]] <- data.frame("p1" = paste(rep(w1,
                                                         times = L2),
                                                     rep(IMat[[m2]][1L, 2L],
                                                         times = L2),
                                                     p1placeholder,
                                                     sep = "_"),
                                        "p2" = paste(rep(w2,
                                                         times = L2),
                                                     rep(IMat[[m2]][1L, 5L],
                                                         times = L2),
                                                     p2placeholder,
                                                     sep = "_"),
                                        "Consensus" = rep(0,
                                                          times = L2),
                                        "p1featurelength" = p1FeatureLength,
                                        "p2featurelength" = p2FeatureLength,
                                        "blocksize" = rep(1L,
                                                          times = L2),
                                        "KDist" = NucDist,
                                        "TotalMatch" = rep(0L,
                                                           times = L2),
                                        "MaxMatch" = rep(0L,
                                                         times = L2),
                                        "UniqueMatches" = rep(0L,
                                                              times = L2),
                                        "PID" = PIDVector,
                                        "Score" = SCOREVector,
                                        "Alignment" = AType,
                                        "Block_UID" = rep(-1L,
                                                          times = L2),
                                        "ClusterID" = rep(NewClusterID,
                                                          times = L2),
                                        stringsAsFactors = FALSE)
          
        } else {
          # do nothing
        }
        
      } else if (AlignmentFun == "AlignPairs") {
        # split dr6 into coding and non-coding then align
        if (Verbose) {
          pBar <- txtProgressBar(style = 1)
        }
        dr6 <- unique(dr6[, c(1, 2)])
        rownames(dr6) <- NULL
        code_select <- CurrentW1Seqs$CodingVal1[dr6[, 1L]] &
          CurrentW1Seqs$CodingVal2[dr6[, 1L]] &
          CurrentW2Seqs$CodingVal1[dr6[, 2L]] &
          CurrentW2Seqs$CodingVal2[dr6[, 2L]]
        
        # if (m2 == 2) {
        #   return(dr6)
        # }
        
        if (any(code_select)) {
          aa_pairs <- data.frame("Pattern" = Features01Match[dr6$f1[code_select]],
                                 "Subject" = Features02Match[dr6$f2[code_select]])
          # return(list("a" = aa_pairs,
          #             "b" = dr6,
          #             "c" = CurrentW1Seqs,
          #             "d" = CurrentW2Seqs))
          aa_pairs$Position <- mapply(SIMPLIFY = FALSE,
                                      FUN = function(y, z) {
                                        cbind(matrix(0L,
                                                     4),
                                              matrix(data = c(y,y,z,z),
                                                     4))
                                      },
                                      y = width(CurrentW1Seqs$AA[aa_pairs$Pattern]) + 1L,
                                      z = width(CurrentW2Seqs$AA[aa_pairs$Subject]) + 1L)
          aa_res <- AlignPairs(pattern = CurrentW1Seqs$AA,
                               subject = CurrentW2Seqs$AA,
                               pairs = aa_pairs,
                               processors = Processors,
                               verbose = FALSE)
          
          aa_pid <- aa_res$Matches / aa_res$AlignmentLength
          aa_score <- aa_res$Score / aa_res$AlignmentLength
          aa_dist <- mapply(USE.NAMES = FALSE,
                            FUN = function(x, y) {
                              sqrt(sum((nuc1[x, ] - nuc2[y, ])^2)) / ((sum(nuc1[x, ]) + sum(nuc2[y, ])) / 2)
                            },
                            x = dr6$f1[code_select],
                            y = dr6$f2[code_select])
          # build out a df
          L2 <- nrow(aa_res)
          aa_int <- data.frame("p1" = names(CurrentW1Seqs$AA[aa_pairs$Pattern]),
                               "p2" = names(CurrentW2Seqs$AA[aa_pairs$Subject]),
                               "Consensus" = rep(0,
                                                 times = L2),
                               "p1featurelength" = CurrentW1Seqs$NTCount[dr6$f1[code_select]],
                               "p2featurelength" = CurrentW2Seqs$NTCount[dr6$f2[code_select]],
                               "blocksize" = rep(1L,
                                                 times = L2),
                               "KDist" = aa_dist,
                               "TotalMatch" = rep(0L,
                                                  times = L2),
                               "MaxMatch" = rep(0L,
                                                times = L2),
                               "UniqueMatches" = rep(0L,
                                                     times = L2),
                               "PID" = aa_pid,
                               "Score" = aa_score,
                               "Alignment" = rep("AA",
                                                 times = L2),
                               "Block_UID" = rep(-1L,
                                                 times = L2),
                               "ClusterID" = rep(NewClusterID,
                                                 times = L2),
                               stringsAsFactors = FALSE)
        } else {
          # build a df with no rows
          aa_int <- data.frame("p1" = character(0L),
                               "p2" = character(0L),
                               "Consensus" = numeric(0L),
                               "p1featurelength" = integer(0L),
                               "p2featurelength" = integer(0L),
                               "blocksize" = integer(0L),
                               "KDist" = numeric(0L),
                               "TotalMatch" = integer(0L),
                               "MaxMatch" = integer(0L),
                               "UniqueMatches" = integer(0L),
                               "PID" = numeric(0L),
                               "Score" = numeric(0L),
                               "Alignment" = character(0L),
                               "Block_UID" = integer(0L),
                               "ClusterID" = integer(0L),
                               stringsAsFactors = FALSE)
        }
        
        if (Verbose) {
          setTxtProgressBar(pb = pBar,
                            value = 1 / 2)
        }
        
        if (any(!code_select)) {
          nt_pairs <- dr6[!code_select, ]
          colnames(nt_pairs) <- c("Pattern",
                                  "Subject")
          rownames(nt_pairs) <- NULL
          nt_pairs$Position <- mapply(SIMPLIFY = FALSE,
                                      FUN = function(y, z) {
                                        cbind(matrix(0L,
                                                     4),
                                              matrix(data = c(y,y,z,z),
                                                     4))
                                      },
                                      y = width(CurrentW1Seqs$DNA[nt_pairs$Pattern]) + 1L,
                                      z = width(CurrentW2Seqs$DNA[nt_pairs$Subject]) + 1L)
          nt_res <- AlignPairs(pattern = CurrentW1Seqs$DNA,
                               subject = CurrentW2Seqs$DNA,
                               pairs = nt_pairs,
                               processors = Processors,
                               verbose = FALSE)
          nt_pid <- nt_res$Matches / nt_res$AlignmentLength
          nt_score <- nt_res$Score / nt_res$AlignmentLength
          nt_dist <- mapply(USE.NAMES = FALSE,
                            FUN = function(x, y) {
                              sqrt(sum((nuc1[x, ] - nuc2[y, ])^2)) / ((sum(nuc1[x, ]) + sum(nuc2[y, ])) / 2)
                            },
                            x = nt_res$Pattern,
                            y = nt_res$Subject)
          L2 <- nrow(nt_res)
          # build out a df
          # return(list("p1" = names(CurrentW1Seqs$DNA[nt_pairs$Pattern]),
          #             "p2" = names(CurrentW2Seqs$DNA[nt_pairs$Subject]),
          #             "Consensus" = rep(0,
          #                               times = L2),
          #             "p1featurelength" = CurrentW1Seqs$NTCount[dr6$f1[!code_select]],
          #             "p2featurelength" = CurrentW2Seqs$NTCount[dr6$f2[!code_select]],
          #             "blocksize" = rep(1L,
          #                               times = L2),
          #             "KDist" = nt_dist,
          #             "TotalMatch" = rep(0L,
          #                                times = L2),
          #             "MaxMatch" = rep(0L,
          #                              times = L2),
          #             "UniqueMatches" = rep(0L,
          #                                   times = L2),
          #             "PID" = nt_pid,
          #             "Score" = nt_score,
          #             "Alignment" = rep("AA",
          #                               times = L2),
          #             "Block_UID" = rep(-1L,
          #                               times = L2),
          #             "ClusterID" = rep(NewClusterID,
          #                               times = L2)))
          nt_int <- data.frame("p1" = names(CurrentW1Seqs$DNA[nt_pairs$Pattern]),
                               "p2" = names(CurrentW2Seqs$DNA[nt_pairs$Subject]),
                               "Consensus" = rep(0,
                                                 times = L2),
                               "p1featurelength" = CurrentW1Seqs$NTCount[dr6$f1[!code_select]],
                               "p2featurelength" = CurrentW2Seqs$NTCount[dr6$f2[!code_select]],
                               "blocksize" = rep(1L,
                                                 times = L2),
                               "KDist" = nt_dist,
                               "TotalMatch" = rep(0L,
                                                  times = L2),
                               "MaxMatch" = rep(0L,
                                                times = L2),
                               "UniqueMatches" = rep(0L,
                                                     times = L2),
                               "PID" = nt_pid,
                               "Score" = nt_score,
                               "Alignment" = rep("NT",
                                                 times = L2),
                               "Block_UID" = rep(-1L,
                                                 times = L2),
                               "ClusterID" = rep(NewClusterID,
                                                 times = L2),
                               stringsAsFactors = FALSE)
        } else {
          # build out a df with no rows
          nt_int <- data.frame("p1" = character(0L),
                               "p2" = character(0L),
                               "Consensus" = numeric(0L),
                               "p1featurelength" = integer(0L),
                               "p2featurelength" = integer(0L),
                               "blocksize" = integer(0L),
                               "KDist" = numeric(0L),
                               "TotalMatch" = integer(0L),
                               "MaxMatch" = integer(0L),
                               "UniqueMatches" = integer(0L),
                               "PID" = numeric(0L),
                               "Score" = numeric(0L),
                               "Alignment" = character(0L),
                               "Block_UID" = integer(0L),
                               "ClusterID" = integer(0L),
                               stringsAsFactors = FALSE)
        }
        
        if (Verbose) {
          setTxtProgressBar(pb = pBar,
                            value = 2 / 2)
        }
        
        Res[[m1]][[m2]] <- rbind(aa_int,
                                 nt_int)
        Res[[m1]][[m2]] <- Res[[m1]][[m2]][Res[[m1]][[m2]][, Criteria] > Floor, ]
        # return(list("aa" = aa_res,
        #             "nt" = nt_res,
        #             "in1" = aa_pairs,
        #             "in2" = nt_pairs))
        # stop ("Expansion with AlignPairs() is not yet implemented.")
        if (Verbose) {
          close(pBar)
        }
      }
      
    } # end m2 loop
  } # end m1 loop
  
  Res <- unlist(Res,
                recursive = FALSE)
  Res <- do.call(rbind,
                 Res)
  
  if (is.null(Res)) {
    # no new pairs discovered
    Res <- data.frame("p1" = character(0L),
                      "p2" = character(0L),
                      "Consensus" = numeric(0L),
                      "p1featurelength" = integer(0L),
                      "p2featurelength" = integer(0L),
                      "blocksize" = integer(0L),
                      "KDist" = integer(0L),
                      "TotalMatch" = integer(0L),
                      "MaxMatch" = integer(0L),
                      "UniqueMatches" = integer(0L),
                      "PID" = numeric(0L),
                      "SCORE" = numeric(0L),
                      "Alignment" = character(0L),
                      "Block_UID" = integer(0L),
                      "ClusterID" = integer(0L),
                      stringsAsFactors = FALSE)
  } else {
    # if Res is not NULL
    # remove duplicates
    if (nrow(Res) > 1L) {
      IDS <- paste(Res$p1,
                   Res$p2,
                   sep = "_")
      check1 <- !duplicated(IDS)
      Res <- Res[check1, , drop = FALSE]
      # remove IDs present in the original object IDs
      if (nrow(Res) > 0L) {
        IDS <- paste(Res$p1,
                     Res$p2,
                     sep = "_")
        check2 <- !(IDS %in% POIDs)
        Res <- Res[check2, , drop = FALSE]
      }
    }
  } # end logical check for whether Res is NULL or not
  # if an object that wasn't clustered beforehand was supplied, delete the column
  if (!("ClusterID" %in% colnames(SynExtendObject))) {
    Res <- Res[, -which(colnames(Res) == "ClusterID")]
  }
  # if new rows have been added, fold them in, recalculate blocksize
  if (nrow(Res) > 0L) {
    # return(list(SynExtendObject,
    #             Res,
    #             CurrentW1Seqs,
    #             CurrentW2Seqs))
    Res <- rbind(SynExtendObject,
                 Res)
    
    block_uid <- 1L
    FeaturesMat2 <- do.call(rbind,
                            strsplit(x = paste(Res$p1,
                                               Res$p2,
                                               sep = "_"),
                                     split = "_",
                                     fixed = TRUE))
    FeaturesMat2 <- data.frame("g1" = as.integer(FeaturesMat2[, 1L]),
                               "i1" = as.integer(FeaturesMat2[, 2L]),
                               "f1" = as.integer(FeaturesMat2[, 3L]),
                               "g2" = as.integer(FeaturesMat2[, 4L]),
                               "i2" = as.integer(FeaturesMat2[, 5L]),
                               "f2" = as.integer(FeaturesMat2[, 6L]))
    o1 <- order(FeaturesMat2[, 4L],
                FeaturesMat2[, 1L],
                FeaturesMat2[, 5L],
                FeaturesMat2[, 2L],
                FeaturesMat2[, 6L],
                FeaturesMat2[, 3L],
                decreasing = FALSE)
    
    Res <- Res[o1, ]
    FeaturesMat2 <- FeaturesMat2[o1, ]
    rownames(Res) <- NULL
    # reset the blocksizes
    blockreplacement <- rep(1L,
                            nrow(Res))
    for (m2 in seq_len(nrow(GMat))) {
      wi1 <- which(FeaturesMat2[, 1L] == GMat[m2, 1L] &
                     FeaturesMat2[, 4L] == GMat[m2, 2L])
      names(wi1) <- rownames(FeaturesMat2[wi1, ])
      
      # block size determination
      if (length(wi1) > 1) {
        SubFeatures <- FeaturesMat2[wi1, c(2,3,5,6)]
        # only run block size checks if enough rows are present
        # FeaturesMat <- data.frame("i1" = IMatrix[, 1L],
        #                           "f1" = PMatrix[, 1L],
        #                           "i2" = IMatrix[, 2L],
        #                           "f2" = PMatrix[, 2L])
        dr1 <- SubFeatures[, 2L] + SubFeatures[, 4L]
        dr2 <- SubFeatures[, 2L] - SubFeatures[, 4L]
        InitialBlocks1 <- unname(split(x = SubFeatures,
                                       f = list(as.integer(SubFeatures[, 1L]),
                                                as.integer(SubFeatures[, 3L]),
                                                dr1),
                                       drop = TRUE))
        InitialBlocks2 <- unname(split(x = SubFeatures,
                                       f = list(as.integer(SubFeatures[, 1L]),
                                                as.integer(SubFeatures[, 3L]),
                                                dr2),
                                       drop = TRUE))
        Blocks <- c(InitialBlocks1[sapply(InitialBlocks1,
                                          function(x) nrow(x),
                                          simplify = TRUE) > 1],
                    InitialBlocks2[sapply(InitialBlocks2,
                                          function(x) nrow(x),
                                          simplify = TRUE) > 1])
        L01 <- length(Blocks)
        if (L01 > 0) {
          for (m3 in seq_along(Blocks)) {
            # blocks are guaranteed to contain more than 1 row
            
            sp1 <- vector(mode = "integer",
                          length = nrow(Blocks[[m3]]))
            # we need to check both columns here, this currently is not correct
            # in all cases
            sp2 <- Blocks[[m3]][, 4L]
            sp3 <- Blocks[[m3]][, 2L]
            
            it1 <- 1L
            it2 <- sp2[1L]
            it4 <- sp3[1L]
            # create a map vector on which to split the groups, if necessary
            for (m4 in seq_along(sp1)) {
              it3 <- sp2[m4]
              it5 <- sp3[m4]
              if ((it3 - it2 > 1L) |
                  (it5 - it4 > 1L)) {
                # if predicted pairs are not contiguous, update the iterator
                it1 <- it1 + 1L
              }
              sp1[m4] <- it1
              it2 <- it3
              it4 <- it5
            }
            
            # if the splitting iterator was updated at all, a gap was detected
            if (it1 > 1L) {
              Blocks[[m3]] <- unname(split(x = Blocks[[m3]],
                                           f = sp1))
            } else {
              Blocks[[m3]] <- Blocks[m3]
            }
            
          } # end m3 loop
          # Blocks is now a list where each position is a set of blocked pairs
          Blocks <- unlist(Blocks,
                           recursive = FALSE)
          # drop blocks of size 1, they do not need to be evaluated
          Blocks <- Blocks[sapply(X = Blocks,
                                  FUN = function(x) {
                                    nrow(x)
                                  },
                                  simplify = TRUE) > 1L]
          L01 <- length(Blocks)
          AbsBlockSize <- rep(1L,
                              nrow(SubFeatures))
          BlockID_Map <- rep(-1L,
                             nrow(SubFeatures))
          # only bother with this if there are blocks remaining
          # otherwise AbsBlockSize, which is initialized as a vector of 1s
          # will be left as a vector of 1s, all pairs are singleton pairs in this scenario
          if (L01 > 0L) {
            for (m3 in seq_along(Blocks)) {
              # rownames of the Blocks dfs relate to row positions in the original
              # matrix
              pos <- match(x = rownames(Blocks[[m3]]),
                           table = names(wi1))
              val <- rep(nrow(Blocks[[m3]]),
                         nrow(Blocks[[m3]]))
              # do not overwrite positions that are in larger blocks
              keep <- AbsBlockSize[pos] < val
              if (any(keep)) {
                AbsBlockSize[pos[keep]] <- val[keep]
                BlockID_Map[pos[keep]] <- rep(block_uid,
                                              sum(keep))
                block_uid <- block_uid + 1L
              }
            } # end m3 loop
          } # end logical check for block size
        } else {
          # no blocks observed, all pairs present are singleton pairs
          AbsBlockSize <- rep(1L,
                              nrow(SubFeatures))
          BlockID_Map <- rep(-1L,
                             nrow(SubFeatures))
        }
      } else {
        AbsBlockSize <- 1L
      }
      blockreplacement[wi1] <- AbsBlockSize
      
    } # end m2 loop through genome comparisons
    Res$blocksize <- blockreplacement
    attr(x = Res,
         which = "KVal") <- attr(x = SynExtendObject,
                                 which = "KVal")
    if (Verbose) {
      TimeEnd <- Sys.time()
      print(TimeEnd - TimeStart)
    }
    return(Res)
  } else {
    if (Verbose) {
      TimeEnd <- Sys.time()
      print(TimeEnd - TimeStart)
    }
    return(SynExtendObject)
  }
}
npcooley/SynExtend documentation built on Nov. 15, 2024, 3:02 p.m.