# R/SummarizePairs.R
#
# Defines functions: SummarizePairs
#
# Documented in: SummarizePairs

###### -- summarize seqs ------------------------------------------------------
# author: nicholas cooley
# maintainer: nicholas cooley
# contact: npc19@pitt.edu
# given a LinkedPairs object and a database connection, return a
# 'PairSummaries' object
# this function will always align, unlike PairSummaries
# TODO:
# implement:
# ShowPlot
# Processors -- partially implemented, at least for AlignPairs
# ellipses

# this has undergone a pretty significant rewrite, including the implementation
# of some subfunctions to better compartmentalize some complicated tasks
# still need to implement a showplot argument and leverage processors for more
# than just search index and align pairs

SummarizePairs <- function(SynExtendObject,
                           DataBase01,
                           AlignmentFun = "AlignPairs",
                           DefaultTranslationTable = "11",
                           KmerSize = 5,
                           Verbose = FALSE,
                           ShowPlot = FALSE,
                           Processors = 1,
                           Storage = 2,
                           IndexParams = list("K" = 5),
                           SearchParams = list("perPatternLimit" = 0),
                           SearchScheme = "spike",
                           RejectBy = "rank",
                           RetainInternal = FALSE,
                           ...) {
  
  if (Verbose) {
    TimeStart <- Sys.time()
    pBar <- txtProgressBar(style = 1L)
  }
  
  # overhead checking
  # object types
  if (!is(object = SynExtendObject,
          class2 = "LinkedPairs")) {
    stop ("'SynExtendObject' is not an object of class 'LinkedPairs'.")
  }
  Size <- nrow(SynExtendObject)
  # if (!is(object = FeatureSeqs,
  #         class2 = "FeatureSeqs")) {
  #   stop ("'FeatureSeqs' is not an object of class 'FeatureSeqs'.")
  # }
  # we should only need to talk to the DataBase IF FeatureSeqs is not the right length
  
  if (is.character(DataBase01)) {
    if (!requireNamespace(package = "RSQLite",
                          quietly = TRUE)) {
      stop("Package 'RSQLite' must be installed.")
    }
    if (!("package:RSQLite" %in% search())) {
      print("Eventually character vector access to DECIPHER DBs will be deprecated.")
      requireNamespace(package = "RSQLite",
                       quietly = TRUE)
    }
    dbConn <- dbConnect(dbDriver("SQLite"), DataBase01)
    on.exit(dbDisconnect(dbConn))
  } else {
    dbConn <- DataBase01
    if (!dbIsValid(dbConn)) {
      stop("The connection has expired.")
    }
  }
  if (!is.character(AlignmentFun) |
      length(AlignmentFun) > 1) {
    stop("AlignmentFun must be either 'AlignPairs' or 'AlignProfiles'.")
  }
  if (!(AlignmentFun %in% c("AlignPairs",
                            "AlignProfiles"))) {
    stop("AlignmentFun must be either 'AlignPairs' or 'AlignProfiles'.")
  }
  if (!is.character(DefaultTranslationTable) |
      length(DefaultTranslationTable) > 1) {
    stop("DefaultTranslationTable must be a character of length 1.")
  }
  # check storage
  if (Storage < 0) {
    stop("Storage must be greater than zero.")
  } else {
    Storage <- Storage * 1e9 # conversion to gigabytes
  }
  # deal with Processors, this mimics Erik's error checking
  if (!is.null(Processors) && !is.numeric(Processors)) {
    stop("Processors must be a numeric.")
  }
  if (!is.null(Processors) && floor(Processors) != Processors) {
    stop("Processors must be a whole number.")
  }
  if (!is.null(Processors) && Processors < 1) {
    stop("Processors must be at least 1.")
  }
  if (is.null(Processors)) {
    Processors <- DECIPHER:::.detectCores()
  } else {
    Processors <- as.integer(Processors)
  }
  # deal with user arguments
  # ignore 'verbose'
  UserArgs <- list(...)
  # if (length(UserArgs) > 0) {
  #   UserArgNames <- names(UserArgs)
  #   AlignPairsArgs <- formals(AlignPairs)
  #   AlignPairsArgNames <- names(AlignPairsArgs)
  #   AlignProfilesArgs <- formals(AlignProfiles)
  #   AlignProfilesArgNames <- names(AlignProfilesArgs)
  #   DistanceMatrixArgs <- formals(DistanceMatrix)
  #   DistanceMatrixArgNames <- names(DistanceMatrixArgs)
  #   
  #   APaArgNames <- APrArgNames <- DMArgNames <- vector(mode = "character",
  #                                                    length = length(UserArgNames))
  #   for (m1 in seq_along(UserArgNames)) {
  #     APaArgNames[m1] <- match.arg(arg = UserArgNames[m1],
  #                                  choices = AlignPairsArgNames)
  #     APrArgNames[m1] <- match.arg(arg = UserArgNames[m1],
  #                                  choices = AlignProfilesArgNames)
  #     DMArgNames[m1] <- match.arg(arg = UserArgNames[m1],
  #                                 choices = DistanceMatrixArgNames)
  #   }
  #   # set the user args for AlignProfiles
  #   if (any(APaArgNames)) {
  #     
  #   }
  #   # set the user args for AlignPairs
  #   # set the user args for DistanceMatrix
  # }
  
  GeneCalls <- attr(x = SynExtendObject,
                    which = "GeneCalls")
  GeneCallIDs <- names(GeneCalls)
  ObjectIDs <- rownames(SynExtendObject)
  # when we subset a LinkedPairsObject it doesn't smartly handle the genecalls yet...
  if (!all(ObjectIDs %in% GeneCallIDs)) {
    stop("Function expects all IDs in the SynExtendObject to have supplied GeneCalls.")
  }
  
  AA_matrix <- DECIPHER:::.getSubMatrix("PFASUM50")
  NT_matrix <- DECIPHER:::.nucleotideSubstitutionMatrix(2L, -1L, 1L)
  
  feature_match <- match(x = ObjectIDs,
                         table = GeneCallIDs)
  # These need to all be switched over to feature_match
  # we're only going to scroll through the cells that are supplied in the object
  # the datapool only needs to be as long as the gene calls object
  DataPool <- vector(mode = "list",
                     length = length(GeneCallIDs))
  
  MAT1 <- get(data("HEC_MI1",
                   package = "DECIPHER",
                   envir = environment()))
  MAT2 <- get(data("HEC_MI2",
                   package = "DECIPHER",
                   envir = environment()))
  # set initial progress bars and iterators
  # PH is going to be the container that 'res' eventually gets constructed from
  # while Total
  if (AlignmentFun == "AlignProfiles") {
    # progress bar ticks through each alignment
    # this is the slower non-default option
    Total <- (Size - (Size - 1L)) / 2
    PH <- Attr_PH <- vector(mode = "list",
                            length = Total)
    Total <- sum(sapply(SynExtendObject[upper.tri(SynExtendObject)],
                        function(x) nrow(x),
                        USE.NAMES = FALSE,
                        simplify = TRUE))
    
    structureMatrix <- matrix(c(0.187, -0.8, -0.873,
                                -0.8, 0.561, -0.979,
                                -0.873, -0.979, 0.221),
                              3,
                              3,
                              dimnames=list(c("H", "E", "C"),
                                            c("H", "E", "C")))
    substitutionMatrix <- matrix(c(1.5, -2.134, -0.739, -1.298,
                                   -2.134, 1.832, -2.462, 0.2,
                                   -0.739, -2.462, 1.522, -2.062,
                                   -1.298, 0.2, -2.062, 1.275),
                                 nrow = 4,
                                 dimnames = list(DNA_BASES, DNA_BASES))
    
    # read this message after neighbors and k-mer dists ?
    if (Verbose) {
      cat("Aligning pairs.\n")
    }
  } else if (AlignmentFun == "AlignPairs") {
    # progress bar ticks through each cell
    # this is the faster default option
    # technically these alignments have the possibility of not being
    # as good as align profiles, but that's a hit we're willing to take
    Total <- (Size * (Size - 1L)) / 2L
    PH <- Attr_PH <- vector(mode = "list",
                            length = Total)
    Total <- Total * 2L
    if (Verbose) {
      cat("Collecting pairs.\n")
    }
  }
  
  Count <- 1L
  PBCount <- 0L
  # upper key!
  # QueryGene == 1
  # SubjectGene == 2
  # ExactOverlap == 3
  # QueryIndex == 4
  # SubjectIndex == 5
  # QLeft == 6
  # QRight == 7
  # SLeft == 8
  # SRight == 9
  # MaxKmer == 10
  # TotalKmer == 11
  
  # lower key!
  # same minus max and total, but for individual linking kmers
  # not all linking kmers
  block_uid <- 0L
  Prev_m1 <- 0L
  for (m1 in seq_len(Size - 1L)) {
    for (m2 in (m1 + 1L):Size) {
      # print(c(m1, m2))
      # build our data pool first
      # regardless of how we align or if we're including search index or not, we need to prepare and collect
      # the same basic statistics and look aheads
      if (is.null(DataPool[[feature_match[m1]]])) {
        # the pool position is empty, pull from the DB
        # and generate the AAStructures
        DataPool[[feature_match[m1]]]$DNA <- SearchDB(dbFile = dbConn,
                                                      tblName = "NTs",
                                                      identifier = ObjectIDs[m1],
                                                      verbose = FALSE,
                                                      nameBy = "description",
                                                      type = "DNAStringSet")
        DataPool[[feature_match[m1]]]$AA <- SearchDB(dbFile = dbConn,
                                                     tblName = "AAs",
                                                     identifier = ObjectIDs[m1],
                                                     verbose = FALSE,
                                                     nameBy = "description",
                                                     type = "AAStringSet")
        # # return(list("a" = DataPool[[feature_match[m1]]]$DNA,
        # #             "b" = DataPool[[feature_match[m1]]]$AA))
        # print("a")
        # cds values here aren't really CDS values, but an integer value counting
        # the number of feature ranges pulled from the GFFs
        DataPool[[feature_match[m1]]]$len <- width(DataPool[[feature_match[m1]]]$DNA)
        DataPool[[feature_match[m1]]]$mod <- DataPool[[feature_match[m1]]]$len %% 3L == 0
        DataPool[[feature_match[m1]]]$code <- GeneCalls[[feature_match[m1]]]$Coding
        DataPool[[feature_match[m1]]]$cds <- lengths(GeneCalls[[feature_match[m1]]]$Range)
        
        DataPool[[feature_match[m1]]]$struct <- PredictHEC(myAAStringSet = DataPool[[feature_match[m1]]]$AA,
                                                           type = "probabilities",
                                                           HEC_MI1 = MAT1,
                                                           HEC_MI2 = MAT2)
        DataPool[[feature_match[m1]]]$aa_backgrounds <- alphabetFrequency(x = DataPool[[feature_match[m1]]]$AA)
        DataPool[[feature_match[m1]]]$aa_backgrounds <- DataPool[[feature_match[m1]]]$aa_backgrounds[, colnames(AA_matrix)]
        DataPool[[feature_match[m1]]]$aa_backgrounds <- DataPool[[feature_match[m1]]]$aa_backgrounds / rowSums(DataPool[[feature_match[m1]]]$aa_backgrounds)
        DataPool[[feature_match[m1]]]$dna_backgrounds <- alphabetFrequency(x = DataPool[[feature_match[m1]]]$DNA)
        DataPool[[feature_match[m1]]]$dna_backgrounds <- DataPool[[feature_match[m1]]]$dna_backgrounds[, colnames(NT_matrix)]
        DataPool[[feature_match[m1]]]$dna_backgrounds <- DataPool[[feature_match[m1]]]$dna_backgrounds / rowSums(DataPool[[feature_match[m1]]]$dna_backgrounds)
        DataPool[[feature_match[m1]]]$aa_register <- match(table = which(DataPool[[feature_match[m1]]]$mod &
                                                                           DataPool[[feature_match[m1]]]$code),
                                                           x = seq(length(DataPool[[feature_match[m1]]]$DNA)))
        
        DataPool[[feature_match[m1]]]$index <- do.call(what = "IndexSeqs",
                                                       args = c(list("subject" = DataPool[[feature_match[m1]]]$AA,
                                                                     "verbose" = FALSE),
                                                                IndexParams))
        
      } else {
        # the pool position is not empty, assume that it's populated with all the information
        # that it needs
      }
      
      if (is.null(DataPool[[feature_match[m2]]])) {
        # the pool position is empty, pull from DB
        # and generate the AAStructures
        DataPool[[feature_match[m2]]]$DNA <- SearchDB(dbFile = dbConn,
                                                      tblName = "NTs",
                                                      identifier = ObjectIDs[m2],
                                                      verbose = FALSE,
                                                      nameBy = "description",
                                                      type = "DNAStringSet")
        DataPool[[feature_match[m2]]]$AA <- SearchDB(dbFile = dbConn,
                                                     tblName = "AAs",
                                                     identifier = ObjectIDs[m2],
                                                     verbose = FALSE,
                                                     nameBy = "description",
                                                     type = "AAStringSet")
        DataPool[[feature_match[m2]]]$len <- width(DataPool[[feature_match[m2]]]$DNA)
        DataPool[[feature_match[m2]]]$mod <- DataPool[[feature_match[m2]]]$len %% 3L == 0
        DataPool[[feature_match[m2]]]$code <- GeneCalls[[feature_match[m2]]]$Coding
        DataPool[[feature_match[m2]]]$cds <- lengths(GeneCalls[[feature_match[m2]]]$Range)
        
        DataPool[[feature_match[m2]]]$struct <- PredictHEC(myAAStringSet = DataPool[[feature_match[m2]]]$AA,
                                                           type = "probabilities",
                                                           HEC_MI1 = MAT1,
                                                           HEC_MI2 = MAT2)
        DataPool[[feature_match[m2]]]$aa_backgrounds <- alphabetFrequency(x = DataPool[[feature_match[m2]]]$AA)
        DataPool[[feature_match[m2]]]$dna_backgrounds <- alphabetFrequency(x = DataPool[[feature_match[m2]]]$DNA)
        DataPool[[feature_match[m2]]]$aa_backgrounds <- DataPool[[feature_match[m2]]]$aa_backgrounds[, colnames(AA_matrix)]
        DataPool[[feature_match[m2]]]$aa_backgrounds <- DataPool[[feature_match[m2]]]$aa_backgrounds / rowSums(DataPool[[feature_match[m2]]]$aa_backgrounds)
        DataPool[[feature_match[m2]]]$dna_backgrounds <- alphabetFrequency(x = DataPool[[feature_match[m2]]]$DNA)
        DataPool[[feature_match[m2]]]$dna_backgrounds <- DataPool[[feature_match[m2]]]$dna_backgrounds[, colnames(NT_matrix)]
        DataPool[[feature_match[m2]]]$dna_backgrounds <- DataPool[[feature_match[m2]]]$dna_backgrounds / rowSums(DataPool[[feature_match[m2]]]$dna_backgrounds)
        DataPool[[feature_match[m2]]]$aa_register <- match(table = which(DataPool[[feature_match[m2]]]$mod &
                                                                           DataPool[[feature_match[m2]]]$code),
                                                           x = seq(length(DataPool[[feature_match[m2]]]$DNA)))
        
        DataPool[[feature_match[m2]]]$index <- do.call(what = "IndexSeqs",
                                                       args = c(list("subject" = DataPool[[feature_match[m2]]]$AA,
                                                                     "verbose" = FALSE),
                                                                IndexParams))
      } else {
        # the pool position is not empty, assume that it's populated with all the information
        # that it needs
      }
      
      if (Prev_m1 != m1) {
        QueryDNA <- DataPool[[feature_match[m1]]]$DNA
        QueryAA <- DataPool[[feature_match[m1]]]$AA
        QNTCount <- DataPool[[feature_match[m1]]]$len
        QMod <- DataPool[[feature_match[m1]]]$mod
        QCode <- DataPool[[feature_match[m1]]]$code
        QCDSCount <- DataPool[[feature_match[m1]]]$cds
        QueryStruct <- DataPool[[feature_match[m1]]]$struct
        QueryIndex <- DataPool[[feature_match[m1]]]$index
        Query_Background_AA <- DataPool[[feature_match[m1]]]$aa_backgrounds
        Query_Background_NT <- DataPool[[feature_match[m1]]]$dna_backgrounds
        Q_AA_Register <- DataPool[[feature_match[m1]]]$aa_register
        if (KmerSize < 10L) {
          Q_NucF <- oligonucleotideFrequency(x = QueryDNA,
                                             width = KmerSize,
                                             as.prob = TRUE)
        } else {
          stop ("non-overlapping kmers not implemented")
        }
      } else {
        # do something else?
      }
      
      # m2 never stays the same, it will always change in the current search
      # strategy
      SubjectDNA <- DataPool[[feature_match[m2]]]$DNA
      SubjectAA <- DataPool[[feature_match[m2]]]$AA
      SNTCount <- DataPool[[feature_match[m2]]]$len
      SMod <- DataPool[[feature_match[m2]]]$mod
      SCode <- DataPool[[feature_match[m2]]]$code
      SCDSCount <- DataPool[[feature_match[m2]]]$cds
      SubjectStruct <- DataPool[[feature_match[m2]]]$struct
      SubjectIndex <- DataPool[[feature_match[m2]]]$index
      Subject_Background_AA <- DataPool[[feature_match[m2]]]$aa_backgrounds
      Subject_Background_NT <- DataPool[[feature_match[m2]]]$dna_backgrounds
      S_AA_Register <- DataPool[[feature_match[m2]]]$aa_register
      if (KmerSize < 10L) {
        S_NucF <- oligonucleotideFrequency(x = SubjectDNA,
                                           width = KmerSize,
                                           as.prob = TRUE)
      } else {
        stop ("non-overlapping kmers not implemented")
      }
      
      # return(list("QueryData" = list("dna" = QueryDNA,
      #                                "aa" = QueryAA,
      #                                "len" = QNTCount,
      #                                "mod" = QMod,
      #                                "code" = QCode,
      #                                "CDS" = QCDSCount,
      #                                "struct" = QueryStruct,
      #                                "index" = QueryIndex,
      #                                "aa_background" = Query_Background_AA,
      #                                "nt_background" = Query_Background_NT,
      #                                "register" = Q_AA_Register),
      #             "SubjectData" = list("dna" = SubjectDNA,
      #                                  "aa" = SubjectAA,
      #                                  "len" = SNTCount,
      #                                  "mod" = SMod,
      #                                  "code" = SCode,
      #                                  "CDS" = SCDSCount,
      #                                  "struct" = SubjectStruct,
      #                                  "index" = SubjectIndex,
      #                                  "aa_background" = Subject_Background_AA,
      #                                  "nt_background" = Subject_Background_NT,
      #                                  "register" = S_AA_Register)))
      
      # step 1: build indexes into the data pool if they don't exist already
      # # this is already accomplished
      
      # step 2:
      # changing the strategy here for erik:
      # if RejectBy == "none" run as was previously the case
      # if RejectBy == "Rank" run Erik's ranking scheme, i.e. search the reverse and then rank the hits
      # if RejectBy == "kmeans" run what amounts to ClusterByK, include the reverse search as the 
      
      # a forward search will always be performed
      # we can just search in the forward frame,
      # perform a reciprocal search
      # or perform a search for a negative spike
      
      search_df1 <- do.call(what = "SearchIndex",
                            args = c(list("pattern" = QueryAA,
                                          "invertedIndex" = SubjectIndex,
                                          "subject" = SubjectAA,
                                          "verbose" = FALSE,
                                          "processors" = Processors),
                                     SearchParams))
      if (SearchScheme == "standard") {
        # just create the search_pairs object from 
        search_pairs <- data.frame("p1" = names(QueryAA)[search_df1$Pattern],
                                   "p2" = names(SubjectAA)[search_df1$Subject],
                                   "Pattern" = search_df1$Pattern,
                                   "Subject" = search_df1$Subject,
                                   "group" = rep(1L,
                                                 nrow(search_df1)))
        search_pairs$f_hits <- search_df1$Position
        rownames(search_pairs) <- NULL
      } else if (SearchScheme == "spike") {
        # the ranking strategy and the Kmeans strategy will both use the same template of data
        # start with the search for the reverse
        search_df2 <- do.call(what = "SearchIndex",
                              args = c(list("pattern" = reverse(QueryAA),
                                            "invertedIndex" = SubjectIndex,
                                            "subject" = SubjectAA,
                                            "verbose" = FALSE,
                                            "processors" = Processors),
                                       SearchParams))
        search_pairs <- data.frame("p1" = c(names(QueryAA)[search_df1$Pattern],
                                            names(QueryAA)[search_df2$Pattern]),
                                   "p2" = c(names(SubjectAA)[search_df1$Subject],
                                            names(SubjectAA)[search_df2$Subject]),
                                   "Pattern" = c(search_df1$Pattern,
                                                 search_df2$Pattern),
                                   "Subject" = c(search_df1$Subject,
                                                 search_df2$Subject),
                                   "group" = c(rep(1L,
                                                   nrow(search_df1)),
                                               rep(2L,
                                                   nrow(search_df2))))
        search_pairs$f_hits <- c(search_df1$Position,
                                 search_df2$Position)
        rownames(search_pairs) <- NULL
      } else if (SearchScheme == "reciprocal") {
        # the prior strategy is retained by running no rejection
        search_df2 <- do.call(what = "SearchIndex",
                              args = c(list("pattern" = SubjectAA,
                                            "invertedIndex" = QueryIndex,
                                            "subject" = QueryAA,
                                            "verbose" = FALSE,
                                            "processors" = Processors),
                                       SearchParams))
        if (feature_match[m1] == feature_match[m2]) {
          search_df1 <- search_df1[search_df1$Pattern != search_df1$Subject, ]
          search_df2 <- search_df2[search_df2$Pattern != search_df2$Subject, ]
        }
        #direction 1 is pattern -> subject
        #direction 2 is subject -> pattern
        search_df1 <- search_df1[order(search_df1$Pattern,
                                       search_df1$Subject), ]
        search_df2 <- search_df2[order(search_df2$Subject,
                                       search_df2$Pattern), ]
        search_i1 <- paste(search_df1$Pattern,
                           search_df1$Subject,
                           sep = "_")
        search_i2 <- paste(search_df2$Subject,
                           search_df2$Pattern,
                           sep = "_")
        
        # from here if search pairs is zero rows, we're done
        # these names are in the frame of the combined search space,
        # when we subset later 
        search_pairs <- data.frame("p1" = names(QueryAA)[search_df1$Pattern[search_i1 %in% search_i2]],
                                   "p2" = names(SubjectAA)[search_df1$Subject[search_i1 %in% search_i2]],
                                   "Pattern" = search_df1$Pattern[search_i1 %in% search_i2],
                                   "Subject" = search_df1$Subject[search_i1 %in% search_i2],
                                   "group" = rep(1L,
                                                 sum(search_i1 %in% search_i2)))
        search_pairs$f_hits <- search_df1$Position[search_i1 %in% search_i2]
        rownames(search_pairs) <- NULL
      } else {
        stop ("search scheme not recognized.")
      }
      # print(nrow(search_pairs))
      # drop all duplicated pairings every time they appear that isn't the first
      check_this <- paste(search_pairs$Pattern,
                          search_pairs$Subject,
                          sep = "_")
      check_this <- duplicated(check_this)
      if (any(check_this)) {
        search_pairs <- search_pairs[!check_this, ]
      }
      if (nrow(search_pairs) > 0L) {
        
        place_holder1 <- do.call(rbind,
                                 strsplit(x = search_pairs$p1,
                                          split = "_",
                                          fixed = TRUE))
        search_pairs$i1 <- as.integer(place_holder1[, 2L])
        search_pairs$f1 <- as.integer(place_holder1[, 3L])
        place_holder2 <- do.call(rbind,
                                 strsplit(x = search_pairs$p2,
                                          split = "_",
                                          fixed = TRUE))
        search_pairs$i2 <- as.integer(place_holder2[, 2L])
        search_pairs$f2 <- as.integer(place_holder2[, 3L])
        # search_pairs$f_hits <- search_df1$Position[search_i1 %in% search_i2]
        search_pairs$s1 <- GeneCalls[[feature_match[m1]]]$Strand[search_pairs$f1]
        search_pairs$s2 <- GeneCalls[[feature_match[m2]]]$Strand[search_pairs$f2]
        search_pairs$start1 <- GeneCalls[[feature_match[m1]]]$Start[search_pairs$f1]
        search_pairs$start2 <- GeneCalls[[feature_match[m2]]]$Start[search_pairs$f2]
        search_pairs$stop1 <- GeneCalls[[feature_match[m1]]]$Stop[search_pairs$f1]
        search_pairs$stop2 <- GeneCalls[[feature_match[m2]]]$Stop[search_pairs$f2]
        
        # flip strands on search pairs from the reverse search
        # because we only reverse the query when we create the background search
        # we only need to do this for the query strandedness
        search_pairs$s1[search_pairs$group == 2L] <- as.integer(!(as.logical(search_pairs$s1[search_pairs$group == 2L])))
        
        # slam everything together -- i.e. build out rows that need to be
        # added to the linked pairs object
        hit_adjust_start <- do.call(cbind,
                                    search_pairs$f_hits)
        hit_key <- vapply(X = search_pairs$f_hits,
                          FUN = function(x) {
                            ncol(x)
                          },
                          FUN.VALUE = vector(mode = "integer",
                                             length = 1L))
        hit_q_partner <- rep(search_pairs$f1,
                             times = hit_key)
        hit_s_partner <- rep(search_pairs$f2,
                             times = hit_key)
        
        # should be a list in the same shape as the SearchIndex Positions
        # but with the hits mapped to (mostly) the right spots
        # !!! IMPORTANT !!! This is limited to sequences without introns
        # for this to work correctly with introns, I need to be able to divy
        # these offsets up across CDSs, which though possible now will need
        # some significant infrastructure changes to make work cleanly
        
        # return(list("f_hits" = search_pairs$f_hits,
        #             "s1" = search_pairs$s1,
        #             "s2" = search_pairs$s2,
        #             "start1" = search_pairs$start1,
        #             "start2" = search_pairs$start2,
        #             "stop1" = search_pairs$stop1,
        #             "stop2" = search_pairs$stop2))
        
        # this scoping function will make it easier to implement
        # intron/exon re-framing, but it will need access to cds bounds
        # as opposed to just feature bounds
        hit_arrangement <- AAHitScoping(hitlist = search_pairs$f_hits,
                                        fstrand1 = search_pairs$s1,
                                        fstrand2 = search_pairs$s2,
                                        fstart1 = search_pairs$start1,
                                        fstart2 = search_pairs$start2,
                                        fstop1 = search_pairs$stop1,
                                        fstop2 = search_pairs$stop2)
        # print(length(hit_arrangement))
        # return(list(hitlist = search_pairs$f_hits,
        #             fstrand1 = search_pairs$s1,
        #             fstrand2 = search_pairs$s2,
        #             fstart1 = search_pairs$start1,
        #             fstart2 = search_pairs$start2,
        #             fstop1 = search_pairs$stop1,
        #             fstop2 = search_pairs$stop2,
        #             output = hit_arrangement))
        hit_rearrangement <- t(do.call(cbind,
                                       hit_arrangement))
        block_bounds <- lapply(X = hit_arrangement,
                               FUN = function(x) {
                                 c(min(x[1, ]),
                                   max(x[2, ]),
                                   min(x[3, ]),
                                   max(x[4, ]))
                               })
        block_bounds <- do.call(rbind,
                                block_bounds)
        
        hit_widths <- lapply(X = search_pairs$f_hits,
                             FUN = function(x) {
                               x[2, ] - x[1, ] + 1L
                             })
        hit_totals <- vapply(X = hit_widths,
                             FUN = function(x) {
                               sum(x)
                             },
                             FUN.VALUE = vector(mode = "integer",
                                                length = 1))
        hit_max <- vapply(X = hit_widths,
                          FUN = function(x) {
                            max(x)
                          },
                          FUN.VALUE = vector(mode = "integer",
                                             length = 1))
        # print("a")
        # return(list("QueryGene" = hit_q_partner,
        #             "SubjectGene" = hit_s_partner,
        #             "ExactOverlap" = unlist(hit_widths),
        #             "QueryIndex" = rep(search_pairs$i1,
        #                                times = hit_key),
        #             "SubjectIndex" = rep(search_pairs$i2,
        #                                  times = hit_key),
        #             "QLeftPos" = hit_rearrangement[, 1],
        #             "QRightPos" = hit_rearrangement[, 2],
        #             "SLeftPos" = hit_rearrangement[, 3],
        #             "SRightPos" = hit_rearrangement[, 4]))
        # one row per individual k-mer hit (destined for the lower triangle of
        # the LinkedPairs object); hit_key holds the number of hits per pair,
        # so the per-pair indices i1/i2 are expanded to one entry per hit
        add_by_hit <- data.frame("QueryGene" = hit_q_partner,
                                 "SubjectGene" = hit_s_partner,
                                 "ExactOverlap" = unlist(hit_widths),
                                 "QueryIndex" = rep(search_pairs$i1,
                                                    times = hit_key),
                                 "SubjectIndex" = rep(search_pairs$i2,
                                                      times = hit_key),
                                 "QLeftPos" = hit_rearrangement[, 1],
                                 "QRightPos" = hit_rearrangement[, 2],
                                 "SLeftPos" = hit_rearrangement[, 3],
                                 "SRightPos" = hit_rearrangement[, 4])
        # one row per candidate pair (destined for the upper triangle), with
        # block-level bounds and the k-mer summary statistics computed above
        add_by_block <- data.frame("QueryGene" = search_pairs$f1,
                                   "SubjectGene" = search_pairs$f2,
                                   "ExactOverlap" = hit_totals,
                                   "QueryIndex" = search_pairs$i1,
                                   "SubjectIndex" = search_pairs$i2,
                                   "QLeftPos" = block_bounds[, 1],
                                   "QRightPos" = block_bounds[, 2],
                                   "SLeftPos" = block_bounds[, 3],
                                   "SRightPos" = block_bounds[, 4],
                                   "MaxKmerSize" = hit_max,
                                   "TotalKmerHits" = hit_key,
                                   "group" = search_pairs$group)
        
        # using forward hits from here:
        # append pairs that don't appear in the current linked pairs object
        # onto the current linked pairs object
        
        
        # hits are now transposed into the context of the whole sequence
        # as opposed to the feature
        # build out rows to add, and then add them
        # build composite "gene_index" keys (QueryGene_QueryIndex_SubjectGene_
        # SubjectIndex, i.e. columns 1, 4, 2, 5) for the rows already present
        # in both triangles of the LinkedPairs object, so search-derived pairs
        # that duplicate an existing pair are not appended twice
        if (nrow(SynExtendObject[[m1, m2]]) > 0) {
          select_row1 <- paste(SynExtendObject[[m1, m2]][, 1L],
                               SynExtendObject[[m1, m2]][, 4L],
                               SynExtendObject[[m1, m2]][, 2L],
                               SynExtendObject[[m1, m2]][, 5L],
                               sep = "_")
          select_row2 <- paste(SynExtendObject[[m2, m1]][, 1L],
                               SynExtendObject[[m2, m1]][, 4L],
                               SynExtendObject[[m2, m1]][, 5L],
                               sep = "_")
        } else {
          # no existing rows: nothing can collide, keys stay empty
          select_row1 <- select_row2 <- vector(mode = "character",
                                               length = 0)
        }
        # upper diagonal is blocks,
        # lower diagonal is hits
        # matching keys for the candidate rows built from the search results
        select_row3 <- paste(add_by_block$QueryGene,
                             add_by_block$QueryIndex,
                             add_by_block$SubjectGene,
                             add_by_block$SubjectIndex,
                             sep = "_")
        select_row4 <- paste(add_by_hit$QueryGene,
                             add_by_hit$QueryIndex,
                             add_by_hit$SubjectGene,
                             add_by_hit$SubjectIndex,
                             sep = "_")
        # TRUE where a search-derived row is NOT already present
        sr_upper <- !(select_row3 %in% select_row1)
        sr_lower <- !(select_row4 %in% select_row2)
        
        # return(list("upper1" = SynExtendObject[[m1, m2]],
        #             "lower1" = SynExtendObject[[m2, m1]],
        #             "upper2" = as.matrix(add_by_block[sr_upper, ]),
        #             "lower2" = as.matrix(add_by_hit[sr_lower, ])))
        
        # return(list(select_row1,
        #             select_row2,
        #             select_row3,
        #             select_row4,
        #             SynExtendObject[[m1, m2]],
        #             SynExtendObject[[m2, m1]],
        #             add_by_hit,
        #             add_by_block))
        # pre-existing (synteny-derived) rows are tagged group = 1L; novel
        # search-derived rows keep the group ids carried in add_by_block /
        # add_by_hit; after appending, both triangles are re-sorted by their
        # first two columns and rownames reset
        if (any(sr_upper)) {
          SynExtendObject[[m1, m2]] <- cbind(SynExtendObject[[m1, m2]],
                                             "group" = rep(1L,
                                                           nrow(SynExtendObject[[m1, m2]])))
          SynExtendObject[[m1, m2]] <- rbind(SynExtendObject[[m1, m2]],
                                             as.matrix(add_by_block[sr_upper, ]))
          SynExtendObject[[m1, m2]] <- SynExtendObject[[m1, m2]][order(SynExtendObject[[m1, m2]][, 1L],
                                                                       SynExtendObject[[m1, m2]][, 2L]), ]
          rownames(SynExtendObject[[m1, m2]]) <- NULL
          SynExtendObject[[m2, m1]] <- rbind(SynExtendObject[[m2, m1]],
                                             as.matrix(add_by_hit[sr_lower, ]))
          SynExtendObject[[m2, m1]] <- SynExtendObject[[m2, m1]][order(SynExtendObject[[m2, m1]][, 1L],
                                                                       SynExtendObject[[m2, m1]][, 2L]), ]
          rownames(SynExtendObject[[m2, m1]]) <- NULL
        } else {
          # nothing novel to append; still add the group column so downstream
          # code can rely on it being present
          SynExtendObject[[m1, m2]] <- cbind(SynExtendObject[[m1, m2]],
                                             "group" = rep(1L,
                                                           nrow(SynExtendObject[[m1, m2]])))
        }
      } else {
        stop ("an unexpected condition occured, please contact the maintainer -- search_pairs is empty when it is not expected to be")
        # it is technically possible to not find anything with search but still have synteny hits to parse,
        # i need to figure out how to handle this and under what circumstances it occurs
      }
      # return(list("upper" = SynExtendObject[[m1, m2]],
      #             "lower" = SynExtendObject[[m2, m1]]))
      # print("b")
      # next step if RejectBy is not none, run the rejection scheme
      
      
      # step 3: morph the searches into the SynExtendObject so other things
      # go smoothly
      
      # alignments happen later and profile vs pairs is chosen by the user
      
      # if (m1 == 1 & m2 == 3) {
      #   return(list(SynExtendObject[[m1, m2]],
      #               SynExtendObject[[m2, m1]]))
      # }
      
      
      ###### -- only evaluate valid positions ---------------------------------
      if (nrow(SynExtendObject[[m1, m2]]) > 0L) {
        # links table is populated; columns 1:2 are the paired feature ids
        # and columns 4:5 their index positions (QueryIndex / SubjectIndex)
        PMatrix <- cbind(SynExtendObject[[m1, m2]][, 1L],
                         SynExtendObject[[m1, m2]][, 2L])
        
        IMatrix <- cbind(SynExtendObject[[m1, m2]][, 4L],
                         SynExtendObject[[m1, m2]][, 5L])
        # find the Consensus of each linking kmer hit
        # expand the per-pair column 12 ("group") to one entry per hit using
        # column 11 ("TotalKmerHits"); a value of 2L marks pairs whose query
        # strand must be flipped before computing the hit consensus below
        strand1_adj <- rep(SynExtendObject[[m1, m2]][, 12],
                           times = SynExtendObject[[m1, m2]][, 11])
        strand1_ph <- GeneCalls[[feature_match[m1]]]$Strand[SynExtendObject[[m2, m1]][, 1L]]
        strand1_ph[strand1_adj == 2L] <- as.integer(!(as.logical(strand1_ph[strand1_adj == 2L])))
        
        # return(list(gene1left = GeneCalls[[feature_match[m1]]]$Start[SynExtendObject[[m2, m1]][, 1L]],
        #             gene2left = GeneCalls[[feature_match[m2]]]$Start[SynExtendObject[[m2, m1]][, 2L]],
        #             gene1right = GeneCalls[[feature_match[m1]]]$Stop[SynExtendObject[[m2, m1]][, 1L]],
        #             gene2right = GeneCalls[[feature_match[m2]]]$Stop[SynExtendObject[[m2, m1]][, 2L]],
        #             hit1left = SynExtendObject[[m2, m1]][, 6L],
        #             hit1right = SynExtendObject[[m2, m1]][, 7L],
        #             hit2left = SynExtendObject[[m2, m1]][, 8L],
        #             hit2right = SynExtendObject[[m2, m1]][, 9L],
        #             strand1 = strand1_ph,
        #             strand2 = GeneCalls[[feature_match[m2]]]$Strand[SynExtendObject[[m2, m1]][, 2L]],
        #             index1 = SynExtendObject[[m2, m1]][, 1L],
        #             index2 = SynExtendObject[[m2, m1]][, 2L],
        #             group = SynExtendObject[[m1, m2]][, "group"],
        #             upper = SynExtendObject[[m1, m2]],
        #             lower = SynExtendObject[[m2, m1]]))
        
        # per-hit consensus metric: for every individual hit (one row of the
        # lower-triangle table) compare its placement within the query gene vs
        # the subject gene using strand-corrected coordinates; columns 6:9 are
        # QLeftPos/QRightPos/SLeftPos/SRightPos -- exact semantics are defined
        # by HitConsensus (elsewhere in the package)
        diff1 <- HitConsensus(gene1left = GeneCalls[[feature_match[m1]]]$Start[SynExtendObject[[m2, m1]][, 1L]],
                              gene2left = GeneCalls[[feature_match[m2]]]$Start[SynExtendObject[[m2, m1]][, 2L]],
                              gene1right = GeneCalls[[feature_match[m1]]]$Stop[SynExtendObject[[m2, m1]][, 1L]],
                              gene2right = GeneCalls[[feature_match[m2]]]$Stop[SynExtendObject[[m2, m1]][, 2L]],
                              hit1left = SynExtendObject[[m2, m1]][, 6L],
                              hit1right = SynExtendObject[[m2, m1]][, 7L],
                              hit2left = SynExtendObject[[m2, m1]][, 8L],
                              hit2right = SynExtendObject[[m2, m1]][, 9L],
                              strand1 = strand1_ph,
                              strand2 = GeneCalls[[feature_match[m2]]]$Strand[SynExtendObject[[m2, m1]][, 2L]])
        # print("c")
        
        
        # get the mean consensus
        # get the mean consensus per pair
        # NOTE(review): this preallocation of diff2 is immediately overwritten
        # by the tapply() result below and is therefore dead
        diff2 <- vector(mode = "numeric",
                        length = nrow(SynExtendObject[[m1, m2]]))
        hit_relations <- vector(mode = "integer",
                                length = nrow(SynExtendObject[[m2, m1]]))
        hit_iterator <- 0L
        loop_iterator <- 1L
        h1 <- 0L
        h2 <- 0L
        continue <- TRUE
        # run-length style grouping: consecutive rows of the hits table that
        # share the same feature pair (columns 1:2) receive the same group id,
        # so hits can be averaged per pair below; relies on the table being
        # sorted by those two columns
        while (continue) {
          if (SynExtendObject[[m2, m1]][loop_iterator, 1L] == h1 &
              SynExtendObject[[m2, m1]][loop_iterator, 2L] == h2) {
            hit_relations[loop_iterator] <- hit_iterator
            loop_iterator <- loop_iterator + 1L
          } else {
            # new feature pair encountered: open the next group
            hit_iterator <- hit_iterator + 1L
            hit_relations[loop_iterator] <- hit_iterator
            h1 <- SynExtendObject[[m2, m1]][loop_iterator, 1L]
            h2 <- SynExtendObject[[m2, m1]][loop_iterator, 2L]
            loop_iterator <- loop_iterator + 1L
          }
          # setTxtProgressBar(pb = pBar,
          #                   value = loop_iterator / nrow(SynExtendObject[[m2, m1]]))
          if (loop_iterator > nrow(SynExtendObject[[m2, m1]])) {
            continue <- FALSE
          }
        }
        
        # one value per pair: 1 minus the mean per-hit consensus distance
        diff2 <- 1 - unname(tapply(X = diff1,
                                   INDEX = hit_relations,
                                   FUN = function(x) {
                                     mean(x)
                                   }))
        # max match size
        MatchMax <- SynExtendObject[[m1, m2]][, "MaxKmerSize"]
        # total unique matches
        UniqueMatches <- SynExtendObject[[m1, m2]][, "TotalKmerHits"]
        # total matches at all
        TotalMatch <- SynExtendObject[[m1, m2]][, "ExactOverlap"]
        
        # print("d")
        # return(list(p1 = SynExtendObject[[m1, m2]][, 1L],
        #             p2 = SynExtendObject[[m1, m2]][, 2L],
        #             code1 = QCode[SynExtendObject[[m1, m2]][, 1L]],
        #             code2 = SCode[SynExtendObject[[m1, m2]][, 2L]],
        #             mod1 = QMod[SynExtendObject[[m1, m2]][, 1L]],
        #             mod2 = SMod[SynExtendObject[[m1, m2]][, 2L]],
        #             aa1 = Query_Background_AA,
        #             aa2 = Subject_Background_AA,
        #             nt1 = Query_Background_NT,
        #             nt2 = Subject_Background_NT,
        #             register1 = Q_AA_Register,
        #             register2 = S_AA_Register,
        #             aamat = AA_matrix,
        #             ntmat = NT_matrix))
        
        # approximate per-pair background score expectation from residue
        # composition (AA or NT depending on the coding/modulo flags)
        # NOTE(review): diff3 is not referenced by the visible downstream code
        # (only the commented-out "Delta_Background" calculations used it) --
        # confirm whether it is still needed
        diff3 <- ApproximateBackground(p1 = SynExtendObject[[m1, m2]][, 1L],
                                       p2 = SynExtendObject[[m1, m2]][, 2L],
                                       code1 = QCode[SynExtendObject[[m1, m2]][, 1L]],
                                       code2 = SCode[SynExtendObject[[m1, m2]][, 2L]],
                                       mod1 = QMod[SynExtendObject[[m1, m2]][, 1L]],
                                       mod2 = SMod[SynExtendObject[[m1, m2]][, 2L]],
                                       aa1 = Query_Background_AA,
                                       aa2 = Subject_Background_AA,
                                       nt1 = Query_Background_NT,
                                       nt2 = Subject_Background_NT,
                                       register1 = Q_AA_Register,
                                       register2 = S_AA_Register,
                                       aamat = AA_matrix,
                                       ntmat = NT_matrix)
        # print("e")
        # from here we need to get the kmer differences
        # the PIDs
        # the SCOREs
        
        # if both positions are present in the FeatureSeqs object, do nothing
        # if either or both is missing, they need to be pulled from the DB
        # this functionality needs to be expanded eventually to take in cases where the
        # we're overlaying something with gene calls on something that doesn't have gene calls
        # if (!all(ObjectIDs[c(m1, m2)] %in% FeatureSeqs$IDs)) {
        #   # an object ID does not have a seqs present, pull them
        #   # this is not a priority so we're leaving this blank for a second
        # } else {
        #   TMPSeqs01 <- FALSE
        #   TMPSeqs02 <- FALSE
        # }
        
        # align everyone as AAs who can be, i.e. modulo of 3, is coding, etc
        # then align everyone else as nucs
        # translate the hit locations to the anchor positions
        # every hit is an anchor
        # all hits are stored in the LinkedPairs object in nucleotide space
        # as left/right bounds, orientations will be dependent upon strandedness of the genes
        
        # prepare the kmer distance stuff:
        # prepare the kmer distance stuff:
        NucDist <- vector(mode = "numeric",
                          length = nrow(PMatrix))
        
        # feature lengths in nucleotide space for both partners of each pair
        QueryFeatureLength <- QNTCount[PMatrix[, 1L]]
        SubjectFeatureLength <- SNTCount[PMatrix[, 2L]]
        
        # Perform the alignments
        if (AlignmentFun == "AlignPairs") {
          
          # grab kmer distances ahead of time because AlignPairs doesn't need to loop
          # through anything
          # Euclidean distance between the nucleotide k-mer frequency rows,
          # normalized by the mean total count of the two features
          for (m3 in seq_along(NucDist)) {
            it1 <- PMatrix[m3, 1L]
            it2 <- PMatrix[m3, 2L]
            NucDist[m3] <- sqrt(sum((Q_NucF[it1, ] - S_NucF[it2, ])^2)) / ((sum(Q_NucF[it1, ]) + sum(S_NucF[it2, ])) / 2)
            # NucDist[m3] <- sqrt(sum((nuc1[m3, ] - nuc2[m3, ])^2)) / ((sum(nuc1[m3, ]) + sum(nuc2[m3, ])) / 2)
          }
          
          # print("f")
          # spit out the subset vectors and logicals to correctly call both AlignPairs calls
          # and both dfs
          # a pair is aligned as amino acids only when BOTH partners pass the
          # QCode/SCode and QMod/SMod flags; everything else aligns as NT
          AASelect <- PMatrix[, 1L] %in% which(QCode & QMod) & PMatrix[, 2L] %in% which(SCode & SMod)
          NTSelect <- !AASelect
          
          # AA pairs are addressed through the AA register lookup; NT pairs
          # use the feature indices directly
          df_aa <- data.frame("Pattern" = Q_AA_Register[PMatrix[AASelect, 1L]],
                              "Subject" = S_AA_Register[PMatrix[AASelect, 2L]])
          df_nt <- data.frame("Pattern" = PMatrix[NTSelect, 1L],
                              "Subject" = PMatrix[NTSelect, 2L])
          
          # match each AA pair back to its search-index result (if any) so the
          # exact k-mer hits can seed AlignPairs as anchors
          check1 <- paste(search_pairs$Pattern,
                          search_pairs$Subject,
                          sep = "_")
          check2 <- paste(df_aa$Pattern,
                          df_aa$Subject,
                          sep = "_")
          aa_pos <- vector(mode = "list",
                           length = nrow(df_aa))
          aa_match1 <- match(x = check2,
                             table = check1)
          aa_match2 <- which(!is.na(aa_match1))
          aa_match3 <- which(is.na(aa_match1))
          # pairs with search hits reuse those hit coordinates as anchors
          aa_pos[aa_match2] <- search_pairs$f_hits[aa_match1[!is.na(aa_match1)]]
          # pairs without search hits get a sentinel anchor matrix: a zero
          # column plus a (length + 1) column, i.e. no internal anchors
          aa_pos[aa_match3] <- mapply(SIMPLIFY = FALSE,
                                      USE.NAMES = FALSE,
                                      FUN = function(y, z) {
                                        cbind(matrix(data = 0L,
                                                     nrow = 4),
                                              matrix(data = c(y,y,z,z),
                                                     nrow = 4))
                                      },
                                      y = width(QueryAA)[df_aa$Pattern[aa_match3]] + 1L,
                                      z = width(SubjectAA)[df_aa$Subject[aa_match3]] + 1L)
          # NT pairs always get the sentinel (anchorless) matrix
          nt_pos <- mapply(SIMPLIFY = FALSE,
                           USE.NAMES = FALSE,
                           FUN = function(y, z) {
                             cbind(matrix(data = 0L,
                                          nrow = 4),
                                   matrix(data = c(y,y,z,z),
                                          nrow = 4))
                           },
                           y = QNTCount[df_nt$Pattern] + 1L,
                           z = SNTCount[df_nt$Subject] + 1L)
          
          # return(list("a" = search_pairs,
          #             "b" = df_aa,
          #             "c" = df_nt,
          #             "e" = Q_AA_Register,
          #             "f" = S_AA_Register,
          #             "g" = check1,
          #             "h" = check2))
          # when patterns are set to zero, almost everything will have a local match
          # just set those from their object above,
          # and if a candidate set doesn't search index hits, just set the anchors to global
          # df_aa$Position <- WithinQueryAAs
          # df_aa$Pattern <- Q_AA_Register[df_aa$Pattern]
          # df_aa$Subject <- S_AA_Register[df_aa$Subject]
          # erik's request is to drop direct global alignment
          # this means that global alignments are now approximations and not true global alignments
          # except in the case where an inferred pair came from find synteny alone and its bounds couldn't
          # be reconciled for align pairs
          # attach the per-pair anchor matrices; AlignPairs reads these from
          # the "Position" column of its pairs argument
          df_aa$Position <- aa_pos
          df_nt$Position <- nt_pos
          # return(list(WithinQueryAAs,
          #             WithinQueryNucs,
          #             QueryAA,
          #             SubjectAA))
          
          # done with anchors at this point
          
          if (sum(AASelect) > 0) {
            # return(list("q" = QueryAA,
            #             "s" = SubjectAA,
            #             "df" = df_aa))
            # align the coding pairs as amino acids, seeded with the anchors
            aapairs <- AlignPairs(pattern = QueryAA,
                                  subject = SubjectAA,
                                  pairs = df_aa,
                                  verbose = FALSE,
                                  processors = Processors)
            # "local" values normalize by alignment length; "global" values
            # approximate a global statistic by normalizing against the longer
            # of the two unaligned sequences
            current_local_aa_pids <- aapairs$Matches / aapairs$AlignmentLength
            current_global_aa_pids <- aapairs$Matches / pmax(width(QueryAA)[aapairs$Pattern],
                                                             width(SubjectAA)[aapairs$Subject])
            current_local_aa_scores <- aapairs$Score / aapairs$AlignmentLength
            current_global_aa_scores <- aapairs$Score / pmax(width(QueryAA)[aapairs$Pattern],
                                                             width(SubjectAA)[aapairs$Subject])
            current_global_aa_rawscore <- aapairs$Score
          } else {
            # no AA-alignable pairs: zero-length placeholders keep the
            # interleaving assignments below valid
            current_local_aa_pids <- numeric()
            current_global_aa_pids <- numeric()
            current_local_aa_scores <- numeric()
            current_global_aa_scores <- numeric()
            current_global_aa_rawscore <- numeric()
          }
          # tick the progress bar once per alignment batch
          if (Verbose) {
            PBCount <- PBCount + 1L
            setTxtProgressBar(pb = pBar,
                              value = PBCount / Total)
          }
          
          if (sum(NTSelect) > 0) {
            # align the remaining pairs as nucleotides
            ntpairs <- AlignPairs(pattern = QueryDNA,
                                  subject = SubjectDNA,
                                  pairs = df_nt,
                                  verbose = FALSE,
                                  processors = Processors)
            current_local_nt_pids <- ntpairs$Matches / ntpairs$AlignmentLength
            current_global_nt_pids <- ntpairs$Matches / pmax(width(QueryDNA)[ntpairs$Pattern],
                                                             width(SubjectDNA)[ntpairs$Subject])
            current_local_nt_scores <- ntpairs$Score / ntpairs$AlignmentLength
            current_global_nt_scores <- ntpairs$Score / pmax(width(QueryDNA)[ntpairs$Pattern],
                                                             width(SubjectDNA)[ntpairs$Subject])
            current_global_nt_rawscore <- ntpairs$Score
          } else {
            current_local_nt_pids <- numeric()
            current_global_nt_pids <- numeric()
            current_local_nt_scores <- numeric()
            current_global_nt_scores <- numeric()
            current_global_nt_rawscore <- numeric()
          }
          if (Verbose) {
            PBCount <- PBCount + 1L
            setTxtProgressBar(pb = pBar,
                              value = PBCount / Total)
          }
          
          # interleave the AA and NT results back into original pair order:
          # vec1/vec2 = local pid/score, vec3/vec4 = approx global pid/score,
          # vec5 = raw alignment score
          vec1 <- vec2 <- vec3 <- vec4 <- vec5 <- vector(mode = "numeric",
                                                         length = nrow(PMatrix))
          vec1[AASelect] <- current_local_aa_pids
          vec1[NTSelect] <- current_local_nt_pids
          vec2[AASelect] <- current_local_aa_scores
          vec2[NTSelect] <- current_local_nt_scores
          vec3[AASelect] <- current_global_aa_pids
          vec3[NTSelect] <- current_global_nt_pids
          vec4[AASelect] <- current_global_aa_scores
          vec4[NTSelect] <- current_global_nt_scores
          vec5[AASelect] <- current_global_aa_rawscore
          vec5[NTSelect] <- current_global_nt_rawscore
          # For Testing
          # AA_Anchors[[Count]] <- WithinQueryAAs
          # NT_Anchors[[Count]] <- WithinQueryNucs[NTSelect]
          # return(list(aapairs,
          #             ntpairs))
          
        } else if (AlignmentFun == "AlignProfiles") {
          
          stop ("currently not supported due to other priorities...")
          # build out a map of who is being called where
          # if we're aligning nucleotides, just call the position in the DNA
          # stringsets,
          # if not, use the matched positions
          # ws1 <- match(table = names(QueryAA),
          #              x = names(QueryDNA)[PMatrix[, 1L]])
          # ws2 <- match(table = names(SubjectAA),
          #              x = names(SubjectDNA)[PMatrix[, 2L]])
          # vec1 <- vec2 <- vector(mode = "numeric",
          #                        length = length(NucDist))
          # 
          # for (m3 in seq_along(NucDist)) {
          #   
          #   NucDist[m3] <- sqrt(sum((nuc1[m3, ] - nuc2[m3, ])^2)) / ((sum(nuc1[m3, ]) + sum(nuc2[m3, ])) / 2)
          #   
          #   if (AASelect[m3]) {
          #     # align as amino acids
          #     ph1 <- AlignProfiles(pattern = QueryAA[ws1[m3]],
          #                          subject = SubjectAA[ws2[m3]],
          #                          p.struct = QueryStruct[ws1[m3]],
          #                          s.struct = SubjectStruct[ws2[m3]])
          #     ph2 <- DistanceMatrix(myXStringSet = ph1,
          #                           includeTerminalGaps = TRUE,
          #                           type = "matrix",
          #                           verbose = FALSE)
          #     ph3 <- ScoreAlignment(myXStringSet = ph1,
          #                           structures = PredictHEC(myAAStringSet = ph1,
          #                                                   type = "probabilities",
          #                                                   HEC_MI1 = MAT1,
          #                                                   HEC_MI2 = MAT2),
          #                           structureMatrix = structureMatrix)
          #   } else {
          #     # align as nucleotides
          #     ph1 <- AlignProfiles(pattern = QueryDNA[PMatrix[m3, 1L]],
          #                          subject = SubjectDNA[PMatrix[m3, 2L]])
          #     ph2 <- DistanceMatrix(myXStringSet = ph1,
          #                           includeTerminalGaps = TRUE,
          #                           type = "matrix",
          #                           verbose = FALSE)
          #     ph3 <- ScoreAlignment(myXStringSet = ph1,
          #                           substitutionMatrix = substitutionMatrix)
          #   }
          #   vec1[m3] <- 1 - ph2[1, 2]
          #   vec2[m3] <- ph3
          #   
          #   if (Verbose) {
          #     PBCount <- PBCount + 1L
          #     setTxtProgressBar(pb = pBar,
          #                       value = PBCount / Total)
          #   }
          # } # end m3 loop
          
        } # end if else on alignment function
        
        # assemble the per-pair statistics table used both by the rejection
        # schemes below and as the returned attribute table
        # NOTE(review): "deltabackground" stores vec4 (the length-normalized
        # global score); the diff3 background estimate is not subtracted here
        # -- confirm the column name still reflects the intent
        internal_vals <- data.frame("consensus" = diff2,
                                    "featurediff" = abs(QueryFeatureLength - SubjectFeatureLength) / pmax(QueryFeatureLength,
                                                                                                          SubjectFeatureLength),
                                    "kmerdist" = NucDist,
                                    "localpid" = vec1,
                                    "globalpid" = vec3,
                                    "matchcoverage" = (TotalMatch * 2L) / (QueryFeatureLength + SubjectFeatureLength),
                                    "localscore" = vec2,
                                    "deltabackground" = vec4,
                                    "rawscore" = vec5,
                                    "response" = ifelse(test = SynExtendObject[[m1, m2]][, "group"] == 1L,
                                                        yes = TRUE,
                                                        no = FALSE),
                                    "alitype" = ifelse(test = AASelect,
                                                       yes = "AA",
                                                       no = "NT"))
        # choose the rejection scheme; any unmatched RejectBy value --
        # including the documented default "rank" -- falls through to the
        # else branch and retains every candidate pair
        if (RejectBy == "glm") {
          w_retain <- RejectionBy(input = internal_vals,
                                  method = "glm")
        } else if (RejectBy == "kmeans") {
          w_retain <- RejectionBy(input = internal_vals,
                                  method = "kmeans")
        } else if (RejectBy == "lm") {
          w_retain <- RejectionBy(input = internal_vals,
                                  method = "lm")
        } else if (RejectBy == "direct") {
          w_retain <- RejectionBy(input = internal_vals,
                                  method = "direct")
        } else {
          w_retain <- seq(nrow(internal_vals))
        }
        # stash the full attribute table for this genome pair
        Attr_PH[[Count]] <- internal_vals
        
        # if (RejectBy == "Rank") {
        #   return()
        #   z1 <- data.frame("local_PID" = vec1,
        #                    "local_Score" = vec2,
        #                    "approx_global_pid" = vec3,
        #                    # "approx_global_score" = vec4,
        #                    # "Delta_Background" = vec4 - diff3,
        #                    "response" = ifelse(test = SynExtendObject[[m1, m2]][, "group"] == 1L,
        #                                        yes = TRUE,
        #                                        no = FALSE))
        #   g <- glm(response ~ .,
        #            family = "quasibinomial",
        #            data = z1)
        #   
        #   FDR <- RejectionCriteria$FDR
        #   # FDR <- 0.005 # maximum allowed false discovery rate
        #   N <- sum(z1$response)
        #   
        #   ranking <- order(g$fitted.values,
        #                    decreasing = TRUE)
        #   ranking <- ranking[cumprod(cumsum(ranking > N) <= seq_along(ranking)*FDR) == 1L]
        #   w_retain <- sort(ranking[ranking <= N])
        #   # return(list("a" = z1,
        #   #             "b" = ranking,
        #   #             "c" = w_retain))
        #   # return(list("init" = z1,
        #   #             "rank" = sort(ranking[ranking <= N])))
        #   # # z2 <- z1[sort(ranking[ranking <= N]), ]
        #   # # return(list("init" = z1,
        #   # #             "retained" = z2))
        #   
        # } else if (RejectBy == "KMeans") {
        #   FDR <- RejectionCriteria$FDR
        #   
        #   z1 <- data.frame("Consensus" = diff2,
        #                    "KDist" = NucDist,
        #                    "local_PID" = vec1,
        #                    "local_Score" = vec2,
        #                    "approx_global_pid" = vec3,
        #                    "approx_global_score" = vec4,
        #                    "Delta_Background" = vec4 - diff3,
        #                    "response" = ifelse(test = SynExtendObject[[m1, m2]][, "group"] == 1L,
        #                                        yes = TRUE,
        #                                        no = FALSE))
        #   
        #   simplek <- suppressMessages(kmeans(x = z1[, -8L],
        #                                      centers = 10,
        #                                      nstart = 25))
        #   kres <- do.call(rbind,
        #                   tapply(X = as.factor(z1$response),
        #                          INDEX = simplek$cluster,
        #                          FUN = function(x) {
        #                            table(x)
        #                          }))
        #   kvals <- kres[, 1L] / rowSums(kres)
        #   kvals[is.infinite(kvals)] <- 1L
        #   w_kvals <- which(kvals < FDR)
        #   if (length(w_kvals) < 1) {
        #     w_retain <- seq(nrow(z1))
        #     return(list("k" = simplek,
        #                 "z" = z1,
        #                 "kres" = kres,
        #                 "kvals" = kvals,
        #                 "w_kvals" = w_kvals,
        #                 "fdr" = FDR,
        #                 "retain" = w_retain))
        #     warning("kmeans does not appear to return appropriate grouping for rejection, all candidates are being returned")
        #   } else {
        #     w_retain <- which(simplek$cluster %in% which(kvals < FDR))
        #   }
        #   
        #   
        #   
        #   # mimic-ing the old routine here doesn't seem to work as well, and i should
        #   # figure out why
        # NOTE(review): the commented-out region below is a retired k-means /
        # one-site-binding (saturation curve) approach to choosing a cluster
        # count for pair rejection; kept by the author for reference only.
        #   # continue <- TRUE
        #   # kres <- vector(mode = "list",
        #   #                length = 20L)
        #   # wss <- vector(mode = "numeric",
        #   #               length = length(kres))
        #   # count <- 2L
        #   # offsetindex <- 1L
        #   # offsetorigin <- 2L
        #   # calcstart <- 10L
        #   # select_clustering <- 0L
        #   # while (continue) {
        #   #   kres[[count - offsetindex]] <- suppressMessages(kmeans(x = z1[, -8L],
        #   #                                                          centers = count,
        #   #                                                          iter.max = 25L,
        #   #                                                          nstart = 25L))
        #   #   wss[count - offsetindex] <- kres[[count - offsetindex]]$tot.withinss
        #   #   startingBMax <- max(wss)
        #   #   startingHalfMax <- unname(quantile(wss[wss > 0], 0.5))
        #   #   if (count >= calcstart) {
        #   #     dat <- data.frame("n" = seq(count - offsetindex) - 1L,
        #   #                       "wss" = abs(wss[wss > 0] - wss[1]))
        #   #     
        #   #     
        #   #     fitval <- nls(formula = wss ~ OneSite(X = n,
        #   #                                           Bmax,
        #   #                                           Kd),
        #   #                   data = dat,
        #   #                   start = list(Bmax = max(dat$wss),
        #   #                                Kd = unname(quantile(dat$n, 0.25))))
        #   #     fitsum <- summary(fitval)
        #   #     
        #   #     return(list("dat" = dat,
        #   #                 "table" = z1,
        #   #                 "kmeans" = kres,
        #   #                 "fitval" = fitval,
        #   #                 "fitsum" = fitsum))
        #   #     # FitA <- nls(dat[, 2L]~OneSite(X = dat[, 1L],
        #   #     #                               Bmax,
        #   #     #                               Kd),
        #   #     #             start = list(Bmax = startingBMax,
        #   #     #                          Kd = startingHalfMax))
        #   #     # fitasum <- summary(FitA)
        #   #     return(fitasum)
        #   #   }
        #   #   count <- count + 1L
        #   # }
        # } else if (RejectBy == "None") {
        #   w_retain <- seq(nrow(SynExtendObject[[m1, m2]]))
        # }
        
        # block size determination
        # Three cases on the retained-pair index vector `w_retain`:
        #   >1 pair : BlockByRank groups syntenic runs and returns per-pair
        #             absolute block sizes plus a block id map (>0 = in a
        #             block, -1 apparently flags a singleton; see the -1L
        #             reassignment after the main loop)
        #   ==1 pair: trivially a singleton (size 1, id -1)
        #   0 pairs : zero-length placeholders so the data.frame below still
        #             binds cleanly
        if (length(w_retain) > 1) {
          blockres <- BlockByRank(index1 = IMatrix[w_retain, 1L],
                                  partner1 = PMatrix[w_retain, 1L],
                                  index2 = IMatrix[w_retain, 2L],
                                  partner2 = PMatrix[w_retain, 2L])
        } else if (length(w_retain) == 1) {
          blockres <- list("absblocksize" = 1L,
                           "blockidmap" = -1L)
        } else {
          blockres <- list("absblocksize" = vector(mode = "integer",
                                                   length = 0L),
                           "blockidmap" = vector(mode = "integer",
                                                 length = 0L))
        }
        block_ph <- blockres$blockidmap
        w1 <- block_ph > 0
        # if (any(w1)) {
        #   block_ph[w1] <- block_ph[w1] + block_offset
        #   block_offset <- block_offset + max(block_ph)
        # }
        # Offset this genome-pair's positive block ids by the running
        # `block_uid` counter so Block_UID values stay unique across all
        # m1/m2 iterations; singletons (-1) are left alone here and
        # renumbered after the loop.
        if (any(w1)) {
          block_offset <- block_uid
          blockres$blockidmap[blockres$blockidmap > 0] <- blockres$blockidmap[blockres$blockidmap > 0] + block_offset
          block_uid <- max(blockres$blockidmap[blockres$blockidmap > 0])
        } else {
          # nothing to update or add
        }
        
        # BlockSize evaluation is complete
        # we're eventually moving blocksize stuff down to happen post everything else
        # One summary row per retained pair for this genome pair; all the
        # per-pair statistic vectors (diff2, NucDist, vec1..vec4, etc.) are
        # computed earlier in the loop body (above this chunk) and are
        # subset by `w_retain` here.
        PH[[Count]] <- data.frame("p1" = names(QueryDNA)[PMatrix[w_retain, 1]],
                                  "p2" = names(SubjectDNA)[PMatrix[w_retain, 2]],
                                  "Consensus" = diff2[w_retain],
                                  "p1featurelength" = QueryFeatureLength[w_retain],
                                  "p2featurelength" = SubjectFeatureLength[w_retain],
                                  "blocksize" = blockres$absblocksize, # calculated after retention determination
                                  "KDist" = NucDist[w_retain],
                                  "TotalMatch" = TotalMatch[w_retain],
                                  "MaxMatch" = MatchMax[w_retain],
                                  "UniqueMatches" = UniqueMatches[w_retain],
                                  "Local_PID" = vec1[w_retain],
                                  "Local_Score" = vec2[w_retain],
                                  "Approx_Global_PID" = vec3[w_retain],
                                  "Approx_Global_Score" = vec4[w_retain],
                                  "Alignment" = ifelse(test = AASelect,
                                                       yes = "AA",
                                                       no = "NT")[w_retain],
                                  "Block_UID" = blockres$blockidmap,
                                  "Delta_Background" = (vec4 - diff3)[w_retain])
      } else {
        # link table is not populated
        # Zero-row data.frame with identical column names/types so the
        # final do.call(rbind, PH) never fails on an empty genome pair.
        PH[[Count]] <- data.frame("p1" = character(),
                                  "p2" = character(),
                                  "Consensus" = numeric(),
                                  "p1featurelength" = integer(),
                                  "p2featurelength" = integer(),
                                  "blocksize" = integer(),
                                  "KDist" = numeric(),
                                  "TotalMatch" = integer(),
                                  "MaxMatch" = integer(),
                                  "UniqueMatches" = integer(),
                                  "Local_PID" = numeric(),
                                  "Local_Score" = numeric(),
                                  "Approx_Global_PID" = numeric(),
                                  "Approx_Global_Score" = numeric(),
                                  "Alignment" = character(),
                                  "Block_UID" = integer(),
                                  "Delta_Background" = numeric())
      }
      # Count and PBCount are unlinked,
      # iterate through both separately and correctly
      Count <- Count + 1L
      
      # Cache eviction: DataPool holds per-genome sequence data; when it
      # grows past `Storage`, drop one cached slot that is not needed by
      # the current or next iteration.
      # NOTE(review): object.size() returns bytes, so this assumes Storage
      # was rescaled (e.g. GB -> bytes) earlier in the function, outside
      # this view -- TODO confirm upstream conversion.
      if (object.size(DataPool) > Storage) {
        # ok ... 
        # i need to nuke positions in the pool based on a few things:
        # i can't nuke the current m1,
        # i can't nuke the next m1
        # i can't nuke the next m2
        # return(list("m1" = m1,
        #             "m2" = m2,
        #             "pool" = DataPool))
        # sw2: indices of populated slots that are safe to evict
        sw1 <- sapply(X = DataPool,
                      FUN = function(x) {
                        !is.null(x)
                      })
        sw2 <- which(sw1)
        sw2 <- sw2[!(sw2 %in% c(m1, (m1 + 1L), (m2 + 1L)))]
        if (length(sw2) > 0) {
          # bonk the first one ... this might realistically need to happen in a while loop, but for now we can live with this
          # !!! assigning NULL by default deletes the position, shortening the vector,
          # we can replace the list (the container), with another container (containing NULL) without
          # shortening the list, R inferno reference and relevant examples here:
          # https://stackoverflow.com/questions/7944809/assigning-null-to-a-list-element-in-r
          DataPool[sw2[1L]] <- list(NULL)
        } else {
          # i don't know if this case can happen, but we're putting a print statement here just in case
          print("Please allocate more storage.")
        }
      }
      Prev_m1 <- m1
    } # end m2
  } # end m1
  # Collapse the per-genome-pair result and internal-attribute tables.
  res <- do.call(rbind,
                 PH)
  Attr_PH <- do.call(rbind,
                     Attr_PH)
  rownames(Attr_PH) <- NULL
  if (nrow(res) > 0) {
    # return(res)
    # Singleton pairs were flagged with Block_UID == -1L; give each its
    # own unique id past the largest assigned block id.
    # NOTE(review): if every pair is a singleton, max(All_UIDs) is -1L and
    # the new ids start at 0L -- still unique, but worth confirming that 0
    # is an acceptable Block_UID downstream.
    All_UIDs <- unique(res$Block_UID)
    res$Block_UID[res$Block_UID == -1L] <- seq(from = max(All_UIDs) + 1L,
                                               by = 1L,
                                               length.out = sum(res$Block_UID == -1L))
  }
  # Attach provenance attributes and the PairSummaries S3 class so the
  # result is interchangeable with PairSummaries() output.
  attr(x = res,
       which = "GeneCalls") <- GeneCalls
  class(res) <- c("data.frame",
                  "PairSummaries")
  attr(x = res,
       which = "AlignmentFunction") <- AlignmentFun
  attr(x = res,
       which = "KVal") <- KmerSize
  attr(x = res,
       which = "AA_matrix") <- AA_matrix
  attr(x = res,
       which = "NT_matrix") <- NT_matrix
  # Only retain the (potentially large) internal diagnostics on request.
  if (RetainInternal) {
    attr(x = res,
         which = "internal_vals") <- Attr_PH
  }
  # attr(x = res,
  #      which = "NT_Anchors") <- NT_Anchors
  
  # close pBar and return res
  if (Verbose) {
    TimeEnd <- Sys.time()
    close(pBar)
    print(TimeEnd - TimeStart)
  }
  return(res)
}
# npcooley/SynExtend documentation built on June 8, 2025, 5:24 a.m.