MMDIT: Mitochondrial mixture project

Documented in addKnownHaplotypes estimateLogLikelihood getAllDistances getAmpCoordinates getAmps getMitoGenomes getMtgenomeLength getMtgenomeSequence getNN getPops getRefAmpSeqs getSeqdiffs getSeqdiffsByAmp loadMMDIT loadReferenceGenome makeDB makeDeploidSeqGraph naiveGetAllNN naiveGetAllNNBySampAndStop writeAmp

#!/usr/bin/env Rscript
# Written by August Woerner
# This creates the (empty) database
# makeDB function
#
# And it loads it with the appropriate data

# database dependencies

#library(devtools)

# Note to self
# Install bit, bit64 and blob FIRST!
# the dependencies are not handled (correctly)...
#devtools::install_github("Ahhgust/RSQLite", ref="devel", upgrade=TRUE)
#devtools::install_github("Ahhgust/Haplotypical", ref="master", upgrade=TRUE)


# laziness. For when I source this script... nice to load libs
#loadLibs <- function() {
#library(Rcpp)
#library(DBI)
#library(RSQLite)
#library(stringr)
#library(Haplotypical)
#library(stringdist)

# for reading fasta files (reference genome)
#library(seqinr)

# quality of life dependencies
#suppressPackageStartupMessages( library(tibble) )
#suppressPackageStartupMessages( library(readr) )
#suppressPackageStartupMessages( library(dplyr) )
#suppressPackageStartupMessages( library(tidyr) )
#suppressPackageStartupMessages( library(magrittr) )

#}

# Intent:
#   This function suppresses the following notes generated by "R CMD check":
#   - "Note: no visible binding for global variable '.->ConfigString'"
#   - "Note: no visible binding for '<<-' assignment to 'ConfigString'"
# Usage:
#   Add the following right in the beginning of the .r file (before the Reference
#   class is defined in the sourced .r file):
#   suppressBindingNotes(c(".->ConfigString","ConfigString"))
suppressBindingNotes <- function(variablesMentionedInNotes) {
  for(variable in variablesMentionedInNotes) {
    assign(variable,NULL, envir = .GlobalEnv)
  }
}



# CONSTANTS
# data directory
dataDir <- 'inst/extdata'

# final database filename
dbName <- 'mmdit.sqlite3'

# reference genome
refGenome <- 'rCRS.fasta'

precisionIDBed <- "PrecisionID_mtDNA_WG_targets.bed"
default_kit <- "PrecisionID"

hmtFilename <- "Empop.tsv"

# helper function.
# Normally we send a query and then clear it. this wraps it up into 1 statement
dbSendQueryAndClear <- function(db, query) {
  DBI::dbClearResult( DBI::dbSendQuery(db, query) )
}

#' Creates (an empty) MMDIT database
#'
#' This *creates* an EMPTY MMDIT database
#' @param path a directory (e.g., data)
#' @param dbFilename (e.g., mmdit.sqlite3)
#' @param overwrite (if there's already a file with that name, should I overwrite it?)
#' and creates a database with the requisite tables in it
#' if overwrite is FALSE, then it won't overwrite an existing database
#' in that path.
#' otherwise, it will clobber what was there (use overwrite=TRUE with caution!!)
#'
#' @return the handle to the database (throughout all other documentation, this is db)
#' @export
#' @examples
#' \dontrun{
#' # Make a new MMDIT database...
#'
#' db <- makeDB(dataDir, dbName, overwrite=TRUE )
#' # add the reference genome (can be only 1)
#' loadReferenceGenome(db, paste(dataDir, refGenome, sep="/") )
#' }
#'
makeDB <- function(path, dbFilename, overwrite=FALSE) {


  if (! dir.exists(path) ) {
    dir.create(path)
    if (! dir.exists(path)) {
      stop(
        paste("Failed to create directory: ", path , "\nThis is probably a file permission issue...")
      )
      return(NULL)
    }
  }

  dbFile <- paste(path, dbFilename, sep="/")

  if (file.exists(dbFile)) { #DB already exists
    if (overwrite==FALSE) {
      stop(
          paste("DB file " , dbFile, " already exists. Let's not overwrite things, shall we?")
          )
      return(NULL)
    }
  }

  db <- DBI::dbConnect( RSQLite::SQLite(), dbFile, loadable.extensions = TRUE)
  RSQLite::initExtension(db)


  # create the database tables:

  # information on the amplicons
  dbSendQueryAndClear(db, "DROP TABLE IF EXISTS amps")
  dbSendQueryAndClear(db,
              "CREATE TABLE amps
              (ampid INTEGER PRIMARY KEY ASC,
              kit TEXT NOT NULL,
              start INTEGER NOT NULL,
              stop INTEGER NOT NULL)" )

  # and the sample IDs (and their corresponding populations and information where they came from (provenance))
  dbSendQueryAndClear(db, "DROP TABLE IF EXISTS populations")
  dbSendQueryAndClear(db,
              "CREATE TABLE populations
              (sampleid TEXT PRIMARY KEY,
              whence TEXT,
              pop TEXT NOT NULL)" )

# primary key is the rowid (default in SQlite)
  dbSendQueryAndClear(db, "DROP TABLE IF EXISTS full_mito_diffseqs")
  dbSendQueryAndClear(db,
            "CREATE TABLE full_mito_diffseqs
              (
              sampleid TEXT NOT NULL,
                 position INTEGER NOT NULL,
                 event TEXT NOT NULL,
                 basecall TEXT NOT NULL,
                 FOREIGN KEY (sampleid) REFERENCES populations(sampleid)
              )" )


  # a trivial table of just the reference genome
  dbSendQueryAndClear(db, "DROP TABLE IF EXISTS mtgenome")
  dbSendQueryAndClear(db,
                      "CREATE TABLE mtgenome
              (mitoid INTEGER PRIMARY KEY ASC,
                sequence TEXT,
                seqlen INTEGER NOT NULL)")




  return(db)
}


#' Loads MMDIT
#'
#' This *loads* the MMDIT database
#' path can either be the full path of the sqlite3 file
#'  (dbFilename must be "")
#' OR
#' path can be the directory
#' and the database name can be supplied separately
#'
#' @param path a directory (e.g., data)
#' @param dbFilename (e.g., mmdit.sqlite3)
#'
#' @export
loadMMDIT <- function(path=dataDir, dbFilename=dbName) {

  if (path==dataDir) {
     path<-system.file("extdata", "", package="MMDIT")
  }

  if (dbFilename != "") {
    dbFile <- paste(path, dbFilename, sep="/")
  } else {
    dbFile <- path
  }

  if (file.exists(dbFile)) {
    db <- DBI::dbConnect( RSQLite::SQLite(), dbFile, loadable.extensions = TRUE)
    RSQLite::initExtension(db)
    return(db)
  }
  stop(paste(dbFile, "does not exist...", path, dbFilename))
  return(NULL)
}

#' Use this to add the rCRS sequence to MMDIT
#' It adds the record to the table mtgenome
#' this table has *1* row in it. The reference genome.
#' If you call this function multiple times, the last time
#' will be the record that "sticks"
#'
#' Technically any genome would work here, BUT
#' the difference encodings MUST be with respect to this reference genome!!
#' i.e., in all likelihood this should *ONLY* be the rCRS sequence.
#'
#' @param db (database handle)
#' @param fastaFile (must be both the name of the file and the path to open it. This is passed directly to seqinr::read.fasta
#'
#' @examples
#' \dontrun{
#' # loadReferenceGenome(db, paste(dataDir, refGenome, sep="/") )
#' }
loadReferenceGenome <- function(db, fastaFile) {

   fa <- seqinr::read.fasta(fastaFile, as.string=TRUE, seqtype="DNA", seqonly = TRUE)
   len <- nchar(fa)
   dbSendQueryAndClear(db,
                 sprintf(
                   "INSERT INTO mtgenome (mitoid, sequence, seqlen) VALUES
                    (%d, \"%s\", %d)",
                    1, fa[[1]], len[[1]])
               )
   return(db)
}

#' Writes amplicon data to MMDIT
#'
#' this loads in amplicon data (i.e., the coordinates of the amps)
#' into the MITO db.
#' If the alignments were to a modified reference (duplicating bits to accommodate circular alignment)
#' this routine converts those back to the original units.
#' thus the last amplicon may "wrap around" (eg., 16541-80 is a valid locus)
#'
#' @param db (the database connection)
#' @param ampFile (a BED file of amplicons; columns 2 and 3 are used only)
#' @param kitName (the NAME of the kit that this corresponds to (e.g., PrecisionID) )
#' @param sep (\"\\t\" for a tab-separated file. Anything else is a little wacky... no longer a bed file)
#' @param append (whether or not the amplicons are APPENDED to the list of current amplicons; set to FALSE to overwrite)
#'
#' @return the database handle (not necessary to keep this)
#'
#' @examples
#' # One way to add another kit:
#' \dontrun{
#' # loadAmpData(db, "data/Mybedfile.bed", kit="SomeOtherKit", append=TRUE)
#' }
#'
writeAmp <- function(db, ampFile,  kitName, sep="\t", append=TRUE) {
   amps <- suppressMessages( readr::read_delim(ampFile, delim=sep) )

   overwrite<-FALSE
   if (append == FALSE) {
     overwrite<-TRUE
   }

   if (! overwrite) {
     redundantCount <-
       DBI::dbGetQuery(db, sprintf(
       "SELECT count(kit) FROM amps WHERE kit == '%s'", kitName))

    if (redundantCount[[1]] > 0) {
        stop(
          sprintf("Kit %s is already in the database...\nYou can't add it twice!", kitName)
        )
      return(db)
    }
   }
   genomeLength <- DBI::dbGetQuery(db, "SELECT seqlen FROM mtgenome LIMIT 1")[[1]]
   amps.df <- tibble::tibble(kit=kitName,
                             start= as.integer(dplyr::pull(amps,2)),
                             stop = as.integer(dplyr::pull(amps,3))) # coerce into a  tibble
   DBI::dbWriteTable(db,
                "amps", amps.df, append=append, overwrite=overwrite,
                field.types=c("kit"="character", "start"="integer","stop"="integer")
                )
   return(db)
}


#' Get amplicon coordinates
#'
#' Generates the amplicon coordinates
#' for some kit
#' For use with MMDIT only!
#'
#' @param db the database connection
#' @param kit the kit name (must be a valid KIT in the amps table)
#' @return returns data frame with 4 columns: the amp ID, the start coordinate (0-based), the stop coordinate (1-based)
#'
#' @examples
#' \dontrun{
#' amps <- getAmpCoordinates(db, kit="PrecisionID")
#' }
#' @export
getAmpCoordinates <- function(db, kit=default_kit) {
  #TODO: Take in start/stop coordinates as well as a kit.
  loc <- DBI::dbGetQuery(db,
                         sprintf(
                           "SELECT start, stop
                    FROM
                     amps
                    WHERE
                    kit= '%s'", kit) )

  if (nrow(loc) == 0) {
    stop(
      paste(
        "No data for kit ", kit , " was found...")
    )
  }
  return(loc)
}


#' Amplicon sequences of reference
#'
#' Generates the amplicon sequences
#' based on the reference sequence.
#' For use with MMDIT only!
#'
#' @param db the database connection
#' @param kit the kit name (must be a valid KIT in the amps table)
#' @return returns data frame with 4 columns: the amp ID, the start coordinate (0-based), the stop coordinate (1-based), and the Seq(uence) of the rcrs for the amp
#'
#' @examples
#' \dontrun{
#' amps <- getRefAmpSeqs(db, kit="PrecisionID")
#' }
#' @export
getRefAmpSeqs <- function(db, kit=default_kit) {
  #TODO: Take in start/stop coordinates as well as a kit.
  loc <- DBI::dbGetQuery(db,
                    sprintf(
                    "SELECT start, stop
                    FROM
                     amps
                    WHERE
                    kit= '%s'", kit) )

  if (nrow(loc) == 0) {
     stop(
       paste(
       "No data for kit ", kit , " was found...")
     )
  }
  # get the mitochondrial genome (concatenated with itself)
  # the self-concatentation allows for the easy lookup of amplicons that span the
  # first and last bases of the linearized mitochondrial genome.
  mtgenomeSeq <- stringr::str_dup(
    DBI::dbGetQuery(db,
                            "SELECT sequence FROM mtgenome LIMIT 1")[[1]],
    2)

  mtgenomeLen <- DBI::dbGetQuery(db,
                            "SELECT seqlen FROM mtgenome LIMIT 1")[[1]]

  # recall substr using 1-based indexing
  # and the coordinates are bed format (0-based start, 1-based stop)
  loc$Seq <-
      stringr::str_sub(mtgenomeSeq,
                       loc$start + 1,
                       ifelse( loc$start < loc$stop,
                         loc$stop,
                         loc$stop + mtgenomeLen))

  loc
}


getSeqdiffCodes <- function() {
  c(X=0, D=1, I=2)
}


initDB <- function() {
  dataDir<-system.file("extdata", "", package="MMDIT")

  db <- makeDB(dataDir, dbName, overwrite=TRUE )

# add the reference genome (can be only 1)
  loadReferenceGenome(db, paste(dataDir, refGenome, sep="/") )
# add amplicons for the precision ID panel
  loadAmpData(db, paste(dataDir, precisionIDBed, sep="/"), kitName=default_kit, sep="\t" )

# make the edit distance a true editdist
  initEditDists(db)

  refAmps <- getRefAmpSeqs(db)
  return(db)
}


#' get all seqdiffs for some population
#'
#' @param db the database handle
#' @param pop the population requested (defaults to all). vectorized populations okay
#' @param ignoreIndels strips out indel events (default TRUE)
#' @param getPopulation gets the population label along with the sequence differences
#' @export
getSeqdiffs <- function(db, pop="%", ignoreIndels=FALSE, getPopulation=FALSE) {

  # optionally we want to filter out indels...
  indelString <- ""
  if (ignoreIndels) {
    indelString <- " AND event = 'X'"
  }

  popString <- ""
  if (getPopulation) {
    popString <- ", populations.pop "
  }

  genomeLength <- DBI::dbGetQuery(db, "SELECT seqlen FROM mtgenome LIMIT 1")[[1]]

  diffs <- DBI::dbGetQuery(db,
                           paste0(
                             "SELECT populations.sampleid, position, event, basecall ", popString ,
                             "FROM   full_mito_diffseqs, populations ",
                             "WHERE  populations.sampleid  = full_mito_diffseqs.sampleid ",
                             "AND (",
                             "pop LIKE '",
                             stringr::str_c(pop, collapse="' OR pop LIKE '"),
                             "')" ,
                             indelString
                           )
  )

  return(diffs)
}




#' get all seqdiffs for some population
#'
#' @param db the database handle
#' @param pop the population requested (defaults to all). vectorized populations okay
#' @param ignoreIndels strips out indel events (default TRUE)
#' @param getPopulation gets the population label along with the sequence differences
#' @param kit nameof amplicon sequencing kit...
#' @export
getSeqdiffsByAmp <- function(db, pop="%", ignoreIndels=FALSE, getPopulation=FALSE, kit=default_kit) {

# optionally we want to filter out indels...
  indelString <- ""
  if (ignoreIndels) {
    indelString <- " AND event = 'X'"
  }
  popString <- ""
  if (getPopulation) {
    popString <- ", populations.pop "
  }

  genomeLength <- DBI::dbGetQuery(db, "SELECT seqlen FROM mtgenome LIMIT 1")[[1]]

  diffs <- DBI::dbGetQuery(db,
                            paste0(
                              "SELECT populations.sampleid, position, event, basecall, amps.start, amps.stop ", popString ,
                              "FROM   full_mito_diffseqs, populations, amps ",
                              "WHERE  populations.sampleid  = full_mito_diffseqs.sampleid ",
                              "AND amps.kit == '" , kit , "' ",
                              "AND (position > amps.start AND position <= amps.stop OR ",
                                   "position + " , genomeLength , " >amps.start AND position +" , genomeLength, " <= amps.stop)" ,
                              "AND (",
                                      "pop LIKE '",
                                      stringr::str_c(pop, collapse="' OR pop LIKE '"),
                                      "')" ,
                              indelString
                             )
  )

  return(diffs)
}

#' Length of mito-genome (reference)
#'
#' This is the non-circularized (concatenated)
#' mtgenome length (16569 unless some real funny-business is happening)
#'
#' @param db the database handle
#'
#' @export
getMtgenomeLength <- function(db) {
  DBI::dbGetQuery(db,
                  "SELECT seqlen FROM mtgenome LIMIT 1")[[1]]
}

#' Mito-genome (reference) sequence
#'
#' Returns the self-concatenated mitochondrial reference genome
#' Self-concatenation allows us to handle the circular alignment problem
#' (eg, reads, amplicons that span the canonical "start" of the mt-genome)
#' and coordinates from a circular alignment
#'
#' @param db the database handle
#' @param double whether or not the sequence should be self-concatenated (circularized)
#' @export
getMtgenomeSequence <- function(db, double=TRUE) {
  if (double) {
  return(stringr::str_dup(
    DBI::dbGetQuery(db,
                    "SELECT sequence FROM mtgenome LIMIT 1")[[1]],
    2))
  }
  return(DBI::dbGetQuery(db,
                  "SELECT sequence FROM mtgenome LIMIT 1")[[1]])
}

#' gets full mitogenomes
#'
#' generates amplicon sequence data from difference encodings
#' @importFrom magrittr %>%
#' @param db the database handle
#' @param pop the population requested (defaults to all). vectorized populations okay
#' @param ignoreIndels strips out indel events (default FALSE)
#' @param blk exclusion list of sites to filter out!
#'
#' @export
getMitoGenomes <- function(db, pop='%', ignoreIndels=FALSE, blk=c()) {

  mtgenomeLen <- DBI::dbGetQuery(db,
                                 "SELECT seqlen FROM mtgenome LIMIT 1")[[1]]

  lookupcodes <- getSeqdiffCodes() # translates X,I,D into 0,1,2
  diffs <- getSeqdiffs(db, pop, ignoreIndels, getPopulation=TRUE)
  # making this into a factor retains individuals (regardless of filtering)
  diffs$sampleid <- factor(diffs$sampleid)

  diffs<-dplyr::filter(diffs,
    ! position %in% blk,
    ! position %in% (blk+mtgenomeLen) )

  refGenome <- getMtgenomeSequence(db, double=FALSE)

  # for each individual / amplicon (with a difference to the rCRS)
  #
  dplyr::group_by(diffs,
                  sampleid,
                  pop
  )  %>%
    dplyr::mutate( # deal with circular coordinates, compute the coordinate of the SNP
      # within the amplicon
      position=as.integer(position)
    ) %>%
    dplyr::arrange(position) %>%
    dplyr::summarize( # and interpolate; make the string based on the string differences
      sequence=seqdiffs2seq(
        refGenome,
        position,
        lookupcodes[ event ],
        basecall
      )
    ) -> foo

  return(foo)
}

#' seqgraphs!
#'
#' generates a sequence graph for NN search
#' the input is a mitochondrial mixture (in "empop long" format)
#' the output is a seqgraph (from Haplotypical)
#'
#' @importFrom magrittr %>%
#' @param db the database handle
#' @param positions positions of alleles in mixture (1-based)
#' @param alleles allele calls in mixture (1-based)
#' @param types type of variation; X == mismatch, I == insertion, D == deletion.
#' @param ignoreIndels Must be set to true. Correctness is only guaranteed if indels are ignored!
#' @export
makeDeploidSeqGraph <- function(db, positions, alleles, types, ignoreIndels=TRUE) {

  if (!ignoreIndels) {
    stop("Cannot be. deploidNN only currently supports mismatches...")
  }

  if (sum(!types %in% c("X", "I", "D"))>0 ) {
    stop("Event types must be in the set {X I D}, which is not the case!")
  }

  tib <- tibble::tibble(Pos=positions, Al=alleles, Type=types)
  dplyr::filter(tib, Type=='X') %>%
    dplyr::group_by(Pos) %>%
    dplyr::summarize(N=dplyr::n_distinct(Al)) %>%
    dplyr::filter(N==1) %>%
    dplyr::ungroup() -> homoMismatches

  # join pragma only works with mismatches
  # filter the table to just consider the sites that are monomorphic
  homos <- dplyr::semi_join(tib, homoMismatches, by="Pos") %>%
    dplyr::arrange(Pos) %>%
    dplyr::filter(Type=='X')

  ref <- getMtgenomeSequence(db, double=FALSE)
  types <- rep(0L, nrow(homos))

  refRecoded <- seqdiffs2seq(ref, homos$Pos, types, homos$Al, nchar(ref)+1)

  # (wrong iff indels) get the ambiguous sites...
  others <- dplyr::anti_join(tib, homos, by="Pos")
  getSeqdiffCodes() -> typeLUT

  # commented out code for edit-distance. if we're tossing indels, let's go simpler (Hamming)
  #Haplotypical::makeSequenceGraph(refRecoded, others$Al, others$Pos, typeLUT[ others$Type ]+1)

  Haplotypical::makeSequenceHammingGraph(refRecoded, others$Al, others$Pos)
}


testDeploidNN <- function(db) {


  ref <- getMtgenomeSequence(db, double=FALSE)

  # whatever design you choose, do this *once*
  mtGenomes <- getMitoGenomes(db, pop="AM", ignoreIndels=TRUE) # make haploid genomes
  allDiffs <- getSeqdiffs(db, "AM", ignoreIndels=TRUE) # all sequence differences for the "AM" population


  # and then iterate over pairs...
  mixy <- dplyr::filter(allDiffs, sampleid=="AM_AR_0001"|sampleid=="AM_AR_0002") # pick 2 folks.


  # We need to emualate the basecalls as per converge
  # namely-- if the two people have the same allele, we need to have *1 row*; in the current form there's two rows with the same INFO
  # (other than who has the allele)

  # if the two people have different alleles and their both not the rCRS, we need two rows (with both alleles)
  # (deploid will filter that out anyways)
  # AND
  # if the two people are different and one == the rCRS at that position
  # the current representation has 1 row
  #
  # for converge processing there would be two alleles at this location instead
  # (a "heterozygote" call)
  # which would get split into two rows, one for each allele (with the value of each row either being the allele call or the reference)

  # get all of the reference alleles
  dplyr::pull(mixy, position) %>% unique() %>% sort -> mixPos
  refAlleles <- substring(ref, mixPos, mixPos)
  refTib <- tibble::tibble(position=mixPos, RefAllele=refAlleles)

  # add a RefAllele column; inner_join is equivalent here
  dplyr::left_join(mixy, refTib, by="position") ->mixy

  dplyr::group_by(mixy, position) %>%
    dplyr::summarize(N=dplyr::n(), # number of allele calls at site. 1 or 2
                     A1=basecall[[1]], # no matter what, first allele is what we have
                     A2=ifelse(N==1, RefAllele[[1]], basecall[[2]]), # second allele is the ref if there's no other info,
                     Col=ifelse(A1==A2, A1, paste(A1, A2, sep=";"))  # concatenate the two records as a ; separated string
                     ) -> mixSplit

  dplyr::select(mixSplit, position, Col) %>%
    tidyr::separate_rows(Col, sep=";") %>% # split the Col string; two rows for het calls; 1 row for hom calls
    dplyr::mutate(event="X") %>% # define the event to be a substitution
    dplyr::arrange(position) -> mixAsPerConverge


  # make a sequence graph
  # supports indels...
  makeDeploidSeqGraph(db, mixAsPerConverge$position, mixAsPerConverge$Col, mixAsPerConverge$event, ignoreIndels=TRUE) -> gr

  # all edit distances; capped at 4 (0-3 are meaningful distances)
  mtGenomes$edDist <- Haplotypical::fastBoundedHammingGraphDist(gr, mtGenomes$sequence, 4)
  filter(mtGenomes, edDist<4, sampleid!="AM_AR_0001", sampleid!="AM_AR_0002") %>% pull(sampleid) %>% as.character() -> nearNeighbors

  # old code (levenshtein distance)
  # mtGenomes$edDist <- -1

  #for (i in 53:nrow(mtGenomes)) {
   # print(i)
  #  mtGenomes$edDist[[i]] <- getSequenceGraphEditDistance(alignSequenceGraph(gr, mtGenomes$sequence[[i]] ))
  #}



}


#' makes amps
#'
#' generates amplicon sequence data from difference encodings
#' @importFrom magrittr %>%
#' @param db the database handle
#' @param pop the population requested (defaults to all). vectorized populations okay
#' @param ignoreIndels strips out indel events (default TRUE)
#' @param kit the amplicon kit
#' @param blk blacklist of sites to filter out!
#' @export
getAmps <- function(db, pop='%', ignoreIndels=FALSE, kit=default_kit, blk=c()) {

  mtgenomeLen <- DBI::dbGetQuery(db,
                                 "SELECT seqlen FROM mtgenome LIMIT 1")[[1]]



  origdiffs <- getSeqdiffsByAmp(db, pop, ignoreIndels, getPopulation=TRUE, kit) # without subsampling sites...

  # just the sequence differences we care about
  # note that the sample id may be filtered (if only variant for this person is in the blacklist)
  # thus to get the original smapleids we need to refer to origdiffs
  diffs <- dplyr::filter(origdiffs,
      ! position %in% blk,
      ! position %in% (blk+mtgenomeLen) )

  refamps <-   getRefAmpSeqs(db, kit)
  lookupcodes <- getSeqdiffCodes()

  # for each individual / amplicon (with a difference to the rCRS)
  #
  dplyr::group_by(diffs,
      sampleid,
      pop,
      start,
      stop
    ) %>%
    dplyr::inner_join( # grab the reference sequence associated with that amplicon
      refamps,
      by=c("start", "stop")
    ) %>%
    dplyr::mutate( # deal with circular coordinates, compute the coordinate of the SNP
      # within the amplicon
      position=as.integer(position),
      relpos=as.integer(ifelse(
        position > start,
        position - start,
        position + mtgenomeLen - start))
    ) %>%
    dplyr::arrange(position) %>%
    summarize( # and interpolate; make the string based on the string differences
      sequence=seqdiffs2seq(
        Seq[[1]],
        relpos,
        lookupcodes[ event ],
        basecall
        )
      ) -> foo
# at this point, foo only has the sequence differences.
# any amp that is == to the rCRS is not present.
# we add a no-op sequence difference for individuals that == the rCRS
# (to ensure the individual isn't lost with the join)
  base::expand.grid( # first, make a data frame with all pairs of individuals and stop coordinates
    sampleid=unique(origdiffs$sampleid),
    stop=refamps$stop, stringsAsFactors = FALSE) %>%
    dplyr::left_join(refamps, # and add in the start and reference Seq
              by=c("stop")) %>%
    dplyr::left_join(foo, # and left-join with the sequence differences
              by=c("sampleid", "start", "stop")) %>%
    # from here, if there was a difference encoding for the amp, that amps sequence is 'sequence'
    # otherwise it's NA (no differences to reference)
    dplyr::mutate(sequence=ifelse(is.na(sequence), Seq, sequence)) %>% # change NAs to reference
    dplyr::select(sampleid, start, stop, sequence) -> amps # and reorder the columns


  return(amps)
}

#' finds nearest neighbors
#'
#' @param alldist the output from getAllDistances
#' @param knn the number of nearest neighbors to find (ties broken arbitrarily)
#' @param maxdist if >-1, returns all neighbors within distance <= maxdist (ignores knn argument)
#' @param useIdentityFunction counts the number of amplicons that differ, not the sum of the distances
getNN <- function(alldist, knn=20, maxdist=-1, useIdentityFunction=FALSE) {
   dplyr::group_by(alldist, sampleid) %>%
    dplyr::summarize(DistSum=sum(dist), SumMismatch=sum(dist>0)) %>%
    dplyr::arrange(DistSum) %>%
    dplyr::ungroup() -> bydist
   if (maxdist < 0) {
    return(  utils::head(bydist, n=knn) )
   }
   if (useIdentityFunction) {
    return(  dplyr::filter(bydist, SumMismatch <= maxdist) )
   }
  return(  dplyr::filter(bydist, DistSum <= maxdist) )
}

#' solves the All 1NN
#'
#' (much) more efficient solutions exist (cover trees)
#' O(n log n) for n individuals and a constant genome length
#' the solution provided is O(n^2) by the same count.
#'
#' This looks at each individual in amps
#' and finds the 2NN of that individual and the associated distances
#' and the selects the 2nd individual. in theory this could give you
#' the same individual as you queried (ties are broken arbitrarily).
#' If you care about that... write your own :)
#' I just want the distances!
#'
#' @param amps the output from getAmps
#' @param ignoreIndels (whether or not indels were used to constitute the strings...)
naiveGetAllNN <- function(amps, ignoreIndels=FALSE) {
  dplyr::group_by(amps, sampleid) %>%
    dplyr::summarize(NNDist=
                list(
                  getNN(
                    getAllDistances(amps, sequence,stop, ignoreIndels=ignoreIndels),
                    knn=2)[2,]

                )
    )

}

#' solves the All 1NN
#'
#' (much) more efficient solutions exist (cover trees)
#' O(n log n) for n individuals and a constant genome length
#' the solution provided is O(n^2) by the same count.
#' This version of the code finds the NN for each amplicon (not for the whole mito)
#'
#' This looks at each individual in amps
#' and finds the 2NN of that individual and the associated distances
#' and the selects the 2nd individual. in theory this could give you
#' the same individual as you queried (ties are broken arbitrarily).
#' If you care about that... write your own :)
#' I just want the distances!
#'
#' @param amps the output from getAmps
#' @param ignoreIndels (whether or not indels were used to constitute the strings...)
naiveGetAllNNBySampAndStop <- function(amps, ignoreIndels=FALSE) {
  amps %>%
  dplyr::group_by(sampleid, stop) %>%
    dplyr::summarize(NNDist=
                       list(
                           getAllDistances(dplyr::filter(amps, stop==stop), sequence,stop, ignoreIndels=ignoreIndels)[2,]
                        )
    ) %>% tidyr::unnest()


}


writeOneNNTib <- function(amps) {
    amps %>% naiveGetAllNN(TRUE) -> all1NNTib
  readr::write_tsv( tidyr::unnest(all1NNTib), "All1NN.tsv")
  amps %>% naiveGetAllNNBySampAndStop(TRUE) -> all1NNByAmp
  readr::write_tsv(all1NNByAmp, "All1NN_byAmp.tsv")


}

#' generates amplicon sequence data from difference encodings

#' @param amps the output from getAmps
#' @param seqs query sequences
#' @param stops the stop coordinate of said sequences
#' @param ignoreIndels boolean; must match amps
getAllDistances <- function(amps, seqs, stops, ignoreIndels=FALSE) {

  tmp <- tibble::tibble(QuerySeq=seqs, stop=stops)
  dplyr::left_join(tmp,
                   amps,
                   by=c("stop")) ->cmbnd

  type <- "lv" # levenstein
  if (ignoreIndels) { # choose the appropriate distance function...
    type <- "hamming"
  }
  cmbnd$dist <- stringdist::stringdist(cmbnd$QuerySeq, cmbnd$sequence, method=type)
  return(dplyr::arrange(cmbnd, stop, dist))
}

#' gets population labels
#'
#' This takes in the database handle
#' and returns a the unique populations
#'
#' @param db database handle
#'
#' @export
getPops <- function(db) {
  DBI::dbGetQuery(db, "SELECT DISTINCT(pop) FROM populations")
}

#' reads empop; adds haplotypes to database

#' This function takes an empop file containing at least four (mandatory) columns
#' (refer to https://empop.online/downloads for the emp file format)
#' and returns a tibble with two columns - SampleID and Variant
#'
#' @importFrom magrittr %>%
#' @param db database handle to MMDIT
#' @param empopFile a file of empop sequences...
#'
addKnownHaplotypes <- function(db, empopFile) {
   hmt <- Empop2variant( empopFile, s=3 )
   hmt <- tidyr::separate(hmt,
     SampleID, c("Population"), remove=F, extra='drop')

   dplyr::bind_cols(
     hmt,
     Variant2snp(hmt$Variant)) -> hmt
   dplyr::mutate(hmt,
          Type=
            ifelse(Type=="Substitution", "X",
                       substr(Type, 1, 1)) # insertion -> I, deletion to D
   ) -> hmt

  samps <-  DBI::dbGetQuery(db,
                               "SELECT DISTINCT sampleid FROM populations")
  if (samps[1,] %in% hmt$SampleID) {
    stop("Duplicate sample IDs found!")
    return(0)
  }

  dplyr::group_by(hmt, SampleID, Population) %>%
    dplyr::summarize(whence=empopFile) %>%
    dplyr::ungroup() %>%
    dplyr::select(sampleid=SampleID, whence, pop=Population) -> samps2add

  # add the individuals to the DB
  DBI::dbWriteTable(db, "populations", samps2add, append=TRUE)
  # and add the difference encodings

  dplyr::select(hmt, sampleid=SampleID,
              position=Pos,
              event=Type,
              basecall=Allele) -> toAdd

  # check for mutually exclusive allele calls
  # same person, position
  # insertions are their own bird (position is really the position between this and the next)
  toAdd %>%
      dplyr::group_by(sampleid, position, event=="I") %>%
      dplyr::summarize(N=n()) %>%
    dplyr::filter(N>1) %>%
    dplyr::ungroup() -> mistakes

  if (nrow(mistakes)>0) {
    warning(paste("Some individuals have point heteroplasmies. That is not allowed!\n", paste(unique(mistakes$sampleid))))
  }

  # pick an arbitrary allele call.
  toAdd %>%
    dplyr::mutate(IsInsertion=event=="I") %>%
    dplyr::group_by(sampleid, position, IsInsertion) %>%
      dplyr::summarize(basecall=basecall[[1]], event=event[[1]]) %>%
      dplyr::ungroup() %>%
    dplyr::select(-IsInsertion) -> toAdd

  # add the individuals to the DB
  DBI::dbWriteTable(db, "full_mito_diffseqs", toAdd, append=TRUE)


  return(1)
}




#' Multi-population log-likelihood estimation
#'
#' This computes the P(evidence|Hypothesis) for a series of alleles
#' (allelesObserved, taken from known haplotypes and allelesThatExplain, populated from database)
#' which together explain the evidence.
#'
#' The likelihood is estimated in multiple populations. To do so, provide parallel vectors of alleles (databaseAlleles)
#' and the population affiliations (databasePopulations). Each are treated as categorical variables (case sensitive,
#' either the same or different).
#'
#' If a theta value is provided, the theta-corrected likelihood is estimated
#' (see: estimateLog10LikelihoodTheta)
#' otherwise the naive estimator is used.
#'
#' @param allelesObserved vector of alleles observed (empirically) (not database, may be empty)
#' @param allelesThatExplain vector of alleles that explain the mixture
#' @param theta theta-correction (Fst, taken from fst_buckleton)
#' @param databaseAlleles a vector of alleles sampled from the population (also from database, but may in practice come from a different sub-population)
#' @param databasePopulations a vector of population identifiers (paired with databaseAlleles; also from database)
#' @param ... (passed to estimateLog10LikelihoodNaive in the case that theta is null)
estimateLogLikelihood <- function(allelesObserved, allelesThatExplain, databaseAlleles, databasePopulations, theta=NULL, ...) {

  dat <- tibble::tibble(
    PopAlleles=databaseAlleles,
    Population    =databasePopulations)



  if (is.null(theta)) {
    # all alleles together that explain the mixture
     ally <- c(allelesObserved, allelesThatExplain, recursive=TRUE)
     dplyr::group_by(dat, .data$Population) %>%
      dplyr::summarize(LogLike=estimateLog10LikelihoodNaive(
        ally,
        PopAlleles,
        populationCounts=NULL,
        ...),
        Estimator='Naive') -> ret

  } else {
    dplyr::group_by(dat, .data$Population) %>%
      dplyr::summarize(LogLike=
                         estimateLog10LikelihoodTheta(
                           allelesObserved,
                           allelesThatExplain,
                           theta,
                           PopAlleles,
                           populationCounts=NULL
                         ),
                       Estimator='Theta corrected'
      ) -> ret
  }

  return(ret)
}