R/jointDVI.R
In dvir: Disaster Victim Identification

Documented in compactJointRes jointDVI

#' Joint DVI search
#'
#' Victims are given as a list of singletons, and references as a list of
#' pedigrees. All possible assignments are evaluated and solutions ranked
#' according to the likelihood.
#'
#' @param dvi A `dviData` object, typically created with [dviData()].
#' @param pairings A list of possible pairings for each victim. If NULL, all
#'   sex-consistent pairings are used.
#' @param ignoreSex A logical.
#' @param assignments A data frame containing the assignments to be considered
#'   in the joint analysis. By default, this is automatically generated by
#'   taking all combinations from `pairings`.
#' @param limit A positive number, by default 0. Only pairwise LR values above
#'   this are considered.
#' @param nkeep An integer, or NULL. If given, only the `nkeep` most likely
#'   pairings are considered for each victim.
#' @param markers A vector indicating which markers should be included in the
#'   analysis. By default all markers are included.
#' @param disableMutations A logical, or NA (default). The default action is to
#'   disable mutations in all reference families without Mendelian errors.
#' @param undisputed A logical, by default TRUE.
#' @param maxAssign A positive integer. If the number of assignments going into
#'   the joint calculation exceeds this, the function will abort with an
#'   informative error message. Default: 1e5.
#' @param threshold A positive number, passed onto [findUndisputed()]. Default:
#'   1e4.
#' @param strict A logical, passed onto [findUndisputed()]. Default: FALSE.
#' @param relax Deprecated.
#' @param numCores An integer; the number of cores used in parallelisation.
#'   Default: 1.
#' @param check A logical, indicating if the input data should be checked for
#'   consistency.
#' @param verbose A logical.
#' @param jointRes A data frame produced by `jointDVI()`.
#' @param LRthresh A positive number, used as upper limit for the LR comparing the
#'   top result with all others.
#'
#' @return A data frame. Each row describes an assignment of victims to missing
#'   persons, accompanied with its log likelihood, the LR compared to the null
#'   (i.e., no identifications), and the posterior corresponding to a flat
#'   prior.
#'
#'   The function `compactJointRes()` removes columns without assignments, and
#'   solutions whose LR compared with the top result is below `1/LRthresh`. 
#'   
#' @seealso [pairwiseLR()], [findUndisputed()]
#'
#' @examples
#' jointDVI(example2)
#'
#' @importFrom utils setTxtProgressBar txtProgressBar
#'
#' @export
jointDVI = function(dvi, pairings = NULL, ignoreSex = FALSE, assignments = NULL, 
                    limit = 0, nkeep = NULL, undisputed = TRUE, markers = NULL, 
                    threshold = 1e4, strict = FALSE, relax = !strict, disableMutations = NA, 
                    maxAssign = 1e5, numCores = 1, check = TRUE, verbose = TRUE) {
  
  st = Sys.time()
  
  # Ensure proper dviData object
  dvi = consolidateDVI(dvi)
  
  if(length(dvi$pm) == 0)
    undisputed = FALSE
  
  origVics = vics = names(dvi$pm)
  
  if(!is.null(markers)) {
    stop2("Marker selection not implemented")
    # pm = selectMarkers(pm, markers)
    # am = selectMarkers(am, markers)
  }
  
  if(verbose)
    print.dviData(dvi)
  
  if(check)
    checkDVI(dvi, pairings = pairings, ignoreSex = ignoreSex, verbose = verbose)
  
  ### Mutation disabling
  if(any(allowsMutations(dvi$am))) {
    am = dvi$am
    
    if(verbose) 
      cat("\nMutation modelling:\n")
    
    if(isTRUE(disableMutations)) {
      if(verbose) cat(" Disabling mutations in all families\n")
      disableFams = seq_along(am)
    }
    else if(identical(disableMutations, NA)) {
      am.nomut = setMutmod(am, model = NULL)
      badFams = vapply(am.nomut, loglikTotal, FUN.VALUE = 1) == -Inf
      if(verbose) {
        if(any(badFams)) 
          cat("", sum(badFams), "inconsistent families:", trunc(which(badFams)), "\n")
        cat("", sum(!badFams), "consistent families. Disabling mutations in these\n")
      }
      disableFams = which(!badFams)
    }
    else disableFams = NULL
  
    if(length(disableFams)) {
      am[disableFams] = setMutmod(am[disableFams], model = NULL)
    }
    
    # Update DVI object
    dvi$am = am
  }
  
  ### Identify and fixate "undisputed" matches
  undisp = as.data.frame(list())
  
  if(undisputed && is.null(assignments)) {
    
    r = findUndisputed(dvi, pairings = pairings, ignoreSex = ignoreSex, 
                       threshold = threshold, strict = strict, limit = limit, 
                       nkeep = nkeep, numCores = numCores, verbose = verbose)
    
    # List of undisputed, and their LR's
    undisp = r$summary
    Nun = nrow(undisp)
    
    # If all are undisputed, return early
    # Either all *victims* are matched or all *missing* are identified
    # The code below covers both scenarios
    if(Nun == length(dvi$pm) || Nun == length(dvi$missing)) {
      
      # Build solution assignment (must be a data frame)
      sol = rep(list("*"), length(dvi$pm))
      names(sol) = vics
      sol[undisp$Sample] = undisp$Missing
      sol = as.data.frame(sol)
      
      # Run through jointDVI() with the solution as the only assignment 
      res = jointDVI(dvi, ignoreSex = ignoreSex, assignments = sol, undisputed = FALSE,
                     markers = markers, threshold = NULL, check = FALSE, verbose = FALSE)
      return(res)
    }
    
    # Reduced DVI problem to be used in the joint analysis
    dvi = r$dviReduced
    vics = names(dvi$pm)
    
    # pairings: These exclude those with LR = 0!
    pairings = r$pairings
    
    if(verbose && Nun > 0)
      print.dviData(dvi, heading = "\nReduced DVI dataset:")
  }
  
  pm = dvi$pm 
  am = dvi$am
  
  if(is.null(pairings) && is.null(assignments))
    pairings = pairwiseLR(dvi, pairings = pairings, ignoreSex = ignoreSex, limit = limit, nkeep = nkeep)$pairings
 
  if(is.null(assignments)) {
    if(verbose) cat("\nCalculating pairing combinations\n")
    # Expand pairings to assignment data frame
    assignments = expand.grid.nodup(pairings, max = maxAssign)
  }
  else {
    if(verbose) cat("\nChecking supplied pairing combinations\n")
    if(!setequal(names(assignments), origVics))
      stop2("Names of `assignments` do not match `pm` names")
    assignments = assignments[origVics]
  }
  
  nAss = nrow(assignments)
  if(nAss == 0)
    stop2("No possible solutions!")
  if(verbose)
    cat("Assignments to consider in the joint analysis:", nAss, "\n\n")
  
  # Convert to list; more handy below
  assignmentList = lapply(1:nAss, function(i) as.character(assignments[i, ]))
  
  # Initial loglikelihoods
  logliks.PM = vapply(pm, loglikTotal, FUN.VALUE = 1)
  logliks.AM = vapply(am, loglikTotal, FUN.VALUE = 1)
  
  loglik0 = sum(logliks.PM) + sum(logliks.AM)
  if(loglik0 == -Inf)
    stop2("Impossible initial data: AM component ", which(logliks.AM == -Inf))
  
  # Parallelise
  if(numCores > 1) {
    
    if(verbose) 
      cat("Using", numCores, "cores\n")
    
    cl = makeCluster(numCores)
    on.exit(stopCluster(cl))
    clusterEvalQ(cl, library(dvir))
    clusterExport(cl, "loglikAssign", envir = environment())

    # Loop through assignments
    loglik = parLapply(cl, assignmentList, function(a) 
      loglikAssign(pm, am, vics, a, loglik0, logliks.PM, logliks.AM))
  }
  else {
    # Setup progress bar
    if(progbar <- verbose && interactive())
      pb = txtProgressBar(min = 0, max = nAss, style = 3)
    
    loglik = lapply(seq_len(nAss), function(i) {
      if(progbar) setTxtProgressBar(pb, i)
      loglikAssign(pm, am, vics, assignmentList[[i]], loglik0, logliks.PM, logliks.AM)
    })
    
    # Close progress bar
    if(progbar) close(pb)
  }
  
  loglik = unlist(loglik)
  
  LR = exp(loglik - loglik0)
  posterior = LR/sum(LR) # assumes a flat prior
  
  # Add undisputed matches
  Nun = nrow(undisp)
  if(Nun > 0) {
    # Add ID columns
    assignments[, undisp$Sample] = rep(undisp$Missing, each = nAss)
    
    # Bug fix: Add * columns for victims lost in `subsetDVI` in undisputed
    missVic = setdiff(origVics, names(assignments))
    for(v in missVic)
      assignments[[v]] = "*"
    
    # Fix ordering
    assignments = assignments[origVics]
    
    # Fix LR: Multiply with that of the undisputed
    LR = LR * prod(undisp$LR)
  }
    
  # Collect results
  tab = cbind(assignments, loglik = loglik, LR = LR, posterior = posterior)
  
  # Sort in decreasing likelihood, break ties with grid
  g = assignments
  g[g == "*"] = NA
  tab = tab[do.call(order, g), , drop = FALSE] # first sort assignments alphabetically
  tab = tab[order(round(tab$loglik, 10), decreasing = TRUE), , drop = FALSE]
  
  rownames(tab) = NULL
  
  if(verbose)
    cat("Time used:", format(Sys.time() - st, digits = 3), "\n")
  
  tab
}

#' @rdname jointDVI
#' @export
compactJointRes = function(jointRes, LRthresh = NULL) {
  if(!is.null(LRthresh)) {
    # Require LR > threshold
    keepRows = jointRes$LR >= LRthresh
    
    # Also require LR_1:a > threshold (i.e. with top result as numerator)
    logLRtop = jointRes$loglik[1] - jointRes$loglik
    keepRows = keepRows & logLRtop - log(LRthresh) < sqrt(.Machine$double.eps)
    
    # Last row to keep
    last = match(TRUE, keepRows, nomatch = 0L) + 1L
    jointRes = jointRes[1:last, , drop = FALSE]
  }
  
  empty = sapply(jointRes, function(v) all(v == "*"))
  jointRes[, !empty, drop = FALSE]
}


# Function for computing the total log-likelihood of a single assignment
loglikAssign = function(pm, am, vics, assignment, loglik0, logliks.PM, logliks.AM) {
  
  # Victims which actually move
  vicMove = vics[assignment != "*"]
  mpsMove = assignment[assignment != "*"]
  
  if(length(vicMove) == 0)
    return(loglik0)
  
  # The relevant AM components
  compNo = unique.default(getComponent(am, mpsMove, checkUnique = TRUE))
  
  # Move victim data
  changedComps = transferMarkers(pm[vicMove], am[compNo], idsFrom = vicMove, 
                                 idsTo = mpsMove, erase = FALSE)

  # Update likelihood of modified AM comps
  logliks.AM.new = logliks.AM
  logliks.AM.new[compNo] = vapply(changedComps, function(a) loglikTotal(a), FUN.VALUE = 1)
  
  # Likelihood of remaining PMs
  logliks.PM.new = logliks.PM[setdiff(vics, vicMove)]
  
  # Return total loglik of assignments
  sum(logliks.PM.new) + sum(logliks.AM.new)
}