R/DuplicateCheck.R
In sequoia: Pedigree Inference from SNPs

Documented in DuplicateCheck

#' @title Check Data for Duplicates.
#'
#' @description Check the genotype and life history data for duplicate IDs (not
#'   permitted) and duplicated genotypes (not advised), and count how many
#'   individuals in the genotype data are not included in the life history data
#'   (permitted). The order of IDs in the genotype and life history data is not
#'   required to be identical.
#'
#' @param GenoM matrix with genotype data, size nInd x nSnp.
#' @param FortPARAM.dup list with Fortran-ready parameter values, as generated by
#'   \code{\link{MkFortParams}}.
#' @param quiet suppress messages.
#'
#' @return A list with one or more of the following elements:
#' \item{DupGenoID}{Dataframe, row numbers of duplicated IDs in genotype data.
#'   Please do remove or relabel these to avoid downstream confusion.}
#' \item{DupGenotype}{Dataframe, duplicated genotypes (with or without identical
#'   IDs). The specified number of maximum mismatches is allowed, and this
#'   dataframe may include pairs of closely related individuals. Mismatch =
#'   number of SNPs at which genotypes differ, LLR = likelihood ratio between
#'   'self' and most likely non-self.}
#'
#' @seealso \code{\link{CheckLH}}, which performs the check for duplicated IDs
#'   in the life history data, as well as for IDs (in genotype data) for which
#'   no life history data is provided.
#'
#' @useDynLib sequoia, .registration = TRUE
# @useDynLib sequoia duplicates
#'
#' @keywords internal

DuplicateCheck <- function(GenoM = NULL,
                           FortPARAM.dup,
                           quiet)
{
  on.exit(.Fortran(deallocall), add=TRUE)

  # AgePriors not used, LifeHist not passed to Fortran
  AP.dup <- MakeAgePrior(MaxAgeParent=10, Plot=FALSE, quiet=TRUE)
  FortPARAM.dup[["SpecsInt"]]["nAgeCl"] <- nrow(AP.dup)

  Ng <- nrow(GenoM)
  gID <- rownames(GenoM)

  DUP <- .Fortran(duplicates,
                  ng = as.integer(FortPARAM.dup$Ng),
                  specsint = as.integer(FortPARAM.dup$SpecsInt),
                  specsdbl = as.double(FortPARAM.dup$SpecsDbl),
                  errv = as.double(FortPARAM.dup$ErrM),
                  genofr = as.integer(GenoM),

                  sexrf = as.integer(rep(3, Ng)),
                  byrf = as.integer(rep(-999, 3*Ng)),
                  aprf = as.double(AP.dup),

                  ndupgenos = as.integer(0),
                  dupgenos = integer(2*Ng),
                  nmismatch = integer(Ng),
                  snpdboth = integer(Ng),
                  duplr = double(Ng))

  Duplicates <- list()
  if (any(duplicated(gID))) {
    r1 <- which(duplicated(gID))
    r2 <- which(duplicated(gID, fromLast=TRUE))
    Duplicates$DupGenoID <- data.frame(row1 = r1,
                                       row2 = r2,
                                       ID = gID[r1])
  }
  if(DUP$ndupgenos>0) {
    tmp <- VtoM(DUP$dupgenos, DUP$ndupgenos)
    Duplicates$DupGenotype <- data.frame(row1 = tmp[, 1],
                                         row2 = tmp[, 2],
                                         ID1 = gID[tmp[, 1]],
                                         ID2 = gID[tmp[, 2]],
                                         Mismatch = DUP$nmismatch[seq_len(DUP$ndupgenos)],
                                         SnpdBoth = DUP$snpdboth[seq_len(DUP$ndupgenos)],
                                         LLR = DUP$duplr[seq_len(DUP$ndupgenos)])
    Duplicates$DupGenotype <- Duplicates$DupGenotype[order(Duplicates$DupGenotype$LLR,
                                                           decreasing=TRUE), ]
  }

  # print warnings
  if (!quiet) {
    if (any(duplicated(gID))) cli::cli_alert_danger(c("duplicate IDs found in `GenoM`, ",
                                                       "please remove to avoid confusion"))
    if (DUP$ndupgenos>0 && DUP$ndupgenos > sum(duplicated(gID))) {
      cli::cli_alert_info(c('Found {ifelse(DUP$ndupgenos == Ng, ">=", "")}',
              "{DUP$ndupgenos} likely duplicate genotypes,",
              "please double check and consider removing"), wrap=TRUE)
    }
  }

  return( Duplicates )
}