R/GetMaybeRel.R
In sequoia: Pedigree Inference from SNPs

Documented in GetMaybeRel

#' @title Find Putative Relatives
#'
#' @description Identify pairs of individuals likely to be related, but not
#'   assigned as such in the provided pedigree.
#'
#' @param SeqList list with output from \code{\link{sequoia}}.
#'   \code{SeqList$Pedigree} is used if present, and \code{SeqList$PedigreePar}
#'   otherwise, and overrides the input parameter \code{Pedigree}. If 'Specs' is
#'   present, its elements override all input parameters with the same name. The
#'   list elements  `LifeHist', `AgePriors', and `ErrM' are also used if
#'   present, and similarly override the corresponding input parameters.
#' @param Pedigree dataframe with id - dam - sire in columns 1-3. May include
#'   non-genotyped individuals, which will be treated as dummy individuals. When
#'   provided, all likelihoods (and thus all maybe-relatives) are conditional on
#'   this pedigree. Note: \code{SeqList$Pedigree} or \code{SeqList$PedigreePar} take
#'   precedent (for this function only).
#' @param AgePrior Agepriors matrix, as generated by \code{\link{MakeAgePrior}}
#'   and included in the \code{\link{sequoia}} output. Affects which
#'   relationships are considered possible (only those where \eqn{P(A|R) / P(A)
#'   > 0}).
#' @param Module type of relatives to check for. One of
#'   \describe{
#'     \item{par}{parent - offspring pairs}
#'     \item{ped}{all first and second degree relatives}
#'   }
#'   When 'par', all pairs are returned that are more likely parent-offspring
#'   than unrelated, potentially including pairs that are even more likely to
#'   be otherwise related.
#' @param MaxPairs  the maximum number of putative pairs to return.
#' @param quiet logical, suppress messages.
#' @param ParSib \strong{DEPRECATED, use \code{Module}} either 'par' to check
#'   for putative parent-offspring pairs only, or 'sib' to check for all types
#'   of first and second degree relatives.
#' @inheritParams sequoia
#'
#' @return A list with
#' \item{MaybePar}{A dataframe with non-assigned likely parent-offspring pairs,
#'  with columns:
#'     \itemize{
#'       \item ID1
#'       \item ID2
#'       \item TopRel: the most likely relationship, using abbreviations listed
#'         below
#'        \item LLR: Log10-Likelihood Ratio between most likely and next most
#'        likely relationship
#'        \item OH: Number of loci at which the two individuals are opposite
#'        homozygotes
#'        \item BirthYear1: Birth year of ID1 (copied from LifeHistData)
#'        \item BirthYear2
#'        \item AgeDif: Age difference; BirthYear1 - BirthYear2
#'        \item Sex1: Sex of ID1 (copied from LifeHistData)
#'        \item Sex2
#'        \item SnpdBoth: Number of loci at which the two individuals are both
#'        successfully genotyped
#'     }}
#' \item{MaybeRel}{A dataframe with non-assigned likely pairs of relatives,
#' with columns identical to \code{MaybePar}}
#' \item{MaybeTrio}{A dataframe with non-assigned parent-parent-offspring
#'     trios, with columns:
#'     \itemize{
#'       \item ID
#'       \item parent1
#'       \item parent2
#'       \item TopRel: the most likely relationship, using abbreviations listed
#'         below
#'        \item LLRparent1: Log10-Likelihood Ratio between parent1 being a
#'        parent of ID vs the next most likely relationship between the pair,
#'        ignoring parent2
#'        \item LLRparent2: as LLRparent1
#'        \item LLRpair: LLR for the parental pair, versus the next most likely
#'        configuration between the three individuals (with one or neither
#'        parent assigned)
#'        \item OHparent1: Number of loci at which ID and parent1 are opposite
#'        homozygotes
#'        \item OHparent2: as OHparent1
#'        \item MEpair: Number of Mendelian errors between the offspring and the
#'    parent pair, includes OH as well as e.g. parents being opposing
#'    homozygotes, but the offspring not being a heterozygote. The offspring
#'    being OH with both parents is counted as 2 errors.
#'        \item SNPd.id.parent1: Number of loci at which ID and parent1 are both
#'        successfully genotyped
#'        \item SNPd.id.parent2: as SNPd.id.parent1
#'     }}
#' The following categories are used in column 'TopRel', indicating the most
#' likely relationship category:
#' \item{PO}{Parent-Offspring}
#' \item{FS}{Full Siblings}
#' \item{HS}{Half Siblings}
#' \item{GP}{GrandParent - grand-offspring}
#' \item{FA}{Full Avuncular (aunt/uncle)}
#' \item{2nd}{2nd degree relatives, not enough information to distinguish
#'   between HS,GP and FA}
#' \item{Q}{Unclear, but probably 1st, 2nd or 3rd degree relatives}
#'
#' @details When \code{Module="par"}, the age difference of the putative pair is
#'   temporarily set to NA so that genetic parent-offspring pairs declared to be
#'   born in the same year may be discovered. When \code{Module="ped"}, only
#'   relationships possible given the age difference, if known from the
#'   LifeHistData, are considered.
#'
#' @seealso \code{\link{sequoia}} to identify likely pairs of duplicate
#'   genotypes and for pedigree reconstruction; \code{\link{GetRelM}} to
#'   identify all pairs of relatives in a pedigree; \code{\link{CalcPairLL}} for
#'   the likelihoods underlying the LLR.
#'
#' @examples
#' \dontrun{
#' # without conditioning on pedigree
#' MaybeRel_griffin <- GetMaybeRel(GenoM=Geno_griffin, Err=0.001, Module='par')
#' }
#' names(MaybeRel_griffin)
#'
#' # conditioning on pedigree
#' MaybePO <- GetMaybeRel(GenoM = Geno_griffin, SeqList = SeqOUT_griffin,
#'                       Module = 'par')
#' head(MaybePO$MaybePar)
#'
#' # instead of providing the entire SeqList, one may specify the relevant
#' # elements separately
#' Maybe <- GetMaybeRel(GenoM = Geno_griffin,
#'                      Pedigree = SeqOUT_griffin$PedigreePar,
#'                      LifeHistData = LH_griffin,
#'                      Err=0.0001, Complex = "full",
#'                      Module = "ped")
#' head(Maybe$MaybeRel)
#'
#' # visualise results, turn dataframe into matrix first:
#' MaybeM <- GetRelM(Pairs = Maybe$MaybeRel)
#' PlotRelPairs(MaybeM)
#' # or combine with pedigree (note suffix '?')
#' RelM <- GetRelM(Pedigree =SeqOUT_griffin$PedigreePar, Pairs = Maybe$MaybeRel)
#' PlotRelPairs(RelM)
#'
#' @useDynLib sequoia, .registration = TRUE
#'
#' @export

GetMaybeRel <- function(GenoM = NULL,
                        SeqList = NULL,
                        Pedigree = NULL,
                        LifeHistData = NULL,
                        AgePrior = NULL,
                        Module = "par",
                        Complex = "full",
                        Herm = "no",
                        Err = 0.0001,
                        ErrFlavour = "version2.9",
                        Tassign = 0.5,
                        Tfilter = -2.0,
                        MaxPairs =  7*nrow(GenoM),
                        quiet = FALSE,
                        ParSib = NULL,  # DEPRECATED
                        MaxMismatch = NA)   # DEPRECATED
{
  on.exit(.Fortran(deallocall), add=TRUE)

  # backwards compatibility, until ParSib deprecated:
  if (!is.null(ParSib)) {
    if (ParSib %in% c("par", "sib", "ped")) {
      Module <- switch(ParSib, par = "par", sib = "ped", ped="ped")
    } else {
      stop("'ParSib' must be 'par' or 'sib'; or use only 'Module' instead")
    }
  }
  if (!Module %in% c("par", "ped"))  stop("'Module' must be 'par' or 'ped'")
  if(!quiet)  cli::cli_alert_info(paste0("Searching for non-assigned ",
                      c(par="parent-offspring", ped="relative")[Module], " pairs ...",
                      " (Module = ", Module, ")"))


  # unpack SeqList ----
  if (!is.null(SeqList)) {
    if (!inherits(SeqList, 'list'))  stop("'SeqList' must be a list")
    NewName = c(Pedigree = "Pedigree",
                PedigreePar = "Pedigree",
                LifeHist = "LifeHistData",
                AgePriors = "AgePrior")
    for (x in names(NewName)) {
#      if (x %in% c("Pedigree", "PedigreePar") & !is.null(Pedigree))  next
## SeqList$Pedigree takes precedent for this function only.
      if (x %in% names(SeqList)) {
        if (x == "PedigreePar" & "Pedigree" %in% names(SeqList))  next
        if (!quiet) cli::cli_alert_info("using {x} in SeqList")
        assign(NewName[x], SeqList[[x]])
      }
    }
  }

  # input check ----
  if (!quiet %in% c(TRUE, FALSE))  stop("'quiet' must be TRUE or FALSE")

  GenoM <- CheckGeno(GenoM, quiet=quiet, Plot=FALSE)
  gID <- rownames(GenoM)

  Pedigree <- PedPolish(Pedigree, gID, DropNonSNPd = FALSE,
                        NullOK = TRUE)   # OK if no pedigree
  if (!quiet) {
    if (is.null(Pedigree)) {
      cli::cli_alert_info("Not conditioning on any pedigree")
    } else {
      N <- c(i=nrow(Pedigree), d=sum(!is.na(Pedigree$dam)), s=sum(!is.na(Pedigree$sire)))
      cli::cli_alert_info("Conditioning on pedigree with {N['i']} individuals, {N['d']} dams and {N['s']} sires")
    }
  }

  LH <- CheckLH(LifeHistData, gID, sorted=TRUE)

  if (!is.null(AgePrior)) {
    AgePrior <- CheckAP(AgePrior)
  } else {
    AgePrior <- MakeAgePrior(Pedigree, LifeHistData, Plot=FALSE, quiet=quiet)
  }


  # Specs / param ----
  if ("Specs" %in% names(SeqList)) {
    if(!quiet)  cli::cli_alert_info("settings in SeqList$Specs will overrule input parameters")
    SeqList$Specs$Module <- Module
    PARAM <- SpecsToParam(SeqList$Specs, SeqList$ErrM, ErrFlavour,
                          dimGeno = dim(GenoM), Module, MaxPairs, quiet)  # overrule values in SeqList
  } else {
    PARAM <- namedlist(dimGeno = dim(GenoM),
                       MaxPairs,
                       Err,
                       ErrFlavour,
                       Tfilter,
                       Tassign,
                       nAgeClasses = nrow(AgePrior),
                       MaxSibshipSize = max(table(Pedigree$dam), table(Pedigree$sire), 90,
                                            na.rm=TRUE) +10,
                       Module = as.character(Module),
                       Complex,
                       Herm,
                       quiet)
    PARAM$ErrM <- ErrToM(Err, flavour = ErrFlavour, Return = "matrix")
  }

  # MaxMismatch ----
  # vector with max. mismatches for duplicates, PO pairs, PPO trios
  if (!"MaxMismatchV" %in% names(PARAM)) {  # DUP/OH/ME from version 2.0 onwards
    sts <- SnpStats(GenoM, Plot=FALSE)
    PARAM$MaxMismatchV <- setNames(CalcMaxMismatch(Err=PARAM$ErrM,
                                                   MAF=sts[,"AF"],
                                                   ErrFlavour=PARAM$ErrFlavour,
                                                   qntl=0.9999^(1/nrow(GenoM))),
                                   c("DUP", "OH", "ME"))
  }


  # check parameter values ----
  CheckParams(PARAM)
  utils::flush.console()

  if (any(LifeHistData$Sex==4) && PARAM$Herm == "no") {  #!grepl("herm", PARAM$Complex)) {
    if (!quiet) cli::cli_alert_warning("detected hermaphrodites (sex=4), changing Herm to 'A'")
#    PARAM$Complex <- "herm"
    PARAM$Herm <- "A"
  }


  #=========================
  # call Fortran ----
  Ng <- nrow(GenoM)
  gID <- rownames(GenoM)
  FortPARAM <- MkFortParams(PARAM, fun="mayberel")

  # turn IDs into numbers, & turn non-genotyped parents into temporary dummies
  PedN <- PedToNum(Pedigree, gID, DoDummies = "new")  # list: PedPar - DumPar - Renamed - Nd
  LHF <- orderLH(LH, gID)

  TMP <- .Fortran(findambig,
                  ng = as.integer(Ng),
                  specsint = as.integer(FortPARAM$SpecsInt),
                  specsintamb = as.integer(FortPARAM$SpecsIntAmb),
                  specsdbl = as.double(FortPARAM$SpecsDbl),
                  errv = as.double(FortPARAM$ErrM),
                  genofr = as.integer(GenoM),
                  sexrf = as.integer(LHF$Sex),
                  byrf = as.integer(c(LHF$BirthYear, LHF$BY.min, LHF$BY.max)),
                  aprf = as.double(AgePrior),
                  parentsrf = as.integer(PedN$PedPar),
                  dumparrf = as.integer(PedN$DumPar),

                  namb = as.integer(0),
                  ambigid = integer(2*MaxPairs),
                  ambigrel = integer(2*MaxPairs),
                  ambiglr = double(2*MaxPairs),
                  ambigoh = integer(MaxPairs),

                  ntrio = as.integer(0),
                  trioids = integer(3*Ng),
                  triolr = double(3*Ng),
                  triooh = integer(3*Ng))

  TMP$ambiglr[abs(TMP$ambiglr - 999) < 0.1] <- NA
  TMP$ambiglr <- round(TMP$ambiglr, 2)
  TMP$ambigoh[TMP$ambigoh < 0] <- NA
  TMP$triolr[abs(TMP$triolr - 999) < 0.1] <- NA
  TMP$triolr <- round(TMP$triolr, 2)
  TMP$triooh[TMP$triooh < 0] <- NA

  #=========================
  # non-assigned probable relative pairs
  if (TMP$namb > 0) {
    RelName <- c("PO", "FS", "HS", "GP", "FA", "HA", "U ", "Q", "2nd")
    Na <- TMP$namb
    TMP$ambigid <- NumToID(TMP$ambigid, 0, gID, NULL)   # maybe-rel only between genotyped indivs
    AmbigRel <- factor(TMP$ambigrel, levels=1:9, labels=RelName)

    MaybeRel <- data.frame(VtoM(TMP$ambigid, Na),
                           VtoM(AmbigRel, Na),
                           VtoM(TMP$ambiglr, Na),
                           stringsAsFactors=FALSE)
    names(MaybeRel) <- c("ID1", "ID2", "Relx", "TopRel", "LLR_Rx_U", "LLR")
    MaybeRel <- MaybeRel[,-which(names(MaybeRel) %in% c("Relx", "LLR_Rx_U"))]  # drop; confusing.
    MaybeRel$OH <-  TMP$ambigoh[seq_len(Na)]

    if (nrow(MaybeRel)>0) {
      LH$BirthYear[LH$BirthYear<0] <- NA
      MaybeRel <- merge(MaybeRel, setNames(LH[,1:3], c("ID1","Sex1","BirthYear1")), all.x=TRUE)
      MaybeRel <- merge(MaybeRel, setNames(LH[,1:3], c("ID2","Sex2","BirthYear2")), all.x=TRUE)
      MaybeRel$AgeDif <- with(MaybeRel, BirthYear1 - BirthYear2)
      MaybeRel <- MaybeRel[, c("ID1", "ID2", "TopRel", "LLR", "OH",
                               "BirthYear1", "BirthYear2", "AgeDif", "Sex1", "Sex2")]
      for (i in 1:Na) {
        if (is.na(MaybeRel$AgeDif[i]))  next
        if (MaybeRel$AgeDif[i] < 0) {
          tmpRel <- MaybeRel[i,]
          tmpRel$AgeDif <- abs(tmpRel$AgeDif)
          MaybeRel[i,] <- tmpRel[,c("ID2","ID1","TopRel", "LLR", "OH",
                                    "BirthYear2", "BirthYear1","AgeDif", "Sex2", "Sex1")]
        }
      }
    }

    if (Module == "ped") {
      MaybeRel <- with(MaybeRel, MaybeRel[TopRel %in% c("PO", "FS", "HS","GP",
                                                        "FA", "2nd", "Q"), ])  # drop HA,XX,U
    }
    MaybeRel <- MaybeRel[order(ordered(MaybeRel$TopRel, levels=RelName),
                               -MaybeRel$LLR),]

    #~~~~~~~~
    CalcSnpdBoth <- function(Pairs, GenoM) {
      sapply(seq_along(Pairs[,1]), function(i, G=GenoM) {
        sum(G[Pairs[i,1], ] >=0 & G[Pairs[i,2], ] >=0)
      })
    }
    #~~~~~~~~

    if (nrow(MaybeRel)==0) {
      MaybeRel <- NULL
    } else {
      rownames(MaybeRel) <- 1:nrow(MaybeRel)
      MaybeRel$SNPdBoth <- CalcSnpdBoth(MaybeRel[, c("ID1", "ID2")], GenoM)
    }
  } else  MaybeRel <- NULL

  if (quiet<1) {   #  && nrow(MaybeRel)>0
    if (!is.null(MaybeRel)) {
      nRel <- nrow(MaybeRel)
      nPO <- sum(MaybeRel$TopRel=="PO")
    } else {
      nRel <- 0
      nPO <- 0
    }
    cli::cli_alert_success(c("Found {nPO} likely parent-offspring pairs, ",
            "and {nRel-nPO}, other non-assigned pairs of possible relatives"))
  }

  #=========================
  # non-assigned parent-parent-offspring trios
  if (TMP$ntrio>0) {
    trios <- data.frame(VtoM(TMP$trioids, nr=TMP$ntrio, nc=3),
                        VtoM(TMP$triolr, nr=TMP$ntrio, nc=3),
                        VtoM(TMP$triooh, nr=TMP$ntrio, nc=3),
                        stringsAsFactors=FALSE)
    names(trios) <- c("id", "parent1", "parent2", "LLRparent1", "LLRparent2", "LLRpair",
                      "OHparent1", "OHparent2", "MEpair")
    for (k in 1:3) trios[, k] <- NumToID(trios[, k], k-1, gID, NULL)
    trios$SNPd.id.parent1 <- CalcSnpdBoth(trios[, c("id", "parent1")], GenoM)
    trios$SNPd.id.parent2 <- CalcSnpdBoth(trios[, c("id", "parent2")], GenoM)
    if (quiet<1)   cli::cli_alert_success("Found {nrow(trios)} parent-parent-offspring trios")
  } else  trios <- NULL

  if (Module == "par") {
    return( list(MaybePar = MaybeRel,
                 MaybeTrio = trios) )
  } else {
    return( list(MaybeRel = MaybeRel,
                 MaybeTrio = trios) )
  }
}