R/CalcOHLLR.R

Defines functions CalcOHLLR

Documented in CalcOHLLR

#=======================================================================
#' @title Calculate OH and LLR for a pedigree
#'
#' @description Count opposite homozygous (OH) loci between parent-offspring
#'   pairs and Mendelian errors (ME) between parent-parent-offspring trios, and
#'   calculate the parental log-likelihood ratios (LLR).
#'
#' @details Any individual in \code{Pedigree} that does not occur in
#'   \code{GenoM} is substituted by a dummy individual; these can be recognised
#'   by the value 0' in columns 'SNPd.id.dam' and `SNPd.id.sire` in the output.
#'   For non-genotyped individuals the parental log-likelihood ratio can be
#'   calculated if they have at least one genotyped offspring (see also
#'   \code{\link{getAssignCat}}).
#'
#'   The birth years in \code{LifeHistData} and the \code{AgePrior} are not used
#'   in the calculation and do not affect the value of the likelihoods for the
#'   various relationships, but they _are_ used during some filtering steps, and
#'   may therefore affect the likelihood _ratio_. The default
#'   (\code{AgePrior=FALSE}) assumes all age-relationship combinations are
#'   possible, which may mean that some additional alternatives are considered
#'   compared to the \code{\link{sequoia}} default, resulting in somewhat lower
#'   \code{LLR} values.
#'
#'   A negative LLR for A's parent B indicates either that B is not truely the
#'   parent of A, or that B's parents are incorrect. The latter may cause B's
#'   presumed true, unobserved genotype to divert from its observed genotype,
#'   with downstream consequences for its offspring. In rare cases it may also
#'   be due to 'weird', non-implemented double or triple relationships between A
#'   and B.
#'
#'
#' @param Pedigree  dataframe with columns id-dam-sire. May include
#'   non-genotyped individuals, which will be treated as dummy individuals. If
#'   provided, any pedigree in \code{SeqList} is ignored.
#' @param CalcLLR  calculate log-likelihood ratios for all assigned parents
#'   (genotyped + dummy/non-genotyped; parent vs. otherwise related). If
#'   \strong{\code{FALSE}}, only number of mismatching SNPs are counted (OH &
#'   ME), and parameters \code{LifeHistData}, \code{AgePrior}, \code{Err},
#'   \code{Tassign}, and \code{Complex} are \strong{ignored}. Note also that
#'   calculating likelihood ratios is much more time consuming than counting OH
#'   & ME.
#' @param AgePrior logical (\code{TRUE/FALSE}) whether to estimate the ageprior
#'   from \code{Pedigree} and \code{LifeHistData}, or a matrix as generated by
#'   \code{\link{MakeAgePrior}} and included in the \code{\link{sequoia}}
#'   output. The \code{AgePrior} affects which relationships are considered
#'   possible: only those where \eqn{P(A|R) / P(A) > 0}.  When \code{TRUE},
#'   \code{\link{MakeAgePrior}} is called using its default values. When
#'   \code{FALSE}, all relationships are considered possible for all age
#'   differences, except that parent-offspring pairs cannot have age difference
#'   zero, and grand-parental pairs have an age difference of at least two.
#' @param SeqList list with output from \code{\link{sequoia}}. If input
#'   parameter \code{Pedigree=NULL}, \code{SeqList$Pedigree} will be used if
#'   present, and \code{SeqList$PedigreePar} otherwise. If \code{SeqList$Specs}
#'   is present, input parameters with the same name as its items are ignored,
#'   except 'CalcLLR' and 'AgePriors=FALSE'. The list elements  `LifeHist',
#'   `AgePriors', and `ErrM' are also used if present, and override the
#'   corresponding input parameters.
#' @param quiet logical, suppress messages
#' @inheritParams sequoia
#'
#' @return The \code{Pedigree} dataframe with additional columns:
#'  \item{LLRdam}{Log10-Likelihood Ratio (LLR) of this female being the mother,
#'  versus the next most likely relationship between the focal individual and
#'  this female (see Details for relationships considered)}
#'  \item{LLRsire}{idem, for male parent}
#'  \item{LLRpair}{LLR for the parental pair, versus the next most likely
#'   configuration between the three individuals (with one or neither parent
#'   assigned)}
#'  \item{OHdam}{Number of loci at which the offspring and mother are
#'    opposite homozygotes}
#'  \item{OHsire}{idem, for father}
#'  \item{MEpair}{Number of Mendelian errors between the offspring and the
#'    parent pair, includes OH as well as e.g. parents being opposing
#'    homozygotes, but the offspring not being a heterozygote. The offspring
#'    being OH with both parents is counted as 2 errors.}
#'  \item{SNPd.id}{Number of SNPs scored (non-missing) for the focal individual}
#'  \item{SNPd.id.dam}{Number of SNPs scored (non-missing) for both individual
#'    and dam}
#'  \item{SNPd.id.sire}{Number of SNPs scored for both individual and sire}
#'  \item{Sexx}{Sex in LifeHistData, or inferred Sex when assigned as part of
#'    parent-pair}
#'  \item{BY.est}{mode of birth year probability distribution}
#'  \item{BY.lo}{lower limit of 95\% highest density region of birth year
#'  probability distribution}
#'  \item{BY.hi}{higher limit}
#'
#' The columns 'LLRdam', 'LLRsire' and 'LLRpair' are only included when
#' \code{CalcLLR=TRUE}. When a parent or parent-pair is incompatible with the
#' lifehistory data or presumed genotyping error rate, the error value '777' may
#' be given.
#'
#' The columns 'Sexx', 'BY.est', 'BY.lo' and 'BY.hi' are only included when
#' \code{LifeHistData} is provided, and at least one genotyped individual has an
#' unknown birth year or unknown sex.
#'
#'
#' @seealso \code{\link{SummarySeq}} for visualisation of OH & LLR
#'   distributions; \code{\link{CalcPairLL}} for the likelihoods underlying the
#'   LLR, \code{\link{GenoConvert}} to read in various genotype data formats,
#'   \code{\link{CheckGeno}}; \code{\link{PedPolish}} to check and 'polish' the
#'   pedigree; \code{\link{getAssignCat}} to find which id-parent pairs are both
#'   genotyped or can be substituted by dummy individuals; \code{\link{sequoia}}
#'   for pedigree reconstruction.
#'
#' @examples
#' # count Mendelian errors in an existing pedigree
#' Ped.OH <- CalcOHLLR(Pedigree = Ped_HSg5, GenoM = SimGeno_example,
#'                     CalcLLR = FALSE)
#' Ped.OH[50:55,]
#' # view histograms
#' SummarySeq(Ped.OH, Panels="OH")
#'
#' # Parent likelihood ratios in an existing pedigree, including for
#' # non-genotyped parents
#' Ped.LLR <- CalcOHLLR(Pedigree = Ped_HSg5, GenoM = SimGeno_example,
#'                     CalcLLR = TRUE, LifeHistData=LH_HSg5, AgePrior=TRUE)
#' SummarySeq(Ped.LLR, Panels="LLR")
#'
#' \dontrun{
#' # likelihood ratios change with presumed genotyping error rate:
#' Ped.LLR.B <- CalcOHLLR(Pedigree = Ped_HSg5, GenoM = SimGeno_example,
#'                     CalcLLR = TRUE, LifeHistData=LH_HSg5, AgePrior=TRUE,
#'                     Err = 0.005)
#' SummarySeq(Ped.LLR.B, Panels="LLR")
#'
#' # run sequoia with CalcLLR=FALSE, and add OH + LLR later:
#' SeqOUT <- sequoia(Geno_griffin, LH_griffin, CalcLLR=FALSE,quiet=TRUE,Plot=FALSE)
#' PedA <- CalcOHLLR(Pedigree = SeqOUT[["Pedigree"]][, 1:3], GenoM = Genotypes,
#'   LifeHistData = LH_griffin, AgePrior = TRUE, Complex = "full")
#' SummarySeq(PedA, Panels=c("LLR", "OH"))
#' }
#'
#' @useDynLib sequoia, .registration = TRUE
#
#' @export

CalcOHLLR <- function(Pedigree = NULL,
                      GenoM = NULL,
                      CalcLLR = TRUE,
                      LifeHistData = NULL,
                      AgePrior = FALSE,
                      SeqList = NULL,
                      Err = 1e-4,
                      ErrFlavour = "version2.0",
                      Tassign = 0.5,
                      Tfilter = -2.0,
                      Complex = "full",
                      Herm = "no",
                      quiet = FALSE)
{

  on.exit(.Fortran(deallocall), add=TRUE)

  # check genotype data ----
  GenoM <- CheckGeno(GenoM, quiet=TRUE, Plot=FALSE)

  # unpack SeqList ----
  if (!is.null(SeqList)) {
    NewName = c(Pedigree = "Pedigree",
                PedigreePar = "Pedigree",
                LifeHist = "LifeHistData",
                AgePriors = "AgePrior")
    for (x in names(NewName)) {
	  if (x %in% c("Pedigree", "PedigreePar") & !is.null(Pedigree))  next
      if (x %in% names(SeqList)) {
        if (x == "PedigreePar" & "Pedigree" %in% names(SeqList))  next
        if (x == "AgePrior" & AgePrior == FALSE)  next
        if (!quiet)  message("using ", x, " in SeqList")
        assign(NewName[x], SeqList[[x]])
      }
    }
  }

  #~~~~~~~~~
  # drop individuals not in Pedigree from GenoM
  PedX <- PedPolish(Pedigree)  # ensure all dams & sires have own id row
  gID <- intersect(rownames(GenoM), PedX$id)
  GenoM <- GenoM[gID,]
  Ped <- PedPolish(Pedigree, gID=gID, DropNonSNPd=FALSE, NullOK = FALSE)
  Ped <- Ped[, !names(Ped) %in% c("OHdam", "OHsire", "MEpair",
                                     "SNPd.id.dam", "SNPd.id.sire",
                                     "LLRdam", "LLRsire", "LLRpair")]

  #~~~~~~~~~
  LhIN <- CheckLH(LifeHistData, rownames(GenoM), sorted=FALSE)

  # check/make ageprior ----
  if (is.logical(AgePrior) && AgePrior %in% c(TRUE, FALSE)) {  # not: NA
    if (AgePrior & CalcLLR) {  # if !CalcLLR, AgePrior not used at all.
      AP <- MakeAgePrior(Pedigree=Ped, LifeHistData,
                         Plot = FALSE, quiet = quiet)
    } else {
      AP <- MakeAgePrior(Pedigree=NULL, LifeHistData,
                         MaxAgeParent = 99, Plot=FALSE, quiet = TRUE)  # non-informative ageprior
    }
  } else if (is.matrix(AgePrior) | is.data.frame(AgePrior)) {
    AP <- CheckAP(AgePrior)
  } else {
    stop("'AgePrior' must be TRUE or FALSE, or a matrix")
  }


  # Specs / param ----
  if ("Specs" %in% names(SeqList)) {
    PARAM <- SpecsToParam(SeqList$Specs, SeqList$ErrM, ErrFlavour,
                          dimGeno = dim(GenoM), CalcLLR, quiet)
                          # overrule values in SeqList
  } else {
    PARAM <- namedlist(dimGeno = dim(GenoM),
                       CalcLLR,
                       Err,
                       ErrFlavour,
                       Tfilter,
                       Tassign,
                       nAgeClasses = nrow(AP),
                       MaxSibshipSize = max(table(Ped$dam), table(Ped$sire), 90,
                                            na.rm=TRUE) +10,
                       Complex,
                       Herm,
                       quiet)
    PARAM$ErrM <- ErrToM(Err, flavour = ErrFlavour, Return = "matrix")
  }

  # MaxMismatch ----
  # vector with max. mismatches for duplicates, PO pairs, PPO trios
  if (!"MaxMismatchV" %in% names(PARAM)) {  # DUP/OH/ME from version 2.0 onwards
    sts <- SnpStats(GenoM, Plot=FALSE)
    PARAM$MaxMismatchV <- setNames(CalcMaxMismatch(Err=PARAM$ErrM,
                                                   MAF=sts[,"AF"],
                                                   ErrFlavour=PARAM$ErrFlavour,
                                                   qntl=0.9999^(1/nrow(GenoM))),
                                   c("DUP", "OH", "ME"))
  }

  # check parameter values ----
  CheckParams(PARAM)
  utils::flush.console()


  #=========================
  # call fortran ----
  Ng <- nrow(GenoM)
  gID <- rownames(GenoM)
  LHF <- orderLH(LhIN, gID)
  FortPARAM <- MkFortParams(PARAM, fun="CalcOH")

  # turn IDs into numbers, & turn non-genotyped parents into temporary dummies
  if (CalcLLR) {
    PedN <- PedToNum(Ped, gID, DoDummies = "new")
  } else {  # ignore non-genotyped individuals
    PedN <- PedToNum(Ped, gID, DoDummies = "no")
  }

  TMP <- .Fortran(getpedllr,
                  ng = as.integer(Ng),
                  specsint = as.integer(FortPARAM$SpecsInt),
                  specsintmkped = as.integer(FortPARAM$SpecsIntMkPed),
                  specsdbl = as.double(FortPARAM$SpecsDbl),
									errv = as.double(FortPARAM$ErrM),
#                  calcllr = as.integer(CalcLLR),   # mv to SpecsIntMkPed
                  genofr = as.integer(GenoM),
                  sexrf = as.integer(LHF$Sex),
                  byrf = as.integer(c(LHF$BirthYear, LHF$BY.min, LHF$BY.max)),
                  aprf = as.double(AP),
                  parentsrf = as.integer(PedN$PedPar),

                  ohrf = integer(3*Ng),
                  lrrf = double(3*Ng),
                  snpdboth = integer(2*Ng),
                  dumparrf = as.integer(PedN$DumPar),
                  dumlrrf = double(3*Ng),
									dumbyrf = integer(3*Ng) )

  #=========================
  # polish output ----
  TMP$lrrf[abs(TMP$lrrf - 999) < 0.1] <- NA
  TMP$lrrf <- round(TMP$lrrf, 2)
	TMP$dumlrrf[abs(TMP$dumlrrf - 999) < 0.1] <- NA
	TMP$dumlrrf <- round(TMP$dumlrrf, 2)
  TMP$ohrf[TMP$ohrf < 0] <- NA
  TMP$snpdboth[TMP$snpdboth < 0] <- NA

  NewCols <- data.frame(id = gID,
                        VtoM(TMP$lrrf, nc=3),
                        VtoM(TMP$ohrf, nc=3),
                        SNPd.id = apply(GenoM, 1, function(x) sum(x>=0)),
                        VtoM(TMP$snpdboth, nc=2))
  names(NewCols) <- c("id", "LLRdam", "LLRsire", "LLRpair",
                      "OHdam", "OHsire", "MEpair",
                      "SNPd.id", "SNPd.id.dam", "SNPd.id.sire")
	if (!is.null(LifeHistData) & any(LHF$BirthYear < 0 | LHF$Sex == 3)) {
		LhOUT <- data.frame(Sexx = TMP$sexrf,
												BY.est = TMP$byrf[1:Ng],
												BY.lo = TMP$byrf[1:Ng + Ng],
												BY.hi = TMP$byrf[1:Ng + 2*Ng],
												stringsAsFactors = FALSE)
		LhOUT$BY.est[LhOUT$BY.est < 0] <- NA
		NewCols <- cbind(NewCols, LhOUT)
	}

  if (any(PedN$Nd > 0)) {
    NgOdd <- Ng%%2==1
    DumCols <- data.frame(id=c(PedN$Renamed$dam[,"name"],
                               PedN$Renamed$sire[,"name"]),
                          VtoM(TMP$dumlrrf, sum(PedN$Nd), 3, NgOdd),
                          OHdam=NA, OHsire=NA, MEpair=NA)
    names(DumCols)[2:4] <- c("LLRdam", "LLRsire", "LLRpair")
    DumCols$SNPd.id <- 0
    DumCols$SNPd.id.dam <- ifelse(!is.na(DumCols$LLRdam), 0, NA)
    DumCols$SNPd.id.sire <- ifelse(!is.na(DumCols$LLRsire), 0, NA)
    if (!is.null(LifeHistData) & any(LHF$BirthYear < 0 | LHF$Sex == 3)) {
      DumCols <- cbind(DumCols,
                       Sexx = rep(1:2, PedN$Nd),
                       VtoM(TMP$dumbyrf, sum(PedN$Nd),3, NgOdd))
      names(DumCols)[12:14] <- c("BY.est", "BY.lo", "BY.hi")
    }
    NewCols <- rbind(NewCols, DumCols)
  }
  if (!CalcLLR) {
    NewCols <- NewCols[, -c(2:4)]
  }
  Ped$rownum <- seq_along(Ped$id)
  Ped <- merge(Ped, NewCols, all.x=TRUE)  # merge changes order
  Ped <- Ped[order(Ped$rownum), ]
  Ped <- Ped[, names(Ped)!="rownum"]
  rownames(Ped) <- seq_along(Ped$id)

  ColumnOrder <- c("id", "dam", "sire",
                   "LLRdam", "LLRsire", "LLRpair",
                   "OHdam", "OHsire", "MEpair",
                   "SNPd.id", "SNPd.id.dam", "SNPd.id.sire",
                   "BY.est", "BY.lo", "BY.hi")
  Ped <- Ped[, intersect(ColumnOrder, names(Ped))]

  return( Ped )
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Try the sequoia package in your browser

Any scripts or data that you put into this service are public.

sequoia documentation built on Sept. 8, 2023, 5:29 p.m.