R/ComparePeds.R

Defines functions tdf SibMatch Vcomp PedCompare

Documented in PedCompare SibMatch Vcomp

#============================================================================
#============================================================================
#' @title Compare Two Pedigrees
#'
#' @description Compare an inferred pedigree (Ped2) to a previous or simulated
#'   pedigree (Ped1), including comparison of sibship clusters and sibship
#'   grandparents.
#'
#' @details The comparison is divided into different classes of `assignable'
#'   parents (\code{\link{getAssignCat}}). This includes cases where the focal
#'   individual and parent according to Ped1 are both Genotyped (G-G), as well
#'   as cases where the non-genotyped parent according to Ped1 can be lined up
#'   with a sibship Dummy parent in Ped2 (G-D), or where the non-genotyped focal
#'   individual in Ped1 can be matched to a dummy individual in Ped2 (D-G and
#'   D-D). If SNPd is NULL (the default), and DumPrefix is set to NULL, the
#'   intersect between the IDs in Pedigrees 1 and 2 is taken as the vector of
#'   genotyped individuals.
#'
#' @param  Ped1 first (e.g. original) pedigree, dataframe with columns
#'   id-dam-sire; only the first 3 columns will be used.
#' @param  Ped2 second pedigree, e.g. newly inferred \code{SeqOUT$Pedigree} or
#'   \code{SeqOUT$PedigreePar}, with columns id-dam-sire.
#' @param  DumPrefix  character vector with the prefixes identifying dummy
#'   individuals in \code{Ped2}. Use 'F0' ('M0') to avoid matching to regular
#'   individuals with IDs starting with 'F' ('M'), provided \code{Ped2} has
#'   fewer than 999 dummy females (males).
#' @param SNPd character vector with IDs of genotyped individuals. If
#'   \code{NULL}, defaults to the IDs occurring in both \code{Ped1} and
#'   \code{Ped2} and not starting with any of the prefixes in \code{DumPrefix}.
#' @param Symmetrical  when determining the category of individuals
#'   (Genotyped/Dummy/X), use the 'highest' category across the two pedigrees
#'   (\code{TRUE}, default) or only consider \code{Ped1} (\code{Symmetrical =
#'   FALSE}).
#' @param minSibSize  minimum requirements to be considered 'dummifiable',
#'   passed to \code{\link{getAssignCat}}:
#'   \itemize{
#'      \item '1sib' : sibship of size 1, with or without grandparents. The
#'      latter aren't really a sibship, but can be useful in some situations.
#'      \item '1sib1GP': sibship of size 1 with at least 1 grandparent (default)
#'      \item '2sib': at least 2 siblings, with or without grandparents
#'        (default prior to version 2.4)
#'  }
#' @param Plot show square Venn diagrams of counts?
#'
#' @return A list with
#' \item{Counts}{A 7 x 5 x 2 named numeric array with the
#'   number of matches and mismatches, see below}
#' \item{Counts.detail}{a large numeric array with number of matches and
#'   mismatches, with more detail for all possible combination of categories}
#' \item{MergedPed}{A dataframe with side-by-side comparison of the two
#'   pedigrees}
#' \item{ConsensusPed}{A consensus pedigree, with Pedigree 2 taking priority
#'   over Pedigree 1}
#' \item{DummyMatch}{Dataframe with all dummy IDs in Pedigree 2 (id.2), and the
#' best-matching individual in Pedigree 1 (id.1). Also includes the class of the
#' dam & sire, as well as counts of offspring per outcome class (off.Match,
#' off.Mismatch, etc.)}
#' \item{Mismatch}{A subset of MergedPed with mismatches between Ped1 and Ped2,
#'  as defined below}
#' \item{Ped1only}{as Mismatches, with parents in Ped1 that were not assigned
#'   in Ped2}
#' \item{Ped2only}{as Mismatches, with parents in Ped2 that were missing in
#'   Ped1}
#'
#' 'MergedPed', 'Mismatch', 'Ped1only' and 'Ped2only' provide the following
#' columns:
#' \item{id}{All ids in both Pedigree 1 and 2. For dummy individuals, this is
#' the id \emph{in pedigree 2}}
#' \item{dam.1, sire.1}{parents in Pedigree 1}
#' \item{dam.2, sire.2}{parents in Pedigree 2}
#' \item{id.r, dam.r, sire.r}{The \emph{real} id of dummy individuals or parents
#' in Pedigree 2, i.e. the best-matching non-genotyped individual in Pedigree 1,
#' or "nomatch". If a sibship in Pedigree 1 is divided over 2 sibships in
#' Pedigree 2, the smaller one will be denoted as "nomatch"}
#' \item{id.dam.cat, id.sire.cat}{the category of the individual (first letter)
#' and \emph{highest category} of the dam (sire) in Pedigree 1 or 2:
#' G=Genotyped, D=(potential) dummy, X=none. Individual, one-letter categories
#' are generated by \code{\link{getAssignCat}}. Using the 'best' category from
#' both pedigrees makes comparison between two inferred pedigrees symmetrical
#' and more intuitive.}
#' \item{dam.class, sire.class}{classification of dam and sire: Match, Mismatch,
#' P1only, P2only, or '_' when no parent is assigned in either pedigree}
#'
#' The first dimension of \code{Counts} denotes the following categories:
#' \item{GG}{Genotyped individual, assigned a genotyped parent in either
#'   pedigree}
#' \item{GD}{Genotyped individual, assigned a dummy parent, or at least 1
#'   genotyped sibling or a genotyped grandparent in Pedigree 1)}
#' \item{GT}{Genotyped individual, total}
#' \item{DG}{Dummy individual, assigned a genotyped parent (i.e., grandparent
#'    of the sibship in Pedigree 2)}
#' \item{DD}{Dummy individual, assigned a dummy parent (i.e., avuncular
#'   relationship between sibships in Pedigree 2)}
#' \item{DT}{Dummy total}
#' \item{TT}{Total total, includes all genotyped individuals, plus
#'   non-genotyped individuals in Pedigree 1, plus non-replaced dummy
#'   individuals (see below) in Pedigree 2}
#'
#' The second dimension of \code{Counts} gives the outcomes:
#' \item{Total}{The total number of individuals with a parent assigned in
#'    either or both pedigrees}
#' \item{Match}{The same parent is assigned in both pedigrees (non-missing).
#'      For dummy parents, it is considered a match if the inferred sibship
#'      which contains the most offspring of a non-genotyped parent, consists
#'      for more than half of this individual's offspring.}
#' \item{Mismatch}{Different parents assigned in the two pedigrees. When
#'    a sibship according to Pedigree 1 is split over two sibships in Pedigree
#'    2, the smaller fraction is included in the count here.}
#' \item{P1only}{Parent in Pedigree 1 but not 2; includes non-assignable
#'     parents (e.g. not genotyped and no genotyped offspring).}
#' \item{P2only}{Parent in Pedigree 2 but not 1.}
#'
#' The third dimension \code{Counts} separates between maternal and paternal
#' assignments, where e.g. paternal 'DT' is the assignment of fathers to both
#' maternal and paternal sibships (i.e., to dummies of both sexes).
#'
#' In 'ConsensusPed', the priority used is parent.r (if not "nomatch") >
#'   parent.2 > parent.1. The columns 'id.cat', dam.cat' and 'sire.cat' have two
#'   additional levels compared to 'MergedPed':
#'   \item{G}{Genotyped}
#'   \item{D}{Dummy individual (in Pedigree 2)}
#'   \item{R}{Dummy individual in pedigree 2 replaced by best matching
#'     non-genotyped individual in pedigree 1}
#'   \item{U}{Ungenotyped, Unconfirmed (parent in Pedigree 1, with no dummy
#'     match in Pedigree 2)}
#'   \item{X}{No parent in either pedigree}
#'
#' @section Assignable:
#'    Note that 'assignable' may be overly optimistic. Some parents from
#'   \code{Ped1} indicated as assignable may never be assigned by sequoia, for
#'   example parent-offspring pairs where it cannot be determined which is the
#'   older of the two, or grandparents that are indistinguishable from full
#'   avuncular (i.e. genetics inconclusive because the candidate has no parent
#'   assigned, and ageprior inconclusive).
#'
#' @section Dummifiable:
#'   Considered as potential dummy individuals are all non-genotyped individuals
#'   in Pedigree 1 who have, according to either pedigree, at least 2 genotyped
#'   offspring, or at least one genotyped offspring and a genotyped parent.
#'
#' @section Mismatches:
#'   Perhaps unexpectedly, cases where all siblings are correct but a dummy
#'   parent rather than the genotyped Ped1-parent are assigned, are classified
#'   as a mismatch (for each of the siblings). These are typically due to a too
#'   low assumed genotyping error rate, a wrong parental birth year, or some
#'   other issue that requires user inspection. To identify these cases,
#'   \code{\link{ComparePairs}} may be of help.
#'
#' @section Genotyped 'mystery samples':
#'  If Pedigree 2 includes samples for which the ID is unknown, the behaviour of
#'  \code{PedCompare} depends on whether the temporary IDs for these samples are
#'  included in \code{SNPd}. If they are included, matching (actual) IDs in
#'  Pedigree 1 will be flagged as mismatches (because the IDs differ). If they
#'  are not included in \code{SNPd}, or \code{SNPd} is not explicitly provided,
#'  matches are accepted, as the situation is indistinguishable from comparing
#'  dummy parents across pedigrees.
#'
#'  This is of course all conditional on relatives of the mystery sample being
#'  assigned in Pedigree 2.
#'
#'
#' @author Jisca Huisman, \email{jisca.huisman@gmail.com}
#'
#' @seealso \code{\link{ComparePairs}} for comparison of all pairwise
#'   relationships in 2 pedigrees; \code{\link{EstConf}} for repeated
#'   simulate-reconstruct-compare; \code{\link{getAssignCat}} for all parents in
#'   the reference pedigree that could have been assigned;
#'   \code{\link{CalcOHLLR}} to check how well an 'old' pedigree fits with the
#'   SNP data.
#'
#' @examples
#' compare <- PedCompare(Ped_griffin, SeqOUT_griffin$Pedigree)
#' compare$Counts["TT",,]  # totals only; 45 dams & 47 sires non-assigned
#' compare$Counts[,,"dam"]  # dams only
#'
#' # inspect non-assigned in Ped2, id genotyped, dam might-be-dummy
#' PedM <- compare$MergedPed  # for brevity
#' PedM[PedM$id.dam.cat=='GD' & PedM$dam.class=='P1only',]
#' # zoom in on specific dam
#' PedM[which(PedM$dam.1=="i011_2001_F"), ]
#' # no sire for 'i034_2002_F' -> impossible to tell if half-sibs or avuncular
#'
#' # overview of all non-genotyped -- dummy matches
#' head(compare$DummyMatch)
#'
#' # success of paternity assignment, if genotyped mother correctly assigned
#' dimnames(compare$Counts.detail)
#' compare$Counts.detail["G","G",,"Match",]
#'
#' # default before version 3.5: minSibSize = '2sib'
#' compare_2s <- PedCompare(Ped_griffin, SeqOUT_griffin$Pedigree,
#'                          minSibSize = '2sib')
#' compare_2s$Counts[,,"dam"]  # note decrease in Total 'dummies
#' with(compare_2s$MergedPed, table(id.dam.cat, dam.class))
#' # some with id.cat = 'X' or dam.cat='X' are nonetheless dam.class='Match'
#' @export

PedCompare <- function(Ped1 = NULL,
                       Ped2 = NULL,
                       DumPrefix = c("F0", "M0"),
                       SNPd = NULL,
                       Symmetrical = TRUE,
                       minSibSize = "1sib1GP",
                       Plot = TRUE)
{
  Ped1 <- PedPolish(Ped1, ZeroToNA=TRUE, NullOK = FALSE, StopIfInvalid=FALSE, KeepAllColumns=FALSE)
  Ped2 <- PedPolish(Ped2, ZeroToNA=TRUE, NullOK = FALSE, StopIfInvalid=FALSE, KeepAllColumns=FALSE)
  if (!any(Ped2$id %in% Ped1$id))  stop("no common IDs in Ped1 and Ped2")

  if (is.null(SNPd)) {
    SNPd <- intersect(Ped2$id, Ped1$id)
    if (!is.null(DumPrefix)) {
      for (x in seq_along(DumPrefix)) {
        SNPd <- SNPd[substr(SNPd,1,nchar(DumPrefix[x])) != DumPrefix[x] ]
      }
    }
  }
  if (sum(SNPd %in% Ped2$id)==0)  stop("none of 'SNPd' in Ped2")
  SNPd <- stats::na.exclude(SNPd)

  names(Ped1) <- c("id", "dam.1", "sire.1")
  names(Ped2) <- c("id", "dam.2", "sire.2")
  PedX <- merge(Ped1, Ped2[Ped2$id %in% SNPd, ], all.y=TRUE)
  DumPed <- Ped2[!Ped2$id %in% SNPd, ]
  Dummies <- list(dam = DumPed$id[DumPed$id %in% Ped2$dam],
                  sire = DumPed$id[DumPed$id %in% Ped2$sire])

  Par <- rbind(dam =c("dam.1", "dam.2", "dam.r"),
               sire=c("sire.1", "sire.2", "sire.r"))


  #===  match dummies to non-genotyped parents  ===
  Sibs1 <- list()
  SibScore <- list()
  DumReal <- list()
  for (p in 1:2) {
    Sibs1[[p]] <- split(PedX$id, PedX[, Par[p,1]])
    Sibs1[[p]] <- Sibs1[[p]][!names(Sibs1[[p]]) %in% SNPd]
  }
  NoDummies <- with(PedX, all(dam.2 %in% SNPd | is.na(dam.2)) &
                      all(sire.2 %in% SNPd | is.na(sire.2)))
  if (!NoDummies) {
    for (p in 1:2) {
      Sibs2 <- split(PedX$id, PedX[, Par[p,2]])
      Sibs2 <- Sibs2[!names(Sibs2) %in% SNPd]

      if (length(Sibs1[[p]])>0 & length(Sibs2)>0) {
        # This does the actual matching:
        SibScore[[p]] <- tdf(sapply(Sibs1[[p]], SibMatch, Sibs2, SNPd))

        if (length(stats::na.exclude(SibScore[[p]][, "Best"])) >
            length(unique(stats::na.exclude(SibScore[[p]][, "Best"])))) {
          SibScore[[p]] <- SibScore[[p]][order(SibScore[[p]][, "OK"],
                                               decreasing=TRUE), ]
          dups <- duplicated(SibScore[[p]][, "Best"], incomparables=NA)
          BestS.d <- unique(SibScore[[p]]$Best[dups])
          if (sum(dups) > 1) {
            SibScore[[p]][dups, "Er"] <- rowSums(SibScore[[p]][dups, c("OK", "Er")])
          } else {
            SibScore[[p]][dups, "Er"] <- sum(SibScore[[p]][dups, c("OK", "Er")])
          }
          SibScore[[p]][dups, c("Best", "OK")] <- NA
          for (s in seq_along(BestS.d)) {
            tmp <- SibScore[[p]][which(SibScore[[p]]$Best==BestS.d[s]), ]
            if (tmp$OK==1 & tmp$Er>=1) {
              SibScore[[p]][which(SibScore[[p]]$Best==BestS.d[s]),
                            c("Best", "OK")] <- NA
            }
          }
        }
        tmp <- SibScore[[p]][!is.na(SibScore[[p]][, "Best"]), ]
        DumReal[[p]] <- as.data.frame(cbind(real = rownames(tmp),
                              dummy = tmp$Best), stringsAsFactors=FALSE)
        DumReal[[p]] <- merge(DumReal[[p]], DumPed[DumPed$id %in% Dummies[[p]],],
                              by.x="dummy", by.y="id", all=TRUE)[,c("real", "dummy")]
        DumReal[[p]][,"real"][is.na(DumReal[[p]][,"real"])] <- "nomatch"
      } else {
        SibScore[[p]] <- NA
        DumReal[[p]] <- data.frame(real=NA, dummy=NA)
      }
    }
    for (p in 1:2) {
      tmp <- stats::setNames(DumReal[[p]], c(Par[p,3], Par[p,2]))
      PedX <- merge(PedX, tmp, all.x=TRUE)
      DumPed <- merge(DumPed, DumReal[[p]], by.x="id", by.y="dummy", all.x=TRUE,
                      suffixes = c(".x",".y"))
    }
    DumPed$id.r <- apply(DumPed[,c("real.x", "real.y")], 1, function(x)
      ifelse(all(is.na(x)), NA, min(x, na.rm=T)))
    DumPed <- DumPed[, c("id", "dam.2", "sire.2", "id.r")]
    DumPed <- merge(DumPed, Ped1, all.x=TRUE, by.x="id.r", by.y="id",
                    suffixes=c(".2", ".1"))
    for (p in 1:2) {
      DumPed <- merge(DumPed, stats::setNames(DumReal[[p]], c(Par[p,3], Par[p,2])),
                      all.x=TRUE)
    }
  } else {
    PedX$dam.r <- NA
    PedX$sire.r <- NA
  }

  #===  Combined pedigree  ===
  PedX$id.r <- PedX$id
  PedY <- merge(PedX, DumPed, all=TRUE, sort=FALSE)  # NA's for id.r = "nomatch"
  PedY <- merge(PedY, stats::setNames(Ped1, c("id.r", "dam.1", "sire.1" )),
                all=TRUE, sort=FALSE)

  for (p in 1:2) {
    PedTmp <- PedY[which(!PedY[,Par[p,2]] %in% SNPd &
                           !is.na(PedY[,Par[p,2]]) &
                           PedY[,Par[p,3]]!="nomatch"),]
    tbl <- table(PedTmp[,Par[p,2]])
    npar1 <- plyr::daply(PedTmp, Par[p,2],
                   function(x) length(unique(stats::na.exclude(x[,Par[p,1]]))))
    unmatch <- names(which(c(tbl)==npar1 & npar1>1))
    if (length(unmatch)>0) {
      PedY[which(PedY[,Par[p,2]] %in% unmatch), Par[p,3]] <- "nomatch"
    }
  }
  PedY <- PedY[, c("id", "dam.1", "sire.1", "dam.2", "sire.2",
                 "id.r", "dam.r", "sire.r")]  # fix column order


  #===  Categorise  ===
  CatNames <- c("G", "D", "X")

  Ped.a <- getAssignCat(PedY[, c("id.r", "dam.1", "sire.1")], SNPd,
                        minSibSize = minSibSize)
  PedZ <- merge(PedY, Ped.a[, c("id", "id.cat", "dam.cat", "sire.cat")],
                by.x="id.r", by.y="id", all.x=TRUE)

  if (any(is.na(PedZ$id.cat)) | any(is.na(PedZ$dam.cat)) | any(is.na(PedZ$sire.cat)))
    stop('PedCompare() batman bug!!')

  if (Symmetrical) {  # take 'highest' category across the 2 pedigrees
    Ped.b <- getAssignCat(PedY[, c("id", "dam.2", "sire.2")], SNPd,
                          minSibSize = minSibSize)
    PedZ <- merge(PedZ, Ped.b[,c("id", "id.cat", "dam.cat", "sire.cat")],
                  by="id", all.x=TRUE, suffixes=c(".1", ".2"))
    for (x in paste0(c("id", "dam", "sire"), ".cat.2")) {
      PedZ[is.na(PedZ[,x]), x] <- "X"
    }
    for (x in c(outer(c("id", "dam", "sire"), 1:2, paste, sep=".cat."))) {
      PedZ[,x] <- factor(PedZ[,x], levels=CatNames)
    }
    for (p in c("id", "dam", "sire")) {
      PedZ[, paste0(p, ".cat")] <- factor(pmin(as.numeric(PedZ[,paste0(p,".cat.1")]),
                                               as.numeric(PedZ[,paste0(p,".cat.2")])),
                                          levels=1:3, labels=CatNames)
    }
  } else {
    for (p in c("id", "dam", "sire")) {
      PedZ[, paste0(p, ".cat")] <- factor(PedZ[, paste0(p, ".cat")],
                                          levels=CatNames)
    }
  }


  #===  Classify: match/mismatch  ===
  ClassNames <- c("Match", "Mismatch", "P1only", "P2only", "_")
  for (p in c("dam", "sire")) {
    PP <- PedZ[,paste(p, c(1,2,"r"), sep=".")]
    PedZ[, paste0(p, ".class")] <- ifelse(is.na(PP[,1]) & is.na(PP[,2]), "_",
                                           ifelse(!is.na(PP[,1]) & is.na(PP[,2]), "P1only",
                                              ifelse(is.na(PP[,1]) & !is.na(PP[,2]), "P2only",
                                                 ifelse(PP[,1]==PP[,2] |
                                                          (!is.na(PP[,3]) & PP[,1] == PP[,3]),
                                                        "Match", "Mismatch"))))
  }


  #===  Count  ===
  Counts.all <- with(PedZ, table(id.cat, dam.cat, sire.cat,
                                 dam.class = factor(dam.class, levels=ClassNames),
                                 sire.class = factor(sire.class, levels=ClassNames)))

  PedZ$id.dam.cat <- with(PedZ, paste0(id.cat, dam.cat))  # need 'X' for (sub)totals
  PedZ$id.sire.cat <- with(PedZ, paste0(id.cat, sire.cat))
  Cat2Names <- c("GG", "GD", "GT", "DG", "DD", "DT", "TT")
  ClassNames <- c("Total", "Match", "Mismatch", "P1only", "P2only")
  Score <- array(0, dim=c(7, 5, 2),
                 dimnames=list(cat=Cat2Names, class=ClassNames, parent=c("dam", "sire")))
  for (p in c("dam", "sire")) {
    PedP <- PedZ[!is.na(PedZ[,paste0(p,".1")]) | !is.na(PedZ[,paste0(p,".2")]), ]
    Score[,,p] <-  table(factor(PedP[,paste0("id.",p, ".cat")], levels=Cat2Names),
                         factor(PedP[,paste0(p, ".class")], levels=ClassNames))
    Score[,"Total",p] <- table(factor(PedP[,paste0("id.",p, ".cat")], levels=Cat2Names))

    Score[c("GT", "DT"),,p] <- table(factor(PedP[,"id.cat"], levels=c("G","D")),
                                     factor(PedP[,paste0(p, ".class")], levels=ClassNames))
    Score[c("GT", "DT"),"Total",p] <- table(factor(PedP[,"id.cat"], levels=c("G","D")))
    Score["TT",,p] <- table(factor(PedP[,paste0(p, ".class")], levels=ClassNames))  # = colSums(Score[c("GT", "DT"),,p])
    Score["TT", "Total", p] <- nrow(PedP)
  }


  #===  consensus pedigree  ===
  PedC <- with(PedZ, data.frame(
    id = ifelse(!is.na(id.r) & id.r!="nomatch", id.r, id),
    id.catc = ifelse(id.cat=="G", "G",
                     ifelse(id.r == "nomatch", "D", "R")),
    stringsAsFactors = FALSE))
  for (p in c("dam", "sire")) {
    PP <- PedZ[,paste(p, c(1,2,"r"), sep=".")]
    PedC[,paste0(p, ".c")] <- ifelse(!is.na(PP[,3]) & PP[,3]!="nomatch", PP[,3],
                   ifelse(!is.na(PP[,2]), PP[,2], PP[,1]))
    PedC[, paste0(p, ".cat")] <- ifelse(!is.na(PP[,1]) & is.na(PP[,2]), "U",   # Unconfirmed
                                     ifelse(PP[,2] %in% Dummies[[p]],
                                        ifelse(PP[,3]=="nomatch", "D", "R"),
                                        as.character(PedZ[,paste0(p, ".cat")])))
  }
  PedC <- PedC[, c("id", "dam.c", "sire.c", "id.catc", "dam.cat", "sire.cat")]  # re-order columns
  names(PedC)[4] <- "id.cat"

  PedZ <- PedZ[, c(names(PedY), "id.dam.cat", "id.sire.cat", "dam.class", "sire.class")]

  #===  short-list non-matching parents  ===
  NoMatch <- list()
  for (x in c("Mismatch", "P1only", "P2only")) {
    NoMatch[[x]] <- PedZ[PedZ$dam.class == x | PedZ$sire.class == x, ]
  }



  #===  dummy match  ===================
  if (!NoDummies) {
    DumPed$Sex <- ifelse(DumPed$id %in% Dummies$dam, 1, 2)
    DumMatch <- merge(DumPed[, c("id", "id.r", "Sex")],
                      PedZ[, c("id", "dam.class", "sire.class")],
                      all.x = TRUE)

    OffMatch <-  sapply(seq_along(DumMatch$id), function(i) {
      k <- DumMatch$Sex[i]
      tmp <- PedZ[which(PedZ[,Par[k,1]] == DumMatch$id.r[i] | PedZ[,Par[k,2]] == DumMatch$id[i]), ]
      table(factor(tmp[, paste0(c("dam","sire")[k] ,".class")],
                   levels = c("Match", "Mismatch", "P1only", "P2only")))
    })

    names(DumMatch)[1:2] <- c("id.2", "id.1")
    rownames(OffMatch) <- paste0("off.", rownames(OffMatch))
    DumMatch <- cbind(DumMatch, t(OffMatch))
    DumMatch <- DumMatch[order(DumMatch$id.2), ]
  } else {
    DumMatch <- NULL
  }


  #===  plot  ==========================
  if (Plot) {
    cat.props <- Score[,"Total",] / max(Score["TT","Total",])
    img <- tryCatch(
      {
        suppressWarnings(PlotPedComp(Counts = Score,
                                     sameSize = any(cat.props > 0 & cat.props < 0.01)))
      },
      error = function(e) {
        message("PlotPedComp: Plotting area too small")
        return(NA)
      })
  }


  #===  out  ===========================
  return( c(list(Counts = Score,
                 Counts.detail = Counts.all,
                 MergedPed = PedZ,
                 ConsensusPed = PedC,
                 DummyMatch = DumMatch),
            NoMatch) )

}



#============================================================================
#============================================================================
# Utils functions for comparisons

#======================================================================
#' Compare two vectors
#'
#' Compare a vector with inferred sibs to a vector of `true' sibs
#'
#' @param  Infrd  vector of inferred sibs
#' @param  Simld  vector of true sibs
#' @param  SNPd character vector with IDs of genotyped individuals
#'
#' @return a named numeric vector of length 4, with the total length of Simld,
#'   the length of the intersect of the two vectors, the number occurring in
#'   Infrd but not Simld ('err'), and the number occuring in Simld but not
#'   Infrd ('missed').
#'
#' @keywords internal

Vcomp <- function(Infrd, Simld, SNPd)
{
  out <- c("total" = length(Simld),
           "both" = length(intersect(Infrd, Simld)),
           "err" = length(setdiff(Infrd[Infrd %in% SNPd], Simld)),
           "missed" = length(setdiff(Simld, Infrd)))
  out
}


#======================================================================
#' Find the closest matching inferred sibship to a true sibship
#'
#' @param SimX  a vector with the IDs in the true (Ped1) sibship
#' @param Infrd  a list of vectors with the IDs in the inferred (Ped2) sibships
#' @param SNPd character vector with IDs of genotyped individuals
#'
#' @return a named numeric vector with the number of matches ('NumMatch'),
#'   the position of the best match ('Best'), the inferred sibship size of
#'   this best match ('Tot'), the number of matching IDs ('OK'), and the
#'   number of mismatches ('err').
#'
#' @keywords internal

SibMatch <- function(SimX, Infrd, SNPd)
{
  VC <- sapply(Infrd, Vcomp, SimX, SNPd)
  mtch <- which(VC["both",]>0)
  if (length(mtch)>1) {
    if (VC["total",1]==2) {
      if (all(VC["err", mtch]==0)) {
        mtch <- which.max(VC["both",])[1]  # pair split into two singletons
      } else {
        mtch <- NULL  # no clear match
      }
    } else {
      mtch <- which.max(VC["both",])[1]  # which inferred sibship has most members of true sibship
    }
  }
  if (length(mtch)==1) {
    if (VC["err", mtch] > VC["both", mtch]) {  # was: >=
      mtch <- NULL
    }
  }
  if (length(mtch)==1) {
    OUT <- data.frame(NumMatch = sum(VC[2,]>0),
             Best = colnames(VC)[mtch],  # as.numeric(substr(colnames(VC)[mtch],3,5)),
             Tot = VC["total", mtch],
             OK = VC["both", mtch],
             Er = VC["err", mtch], stringsAsFactors=FALSE)
  } else {
    OUT <- data.frame(NumMatch=0, Best=NA,  Tot= length(SimX), OK = NA, Er = NA)
  }
  OUT
}


#======================================================================
# transpose matrix to data.frame  (sapply returns wrong format)
tdf <- function(M)
{
  DF <- matrix(NA, ncol(M), nrow(M))
  for (r in 1:nrow(M)) {
    DF[, r] <- unlist(M[r, ])
  }
  DF <- as.data.frame(DF, stringsAsFactors = FALSE, row.names=colnames(M))
  for (r in 1:ncol(DF)) {
    if (allNumeric(DF[, r]))  DF[, r] <- as.numeric(DF[, r])
  }
  names(DF) <- rownames(M)
  DF
}


#======================================================================
# adapted from Hmisc::all.is.numeric
allNumeric <- function (x, what = c("test", "vector"))
{
  what <- match.arg(what)
  isnum <- suppressWarnings(!any(is.na(as.numeric(stats::na.exclude(x)))))
  if (what == "test")
    isnum
  else if (isnum)
    as.numeric(x)
  else x
}


#============================================================================
#============================================================================

Try the sequoia package in your browser

Any scripts or data that you put into this service are public.

sequoia documentation built on Sept. 8, 2023, 5:29 p.m.