Nothing
#============================================================================
#============================================================================
#' @title Compare Two Pedigrees
#'
#' @description Compare an inferred pedigree (Ped2) to a previous or simulated
#' pedigree (Ped1), including comparison of sibship clusters and sibship
#' grandparents.
#'
#' @details The comparison is divided into different classes of `assignable'
#' parents (\code{\link{getAssignCat}}). This includes cases where the focal
#' individual and parent according to Ped1 are both Genotyped (G-G), as well
#' as cases where the non-genotyped parent according to Ped1 can be lined up
#' with a sibship Dummy parent in Ped2 (G-D), or where the non-genotyped focal
#' individual in Ped1 can be matched to a dummy individual in Ped2 (D-G and
#' D-D). If SNPd is NULL (the default), and DumPrefix is set to NULL, the
#' intersect between the IDs in Pedigrees 1 and 2 is taken as the vector of
#' genotyped individuals.
#'
#' @param Ped1 first (e.g. original) pedigree, dataframe with columns
#' id-dam-sire; only the first 3 columns will be used.
#' @param Ped2 second pedigree, e.g. newly inferred \code{SeqOUT$Pedigree} or
#' \code{SeqOUT$PedigreePar}, with columns id-dam-sire.
#' @param DumPrefix character vector with the prefixes identifying dummy
#' individuals in \code{Ped2}. Use 'F0' ('M0') to avoid matching to regular
#' individuals with IDs starting with 'F' ('M'), provided \code{Ped2} has
#' fewer than 999 dummy females (males).
#' @param SNPd character vector with IDs of genotyped individuals. If
#' \code{NULL}, defaults to the IDs occurring in both \code{Ped1} and
#' \code{Ped2} and not starting with any of the prefixes in \code{DumPrefix}.
#' @param Symmetrical when determining the category of individuals
#' (Genotyped/Dummy/X), use the 'highest' category across the two pedigrees
#' (\code{TRUE}, default) or only consider \code{Ped1} (\code{Symmetrical =
#' FALSE}).
#' @param minSibSize minimum requirements to be considered 'dummifiable',
#' passed to \code{\link{getAssignCat}}:
#' \itemize{
#' \item '1sib' : sibship of size 1, with or without grandparents. The
#' latter aren't really a sibship, but can be useful in some situations.
#' \item '1sib1GP': sibship of size 1 with at least 1 grandparent (default)
#' \item '2sib': at least 2 siblings, with or without grandparents
#' (default prior to version 2.4)
#' }
#' @param Plot show square Venn diagrams of counts?
#'
#' @return A list with
#' \item{Counts}{A 7 x 5 x 2 named numeric array with the
#' number of matches and mismatches, see below}
#' \item{Counts.detail}{a large numeric array with number of matches and
#' mismatches, with more detail for all possible combination of categories}
#' \item{MergedPed}{A dataframe with side-by-side comparison of the two
#' pedigrees}
#' \item{ConsensusPed}{A consensus pedigree, with Pedigree 2 taking priority
#' over Pedigree 1}
#' \item{DummyMatch}{Dataframe with all dummy IDs in Pedigree 2 (id.2), and the
#' best-matching individual in Pedigree 1 (id.1). Also includes the class of the
#' dam & sire, as well as counts of offspring per outcome class (off.Match,
#' off.Mismatch, etc.)}
#' \item{Mismatch}{A subset of MergedPed with mismatches between Ped1 and Ped2,
#' as defined below}
#' \item{Ped1only}{as Mismatches, with parents in Ped1 that were not assigned
#' in Ped2}
#' \item{Ped2only}{as Mismatches, with parents in Ped2 that were missing in
#' Ped1}
#'
#' 'MergedPed', 'Mismatch', 'Ped1only' and 'Ped2only' provide the following
#' columns:
#' \item{id}{All ids in both Pedigree 1 and 2. For dummy individuals, this is
#' the id \emph{in pedigree 2}}
#' \item{dam.1, sire.1}{parents in Pedigree 1}
#' \item{dam.2, sire.2}{parents in Pedigree 2}
#' \item{id.r, dam.r, sire.r}{The \emph{real} id of dummy individuals or parents
#' in Pedigree 2, i.e. the best-matching non-genotyped individual in Pedigree 1,
#' or "nomatch". If a sibship in Pedigree 1 is divided over 2 sibships in
#' Pedigree 2, the smaller one will be denoted as "nomatch"}
#' \item{id.dam.cat, id.sire.cat}{the category of the individual (first letter)
#' and \emph{highest category} of the dam (sire) in Pedigree 1 or 2:
#' G=Genotyped, D=(potential) dummy, X=none. Individual, one-letter categories
#' are generated by \code{\link{getAssignCat}}. Using the 'best' category from
#' both pedigrees makes comparison between two inferred pedigrees symmetrical
#' and more intuitive.}
#' \item{dam.class, sire.class}{classification of dam and sire: Match, Mismatch,
#' P1only, P2only, or '_' when no parent is assigned in either pedigree}
#'
#' The first dimension of \code{Counts} denotes the following categories:
#' \item{GG}{Genotyped individual, assigned a genotyped parent in either
#' pedigree}
#' \item{GD}{Genotyped individual, assigned a dummy parent, or at least 1
#' genotyped sibling or a genotyped grandparent in Pedigree 1)}
#' \item{GT}{Genotyped individual, total}
#' \item{DG}{Dummy individual, assigned a genotyped parent (i.e., grandparent
#' of the sibship in Pedigree 2)}
#' \item{DD}{Dummy individual, assigned a dummy parent (i.e., avuncular
#' relationship between sibships in Pedigree 2)}
#' \item{DT}{Dummy total}
#' \item{TT}{Total total, includes all genotyped individuals, plus
#' non-genotyped individuals in Pedigree 1, plus non-replaced dummy
#' individuals (see below) in Pedigree 2}
#'
#' The second dimension of \code{Counts} gives the outcomes:
#' \item{Total}{The total number of individuals with a parent assigned in
#' either or both pedigrees}
#' \item{Match}{The same parent is assigned in both pedigrees (non-missing).
#' For dummy parents, it is considered a match if the inferred sibship
#' which contains the most offspring of a non-genotyped parent, consists
#' for more than half of this individual's offspring.}
#' \item{Mismatch}{Different parents assigned in the two pedigrees. When
#' a sibship according to Pedigree 1 is split over two sibships in Pedigree
#' 2, the smaller fraction is included in the count here.}
#' \item{P1only}{Parent in Pedigree 1 but not 2; includes non-assignable
#' parents (e.g. not genotyped and no genotyped offspring).}
#' \item{P2only}{Parent in Pedigree 2 but not 1.}
#'
#' The third dimension \code{Counts} separates between maternal and paternal
#' assignments, where e.g. paternal 'DT' is the assignment of fathers to both
#' maternal and paternal sibships (i.e., to dummies of both sexes).
#'
#' In 'ConsensusPed', the priority used is parent.r (if not "nomatch") >
#' parent.2 > parent.1. The columns 'id.cat', dam.cat' and 'sire.cat' have two
#' additional levels compared to 'MergedPed':
#' \item{G}{Genotyped}
#' \item{D}{Dummy individual (in Pedigree 2)}
#' \item{R}{Dummy individual in pedigree 2 replaced by best matching
#' non-genotyped individual in pedigree 1}
#' \item{U}{Ungenotyped, Unconfirmed (parent in Pedigree 1, with no dummy
#' match in Pedigree 2)}
#' \item{X}{No parent in either pedigree}
#'
#' @section Assignable:
#' Note that 'assignable' may be overly optimistic. Some parents from
#' \code{Ped1} indicated as assignable may never be assigned by sequoia, for
#' example parent-offspring pairs where it cannot be determined which is the
#' older of the two, or grandparents that are indistinguishable from full
#' avuncular (i.e. genetics inconclusive because the candidate has no parent
#' assigned, and ageprior inconclusive).
#'
#' @section Dummifiable:
#' Considered as potential dummy individuals are all non-genotyped individuals
#' in Pedigree 1 who have, according to either pedigree, at least 2 genotyped
#' offspring, or at least one genotyped offspring and a genotyped parent.
#'
#' @section Mismatches:
#' Perhaps unexpectedly, cases where all siblings are correct but a dummy
#' parent rather than the genotyped Ped1-parent are assigned, are classified
#' as a mismatch (for each of the siblings). These are typically due to a too
#' low assumed genotyping error rate, a wrong parental birth year, or some
#' other issue that requires user inspection. To identify these cases,
#' \code{\link{ComparePairs}} may be of help.
#'
#' @section Genotyped 'mystery samples':
#' If Pedigree 2 includes samples for which the ID is unknown, the behaviour of
#' \code{PedCompare} depends on whether the temporary IDs for these samples are
#' included in \code{SNPd}. If they are included, matching (actual) IDs in
#' Pedigree 1 will be flagged as mismatches (because the IDs differ). If they
#' are not included in \code{SNPd}, or \code{SNPd} is not explicitly provided,
#' matches are accepted, as the situation is indistinguishable from comparing
#' dummy parents across pedigrees.
#'
#' This is of course all conditional on relatives of the mystery sample being
#' assigned in Pedigree 2.
#'
#'
#' @author Jisca Huisman, \email{jisca.huisman@gmail.com}
#'
#' @seealso \code{\link{ComparePairs}} for comparison of all pairwise
#' relationships in 2 pedigrees; \code{\link{EstConf}} for repeated
#' simulate-reconstruct-compare; \code{\link{getAssignCat}} for all parents in
#' the reference pedigree that could have been assigned;
#' \code{\link{CalcOHLLR}} to check how well an 'old' pedigree fits with the
#' SNP data.
#'
#' @examples
#' compare <- PedCompare(Ped_griffin, SeqOUT_griffin$Pedigree)
#' compare$Counts["TT",,] # totals only; 45 dams & 47 sires non-assigned
#' compare$Counts[,,"dam"] # dams only
#'
#' # inspect non-assigned in Ped2, id genotyped, dam might-be-dummy
#' PedM <- compare$MergedPed # for brevity
#' PedM[PedM$id.dam.cat=='GD' & PedM$dam.class=='P1only',]
#' # zoom in on specific dam
#' PedM[which(PedM$dam.1=="i011_2001_F"), ]
#' # no sire for 'i034_2002_F' -> impossible to tell if half-sibs or avuncular
#'
#' # overview of all non-genotyped -- dummy matches
#' head(compare$DummyMatch)
#'
#' # success of paternity assignment, if genotyped mother correctly assigned
#' dimnames(compare$Counts.detail)
#' compare$Counts.detail["G","G",,"Match",]
#'
#' # default before version 3.5: minSibSize = '2sib'
#' compare_2s <- PedCompare(Ped_griffin, SeqOUT_griffin$Pedigree,
#' minSibSize = '2sib')
#' compare_2s$Counts[,,"dam"] # note decrease in Total 'dummies
#' with(compare_2s$MergedPed, table(id.dam.cat, dam.class))
#' # some with id.cat = 'X' or dam.cat='X' are nonetheless dam.class='Match'
#' @export
PedCompare <- function(Ped1 = NULL,
Ped2 = NULL,
DumPrefix = c("F0", "M0"),
SNPd = NULL,
Symmetrical = TRUE,
minSibSize = "1sib1GP",
Plot = TRUE)
{
Ped1 <- PedPolish(Ped1, ZeroToNA=TRUE, NullOK = FALSE, StopIfInvalid=FALSE, KeepAllColumns=FALSE)
Ped2 <- PedPolish(Ped2, ZeroToNA=TRUE, NullOK = FALSE, StopIfInvalid=FALSE, KeepAllColumns=FALSE)
if (!any(Ped2$id %in% Ped1$id)) stop("no common IDs in Ped1 and Ped2")
if (is.null(SNPd)) {
SNPd <- intersect(Ped2$id, Ped1$id)
if (!is.null(DumPrefix)) {
for (x in seq_along(DumPrefix)) {
SNPd <- SNPd[substr(SNPd,1,nchar(DumPrefix[x])) != DumPrefix[x] ]
}
}
}
if (sum(SNPd %in% Ped2$id)==0) stop("none of 'SNPd' in Ped2")
SNPd <- stats::na.exclude(SNPd)
names(Ped1) <- c("id", "dam.1", "sire.1")
names(Ped2) <- c("id", "dam.2", "sire.2")
PedX <- merge(Ped1, Ped2[Ped2$id %in% SNPd, ], all.y=TRUE)
DumPed <- Ped2[!Ped2$id %in% SNPd, ]
Dummies <- list(dam = DumPed$id[DumPed$id %in% Ped2$dam],
sire = DumPed$id[DumPed$id %in% Ped2$sire])
Par <- rbind(dam =c("dam.1", "dam.2", "dam.r"),
sire=c("sire.1", "sire.2", "sire.r"))
#=== match dummies to non-genotyped parents ===
Sibs1 <- list()
SibScore <- list()
DumReal <- list()
for (p in 1:2) {
Sibs1[[p]] <- split(PedX$id, PedX[, Par[p,1]])
Sibs1[[p]] <- Sibs1[[p]][!names(Sibs1[[p]]) %in% SNPd]
}
NoDummies <- with(PedX, all(dam.2 %in% SNPd | is.na(dam.2)) &
all(sire.2 %in% SNPd | is.na(sire.2)))
if (!NoDummies) {
for (p in 1:2) {
Sibs2 <- split(PedX$id, PedX[, Par[p,2]])
Sibs2 <- Sibs2[!names(Sibs2) %in% SNPd]
if (length(Sibs1[[p]])>0 & length(Sibs2)>0) {
# This does the actual matching:
SibScore[[p]] <- tdf(sapply(Sibs1[[p]], SibMatch, Sibs2, SNPd))
if (length(stats::na.exclude(SibScore[[p]][, "Best"])) >
length(unique(stats::na.exclude(SibScore[[p]][, "Best"])))) {
SibScore[[p]] <- SibScore[[p]][order(SibScore[[p]][, "OK"],
decreasing=TRUE), ]
dups <- duplicated(SibScore[[p]][, "Best"], incomparables=NA)
BestS.d <- unique(SibScore[[p]]$Best[dups])
if (sum(dups) > 1) {
SibScore[[p]][dups, "Er"] <- rowSums(SibScore[[p]][dups, c("OK", "Er")])
} else {
SibScore[[p]][dups, "Er"] <- sum(SibScore[[p]][dups, c("OK", "Er")])
}
SibScore[[p]][dups, c("Best", "OK")] <- NA
for (s in seq_along(BestS.d)) {
tmp <- SibScore[[p]][which(SibScore[[p]]$Best==BestS.d[s]), ]
if (tmp$OK==1 & tmp$Er>=1) {
SibScore[[p]][which(SibScore[[p]]$Best==BestS.d[s]),
c("Best", "OK")] <- NA
}
}
}
tmp <- SibScore[[p]][!is.na(SibScore[[p]][, "Best"]), ]
DumReal[[p]] <- as.data.frame(cbind(real = rownames(tmp),
dummy = tmp$Best), stringsAsFactors=FALSE)
DumReal[[p]] <- merge(DumReal[[p]], DumPed[DumPed$id %in% Dummies[[p]],],
by.x="dummy", by.y="id", all=TRUE)[,c("real", "dummy")]
DumReal[[p]][,"real"][is.na(DumReal[[p]][,"real"])] <- "nomatch"
} else {
SibScore[[p]] <- NA
DumReal[[p]] <- data.frame(real=NA, dummy=NA)
}
}
for (p in 1:2) {
tmp <- stats::setNames(DumReal[[p]], c(Par[p,3], Par[p,2]))
PedX <- merge(PedX, tmp, all.x=TRUE)
DumPed <- merge(DumPed, DumReal[[p]], by.x="id", by.y="dummy", all.x=TRUE,
suffixes = c(".x",".y"))
}
DumPed$id.r <- apply(DumPed[,c("real.x", "real.y")], 1, function(x)
ifelse(all(is.na(x)), NA, min(x, na.rm=T)))
DumPed <- DumPed[, c("id", "dam.2", "sire.2", "id.r")]
DumPed <- merge(DumPed, Ped1, all.x=TRUE, by.x="id.r", by.y="id",
suffixes=c(".2", ".1"))
for (p in 1:2) {
DumPed <- merge(DumPed, stats::setNames(DumReal[[p]], c(Par[p,3], Par[p,2])),
all.x=TRUE)
}
} else {
PedX$dam.r <- NA
PedX$sire.r <- NA
}
#=== Combined pedigree ===
PedX$id.r <- PedX$id
PedY <- merge(PedX, DumPed, all=TRUE, sort=FALSE) # NA's for id.r = "nomatch"
PedY <- merge(PedY, stats::setNames(Ped1, c("id.r", "dam.1", "sire.1" )),
all=TRUE, sort=FALSE)
for (p in 1:2) {
PedTmp <- PedY[which(!PedY[,Par[p,2]] %in% SNPd &
!is.na(PedY[,Par[p,2]]) &
PedY[,Par[p,3]]!="nomatch"),]
tbl <- table(PedTmp[,Par[p,2]])
npar1 <- plyr::daply(PedTmp, Par[p,2],
function(x) length(unique(stats::na.exclude(x[,Par[p,1]]))))
unmatch <- names(which(c(tbl)==npar1 & npar1>1))
if (length(unmatch)>0) {
PedY[which(PedY[,Par[p,2]] %in% unmatch), Par[p,3]] <- "nomatch"
}
}
PedY <- PedY[, c("id", "dam.1", "sire.1", "dam.2", "sire.2",
"id.r", "dam.r", "sire.r")] # fix column order
#=== Categorise ===
CatNames <- c("G", "D", "X")
Ped.a <- getAssignCat(PedY[, c("id.r", "dam.1", "sire.1")], SNPd,
minSibSize = minSibSize)
PedZ <- merge(PedY, Ped.a[, c("id", "id.cat", "dam.cat", "sire.cat")],
by.x="id.r", by.y="id", all.x=TRUE)
if (any(is.na(PedZ$id.cat)) | any(is.na(PedZ$dam.cat)) | any(is.na(PedZ$sire.cat)))
stop('PedCompare() batman bug!!')
if (Symmetrical) { # take 'highest' category across the 2 pedigrees
Ped.b <- getAssignCat(PedY[, c("id", "dam.2", "sire.2")], SNPd,
minSibSize = minSibSize)
PedZ <- merge(PedZ, Ped.b[,c("id", "id.cat", "dam.cat", "sire.cat")],
by="id", all.x=TRUE, suffixes=c(".1", ".2"))
for (x in paste0(c("id", "dam", "sire"), ".cat.2")) {
PedZ[is.na(PedZ[,x]), x] <- "X"
}
for (x in c(outer(c("id", "dam", "sire"), 1:2, paste, sep=".cat."))) {
PedZ[,x] <- factor(PedZ[,x], levels=CatNames)
}
for (p in c("id", "dam", "sire")) {
PedZ[, paste0(p, ".cat")] <- factor(pmin(as.numeric(PedZ[,paste0(p,".cat.1")]),
as.numeric(PedZ[,paste0(p,".cat.2")])),
levels=1:3, labels=CatNames)
}
} else {
for (p in c("id", "dam", "sire")) {
PedZ[, paste0(p, ".cat")] <- factor(PedZ[, paste0(p, ".cat")],
levels=CatNames)
}
}
#=== Classify: match/mismatch ===
ClassNames <- c("Match", "Mismatch", "P1only", "P2only", "_")
for (p in c("dam", "sire")) {
PP <- PedZ[,paste(p, c(1,2,"r"), sep=".")]
PedZ[, paste0(p, ".class")] <- ifelse(is.na(PP[,1]) & is.na(PP[,2]), "_",
ifelse(!is.na(PP[,1]) & is.na(PP[,2]), "P1only",
ifelse(is.na(PP[,1]) & !is.na(PP[,2]), "P2only",
ifelse(PP[,1]==PP[,2] |
(!is.na(PP[,3]) & PP[,1] == PP[,3]),
"Match", "Mismatch"))))
}
#=== Count ===
Counts.all <- with(PedZ, table(id.cat, dam.cat, sire.cat,
dam.class = factor(dam.class, levels=ClassNames),
sire.class = factor(sire.class, levels=ClassNames)))
PedZ$id.dam.cat <- with(PedZ, paste0(id.cat, dam.cat)) # need 'X' for (sub)totals
PedZ$id.sire.cat <- with(PedZ, paste0(id.cat, sire.cat))
Cat2Names <- c("GG", "GD", "GT", "DG", "DD", "DT", "TT")
ClassNames <- c("Total", "Match", "Mismatch", "P1only", "P2only")
Score <- array(0, dim=c(7, 5, 2),
dimnames=list(cat=Cat2Names, class=ClassNames, parent=c("dam", "sire")))
for (p in c("dam", "sire")) {
PedP <- PedZ[!is.na(PedZ[,paste0(p,".1")]) | !is.na(PedZ[,paste0(p,".2")]), ]
Score[,,p] <- table(factor(PedP[,paste0("id.",p, ".cat")], levels=Cat2Names),
factor(PedP[,paste0(p, ".class")], levels=ClassNames))
Score[,"Total",p] <- table(factor(PedP[,paste0("id.",p, ".cat")], levels=Cat2Names))
Score[c("GT", "DT"),,p] <- table(factor(PedP[,"id.cat"], levels=c("G","D")),
factor(PedP[,paste0(p, ".class")], levels=ClassNames))
Score[c("GT", "DT"),"Total",p] <- table(factor(PedP[,"id.cat"], levels=c("G","D")))
Score["TT",,p] <- table(factor(PedP[,paste0(p, ".class")], levels=ClassNames)) # = colSums(Score[c("GT", "DT"),,p])
Score["TT", "Total", p] <- nrow(PedP)
}
#=== consensus pedigree ===
PedC <- with(PedZ, data.frame(
id = ifelse(!is.na(id.r) & id.r!="nomatch", id.r, id),
id.catc = ifelse(id.cat=="G", "G",
ifelse(id.r == "nomatch", "D", "R")),
stringsAsFactors = FALSE))
for (p in c("dam", "sire")) {
PP <- PedZ[,paste(p, c(1,2,"r"), sep=".")]
PedC[,paste0(p, ".c")] <- ifelse(!is.na(PP[,3]) & PP[,3]!="nomatch", PP[,3],
ifelse(!is.na(PP[,2]), PP[,2], PP[,1]))
PedC[, paste0(p, ".cat")] <- ifelse(!is.na(PP[,1]) & is.na(PP[,2]), "U", # Unconfirmed
ifelse(PP[,2] %in% Dummies[[p]],
ifelse(PP[,3]=="nomatch", "D", "R"),
as.character(PedZ[,paste0(p, ".cat")])))
}
PedC <- PedC[, c("id", "dam.c", "sire.c", "id.catc", "dam.cat", "sire.cat")] # re-order columns
names(PedC)[4] <- "id.cat"
PedZ <- PedZ[, c(names(PedY), "id.dam.cat", "id.sire.cat", "dam.class", "sire.class")]
#=== short-list non-matching parents ===
NoMatch <- list()
for (x in c("Mismatch", "P1only", "P2only")) {
NoMatch[[x]] <- PedZ[PedZ$dam.class == x | PedZ$sire.class == x, ]
}
#=== dummy match ===================
if (!NoDummies) {
DumPed$Sex <- ifelse(DumPed$id %in% Dummies$dam, 1, 2)
DumMatch <- merge(DumPed[, c("id", "id.r", "Sex")],
PedZ[, c("id", "dam.class", "sire.class")],
all.x = TRUE)
OffMatch <- sapply(seq_along(DumMatch$id), function(i) {
k <- DumMatch$Sex[i]
tmp <- PedZ[which(PedZ[,Par[k,1]] == DumMatch$id.r[i] | PedZ[,Par[k,2]] == DumMatch$id[i]), ]
table(factor(tmp[, paste0(c("dam","sire")[k] ,".class")],
levels = c("Match", "Mismatch", "P1only", "P2only")))
})
names(DumMatch)[1:2] <- c("id.2", "id.1")
rownames(OffMatch) <- paste0("off.", rownames(OffMatch))
DumMatch <- cbind(DumMatch, t(OffMatch))
DumMatch <- DumMatch[order(DumMatch$id.2), ]
} else {
DumMatch <- NULL
}
#=== plot ==========================
if (Plot) {
cat.props <- Score[,"Total",] / max(Score["TT","Total",])
img <- tryCatch(
{
suppressWarnings(PlotPedComp(Counts = Score,
sameSize = any(cat.props > 0 & cat.props < 0.01)))
},
error = function(e) {
message("PlotPedComp: Plotting area too small")
return(NA)
})
}
#=== out ===========================
return( c(list(Counts = Score,
Counts.detail = Counts.all,
MergedPed = PedZ,
ConsensusPed = PedC,
DummyMatch = DumMatch),
NoMatch) )
}
#============================================================================
#============================================================================
# Utils functions for comparisons
#======================================================================
#' Compare two vectors
#'
#' Compare a vector with inferred sibs to a vector of `true' sibs
#'
#' @param Infrd vector of inferred sibs
#' @param Simld vector of true sibs
#' @param SNPd character vector with IDs of genotyped individuals
#'
#' @return a named numeric vector of length 4, with the total length of Simld,
#' the length of the intersect of the two vectors, the number occurring in
#' Infrd but not Simld ('err'), and the number occuring in Simld but not
#' Infrd ('missed').
#'
#' @keywords internal
Vcomp <- function(Infrd, Simld, SNPd)
{
out <- c("total" = length(Simld),
"both" = length(intersect(Infrd, Simld)),
"err" = length(setdiff(Infrd[Infrd %in% SNPd], Simld)),
"missed" = length(setdiff(Simld, Infrd)))
out
}
#======================================================================
#' Find the closest matching inferred sibship to a true sibship
#'
#' @param SimX a vector with the IDs in the true (Ped1) sibship
#' @param Infrd a list of vectors with the IDs in the inferred (Ped2) sibships
#' @param SNPd character vector with IDs of genotyped individuals
#'
#' @return a named numeric vector with the number of matches ('NumMatch'),
#' the position of the best match ('Best'), the inferred sibship size of
#' this best match ('Tot'), the number of matching IDs ('OK'), and the
#' number of mismatches ('err').
#'
#' @keywords internal
SibMatch <- function(SimX, Infrd, SNPd)
{
VC <- sapply(Infrd, Vcomp, SimX, SNPd)
mtch <- which(VC["both",]>0)
if (length(mtch)>1) {
if (VC["total",1]==2) {
if (all(VC["err", mtch]==0)) {
mtch <- which.max(VC["both",])[1] # pair split into two singletons
} else {
mtch <- NULL # no clear match
}
} else {
mtch <- which.max(VC["both",])[1] # which inferred sibship has most members of true sibship
}
}
if (length(mtch)==1) {
if (VC["err", mtch] > VC["both", mtch]) { # was: >=
mtch <- NULL
}
}
if (length(mtch)==1) {
OUT <- data.frame(NumMatch = sum(VC[2,]>0),
Best = colnames(VC)[mtch], # as.numeric(substr(colnames(VC)[mtch],3,5)),
Tot = VC["total", mtch],
OK = VC["both", mtch],
Er = VC["err", mtch], stringsAsFactors=FALSE)
} else {
OUT <- data.frame(NumMatch=0, Best=NA, Tot= length(SimX), OK = NA, Er = NA)
}
OUT
}
#======================================================================
# transpose matrix to data.frame (sapply returns wrong format)
tdf <- function(M)
{
DF <- matrix(NA, ncol(M), nrow(M))
for (r in 1:nrow(M)) {
DF[, r] <- unlist(M[r, ])
}
DF <- as.data.frame(DF, stringsAsFactors = FALSE, row.names=colnames(M))
for (r in 1:ncol(DF)) {
if (allNumeric(DF[, r])) DF[, r] <- as.numeric(DF[, r])
}
names(DF) <- rownames(M)
DF
}
#======================================================================
# adapted from Hmisc::all.is.numeric
allNumeric <- function (x, what = c("test", "vector"))
{
what <- match.arg(what)
isnum <- suppressWarnings(!any(is.na(as.numeric(stats::na.exclude(x)))))
if (what == "test")
isnum
else if (isnum)
as.numeric(x)
else x
}
#============================================================================
#============================================================================
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.