## just simple utility function to test for empty/not set column names
noColNames <- function(x){
CN <- colnames(x)
all(CN %in% paste("V", 1:ncol(x), sep=""))
}
## takes a pedigree (data.frame, pedigree or pedigreeList) as input and
## generates a valid pedigree for the submitted ids (if no ids are specified
## it returns the ped). basically, the function subsets the full pedigree to
## the individuals ids and recursively adds parents if needed.
buildPedigree <- function(ped, ids){
## check pedigree and eventually add missing parents...
if(missing(ids))
return(ped)
## want to have a data.frame or matrix...
if(is(ped, "pedigree") | is(ped, "pedigreeList"))
ped <- ped2df(ped)
if(!all(ids %in% rownames(ped)))
stop("Some of the submitted ids are not in the pedigree!")
subfam <- ped[as.character(ids), ]
## now recursively add parents, if needed...
parents <- as.character(unique(c(subfam$father, subfam$mother)))
## parents[is.na(parents)] <- 0
## parents <- parents[parents!=0]
parents <- parents[!is.na(parents)]
while(!all(parents %in% as.character(subfam$id))){
subfam <- ped[unique(c(as.character(subfam$id), parents)), ]
parents <- as.character(unique(c(subfam$father, subfam$mother)))
## parents[is.na(parents)] <- 0
## parents <- parents[parents!=0]
parents <- parents[!is.na(parents)]
}
subfam
}
doGetSiblings <- function(ped, id = NULL) {
if(is.null(id))
stop("At least one if has to be specified!")
parents <- doGetParents(ped, id = id)
siblings <- doGetChildren(ped, id = parents, maxlevel = 1)
siblings
}
## gets all the ancestors for (one or more) id(s)
doGetAncestors <- function(ped, id = NULL, maxlevel = 3) {
if (is.null(id))
stop("At least one id has to be specified!")
id <- as.character(id)
allids <- NULL
for (i in 1:maxlevel) {
## get the parents of all the allids
newids <- unlist(
ped[which(ped$id %in% id), c("father", "mother")],
use.names = FALSE
)
zeros <- is.na(newids)
##zeros <- newids == founder
if(all(zeros) | length(newids)==0)
break
newids <- newids[!zeros]
id <- newids
allids <- c(allids, newids)
}
as.character(unique(allids))
}
doGetParents <- function(ped, id = NULL) {
doGetAncestors(ped = ped, id = id, maxlevel = 1)
}
## gets all the children for (one or more) id(s)
## note: these are only children in direct blood line.
doGetChildren <- function(ped, id=NULL, maxlevel=16){
if(is.null(id))
stop("At least one id has to be specified!")
allids <- NULL
for(i in 1:maxlevel){
## get all the children for the ids
newids <- ped[which(ped$father %in% id | ped$mother %in% id), "id"]
if(length(newids)==0)
break
id <- newids
allids <- c(allids, newids)
}
as.character(unique(allids))
}
## check for the submitted id(s) whether a mate (spouse) is
## available (in column mother or father of the pedigree ped)
## and if yes add them to the returned list of ids.
## returned value contains
## eventually missing mates.
doGetMissingMate <- function(ped, id) {
if(is.null(id))
stop("Al least one id has to be specified")
newids <- c(ped[which(ped$father %in% id), "mother"],
ped[which(ped$mother %in% id), "father"])
unique(newids)
}
writePed <- function(ped, file){
Sexes <- as.numeric(sanitizeSex(ped$sex))
Sexes[is.na(Sexes)] <- 0
ped$sex <- Sexes
write.table(ped, file=file, quote=FALSE, sep="\t", row.names=FALSE,
col.names=FALSE)
}
## creates a data.frame (or matrix) for a pedigree...
ped2df <- function(ped){
ped <- do.call(cbind, ped)
if(!any(colnames(ped)=="famid"))
ped <- cbind(ped, famid=rep(1, nrow(ped)))
ped <- ped[ , c("famid", "id", "findex", "mindex", "sex")]
## FIXME!!! Can I get NAs as findex too?
## have to rematch the fater id and mother id...
notZ <- which(ped[, "findex"] > 0)
ped[notZ, "findex"] <- ped[ped[notZ, "findex"], "id"]
notZ <- which(ped[, "findex"] > 0)
ped[notZ, "mindex"] <- ped[ped[notZ, "mindex"], "id"]
colnames(ped) <- .PEDCN
ped <- data.frame(ped)
rownames(ped) <- ped$id
ped
}
df2ped <- function(ped){
ped <- checkPedCol(ped)
if(!any(colnames(ped)=="family"))
ped <- cbind(ped, family=rep(1, nrow(ped)))
## if there is only one family, remove that one ;)
if(length(unique(ped$family)) == 1)
ped <- ped[, colnames(ped)!="family"]
##
if(any(colnames(ped) == "affected")){
if(any(colnames(ped) == "family")){
peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
momid=ped$mother, famid=ped$family,
sex=as.numeric(ped$sex),
affected=ped$affected)
}else{
peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
momid=ped$mother,
sex=as.numeric(ped$sex),
affected=ped$affected)
}
}else{
if(any(colnames(ped) == "family")){
peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
momid=ped$mother, famid=ped$family,
sex=as.numeric(ped$sex))
}else{
peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
momid=ped$mother,
sex=as.numeric(ped$sex))
}
}
peddi
}
## finds related individuals, i.e. individuals sharing the same blood line.
## should we call this "shareBloodLine" or "shareKinship"
## ped has to be the data.frame that we get with the pedigree method.
## Use `rmKinship` for fine control: omit all individuals with kinship less
## than or equal to this paramter. Since it defaults to 0, all individuals
## sharing blood line will be reported by default.
doShareKinship <- function(ped, kin, id, rmKinship=0){
if(missing(ped) & missing(kin))
stop("ped or kin has to be submitted!")
if(missing(id))
stop("At least one id has to be submitted!")
id <- as.character(id)
## calculate kin from ped
if(missing(kin)){
kin <- kinship(id=ped$id, momid=ped$mother, dadid=ped$father,
sex=ped$sex)
}
if(!all(id %in% colnames(kin)))
stop("id has to be present in ped or kin!")
## subset to id (which can be more than one)
kin <- kin[id, , drop=FALSE]
related <- colnames(kin)[colSums(kin>rmKinship)>0]
related
}
####============================================================
## doGetFounders
##
## Returns the ids of the founders in the specified pedigree.
####------------------------------------------------------------
doGetFounders <- function(ped){
ped[is.na(ped$mother) & is.na(ped$father), "id"]
}
####============================================================
## doGetSingletons
##
## Return the ids of those founders that have no child in the
## pedigree.
####------------------------------------------------------------
doGetSingletons <- function(ped){
founders <- ped[is.na(ped$mother) & is.na(ped$father), , drop=FALSE]
if(nrow(founders) > 0){
childl <- !((founders$id %in% ped$father) |
(founders$id %in% ped$mother))
if(any(childl)){
return(founders[childl, "id"])
}
}
character()
}
## gets all parents.
getParents <- function(ped, id){
if(is(ped, "pedigree") | is(ped, "pedigreeList"))
ped <- ped2df(ped)
ped <- checkPedCol(ped)
if(missing(id))
stop("id is missing!")
if(is.null(rownames(ped)))
rownames(ped) <- ped[, "id"]
id <- as.character(id)
if(!is.data.frame(ped))
ped <- as.data.frame(ped)
ped[id, c("father", "mother"), drop=FALSE]
}
## just a simple helper function that checks that the ped parameter
## has the required colnames, is a data.frame and has rownames set.
checkPedCol <- function(ped){
if(missing(ped))
stop("ped is required!")
if(!all(c("id", "father", "mother") %in% colnames(ped)))
stop("id, father, mother are required columns in ped.")
if(!is.data.frame(ped))
ped <- as.data.frame(ped)
## add a family column if not present...
if(!any(colnames(ped) == "family"))
ped <- cbind(ped, family=rep(1, nrow(ped)))
rownames(ped) <- as.character(ped$id)
ped
}
## Sanitize the pedigree the way we want: sex should be sanitized (if present),
## father and mother column should contain NA for founders instead of "" or 0.
sanitizePed <- function(ped){
if(any(colnames(ped) == "sex"))
ped$sex <- sanitizeSex(ped$sex)
Father <- ped$father
if(is.factor(Father))
Father <- as.character(Father)
if(is.numeric(Father))
Father[which(Father == 0)] <- NA
if(is.character(Father))
Father[which(Father == "")] <- NA
Mother <- ped$mother
if(is.factor(Mother))
Mother <- as.character(Mother)
if(is.numeric(Mother))
Mother[which(Mother == 0)] <- NA
if(is.character(Mother))
Mother[which(Mother == "")] <- NA
ped$father <- Father
ped$mother <- Mother
ped
}
## Add density from values x to a density d computed by a previous call to this
## function. Make the initial call by omitting d.
## Resulting density of the partitioned dataset is a (pretty decent)
## approximation of the density obtained when submitting the whole dataset.
inc.density <- function(x, d = NULL)
{
if( is.null(d) ) {
## adjust has been determined empirically.
dd <- density(x = x, n = 512, adjust = 0.75)
} else {
if( length(d$x)!=512 )
stop("Argument d has not been created by a call to this function.")
## Determine the density data structure that has the larger domain and
## add interpolated density values from the other data structure. By
## this, the domain can grow and will never omit values that appear
## later in the iterative process.
dd <- density(x, bw = d$bw, n = 512)
if( dd$x[512]<d$x[512] ) {
tmp <- dd; dd <- d; d <- tmp; rm(tmp)
}
delta <- dd$x[2] - dd$x[1]
f <- approxfun(d$x, d$y)
y <- f(dd$x)
y[is.na(y)] <- 0
dd$y <- dd$y + y
dd$y <- dd$y/(delta*sum(dd$y))
dd$n <- dd$n + d$n
}
dd
}
## Add histogram from values x to a histogram h computed by a previous call to
## this function. Additionally, reports the number of values that produced an
## overflow in variable nreplace (i.e. values did were too large to be included
## in any bin of the histogram h). Make the initial call by omitting h.
##
## Resulting histogram of the partitioned dataset is a (pretty decent)
## approximation of the histogram obtained when submitting the whole dataset.
## Drawback: the domain of the histogram cannot be changed during addition of
## new values. We therefore rely on the first batch as a random sample of the
## population and estimate the overall domain.
inc.hist <- function(x, h = NULL)
{
if( is.null(h) ) {
## Empirically determined overall size of the domain (110%).
hh <- hist(x, breaks = 1/127*(0:127)*max(x)*1.1, plot = FALSE)
hh$nreplace <- 0
} else {
if( is.null(h$nreplace) )
stop("Argument h has not been created by a call to this function.")
maxv <- h$breaks[length(h$breaks)]
nreplace <- sum(x>maxv)
if( nreplace>0 )
x[x>maxv] <- maxv
hh <- hist(x, breaks = h$breaks, plot = FALSE)
hh$counts <- hh$counts + h$counts
hh$density <- (hh$counts/sum(hh$counts))/diff(hh$breaks)
hh$nreplace <- h$nreplace + nreplace
}
hh
}
## Add tabulated data from values x to a table t computed by a previous call to
## this function. Make the initial call by omitting t.
## This function produces identical results when used in the iterative way
## compared to submitting the whole dataset at once.
inc.table <- function(x, t = NULL)
{
if( is.null(t) ) {
tt <- table(x)
} else {
tt <- table(x)
both <- intersect(names(t), names(tt))
tt[both] <- tt[both] + t[both]
tonly <- setdiff(names(t), names(tt))
tt[tonly] <- t[tonly]
}
tt
}
## Convert 1D count table with only integers as counted objects to a histogram.
table2hist.int <- function(t)
{
## Only 1D tables have names(), for higher dimensional tables, it is set to
## NULL. dimnames() OTH works opposite.
nms <- names(t)
stopifnot("Table must be one-dimensional" = length(nms)>=1)
suppressWarnings( bins <- as.integer(names(t)) )
stopifnot("Table can be integer only." = all(as.character(bins)==nms))
h <- list(breaks = c(bins, max(bins) + 1),
counts = as.vector(t),
mids = bins + 0.5,
xname = "generic",
equidist = TRUE)
attr(h, "class") <- "histogram"
h
}
#' find the subPedigree (smallest pedigree) including the individuals specified
#' with id and all eventually needed additionals
#'
#' @param ped pedigree `data.frame`.
#'
#' @param id `character` with the IDs of the individuals.
#'
#' @param all `logical(1)`
#'
#' @md
#'
#' @noRd
#'
#' @author Johannes Rainer
subPedigree <- function(ped, id = NULL, all = TRUE) {
CL <- class(ped)
## handling for pedigree and pedigreeList classes: unfortunately these
## are not correct classes, otherwise I could define methods instead of
## a function...
if (is(ped, "pedigree") | is(ped, "pedigreeList")) {
ped <- ped2df(ped)
}
ped <- checkPedCol(ped)
if (is.null(id))
stop("id is missing!")
id <- as.character(id)
subgr <- connectedSubgraph(ped2graph(ped), nodes = id, mode = "all",
all.nodes = all, ifnotfound = NA)
if (!is(subgr, "igraph")) {
stop("No common pedigree for the submitted individuals found!")
}
inds <- names(V(subgr))
## get missing mates
inds <- unique(c(inds, doGetMissingMate(ped, id = inds)))
subped <- ped[inds, ]
subped <- removeSingletons(subped)
## fix founders.
subped[!(subped$father %in% subped$id), "father"] <- NA
subped[!(subped$mother %in% subped$id), "mother"] <- NA
## Fix if we have individuals with a mother or a father, but not both.
idx <- which(is.na(subped$father) != is.na(subped$mother))
if (length(idx)) {
idx_ped <- match(subped$id[idx], ped$id)
id_msng <- ped$mother[idx_ped][is.na(subped$mother[idx])]
id_msng <- c(id_msng, ped$father[idx_ped][is.na(subped$father[idx])])
subped$father[idx] <- ped$father[idx_ped]
subped$mother[idx] <- ped$mother[idx_ped]
if (length(id_msng)) {
subped <- rbind(subped, ped[match(id_msng, ped$id), ])
subped[!(subped$father %in% subped$id), "father"] <- NA
subped[!(subped$mother %in% subped$id), "mother"] <- NA
}
}
if(CL == "pedigree" | CL == "pedigreeList")
subped <- df2ped(subped)
subped
}
## find the (eventually smallest) connected subgraph of all specified
## nodes.
## graph: igraph object.
## nodes: node (vertex) names.
## mode: to calculate the smallest distance.
## all.nodes: if all nodes should be present in the resulting graph.
## ifnotfound: by default, the function throws an error if no connected
## subgraph is found, or, if all.nodes=TRUE, not all input nodes
## are in the connected subgraph. If ifnotfound is specified, its
## value is returned instead of an error.
connectedSubgraph <- function(graph, nodes, mode="all", all.nodes=TRUE,
ifnotfound){
if(!is(graph, "igraph"))
stop("graph has to be an igraph object!")
mode <- match.arg(mode, c("all", "in", "out"))
if(missing(nodes) | length(nodes) < 2)
stop("At least two nodes have to be submitted!")
nodes <- as.character(nodes)
## check if nodes are present in the graph
nono <- !(nodes %in% names(V(graph)))
if(any(nono)){
missingNodes <- nodes[nono]
if(all.nodes)
stop("With argument all.nodes=TRUE all nodes have to be present ",
"in the graph! Nodes ", paste(missingNodes, collapse=", "),
" not present in the original graph.")
nodes <- nodes[!nono]
warning("Nodes ", paste(missingNodes, collapse=", "),
" removed as they are not present in graph! ")
if(length(nodes) < 2)
stop("At least two nodes have to be present in the graph!")
}
## OK, now we can start...
## 1) iterate over all nodes and calculate the shortest path between all.
## 2) get all nodes along the shortest path and add to the needed nodes.
needNodes <- character()
## two versions: if mode="all" the directionality does not matter, thus we
## can run a faster version. for the other modes really calculate distances
## between all (both directions).
if(mode == "all"){
for(i in 1:(length(nodes)-1)){
suppressWarnings(
paths <- shortest_paths(graph, from=nodes[i],
to=nodes[(i+1):length(nodes)],
mode=mode)
)
needNodes <- unique(c(needNodes,
unlist(lapply(paths$vpath, names))))
}
}else{
for(i in 1:length(nodes)){
suppressWarnings(
paths <- shortest_paths(graph, from=nodes[i],
to=nodes[-i],
mode=mode)
)
needNodes <- unique(c(needNodes,
unlist(lapply(paths$vpath, names))))
}
}
if(length(needNodes)==0){
if(missing(ifnotfound)){
stop("Could not find any connected subgraph!")
}
warning("Could not find any connected subgraph!")
return(ifnotfound)
}
## check the subgraph.
if(!all(nodes %in% needNodes)){
if(all.nodes){
if(missing(ifnotfound)){
stop("Could not find any connected subgraph!")
}
warning("Could not find any connected subgraph!")
return(ifnotfound)
}else{
warning("Nodes ", paste(nodes[!(nodes %in% needNodes)]),
" not present in the returned graph!")
}
}
## finally build the graph...
## subgraph is a little tedious as it requires submitting of vertex INDICES!
induced_subgraph(graph, vids=needNodes)
}
## search for a common ancestor couple in a pedigree.
## ped: data.frame or pedigree or pedigreeList.
doGetCommonAncestor <- function(ped, id, method="min.dist"){
method <- match.arg(method, c("min.dist", "smallest.mean.dist"))
if(missing(id))
stop("id has to be specified")
if(length(id) <= 1)
stop("The minimum number of ids has to be 2.")
if(is(ped, "pedigree") | is(ped, "pedigreeList"))
ped <- ped2df(ped)
ped <- checkPedCol(ped)
## check if id are in ped:
id <- as.character(id)
if(!all(id %in% rownames(ped))){
stop("all of the sumitted ids should be present in the pedigree!")
}
## check if the ids are in the same family
if(any(colnames(ped)=="family")){
if(length(unique(ped[id, "family"])) > 1){
warning("The provided ids are from different families; still",
" trying to find a common ancestor but I'll probably fail.")
}else{
## subset the pedigree to this family...
ped <- ped[ped$family == ped[id, "family"][1], , drop=FALSE]
}
}
## OK; done with input parameter checking.
## transform the pedigree into a graph
igr <- ped2graph(ped)
## "in": go from children -> parents
## LLLL TODO can do faster!!!
din <- distances(igr, mode="in")
## use the ROWS in that.
subD <- din[id, , drop=FALSE]
subD <- subD[, !apply(subD, MARGIN=2, function(x){
return(all(infOrZero(x)))
}) , drop=FALSE]
## check if we've got one common ancestor:
if(all(is.infinite(colSums(subD)))){
warning("Did not find any common ancestor!")
return(NA)
}
## remains to find the "one and only" (well, usually we expect a couple).
subD <- subD[, !is.infinite(colSums(subD)), drop=FALSE]
## now get the ones that are closest:
if(method == "min.dist"){
dists <- apply(subD, MARGIN=2, min)
ancs <- names(dists)[dists==min(dists)]
}
if(method == "smallest.mean.dist"){
dists <- colMeans(subD)
ancs <- names(dists)[dists==min(dists)]
}
## at last trying to get the mate... if any; usually, ancs should
## already correspond to the ancestor couple.
ancs <- c(ancs, doGetMissingMate(ped, ancs))
unique(ancs)
}
## returns true if a value is 0 or infinite
infOrZero <- function(x){
(is.infinite(x) | x == 0)
}
## transform a pedigree into a (directed) igraph
## direction is always parent->child.
## this time each individual is treated as a single node!
ped2graph <- function(ped){
if(is(ped, "pedigree") | is(ped, "pedigreeList"))
ped <- ped2df(ped)
ped <- checkPedCol(ped)
## define end/starting points
ped[!ped[, "father"] %in% ped$id, "father"] <- NA
ped[!ped[, "mother"] %in% ped$id, "mother"] <- NA
eds <- apply(ped[, c("id", "father", "mother")], MARGIN=1, function(x){
retval <- c()
if(!is.na(x["father"]))
retval <- x[c("father", "id")]
if(!is.na(x["mother"]))
retval <- c(retval, x[c("mother", "id")])
return(retval)
})
## what the heck... I need character vertex names!
Edges <- unlist(eds, use.names=FALSE)
igr <- graph(edges=as.character(Edges))
igr
}
## takes a pedigree as input and tries to identify the founders
## in case there are more than two founder pairs (i.e. founders with the same,
## largest number of offspring generations), it returns only one of them.
doFindFounders <- function(ped, family, id){
if(!any(colnames(ped) == "family"))
ped <- data.frame(ped, family=rep(1, nrow(ped)), stringsAsFactors=FALSE)
families <- unique(as.character(ped$family))
if(missing(family))
family <- NULL
if (missing(id))
id <- NULL
if(is.null(family) & is.null(id)){
family <- families[1]
warning("No family specified; using the first one in the pedigree: ",
family, ".")
}
if (!is.null(family))
ped <- ped[ped$family == family, , drop=FALSE]
if (!is.null(id)) {
return(doFindFoundersForId(ped = ped, id = id))
} else {
founders <- ped$id[which(is.na(ped$father) & is.na(ped$mother))]
## determine for each founder the number of generations of its children,
## grandchildren etc.
founderGen <- doCountGenerations(ped, id=founders, direction="down")
founders <- names(founderGen)[founderGen == max(founderGen)][1]
founders <- c(founders, doGetMissingMate(ped, founders))
return(unique(founders))
}
}
##' Find founders for the pedigree of a given individual. Have to first build
##' the largest pedigree for the given individual and identify the founders in
##' that pedigree.
##'
##' @note The function returns only a single founder pair, i.e. the one with the
##' largest number of offspring generations.
##' @noRd
doFindFoundersForId <- function(ped, id) {
if (length(id) > 1) {
id <- id[1]
warning("length of 'id' was > 1, but only the first id was selected!")
}
## Check if we've got this id in the pedigree
if (!any(ped[, "id"] == id))
stop("Individual with id '", id, "' not found in the pedigree!")
## Building the pedigree for that individual.
ids <- unique(c(doGetAncestors(ped = ped, id = id, maxlevel = 200), id))
ids <- unique(c(ids, doGetChildren(ped = ped, id = ids, maxlevel = 200)))
ids <- unique(c(ids, doGetMissingMate(ped, id = ids)))
ped <- ped[ped[, "id"] %in% ids, , drop = FALSE]
## OK, now we have the pedigree for the individual, now find founders.
founders <- ped[which(is.na(ped[, "father"]) &
is.na(ped[, "mother"])), "id"]
## determine for each founder the number of generations of its children,
## grandchildren etc.
founderGen <- doCountGenerations(ped, id = founders, direction = "down")
founders <- names(founderGen)[founderGen == max(founderGen)][1]
founders <- c(founders, doGetMissingMate(ped, founders))
unique(founders)
}
## calculate for each id the number of generations up- or down.
doCountGenerations <- function(ped, id=NULL, direction="down"){
if(is.null(id))
stop("At least one id has to be specified!")
direction <- match.arg(direction, c("up", "down"))
if(direction == "up")
walkFun <- doGetAncestors
if(direction == "down")
walkFun <- doGetChildren
res <- sapply(id, function(x){
generation <- 0
ids <- x
repeat{
ids <- walkFun(ped, id=ids, maxlevel=1)
if(length(ids) == 0){
return(generation)
}else{
generation <- generation + 1
}
}
})
names(res) <- id
res
}
## returns a list of generations for each family in the pedigree.
## generations are relative to the founders.
doEstimateGenerationsFor2 <- function(ped, family=NULL){
if(!is.null(family)){
ped <- ped[ped$family %in% family, ]
if(nrow(ped) == 0)
stop("The submitted family id is not present in the pedigree!")
}
## OK, now loop through the families...
pedList <- split(ped, ped$family)
Gens <- lapply(pedList, function(currentPed){
founders <- doFindFounders(currentPed, family=currentPed[1, "family"])
generation <- doGetGenerationFrom2(currentPed, founders[1])
return(generation)
})
Gens
}
getAllSibsNMates <- function(ped, id){
## recursively fetch siblings and mates
sibsNmates <- id
repeat{
sibs <- unique(c(id, doGetSiblings(ped, id=id)))
sibsNmates <- unique(c(doGetMissingMate(ped, id=sibs), sibs))
if(length(sibsNmates) != length(id)){
id <- sibsNmates
}else{
break
}
}
sibsNmates
}
doGetGenerationFrom2 <- function(ped, id, generation=0){
if(missing(id))
stop("id has to be specified!")
if(missing(ped))
stop("ped has to be specified!")
id <- as.character(id)
if(length(id) > 1){
id <- id[1]
warning("Length of 'id' is larger 1. Using only the first id ",
"in 'id': ", id)
}
if(!any(ped$id %in% id))
stop("Can not find id in the pedigree!")
famid <- ped[ped$id == id, "family"]
ped <- ped[ped$family == famid, , drop=FALSE]
## first define the connected!!! pedigree.
pedg <- ped2graph(ped)
if(clusters(pedg)$no > 1){
Cl <- clusters(pedg)
warning("Found ", Cl$no, " connected sub-graphs in the pedigree ",
"for family ", famid, "! Will restrict the analysis to the ",
"subgraph containing individual ", id, ".")
vertices <- names(Cl$membership)[Cl$membership == Cl$membership[id]]
pedg <- induced_subgraph(pedg, vids=vertices)
}
individuals <- names(V(pedg))
gens4individuals <- rep(NA, length(individuals))
names(gens4individuals) <- individuals
if(!any(names(gens4individuals) == id))
stop("Individual ", id, " is not part of the connected pedigree!")
gens4individuals[id] <- 0
fromId <- id
Tested <- rep(FALSE, length(gens4individuals))
repeat{
## get the generation of the individuals from which we start
fromIdBool <- individuals %in% fromId
Tested[fromIdBool] <- TRUE
currentGens <- gens4individuals[fromIdBool]
currentGens <- split(currentGens, currentGens)
Res <- lapply(currentGens, allids=individuals, FUN=function(z, allids){
## get the ancestors and set the generation for them
ancs <- getAncestors(ped, id=names(z), max.generations=1)
if(length(ancs) > 0){
ancret <- cbind(which(allids %in% ancs), (z[1] - 1))
}else{
ancret <- NULL
}
## get the children
kids <- getChildren(ped, id=names(z), max.generation=1)
if(length(kids) > 0){
kidret <- cbind(which(allids %in% kids), (z[1] + 1))
}else{
kidret <- NULL
}
return(rbind(ancret, kidret))
})
Res <- do.call(rbind, Res)
if(nrow(Res) == 0){
warning("What the heck... stopped before we got all?")
break
}
## don't want to search the same id several times... if Tested is
## TRUE, don't use them again.
toTestIdx <- Res[, 1][!Tested[Res[, 1]]]
fromId <- individuals[toTestIdx]
gens4individuals[Res[, 1]] <- Res[, 2]
if(sum(is.na(gens4individuals)) == 0)
break
}
## make a larger generation vector.
allgens <- rep(NA, nrow(ped))
names(allgens) <- ped$id
allgens[names(gens4individuals)] <- gens4individuals
allgens
}
##********************************************************************
##
## Utility functions
##
##********************************************************************
sanitizeSex <- function(x){
if(is.numeric(x)){
## require values 1, 2, NA
numrange <- unique(x)
if(!all(c(1, 2) %in% numrange))
stop("If column sex is numeric it has to contain 1 (=male) ",
"and 2 (=female) values! Unknown or missing can be encoded",
" by NA or any other number than 1 or 2.")
news <- rep(NA, length(x))
news[which(x == 1)] <- "M"
news[which(x == 2)] <- "F"
return(factor(news, levels=c("M", "F")))
}
if(is.factor(x))
x <- as.character(x)
if(is.character(x)){
## remove all white spaces:
x <- gsub(x, pattern="\\s", replacement="")
## that's tricky...kind of...
## just get the first character; want to have M and F...
x <- substr(toupper(x), 1, 1)
chars <- unique(x)
if(length(chars) == 2){
if(!all(c("M", "F") %in% chars))
stop("Male and female individuals have to be represented",
" by 'M' and 'F', respectively!")
}
if(length(chars) == 1){
if(!any(c("M", "F") %in% chars))
stop("Male and female individuals have to be represented",
" by 'M' and 'F', respectively!")
}
if(length(chars[!is.na(chars)]) > 2)
warning("All characters in column sex other than M and F",
" (or male, female etc) are set to NA!")
news <- rep(NA, length(x))
news[which(x == "M")] <- "M"
news[which(x == "F")] <- "F"
return(factor(news, levels=c("M", "F")))
}
}
#####
#### NOTE:
## isn't that the same as removeSingletons???
##
####
## remove non-connected individuals. We're assuming that ped is the pedigree
## of a SINGLE family, as we expect to get ONE graph. in case we've got 2, we're
## using the largest connected one.
## solveMultiGraph: what should we do if we get multiple graphs?
## + use.all: just use all, i.e. return vertix names of all.
## + use.largest: just return vertices of the largest connected sub-graph.
doPrunePed <- function(ped, addMissingMates=FALSE, solveMultiGraph="use.all"){
.Deprecated("removeSingletons")
solveMultiGraph <- match.arg(solveMultiGraph, c("use.all", "use.largest"))
pedg <- ped2graph(ped)
if(solveMultiGraph == "use.largest"){
if(clusters(pedg)$no > 1){
Cl <- clusters(pedg)
warning("Found ", Cl$no,
" connected sub-graphs in the pedigree! Will restrict ",
"the analysis to the largest connected pedigree",
" in that family.")
vertices <-
names(Cl$membership)[Cl$membership ==
as.numeric(sort(table(Cl$membership),
decreasing=TRUE))[1]]
pedg <- induced_subgraph(pedg, vids=vertices)
}
}
keepIds <- names(V(pedg))
if(addMissingMates){
missMates <- doGetMissingMate(ped, id=keepIds)
keepIds <- unique(c(missMates, keepIds))
}
notThere <- !(as.character(ped$id) %in% keepIds)
ped <- ped[!notThere, , drop=FALSE]
if(any(notThere))
warning("Had to remove ", sum(notThere),
" un-connected individuals from the pedigree.")
ped
}
## a singleton a.k.a childless founder is an individual which id is not
## present in the father or mother column of the pedigree and which has 0
## in mother and father
.removeSingletons <- function(ped){
message("Removing singletons...", appendLF=FALSE)
ped <- checkPedCol(ped)
ped <- sanitizePed(ped)
childlF <- doGetSingletons(ped)
if(length(childlF) > 0){
ped <- ped[!(ped$id %in% childlF), ]
message(" ", length(childlF), " removed.")
}else{
message(" none present.")
}
ped
}
#' @title Extract pairs of individuals matching certain kinship criteria
#'
#' @name kinshipPairs
#'
#' @description
#'
#' The `kinshipPairs` function allows to extract pairs of individuals matching
#' a user-defined kinship *condition* (e.g. individuals with a kinship larger
#' than 0.0625). Such sets of paired individuals (along with paired unrelated
#' values) would enable a *familial resemblance* analysis on quantitative
#' traits (Ziegler 2010) (see examples below for details).
#'
#' By default, `kinshipPairs` returns all pairs of individuals for which the
#' `condition` on the kinship matrix matches (e.g. all pairs of individuals with
#' a kinship coefficient larger than or equal to 0.25). Individuals can thus
#' be reported multiple times (see examples below). Parameter `duplicates` can
#' be used to define a strategy to avoid such duplicated IDs. Supported are:
#'
#' - `duplicates = "keep"`: the default, return all values.
#' - `duplicates = "first"`: report only the first pair of individuals for each
#' individual ID.
#' - `duplicates = "last"`: report only the last pair of individuals for each
#' individual ID.
#' - `duplicates = "random"`: randomly select one pair of individuals for
#' each individual ID.
#'
#' For any setting different than `duplicates = "keep"` each individual will
#' only be listed **once** in the resulting matrix.
#'
#' @param x A `FAData` object (or object inheriting from that).
#'
#' @param condition A `function` defining how individuals should be selected
#' based on the object's kinship matrix. The default is to select all
#' individuals with a kinship `>= 0.25`. Note that the diagonal of the
#' kinship matrix (i.e. the kinship of individuals with itself) is always
#' skipped, so no additional criteria is needed to avoid self-pairs.
#'
#' @param duplicates `character(1)` defining how to deal with duplicated IDs
#' in the result returned by the function.
#' See function description and examples below for more details. Defaults
#' to `duplicates = "keep"` returning all pairs of IDs matching `condition`.
#'
#' @param family optional family identifiers if pairs should only defined for
#' selected families. Defaults to `family = NULL` hence the full data set is
#' considered.
#'
#' @param id optional identifiers of subsets of individuals on which the pairs
#' should be defined. Defaults to `id = NULL` hence the full data set is
#' considered.
#'
#' @return
#'
#' A two column `matrix` with the IDs (colnames/rownames of the kinship matrix
#' or as defined in `x$id`) of the pairs. If `duplicates` is either `"first"`,
#' `"last"` or `"random"` each ID is only returned once (i.e. no ID is reported
#' more than one time).
#'
#' @author Johannes Rainer
#'
#' @references
#'
#' Ziegler A., Koenig I. R. (2010). Familiality, Heristability, and Segregation
#' Analysis. In A Statistical Approach to Genetic Epidemiology: With Access to
#' E-Learning Platform by Friedrich Pahlke, Second Edition.
#' \doi{10.1002/9783527633654.ch6}.
#'
#' @seealso [PedigreeUtils] for other pedigree utility functions.
#'
#' @md
#'
#' @examples
#'
#' ##########################
#' ##
#' ## Create a new FAData object
#' ##
#' ## Load the Minnesota Breast Cancer record and subset to the
#' ## first families.
#' data(minnbreast)
#' mbsub <- minnbreast[minnbreast$famid %in% 1:20, ]
#' mbped <- mbsub[, c("famid", "id", "fatherid", "motherid", "sex")]
#' ## Renaming column names
#' colnames(mbped) <- c("family", "id", "father", "mother", "sex")
#' ## Defining the optional argument age.
#' Age <- mbsub$endage
#' names(Age) <- mbsub$id
#' ## Create the object
#' fad <- FAData(pedigree=mbped, age=Age)
#'
#' ## Getting all pairs of individuals with a kinship coefficient >= 0.25
#' ## keeping all duplicates
#' rel_pairs <- kinshipPairs(fad)
#' head(rel_pairs)
#' ## As we see, we have multiple times the individual 1 etc.
#'
#' ## For an actual correlation analysis it would be better to drop duplicates.
#' ## Below we randomly select individual pairs if they occurr multiple times
#' rel_pairs <- kinshipPairs(fad, duplicates = "random")
#' head(rel_pairs)
#'
#' ## In addition we extract pairs of individuals that are much less related.
#' ## For this examples we consider all individuals with a kinship
#' ## coefficient < 0.03125 (second cousin) to be *unrelated*.
#' unrel_pairs <- kinshipPairs(fad, duplicates = "random",
#' condition = function(z) z < 0.03125)
#' head(unrel_pairs)
#'
#' ## For a familial resemblance analysis we can now calculate the correlation
#' ## coefficient of a quantitative trait between pairs of related individuals
#' ## and compare that with the correlation coefficient calculated on unrelated
#' ## individuals. For our toy example we use the participant's age, since we
#' ## don't have any other quantitative values available.
#' cor_rel <- cor(age(fad)[rel_pairs[, 1]], age(fad)[rel_pairs[, 2]],
#' use = "pairwise.complete.obs")
#'
#' cor_unrel <- cor(age(fad)[unrel_pairs[, 1]], age(fad)[unrel_pairs[, 2]],
#' use = "pairwise.complete.obs")
#' cor_rel
#' cor_unrel
#'
#' ## We don't see a clear difference in the correlation, thus, the age (as
#' ## expected) has no familial component.
kinshipPairs <- function(x, condition = function(x) x >= 0.25,
duplicates = c("keep", "first", "last", "random"),
id = NULL, family = NULL) {
duplicates <- match.arg(duplicates)
if (!inherits(x, "FAData"))
stop("'kinshipPairs' is currently only implemented for 'FAData'")
if (length(family)) {
id <- x$id[x$family %in% family]
if (!length(id))
stop("No individuals for the provided families found")
}
ks <- kinship(x)
if (length(id)) {
is_id <- rownames(ks) %in% id
if (!any(is_id))
stop("None of the specified individuals found")
ks <- ks[is_id, is_id, drop = FALSE]
}
res <- condition(ks)
if (!length(dim(res)) || ncol(ks) != ncol(res) ||
nrow(ks) != nrow(res) || !is.logical(res[1, 1]))
stop("'condition' function did not return a 'logical' 'matrix' with ",
"dimensions matching the kinship matrix in 'x'")
res[lower.tri(res, diag = TRUE)] <- NA
keep <- which(res, arr.ind = TRUE, useNames = FALSE)
if (nrow(keep)) {
if (duplicates != "keep") {
if (duplicates == "first")
dupl_fun <- function(z) z[1, , drop = FALSE]
if (duplicates == "last")
dupl_fun <- function(z) z[nrow(z), , drop = FALSE]
if (duplicates == "random")
dupl_fun <- function(z)
z[sample(seq_len(nrow(z)), 1), , drop = FALSE]
uids <- unique(as.integer(keep))
to_test <- rep(TRUE, nrow(keep))
res <- matrix(ncol = 2, nrow = length(uids))
for (i in seq_along(uids)) {
id <- uids[i]
idx <- which((keep[, 1] == id | keep[, 2] == id) & to_test)
if (length(idx)) {
pair <- dupl_fun(keep[idx, , drop = FALSE])
## Ensure we're not reporting these individuals again.
to_test[keep[, 1] %in% pair | keep[, 2] %in% pair] <- FALSE
res[i, ] <- pair
}
}
res <- res[!is.na(res[, 1]), , drop = FALSE]
unname(cbind(rownames(ks)[res[, 1]], rownames(ks)[res[, 2]]))
} else unname(cbind(rownames(ks)[keep[, 1]], rownames(ks)[keep[, 2]]))
} else matrix(nrow = 0, ncol = 2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.