FamAgg: Pedigree Analysis and Familial Aggregation

Documented in connectedSubgraph kinshipPairs ped2graph subPedigree

## just simple utility function to test for empty/not set column names
noColNames <- function(x){
    CN <- colnames(x)
    all(CN %in% paste("V", 1:ncol(x), sep=""))
}

## takes a pedigree (data.frame, pedigree or pedigreeList) as input and
## generates a valid pedigree for the submitted ids (if no ids are specified
## it returns the ped). basically, the function subsets the full pedigree to
## the individuals ids and recursively adds parents if needed.
buildPedigree <- function(ped, ids){
    ## check pedigree and eventually add missing parents...
    if(missing(ids))
        return(ped)
    ## want to have a data.frame or matrix...
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    if(!all(ids %in% rownames(ped)))
        stop("Some of the submitted ids are not in the pedigree!")
    subfam <- ped[as.character(ids), ]
    ## now recursively add parents, if needed...
    parents <- as.character(unique(c(subfam$father, subfam$mother)))
    ## parents[is.na(parents)] <- 0
    ## parents <- parents[parents!=0]
    parents <- parents[!is.na(parents)]
    while(!all(parents %in% as.character(subfam$id))){
        subfam <- ped[unique(c(as.character(subfam$id), parents)), ]
        parents <- as.character(unique(c(subfam$father, subfam$mother)))
        ## parents[is.na(parents)] <- 0
        ## parents <- parents[parents!=0]
        parents <- parents[!is.na(parents)]
    }
    subfam
}


doGetSiblings <- function(ped, id = NULL) {
    if(is.null(id))
        stop("At least one if has to be specified!")
    parents <- doGetParents(ped, id = id)
    siblings <- doGetChildren(ped, id = parents, maxlevel = 1)
    siblings
}

## gets all the ancestors for (one or more) id(s)
doGetAncestors <- function(ped, id = NULL, maxlevel = 3) {
    if (is.null(id))
        stop("At least one id has to be specified!")
    id <- as.character(id)
    allids <- NULL
    for (i in 1:maxlevel) {
        ## get the parents of all the allids
        newids <- unlist(
            ped[which(ped$id %in% id), c("father", "mother")],
            use.names = FALSE
        )
        zeros <- is.na(newids)
        ##zeros <- newids == founder
        if(all(zeros) | length(newids)==0)
            break
        newids <- newids[!zeros]
        id <- newids
        allids <- c(allids, newids)
    }
    as.character(unique(allids))
}

doGetParents <- function(ped, id = NULL) {
    doGetAncestors(ped = ped, id = id, maxlevel = 1)
}

## gets all the children for (one or more) id(s)
## note: these are only children in direct blood line.
doGetChildren <- function(ped, id=NULL, maxlevel=16){
    if(is.null(id))
        stop("At least one id has to be specified!")
    allids <- NULL
    for(i in 1:maxlevel){
        ## get all the children for the ids
        newids <- ped[which(ped$father %in% id | ped$mother %in% id), "id"]
        if(length(newids)==0)
            break
        id <- newids
        allids <- c(allids, newids)
    }
    as.character(unique(allids))
}

## check for the submitted id(s) whether a mate (spouse) is
## available (in column mother or father of the pedigree ped)
## and if yes add them to the returned list of ids.
## returned value contains
## eventually missing mates.
doGetMissingMate <- function(ped, id) {
    if(is.null(id))
        stop("Al least one id has to be specified")
    newids <- c(ped[which(ped$father %in% id), "mother"],
                ped[which(ped$mother %in% id), "father"])
    unique(newids)
}

writePed <- function(ped, file){
    Sexes <- as.numeric(sanitizeSex(ped$sex))
    Sexes[is.na(Sexes)] <- 0
    ped$sex <- Sexes
    write.table(ped, file=file, quote=FALSE, sep="\t", row.names=FALSE,
                col.names=FALSE)
}

## creates a data.frame (or matrix) for a pedigree...
ped2df <- function(ped){
    ped <- do.call(cbind, ped)
    if(!any(colnames(ped)=="famid"))
        ped <- cbind(ped, famid=rep(1, nrow(ped)))
    ped <- ped[ , c("famid", "id", "findex", "mindex", "sex")]
    ## FIXME!!! Can I get NAs as findex too?
    ## have to rematch the fater id and mother id...
    notZ <- which(ped[, "findex"] > 0)
    ped[notZ, "findex"] <- ped[ped[notZ, "findex"], "id"]
    notZ <- which(ped[, "findex"] > 0)
    ped[notZ, "mindex"] <- ped[ped[notZ, "mindex"], "id"]
    colnames(ped) <- .PEDCN
    ped <- data.frame(ped)
    rownames(ped) <- ped$id
    ped
}


df2ped <- function(ped){
    ped <- checkPedCol(ped)
    if(!any(colnames(ped)=="family"))
        ped <- cbind(ped, family=rep(1, nrow(ped)))
    ## if there is only one family, remove that one ;)
    if(length(unique(ped$family)) == 1)
        ped <- ped[, colnames(ped)!="family"]
    ##
    if(any(colnames(ped) == "affected")){
        if(any(colnames(ped) == "family")){
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother, famid=ped$family,
                                        sex=as.numeric(ped$sex),
                                        affected=ped$affected)
        }else{
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother,
                                        sex=as.numeric(ped$sex),
                                        affected=ped$affected)
        }
    }else{
        if(any(colnames(ped) == "family")){
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother, famid=ped$family,
                                        sex=as.numeric(ped$sex))
        }else{
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother,
                                        sex=as.numeric(ped$sex))
        }
    }
    peddi
}


## finds related individuals, i.e. individuals sharing the same blood line.
## should we call this "shareBloodLine" or "shareKinship"
## ped has to be the data.frame that we get with the pedigree method.
## Use `rmKinship` for fine control: omit all individuals with kinship less
## than or equal to this paramter. Since it defaults to 0, all individuals
## sharing blood line will be reported by default.
doShareKinship <- function(ped, kin, id, rmKinship=0){
    if(missing(ped) & missing(kin))
        stop("ped or kin has to be submitted!")
    if(missing(id))
        stop("At least one id has to be submitted!")
    id <- as.character(id)
    ## calculate kin from ped
    if(missing(kin)){
        kin <- kinship(id=ped$id, momid=ped$mother, dadid=ped$father,
                    sex=ped$sex)
    }
    if(!all(id %in% colnames(kin)))
        stop("id has to be present in ped or kin!")
    ## subset to id (which can be more than one)
    kin <- kin[id, , drop=FALSE]
    related <- colnames(kin)[colSums(kin>rmKinship)>0]
    related
}

####============================================================
##  doGetFounders
##
##  Returns the ids of the founders in the specified pedigree.
####------------------------------------------------------------
doGetFounders <- function(ped){
    ped[is.na(ped$mother) & is.na(ped$father), "id"]
}

####============================================================
##  doGetSingletons
##
##  Return the ids of those founders that have no child in the
##  pedigree.
####------------------------------------------------------------
doGetSingletons <- function(ped){
    founders <- ped[is.na(ped$mother) & is.na(ped$father), , drop=FALSE]
    if(nrow(founders) > 0){
        childl <- !((founders$id %in% ped$father) |
                    (founders$id %in% ped$mother))
        if(any(childl)){
            return(founders[childl, "id"])
        }
    }
    character()
}


## gets all parents.
getParents <- function(ped, id){
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    ped <- checkPedCol(ped)
    if(missing(id))
        stop("id is missing!")
    if(is.null(rownames(ped)))
        rownames(ped) <- ped[, "id"]
    id <- as.character(id)
    if(!is.data.frame(ped))
        ped <- as.data.frame(ped)
    ped[id, c("father", "mother"), drop=FALSE]
}


## just a simple helper function that checks that the ped parameter
## has the required colnames, is a data.frame and has rownames set.
checkPedCol <- function(ped){
    if(missing(ped))
        stop("ped is required!")
    if(!all(c("id", "father", "mother") %in% colnames(ped)))
        stop("id, father, mother are required columns in ped.")
    if(!is.data.frame(ped))
        ped <- as.data.frame(ped)
    ## add a family column if not present...
    if(!any(colnames(ped) == "family"))
        ped <- cbind(ped, family=rep(1, nrow(ped)))
    rownames(ped) <- as.character(ped$id)
    ped
}

## Sanitize the pedigree the way we want: sex should be sanitized (if present),
## father and mother column should contain NA for founders instead of "" or 0.
sanitizePed <- function(ped){
    if(any(colnames(ped) == "sex"))
        ped$sex <- sanitizeSex(ped$sex)
    Father <- ped$father
    if(is.factor(Father))
        Father <- as.character(Father)
    if(is.numeric(Father))
        Father[which(Father == 0)] <- NA
    if(is.character(Father))
        Father[which(Father == "")] <- NA
    Mother <- ped$mother
    if(is.factor(Mother))
        Mother <- as.character(Mother)
    if(is.numeric(Mother))
        Mother[which(Mother == 0)] <- NA
    if(is.character(Mother))
        Mother[which(Mother == "")] <- NA
    ped$father <- Father
    ped$mother <- Mother
    ped
}

## Add density from values x to a density d computed by a previous call to this
## function. Make the initial call by omitting d.
## Resulting density of the partitioned dataset is a (pretty decent)
## approximation of the density obtained when submitting the whole dataset.
inc.density <- function(x, d = NULL)
{
    if( is.null(d) ) {
        ## adjust has been determined empirically.
        dd <- density(x = x, n = 512, adjust = 0.75)
    } else {
        if( length(d$x)!=512 )
            stop("Argument d has not been created by a call to this function.")
        ## Determine the density data structure that has the larger domain and
        ## add interpolated density values from the other data structure. By
        ## this, the domain can grow and will never omit values that appear
        ## later in the iterative process.
        dd <- density(x, bw = d$bw, n = 512)
        if( dd$x[512]<d$x[512] ) {
            tmp <- dd; dd <- d; d <- tmp; rm(tmp)
        }
        delta <- dd$x[2] - dd$x[1]
        f <- approxfun(d$x, d$y)
        y <- f(dd$x)
        y[is.na(y)] <- 0
        dd$y <- dd$y + y
        dd$y <- dd$y/(delta*sum(dd$y))
        dd$n <- dd$n + d$n
    }
    dd
}

## Add histogram from values x to a histogram h computed by a previous call to
## this function. Additionally, reports the number of values that produced an
## overflow in variable nreplace (i.e. values did were too large to be included
## in any bin of the histogram h). Make the initial call by omitting h.
##
## Resulting histogram of the partitioned dataset is a (pretty decent)
## approximation of the histogram obtained when submitting the whole dataset.
## Drawback: the domain of the histogram cannot be changed during addition of
## new values. We therefore rely on the first batch as a random sample of the
## population and estimate the overall domain.
inc.hist <- function(x, h = NULL)
{
    if( is.null(h) ) {
        ## Empirically determined overall size of the domain (110%).
        hh <- hist(x, breaks = 1/127*(0:127)*max(x)*1.1, plot = FALSE)
        hh$nreplace <- 0
    } else {
        if( is.null(h$nreplace) )
            stop("Argument h has not been created by a call to this function.")
        maxv <- h$breaks[length(h$breaks)]
        nreplace <- sum(x>maxv)
        if( nreplace>0 )
            x[x>maxv] <- maxv
        hh <- hist(x, breaks = h$breaks, plot = FALSE)
        hh$counts <- hh$counts + h$counts
        hh$density <- (hh$counts/sum(hh$counts))/diff(hh$breaks)
        hh$nreplace <- h$nreplace + nreplace
    }
    hh
}

## Add tabulated data from values x to a table t computed by a previous call to
## this function. Make the initial call by omitting t.
## This function produces identical results when used in the iterative way
## compared to submitting the whole dataset at once.
inc.table <- function(x, t = NULL)
{
    if( is.null(t) ) {
        tt <- table(x)
    } else {
        tt <- table(x)
        both <- intersect(names(t), names(tt))
        tt[both] <- tt[both] + t[both]
        tonly <- setdiff(names(t), names(tt))
        tt[tonly] <- t[tonly]
    }
    tt
}

## Convert 1D count table with only integers as counted objects to a histogram.
table2hist.int <- function(t)
{
    ## Only 1D tables have names(), for higher dimensional tables, it is set to
    ## NULL. dimnames() OTH works opposite.
    nms <- names(t)
    stopifnot("Table must be one-dimensional" = length(nms)>=1)
    suppressWarnings( bins <- as.integer(names(t)) )
    stopifnot("Table can be integer only." = all(as.character(bins)==nms))
    h <- list(breaks = c(bins, max(bins) + 1),
              counts = as.vector(t),
              mids = bins + 0.5,
              xname = "generic",
              equidist = TRUE)
    attr(h, "class") <- "histogram"
    h
}


#' find the subPedigree (smallest pedigree) including the individuals specified
#' with id and all eventually needed additionals
#'
#' @param ped pedigree `data.frame`.
#'
#' @param id `character` with the IDs of the individuals.
#'
#' @param all `logical(1)`
#'
#' @md
#'
#' @noRd
#'
#' @author Johannes Rainer
subPedigree <- function(ped, id = NULL, all = TRUE) {
    CL <- class(ped)
    ## handling for pedigree and pedigreeList classes: unfortunately these
    ## are not correct classes, otherwise I could define methods instead of
    ## a function...
    if (is(ped, "pedigree") | is(ped, "pedigreeList")) {
        ped <- ped2df(ped)
    }
    ped <- checkPedCol(ped)
    if (is.null(id))
        stop("id is missing!")
    id <- as.character(id)
    subgr <- connectedSubgraph(ped2graph(ped), nodes = id, mode = "all",
                               all.nodes = all, ifnotfound = NA)
    if (!is(subgr, "igraph")) {
        stop("No common pedigree for the submitted individuals found!")
    }
    inds <- names(V(subgr))
    ## get missing mates
    inds <- unique(c(inds, doGetMissingMate(ped, id = inds)))
    subped <- ped[inds, ]
    subped <- removeSingletons(subped)
    ## fix founders.
    subped[!(subped$father %in% subped$id), "father"] <- NA
    subped[!(subped$mother %in% subped$id), "mother"] <- NA
    ## Fix if we have individuals with a mother or a father, but not both.
    idx <- which(is.na(subped$father) != is.na(subped$mother))
    if (length(idx)) {
        idx_ped <- match(subped$id[idx], ped$id)
        id_msng <- ped$mother[idx_ped][is.na(subped$mother[idx])]
        id_msng <- c(id_msng, ped$father[idx_ped][is.na(subped$father[idx])])
        subped$father[idx] <- ped$father[idx_ped]
        subped$mother[idx] <- ped$mother[idx_ped]
        if (length(id_msng)) {
            subped <- rbind(subped, ped[match(id_msng, ped$id), ])
            subped[!(subped$father %in% subped$id), "father"] <- NA
            subped[!(subped$mother %in% subped$id), "mother"] <- NA
        }
    }
    if(CL == "pedigree" | CL == "pedigreeList")
        subped <- df2ped(subped)
    subped
}


## find the (eventually smallest) connected subgraph of all specified
## nodes.
## graph: igraph object.
## nodes: node (vertex) names.
## mode: to calculate the smallest distance.
## all.nodes: if all nodes should be present in the resulting graph.
## ifnotfound: by default, the function throws an error if no connected
##             subgraph is found, or, if all.nodes=TRUE, not all input nodes
##             are in the connected subgraph. If ifnotfound is specified, its
##             value is returned instead of an error.
connectedSubgraph <- function(graph, nodes, mode="all", all.nodes=TRUE,
                              ifnotfound){
    if(!is(graph, "igraph"))
        stop("graph has to be an igraph object!")
    mode <- match.arg(mode, c("all", "in", "out"))
    if(missing(nodes) | length(nodes) < 2)
        stop("At least two nodes have to be submitted!")
    nodes <- as.character(nodes)
    ## check if nodes are present in the graph
    nono <- !(nodes %in% names(V(graph)))
    if(any(nono)){
        missingNodes <- nodes[nono]
        if(all.nodes)
            stop("With argument all.nodes=TRUE all nodes have to be present ",
                 "in the graph! Nodes ", paste(missingNodes, collapse=", "),
                 " not present in the original graph.")
        nodes <- nodes[!nono]
        warning("Nodes ", paste(missingNodes, collapse=", "),
                " removed as they are not present in graph! ")
        if(length(nodes) < 2)
            stop("At least two nodes have to be present in the graph!")
    }
    ## OK, now we can start...
    ## 1) iterate over all nodes and calculate the shortest path between all.
    ## 2) get all nodes along the shortest path and add to the needed nodes.
    needNodes <- character()
    ## two versions: if mode="all" the directionality does not matter, thus we
    ## can run a faster version. for the other modes really calculate distances
    ## between all (both directions).
    if(mode == "all"){
        for(i in 1:(length(nodes)-1)){
            suppressWarnings(
                paths <- shortest_paths(graph, from=nodes[i],
                                        to=nodes[(i+1):length(nodes)],
                                        mode=mode)
            )
            needNodes <- unique(c(needNodes,
                                  unlist(lapply(paths$vpath, names))))
        }
    }else{
        for(i in 1:length(nodes)){
            suppressWarnings(
                paths <- shortest_paths(graph, from=nodes[i],
                                        to=nodes[-i],
                                        mode=mode)
            )
            needNodes <- unique(c(needNodes,
                                  unlist(lapply(paths$vpath, names))))
        }
    }
    if(length(needNodes)==0){
        if(missing(ifnotfound)){
            stop("Could not find any connected subgraph!")
        }
        warning("Could not find any connected subgraph!")
        return(ifnotfound)
    }
    ## check the subgraph.
    if(!all(nodes %in% needNodes)){
        if(all.nodes){
            if(missing(ifnotfound)){
                stop("Could not find any connected subgraph!")
            }
            warning("Could not find any connected subgraph!")
            return(ifnotfound)
        }else{
            warning("Nodes ", paste(nodes[!(nodes %in% needNodes)]),
                    " not present in the returned graph!")
        }
    }
    ## finally build the graph...
    ## subgraph is a little tedious as it requires submitting of vertex INDICES!
    induced_subgraph(graph, vids=needNodes)
}

## search for a common ancestor couple in a pedigree.
## ped: data.frame or pedigree or pedigreeList.
doGetCommonAncestor <- function(ped, id, method="min.dist"){
    method <- match.arg(method, c("min.dist", "smallest.mean.dist"))
    if(missing(id))
        stop("id has to be specified")
    if(length(id) <= 1)
        stop("The minimum number of ids has to be 2.")
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    ped <- checkPedCol(ped)
    ## check if id are in ped:
    id <- as.character(id)
    if(!all(id %in% rownames(ped))){
        stop("all of the sumitted ids should be present in the pedigree!")
    }
    ## check if the ids are in the same family
    if(any(colnames(ped)=="family")){
        if(length(unique(ped[id, "family"])) > 1){
            warning("The provided ids are from different families; still",
                    " trying to find a common ancestor but I'll probably fail.")
        }else{
            ## subset the pedigree to this family...
            ped <- ped[ped$family == ped[id, "family"][1], , drop=FALSE]
        }
    }
    ## OK; done with input parameter checking.
    ## transform the pedigree into a graph
    igr <- ped2graph(ped)
    ## "in": go from children -> parents
    ## LLLL TODO can do faster!!!
    din <- distances(igr, mode="in")
    ## use the ROWS in that.
    subD <- din[id, , drop=FALSE]
    subD <- subD[, !apply(subD, MARGIN=2, function(x){
        return(all(infOrZero(x)))
    }) , drop=FALSE]
    ## check if we've got one common ancestor:
    if(all(is.infinite(colSums(subD)))){
        warning("Did not find any common ancestor!")
        return(NA)
    }
    ## remains to find the "one and only" (well, usually we expect a couple).
    subD <- subD[, !is.infinite(colSums(subD)), drop=FALSE]
    ## now get the ones that are closest:
    if(method == "min.dist"){
        dists <- apply(subD, MARGIN=2, min)
        ancs <- names(dists)[dists==min(dists)]
    }
    if(method == "smallest.mean.dist"){
        dists <- colMeans(subD)
        ancs <- names(dists)[dists==min(dists)]
    }
    ## at last trying to get the mate... if any; usually, ancs should
    ## already correspond to the ancestor couple.
    ancs <- c(ancs, doGetMissingMate(ped, ancs))
    unique(ancs)
}

## returns true if a value is 0 or infinite
infOrZero <- function(x){
    (is.infinite(x) | x == 0)
}

## transform a pedigree into a (directed) igraph
## direction is always parent->child.
## this time each individual is treated as a single node!
ped2graph <- function(ped){
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    ped <- checkPedCol(ped)
    ## define end/starting points
    ped[!ped[, "father"] %in% ped$id, "father"] <- NA
    ped[!ped[, "mother"] %in% ped$id, "mother"] <- NA
    eds <- apply(ped[, c("id", "father", "mother")], MARGIN=1, function(x){
        retval <- c()
        if(!is.na(x["father"]))
            retval <- x[c("father", "id")]
        if(!is.na(x["mother"]))
            retval <- c(retval, x[c("mother", "id")])
        return(retval)
    })
    ## what the heck... I need character vertex names!
    Edges <- unlist(eds, use.names=FALSE)
    igr <- graph(edges=as.character(Edges))
    igr
}


## takes a pedigree as input and tries to identify the founders
## in case there are more than two founder pairs (i.e. founders with the same,
## largest number of offspring generations), it returns only one of them.
doFindFounders <- function(ped, family, id){
    if(!any(colnames(ped) == "family"))
        ped <- data.frame(ped, family=rep(1, nrow(ped)), stringsAsFactors=FALSE)
    families <- unique(as.character(ped$family))
    if(missing(family))
        family <- NULL
    if (missing(id))
        id <- NULL
    if(is.null(family) & is.null(id)){
        family <- families[1]
        warning("No family specified; using the first one in the pedigree: ",
                family, ".")
    }
    if (!is.null(family))
        ped <- ped[ped$family == family, , drop=FALSE]
    if (!is.null(id)) {
        return(doFindFoundersForId(ped = ped, id = id))
    } else {
        founders <- ped$id[which(is.na(ped$father) & is.na(ped$mother))]
        ## determine for each founder the number of generations of its children,
        ## grandchildren etc.
        founderGen <- doCountGenerations(ped, id=founders, direction="down")
        founders <- names(founderGen)[founderGen == max(founderGen)][1]
        founders <- c(founders, doGetMissingMate(ped, founders))
        return(unique(founders))
    }
}

##' Find founders for the pedigree of a given individual. Have to first build
##' the largest pedigree for the given individual and identify the founders in
##' that pedigree.
##'
##' @note The function returns only a single founder pair, i.e. the one with the
##' largest number of offspring generations.
##' @noRd
doFindFoundersForId <- function(ped, id) {
    if (length(id) > 1) {
        id <- id[1]
        warning("length of 'id' was > 1, but only the first id was selected!")
    }
    ## Check if we've got this id in the pedigree
    if (!any(ped[, "id"] == id))
        stop("Individual with id '", id, "' not found in the pedigree!")
    ## Building the pedigree for that individual.
    ids <- unique(c(doGetAncestors(ped = ped, id = id, maxlevel = 200), id))
    ids <- unique(c(ids, doGetChildren(ped = ped, id = ids, maxlevel = 200)))
    ids <- unique(c(ids, doGetMissingMate(ped, id = ids)))
    ped <- ped[ped[, "id"] %in% ids, , drop = FALSE]
    ## OK, now we have the pedigree for the individual, now find founders.
    founders <- ped[which(is.na(ped[, "father"]) &
                          is.na(ped[, "mother"])), "id"]
    ## determine for each founder the number of generations of its children,
    ## grandchildren etc.
    founderGen <- doCountGenerations(ped, id = founders, direction = "down")
    founders <- names(founderGen)[founderGen == max(founderGen)][1]
    founders <- c(founders, doGetMissingMate(ped, founders))
    unique(founders)
}

## calculate for each id the number of generations up- or down.
doCountGenerations <- function(ped, id=NULL, direction="down"){
    if(is.null(id))
        stop("At least one id has to be specified!")
    direction <- match.arg(direction, c("up", "down"))
    if(direction == "up")
        walkFun <- doGetAncestors
    if(direction == "down")
        walkFun <- doGetChildren
    res <- sapply(id, function(x){
        generation <- 0
        ids <- x
        repeat{
            ids <- walkFun(ped, id=ids, maxlevel=1)
            if(length(ids) == 0){
                return(generation)
            }else{
                generation <- generation + 1
            }
        }
    })
    names(res) <- id
    res
}

## returns a list of generations for each family in the pedigree.
## generations are relative to the founders.
doEstimateGenerationsFor2 <- function(ped, family=NULL){
    if(!is.null(family)){
        ped <- ped[ped$family %in% family, ]
        if(nrow(ped) == 0)
            stop("The submitted family id is not present in the pedigree!")
    }
    ## OK, now loop through the families...
    pedList <- split(ped, ped$family)
    Gens <- lapply(pedList, function(currentPed){
        founders <- doFindFounders(currentPed, family=currentPed[1, "family"])
        generation <- doGetGenerationFrom2(currentPed, founders[1])
        return(generation)
    })
    Gens
}

getAllSibsNMates <- function(ped, id){
    ## recursively fetch siblings and mates
    sibsNmates <- id
    repeat{
        sibs <- unique(c(id, doGetSiblings(ped, id=id)))
        sibsNmates <- unique(c(doGetMissingMate(ped, id=sibs), sibs))
        if(length(sibsNmates) != length(id)){
            id <- sibsNmates
        }else{
            break
        }
    }
    sibsNmates
}

doGetGenerationFrom2 <- function(ped, id, generation=0){
    if(missing(id))
        stop("id has to be specified!")
    if(missing(ped))
        stop("ped has to be specified!")
    id <- as.character(id)
    if(length(id) > 1){
        id <- id[1]
        warning("Length of 'id' is larger 1. Using only the first id ",
                "in 'id': ", id)
    }
    if(!any(ped$id %in% id))
        stop("Can not find id in the pedigree!")
    famid <- ped[ped$id == id, "family"]
    ped <- ped[ped$family == famid, , drop=FALSE]
    ## first define the connected!!! pedigree.
    pedg <- ped2graph(ped)
    if(clusters(pedg)$no > 1){
        Cl <- clusters(pedg)
        warning("Found ", Cl$no, " connected sub-graphs in the pedigree ",
                "for family ", famid, "! Will restrict the analysis to the ",
                "subgraph containing individual ", id, ".")
        vertices <- names(Cl$membership)[Cl$membership == Cl$membership[id]]
        pedg <- induced_subgraph(pedg, vids=vertices)
    }
    individuals <- names(V(pedg))
    gens4individuals <- rep(NA, length(individuals))
    names(gens4individuals) <- individuals
    if(!any(names(gens4individuals) == id))
        stop("Individual ", id, " is not part of the connected pedigree!")
    gens4individuals[id] <- 0
    fromId <- id
    Tested <- rep(FALSE, length(gens4individuals))
    repeat{
        ## get the generation of the individuals from which we start
        fromIdBool <- individuals %in% fromId
        Tested[fromIdBool] <- TRUE
        currentGens <- gens4individuals[fromIdBool]
        currentGens <- split(currentGens, currentGens)
        Res <- lapply(currentGens, allids=individuals, FUN=function(z, allids){
            ## get the ancestors and set the generation for them
            ancs <- getAncestors(ped, id=names(z), max.generations=1)
            if(length(ancs) > 0){
                ancret <- cbind(which(allids %in% ancs), (z[1] - 1))
            }else{
                ancret <- NULL
            }
            ## get the children
            kids <- getChildren(ped, id=names(z), max.generation=1)
            if(length(kids) > 0){
                kidret <- cbind(which(allids %in% kids), (z[1] + 1))
            }else{
                kidret <- NULL
            }
            return(rbind(ancret, kidret))
        })
        Res <- do.call(rbind, Res)
        if(nrow(Res) == 0){
            warning("What the heck... stopped before we got all?")
            break
        }
        ## don't want to search the same id several times... if Tested is
        ## TRUE, don't use them again.
        toTestIdx <- Res[, 1][!Tested[Res[, 1]]]
        fromId <- individuals[toTestIdx]
        gens4individuals[Res[, 1]] <- Res[, 2]
        if(sum(is.na(gens4individuals)) == 0)
            break
    }
    ## make a larger generation vector.
    allgens <- rep(NA, nrow(ped))
    names(allgens) <- ped$id
    allgens[names(gens4individuals)] <- gens4individuals
    allgens
}

##********************************************************************
##
##   Utility functions
##
##********************************************************************
sanitizeSex <- function(x){
    if(is.numeric(x)){
        ## require values 1, 2, NA
        numrange <- unique(x)
        if(!all(c(1, 2) %in% numrange))
            stop("If column sex is numeric it has to contain 1 (=male) ",
                 "and 2 (=female) values! Unknown or missing can be encoded",
                 " by NA or any other number than 1 or 2.")
        news <- rep(NA, length(x))
        news[which(x == 1)] <- "M"
        news[which(x == 2)] <- "F"
        return(factor(news, levels=c("M", "F")))
    }
    if(is.factor(x))
        x <- as.character(x)
    if(is.character(x)){
        ## remove all white spaces:
        x <- gsub(x, pattern="\\s", replacement="")
        ## that's tricky...kind of...
        ## just get the first character; want to have M and F...
        x <- substr(toupper(x), 1, 1)
        chars <- unique(x)
        if(length(chars) == 2){
            if(!all(c("M", "F") %in% chars))
                stop("Male and female individuals have to be represented",
                     " by 'M' and 'F', respectively!")
        }
        if(length(chars) == 1){
            if(!any(c("M", "F") %in% chars))
                stop("Male and female individuals have to be represented",
                     " by 'M' and 'F', respectively!")
        }
        if(length(chars[!is.na(chars)]) > 2)
            warning("All characters in column sex other than M and F",
                    " (or male, female etc) are set to NA!")
        news <- rep(NA, length(x))
        news[which(x == "M")] <- "M"
        news[which(x == "F")] <- "F"
        return(factor(news, levels=c("M", "F")))
    }
}


#####
#### NOTE:
##   isn't that the same as removeSingletons???
##
####
## remove non-connected individuals. We're assuming that ped is the pedigree
## of a SINGLE family, as we expect to get ONE graph. in case we've got 2, we're
## using the largest connected one.
## solveMultiGraph: what should we do if we get multiple graphs?
##    + use.all: just use all, i.e. return vertix names of all.
##    + use.largest: just return vertices of the largest connected sub-graph.
doPrunePed <- function(ped, addMissingMates=FALSE, solveMultiGraph="use.all"){
    .Deprecated("removeSingletons")
    solveMultiGraph <- match.arg(solveMultiGraph, c("use.all", "use.largest"))
    pedg <- ped2graph(ped)
    if(solveMultiGraph == "use.largest"){
        if(clusters(pedg)$no > 1){
            Cl <- clusters(pedg)
            warning("Found ", Cl$no,
                    " connected sub-graphs in the pedigree! Will restrict ",
                    "the analysis to the largest connected pedigree",
                    " in that family.")
            vertices <-
                names(Cl$membership)[Cl$membership ==
                                     as.numeric(sort(table(Cl$membership),
                                                     decreasing=TRUE))[1]]
            pedg <- induced_subgraph(pedg, vids=vertices)
        }
    }
    keepIds <- names(V(pedg))
    if(addMissingMates){
        missMates <- doGetMissingMate(ped, id=keepIds)
        keepIds <- unique(c(missMates, keepIds))
    }
    notThere <- !(as.character(ped$id) %in% keepIds)
    ped <- ped[!notThere, , drop=FALSE]
    if(any(notThere))
        warning("Had to remove ", sum(notThere),
                " un-connected individuals from the pedigree.")
    ped
}


## a singleton a.k.a childless founder is an individual which id is not
## present in the father or mother column of the pedigree and which has 0
## in mother and father
.removeSingletons <- function(ped){
    message("Removing singletons...", appendLF=FALSE)
    ped <- checkPedCol(ped)
    ped <- sanitizePed(ped)
    childlF <- doGetSingletons(ped)
    if(length(childlF) > 0){
        ped <- ped[!(ped$id %in% childlF), ]
        message(" ", length(childlF), " removed.")
    }else{
        message(" none present.")
    }
    ped
}

#' @title Extract pairs of individuals matching certain kinship criteria
#'
#' @name kinshipPairs
#'
#' @description
#'
#' The `kinshipPairs` function allows to extract pairs of individuals matching
#' a user-defined kinship *condition* (e.g. individuals with a kinship larger
#' than 0.0625). Such sets of paired individuals (along with paired unrelated
#' values) would enable a *familial resemblance* analysis on quantitative
#' traits (Ziegler 2010) (see examples below for details).
#'
#' By default, `kinshipPairs` returns all pairs of individuals for which the
#' `condition` on the kinship matrix matches (e.g. all pairs of individuals with
#' a kinship coefficient larger than or equal to 0.25). Individuals can thus
#' be reported multiple times (see examples below). Parameter `duplicates` can
#' be used to define a strategy to avoid such duplicated IDs. Supported are:
#'
#' - `duplicates = "keep"`: the default, return all values.
#' - `duplicates = "first"`: report only the first pair of individuals for each
#'   individual ID.
#' - `duplicates = "last"`: report only the last pair of individuals for each
#'   individual ID.
#' - `duplicates = "random"`: randomly select one pair of individuals for
#'   each individual ID.
#'
#' For any setting different than `duplicates = "keep"` each individual will
#' only be listed **once** in the resulting matrix.
#'
#' @param x A `FAData` object (or object inheriting from that).
#'
#' @param condition A `function` defining how individuals should be selected
#'     based on the object's kinship matrix. The default is to select all
#'     individuals with a kinship `>= 0.25`. Note that the diagonal of the
#'     kinship matrix (i.e. the kinship of individuals with itself) is always
#'     skipped, so no additional criteria is needed to avoid self-pairs.
#'
#' @param duplicates `character(1)` defining how to deal with duplicated IDs
#'     in the result returned by the function.
#'     See function description and examples below for more details. Defaults
#'     to `duplicates = "keep"` returning all pairs of IDs matching `condition`.
#'
#' @param family optional family identifiers if pairs should only defined for
#'     selected families. Defaults to `family = NULL` hence the full data set is
#'     considered.
#'
#' @param id optional identifiers of subsets of individuals on which the pairs
#'     should be defined. Defaults to `id = NULL` hence the full data set is
#'     considered.
#'
#' @return
#'
#' A two column `matrix` with the IDs (colnames/rownames of the kinship matrix
#' or as defined in `x$id`) of the pairs. If `duplicates` is either `"first"`,
#' `"last"` or `"random"` each ID is only returned once (i.e. no ID is reported
#' more than one time).
#'
#' @author Johannes Rainer
#'
#' @references
#'
#' Ziegler A., Koenig I. R. (2010). Familiality, Heristability, and Segregation
#' Analysis. In A Statistical Approach to Genetic Epidemiology: With Access to
#' E-Learning Platform by Friedrich Pahlke, Second Edition.
#' \doi{10.1002/9783527633654.ch6}.
#'
#' @seealso [PedigreeUtils] for other pedigree utility functions.
#'
#' @md
#'
#' @examples
#'
#' ##########################
#' ##
#' ##  Create a new FAData object
#' ##
#' ## Load the Minnesota Breast Cancer record and subset to the
#' ## first families.
#' data(minnbreast)
#' mbsub <- minnbreast[minnbreast$famid %in% 1:20, ]
#' mbped <- mbsub[, c("famid", "id", "fatherid", "motherid", "sex")]
#' ## Renaming column names
#' colnames(mbped) <- c("family", "id", "father", "mother", "sex")
#' ## Defining the optional argument age.
#' Age <- mbsub$endage
#' names(Age) <- mbsub$id
#' ## Create the object
#' fad <- FAData(pedigree=mbped, age=Age)
#'
#' ## Getting all pairs of individuals with a kinship coefficient >= 0.25
#' ## keeping all duplicates
#' rel_pairs <- kinshipPairs(fad)
#' head(rel_pairs)
#' ## As we see, we have multiple times the individual 1 etc.
#'
#' ## For an actual correlation analysis it would be better to drop duplicates.
#' ## Below we randomly select individual pairs if they occurr multiple times
#' rel_pairs <- kinshipPairs(fad, duplicates = "random")
#' head(rel_pairs)
#'
#' ## In addition we extract pairs of individuals that are much less related.
#' ## For this examples we consider all individuals with a kinship
#' ## coefficient < 0.03125 (second cousin) to be *unrelated*.
#' unrel_pairs <- kinshipPairs(fad, duplicates = "random",
#'     condition = function(z) z < 0.03125)
#' head(unrel_pairs)
#'
#' ## For a familial resemblance analysis we can now calculate the correlation
#' ## coefficient of a quantitative trait between pairs of related individuals
#' ## and compare that with the correlation coefficient calculated on unrelated
#' ## individuals. For our toy example we use the participant's age, since we
#' ## don't have any other quantitative values available.
#' cor_rel <- cor(age(fad)[rel_pairs[, 1]], age(fad)[rel_pairs[, 2]],
#'     use = "pairwise.complete.obs")
#'
#' cor_unrel <- cor(age(fad)[unrel_pairs[, 1]], age(fad)[unrel_pairs[, 2]],
#'     use = "pairwise.complete.obs")
#' cor_rel
#' cor_unrel
#'
#' ## We don't see a clear difference in the correlation, thus, the age (as
#' ## expected) has no familial component.
kinshipPairs <- function(x, condition = function(x) x >= 0.25,
                         duplicates = c("keep", "first", "last", "random"),
                         id = NULL, family = NULL) {
    duplicates <- match.arg(duplicates)
    if (!inherits(x, "FAData"))
        stop("'kinshipPairs' is currently only implemented for 'FAData'")
    if (length(family)) {
        id <- x$id[x$family %in% family]
        if (!length(id))
            stop("No individuals for the provided families found")
    }
    ks <- kinship(x)
    if (length(id)) {
        is_id <- rownames(ks) %in% id
        if (!any(is_id))
            stop("None of the specified individuals found")
        ks <- ks[is_id, is_id, drop = FALSE]
    }
    res <- condition(ks)
    if (!length(dim(res)) || ncol(ks) != ncol(res) ||
        nrow(ks) != nrow(res) || !is.logical(res[1, 1]))
        stop("'condition' function did not return a 'logical' 'matrix' with ",
             "dimensions matching the kinship matrix in 'x'")
    res[lower.tri(res, diag = TRUE)] <- NA
    keep <- which(res, arr.ind = TRUE, useNames = FALSE)

    if (nrow(keep)) {
        if (duplicates != "keep") {
            if (duplicates == "first")
                dupl_fun <- function(z) z[1, , drop = FALSE]
            if (duplicates == "last")
                dupl_fun <- function(z) z[nrow(z), , drop = FALSE]
            if (duplicates == "random")
                dupl_fun <- function(z)
                    z[sample(seq_len(nrow(z)), 1), , drop = FALSE]
            uids <- unique(as.integer(keep))
            to_test <- rep(TRUE, nrow(keep))
            res <- matrix(ncol = 2, nrow = length(uids))
            for (i in seq_along(uids)) {
                id <- uids[i]
                idx <- which((keep[, 1] == id | keep[, 2] == id) & to_test)
                if (length(idx)) {
                    pair <- dupl_fun(keep[idx, , drop = FALSE])
                    ## Ensure we're not reporting these individuals again.
                    to_test[keep[, 1] %in% pair | keep[, 2] %in% pair] <- FALSE
                    res[i, ] <- pair
                }
            }
            res <- res[!is.na(res[, 1]), , drop = FALSE]
            unname(cbind(rownames(ks)[res[, 1]], rownames(ks)[res[, 2]]))
        } else unname(cbind(rownames(ks)[keep[, 1]], rownames(ks)[keep[, 2]]))
    } else matrix(nrow = 0, ncol = 2)
}
EuracBiomedicalResearch/FamAgg documentation built on March 12, 2023, 7:45 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
EuracBiomedicalResearch/FamAgg
Pedigree Analysis and Familial Aggregation

R/utils.R
In EuracBiomedicalResearch/FamAgg: Pedigree Analysis and Familial Aggregation

Documented in connectedSubgraph kinshipPairs ped2graph subPedigree

R Package Documentation

Browse R Packages

We want your feedback!

EuracBiomedicalResearch/FamAgg Pedigree Analysis and Familial Aggregation

R/utils.R In EuracBiomedicalResearch/FamAgg: Pedigree Analysis and Familial Aggregation

Documented in connectedSubgraph kinshipPairs ped2graph subPedigree

R Package Documentation

Browse R Packages

We want your feedback!

EuracBiomedicalResearch/FamAgg
Pedigree Analysis and Familial Aggregation

R/utils.R
In EuracBiomedicalResearch/FamAgg: Pedigree Analysis and Familial Aggregation