R/utils.R

Defines functions kinshipPairs .removeSingletons doPrunePed sanitizeSex doGetGenerationFrom2 getAllSibsNMates doEstimateGenerationsFor2 doCountGenerations doFindFoundersForId doFindFounders ped2graph infOrZero doGetCommonAncestor connectedSubgraph subPedigree table2hist.int inc.table inc.hist inc.density sanitizePed checkPedCol getParents doGetSingletons doGetFounders doShareKinship df2ped ped2df writePed doGetMissingMate doGetChildren doGetParents doGetAncestors doGetSiblings buildPedigree noColNames

Documented in connectedSubgraph kinshipPairs ped2graph subPedigree

## just simple utility function to test for empty/not set column names
noColNames <- function(x){
    CN <- colnames(x)
    all(CN %in% paste("V", 1:ncol(x), sep=""))
}

## takes a pedigree (data.frame, pedigree or pedigreeList) as input and
## generates a valid pedigree for the submitted ids (if no ids are specified
## it returns the ped). basically, the function subsets the full pedigree to
## the individuals ids and recursively adds parents if needed.
buildPedigree <- function(ped, ids){
    ## check pedigree and eventually add missing parents...
    if(missing(ids))
        return(ped)
    ## want to have a data.frame or matrix...
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    if(!all(ids %in% rownames(ped)))
        stop("Some of the submitted ids are not in the pedigree!")
    subfam <- ped[as.character(ids), ]
    ## now recursively add parents, if needed...
    parents <- as.character(unique(c(subfam$father, subfam$mother)))
    ## parents[is.na(parents)] <- 0
    ## parents <- parents[parents!=0]
    parents <- parents[!is.na(parents)]
    while(!all(parents %in% as.character(subfam$id))){
        subfam <- ped[unique(c(as.character(subfam$id), parents)), ]
        parents <- as.character(unique(c(subfam$father, subfam$mother)))
        ## parents[is.na(parents)] <- 0
        ## parents <- parents[parents!=0]
        parents <- parents[!is.na(parents)]
    }
    subfam
}


doGetSiblings <- function(ped, id = NULL) {
    if(is.null(id))
        stop("At least one if has to be specified!")
    parents <- doGetParents(ped, id = id)
    siblings <- doGetChildren(ped, id = parents, maxlevel = 1)
    siblings
}

## gets all the ancestors for (one or more) id(s)
doGetAncestors <- function(ped, id = NULL, maxlevel = 3) {
    if (is.null(id))
        stop("At least one id has to be specified!")
    id <- as.character(id)
    allids <- NULL
    for (i in 1:maxlevel) {
        ## get the parents of all the allids
        newids <- unlist(
            ped[which(ped$id %in% id), c("father", "mother")],
            use.names = FALSE
        )
        zeros <- is.na(newids)
        ##zeros <- newids == founder
        if(all(zeros) | length(newids)==0)
            break
        newids <- newids[!zeros]
        id <- newids
        allids <- c(allids, newids)
    }
    as.character(unique(allids))
}

doGetParents <- function(ped, id = NULL) {
    doGetAncestors(ped = ped, id = id, maxlevel = 1)
}

## gets all the children for (one or more) id(s)
## note: these are only children in direct blood line.
doGetChildren <- function(ped, id=NULL, maxlevel=16){
    if(is.null(id))
        stop("At least one id has to be specified!")
    allids <- NULL
    for(i in 1:maxlevel){
        ## get all the children for the ids
        newids <- ped[which(ped$father %in% id | ped$mother %in% id), "id"]
        if(length(newids)==0)
            break
        id <- newids
        allids <- c(allids, newids)
    }
    as.character(unique(allids))
}

## check for the submitted id(s) whether a mate (spouse) is
## available (in column mother or father of the pedigree ped)
## and if yes add them to the returned list of ids.
## returned value contains
## eventually missing mates.
doGetMissingMate <- function(ped, id) {
    if(is.null(id))
        stop("Al least one id has to be specified")
    newids <- c(ped[which(ped$father %in% id), "mother"],
                ped[which(ped$mother %in% id), "father"])
    unique(newids)
}

writePed <- function(ped, file){
    Sexes <- as.numeric(sanitizeSex(ped$sex))
    Sexes[is.na(Sexes)] <- 0
    ped$sex <- Sexes
    write.table(ped, file=file, quote=FALSE, sep="\t", row.names=FALSE,
                col.names=FALSE)
}

## creates a data.frame (or matrix) for a pedigree...
ped2df <- function(ped){
    ped <- do.call(cbind, ped)
    if(!any(colnames(ped)=="famid"))
        ped <- cbind(ped, famid=rep(1, nrow(ped)))
    ped <- ped[ , c("famid", "id", "findex", "mindex", "sex")]
    ## FIXME!!! Can I get NAs as findex too?
    ## have to rematch the fater id and mother id...
    notZ <- which(ped[, "findex"] > 0)
    ped[notZ, "findex"] <- ped[ped[notZ, "findex"], "id"]
    notZ <- which(ped[, "findex"] > 0)
    ped[notZ, "mindex"] <- ped[ped[notZ, "mindex"], "id"]
    colnames(ped) <- .PEDCN
    ped <- data.frame(ped)
    rownames(ped) <- ped$id
    ped
}


df2ped <- function(ped){
    ped <- checkPedCol(ped)
    if(!any(colnames(ped)=="family"))
        ped <- cbind(ped, family=rep(1, nrow(ped)))
    ## if there is only one family, remove that one ;)
    if(length(unique(ped$family)) == 1)
        ped <- ped[, colnames(ped)!="family"]
    ##
    if(any(colnames(ped) == "affected")){
        if(any(colnames(ped) == "family")){
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother, famid=ped$family,
                                        sex=as.numeric(ped$sex),
                                        affected=ped$affected)
        }else{
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother,
                                        sex=as.numeric(ped$sex),
                                        affected=ped$affected)
        }
    }else{
        if(any(colnames(ped) == "family")){
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother, famid=ped$family,
                                        sex=as.numeric(ped$sex))
        }else{
            peddi <- kinship2::pedigree(id=ped$id, dadid=ped$father,
                                        momid=ped$mother,
                                        sex=as.numeric(ped$sex))
        }
    }
    peddi
}


## finds related individuals, i.e. individuals sharing the same blood line.
## should we call this "shareBloodLine" or "shareKinship"
## ped has to be the data.frame that we get with the pedigree method.
## Use `rmKinship` for fine control: omit all individuals with kinship less
## than or equal to this paramter. Since it defaults to 0, all individuals
## sharing blood line will be reported by default.
doShareKinship <- function(ped, kin, id, rmKinship=0){
    if(missing(ped) & missing(kin))
        stop("ped or kin has to be submitted!")
    if(missing(id))
        stop("At least one id has to be submitted!")
    id <- as.character(id)
    ## calculate kin from ped
    if(missing(kin)){
        kin <- kinship(id=ped$id, momid=ped$mother, dadid=ped$father,
                    sex=ped$sex)
    }
    if(!all(id %in% colnames(kin)))
        stop("id has to be present in ped or kin!")
    ## subset to id (which can be more than one)
    kin <- kin[id, , drop=FALSE]
    related <- colnames(kin)[colSums(kin>rmKinship)>0]
    related
}

####============================================================
##  doGetFounders
##
##  Returns the ids of the founders in the specified pedigree.
####------------------------------------------------------------
doGetFounders <- function(ped){
    ped[is.na(ped$mother) & is.na(ped$father), "id"]
}

####============================================================
##  doGetSingletons
##
##  Return the ids of those founders that have no child in the
##  pedigree.
####------------------------------------------------------------
doGetSingletons <- function(ped){
    founders <- ped[is.na(ped$mother) & is.na(ped$father), , drop=FALSE]
    if(nrow(founders) > 0){
        childl <- !((founders$id %in% ped$father) |
                    (founders$id %in% ped$mother))
        if(any(childl)){
            return(founders[childl, "id"])
        }
    }
    character()
}


## gets all parents.
getParents <- function(ped, id){
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    ped <- checkPedCol(ped)
    if(missing(id))
        stop("id is missing!")
    if(is.null(rownames(ped)))
        rownames(ped) <- ped[, "id"]
    id <- as.character(id)
    if(!is.data.frame(ped))
        ped <- as.data.frame(ped)
    ped[id, c("father", "mother"), drop=FALSE]
}


## just a simple helper function that checks that the ped parameter
## has the required colnames, is a data.frame and has rownames set.
checkPedCol <- function(ped){
    if(missing(ped))
        stop("ped is required!")
    if(!all(c("id", "father", "mother") %in% colnames(ped)))
        stop("id, father, mother are required columns in ped.")
    if(!is.data.frame(ped))
        ped <- as.data.frame(ped)
    ## add a family column if not present...
    if(!any(colnames(ped) == "family"))
        ped <- cbind(ped, family=rep(1, nrow(ped)))
    rownames(ped) <- as.character(ped$id)
    ped
}

## Sanitize the pedigree the way we want: sex should be sanitized (if present),
## father and mother column should contain NA for founders instead of "" or 0.
sanitizePed <- function(ped){
    if(any(colnames(ped) == "sex"))
        ped$sex <- sanitizeSex(ped$sex)
    Father <- ped$father
    if(is.factor(Father))
        Father <- as.character(Father)
    if(is.numeric(Father))
        Father[which(Father == 0)] <- NA
    if(is.character(Father))
        Father[which(Father == "")] <- NA
    Mother <- ped$mother
    if(is.factor(Mother))
        Mother <- as.character(Mother)
    if(is.numeric(Mother))
        Mother[which(Mother == 0)] <- NA
    if(is.character(Mother))
        Mother[which(Mother == "")] <- NA
    ped$father <- Father
    ped$mother <- Mother
    ped
}

## Add density from values x to a density d computed by a previous call to this
## function. Make the initial call by omitting d.
## Resulting density of the partitioned dataset is a (pretty decent)
## approximation of the density obtained when submitting the whole dataset.
inc.density <- function(x, d = NULL)
{
    if( is.null(d) ) {
        ## adjust has been determined empirically.
        dd <- density(x = x, n = 512, adjust = 0.75)
    } else {
        if( length(d$x)!=512 )
            stop("Argument d has not been created by a call to this function.")
        ## Determine the density data structure that has the larger domain and
        ## add interpolated density values from the other data structure. By
        ## this, the domain can grow and will never omit values that appear
        ## later in the iterative process.
        dd <- density(x, bw = d$bw, n = 512)
        if( dd$x[512]<d$x[512] ) {
            tmp <- dd; dd <- d; d <- tmp; rm(tmp)
        }
        delta <- dd$x[2] - dd$x[1]
        f <- approxfun(d$x, d$y)
        y <- f(dd$x)
        y[is.na(y)] <- 0
        dd$y <- dd$y + y
        dd$y <- dd$y/(delta*sum(dd$y))
        dd$n <- dd$n + d$n
    }
    dd
}

## Add histogram from values x to a histogram h computed by a previous call to
## this function. Additionally, reports the number of values that produced an
## overflow in variable nreplace (i.e. values did were too large to be included
## in any bin of the histogram h). Make the initial call by omitting h.
##
## Resulting histogram of the partitioned dataset is a (pretty decent)
## approximation of the histogram obtained when submitting the whole dataset.
## Drawback: the domain of the histogram cannot be changed during addition of
## new values. We therefore rely on the first batch as a random sample of the
## population and estimate the overall domain.
inc.hist <- function(x, h = NULL)
{
    if( is.null(h) ) {
        ## Empirically determined overall size of the domain (110%).
        hh <- hist(x, breaks = 1/127*(0:127)*max(x)*1.1, plot = FALSE)
        hh$nreplace <- 0
    } else {
        if( is.null(h$nreplace) )
            stop("Argument h has not been created by a call to this function.")
        maxv <- h$breaks[length(h$breaks)]
        nreplace <- sum(x>maxv)
        if( nreplace>0 )
            x[x>maxv] <- maxv
        hh <- hist(x, breaks = h$breaks, plot = FALSE)
        hh$counts <- hh$counts + h$counts
        hh$density <- (hh$counts/sum(hh$counts))/diff(hh$breaks)
        hh$nreplace <- h$nreplace + nreplace
    }
    hh
}

## Add tabulated data from values x to a table t computed by a previous call to
## this function. Make the initial call by omitting t.
## This function produces identical results when used in the iterative way
## compared to submitting the whole dataset at once.
inc.table <- function(x, t = NULL)
{
    if( is.null(t) ) {
        tt <- table(x)
    } else {
        tt <- table(x)
        both <- intersect(names(t), names(tt))
        tt[both] <- tt[both] + t[both]
        tonly <- setdiff(names(t), names(tt))
        tt[tonly] <- t[tonly]
    }
    tt
}

## Convert 1D count table with only integers as counted objects to a histogram.
table2hist.int <- function(t)
{
    ## Only 1D tables have names(), for higher dimensional tables, it is set to
    ## NULL. dimnames() OTH works opposite.
    nms <- names(t)
    stopifnot("Table must be one-dimensional" = length(nms)>=1)
    suppressWarnings( bins <- as.integer(names(t)) )
    stopifnot("Table can be integer only." = all(as.character(bins)==nms))
    h <- list(breaks = c(bins, max(bins) + 1),
              counts = as.vector(t),
              mids = bins + 0.5,
              xname = "generic",
              equidist = TRUE)
    attr(h, "class") <- "histogram"
    h
}


#' find the subPedigree (smallest pedigree) including the individuals specified
#' with id and all eventually needed additionals
#'
#' @param ped pedigree `data.frame`.
#'
#' @param id `character` with the IDs of the individuals.
#'
#' @param all `logical(1)`
#'
#' @md
#'
#' @noRd
#'
#' @author Johannes Rainer
subPedigree <- function(ped, id = NULL, all = TRUE) {
    CL <- class(ped)
    ## handling for pedigree and pedigreeList classes: unfortunately these
    ## are not correct classes, otherwise I could define methods instead of
    ## a function...
    if (is(ped, "pedigree") | is(ped, "pedigreeList")) {
        ped <- ped2df(ped)
    }
    ped <- checkPedCol(ped)
    if (is.null(id))
        stop("id is missing!")
    id <- as.character(id)
    subgr <- connectedSubgraph(ped2graph(ped), nodes = id, mode = "all",
                               all.nodes = all, ifnotfound = NA)
    if (!is(subgr, "igraph")) {
        stop("No common pedigree for the submitted individuals found!")
    }
    inds <- names(V(subgr))
    ## get missing mates
    inds <- unique(c(inds, doGetMissingMate(ped, id = inds)))
    subped <- ped[inds, ]
    subped <- removeSingletons(subped)
    ## fix founders.
    subped[!(subped$father %in% subped$id), "father"] <- NA
    subped[!(subped$mother %in% subped$id), "mother"] <- NA
    ## Fix if we have individuals with a mother or a father, but not both.
    idx <- which(is.na(subped$father) != is.na(subped$mother))
    if (length(idx)) {
        idx_ped <- match(subped$id[idx], ped$id)
        id_msng <- ped$mother[idx_ped][is.na(subped$mother[idx])]
        id_msng <- c(id_msng, ped$father[idx_ped][is.na(subped$father[idx])])
        subped$father[idx] <- ped$father[idx_ped]
        subped$mother[idx] <- ped$mother[idx_ped]
        if (length(id_msng)) {
            subped <- rbind(subped, ped[match(id_msng, ped$id), ])
            subped[!(subped$father %in% subped$id), "father"] <- NA
            subped[!(subped$mother %in% subped$id), "mother"] <- NA
        }
    }
    if(CL == "pedigree" | CL == "pedigreeList")
        subped <- df2ped(subped)
    subped
}


## find the (eventually smallest) connected subgraph of all specified
## nodes.
## graph: igraph object.
## nodes: node (vertex) names.
## mode: to calculate the smallest distance.
## all.nodes: if all nodes should be present in the resulting graph.
## ifnotfound: by default, the function throws an error if no connected
##             subgraph is found, or, if all.nodes=TRUE, not all input nodes
##             are in the connected subgraph. If ifnotfound is specified, its
##             value is returned instead of an error.
connectedSubgraph <- function(graph, nodes, mode="all", all.nodes=TRUE,
                              ifnotfound){
    if(!is(graph, "igraph"))
        stop("graph has to be an igraph object!")
    mode <- match.arg(mode, c("all", "in", "out"))
    if(missing(nodes) | length(nodes) < 2)
        stop("At least two nodes have to be submitted!")
    nodes <- as.character(nodes)
    ## check if nodes are present in the graph
    nono <- !(nodes %in% names(V(graph)))
    if(any(nono)){
        missingNodes <- nodes[nono]
        if(all.nodes)
            stop("With argument all.nodes=TRUE all nodes have to be present ",
                 "in the graph! Nodes ", paste(missingNodes, collapse=", "),
                 " not present in the original graph.")
        nodes <- nodes[!nono]
        warning("Nodes ", paste(missingNodes, collapse=", "),
                " removed as they are not present in graph! ")
        if(length(nodes) < 2)
            stop("At least two nodes have to be present in the graph!")
    }
    ## OK, now we can start...
    ## 1) iterate over all nodes and calculate the shortest path between all.
    ## 2) get all nodes along the shortest path and add to the needed nodes.
    needNodes <- character()
    ## two versions: if mode="all" the directionality does not matter, thus we
    ## can run a faster version. for the other modes really calculate distances
    ## between all (both directions).
    if(mode == "all"){
        for(i in 1:(length(nodes)-1)){
            suppressWarnings(
                paths <- shortest_paths(graph, from=nodes[i],
                                        to=nodes[(i+1):length(nodes)],
                                        mode=mode)
            )
            needNodes <- unique(c(needNodes,
                                  unlist(lapply(paths$vpath, names))))
        }
    }else{
        for(i in 1:length(nodes)){
            suppressWarnings(
                paths <- shortest_paths(graph, from=nodes[i],
                                        to=nodes[-i],
                                        mode=mode)
            )
            needNodes <- unique(c(needNodes,
                                  unlist(lapply(paths$vpath, names))))
        }
    }
    if(length(needNodes)==0){
        if(missing(ifnotfound)){
            stop("Could not find any connected subgraph!")
        }
        warning("Could not find any connected subgraph!")
        return(ifnotfound)
    }
    ## check the subgraph.
    if(!all(nodes %in% needNodes)){
        if(all.nodes){
            if(missing(ifnotfound)){
                stop("Could not find any connected subgraph!")
            }
            warning("Could not find any connected subgraph!")
            return(ifnotfound)
        }else{
            warning("Nodes ", paste(nodes[!(nodes %in% needNodes)]),
                    " not present in the returned graph!")
        }
    }
    ## finally build the graph...
    ## subgraph is a little tedious as it requires submitting of vertex INDICES!
    induced_subgraph(graph, vids=needNodes)
}

## search for a common ancestor couple in a pedigree.
## ped: data.frame or pedigree or pedigreeList.
doGetCommonAncestor <- function(ped, id, method="min.dist"){
    method <- match.arg(method, c("min.dist", "smallest.mean.dist"))
    if(missing(id))
        stop("id has to be specified")
    if(length(id) <= 1)
        stop("The minimum number of ids has to be 2.")
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    ped <- checkPedCol(ped)
    ## check if id are in ped:
    id <- as.character(id)
    if(!all(id %in% rownames(ped))){
        stop("all of the sumitted ids should be present in the pedigree!")
    }
    ## check if the ids are in the same family
    if(any(colnames(ped)=="family")){
        if(length(unique(ped[id, "family"])) > 1){
            warning("The provided ids are from different families; still",
                    " trying to find a common ancestor but I'll probably fail.")
        }else{
            ## subset the pedigree to this family...
            ped <- ped[ped$family == ped[id, "family"][1], , drop=FALSE]
        }
    }
    ## OK; done with input parameter checking.
    ## transform the pedigree into a graph
    igr <- ped2graph(ped)
    ## "in": go from children -> parents
    ## LLLL TODO can do faster!!!
    din <- distances(igr, mode="in")
    ## use the ROWS in that.
    subD <- din[id, , drop=FALSE]
    subD <- subD[, !apply(subD, MARGIN=2, function(x){
        return(all(infOrZero(x)))
    }) , drop=FALSE]
    ## check if we've got one common ancestor:
    if(all(is.infinite(colSums(subD)))){
        warning("Did not find any common ancestor!")
        return(NA)
    }
    ## remains to find the "one and only" (well, usually we expect a couple).
    subD <- subD[, !is.infinite(colSums(subD)), drop=FALSE]
    ## now get the ones that are closest:
    if(method == "min.dist"){
        dists <- apply(subD, MARGIN=2, min)
        ancs <- names(dists)[dists==min(dists)]
    }
    if(method == "smallest.mean.dist"){
        dists <- colMeans(subD)
        ancs <- names(dists)[dists==min(dists)]
    }
    ## at last trying to get the mate... if any; usually, ancs should
    ## already correspond to the ancestor couple.
    ancs <- c(ancs, doGetMissingMate(ped, ancs))
    unique(ancs)
}

## returns true if a value is 0 or infinite
infOrZero <- function(x){
    (is.infinite(x) | x == 0)
}

## transform a pedigree into a (directed) igraph
## direction is always parent->child.
## this time each individual is treated as a single node!
ped2graph <- function(ped){
    if(is(ped, "pedigree") | is(ped, "pedigreeList"))
        ped <- ped2df(ped)
    ped <- checkPedCol(ped)
    ## define end/starting points
    ped[!ped[, "father"] %in% ped$id, "father"] <- NA
    ped[!ped[, "mother"] %in% ped$id, "mother"] <- NA
    eds <- apply(ped[, c("id", "father", "mother")], MARGIN=1, function(x){
        retval <- c()
        if(!is.na(x["father"]))
            retval <- x[c("father", "id")]
        if(!is.na(x["mother"]))
            retval <- c(retval, x[c("mother", "id")])
        return(retval)
    })
    ## what the heck... I need character vertex names!
    Edges <- unlist(eds, use.names=FALSE)
    igr <- graph(edges=as.character(Edges))
    igr
}


## takes a pedigree as input and tries to identify the founders
## in case there are more than two founder pairs (i.e. founders with the same,
## largest number of offspring generations), it returns only one of them.
doFindFounders <- function(ped, family, id){
    if(!any(colnames(ped) == "family"))
        ped <- data.frame(ped, family=rep(1, nrow(ped)), stringsAsFactors=FALSE)
    families <- unique(as.character(ped$family))
    if(missing(family))
        family <- NULL
    if (missing(id))
        id <- NULL
    if(is.null(family) & is.null(id)){
        family <- families[1]
        warning("No family specified; using the first one in the pedigree: ",
                family, ".")
    }
    if (!is.null(family))
        ped <- ped[ped$family == family, , drop=FALSE]
    if (!is.null(id)) {
        return(doFindFoundersForId(ped = ped, id = id))
    } else {
        founders <- ped$id[which(is.na(ped$father) & is.na(ped$mother))]
        ## determine for each founder the number of generations of its children,
        ## grandchildren etc.
        founderGen <- doCountGenerations(ped, id=founders, direction="down")
        founders <- names(founderGen)[founderGen == max(founderGen)][1]
        founders <- c(founders, doGetMissingMate(ped, founders))
        return(unique(founders))
    }
}

##' Find founders for the pedigree of a given individual. Have to first build
##' the largest pedigree for the given individual and identify the founders in
##' that pedigree.
##'
##' @note The function returns only a single founder pair, i.e. the one with the
##' largest number of offspring generations.
##' @noRd
doFindFoundersForId <- function(ped, id) {
    if (length(id) > 1) {
        id <- id[1]
        warning("length of 'id' was > 1, but only the first id was selected!")
    }
    ## Check if we've got this id in the pedigree
    if (!any(ped[, "id"] == id))
        stop("Individual with id '", id, "' not found in the pedigree!")
    ## Building the pedigree for that individual.
    ids <- unique(c(doGetAncestors(ped = ped, id = id, maxlevel = 200), id))
    ids <- unique(c(ids, doGetChildren(ped = ped, id = ids, maxlevel = 200)))
    ids <- unique(c(ids, doGetMissingMate(ped, id = ids)))
    ped <- ped[ped[, "id"] %in% ids, , drop = FALSE]
    ## OK, now we have the pedigree for the individual, now find founders.
    founders <- ped[which(is.na(ped[, "father"]) &
                          is.na(ped[, "mother"])), "id"]
    ## determine for each founder the number of generations of its children,
    ## grandchildren etc.
    founderGen <- doCountGenerations(ped, id = founders, direction = "down")
    founders <- names(founderGen)[founderGen == max(founderGen)][1]
    founders <- c(founders, doGetMissingMate(ped, founders))
    unique(founders)
}

## calculate for each id the number of generations up- or down.
doCountGenerations <- function(ped, id=NULL, direction="down"){
    if(is.null(id))
        stop("At least one id has to be specified!")
    direction <- match.arg(direction, c("up", "down"))
    if(direction == "up")
        walkFun <- doGetAncestors
    if(direction == "down")
        walkFun <- doGetChildren
    res <- sapply(id, function(x){
        generation <- 0
        ids <- x
        repeat{
            ids <- walkFun(ped, id=ids, maxlevel=1)
            if(length(ids) == 0){
                return(generation)
            }else{
                generation <- generation + 1
            }
        }
    })
    names(res) <- id
    res
}

## returns a list of generations for each family in the pedigree.
## generations are relative to the founders.
doEstimateGenerationsFor2 <- function(ped, family=NULL){
    if(!is.null(family)){
        ped <- ped[ped$family %in% family, ]
        if(nrow(ped) == 0)
            stop("The submitted family id is not present in the pedigree!")
    }
    ## OK, now loop through the families...
    pedList <- split(ped, ped$family)
    Gens <- lapply(pedList, function(currentPed){
        founders <- doFindFounders(currentPed, family=currentPed[1, "family"])
        generation <- doGetGenerationFrom2(currentPed, founders[1])
        return(generation)
    })
    Gens
}

getAllSibsNMates <- function(ped, id){
    ## recursively fetch siblings and mates
    sibsNmates <- id
    repeat{
        sibs <- unique(c(id, doGetSiblings(ped, id=id)))
        sibsNmates <- unique(c(doGetMissingMate(ped, id=sibs), sibs))
        if(length(sibsNmates) != length(id)){
            id <- sibsNmates
        }else{
            break
        }
    }
    sibsNmates
}

doGetGenerationFrom2 <- function(ped, id, generation=0){
    if(missing(id))
        stop("id has to be specified!")
    if(missing(ped))
        stop("ped has to be specified!")
    id <- as.character(id)
    if(length(id) > 1){
        id <- id[1]
        warning("Length of 'id' is larger 1. Using only the first id ",
                "in 'id': ", id)
    }
    if(!any(ped$id %in% id))
        stop("Can not find id in the pedigree!")
    famid <- ped[ped$id == id, "family"]
    ped <- ped[ped$family == famid, , drop=FALSE]
    ## first define the connected!!! pedigree.
    pedg <- ped2graph(ped)
    if(clusters(pedg)$no > 1){
        Cl <- clusters(pedg)
        warning("Found ", Cl$no, " connected sub-graphs in the pedigree ",
                "for family ", famid, "! Will restrict the analysis to the ",
                "subgraph containing individual ", id, ".")
        vertices <- names(Cl$membership)[Cl$membership == Cl$membership[id]]
        pedg <- induced_subgraph(pedg, vids=vertices)
    }
    individuals <- names(V(pedg))
    gens4individuals <- rep(NA, length(individuals))
    names(gens4individuals) <- individuals
    if(!any(names(gens4individuals) == id))
        stop("Individual ", id, " is not part of the connected pedigree!")
    gens4individuals[id] <- 0
    fromId <- id
    Tested <- rep(FALSE, length(gens4individuals))
    repeat{
        ## get the generation of the individuals from which we start
        fromIdBool <- individuals %in% fromId
        Tested[fromIdBool] <- TRUE
        currentGens <- gens4individuals[fromIdBool]
        currentGens <- split(currentGens, currentGens)
        Res <- lapply(currentGens, allids=individuals, FUN=function(z, allids){
            ## get the ancestors and set the generation for them
            ancs <- getAncestors(ped, id=names(z), max.generations=1)
            if(length(ancs) > 0){
                ancret <- cbind(which(allids %in% ancs), (z[1] - 1))
            }else{
                ancret <- NULL
            }
            ## get the children
            kids <- getChildren(ped, id=names(z), max.generation=1)
            if(length(kids) > 0){
                kidret <- cbind(which(allids %in% kids), (z[1] + 1))
            }else{
                kidret <- NULL
            }
            return(rbind(ancret, kidret))
        })
        Res <- do.call(rbind, Res)
        if(nrow(Res) == 0){
            warning("What the heck... stopped before we got all?")
            break
        }
        ## don't want to search the same id several times... if Tested is
        ## TRUE, don't use them again.
        toTestIdx <- Res[, 1][!Tested[Res[, 1]]]
        fromId <- individuals[toTestIdx]
        gens4individuals[Res[, 1]] <- Res[, 2]
        if(sum(is.na(gens4individuals)) == 0)
            break
    }
    ## make a larger generation vector.
    allgens <- rep(NA, nrow(ped))
    names(allgens) <- ped$id
    allgens[names(gens4individuals)] <- gens4individuals
    allgens
}

##********************************************************************
##
##   Utility functions
##
##********************************************************************
sanitizeSex <- function(x){
    if(is.numeric(x)){
        ## require values 1, 2, NA
        numrange <- unique(x)
        if(!all(c(1, 2) %in% numrange))
            stop("If column sex is numeric it has to contain 1 (=male) ",
                 "and 2 (=female) values! Unknown or missing can be encoded",
                 " by NA or any other number than 1 or 2.")
        news <- rep(NA, length(x))
        news[which(x == 1)] <- "M"
        news[which(x == 2)] <- "F"
        return(factor(news, levels=c("M", "F")))
    }
    if(is.factor(x))
        x <- as.character(x)
    if(is.character(x)){
        ## remove all white spaces:
        x <- gsub(x, pattern="\\s", replacement="")
        ## that's tricky...kind of...
        ## just get the first character; want to have M and F...
        x <- substr(toupper(x), 1, 1)
        chars <- unique(x)
        if(length(chars) == 2){
            if(!all(c("M", "F") %in% chars))
                stop("Male and female individuals have to be represented",
                     " by 'M' and 'F', respectively!")
        }
        if(length(chars) == 1){
            if(!any(c("M", "F") %in% chars))
                stop("Male and female individuals have to be represented",
                     " by 'M' and 'F', respectively!")
        }
        if(length(chars[!is.na(chars)]) > 2)
            warning("All characters in column sex other than M and F",
                    " (or male, female etc) are set to NA!")
        news <- rep(NA, length(x))
        news[which(x == "M")] <- "M"
        news[which(x == "F")] <- "F"
        return(factor(news, levels=c("M", "F")))
    }
}


#####
#### NOTE:
##   isn't that the same as removeSingletons???
##
####
## remove non-connected individuals. We're assuming that ped is the pedigree
## of a SINGLE family, as we expect to get ONE graph. in case we've got 2, we're
## using the largest connected one.
## solveMultiGraph: what should we do if we get multiple graphs?
##    + use.all: just use all, i.e. return vertix names of all.
##    + use.largest: just return vertices of the largest connected sub-graph.
doPrunePed <- function(ped, addMissingMates=FALSE, solveMultiGraph="use.all"){
    .Deprecated("removeSingletons")
    solveMultiGraph <- match.arg(solveMultiGraph, c("use.all", "use.largest"))
    pedg <- ped2graph(ped)
    if(solveMultiGraph == "use.largest"){
        if(clusters(pedg)$no > 1){
            Cl <- clusters(pedg)
            warning("Found ", Cl$no,
                    " connected sub-graphs in the pedigree! Will restrict ",
                    "the analysis to the largest connected pedigree",
                    " in that family.")
            vertices <-
                names(Cl$membership)[Cl$membership ==
                                     as.numeric(sort(table(Cl$membership),
                                                     decreasing=TRUE))[1]]
            pedg <- induced_subgraph(pedg, vids=vertices)
        }
    }
    keepIds <- names(V(pedg))
    if(addMissingMates){
        missMates <- doGetMissingMate(ped, id=keepIds)
        keepIds <- unique(c(missMates, keepIds))
    }
    notThere <- !(as.character(ped$id) %in% keepIds)
    ped <- ped[!notThere, , drop=FALSE]
    if(any(notThere))
        warning("Had to remove ", sum(notThere),
                " un-connected individuals from the pedigree.")
    ped
}


## a singleton a.k.a childless founder is an individual which id is not
## present in the father or mother column of the pedigree and which has 0
## in mother and father
.removeSingletons <- function(ped){
    message("Removing singletons...", appendLF=FALSE)
    ped <- checkPedCol(ped)
    ped <- sanitizePed(ped)
    childlF <- doGetSingletons(ped)
    if(length(childlF) > 0){
        ped <- ped[!(ped$id %in% childlF), ]
        message(" ", length(childlF), " removed.")
    }else{
        message(" none present.")
    }
    ped
}

#' @title Extract pairs of individuals matching certain kinship criteria
#'
#' @name kinshipPairs
#'
#' @description
#'
#' The `kinshipPairs` function allows to extract pairs of individuals matching
#' a user-defined kinship *condition* (e.g. individuals with a kinship larger
#' than 0.0625). Such sets of paired individuals (along with paired unrelated
#' values) would enable a *familial resemblance* analysis on quantitative
#' traits (Ziegler 2010) (see examples below for details).
#'
#' By default, `kinshipPairs` returns all pairs of individuals for which the
#' `condition` on the kinship matrix matches (e.g. all pairs of individuals with
#' a kinship coefficient larger than or equal to 0.25). Individuals can thus
#' be reported multiple times (see examples below). Parameter `duplicates` can
#' be used to define a strategy to avoid such duplicated IDs. Supported are:
#'
#' - `duplicates = "keep"`: the default, return all values.
#' - `duplicates = "first"`: report only the first pair of individuals for each
#'   individual ID.
#' - `duplicates = "last"`: report only the last pair of individuals for each
#'   individual ID.
#' - `duplicates = "random"`: randomly select one pair of individuals for
#'   each individual ID.
#'
#' For any setting different than `duplicates = "keep"` each individual will
#' only be listed **once** in the resulting matrix.
#'
#' @param x A `FAData` object (or object inheriting from that).
#'
#' @param condition A `function` defining how individuals should be selected
#'     based on the object's kinship matrix. The default is to select all
#'     individuals with a kinship `>= 0.25`. Note that the diagonal of the
#'     kinship matrix (i.e. the kinship of individuals with itself) is always
#'     skipped, so no additional criteria is needed to avoid self-pairs.
#'
#' @param duplicates `character(1)` defining how to deal with duplicated IDs
#'     in the result returned by the function.
#'     See function description and examples below for more details. Defaults
#'     to `duplicates = "keep"` returning all pairs of IDs matching `condition`.
#'
#' @param family optional family identifiers if pairs should only defined for
#'     selected families. Defaults to `family = NULL` hence the full data set is
#'     considered.
#'
#' @param id optional identifiers of subsets of individuals on which the pairs
#'     should be defined. Defaults to `id = NULL` hence the full data set is
#'     considered.
#'
#' @return
#'
#' A two column `matrix` with the IDs (colnames/rownames of the kinship matrix
#' or as defined in `x$id`) of the pairs. If `duplicates` is either `"first"`,
#' `"last"` or `"random"` each ID is only returned once (i.e. no ID is reported
#' more than one time).
#'
#' @author Johannes Rainer
#'
#' @references
#'
#' Ziegler A., Koenig I. R. (2010). Familiality, Heristability, and Segregation
#' Analysis. In A Statistical Approach to Genetic Epidemiology: With Access to
#' E-Learning Platform by Friedrich Pahlke, Second Edition.
#' \doi{10.1002/9783527633654.ch6}.
#'
#' @seealso [PedigreeUtils] for other pedigree utility functions.
#'
#' @md
#'
#' @examples
#'
#' ##########################
#' ##
#' ##  Create a new FAData object
#' ##
#' ## Load the Minnesota Breast Cancer record and subset to the
#' ## first families.
#' data(minnbreast)
#' mbsub <- minnbreast[minnbreast$famid %in% 1:20, ]
#' mbped <- mbsub[, c("famid", "id", "fatherid", "motherid", "sex")]
#' ## Renaming column names
#' colnames(mbped) <- c("family", "id", "father", "mother", "sex")
#' ## Defining the optional argument age.
#' Age <- mbsub$endage
#' names(Age) <- mbsub$id
#' ## Create the object
#' fad <- FAData(pedigree=mbped, age=Age)
#'
#' ## Getting all pairs of individuals with a kinship coefficient >= 0.25
#' ## keeping all duplicates
#' rel_pairs <- kinshipPairs(fad)
#' head(rel_pairs)
#' ## As we see, we have multiple times the individual 1 etc.
#'
#' ## For an actual correlation analysis it would be better to drop duplicates.
#' ## Below we randomly select individual pairs if they occurr multiple times
#' rel_pairs <- kinshipPairs(fad, duplicates = "random")
#' head(rel_pairs)
#'
#' ## In addition we extract pairs of individuals that are much less related.
#' ## For this examples we consider all individuals with a kinship
#' ## coefficient < 0.03125 (second cousin) to be *unrelated*.
#' unrel_pairs <- kinshipPairs(fad, duplicates = "random",
#'     condition = function(z) z < 0.03125)
#' head(unrel_pairs)
#'
#' ## For a familial resemblance analysis we can now calculate the correlation
#' ## coefficient of a quantitative trait between pairs of related individuals
#' ## and compare that with the correlation coefficient calculated on unrelated
#' ## individuals. For our toy example we use the participant's age, since we
#' ## don't have any other quantitative values available.
#' cor_rel <- cor(age(fad)[rel_pairs[, 1]], age(fad)[rel_pairs[, 2]],
#'     use = "pairwise.complete.obs")
#'
#' cor_unrel <- cor(age(fad)[unrel_pairs[, 1]], age(fad)[unrel_pairs[, 2]],
#'     use = "pairwise.complete.obs")
#' cor_rel
#' cor_unrel
#'
#' ## We don't see a clear difference in the correlation, thus, the age (as
#' ## expected) has no familial component.
kinshipPairs <- function(x, condition = function(x) x >= 0.25,
                         duplicates = c("keep", "first", "last", "random"),
                         id = NULL, family = NULL) {
    duplicates <- match.arg(duplicates)
    if (!inherits(x, "FAData"))
        stop("'kinshipPairs' is currently only implemented for 'FAData'")
    if (length(family)) {
        id <- x$id[x$family %in% family]
        if (!length(id))
            stop("No individuals for the provided families found")
    }
    ks <- kinship(x)
    if (length(id)) {
        is_id <- rownames(ks) %in% id
        if (!any(is_id))
            stop("None of the specified individuals found")
        ks <- ks[is_id, is_id, drop = FALSE]
    }
    res <- condition(ks)
    if (!length(dim(res)) || ncol(ks) != ncol(res) ||
        nrow(ks) != nrow(res) || !is.logical(res[1, 1]))
        stop("'condition' function did not return a 'logical' 'matrix' with ",
             "dimensions matching the kinship matrix in 'x'")
    res[lower.tri(res, diag = TRUE)] <- NA
    keep <- which(res, arr.ind = TRUE, useNames = FALSE)

    if (nrow(keep)) {
        if (duplicates != "keep") {
            if (duplicates == "first")
                dupl_fun <- function(z) z[1, , drop = FALSE]
            if (duplicates == "last")
                dupl_fun <- function(z) z[nrow(z), , drop = FALSE]
            if (duplicates == "random")
                dupl_fun <- function(z)
                    z[sample(seq_len(nrow(z)), 1), , drop = FALSE]
            uids <- unique(as.integer(keep))
            to_test <- rep(TRUE, nrow(keep))
            res <- matrix(ncol = 2, nrow = length(uids))
            for (i in seq_along(uids)) {
                id <- uids[i]
                idx <- which((keep[, 1] == id | keep[, 2] == id) & to_test)
                if (length(idx)) {
                    pair <- dupl_fun(keep[idx, , drop = FALSE])
                    ## Ensure we're not reporting these individuals again.
                    to_test[keep[, 1] %in% pair | keep[, 2] %in% pair] <- FALSE
                    res[i, ] <- pair
                }
            }
            res <- res[!is.na(res[, 1]), , drop = FALSE]
            unname(cbind(rownames(ks)[res[, 1]], rownames(ks)[res[, 2]]))
        } else unname(cbind(rownames(ks)[keep[, 1]], rownames(ks)[keep[, 2]]))
    } else matrix(nrow = 0, ncol = 2)
}
jotsetung/FamAgg documentation built on March 15, 2023, 3:48 a.m.