kinship2: Pedigree Functions

Documented in ibdMatrix

# Repack IBD data as a Matrix object

#' Create an IBD matrix
#'
#' @description
#' Transform identity by descent (IBD) matrix data from the form produced by
#' external programs such as SOLAR into the compact form used by the coxme and
#' lmekin routines.
#'
#' @details
#' The IBD matrix for a set of n subjects will be an n by n symmetric matrix
#' whose i,j element is the contains, for some given genetic location, a 0/1
#' indicator of whether 0, 1/2 or 2/2 of the alleles for i and j are identical
#' by descent.  Fractional values occur if the IBD fraction must be imputed.
#' The diagonal will be 1.  Since a large fraction of the values will be zero,
#' programs such as Solar return a data set containing only the non-zero
#' elements.  As well, Solar will have renumbered the subjects as 1:n in such a
#' way that families are grouped together in the matrix; a separate index file
#' contains the mapping between this new id and the original one.  The final
#' matrix should be labeled with the original identifiers.
#'
#' @param id1,id2 pairs of subject identifiers
#' @param x the IBD value for that pair
#' @param idmap an optional 2 column matrix or data frame whose first element
#' is the internal value (as found in \code{id1} and \code{id2}), and whose
#' second element will be used for the dimnames of the result
#' @param diagonal optional value for the diagonal element. If present, any
#' missing diagonal elements in the input data will be set to this value.
#'
#' @return a sparse matrix of class \code{dsCMatrix}.  This is the same form
#' used for kinship matrices.
#'
#' @seealso \code{\link{kinship}}, \code{\link{Matrix}}
#' @keywords genetics
#' @export ibdMatrix
ibdMatrix <- function(id1, id2, x, idmap, diagonal) {
  if (!is.null(ncol(id1)) && ncol(id1) == 1) id1 <- id1[, 1]
  if (!is.null(ncol(id1))) {
    # can be a matrix or a data frame
    if (ncol(id1) != 3) {
      stop("Argument id1 is a matrix or dataframe  but does not have 3 columns")
    }
    if (!missing(id2)) {
      stop("First argument is a matrix or dataframe, but id2 argument is present")
    }
    if (!missing(x)) {
      stop("First argument is a matrix or dataframe, but x argument is present")
    }
    id2 <- id1[, 2]
    x <- id1[, 3]
    id1 <- id1[, 1]
  }

  # Reset each pair such that id1 <= id2
  temp <- pmin(id1, id2)
  id2 <- pmax(id1, id2)
  id1 <- temp

  # Add diagonal elements
  if (!missing(diagonal) && diagonal != 0) {
    idlist <- unique(c(id1, id2))
    id1 <- c(id1, idlist)
    id2 <- c(id2, idlist)
    x <- c(x, rep(diagonal, length(idlist)))
  }

  # Toss away any zeros and duplicates
  keep <- (x != 0 & !duplicated(cbind(id1, id2)))
  if (!all(keep)) {
    id1 <- id1[keep]
    id2 <- id2[keep]
    x <- x[keep]
  }


  # If the set of ids is a list of integers 1-n for some n, we'll assume
  #  the the data is already in the optimal order.  Otherwise figure
  #  out families.  I expect the latter to happen rarely to never.
  temp <- sort(unique(id1, id2))
  maxid <- max(id1, id2)
  if (maxid != as.integer(maxid) || length(temp) != maxid ||
    any(is.na(match(temp, 1:maxid)))) {
    # drat, need to figure out family blocks
    idlist <- sort(unique(c(id1, id2)))
    nid <- length(idlist)
    id1 <- match(id1, idlist)
    id2 <- match(id2, idlist)
    famid <- 1:nid # everyone a singleton
    indx1 <- sort(unique(id2)) # the result vector for remap1 below
    indx2 <- sort(unique(id1))
    while (1) {
      remap1 <- tapply(famid[id1], id2, min) # map each id2 to min id1
      if (all(famid[indx1] == remap1)) break
      famid[indx1] <- remap1
      remap2 <- tapply(famid[id2], id1, min)
      famid[indx2] <- remap2
    }
    remap <- (1:nid)[order(famid)] # reordering of subjects
    id1 <- match(id1, remap)
    id2 <- match(id2, remap)
    idlist <- idlist[remap]
  } else {
    idlist <- 1:maxid
  }

  # dimid will be the dimnames
  if (missing(idmap)) {
    dimid <- idlist
  } else {
    if (!is.null(dim(idmap)) || ncol(idmap) != 2) {
      stop("idmap must have 2 columns")
    }
    temp <- match(idlist, idmap[, 1])
    if (any(is.na(temp))) {
      stop("Values appear in id1 or id2 that are not in idmap")
    }
    dimid <- idmap[temp, 2]
  }

  sparseMatrix(
    i = id1, j = id2, x = x, symmetric = TRUE,
    dimnames = list(dimid, dimid)
  )
}