R/segment_utils.R
In ibdsim2: Simulation of Chromosomal Regions Shared by Family Members

Documented in segmentStats

alleleFlow = function(x, ids, addState = TRUE) {
  if(!inherits(x, "genomeSim"))
    stop2("Argument `x` must be a `genomeSim` object. Received: ", class(x))
  
  xids = extractIds(x)
  if(!all(ids %in% xids))
    stop2("Unknown ID label: ", setdiff(ids, xids))
  
  n = length(ids)
  clnms = colnames(x)
  
  # If nothing to do, return early
  if(setequal(ids, xids)) {
    if(n > 2 || !addState) 
      return(x)
    if(n == 1 && "Aut" %in% clnms)
      return(x)
    if(n == 2 && "Sigma" %in% clnms)
      return(x)
  }
  
  # Allele columns of selected ids
  cols = paste(rep(ids, each = 2), c("p", "m"), sep = ":")
  y = mergeSegments(x, by = cols)
  
  # Keep only allele columns of selected ids
  y = y[, c(clnms[1:5], cols), drop = FALSE]
  
  if(addState)
    y = addStates(y, cols)
  
  y
}


stackIntervals = function(start, end, chrom = NULL, sort = TRUE) {
  
  n = length(start)
  if(n < 2)
    return(rep_len(1, n))
  
  lay = integer(n)
  
  if(!is.null(chrom)) {
    for(chr in unique.default(chrom)) {
      idx = chrom == chr
      lay[idx] = stackIntervals(start = start[idx], end = end[idx], sort = sort)
    }
    return(lay)
  }
    
  if(any(start > end))
    stop2("Inverted segment")
  
  if(sort) {
    ord = order(start, -end, method = "shell")
    start = start[ord]
    end = end[ord]
  }
  
  # Current end of each layer
  layerEnds = numeric(n) # theoretical max
  layerEnds[1] = end[1]
  
  lay[1] = 1
  for (k in seq.int(2, n)) {
    for(i in 1:n) {
      if(layerEnds[i] < start[k]) {
        lay[k] = i
        layerEnds[i] = end[k]
        break
      }
    }
  }
  
  if(sort)
    lay = lay[order(ord, method = "shell")]
  lay
}

groupOverlaps = function(x) { # assumes sorted on chrom, start, end
  
  # Storage vector for group number and group size
  g = n = gidx = numeric(nrow(x))
  chrom = x[, 1]
  start = x[, 2]
  end = x[, 3]
  
  for(chr in unique.default(chrom)) {
    idx = chrom == chr
    sta = start[idx]
    sto = end[idx]
    gchr = cumsum(cummax(c(0, sto[-length(sto)])) <= sta)
    tb = tabulate(gchr)
    g[idx] = gchr
    n[idx] = tb[gchr]
    gidx[idx] = unlist(lapply(tb, seq_len), use.names = FALSE)
  }
  
  cbind(x, group = paste(chrom, g, sep = "-"), gsize = n, gidx = gidx)
}

# Merge segments (on the same chrom) with equal entries in `by` (single vector or column names)
mergeSegments = function(x, by = NULL, checkAdjacency = FALSE) {
  k = nrow(x)
  if(k < 2)
    return(x)
  
  # Rows where chromosome is unchanged
  chrEq = x[-1, 'chrom'] == x[-k, 'chrom']
  mergeRow = chrEq
  
  # Rows where `by` elements don't change
  if(!is.null(by)) {
    byLen = length(by)
    
    if(is.character(by) && all(by %in% colnames(x))) { # Interpret as column names
      if(byLen == 1)
        byEq = x[-1, by] == x[-k, by]
      else {
        chck = x[-1, by, drop = FALSE] == x[-k, by, drop = FALSE]
        byEq = .rowSums(chck, m = k - 1, n = byLen) == byLen
      }
    }
    else if(byLen == k && (is.numeric(by) || is.character(by))) {
      byEq = by[-1] == by[-k]
    }
    else
      stop2("Wrong format or length of argument `by`")
    
    mergeRow = chrEq & !is.na(byEq) & byEq
  }
 
  # Segment adjacency
  if(checkAdjacency) {
    adjac = x[-1, 'startMB'] == x[-k, 'endMB']
    mergeRow = mergeRow & adjac
  }
 
  if(!any(mergeRow))
    return(x)
  
  fromRow = which(c(TRUE, !mergeRow))
  toRow = c(fromRow[-1] - 1, k)
  
  y = x[fromRow, , drop = FALSE]
  y[, 'endMB'] = x[toRow, 'endMB']
  y[, 'endCM'] = x[toRow, 'endCM']
  
  rownames(y) = NULL
  y
}


#' Summary statistics for identified segments
#'
#' Compute summary statistics for segments identified by [findPattern()].
#'
#' @param x A list of matrices produced with [findPattern()].
#' @param quantiles A vector of quantiles to include in the summary.
#' @param returnAll A logical, by default FALSE. If TRUE, the output includes a
#'   vector `allSegs` containing the lengths of all segments in all simulations.
#' @param unit Either "mb" (megabases) or "cm" (centiMorgan); the length unit
#'   for genomic segments.
#'   
#' @return A list containing a data frame `perSim`, a matrix `summary` and (if
#'   `returnAll` is TRUE) a vector `allSegs`.
#'
#'   Variables used in the output:
#'
#'   * `Count`: The total number of segments in a simulation
#'
#'   * `Total`: The total sum of the segment lengths in a simulation
#'
#'   * `Average`: The average segment lengths in a simulation
#'
#'   * `Shortest`: The length of the shortest segment in a simulation
#'
#'   * `Longest`: The length of the longest segment in a simulation
#'
#'   * `Overall` (only in `summary`): A summary of all segments from all
#'   simulations
#'
#' @seealso [findPattern()]
#'
#' @examples
#' x = nuclearPed(3)
#' sims = ibdsim(x, N = 2, map = uniformMap(M = 2), model = "haldane", seed = 1729)
#'
#' # Segments where all siblings carry the same allele
#' segs = findPattern(sims, pattern = list(carriers = 3:5))
#'
#' # Summarise
#' segmentStats(segs, unit = "mb")
#' 
#' # The unit does not matter in this case (since the map is trivial)
#' segmentStats(segs, unit = "cm")
#'
#' @importFrom stats quantile
#' @export
segmentStats = function(x, quantiles = c(0.025, 0.5, 0.975), returnAll = FALSE, unit = "mb") {
  
  if(is.matrix(x))
    x = list(x)
  
  # Names of start/end columns
  startCol = switch(unit, mb = "startMB", cm = "startCM")
  endCol = switch(unit, mb = "endMB", cm = "endCM")
  
  # List of lengths
  lenDat = lapply(x, function(s) s[, endCol] - s[, startCol])
  
  # Collect data for each sim
  perSim = data.frame(
    `Count`    = lengths(lenDat),
    `Total`    = vapply(lenDat, sum, FUN.VALUE = numeric(1)), 
    `Average`  = vapply(lenDat, safeMean, FUN.VALUE = numeric(1)), 
    `Shortest` = vapply(lenDat, safeMin, FUN.VALUE = numeric(1)),
    `Longest`  = vapply(lenDat, safeMax, FUN.VALUE = numeric(1)))
  
  # Summarising function
  sumfun = function(v) c(mean = safeMean(v), sd = sd(v), min = safeMin(v), 
                         quantile(v, quantiles), max = safeMax(v))
  # Summary
  sumList = lapply(perSim, sumfun)
  
  # Add stats of overall lengths
  allSegs = unlist(lenDat, use.names = FALSE)
  sumList$Overall = sumfun(allSegs)
  
  # Return as data frames
  res = list(perSim = perSim, summary = do.call(rbind, sumList))
  if(returnAll)
    res$allSegs = allSegs
  
  res
}


# Add columns with IBD states (if 1 or 2 ids)
addStates = function(x, acols) {
  
  nc = length(acols)
  Xchrom = any(x[, "chrom"] == 23)
  
  if(nc == 2) {
    a1 = x[, acols[1]]
    a2 = x[, acols[2]]
    aut = a1 == a2 
    if(Xchrom) # convention: hemizygous = autozygous
      aut = aut | a1 == 0 
    aut = as.numeric(aut)  # x is a numeric (not integer) matrix
    x = cbind(x, Aut = aut)
  }
  else if(nc == 4) {
    a1 = x[, acols[1]]
    a2 = x[, acols[2]]
    b1 = x[, acols[3]]
    b2 = x[, acols[4]]
    
    IBD = ibdState(a1, a2, b1, b2, Xchrom = Xchrom)
    Sigma = jacquardState(a1, a2, b1, b2, Xchrom = Xchrom)
    
    x = cbind(x, IBD = IBD, Sigma = Sigma)
  }
  else
    stop2("addStates requires 2 or 4 columns, not: ", nc)
  
  x
}

# Return the condensed identity ("jacquard") state given 4 alleles:
# a1 and a2 from individual 1; b1 and b2 from individual 2
jacquardState = function(a1, a2, b1, b2, Xchrom = FALSE) { 
  
  # Identity states on X follow the autosomal ordering,
  # after replacing hemizygous alleles with autozygous ones. 
  # See https://github.com/magnusdv/ribd
  if(Xchrom) {
    azero = a1 == 0
    bzero = b1 == 0
    a1[azero] = a2[azero]
    b1[bzero] = b2[bzero]
  }
  
  inb1 = a1 == a2
  inb2 = b1 == b2
  pa = a1 == b1
  ma = a2 == b2
  d1 = a1 == b2
  d2 = a2 == b1
  
  horiz = inb1 + inb2
  verts = pa + ma + d1 + d2
  
  res = integer(length(a1))
  
  res[horiz == 2 & pa] = 1
  res[horiz == 2 & !pa] = 2
  
  res[inb1 & !inb2 & (pa | ma)] = 3
  res[inb1 & !inb2 & verts == 0] = 4
  
  res[!inb1 & inb2 & (pa | ma)] = 5
  res[!inb1 & inb2 & verts == 0] = 6
  
  res[horiz == 0 & verts == 2] = 7
  res[verts == 1] = 8
  res[horiz + verts == 0] = 9
  res
}

ibdState = function(a1, a2, b1, b2, Xchrom = FALSE) { 
  
  inb1 = a1 == a2
  inb2 = b1 == b2
  pa = a1 == b1
  ma = a2 == b2
  d1 = a1 == b2
  d2 = a2 == b1
  
  if(Xchrom) {
    hem1 = a1 == 0
    hem2 = b1 == 0
  
    pa = pa & !hem1 & !hem2
    d1 = d1 & !hem1
    d2 = d2 & !hem2
  }
  
  res = as.integer(pa + ma + d1 + d2)
  res[inb1 | inb2] = NA_integer_
  res
}


### Modified from range-utils in IBDsim

# x: list of matrices of two column: start, end
rangeUnion = function(x) {
  nrws = lengths(x, use.names = FALSE)/2
  if (sum(nrws) == 0) return(matrix(numeric(0), ncol=2))
  
  n = length(nrws)
  vec = unlist(x, recursive = TRUE, use.names = FALSE)
  
  # 1 = interval start; -1 = interval stop
  event = rep.int(rep.int(c(1,-1), n), rep.int(nrws, rep.int(2,n))) 
  
  # Extend all regions by 1bp in both directions (to count consecutive regions as overlapping) 
  eps = 1e-6
  extvec = vec - event * eps
  
  # Sort the extended vector
  ord = order(extvec, event, na.last = TRUE, decreasing = FALSE) # NB: in rangeIntersect use '-startstop'
  m = extvec[ord]
  eventSorted = event[ord]
  
  #print(cbind(m,eventSorted,cumsum(eventSorted)))  #look at this to understand.
  idxEnd = seq_along(ord)[cumsum(eventSorted) == 0]
  idxStart = c(1, idxEnd[-length(idxEnd)] + 1)
  cbind(m[idxStart] + eps, m[idxEnd] - eps, deparse.level=0)
}

# x: a list of data frames, each produced by findPattern
segmentUnion = function(x, unit = "mb") {
  if(length(x) ==1) return(x[[1]])
  
  chrs = sort(unique.default(unlist(lapply(x, function(a) a[,1]))))
  
  cls = switch(unit, cm = c('startCM', 'endCM'), mb = c('startMB', 'endMB'))
  
  chrUnions = lapply(chrs, function(i) {
		chrArgs = lapply(x, function(a) {
		  a[a[,1] == i, cls, drop = FALSE]
		})
		resi = rangeUnion(chrArgs)
		resi = cbind(i, resi)
		colnames(resi) = c('chrom', cls)
		resi
	})
  
	do.call(rbind, chrUnions)
}

Any scripts or data that you put into this service are public.

ibdsim2 documentation built on Nov. 23, 2025, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ibdsim2
Simulation of Chromosomal Regions Shared by Family Members

R/segment_utils.R
In ibdsim2: Simulation of Chromosomal Regions Shared by Family Members

Defines functions segmentUnion rangeUnion ibdState jacquardState addStates segmentStats mergeSegments groupOverlaps stackIntervals alleleFlow

Documented in segmentStats

Try the ibdsim2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ibdsim2 Simulation of Chromosomal Regions Shared by Family Members

R/segment_utils.R In ibdsim2: Simulation of Chromosomal Regions Shared by Family Members

Defines functions segmentUnion rangeUnion ibdState jacquardState addStates segmentStats mergeSegments groupOverlaps stackIntervals alleleFlow

Documented in segmentStats

Try the ibdsim2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ibdsim2
Simulation of Chromosomal Regions Shared by Family Members

R/segment_utils.R
In ibdsim2: Simulation of Chromosomal Regions Shared by Family Members