#' @title Determine the minimal inclusive clades and calculate its frequency of
#' co-occurrence events of every pair of alleles.
#'
#' @description This function finds out the minimal clade that contains all samples
#' having a pair of patterns (representatives of alleles) co-occurring. It also
#' computes the frequency of co-occurrence events and the maximal sample distance
#' in each minimal clade. This function can be used for determining whether the
#' distributions of a pair of identically distributed alleles are identical-by-descent
#' (IBD). An NA vlaue appears when the patterns do not overlap (when min.co = 0).
#'
#' By design, this function takes as input the outputs of the lmm or findPhysLink
#' function when it is applied independently. It however serves as a subordinate
#' function of lmm/findPhysLink as well.
#'
#' @param lmms Allele-level results from linear mixed models, generated by the
#' function lmm or findPhysLink.
#' @param allele.pam An allele-level presence-absence matrix. It is the element
#' [["alleles"]][["A"]] in the output list of lmm or findPhysLink.
#' @param clade.pam A matrix for the presence-absence of samples in each clade
#' of an input tree. It can be obtained using the function tree2Clades of GeneMates.
#' @param clade.sizes A named vector of integers for the number of samples in
#' each clade. It can be obtained from the element "sizes" in the outputs of the
#' function tree2Clades. Optional.
#' @param sample.dists A square matrix for distances between samples. It can be
#' acquired through the function projectSamples.
#' @param n.cores An integer determining the number of cores used for parallel
#' computing. Valid values are the same as those for the findPhysLink function.
#'
#' @examples
#' assoc <- findPhysLink(...) # Although this command calls the findMinIncClade
#' function, however, a user wants to regenerate the clade inforamtion for some
#' reasons.
#' C <- projectSamples(...)
#' clades <- tree2Clades(...)
#' clonality <- findMinIncClade(lmms = assoc[["lmms"]],
#' allele.pam = assoc[["alleles"]][["A"]], clade.pam = clades[["pam"]],
#' clade.sizes = clades[["sizes"]], sample.dists = C[["d"]], n.cores = 16)
#'
#' Dependency: parallel
#'
#' @author Yu Wan (\email{wanyuac@@126.com})
#' @export
#
# Copyright 2017 - 2018 Yu Wan
# Licensed under the Apache License, Version 2.0
# First edition: 18 July 2017, the lastest edition: 23 May 2018
# Although this function may be faster if it processes patterns rather than alleles, I make it to work on alleles to simplify the
# script and to make the script more understandable.
findMinIncClade <- function(lmms, allele.pam, clade.pam, clade.sizes = NULL,
sample.dists, n.cores = -1) {
print(paste0(Sys.time(), ": finding the minimal inclusive clade of each pair of alleles."))
require(parallel)
require(data.table)
# match samples in clade.pam and sample.dists to allele.pam
samples <- rownames(allele.pam)
if (any(rownames(clade.pam) != samples)) {
print("Warning: matching the sample order of clade.pam to allele.pam.")
clade.pam <- clade.pam[samples, ]
}
if (any(rownames(sample.dists) != samples)) {
print("Warning: matching the sample order of the distance matrix to allele.pam.")
sample.dists <- sample.dists[samples, samples]
}
# initalise the output data frame
c <- NULL # clade summaries for pairs of alleles
if (class(lmms) == "list") { # then concatenate data frames in this list
for (g in names(lmms)) { # c("dif", "idd") or "dif" only
d <- lmms[[g]][["h1"]][, c("y", "x", "y_pat", "x_pat", "pair", "n_y", "n_x", "n_xy")]
if (!is.null(d)) {
# assign groups of association results
if (g == "dif") {
d$dif <- rep(1, times = nrow(d)) # Are alleles differently distributed? Yes
} else { # g == "idd"
d$dif <- rep(0, times = nrow(d)) # identically distributed; # beta = 1 for all idd. alleles and all associations are considered as significant.
}
c <- rbind.data.frame(c, d, stringsAsFactors = FALSE)
}
}
} else if (class(lmms) == "data.frame") {
if ("dif" %in% names(lmms)) {
c <- lmms[, c("y", "x", "y_pat", "x_pat", "pair", "n_y", "n_x", "n_xy", "dif")]
} else {
c <- lmms[, c("y", "x", "y_pat", "x_pat", "pair", "n_y", "n_x", "n_xy")]
}
} else {
stop("Argument error: \"lmms\" must be either a data frame or a list with elements \"dif\" and \"idd\".")
}
# calculate clade sizes when they are not provided
if (is.null(clade.sizes)) {
clade.sizes <- apply(clade.pam, 2, sum)
}
# only take one way from each pair of alleles
c <- c[order(c$pair, decreasing = FALSE), ]
c <- c[seq(from = 1, to = nrow(c) - 1, by = 2), ] # c is a data frame
# assign a job in parallel per pair of alleles
cl <- makeCluster(.setCoreNum(n.cores = n.cores, cores.avai = detectCores()))
#outfile = "findMinIncClade.log") # cores.avai >= 1; n.cores >= 1
clusterExport(cl = cl,
varlist = list("c", "allele.pam", "clade.pam", "clade.sizes", "sample.dists"),
envir = environment()) # make variables accessible to different cores
print(paste0(Sys.time(), ": Starting searching for the minimal clade for every pair of alleles."))
min.clades <- parLapply(cl, 1 : nrow(c), .searchMinCladePerPair, c, allele.pam,
clade.pam, clade.sizes, sample.dists) # wait until all jobs finish
stopCluster(cl)
print(paste0(Sys.time(), ": the search is finished successfully."))
min.clades <- rbindlist(min.clades)
min.clades <- min.clades[order(min.clades$pair, decreasing = FALSE), ]
return(min.clades) # return a single data frame for both the dif and idd groups
}
# A subordinate function of findMinIncClade
.searchMinCladePerPair <- function(i, lmms, allele.pam, clade.pam, clade.sizes, sample.dists) {
r <- lmms[i, ] # extract a row from the input data frame
ax <- r$x # get y and x alleles
ay <- r$y
py <- r$y_pat
px <- r$x_pat
x <- as.logical(allele.pam[, ax]) # presence-absence of the x allele; TRUE: presence; FALSE: absence
y <- as.logical(allele.pam[, ay]) # for the y allele
xy <- x & y # co-occurrence events. Notice x and y may not overlap at all when they are differently distributed.
n <- sum(xy) # equals r$n_xy
# In normal cases, find out clades having all samples in the vector sample.co.
if (n > 0) {
samples.co <- rownames(allele.pam)[xy] # samples where ax and ay are co-occurring, equalling the number of co-occurrence events
#print(paste0(ax, " and ", ay, " are co-occurring in ", length(samples.co),
# " samples: ", paste(samples.co, collapse = ",", ".")))
candidate.clades <- names(clade.sizes)[which(as.integer(clade.sizes) >= n)] # Each candiate clade must have at least n samples.
clade.pam <- clade.pam[, candidate.clades] # This step reduces the number of comparisons later on.
#print(paste0(ncol(clade.pam), " candidate clades were found for ", ax,
# " and ", ay, "."))
# The following command identifies clades that have all samples where x
# and y alleles co-occur. sum(clade | xy) always > sum(clade) when one or
# more samples in samples.co do not belong the the current clade; otherwise,
# sum(clade | xy) = sum(clade), where samples.co represent a subset (may
# equal all samples in the clade) of samples in the current clade. It is
# impossible that sum(clade | xy) < sum(clade) or sum(xy).
clade.xy <- apply(clade.pam, 2, function(c) sum(as.logical(c) | xy) == sum(c))
inc.clades <- candidate.clades[as.logical(clade.xy)] # IDs of all inclusive clades
#print(paste0(length(inc.clades), " inclusive clades are found for ", ax,
# " and ", ay, "."))
# get the minimal inclusive clade
# The function which.min returns the first index of the minimum in a vector, which is correct for a tree topology
# because it must have a unique minimal inclusive clade.
min.clade <- inc.clades[which.min(as.integer(clade.sizes[inc.clades]))] # ID of the minimal inclusive clade
# retrieve the count of co-occurrence events
min.clade.size <- as.integer(clade.sizes[[min.clade]]) # Can also be computed through sum(clade.pam[, min.clade]).
f.xy <- round(n / min.clade.size * 100, digits = 4) # co-occurrence frequency (%)
# get the maximal distance between samples where the current pair of alleles are co-occurring
#print(paste0("Retriving the maximum sample distance between the samples where ",
# ax, " and ", ay, " are co-occurring."))
d.max <- max(sample.dists[samples.co, samples.co])
#print(paste0("The maximum sample distance is ", d.max, " for ", ax, " and ",
# ay, "."))
# construct the output data frame
mic <- data.frame(y = c(ay, ax), x = c(ax, ay), y_pat = c(py, px),
x_pat = c(px, py), pair = rep(r$pair, times = 2),
n_y = rep(r$n_y, times = 2), n_x = rep(r$n_x, times = 2),
n_xy = rep(n, times = 2), clade = rep(min.clade, times = 2),
size = rep(min.clade.size, times = 2),
f_xy = rep(f.xy, times = 2),
ds_max = rep(d.max, times = 2),
stringsAsFactors = FALSE) # ds: sample distances
if ("dif" %in% names(r)) { # when there is a "dif" column in the data frame lmms
mic$dif <- rep(r$dif, times = 2)
}
} else { # Alleles X and Y never co-occur in the current data set.
mic <- NULL
}
print(paste0("Finishing finding the minimum inclusive clade for ", ax,
" and ", ay, "."))
return(mic)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.