R/func__findMinIncClade.R

Defines functions .searchMinCladePerPair findMinIncClade

Documented in findMinIncClade

#' @title Determine the minimal inclusive clades and calculate its frequency of
#' co-occurrence events of every pair of alleles.
#'
#' @description This function finds out the minimal clade that contains all samples
#' having a pair of patterns (representatives of alleles) co-occurring. It also
#' computes the frequency of co-occurrence events and the maximal sample distance
#' in each minimal clade. This function can be used for determining whether the
#' distributions of a pair of identically distributed alleles are identical-by-descent
#' (IBD). An NA vlaue appears when the patterns do not overlap (when min.co = 0).
#'
#' By design, this function takes as input the outputs of the lmm or findPhysLink
#' function when it is applied independently. It however serves as a subordinate
#' function of lmm/findPhysLink as well.
#'
#' @param lmms Allele-level results from linear mixed models, generated by the
#' function lmm or findPhysLink.
#' @param allele.pam An allele-level presence-absence matrix. It is the element
#' [["alleles"]][["A"]] in the output list of lmm or findPhysLink.
#' @param clade.pam A matrix for the presence-absence of samples in each clade
#' of an input tree. It can be obtained using the function tree2Clades of GeneMates.
#' @param clade.sizes A named vector of integers for the number of samples in
#' each clade. It can be obtained from the element "sizes" in the outputs of the
#' function tree2Clades. Optional.
#' @param sample.dists A square matrix for distances between samples. It can be
#' acquired through the function projectSamples.
#' @param n.cores An integer determining the number of cores used for parallel
#' computing. Valid values are the same as those for the findPhysLink function.
#'
#' @examples
#' assoc <- findPhysLink(...)  # Although this command calls the findMinIncClade
#' function, however, a user wants to regenerate the clade inforamtion for some
#' reasons.
#' C <- projectSamples(...)
#' clades <- tree2Clades(...)
#' clonality <- findMinIncClade(lmms = assoc[["lmms"]],
#'     allele.pam = assoc[["alleles"]][["A"]], clade.pam = clades[["pam"]],
#'     clade.sizes = clades[["sizes"]], sample.dists = C[["d"]], n.cores = 16)
#'
#' Dependency: parallel
#'
#' @author Yu Wan (\email{wanyuac@@126.com})
#' @export
#
#  Copyright 2017 - 2018 Yu Wan
#  Licensed under the Apache License, Version 2.0
#  First edition: 18 July 2017, the lastest edition: 23 May 2018

# Although this function may be faster if it processes patterns rather than alleles, I make it to work on alleles to simplify the
# script and to make the script more understandable.
findMinIncClade <- function(lmms, allele.pam, clade.pam, clade.sizes = NULL,
                            sample.dists, n.cores = -1) {
    print(paste0(Sys.time(), ": finding the minimal inclusive clade of each pair of alleles."))

    require(parallel)
    require(data.table)

    # match samples in clade.pam and sample.dists to allele.pam
    samples <- rownames(allele.pam)
    if (any(rownames(clade.pam) != samples)) {
        print("Warning: matching the sample order of clade.pam to allele.pam.")
        clade.pam <- clade.pam[samples, ]
    }
    if (any(rownames(sample.dists) != samples)) {
        print("Warning: matching the sample order of the distance matrix to allele.pam.")
        sample.dists <- sample.dists[samples, samples]
    }

    # initalise the output data frame
    c <- NULL  # clade summaries for pairs of alleles
    if (class(lmms) == "list") {  # then concatenate data frames in this list
        for (g in names(lmms)) {  # c("dif", "idd") or "dif" only
            d <- lmms[[g]][["h1"]][, c("y", "x", "y_pat", "x_pat", "pair", "n_y", "n_x", "n_xy")]
            if (!is.null(d)) {
                # assign groups of association results
                if (g == "dif") {
                    d$dif <- rep(1, times = nrow(d))  # Are alleles differently distributed? Yes
                } else {  # g == "idd"
                    d$dif <- rep(0, times = nrow(d))  # identically distributed; # beta = 1 for all idd. alleles and all associations are considered as significant.
                }
                c <- rbind.data.frame(c, d, stringsAsFactors = FALSE)
            }
        }
    } else if (class(lmms) == "data.frame") {
        if ("dif" %in% names(lmms)) {
            c <- lmms[, c("y", "x", "y_pat", "x_pat", "pair", "n_y", "n_x", "n_xy", "dif")]
        } else {
            c <- lmms[, c("y", "x", "y_pat", "x_pat", "pair", "n_y", "n_x", "n_xy")]
        }
    } else {
        stop("Argument error: \"lmms\" must be either a data frame or a list with elements \"dif\" and \"idd\".")
    }

    # calculate clade sizes when they are not provided
    if (is.null(clade.sizes)) {
        clade.sizes <- apply(clade.pam, 2, sum)
    }

    # only take one way from each pair of alleles
    c <- c[order(c$pair, decreasing = FALSE), ]
    c <- c[seq(from = 1, to = nrow(c) - 1, by = 2), ]  # c is a data frame

    # assign a job in parallel per pair of alleles
    cl <- makeCluster(.setCoreNum(n.cores = n.cores, cores.avai = detectCores()))
                      #outfile = "findMinIncClade.log")  # cores.avai >= 1; n.cores >= 1
    clusterExport(cl = cl,
                  varlist = list("c", "allele.pam", "clade.pam", "clade.sizes", "sample.dists"),
                  envir = environment())  # make variables accessible to different cores
    print(paste0(Sys.time(), ": Starting searching for the minimal clade for every pair of alleles."))
    min.clades <- parLapply(cl, 1 : nrow(c), .searchMinCladePerPair, c, allele.pam,
                            clade.pam, clade.sizes, sample.dists)  # wait until all jobs finish
    stopCluster(cl)
    print(paste0(Sys.time(), ": the search is finished successfully."))

    min.clades <- rbindlist(min.clades)
    min.clades <- min.clades[order(min.clades$pair, decreasing = FALSE), ]

    return(min.clades)  # return a single data frame for both the dif and idd groups
}

# A subordinate function of findMinIncClade
.searchMinCladePerPair <- function(i, lmms, allele.pam, clade.pam, clade.sizes, sample.dists) {
    r <- lmms[i, ]  # extract a row from the input data frame
    ax <- r$x  # get y and x alleles
    ay <- r$y
    py <- r$y_pat
    px <- r$x_pat
    x <- as.logical(allele.pam[, ax])  # presence-absence of the x allele; TRUE: presence; FALSE: absence
    y <- as.logical(allele.pam[, ay])  # for the y allele
    xy <- x & y  # co-occurrence events. Notice x and y may not overlap at all when they are differently distributed.
    n <- sum(xy)  # equals r$n_xy

    # In normal cases, find out clades having all samples in the vector sample.co.
    if (n > 0) {
        samples.co <- rownames(allele.pam)[xy]  # samples where ax and ay are co-occurring, equalling the number of co-occurrence events
        #print(paste0(ax, " and ", ay, " are co-occurring in ", length(samples.co),
        #             " samples: ", paste(samples.co, collapse = ",", ".")))
        candidate.clades <- names(clade.sizes)[which(as.integer(clade.sizes) >= n)]  # Each candiate clade must have at least n samples.
        clade.pam <- clade.pam[, candidate.clades]  # This step reduces the number of comparisons later on.
        #print(paste0(ncol(clade.pam), " candidate clades were found for ", ax,
        #             " and ", ay, "."))

        # The following command identifies clades that have all samples where x
        # and y alleles co-occur. sum(clade | xy) always > sum(clade) when one or
        # more samples in samples.co do not belong the the current clade; otherwise,
        # sum(clade | xy) = sum(clade), where samples.co represent a subset (may
        # equal all samples in the clade) of samples in the current clade. It is
        # impossible that sum(clade | xy) < sum(clade) or sum(xy).
        clade.xy <- apply(clade.pam, 2, function(c) sum(as.logical(c) | xy) == sum(c))
        inc.clades <- candidate.clades[as.logical(clade.xy)]  # IDs of all inclusive clades
        #print(paste0(length(inc.clades), " inclusive clades are found for ", ax,
        #             " and ", ay, "."))

        # get the minimal inclusive clade
        # The function which.min returns the first index of the minimum in a vector, which is correct for a tree topology
        # because it must have a unique minimal inclusive clade.
        min.clade <- inc.clades[which.min(as.integer(clade.sizes[inc.clades]))]  # ID of the minimal inclusive clade

        # retrieve the count of co-occurrence events
        min.clade.size <- as.integer(clade.sizes[[min.clade]])  # Can also be computed through sum(clade.pam[, min.clade]).
        f.xy <- round(n / min.clade.size * 100, digits = 4)  # co-occurrence frequency (%)

        # get the maximal distance between samples where the current pair of alleles are co-occurring
        #print(paste0("Retriving the maximum sample distance between the samples where ",
        #             ax, " and ", ay, " are co-occurring."))
        d.max <- max(sample.dists[samples.co, samples.co])
        #print(paste0("The maximum sample distance is ", d.max, " for ", ax, " and ",
        #             ay, "."))

        # construct the output data frame
        mic <- data.frame(y = c(ay, ax), x = c(ax, ay), y_pat = c(py, px),
                          x_pat = c(px, py), pair = rep(r$pair, times = 2),
                          n_y = rep(r$n_y, times = 2), n_x = rep(r$n_x, times = 2),
                          n_xy = rep(n, times = 2), clade = rep(min.clade, times = 2),
                          size = rep(min.clade.size, times = 2),
                          f_xy = rep(f.xy, times = 2),
                          ds_max = rep(d.max, times = 2),
                          stringsAsFactors = FALSE)  # ds: sample distances
        if ("dif" %in% names(r)) {  # when there is a "dif" column in the data frame lmms
            mic$dif <- rep(r$dif, times = 2)
        }
    } else {  # Alleles X and Y never co-occur in the current data set.
        mic <- NULL
    }
    print(paste0("Finishing finding the minimum inclusive clade for ", ax,
                 " and ", ay, "."))

    return(mic)
}
wanyuac/GeneMates documentation built on Aug. 12, 2022, 7:37 a.m.