R/addtree.R

Defines functions plot.cl_addtree .decompose_addtree .cl_addtree_from_addtree_approximation .cl_addtree_from_veclh as.cl_addtree.phylo as.cl_addtree.default as.cl_addtree .make_centroid_matrix ls_fit_centroid .non_additivity .ls_fit_addtree_by_iterative_reduction .ls_fit_addtree_by_iterative_projection .make_penalty_gradient_addtree .make_penalty_function_addtree .ls_fit_addtree_by_SUMT ls_fit_addtree

Documented in as.cl_addtree ls_fit_addtree ls_fit_centroid

### * ls_fit_addtree

ls_fit_addtree <-
function(x, method = c("SUMT", "IP", "IR"), weights = 1,
         control = list())
{
    if(!inherits(x, "dist"))
        x <- as.dist(x)

    ## Catch some special cases right away.
    if(attr(x, "Size") <= 3L)
        return(as.cl_addtree(x))
    if(.non_additivity(x, max = TRUE) == 0)
        return(as.cl_addtree(x))

    ## Handle argument 'weights'.
    ## This is somewhat tricky ...
    if(is.matrix(weights)) {
        weights <- as.dist(weights)
        if(length(weights) != length(x))
            stop("Argument 'weights' must be compatible with 'x'.")
    }
    else
        weights <- rep_len(weights, length(x))
    if(any(weights < 0))
        stop("Argument 'weights' has negative elements.")
    if(!any(weights > 0))
        stop("Argument 'weights' has no positive elements.")

    method <- match.arg(method)
    switch(method,
           SUMT = .ls_fit_addtree_by_SUMT(x, weights, control),
           IP = {
               .ls_fit_addtree_by_iterative_projection(x, weights,
                                                       control)
           },
           IR = {
               .ls_fit_addtree_by_iterative_reduction(x, weights,
                                                      control)
           })
}

### ** .ls_fit_addtree_by_SUMT

.ls_fit_addtree_by_SUMT <-
function(x, weights = 1, control = list())
{
    ## Control parameters:
    ## gradient,
    gradient <- control$gradient
    if(is.null(gradient))
        gradient <- TRUE
    ## nruns,
    nruns <- control$nruns
    ## start,
    start <- control$start

    ## Handle start values and number of runs.
    if(!is.null(start)) {
        if(!is.list(start)) {
            ## Be nice to users.
            start <- list(start)
        }
    } else if(is.null(nruns)) {
        ## Use nruns only if start is not given.
        nruns <- 1L
    }

    w <- weights / sum(weights)
    n <- attr(x, "Size")
    labels <- attr(x, "Labels")    

    ## Handle missing values in x along the lines of de Soete (1984):
    ## set the corresponding weights to 0, and impute by the weighted
    ## mean.
    ind <- which(is.na(x))
    if(any(ind)) {
        w[ind] <- 0
        x[ind] <- weighted.mean(x, w, na.rm = TRUE)
    }

    L <- function(d) sum(w * (d - x) ^ 2)
    P <- .make_penalty_function_addtree(n)
    if(gradient) {
        grad_L <- function(d) 2 * w * (d - x)
        grad_P <- .make_penalty_gradient_addtree(n)
    }
    else {
        grad_L <- grad_P <- NULL
    }

    if(is.null(start)) {
        ## Initialize by "random shaking".  Use sd() for simplicity.
        start <- replicate(nruns,  
                           x + rnorm(length(x), sd = sd(x) / sqrt(3)),
                           simplify = FALSE)
    }

    ## And now ...
    d <- sumt(start, L, P, grad_L, grad_P,
              method = control$method, eps = control$eps,
              q = control$q, verbose = control$verbose,
              control = as.list(control$control))$x

    ## Round to enforce additivity, and hope for the best ...
    .cl_addtree_from_addtree_approximation(d, n, labels)    

}

.make_penalty_function_addtree <-
function(n)
    function(d) {
        (.non_additivity(.symmetric_matrix_from_veclh(d, n))
         + sum(pmin(d, 0) ^ 2))
    }

.make_penalty_gradient_addtree <-
function(n)
    function(d) {
        gr <- matrix(.C(C_deviation_from_additivity_gradient,
                        as.double(.symmetric_matrix_from_veclh(d, n)),
                        as.integer(n),
                        gr = double(n * n))$gr,
                     n, n)
        gr[row(gr) > col(gr)] + 2 * sum(pmin(d, 0))
    }
    

### ** .ls_fit_addtree_by_iterative_projection

## <NOTE>
## Functions
##   .ls_fit_addtree_by_iterative_projection()
##   .ls_fit_addtree_by_iterative_reduction()
## are really identical apart from the name of the C routine they call.
## (But will this necessarily always be the case in the future?)
## Merge maybe ...
## </NOTE>

.ls_fit_addtree_by_iterative_projection <-
function(x, weights = 1, control = list())
{
    if(any(diff(weights)))
        warning("Non-identical weights currently not supported.")

    labels <- attr(x, "Labels")
    x <- as.matrix(x)
    n <- nrow(x)

    ## Control parameters:
    ## maxiter,
    maxiter <- control$maxiter
    if(is.null(maxiter))
        maxiter <- 10000L
    ## nruns,
    nruns <- control$nruns
    ## order,
    order <- control$order
    ## tol,
    tol <- control$tol
    if(is.null(tol))
        tol <- 1e-8
    ## verbose.
    verbose <- control$verbose
    if(is.null(verbose))
        verbose <- getOption("verbose")

    ## Handle order and nruns.
    if(!is.null(order)) {
        if(!is.list(order))
            order <- as.list(order)
        if(!all(vapply(order,
                       function(o) all(sort(o) == seq_len(n)),
                       NA)))
            stop("All given orders must be valid permutations.")
    }
    else {
        if(is.null(nruns))
            nruns <- 1L
        order <- replicate(nruns, sample(n), simplify = FALSE)
    }

    ind <- lower.tri(x)
    L <- function(d) sum(weights * (x - d)[ind] ^ 2)

    d_opt <- NULL
    v_opt <- Inf
    for(run in seq_along(order)) {
        if(verbose)
            message(gettextf("Iterative projection run: %d", run))
        d <- .C(C_ls_fit_addtree_by_iterative_projection,
                as.double(x),
                as.integer(n),
                as.integer(order[[run]] - 1L),
                as.integer(maxiter),
                iter = integer(1L),
                as.double(tol),
                as.logical(verbose))[[1L]]
        v <- L(d)
        if(v < v_opt) {
            v_opt <- v
            d_opt <- d
        }
    }

    d <- matrix(d_opt, n)
    dimnames(d) <- list(labels, labels)
    .cl_addtree_from_addtree_approximation(as.dist(d))
}

### ** .ls_fit_addtree_by_iterative_reduction

.ls_fit_addtree_by_iterative_reduction <-
function(x, weights = 1, control = list())
{
    if(any(diff(weights)))
        warning("Non-identical weights currently not supported.")

    labels <- attr(x, "Labels")
    x <- as.matrix(x)
    n <- nrow(x)

    ## Control parameters:
    ## maxiter,
    maxiter <- control$maxiter
    if(is.null(maxiter))
        maxiter <- 10000L
    ## nruns,
    nruns <- control$nruns
    ## order,
    order <- control$order
    ## tol,
    tol <- control$tol
    if(is.null(tol))
        tol <- 1e-8
    ## verbose.
    verbose <- control$verbose
    if(is.null(verbose))
        verbose <- getOption("verbose")

    ## Handle order and nruns.
    if(!is.null(order)) {
        if(!is.list(order))
            order <- as.list(order)
        if(!all(vapply(order,
                       function(o) all(sort(o) == seq_len(n)),
                       NA)))
            stop("All given orders must be valid permutations.")
    }
    else {
        if(is.null(nruns))
            nruns <- 1L
        order <- replicate(nruns, sample(n), simplify = FALSE)
    }

    ind <- lower.tri(x)
    L <- function(d) sum(weights * (x - d)[ind] ^ 2)

    d_opt <- NULL
    v_opt <- Inf
    for(run in seq_along(order)) {
        if(verbose)
            message(gettextf("Iterative reduction run: %d", run))
        d <- .C(C_ls_fit_addtree_by_iterative_reduction,
                as.double(x),
                as.integer(n),
                as.integer(order[[run]] - 1L),
                as.integer(maxiter),
                iter = integer(1L),
                as.double(tol),
                as.logical(verbose))[[1L]]
        v <- L(d)
        if(v < v_opt) {
            v_opt <- v
            d_opt <- d
        }
    }

    d <- matrix(d_opt, n)
    dimnames(d) <- list(labels, labels)
    .cl_addtree_from_addtree_approximation(as.dist(d))
}

### * .non_additivity

.non_additivity <-
function(x, max = FALSE)
{
    if(!is.matrix(x))
        x <- .symmetric_matrix_from_veclh(x)
    .C(C_deviation_from_additivity,
       as.double(x),
       as.integer(nrow(x)),
       fn = double(1L),
       as.logical(max))$fn
}

### * ls_fit_centroid

ls_fit_centroid <-
function(x)
{
    ## Fit a centroid additive tree distance along the lines of Carroll
    ## & Pruzansky (1980).  In fact, solving
    ##
    ##    \sum_{i,j: i \ne j} (\delta_{ij} - (g_i + g_j)) ^ 2 => min_g
    ##
    ## gives \sum_{j: j \ne i} (g_i + g_j - \delta_{ij}) = 0, or (also
    ## in Barthemely & Guenoche)
    ##
    ##    (n - 2) g_i + \sum_j g_j = \sum_{j: j \ne i} \delta_{ij}
    ##
    ## which after summing over all i and some manipulations eventually
    ## gives
    ##
    ##    g_i = \frac{1}{n-2} (v_i - m),
    ##
    ##    v_i = \sum_{j: j \ne i} \delta_{ij}
    ##    s   = \frac{1}{2(n-1)} \sum_{i,j: j \ne i} \delta_{ij}

    n <- attr(x, "Size")

    if(n <= 2L)
        return(as.cl_addtree(0 * x))

    x <- as.matrix(x)
    g <- rowSums(x) / (n - 2) - sum(x) / (2 * (n - 1) * (n - 2))
    as.cl_addtree(as.dist(.make_centroid_matrix(g)))
}

.make_centroid_matrix <-
function(g)
{
    y <- outer(g, g, `+`)
    diag(y) <- 0
    y
}

### * as.cl_addtree

as.cl_addtree <-
function(x)
    UseMethod("as.cl_addtree")

as.cl_addtree.default <-
function(x)    
{
    if(inherits(x, "cl_addtree"))
        x
    else if(is.atomic(x) || inherits(x, "cl_ultrametric"))
        .cl_addtree_from_veclh(x)
    else if(is.matrix(x)) {
        ## Should actually check whether the matrix is symmetric, >= 0
        ## and satisfies the 4-point conditions ...
        .cl_addtree_from_veclh(as.dist(x))
    }
    else if(is.cl_dendrogram(x))
        .cl_addtree_from_veclh(cl_ultrametric(x))
    else
        stop("Cannot coerce to 'cl_addtree'.")
}

as.cl_addtree.phylo <-
function(x)
    .cl_addtree_from_veclh(as.dist(cophenetic(x)))
## Phylogenetic trees with edge/branch lengths yield additive tree
## dissimilarities.

### * .cl_addtree_from_veclh

.cl_addtree_from_veclh <-
function(x, size = NULL, labels = NULL)
{
    cl_proximity(x, "Additive tree distances",
                 labels = labels, size = size,
                 class = c("cl_addtree", "cl_dissimilarity",
                 "cl_proximity", "dist"))
}

### * .cl_addtree_from_addtree_approximation

.cl_addtree_from_addtree_approximation <-
function(x, size = NULL, labels = NULL)
{
    ## Turn x into an addtree after possibly rounding to non-additivity
    ## significance (note that this is not guaranteed to work ...).
    mnum <- .non_additivity(x, max = TRUE)
    x <- round(x, floor(abs(log10(mnum))))
    .cl_addtree_from_veclh(x, size = size, labels = labels)
}

### * .decompose_addtree

.decompose_addtree <-
function(x, const = NULL)
{
    ## Decompose an addtree into an ultrametric and a centroid
    ## distance.

    ## If 'const' is not given, we take the root as half way between the
    ## diameter of the addtree, and choose a minimal constant to ensure
    ## non-negativity (but not positivity) of the ultrametric.

    ## As this is all slightly dubious and it is not quite clear how
    ## much positivity we want in the ultrametric of the decomposition,
    ## we keep this hidden.  For plotting addtrees, the choice of the
    ## constant does not seem to matter.

    x <- as.matrix(x)
    n <- nrow(x)

    ## Determine diameter.
    ind <- which.max(x) - 1
    u <- ind %% n + 1
    v <- ind %/% n + 1

    if(!is.null(const))
        g <- pmax(x[u, ], x[v, ]) - const
    else {
        g <- pmax(x[u, ], x[v, ]) - x[u, v] / 2
        u <- x - .make_centroid_matrix(g)
        k <- - min(u)
        g <- g - k / 2
    }
    u <- x - .make_centroid_matrix(g)

    names(g) <- rownames(x)

    ## Ensure a valid ultrametric.
    d <- .ultrametrify(as.dist(u))
    u <- .cl_ultrametric_from_veclh(d, nrow(x), rownames(x))

    ## Note that we return the centroid distances to the root, and not
    ## between the objects (as.dist(.make_centroid_matrix(g))) ...
    list(Ultrametric = as.cl_ultrametric(u), Centroid = g)
}

### * plot.cl_addtree

plot.cl_addtree <-
function(x, ...)
{
    ## Construct a dendrogram-style representation of the addtree with
    ## the root half way between the diameter, and plot.
    y <- .decompose_addtree(x, max(x))
    u <- y$Ultrametric
    g <- y$Centroid
    ## We halve the scale of the ultrametric, and add the maximal g from
    ## the centroid.
    h <- hclust(as.dist(u / 2), "single")
    h$height <- h$height + max(g)
    d <- as.dendrogram(h)
    ## Now modify the heights of the leaves so that the objects giving
    ## the diameter of the addtree end up with height zero.
    g <- max(g) - g
    names(g) <- labels(g)
    d <- dendrapply(d, function(n) {
        if(!is.leaf(n)) return(n)
        attr(n, "height") <- g[attr(n, "label")]
        n
    })
    ## And finally plot
    plot(d, ...)
}


### Local variables: ***
### mode: outline-minor ***
### outline-regexp: "### [*]+" ***
### End: ***

Try the clue package in your browser

Any scripts or data that you put into this service are public.

clue documentation built on Sept. 23, 2023, 5:06 p.m.