Nothing
## mjn.R (2023-02-23)
## Median-Joining Network
## Copyright 2017-2023 Emmanuel Paradis
## This file is part of the R-package `pegas'.
## See the file ../DESCRIPTION for licensing issues.
mjn <- function(x, epsilon = 0, max.n.cost = 10000, prefix = "median.vector_", quiet = FALSE)
{
if (is.data.frame(x)) x <- as.matrix(x)
if (mode(x) == "numeric") {
if (!all(unique.default(x) %in% 0:1))
stop("mjn() requires binary data (0/1)")
} else {
if (!inherits(x, "DNAbin"))
stop("mjn() requires DNA or binary data (0/1)")
if (is.list(x)) x <- as.matrix(x)
}
getIndex <- function(i, j, n) {
if (i < j) n*(i - 1) - i*(i - 1)/2 + j - i
else n*(j - 1) - j*(j - 1)/2 + i - j
}
## the reverse operation:
getIandJ <- function(ij, n) {
b <- n - 0.5
i <- ceiling(b - sqrt(b * b - 2 * ij))
j <- n * (1 - i) + (i + 1) * i / 2 + ij
as.integer(c(i, j))
}
## the above function uses a quadratic equation to find i;
## it is slightly faster than the previous version (below)
## which uses an iteration (see sources of pegas 1.1)
purgeObsolete <- function()
{
res <- integer()
if (nrow(x) > n) {
tab <- tabulate(c(link1, link2))[(n + 1):nrow(x)]
sel <- tab == 1 | tab == 2
if (any(sel)) {
res <- which(sel) + n
x <<- x[-res, ]
}
}
res
}
## setting functions depending on the type of data
if (inherits(x, "DNAbin")) { # mode(x) == "raw"
getMedianVectors <- function(x) {
## returns 1 or 3 sequences
p <- as.integer(ncol(x))
#browser()
ans <- .C(getMedianVectors_DNAbin_mjn, x, p, raw(3L * p), 1L)
res <- ans[[3L]]
dim(res) <- c(3L, p)
if (ans[[4L]]) res <- res[1L, , drop = FALSE]
res
}
funDist <-function(x) dist.dna(x, "n", pairwise.deletion = TRUE)
funVariableCol <- seg.sites
alreadyIn <- function(x, table) .Call(alreadyIn_mjn_DNAbin, x, table)
} else { # mode(x) == "numeric"
getMedianVectors <- function(x) { # for binary (0/1) data
p <- ncol(x)
res <- matrix(NA_real_, 1L, p)
for (j in 1:p) res[, j] <- as.integer(sum(x[, j]) > 1)
res
}
funDist <- function(x) dist(x, "manhattan")
funVariableCol <- function(x) {
foo <- function(x) length(unique.default(x)) > 1
which(apply(x, 2, foo))
}
alreadyIn <- function(x, table) {
foo <- function(y) all(y == x)
any(apply(table, 1, foo))
}
}
## this speeds-up significantly the indexing operations
if (mode(x) == "raw") class(x) <- c("matrix", "array")
## initialization --
n <- nrow(x)
nextX <- n + 1L
## step 1 --
Delta <- funDist(x)
## find the d-step components with an MST:
MST <- mst(Delta)
if (n < 3) {
warning("less than 3 observations: function mst() was used")
return(MST)
}
## step 2 --
link1 <- MST[, 1]
link2 <- MST[, 2]
weight <- MST[, 3]
## add extra links depending on the value of epsilon
threshold <- max(weight) + epsilon
minDelta <- min(Delta)
extra <- which(Delta > minDelta & Delta <= minDelta + threshold)
i <- 1L
while (i <= length(extra)) {
p <- getIandJ(extra[i], n = n)
if (!any(p[1] == link1 & p[2] == link2)) {
link1 <- c(link1, p[1])
link2 <- c(link2, p[2])
weight <- c(weight, Delta[extra[i]])
}
i <- i + 1L
}
n.cost <- 0L
lambda <- Inf
if (!quiet) cat("Adding median vectors:")
repeat {
## step 3 --
purged <- purgeObsolete()
if (length(purged)) {
sel <- purged == link1 | purged == link2
link1 <- link1[!sel]
link2 <- link2[!sel]
weight <- weight[!sel]
}
triplet <- combn(1:nrow(x), 3)
median.vector <- NULL
for (i in 1:ncol(triplet)) {
s <- triplet[, i]
## step 4 --
U <- s[1]; V <- s[2]; W <- s[3]
l1 <- any(U == link1 & V == link2)
l2 <- any(U == link1 & W == link2)
l3 <- any(V == link1 & W == link2)
## check if at least 2 links feasible:
if (l1 + l2 + l3 < 2) next
## get the median vectors:
subsamp <- x[s, ] # get the sequences for triplet 's'
s4 <- funVariableCol(subsamp)
if (!length(s4))
stop("maybe there are duplicated sequences in your data")
subsamp4 <- subsamp[, s4, drop = FALSE] # get only the variable cols for 's'
X <- getMedianVectors(subsamp4)
tmp <- subsamp[rep(1, nrow(X)), , drop = FALSE]
tmp[, s4] <- X
## check that 'tmp' is not yet among the current sequence types:
check.presence <- alreadyIn(tmp, x)
if (all(check.presence)) next
if (any(check.presence)) tmp <- tmp[!check.presence, , drop = FALSE]
## ... same thing with the sequences in median.vector:
if (!is.null(median.vector)) {
check.presence <- alreadyIn(tmp, median.vector)
if (all(check.presence)) next
if (any(check.presence)) tmp <- tmp[!check.presence, , drop = FALSE]
}
m <- nrow(tmp)
rownames(tmp) <- paste0(prefix, nextX:(nextX + m - 1))
dist2X <- matrix(0, m, 3)
for (k in 1:m)
dist2X[k, ] <- funDist(rbind(tmp[k, ], subsamp))[1:3]
connection.cost <- rowSums(dist2X)
## connection.cost <- sum(dist2X)
n.cost <- n.cost + length(connection.cost)
if (n.cost > max.n.cost) stop("too many connection costs computed")
if (any(connection.cost < lambda)) lambda <- min(connection.cost)
for (k in 1:m) {
if (connection.cost[k] <= lambda + epsilon) {
link1 <- c(link1, s)
link2 <- c(link2, rep(nextX, 3))
weight <- c(weight, dist2X[k, ])
median.vector <-
if (is.null(median.vector)) tmp[k, , drop = FALSE]
else rbind(median.vector, tmp[k, , drop = FALSE])
nextX <- nextX + 1L
}
}
}
if (!quiet) cat("", nrow(median.vector))
if (is.null(median.vector)) break else x <- rbind(x, median.vector)
}
## step 5 --
N <- nrow(x)
d <- funDist(x)
dnet <- numeric(N*(N - 1)/2) # distances inferred from the network
m <- matrix(NA_real_, 0, 3)
forest <- 1:N
ud <- sort(unique(d)) # unique distances in increasing order
## build the network:
i <- 1L
if (!quiet) {
cat("\nBuilding the network... number of clusters:")
previousNC <- N
}
while (length(unique(forest)) > 1) {
if (!quiet) {
if (length(unique(forest)) < previousNC) {
previousNC <- length(unique(forest))
cat("", previousNC)
}
}
delta <- ud[i]
for (iud in which(d == delta)) {
p <- getIandJ(iud, N)
p1 <- p[1L]
p2 <- p[2L]
f1 <- forest[p1]
f2 <- forest[p2]
if (f1 != f2) {
## update dnet:
for (k1 in which(forest == f1)) {
for (k2 in which(forest == f2)) {
A <- if (k1 == p1) 0 else dnet[getIndex(k1, p1, N)]
B <- if (k2 == p2) 0 else dnet[getIndex(k2, p2, N)]
dnet[getIndex(k1, k2, N)] <- A + B + delta
}
}
forest[forest == f2] <- f1
m <- rbind(m, c(p, delta))
} else {
if (delta < dnet[iud]) {
m <- rbind(m, c(p, delta))
## need to update dnet
n1 <- c(m[m[, 2] == p1, 1], m[m[, 1] == p1, 2])
n2 <- c(m[m[, 2] == p2, 1], m[m[, 1] == p2, 2])
for (k1 in c(p1, n1)) {
for (k2 in c(p2, n2)) {
if (k1 == k2) next
j <- getIndex(k1, k2, N)
A <- if (k1 == p1) 0 else dnet[getIndex(k1, p1, N)]
B <- if (k2 == p2) 0 else dnet[getIndex(k2, p2, N)]
tmp <- A + B + delta
if (dnet[j] > tmp) dnet[j] <- tmp
}
}
}
}
}
i <- i + 1L
if (i == 2e3) stop("too many links")
}
## define the alternative links wrt the MST:
MST <- mst(funDist(x))
MST.str <- paste(MST[, 1], MST[, 2], sep = "\r")
m.str <- paste(m[, 1], m[, 2], sep = "\r")
k <- match(m.str, MST.str)
nak <- is.na(k)
if (any(nak)) {
alt <- m[nak, , drop = FALSE]
colnames(alt) <- c("", "", "step")
m <- m[!nak, , drop = FALSE]
attr(m, "alter.links") <- alt
}
dimnames(m) <- list(NULL, c("", "", "step"))
attr(m, "labels") <- rownames(x)
if (mode(x) == "raw") class(x) <- "DNAbin"
attr(m, "data") <- x
attr(m, "prefix") <- prefix
class(m) <- c("mjn", "haploNet")
if (!quiet) cat(" 1\n")
m
}
plot.mjn <- function(x, shape = c("circles", "diamonds"),
bg = c("green", "slategrey"), labels = FALSE, ...)
{
prefix <- attr(x, "prefix")
labs <- labels(x)
if (length(shape) != 2) stop("argument 'shape' must give two shapes")
if (length(bg) != 2) stop("argument 'bg' must give two colours")
## identify the median vectors among the labels (1 or 2):
mv <- grepl(paste0("^", prefix), labs) + 1L
plot.haploNet(x, bg = bg[mv], shape = shape[mv], labels = labels, ...)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.