## This code is part of the polenta package
## © C. Heibl 2016 (last update 2017-11-08)
#' @title Transitivity Merge
#' @description Merges two multiple sequence alignments and their reliability
#' scores using the transitivity criterion.
#' @param x A list containing objects of class \code{"\link[=polentaDNA-class]{polentaDNA}"}.
#' @param id An integer vector of length 2 indicating the two elements of
#' \code{x} that will be merged.
#' @param exec A character string giving the path to the alignment program
#' executable. This alignment program is only used to merge leading and
#' trailing positions in the two MSAa that have no shared sequences. All other
#' positions are merged by transitivity.
#' @return An object of class \code{"\link[=polentaDNA-class]{polentaDNA}"}.
#' @importFrom ape as.DNAbin cbind.DNAbin rbind.DNAbin
#' @export
transitivityMerge <- function(x, id = c(1, 2), exec){
include_scores <- inherits(x[[1]], "polentaDNA")
if (include_scores){
msa1 <- x[[id[1]]]@msa
msa2 <- x[[id[2]]]@msa
} else {
msa1 <- x[[id[1]]]
msa2 <- x[[id[2]]]
}
## shared species
## --------------
shared <- intersect(rownames(msa1), rownames(msa2))
if (!length(shared)) stop("MSAs do not overlap")
## If non-shared sequences in both MSAs extend the
## 3'- or 5'-ends of shared sequences, they cannot
## be merged properly. Hence we have to identify
## and delete those positions
## --------------------------
firstInformativePosition <- function(z){
z <- which(z %in% as.raw(4))
z <- z == 1:length(z)
if (any(z)){
max(which(z)) + 1
} else {
1
}
}
lastInformativePosition <- function(z){
seq.length <- length(z)
z <- which(rev(z) %in% as.raw(4))
z <- z == 1:length(z)
if (any(z)){
seq.length - max(which(z))
} else {
seq.length
}
}
h1 <- apply(msa1, 1, firstInformativePosition)
h1 <- min(h1[shared])
t1 <- apply(msa1, 1, lastInformativePosition)
t1 <- max(t1[shared])
h2 <- apply(msa2, 1, firstInformativePosition)
h2 <- min(h2[shared])
t2 <- apply(msa2, 1, lastInformativePosition)
t2 <- max(t2[shared])
both_leading <- (h1 > 1) & (h2 > 1)
both_trailing <- (t1 < ncol(msa1)) & (t2 < ncol(msa2))
if (both_trailing) {
msa_trailing <- mafft.merge(list(msa1[, (t1 + 1):ncol(msa1)], msa2[, (t2 + 1):ncol(msa2)]),
exec = exec)
msa_trailing <- msa_trailing[!duplicated(rownames(msa_trailing)), ]
msa1 <- msa1[, 1:t1]
msa2 <- msa2[, 1:t2]
}
if (both_leading){
msa_leading <- mafft.merge(list(msa1[, 1:(h1 - 1)], msa2[, 1:(h2 - 1)]),
exec = exec)
msa_leading <- msa_leading[!duplicated(rownames(msa_leading)), ]
msa1 <- msa1[, h1:ncol(msa1)]
msa2 <- msa2[, h2:ncol(msa2)]
}
## identify non-empty positions in shared-subset
## -------------------------------------------
shared1 <- msa1[shared, ]
nonempty1 <- apply(shared1, 2, function(z) sum(z == 4))
nonempty1 <- which(nonempty1 < nrow(shared1))
shared2 <- msa2[shared, ]
nonempty2 <- apply(shared2, 2, function(z) sum(z == 4))
nonempty2 <- which(nonempty2 < nrow(shared2))
## Where do gaps have to be inserted?
## ----------------------------------
insert1 <- insert2 <- vector(mode = "list")
total <- length(nonempty1)
for (i in 1:total){
if (nonempty1[i] == nonempty2[i]) next
if (nonempty1[i] > nonempty2[i]){
d <- nonempty1[i] - nonempty2[i]
insert2 <- c(insert2, list(c(nonempty2[i], d)))
nonempty2[i:total] <- nonempty2[i:total] + d
}
if (nonempty1[i] < nonempty2[i]){
d <- nonempty2[i] - nonempty1[i]
insert1 <- c(insert1, list(c(nonempty1[i], d)))
nonempty1[i:total] <- nonempty1[i:total] + d
}
}
## Prepare score matrices
## ----------------------
if (include_scores){
scores1 <- x[[id[1]]]@scores
scores2 <- x[[id[2]]]@scores
}
## Insert gaps into msa1 (and scores1)
## -----------------------------------
if (length(insert1)){
for (i in 1:length(insert1)){
where <- insert1[[i]][1]
what <- insert1[[i]][2]
msa1 <- cbind(msa1[, 0:(where - 1)],
as.DNAbin(matrix("-", nrow = nrow(msa1), ncol = what, dimnames = list(rownames(msa1), NULL))),
msa1[, where:ncol(msa1)])
if (include_scores){
scores1 <- cbind(scores1[, 0:(where - 1)],
matrix(NaN, nrow = nrow(scores1), ncol = what, dimnames = list(rownames(scores1), NULL)),
scores1[, where:ncol(scores1)])
}
}
}
## Insert gaps into msa2 (and scores2)
## -----------------------------------
if (length(insert2)){
for (i in 1:length(insert2)){
where <- insert2[[i]][1]
what <- insert2[[i]][2]
msa2 <- cbind(msa2[, 0:(where - 1)],
as.DNAbin(matrix("-", nrow = nrow(msa2), ncol = what, dimnames = list(rownames(msa2), NULL))),
msa2[, where:ncol(msa2)])
if (include_scores){
scores2 <- cbind(scores2[, 0:(where - 1)],
matrix(NaN, nrow = nrow(scores2), ncol = what, dimnames = list(rownames(scores2), NULL)),
scores2[, where:ncol(scores2)])
}
}
}
## add trailing gaps
## -----------------
if (ncol(msa1) < ncol(msa2)){
d <- ncol(msa2) - ncol(msa1)
msa1 <- cbind(msa1,
as.DNAbin(matrix("-", nrow = nrow(msa1), ncol = d, dimnames = list(rownames(msa1), NULL))))
if (include_scores){
scores1 <- cbind(scores1,
matrix(NaN, nrow = nrow(scores1), ncol = d, dimnames = list(rownames(scores1), NULL)))
}
}
if (ncol(msa1) > ncol(msa2)){
d <- ncol(msa1) - ncol(msa2)
msa2 <- cbind(msa2,
as.DNAbin(matrix("-", nrow = nrow(msa2), ncol = d, dimnames = list(rownames(msa2), NULL))))
if (include_scores){
scores2 <- cbind(scores2,
matrix(NaN, nrow = nrow(scores2), ncol = d, dimnames = list(rownames(scores2), NULL)))
}
}
## Remove one set of shared species and join bot sets
## --------------------------------------------------
msa1 <- msa1[!rownames(msa1) %in% shared, ]
if (include_scores){
scores1 <- scores1[!rownames(scores1) %in% shared, ]
}
msa <- rbind(msa1, msa2)
## Reappend profile alignments of leading and trailing positions
## -------------------------------------------------------------
if (both_leading){
msa_leading <- msa_leading[match(rownames(msa), rownames(msa_leading)), ]
msa <- cbind(msa_leading, msa)
}
if (both_trailing){
msa_trailing <- msa_trailing[match(rownames(msa), rownames(msa_trailing)), ]
msa <- cbind(msa, msa_trailing)
}
## Create output object
## --------------------
if (include_scores){
polentaDNA(msa = msa,
scores = rbind(scores1, scores2),
method = x[[id[1]]]@method)
} else {
msa
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.