MakeConsensus <- function(seq, ref=NA, method=c("Muscle", "ClustalW", "ClustalOmega"),
weight=c('r', 'rank', 'count')) {
require(msa);
require(Biostrings);
runMsa <- function(seq, method) {
if (tolower(method)[1]=='clustalw') {
msaClustalW(seq, order='input');
} else if (tolower(method)[1]=='clustalomega') {
msaClustalOmega(seq, order='input');
} else {
msaMuscle(seq, order='input');
};
};
# Run MSA and remove sequence(s) not agree with the others
removed <- c();
pss <- FALSE;
while(!pss & length(seq)>2) {
seq <- seq[!(names(seq) %in% removed)];
# Run consensus algorithm
msa <- runMsa(seq, method);
# Base by base
aln <- do.call('cbind', strsplit(as.character(msa), ''));
rownames(aln) <- 1:nrow(aln);
# Weigth sequences
wgt <- WeightConsensus(aln, method='count');
if (min(wgt[[2]])<(0.5*(length(seq)-1))) {
removed <- c(removed, names(seq)[wgt[[2]]==min(wgt[[2]])]);
} else pss <- TRUE;
};
# Remove sequences hurts the consensus by reducing number of bases
pss <- FALSE;
while(!pss & length(seq)>2) {
# Base by base
aln <- do.call('cbind', strsplit(as.character(msa), ''));
rownames(aln) <- 1:nrow(aln);
wgt <- WeightConsensus(aln, method=weight);
# Call consensus
rg1 <- apply(aln, 2, function(a) range(which(a!='-')));
rg0 <- c(max(rg1[1, ]), min(rg1[2, ]));
cll <- CallConsensus(wgt$weighted[rg0[1]:rg0[2], , drop=FALSE]);
cll <- cll[cll!='-'];
sm0 <- sum(apply(cbind(cll, aln[names(cll), ]), 1, function(x) length(x[x==x[1]]))-1);
sm1 <- sapply(1:ncol(aln), function(i) {
a <- aln[, -i];
w <- WeightConsensus(a, method=weight);
r <- apply(a, 2, function(a) range(which(a!='-')));
r <- c(max(r[1, ]), min(r[2, ]));
c <- CallConsensus(w$weighted[r[1]:r[2], , drop=FALSE]);
c <- c[c!='-'];
n <- apply(cbind(c, a[names(c), ]), 1, function(x) length(x[x==x[1]]))-1;
sum(n);
});
if (max(sm1) < sm0) pss <- TRUE else {
removed <- c(removed, names(seq)[sm1==max(sm1)][1]);
seq <- seq[!(names(seq) %in% removed)];
# Run consensus algorithm
msa <- runMsa(seq, method);
}
};
list(consensus=paste(cll, collapse=''), range=rg0, base=aln, alignment=msa, weight=wgt,
removed=removed, input=list(sequence=seq, reference=ref, method=method, weight=weight));
};
WeightConsensus <- function(mtrx, method=c('r', 'rank', 'count')) {
ltt <- c('-', 'A', 'C', 'G', 'T')
tbl <- sum <- matrix(0, nr=nrow(mtrx), nc=length(ltt), dimnames = list(rownames(mtrx), ltt));
ttl <- rep(0, nrow(mtrx));
rng <- apply(mtrx, 2, function(a) range(which(a!='-')));
rng <- c(max(rng[1, ]), min(rng[2, ]));
sub <- mtrx[rng[1]:rng[2], , drop=FALSE];
for (i in 1:ncol(mtrx)) for (j in 1:length(ltt)) sum[mtrx[, i]==ltt[j], j] <- sum[mtrx[, i]==ltt[j], j] + 1;
cnt <- apply(sub, 2, function(s) {
n <- rep(0, length(s));
for (i in 1:length(ltt)) n[s==ltt[i]] <- sum[rng[1]:rng[2], , drop=FALSE][s==ltt[i], ltt[i]];
n - 1;
});
if (tolower(method)[1] == 'rank') weight <- rank(-colMeans(cnt)) else
if (tolower(method)[1] == 'r') weight <- colMeans(cnt)/(ncol(cnt)-1) else
if (tolower(method)[1] == 'count') weight <- colMeans(cnt) else
weight <- rep(1, ncol(cnt));
for (i in 1:ncol(mtrx)) {
x <- mtrx[, i];
rng <- range(which(x!='-'));
ttl[rng[1]:rng[2]] <- ttl[rng[1]:rng[2]] + 1;
for (j in 1:length(ltt)) {
ind <- which(x==ltt[j]);
ind <- ind[ind>=rng[1] & ind<=rng[2]];
tbl[ind, j] <- tbl[ind, j] + weight[i];
};
};
rownames(tbl) <- 1:nrow(tbl);
out <- cbind(tbl, Count=ttl, Gap=apply(mtrx, 1, function(x) length(x[x=='-'])));
list(weighted=out, weight=weight, method=method);
};
CallConsensus <- function(weighted) {
tbl <- weighted[, c('-', 'A', 'C', 'G', 'T')];
cll <- apply(tbl[, 1:5], 1, function(x) {
mx <- which(x==max(x));
if (length(mx)>1) sample(mx, 1) else mx;
});
cll <- colnames(tbl)[1:5][cll];
names(cll) <- rownames(weighted);
cll;
}
AlignConsensus2Reference2 <- function(cons, ref) {
nms <- names(cons);
if (is.null(nms) | length(nms)!=length(cons)) nms <- paste('seq', 1:length(cons));
if (class(cons) != 'DNAStringSet') cons <- DNAStringSet(cons);
names(cons) <- nms;
aln <- lapply(names(cons), function(nm) {
cat(nm, '\n');
con <- cons[[nm]];
pw1 <- AlignConsensus2Reference(con, ref);
pw2 <- AlignConsensus2Reference(reverseComplement(con), ref);
if (score(pw1$alignment) >= score(pw2$alignment)) {
pw <- pw1;
pw$strand <- 1;
} else {
pw <- pw2;
pw$strand <- -1;
}
pw;
});
names(aln) <- nms;
stat <- t(sapply(aln, function(x) x[[3]]));
stat <- cbind(score=sapply(aln, function(a) score(a$alignment)), strand=sapply(aln, function(a) a$strand), stat);
list(summary=stat, consensus=cons, alignment=lapply(aln, function(a) a$alignment),
index=lapply(aln, function(a) a$index))
}
AlignConsensus2Reference <- function(con, ref) {
require(S4Vectors);
require(Biostrings);
pw <- pairwiseAlignment(con, ref, type='global-local');
ptn <- as.character(pattern(pw));
sbj <- as.character(subject(pw));
prs <- data.frame(P=strsplit(ptn, '')[[1]], S=strsplit(sbj, '')[[1]], stringsAsFactors = FALSE);
ind <- rep(NA, nrow(prs));
ind[prs[,1]!='-'] <- 1:length(prs[prs[,1]!='-', 1]);
rownames(prs) <- 1:nrow(prs);
prs$index <- ind;
ind0 <- prs[prs[,1]==prs[,2] & prs[,1]!='-', 3];
ind1 <- prs[prs[,1]!='-' & prs[,2]!='-' & prs[,1]!=prs[,2], 3];
ind2 <- prs[prs[,2]=='-', 3];
ind3 <- prs[which(prs[,1]=='-')-1, 3];
ind3 <- ind3[!is.na(ind3)];
ind4 <- Rle(as.integer(prs[,1]=='-'));
ind4 <- runLength(ind4)[runValue(ind4)==1];
cnt <- c(match=nmatch(pw), mismatch=nmismatch(pw), insertion=nindel(pw)@insertion[1,],
deletion=nindel(pw)@deletion[1,]);
idl <- nindel(pw);
list(alignment=pw,
index=list(match=ind0, mismatch=ind1, insertion=ind2, deletion=ind3, deletion.width=ind4),
count=c(match=nmatch(pw), mismatch=nmismatch(pw), insertion=idl@insertion[1,], deletion=idl@deletion[1,]));
}
######## TODO
OptimizeConsensus <- function(freq, count, gap, ref) {
CallCon <- function(x) {
mx <- which(x==max(x));
if (length(mx)>1) sample(mx, 1) else mx;
};
lvl1 <- unique(count);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.