## Make the consensus sequence based on MSA of palindrome sequences by Muscle
ConsensusFromMuscleAlignment <- function(read, weight=c('count', 'r', 'rank')) {
# read Path and file prefix of the read to be processed; $read-msa.fasta both exist
suppressPackageStartupMessages(require(Biostrings));
lns <- readLines(paste0(read, '-msa.fasta'));
ind <- grep('^>', lns);
ids <- sub('^>', '', lns[ind]);
fst <- ind+1;
lst <- c(ind[-1]-1, length(lns));
msa <- sapply(1:length(fst), function(i) paste(lns[fst[i]:lst[i]], collapse=''));
seq <- gsub('-', '', msa);
names(seq) <- names(msa) <- ids;
########################################################################################################################
WeightConsensus <- function(mtrx, method=c('r', 'rank', 'count')) {
ltt <- c('-', 'A', 'C', 'G', 'T')
tbl <- sum <- matrix(0, nr=nrow(mtrx), nc=length(ltt), dimnames = list(rownames(mtrx), ltt));
ttl <- rep(0, nrow(mtrx));
rng <- apply(mtrx, 2, function(a) range(which(a!='-')));
rng <- c(max(rng[1, ]), min(rng[2, ]));
sub <- mtrx[rng[1]:rng[2], , drop=FALSE];
for (i in 1:ncol(mtrx)) for (j in 1:length(ltt)) sum[mtrx[, i]==ltt[j], j] <- sum[mtrx[, i]==ltt[j], j] + 1;
cnt <- apply(sub, 2, function(s) {
n <- rep(0, length(s));
for (i in 1:length(ltt)) n[s==ltt[i]] <- sum[rng[1]:rng[2], , drop=FALSE][s==ltt[i], ltt[i]];
n - 1;
});
if (tolower(method)[1] == 'rank') weight <- rank(-colMeans(cnt)) else
if (tolower(method)[1] == 'r') weight <- colMeans(cnt)/(ncol(cnt)-1) else
if (tolower(method)[1] == 'count') weight <- colMeans(cnt) else
weight <- rep(1, ncol(cnt));
for (i in 1:ncol(mtrx)) {
x <- mtrx[, i];
rng <- range(which(x!='-'));
ttl[rng[1]:rng[2]] <- ttl[rng[1]:rng[2]] + 1;
for (j in 1:length(ltt)) {
ind <- which(x==ltt[j]);
ind <- ind[ind>=rng[1] & ind<=rng[2]];
tbl[ind, j] <- tbl[ind, j] + weight[i];
};
};
rownames(tbl) <- 1:nrow(tbl);
out <- cbind(tbl, Count=ttl, Gap=apply(mtrx, 1, function(x) length(x[x=='-'])));
list(weighted=out, weight=weight, method=method);
};
CallConsensus <- function(weighted) {
tbl <- weighted[, c('-', 'A', 'C', 'G', 'T')];
cll <- apply(tbl[, 1:5], 1, function(x) {
mx <- which(x==max(x));
if (length(mx)>1) sample(mx, 1) else mx;
});
cll <- colnames(tbl)[1:5][cll];
names(cll) <- rownames(weighted);
cll;
}
########################################################################################################################
removed <- c(); # Palindromes not to be used for consensus
########################################################################################################################
# Run MSA and remove sequence(s) not agree with the others
pss <- FALSE;
while(!pss & length(seq)>2) {
seq <- seq[!(names(seq) %in% removed)];
# Base by base
aln <- do.call('cbind', strsplit(as.character(msa[names(seq)]), ''));
rownames(aln) <- 1:nrow(aln);
# Weigth sequences
wgt <- WeightConsensus(aln, method=weight);
if (min(wgt[[2]])<(0.5*(length(seq)-1))) {
removed <- c(removed, names(seq)[wgt[[2]]==min(wgt[[2]])]);
} else pss <- TRUE;
};
########################################################################################################################
########################################################################################################################
# Remove sequences hurts the consensus by reducing number of bases
pss <- FALSE;
while(!pss & length(seq)>2) {
# Base by base
aln <- do.call('cbind', strsplit(as.character(msa[names(seq)]), ''));
rownames(aln) <- 1:nrow(aln);
wgt <- WeightConsensus(aln, method=weight[1]);
# Call consensus
rg1 <- apply(aln, 2, function(a) range(which(a!='-')));
rg0 <- c(max(rg1[1, ]), min(rg1[2, ]));
cll <- CallConsensus(wgt$weighted[rg0[1]:rg0[2], , drop=FALSE]);
cll <- cll[cll!='-'];
sm0 <- sum(apply(cbind(cll, aln[names(cll), ]), 1, function(x) length(x[x==x[1]]))-1);
sm1 <- sapply(1:ncol(aln), function(i) {
a <- aln[, -i];
w <- WeightConsensus(a, method=weight[1]);
r <- apply(a, 2, function(a) range(which(a!='-')));
r <- c(max(r[1, ]), min(r[2, ]));
c <- CallConsensus(w$weighted[r[1]:r[2], , drop=FALSE]);
c <- c[c!='-'];
n <- apply(cbind(c, a[names(c), ]), 1, function(x) length(x[x==x[1]]))-1;
sum(n);
});
if (max(sm1) < sm0) pss <- TRUE else {
removed <- c(removed, names(seq)[sm1==max(sm1)][1]);
seq <- seq[!(names(seq) %in% removed)];
# Run consensus algorithm
msa <- msa[names(seq)];
}
};
########################################################################################################################
########################################################################################################################
if (!exists('cll')) {
aln <- do.call('cbind', strsplit(as.character(msa[names(seq)]), ''));
rownames(aln) <- 1:nrow(aln);
wgt <- WeightConsensus(aln, method=weight[1]);
# Call consensus
rg1 <- apply(aln, 2, function(a) range(which(a!='-')));
rg0 <- c(max(rg1[1, ]), min(rg1[2, ]));
cll <- CallConsensus(wgt$weighted[rg0[1]:rg0[2], , drop=FALSE]);
cll <- cll[cll!='-'];
};
ind <- as.integer(names(cll));
########################################################################################################################
out <- list(consensus=paste(cll, collapse=''), range=rg0, index=ind, base=aln, alignment=msa, weight=wgt, removed=removed);
con <- out$consensus;
if (!(is.na(con) | is.null(con) | nchar(con)==0 | length(msa)==0)) {
## Base frequency by position in MSA
aln <- apply(aln, 2, function(a) {
rng <- range(which(a %in% c('A', 'C', 'G', 'T')));
if (rng[1] > 1) a[1:(rng[1]-1)] <- '';
if (rng[2] > length(a)) a[(rng[2]+1):length(a)] <- '';
a;
});
B <- c('-', 'A', 'C', 'G', 'T');
b0 <- matrix(1, nr=nrow(aln), nc=ncol(aln));
frq <- matrix(0, nr=nrow(aln), nc=length(B), dimnames=list(rownames(aln), B));
for (i in 1:length(B)) {
b1 <- b0;
b1[aln!=B[i]] <- 0;
frq[, i] <- rowSums(b1);
};
ttl <- rowSums(frq);
mxs <- apply(frq, 1, max);
################################################################################################################
ct0 <- length(msa); # Total number of palindromed used for consensus
ct1 <- round(mean(ttl), 1); # Average number of palindrome used at position for consensus
pct <- round(100*mean(mxs/ttl), 1); # Average percentage of palindromes agreeing with the consensus at each position
################################################################################################################
hdr <- sub('[0-9]+_[0-9]+$', '', names(msa)[1]);
hdr <- paste0('>', hdr, '0_', nchar(con));
hdr <- paste(hdr, ct0, ct1, pct);
writeLines(c(hdr, con), paste0(read, '-consensus.fasta'));
};
saveRDS(out, paste0(read, '-consensus.rds'))
invisible(out);
};
# AlignConsensus2Reference2 <- function(cons, ref) {
# nms <- names(cons);
# if (is.null(nms) | length(nms)!=length(cons)) nms <- paste('seq', 1:length(cons));
# if (class(cons) != 'DNAStringSet') cons <- DNAStringSet(cons);
# names(cons) <- nms;
#
# aln <- lapply(names(cons), function(nm) {
# cat(nm, '\n');
# con <- cons[[nm]];
# pw1 <- AlignConsensus2Reference(con, ref);
# pw2 <- AlignConsensus2Reference(reverseComplement(con), ref);
#
# if (score(pw1$alignment) >= score(pw2$alignment)) {
# pw <- pw1;
# pw$strand <- 1;
# } else {
# pw <- pw2;
# pw$strand <- -1;
# }
# pw;
# });
# names(aln) <- nms;
#
# stat <- t(sapply(aln, function(x) x[[3]]));
# stat <- cbind(score=sapply(aln, function(a) score(a$alignment)), strand=sapply(aln, function(a) a$strand), stat);
#
# list(summary=stat, consensus=cons, alignment=lapply(aln, function(a) a$alignment),
# index=lapply(aln, function(a) a$index))
# }
#
# AlignConsensus2Reference <- function(con, ref) {
# require(S4Vectors);
# require(Biostrings);
#
# pw <- pairwiseAlignment(con, ref, type='global-local');
#
# ptn <- as.character(pattern(pw));
# sbj <- as.character(subject(pw));
# prs <- data.frame(P=strsplit(ptn, '')[[1]], S=strsplit(sbj, '')[[1]], stringsAsFactors = FALSE);
# ind <- rep(NA, nrow(prs));
# ind[prs[,1]!='-'] <- 1:length(prs[prs[,1]!='-', 1]);
# rownames(prs) <- 1:nrow(prs);
# prs$index <- ind;
#
# ind0 <- prs[prs[,1]==prs[,2] & prs[,1]!='-', 3];
# ind1 <- prs[prs[,1]!='-' & prs[,2]!='-' & prs[,1]!=prs[,2], 3];
# ind2 <- prs[prs[,2]=='-', 3];
# ind3 <- prs[which(prs[,1]=='-')-1, 3];
# ind3 <- ind3[!is.na(ind3)];
# ind4 <- Rle(as.integer(prs[,1]=='-'));
# ind4 <- runLength(ind4)[runValue(ind4)==1];
#
# cnt <- c(match=nmatch(pw), mismatch=nmismatch(pw), insertion=nindel(pw)@insertion[1,],
# deletion=nindel(pw)@deletion[1,]);
# idl <- nindel(pw);
#
# list(alignment=pw,
# index=list(match=ind0, mismatch=ind1, insertion=ind2, deletion=ind3, deletion.width=ind4),
# count=c(match=nmatch(pw), mismatch=nmismatch(pw), insertion=idl@insertion[1,], deletion=idl@deletion[1,]));
# }
#
#
# ######## TODO
# OptimizeConsensus <- function(freq, count, gap, ref) {
# CallCon <- function(x) {
# mx <- which(x==max(x));
# if (length(mx)>1) sample(mx, 1) else mx;
# };
#
# lvl1 <- unique(count);
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.