## Nothing
## Copyright (C) 2011 Daniel Lai and Irmtraud M. Meyer (www.e-rna.org)
## Contact information: irmtraud.meyer@cantab.net
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
colourByCovariation <- function(helix, msa, cols, get = FALSE) {
  ## Map each helix's covariation score (range -2..2) onto a colour scale.
  ## When get = TRUE, return the helix with colours attached; otherwise
  ## return just the colour vector (with "legend"/"fill" attributes).
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  cov.cols <- colourBy.internal(helixCovariation(helix, msa), 2, -2, cols)
  if (!get) {
    return(cov.cols)
  }
  helix$col <- cov.cols
  attr(helix, "legend") <- attr(cov.cols, "legend")
  attr(helix, "fill") <- attr(cov.cols, "fill")
  return(helix)
}
colourByConservation <- function(helix, msa, cols, get = FALSE) {
  ## Map each helix's conservation score (range 0..1) onto a colour scale.
  ## When get = TRUE, return the helix with colours attached; otherwise
  ## return just the colour vector (with "legend"/"fill" attributes).
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  cons.cols <- colourBy.internal(helixConservation(helix, msa), 1, 0, cols)
  if (!get) {
    return(cons.cols)
  }
  helix$col <- cons.cols
  attr(helix, "legend") <- attr(cons.cols, "legend")
  attr(helix, "fill") <- attr(cons.cols, "fill")
  return(helix)
}
colourByCanonical <- function(helix, msa, cols, get = FALSE) {
  ## Map each helix's fraction of canonical base pairs (range 0..1) onto a
  ## colour scale. When get = TRUE, return the helix with colours attached;
  ## otherwise return just the colour vector (with "legend"/"fill" attributes).
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  cano.cols <- colourBy.internal(helixCanonical(helix, msa), 1, 0, cols)
  if (!get) {
    return(cano.cols)
  }
  helix$col <- cano.cols
  attr(helix, "legend") <- attr(cano.cols, "legend")
  attr(helix, "fill") <- attr(cano.cols, "fill")
  return(helix)
}
colourBy.internal <- function(values, start, end, cols) {
  ## Bin numeric values over [start, end] into length(cols) intervals and
  ## return the corresponding colours.  cut() sorts its breaks internally,
  ## so start may be larger than end (e.g. covariation uses 2 .. -2).
  ## The result carries "legend" (interval labels) and "fill" (colours)
  ## attributes for plotting.
  if (missing(cols)) {
    cols <- defaultPalette()
  }
  pal <- rev(cols)
  breaks <- seq(start, end, length.out = length(pal) + 1)
  bins <- cut(values, breaks = breaks, include.lowest = TRUE)
  mapped <- pal[as.numeric(bins)]
  attr(mapped, "legend") <- levels(bins)
  attr(mapped, "fill") <- pal
  return(mapped)
}
# TODO: Three following functions could probably be refactored?
# Given a helix data frame x, returns a list of covariation values corresponding to each row
helixCovariation <- function(helix, msa) {
  ## Mean covariation score per helix row: score every base pair of the
  ## expanded helix, then average the scores belonging to each original
  ## (possibly multi-base-pair) helix.
  if (!is.helix(helix)) {
    stop("Invalid input")
  }
  ## Empty-helix guard, consistent with helixConservation.
  if (nrow(helix) == 0) {
    return(0)
  }
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  ex <- expandHelix(helix)
  ## Preallocate instead of growing the vector on each iteration.
  cov <- numeric(nrow(ex))
  for (k in seq_len(nrow(ex))) {
    cov[k] <- basepairCovariation(msa, ex$i[k], ex$j[k])
  }
  ## Average per-base-pair scores in runs matching each helix's length.
  out <- numeric(nrow(helix))
  pt <- 1
  for (h in seq_len(nrow(helix))) {
    out[h] <- mean(cov[seq(pt, length.out = helix$length[h])])
    pt <- pt + helix$length[h]
  }
  return(out)
}
# Given a helix data frame x, returns a list of covariation values corresponding to each row
helixConservation <- function(helix, msa) {
  ## Mean conservation score per helix row: score every base pair of the
  ## expanded helix, then average the scores belonging to each original
  ## (possibly multi-base-pair) helix.
  if (!is.helix(helix)) {
    stop("Invalid input")
  }
  if (nrow(helix) == 0) {
    return(0)
  }
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  ex <- expandHelix(helix)
  ## Preallocate instead of growing the vector on each iteration.
  cons <- numeric(nrow(ex))
  for (k in seq_len(nrow(ex))) {
    cons[k] <- basepairConservation(msa, ex$i[k], ex$j[k])
  }
  ## Average per-base-pair scores in runs matching each helix's length.
  out <- numeric(nrow(helix))
  pt <- 1
  for (h in seq_len(nrow(helix))) {
    out[h] <- mean(cons[seq(pt, length.out = helix$length[h])])
    pt <- pt + helix$length[h]
  }
  return(out)
}
helixCanonical <- function(helix, msa) {
  ## Mean canonical-base-pair fraction per helix row: score every base pair
  ## of the expanded helix, then average the scores belonging to each
  ## original (possibly multi-base-pair) helix.
  if (!is.helix(helix)) {
    stop("Invalid input")
  }
  ## Empty-helix guard, consistent with helixConservation.
  if (nrow(helix) == 0) {
    return(0)
  }
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  ex <- expandHelix(helix)
  ## Preallocate instead of growing the vector on each iteration.
  cano <- numeric(nrow(ex))
  for (k in seq_len(nrow(ex))) {
    cano[k] <- basepairCanonical(msa, ex$i[k], ex$j[k])
  }
  ## Average per-base-pair scores in runs matching each helix's length.
  out <- numeric(nrow(helix))
  pt <- 1
  for (h in seq_len(nrow(helix))) {
    out[h] <- mean(cano[seq(pt, length.out = helix$length[h])])
    pt <- pt + helix$length[h]
  }
  return(out)
}
# Given a multiple sequence alignment, and two column indices indicating the 5' partner, and 3' partner respectively,
# calculates the covariation of a base pair at this position.
# values ranges between -2 and 2
basepairCovariation <- function(msa, pos.5p, pos.3p) {
  ## Covariation score (in [-2, 2]) for the base pair formed by alignment
  ## columns pos.5p and pos.3p, accumulated over all pairs of distinct
  ## observed pairings and normalized by the number of sequence pairs.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  n.seq.pairs <- choose(length(msa), 2)
  observed <- paste(substr(msa, pos.5p, pos.5p),
    substr(msa, pos.3p, pos.3p), sep = "")
  pair.tally <- table(observed)
  pair.kinds <- names(pair.tally)
  ## A single distinct pairing means there is no covariation signal.
  if (length(pair.tally) <= 1) {
    return(0)
  }
  score <- 0
  for (a in 2:length(pair.tally)) {
    for (b in 1:(a - 1)) {
      kind.a <- pair.kinds[a]
      kind.b <- pair.kinds[b]
      n.combos <- pair.tally[a] * pair.tally[b]
      dist <- hamming(kind.a, kind.b)
      ## Reward combinations of two valid pairings, penalize all others.
      direction <- if (isValidBp(kind.a) && isValidBp(kind.b)) 1 else -1
      score <- score + direction * dist * n.combos
    }
  }
  return(score / n.seq.pairs)
}
# returns the hamming distance between strings s1 and s2
# hamming distance = number of positions which differ between s1 and s2
# ASSUMES same length
hamming <- function(s1, s2) {
  ## Hamming distance: number of positions at which s1 and s2 differ.
  ## ASSUMES the two strings have the same length.
  chars1 <- strsplit(s1, "")[[1]]
  chars2 <- strsplit(s2, "")[[1]]
  return(length(which(chars1 != chars2)))
}
isValidBp <- function(bp) {
  ## Vectorized, case-insensitive test for canonical (Watson-Crick or
  ## wobble) base pairs, accepting both RNA (U) and DNA (T) notation.
  canonical <- c("GC", "CG", "UA", "AU", "GU", "UG", "TA", "AT", "TG", "GT")
  return(!is.na(match(toupper(bp), canonical)))
}
basepairConservation <- function(msa, pos.5p, pos.3p) {
  ## Conservation of a base pair: the average of the conservation scores of
  ## its two alignment columns.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  cons.5p <- baseConservation(msa, pos.5p)
  cons.3p <- baseConservation(msa, pos.3p)
  return((cons.5p + cons.3p) / 2)
}
baseConservation <- function(msa, pos) {
  ## Conservation of a single alignment column: the percent identity of the
  ## characters found at position pos across all sequences.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  column <- substr(msa, pos, pos)
  return(columnPercentIdentity(column))
}
basepairCanonical <- function(msa, pos.5p, pos.3p) {
  ## Fraction of sequences whose characters at alignment columns pos.5p and
  ## pos.3p form a canonical (Watson-Crick or wobble) base pair.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  base.pairs <- paste(substr(msa, pos.5p, pos.5p),
    substr(msa, pos.3p, pos.3p), sep = "")
  ## isValidBp is vectorized (toupper + match), so count the canonical
  ## pairs in one pass instead of looping with a manual counter.
  return(sum(isValidBp(base.pairs)) / length(base.pairs))
}
# Linear-time implementation of column percent identity.
# Known limitation: the denominator excludes only gap-gap pairs, so pairs of
# one gap and one base still count in the denominator yet can never match.
columnPercentIdentity <- function(bases) {
  ## Fraction of sequence pairs sharing the same character in this column,
  ## computed in linear time from per-character counts.  Gap-gap pairs are
  ## removed from the denominator; gap-base pairs are not (known caveat).
  n.pairs <- choose(length(bases), 2)
  base.counts <- table(bases)
  base.names <- names(base.counts)
  ## Sanity check: table() should account for every input character.
  if (sum(base.counts) != length(bases)) {
    warning("columnPercentIdentity: length of column does not equal sum(base count)")
  }
  num.gap.pairs <- 0
  identities <- 0
  ## seq_along is safe when table() is empty (1:length would yield c(1, 0)).
  for (k in seq_along(base.counts)) {
    same.pairs <- choose(base.counts[k], 2)
    if (base.names[k] == "-") {
      num.gap.pairs <- same.pairs
    } else {
      identities <- identities + same.pairs
    }
  }
  return(identities / (n.pairs - num.gap.pairs))
}
# given a multiple sequence alignment, and a helix data structure,
# this function returns the covariation of the entire alignment.
# the formula used is described on pg. 86 of Nick Wiebe's MSc thesis
alignmentCovariation <- function(msa, helix) {
  ## Alignment-wide covariation: mean basepairCovariation over every base
  ## pair of the (expanded) helix.  Formula described on pg. 86 of Nick
  ## Wiebe's MSc thesis.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  if (any(helix$length != 1)) {
    helix <- expandHelix(helix)
  }
  ## Avoid 0/0 = NaN on an empty helix; consistent with alignmentCanonical.
  if (nrow(helix) == 0) {
    return(0)
  }
  ## Iterate over data.frame rows directly: the original apply() coerced
  ## the helix to a matrix, which would turn i/j into character values if
  ## any column of the data.frame were non-numeric.
  cov.total <- 0
  for (row in seq_len(nrow(helix))) {
    cov.total <- cov.total + basepairCovariation(msa, helix[row, "i"], helix[row, "j"])
  }
  ## unname() mirrors the original sum(), which dropped names.
  return(unname(cov.total) / nrow(helix))
}
# calculates average percent identity of all possible pairs of sequences in the alignment
# percent identity is equal to number of non-gap matches / number of non-gap columns
# definition of percent identity: average over all seq pairs (# identities / (aligned positions + internal gaps))
alignmentConservation <- function(msa) {
  ## Average percent identity over all pairs of sequences in the alignment.
  ## For each pair: trim to the region where both sequences have bases,
  ## drop gap-gap columns, then score identities / remaining columns.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  n.pairs <- choose(length(msa), 2)
  chars <- strsplit(msa, "")
  pid.total <- 0
  for (b in 2:length(msa)) {
    for (a in 1:(b - 1)) {
      x <- chars[[a]]
      y <- chars[[b]]
      both.base <- which(x != "-" & y != "-")
      ## No overlapping bases at all: this pair contributes nothing.
      if (length(both.base) == 0) {
        next
      }
      ## Restrict to the span between the first and last shared base.
      span <- min(both.base):max(both.base)
      x <- x[span]
      y <- y[span]
      ## Keep columns where at least one sequence has a nucleotide.
      keep <- x != "-" | y != "-"
      x <- x[keep]
      y <- y[keep]
      pid.total <- pid.total + sum(x == y) / length(x)
    }
  }
  return(pid.total / n.pairs)
}
# calculates percent of base pairs that are canonical (GC, CG, AU, UA, GU, UG) in the alignment
alignmentCanonical <- function(msa, helix) {
  ## Average fraction of canonical base pairs (GC, CG, AU, UA, GU, UG) over
  ## every base pair of the (expanded) helix.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  if (any(helix$length != 1)) {
    helix <- expandHelix(helix)
  }
  if (nrow(helix) == 0) {
    return(0)
  }
  canonical.total <- 0
  for (row in seq_len(nrow(helix))) {
    canonical.total <- canonical.total +
      basepairCanonical(msa, helix[row, "i"], helix[row, "j"])
  }
  return(canonical.total / nrow(helix))
}
structureMismatchScore <- function(msa, helix, one.gap.penalty = 2,
    two.gap.penalty = 2, invalid.penalty = 1) {
  ## Score each sequence of the alignment against the structure: penalize
  ## one-sided gaps, two-sided gaps, and non-canonical pairings per base
  ## pair of the expanded helix.  Returns one score per sequence.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  if (!is.helix(helix)) {
    stop("Invalid helix data.frame")
  }
  helix <- expandHelix(helix)
  scores <- lapply(msa, structureMismatchScoreInternal, helix,
    one.gap.penalty, two.gap.penalty, invalid.penalty)
  return(unlist(scores))
}
structureMismatchScoreInternal <- function(sequence, helix, one.gap.penalty = 2,
    two.gap.penalty = 2, invalid.penalty = 1) {
  ## Mismatch score for one sequence: for every base pair (i, j) of the
  ## helix, classify the two characters as one-sided gap, two-sided gap, or
  ## non-canonical pairing, and sum the corresponding penalties.
  nt.5p <- substring(sequence, helix$i, helix$i)
  nt.3p <- substring(sequence, helix$j, helix$j)
  gap.5p <- nt.5p == "-"
  gap.3p <- nt.3p == "-"
  single.gap <- xor(gap.5p, gap.3p)
  double.gap <- gap.5p & gap.3p
  invalid <- !isValidBp(paste(nt.5p, nt.3p, sep = ""))
  ## Non-canonical only counts when neither position is a gap.
  mismatch <- invalid & !single.gap & !double.gap
  score <- one.gap.penalty * sum(single.gap) +
    two.gap.penalty * sum(double.gap) +
    invalid.penalty * sum(mismatch)
  return(score)
}
alignmentPercentGaps <- function(msa) {
  ## Fraction of gap characters ("-") in each sequence of the alignment.
  if (is(msa, "XStringSet")) {
    msa <- as.character(msa)
  }
  gap.counts <- vapply(strsplit(msa, ""), function(chars) {
    sum(chars == "-")
  }, integer(1))
  return(gap.counts / nchar(msa))
}
## Any scripts or data that you put into this service are public.
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.