R/stats.R

Defines functions alignmentPercentGaps structureMismatchScoreInternal structureMismatchScore alignmentCanonical alignmentConservation alignmentCovariation columnPercentIdentity basepairCanonical baseConservation basepairConservation isValidBp hamming basepairCovariation helixCanonical helixConservation helixCovariation colourBy.internal colourByCanonical colourByConservation colourByCovariation

Documented in alignmentCanonical alignmentConservation alignmentCovariation alignmentPercentGaps baseConservation basepairCanonical basepairConservation basepairCovariation colourByCanonical colourByConservation colourByCovariation helixCanonical helixConservation helixCovariation structureMismatchScore

## Copyright (C) 2011 Daniel Lai and Irmtraud M. Meyer (www.e-rna.org)
## Contact information: irmtraud.meyer@cantab.net

## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.

colourByCovariation <- function(helix, msa, cols, get = FALSE) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    output <- colourBy.internal(helixCovariation(helix, msa), 2, -2, cols)
    if (get) {
        helix$col <- output
        attr(helix, "legend") <- attr(output, "legend")
        attr(helix, "fill") <- attr(output, "fill")
        return(helix)
    } else {
        return(output)
    }
}

colourByConservation <- function(helix, msa, cols, get = FALSE) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    output <- colourBy.internal(helixConservation(helix, msa), 1, 0, cols)
    if (get) {
        helix$col <- output
        attr(helix, "legend") <- attr(output, "legend")
        attr(helix, "fill") <- attr(output, "fill")
        return(helix)
    } else {
        return(output)
    }
}

colourByCanonical <- function(helix, msa, cols, get = FALSE) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    output <- colourBy.internal(helixCanonical(helix, msa), 1, 0, cols)
    if (get) {
        helix$col <- output
        attr(helix, "legend") <- attr(output, "legend")
        attr(helix, "fill") <- attr(output, "fill")
        return(helix)
    } else {
        return(output)
    }    
}

colourBy.internal <- function(values, start, end, cols) {
    if (missing(cols)) {
        cols <- defaultPalette()
    }
    cols <- rev(cols)
    seq <- seq(start, end, length.out = length(cols) + 1)
    levels <- cut(values, breaks = seq, include.lowest = TRUE)
    legend <- levels(levels)
    fill <- cols
    output <- cols[as.numeric(levels)]    
    attr(output, "legend") <- legend
    attr(output, "fill") <- fill
    return(output)    
}

# TODO: Three following functions could probably be refactored?
# Given a helix data frame x, returns a list of covariation values corresponding to each row
helixCovariation <- function(helix, msa) {
    if (!is.helix(helix)) {
        stop("Invalid input")
    }
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    ex <- expandHelix(helix)
    num <- rep(1:nrow(helix), times = helix$length)
    cov <- c()
    for (i in 1:nrow(ex)) {
        cov[i] <- basepairCovariation(msa, ex$i[i], ex$j[i])
    }
    pt <- 1
    out = c()
    for (i in 1:nrow(helix)) {
        out[i] <- mean(cov[seq(pt, length.out = helix$length[i])])
        pt <- pt + helix$length[i]
    }
    return(out)
}

# Given a helix data frame x, returns a list of covariation values corresponding to each row
helixConservation <- function(helix, msa) {
    if (!is.helix(helix)) {
        stop("Invalid input")
    }
    if (nrow(helix) == 0) {
        return(0)
    }
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    ex <- expandHelix(helix)
    num <- rep(1:nrow(helix), times = helix$length)
    cons <- c()
    for (i in 1:nrow(ex)) {
        cons[i] <- basepairConservation(msa, ex$i[i], ex$j[i])
    }
    pt <- 1
    out = c()
    for (i in 1:nrow(helix)) {
        out[i] <- mean(cons[seq(pt, length.out = helix$length[i])])
        pt <- pt + helix$length[i]
    }
    return(out)
}

helixCanonical <- function(helix, msa) {
    if (!is.helix(helix)) {
        stop("Invalid input")
    }
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    ex <- expandHelix(helix)
    num <- rep(1:nrow(helix), times = helix$length)
    cano <- c()
    for (i in 1:nrow(ex)) {
        cano[i] <- basepairCanonical(msa, ex$i[i], ex$j[i])
    }
    pt <- 1

    out = c()
    for (i in 1:nrow(helix)) {
        out[i] <- mean(cano[seq(pt, length.out = helix$length[i])])
        pt <- pt + helix$length[i]
    }
    return(out)
}

# Given a multiple sequence alignment, and two column indices indicating the 5' partner, and 3' partner respectively,
# calculates the covariation of a base pair at this position.
# values ranges between -2 and 2
basepairCovariation <- function(msa, pos.5p, pos.3p) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    covariation <- 0
    seq.pair.count <- choose(length(msa), 2)
    base.pairs <- paste(substr(msa, pos.5p, pos.5p),
        substr(msa, pos.3p, pos.3p), sep="")

    pair.counts <- table(base.pairs)
    pair.names <- names(pair.counts)

    if (length(pair.counts) <= 1) {
        return(0)
    }

    for (pairA in 2:length(pair.counts)) {
        for (pairB in 1:(pairA-1)) {
            bpA <- pair.names[pairA]
            bpB <- pair.names[pairB]
            occurences <- pair.counts[pairA] * pair.counts[pairB]
            hdist <- hamming(bpA, bpB)
            coef <- ifelse(isValidBp(bpA) & isValidBp(bpB), 1, -1)
            covariation <- covariation + (coef * hdist * occurences)
        }
    }
    return (covariation / seq.pair.count)
}

# returns the hamming distance between strings s1 and s2
# hamming distance = number of positions which differ between s1 and s2
# ASSUMES same length
hamming <- function(s1, s2) {
    return(sum(strsplit(s1, "")[[1]] != strsplit(s2, "")[[1]]))
}

isValidBp <- function(bp) {
    return(toupper(bp) %in% c("GC", "CG", "UA", "AU", "GU", "UG", "TA", "AT", "TG", "GT"))
}

basepairConservation <- function(msa, pos.5p, pos.3p) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    return((baseConservation(msa, pos.5p) + baseConservation(msa, pos.3p)) / 2)    
}

baseConservation <- function(msa, pos) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    return(columnPercentIdentity(substr(msa, pos, pos)))
}

basepairCanonical <- function(msa, pos.5p, pos.3p) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    base.pairs <- paste(substr(msa, pos.5p, pos.5p), substr(msa, pos.3p, pos.3p), sep="")
    canonical.count <- 0
    for (bp in base.pairs) {
        if (isValidBp(bp)) {
            canonical.count <- canonical.count + 1
        }
    }
    return(canonical.count/length(base.pairs))
}

# linear implementation of column percent identity
# does not work... cannot correctly apply denominator
columnPercentIdentity <- function(bases) {
    identities <- 0
    seq.pair.count <- choose(length(bases), 2)

    base.counts <- table(bases)
    base.names <- names(base.counts)

    if (sum(base.counts) != length(bases)) {
        warning("columnPercentIdentity: length of column does not equal sum(base count)")
    }

    num.gap.pairs <- 0
    for (base.idx in 1:length(base.counts)) {
        count <- base.counts[base.idx]
        base <- base.names[base.idx]
        if (base == "-") {
            num.gap.pairs <- choose(count, 2)
        } else {
            identities <- identities + choose(count, 2)
        }
    }
    return(identities / (seq.pair.count - num.gap.pairs))
}

# given a multiple sequence alignment, and a helix data structure,
# this function returns the covariation of the entire alignment.
# the formula used is described on pg. 86 of Nick Wiebe's MSc thesis
alignmentCovariation <- function(msa, helix) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    if (any(helix$length != 1)) {
        helix <- expandHelix(helix)
    }
    covariation <- sum(apply(helix, 1, function(bp, msa) {
            basepairCovariation(msa, bp['i'], bp['j'])
        }, msa))
    return (covariation/nrow(helix))
}

# calculates average percent identity of all possible pairs of sequences in the alignment
# percent identity is equal to number of non-gap matches / number of non-gap columns
# definition of percent identity: average over all seq pairs (# identities / (aligned positions + internal gaps))
alignmentConservation <- function(msa) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    seq.pair.count <- choose(length(msa), 2)
    pid.sum <- 0
    msa.bases <- strsplit(msa, "")
    for (seqB.index in 2:length(msa)) {
        for (seqA.index in 1:(seqB.index-1)) {
            seqA.bases <- msa.bases[[seqA.index]]
            seqB.bases <- msa.bases[[seqB.index]]
            alignedPositions <- which(seqA.bases != "-" & seqB.bases != "-")
            if (length(alignedPositions) == 0) { next }
            start <- min(alignedPositions)
            end <- max(alignedPositions)
            seqA.bases <- seqA.bases[start:end]
            seqB.bases <- seqB.bases[start:end]
            contains.nucleotide <- which(seqA.bases != "-" | seqB.bases != "-")
            seqA.cleaned <- seqA.bases[contains.nucleotide]
            seqB.cleaned <- seqB.bases[contains.nucleotide]
            identities <- sum(seqA.cleaned == seqB.cleaned)
            pid.sum <- pid.sum + identities/length(seqA.cleaned)
        }
    }
    return (pid.sum/seq.pair.count)
}

# calculates percent of base pairs that are canonical (GC, CG, AU, UA, GU, UG) in the alignment
alignmentCanonical <- function(msa, helix) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    if (any(helix$length != 1)) {
        helix <- expandHelix(helix)
    }
    if (nrow(helix) == 0) {
        return(0)
    }
    percentCanonical <- 0
    for (bp.num in 1:nrow(helix)) {
        bp.i <- helix[bp.num, 'i']
        bp.j <- helix[bp.num, 'j']
        percentCanonical <- percentCanonical +
            basepairCanonical(msa, bp.i, bp.j)
    }
    return(percentCanonical / nrow(helix))
}

structureMismatchScore <- function(msa, helix, one.gap.penalty = 2,
    two.gap.penalty = 2, invalid.penalty = 1) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    if (!is.helix(helix)) {
        stop("Invalid helix data.frame")
    } else {
        helix <- expandHelix(helix)
    }
    return(unlist(lapply(msa, structureMismatchScoreInternal, helix,
        one.gap.penalty, two.gap.penalty, invalid.penalty)))
}

structureMismatchScoreInternal <- function(sequence, helix, one.gap.penalty = 2,
    two.gap.penalty = 2, invalid.penalty = 1) {
    base.5p <- substring(sequence, helix$i, helix$i)
    base.3p <- substring(sequence, helix$j, helix$j)
    gaps.5p <- base.5p == "-"
    gaps.3p <- base.3p == "-"
    one.sided.gap <- xor(gaps.5p, gaps.3p)
    two.sided.gap <- gaps.5p & gaps.3p
    invalid.pair <- !isValidBp(paste(base.5p, base.3p, sep = ""))
    non.canonical.pair <- invalid.pair & !(one.sided.gap | two.sided.gap)
    return(c(one.gap.penalty * length(which(one.sided.gap))
        + two.gap.penalty * length(which(two.sided.gap))
        + invalid.penalty * length(which(non.canonical.pair))))
}

alignmentPercentGaps <- function(msa) {
    if (is(msa, "XStringSet")) {
        msa <- as.character(msa)
    }
    return(unlist(lapply(lapply(strsplit(msa, ""), '==', '-'), sum)) / nchar(msa))
}

Try the R4RNA package in your browser

Any scripts or data that you put into this service are public.

R4RNA documentation built on Nov. 8, 2020, 5:15 p.m.