R/dist.R

Defines functions distAlignment distEdit distApe distSimRank distKMer distNSV distCV distFFP

Documented in distAlignment distApe distCV distEdit distFFP distKMer distNSV distSimRank

#######################################################################
# BiostringsTools - Interfaces to several sequence alignment and
# classification tools
# Copyright (C) 2012 Michael Hahsler and Anurag Nagar
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

### Feature Frequency Profile
distFFP <- function(x, k = 3, method = "JSD", normalize = TRUE) {
    ## Jensen-Shannon divergence between rows
    JSdiv <- function(x) {
        n <- nrow(x)
        s <- matrix(NA_real_, nrow = n, ncol = n)

        for (i in 1:(n - 1)) {
            for (j in (i + 1):n) {
                s[j, i] <- s[i, j] <- JSDivergence(x[i, ], x[j, ])
            }
        }

        as.dist(s)
    }

    ### from: Bioconductor: cn.farms
    ## Kullback-Leibler divergences
    KLDivergence <- function(p, q) {
        sum(p * log(p / q))
    }

    KLInformation <- function(p, q) {
        sum((p - q) * log(p / q))
    }

    ## Jensen-Shannon divergence
    JSDivergence <- function(p, q) {
        m <- 0.5 * (p + q)
        0.5 * (KLDivergence(p, m) + KLDivergence(q, m))
    }
    ### end

    x <- DNAStringSet(x)
    x.kmer <- oligonucleotideFrequency(x, k)

    if (normalize) x.kmer <- sweep(x.kmer, 1, rowSums(x.kmer), "/")

    ### JSD: Jensen-Shannon divergence
    if (pmatch(tolower(method), "jsd", nomatch = 0)) {
        x.kmer <- x.kmer + 1
        return(JSdiv(x.kmer))
    }

    if (pmatch(tolower(method), "kullback", nomatch = 0)) x.kmer <- x.kmer + 1

    proxy::dist(x.kmer, method = method)
}

### Composition Vector
###
distCV <- function(x, k = 3) {
    x <- DNAStringSet(x)
    x.kmer <- oligonucleotideFrequency(x, k)
    x.kmer <- sweep(x.kmer, 1, rowSums(x.kmer), "/") ### normalize

    ### k-1 and k-2 mers
    x.kmer1 <- oligonucleotideFrequency(x, k - 1)
    x.kmer1 <- sweep(x.kmer1, 1, rowSums(x.kmer1), "/") ### normalize
    x.kmer2 <- oligonucleotideFrequency(x, k - 2)
    x.kmer2 <- sweep(x.kmer2, 1, rowSums(x.kmer2), "/") ### normalize


    x.kmer <- x.kmer - sapply(colnames(x.kmer), FUN = function(n) {
        p0 <- x.kmer1[, substr(n, 1L, k - 1L), drop = FALSE] *
            x.kmer1[, substr(n, 2L, k), drop = FALSE] /
            x.kmer2[, substr(n, 2L, k - 1L), drop = FALSE]

        p0[is.nan(p0)] <- 0
    })

    proxy::dist(x.kmer, method = "Cosine")
}

### NSV-based distance
distNSV <- function(x, k = 3, method = "Manhattan", normalize = FALSE) {
    distFFP(x, k, method = method, normalize = normalize)
}

distKMer <- function(x, k = 3) {
    x.kmer <- oligonucleotideFrequency(x, k) > 0
    dist(x.kmer, method = "binary")
}

### dist based on simRank
### FIXME: proxy::as.dist does not work...
distSimRank <- function(x, k = 7) proxy::as.dist(simRank(x, k))


### distances based on package ape
distApe <- function(x, model = "K80", ...) {
    if (!inherits(x, "MultipleAlignment")) stop("x needs to be a Multiple Alignment.")

    dist.dna(
        as.DNAbin(lapply(tolower(as.character(x)), s2c)),
        model, ...
    )
}

# alignScore <- function(x, ...) {
#   n <- length(x)
#   s <- matrix(NA_real_, nrow=n, ncol=n)
#
#   for(i in 1:(n-1)) {
#     for(j in (i+1):n) {
#       s[j,i] <- s[i,j] <- pairwiseAlignment(x[[i]], x[[j]], ..., scoreOnly=TRUE)
#     }
#   }
#
#   as.simil(s)
# }
#
# distAlignment <- function(x, ...) structure(
#  1/(alignScore(x, ...)+1),
#  class="dist")


distEdit <- function(x) pwalign::stringDist(x, method = "levenshtein")


distAlignment <- function(x, substitutionMatrix = NULL, ...) {
    if (is.null(substitutionMatrix)) {
        substitutionMatrix <- nucleotideSubstitutionMatrix(
            match = 1,
            mismatch = 0,
            baseOnly = TRUE,
            type = "DNA"
        )
    }

    pwalign::stringDist(x, substitutionMatrix = substitutionMatrix, ...)
}
mhahsler/rMSA documentation built on May 24, 2024, 3:36 p.m.