# R/distributionDistance.R In Bioconductor-mirror/cn.farms: cn.FARMS - factor analysis for copy number estimation

#### Documented in distributionDistance

```## Some distance measures as mentioned in Singh (2006), "Study Of Some
## Distance Measures For Language And Encoding Identification",
## http://clair.si.umich.edu/clair/anthology/query.cgi?type=Paper&id=W06-1109.

#' Computes the distribution distance
#'
#' Be aware that this function is implemented quite slow.
#'
#' @param intensityData A matrix or an AffyBatch object.
#' @param method The method you want to use.
#' @param useSubset Logical. States if only a subset should be used.
#' @param subsetFraction The fraction of the subset.
#' @param useQuantileReference Logical for a quantile reference.
#' @return Computes the distribution distance
#' @author Djork-Arne Clevert \email{[email protected]@clevert.de} and
#' Andreas Mitterecker \email{[email protected]@bioinf.jku.at}
#' @export
#' @examples
#' x <- assayData(normData)\$intensity[, 1:3]
#' y <- distributionDistance(x)
#' attr(y, "Labels") <- substr(sampleNames(normData), 1, 7)
#' plotDendrogram(y)
distributionDistance <- function(
intensityData,
method = c("JSDiv", "KLDiv", "KLInf"),
useSubset = T,
subsetFraction = 0.25,
useQuantileReference = FALSE) {

## Parse arguments, stored in variables
## If no value is specified, the first option in the list is taken as
## the default value
method <- match.arg(method)
if(!is.matrix(intensityData)){
stop("A matrix as input is needed!")
}

nbrOfRows <- nrow(intensityData)
nbrOfCols <- ncol(intensityData)

## Computing the CDFs is computational expensive. Costs can be reduced by
## data subsetting.
## The distance measure will be computed on a subset if useSubset is set
## to TRUE (default)
## The subset size can be controlled by subsetFraction (25% of the data
## is used by default)
if (useSubset){
### use fix seed for reproducibility
set.seed(123)
### get data subset
intensityData <- intensityData[sample.int(nbrOfRows,
floor(nbrOfRows * subsetFraction)),]
}

## Scale data to remove offset and scale issues
intensityData <- scale(intensityData, center = TRUE, scale = TRUE)

## use a fix reference distribution
if(useQuantileReference){
### compute quantile distribution as reference
referenceDistribution <- Biobase::rowMedians(apply(intensityData, 2, sort))
res <- rep(NA, nbrOfCols)
for(i in 1:nbrOfCols){
### compute distance against the reference
res[i] <- calcDistance(
x = intensityData[, i],
y = referenceDistribution,
method = method)
}

attributes(res) <- list(Size = nbrOfCols, Labels = colnames(intensityData),
methods = method, class = "vector")
} else {
res <- rep(NA, nbrOfCols * (nbrOfCols - 1) / 2)
count <- 1
## compute pairwise distances
for(i in 1:(nbrOfCols - 1)){
for(j in (i + 1):nbrOfCols){
res[count] <- calcDistance(
x = intensityData[, i],
y = intensityData[, j],
method = method)
count <- count + 1
}
}

attributes(res) <- list(
Size = nbrOfCols,
Labels = colnames(intensityData),
methods = method,
class = "dist")

}

return(res)

}

calcDistance <- function(x, y, method = c("JSDiv", "KLDiv", "KLInf")){

## eps is added to avoid numerical issues
eps <- .Machine\$double.eps

## compute kernel density estimates for both distributions
P <- density(x, n = 10000, from = min(c(x, y)), to = max(c(x, y)))\$y + eps
Q <- density(y, n = 10000, from = min(c(x, y)), to = max(c(x, y)))\$y + eps

### calc distance measure
distance <- switch(method,
JSDiv = JSDivergence(P, Q),
KLDiv = KLDivergence(P, Q),
KLInf = KLInformation(P, Q)
)

return(distance)

}

## Kullback-Leibler divergences
## http://en.wikipedia.org/wiki/Kullback–Leibler_divergence:
KLDivergence <- function(p, q){
sum(p * log(p / q))
}

KLInformation <- function(p, q){
sum((p - q) * log(p / q))
}

## Jensen-Shannon divergence
## http://en.wikipedia.org/wiki/Jensen–Shannon_divergence
JSDivergence <- function(p, q){
m <- 0.5 * (p + q)
0.5 * (KLDivergence(p, m) +  KLDivergence(q, m))
}
```
Bioconductor-mirror/cn.farms documentation built on June 1, 2017, 6:35 a.m.