#' Calculate a distance matrix of an alignment
#'
#' Create a matrix of pariwise distances between languages. Takes as input eighter a set of languages or an alignment matrix
#'
#' @param languages ids of the languages
#' @param alignment alignment matrix consisting of 0, 1, ?. If alignment=NULL, then it will be created with the language IDs.
#' @param type one of "Hamming" or "Jaccard"
#' @param silent report progress or not
#' @param ... further arguments
#'
#' @details Optional arguments are:
#' \itemize{
#' \item removeMissings: (default TRUE): removes all ? in the sequence and only considers entries, where both languages in comparison have data
#' \item removeZeroes: (default: FALSE): if TRUE, only consider entries, where at least one of the two languages have a 1
#'}
#'
#' @return a matrix of pairwise distances
#' @export
#'
distanceMatrix <- function(languages=NULL, alignment=NULL, type="Hamming", silent=FALSE, ...) {
if(is.null(alignment)) {
if(is.null(languages)) stop("Error calculation the distance matrix: Eighter languages or alignment has to be specified!")
alignment <- createAlignmentMatrix(languages, silent = silent, ...)
}
rows <- length(alignment$matrix[,1])
out <- matrix(nrow = rows, ncol = rows, byrow = TRUE)
for (i in 1:rows) {
if(!silent) {
cat(paste("Calculating distance matrix: ", i, "/", rows, "\n", sep=""))
}
for (j in 1:i) {
if(i==j) {
out[i,j] <- 0
} else {
if(type=="Hamming") {
out[i, j] <- pairwiseDistance(alignment$matrix[i,], alignment$matrix[j,], normalized=TRUE, removeZeroes=FALSE, ...)
}
else if(type=="Jaccard") {
out[i, j] <- pairwiseDistance(alignment$matrix[i,], alignment$matrix[j,], normalized=TRUE, removeZeroes=TRUE, ...)
}
else if(type=="SharedCognate") {
out[i,j] <- pairwiseSCDistance(i,j,alignment)
}
out[j, i] <- out[i,j]
}
}
}
return(out)
}
#' Calculates the (normalized) Hamming or Jaccard distance of two binary vectors. This can be used to compare two rows of an alignment matrix, where
#' a ? marks missing entries. If normalized=FALSE and removeZeroes=FALSE, this is the classical Hamming distance. If normalized=TRUE and
#' removeZeroes=TRUE, it is the Jaccard distance.
#'
#' @param x
#' @param y
#' @param normalized
#' @param removeMissing
#' @param removeZeroes
#'
#' @return
#'
#'
pairwiseDistance <- function(x,y, normalized=TRUE, removeMissing=TRUE, removeZeroes=FALSE) {
l <- length(x)
if (l!=length(y)) {
stop("Error calculating the Hamming distance: Vectors differ in length")
}
if(removeMissing) {
missing <- grepl("\\?", x) | grepl("\\?", y)
x<-x[!missing]
y<-y[!missing]
}
if(removeZeroes) {
zeroes <- grepl("0", x) & grepl("0", y)
x<-x[!zeroes]
y<-y[!zeroes]
}
if (normalized) {
return (sum(x!=y)/length(x))
}
else {
return (sum(x!=y))
}
}
#' Distance of two languages
#'
#' Is not that fast...
#'
#' @param id1 ID of first language
#' @param id2 ID of second language
#' @param normalized should the distance be divided by the length of the sequence
#' @param ... further Arguments
#'
#' @details Optional arguments are:
#' \itemize{
#' \item words (default 1:210): which words should be included in the alignment
#' \item normalized: (default TRUE):
#' \item removeMissings: (default TRUE): removes all ? in the sequence and only considers entries, where both languages in comparison have data
#' \item removeZeroes: (default: FALSE): if TRUE, only consider entries, where at least one of the two languages have a 1
#'}
#'
#' @return the distance between language1 and language2
#' @export
#'
getDistance <- function(id1, id2, normalized=TRUE, ...) {
alignment <- createAlignmentMatrix(c(id1, id2), silent=TRUE, ...)
return(distanceMatrix(alignment=alignment, silent=TRUE, ...)[1,2])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.