R/dists.R
In cultevo: Tools, Measures and Statistical Tests for Cultural Evolution

#' Make a meaning distance function vectorisable.
#'
#' This function takes as its only argument a function \code{f(m1, m2)} which
#' returns a single numeric indicating the distance between two 'meanings'
#' \code{m1, m2} (which are themselves most likely vectors or lists). Based
#' on \code{f}, this function returns a function \code{g(mm)} which takes as
#' its only argument a matrix or data frame \code{mm} with the meaning
#' elements (equivalent to the ones in \code{m1, m2}) along columns and
#' different meaning combinations (like \code{m1, m2, ...}) along rows. This
#' function returns a distance matrix of class \code{\link[stats]{dist}}
#' containing all pairwise distances between the rows of \code{mm}. The
#' resulting function \code{g} can be passed to other functions in this
#' package, in particular \code{\link{mantel.test}}.
#'
#' The meaning distance function should be commutative, i.e.
#' \code{f(a,b) = f(b,a)}, and meanings should have a distance of zero to
#' themselves, i.e. \code{f(a,a) = 0}.
#' @param pairwisemeaningdistfun a function of two arguments returning a
#'   single numeric indicating the semantic distance between its arguments
#' @return A function that takes a meaning matrix and returns a corresponding
#'   distance matrix of class \code{\link[stats]{dist}}.
#' @examples
#' trivialdistance <- function(a, b) return(a - b)
#'
#' trivialmeanings <- as.matrix(3:1)
#' trivialdistance(trivialmeanings[1], trivialmeanings[2])
#' trivialdistance(trivialmeanings[1], trivialmeanings[3])
#' trivialdistance(trivialmeanings[2], trivialmeanings[3])
#'
#' distmatrixfunction <- wrap.meaningdistfunction(trivialdistance)
#' distmatrixfunction(trivialmeanings)
#' @export
wrap.meaningdistfunction <- function(pairwisemeaningdistfun) {
  f <- function(meanings) {
    if (is.vector(meanings))
      meanings <- as.matrix(meanings)
    nmeanings <- dim(meanings)[1]
    stats::as.dist(cbind(sapply(1:(nmeanings-1), function(i) {
      c(rep(0, i),
        sapply((i+1):nmeanings, function(j)
          pairwisemeaningdistfun(meanings[i,], meanings[j,])))
    }), 0))
#    matrix <- matrix(nrow=nmeanings, ncol=nmeanings)
#    combs <- utils::combn(1:nmeanings, 2)
    # fill only the lower half to pass to as.dist
#    matrix[cbind(combs[2,], combs[1,])] <- apply(combs, 2,
#      function(i) pairwisemeaningdistfun(meanings[i[1],], meanings[i[2],]))
  }
  # cache distance matrix computations
  if (requireNamespace("memoise", quietly = TRUE))
    f <- memoise::memoise(f)
  invisible(f)
}

#' Pairwise Hamming distances between matrix rows.
#'
#' Returns a distance matrix giving all pairwise Hamming distances between the
#' rows of its argument \code{meanings}, which can be a matrix, data frame or
#' vector. Vectors are treated as matrices with a single column, so the
#' distances in its return value can only be 0 or 1.
#'
#' This function behaves differently from calling
#' \code{\link[stats]{dist}(meanings, method="manhattan")} in how \code{NA}
#' values are treated: specifying a meaning component as \code{NA} allows you
#' to \emph{ignore} that dimension for the given row/meaning combinations,
#' (instead of counting a difference between \code{NA} and another value as a
#' distance of 1).
#'
#' @param meanings a matrix with the different dimensions encoded along
#'   columns, and all combinations of meanings specified along rows. The data
#'   type of the cells does not matter since distance is simply based on
#'   equality (with the exception of \code{NA} values, see below.
#' @return A distance matrix of type \code{\link{dist}} with \code{n*(n-1)/2}
#'   rows/columns, where n is the number of rows in \code{meanings}.
#'
#' @examples
#' # a 2x2 design using strings
#' print(strings <- matrix(c("a1", "b1", "a1", "b2", "a2", "b1", "a2", "b2"),
#'   ncol=2, byrow=TRUE))
#' hammingdists(strings)
#'
#' # a 2x3 design using integers
#' print(integers <- matrix(c(0, 0, 0, 1, 0, 2, 1, 0, 1, 1, 1, 2), ncol=2, byrow=TRUE))
#' hammingdists(integers)
#'
#' # a 3x2 design using factors (ncol is always the number of dimensions)
#' print(factors <- data.frame(colour=c("red", "red", "green", "blue"),
#'                             animal=c("dog", "cat", "dog", "cat")))
#' hammingdists(factors)
#'
#' # if some meaning dimension is not relevant for some combinations of
#' # meanings (e.g. optional arguments), specifying them as NA in the matrix
#' # will make them not be counted towards the hamming distance! in this
#' # example the value of the second dimension does not matter (and does not
#' # count towards the distance) when the the first dimension has value '1'
#' print(ignoredimension <- matrix(c(0, 0, 0, 1, 1, NA), ncol=2, byrow=TRUE))
#' hammingdists(ignoredimension)
#'
#' # trivial case of a vector: first and last two elements are identical,
#' # otherwise a difference of one
#' hammingdists(c(0, 0, 1, 1))
#' @seealso \code{\link[stats]{dist}}
#' @export
hammingdists <-
  wrap.meaningdistfunction(function(m1, m2) sum(m1 != m2, na.rm=TRUE))

#' Compute the normalised Levenshtein distances between strings.
#' 
#' @param strings a vector or list of strings
#' @return A distance matrix specifying all pairwise normalised Levenshtein
#'   distances between the strings.
#' @examples normalisedlevenshteindists(c("abd", "absolute", "asdasd", "casd"))
#' @seealso \code{\link[stats]{dist}}
#' @export
normalisedlevenshteindists <- function(strings) {
  levs <- utils::adist(strings)
  lens <- sapply(strings, nchar)
  stats::as.dist(levs / outer(lens, lens, pmax))
}

#' Split strings into their constituent segments.
#'
#' Split strings into their constituent segments (and count them).
#'
#' @describeIn segment.string
#' Returns a list (of the same length as \code{x}), each item a vector of
#'   character vectors.
#' @param x one or more strings to be split (and, optionally, counted)
#' @param split the boundary character or sequence at which to segment the
#'   string(s). The default, \code{NULL}, splits the string after every character.
#' @examples
#' segment.string(c("asd", "fghj"))
#'
#' segment.string(c("la-dee-da", "lala-la"), "-")
#' @export
segment.string <- function(x, split=NULL)
  strsplit(x, split, fixed=TRUE)

#' @describeIn segment.string
#' Calculate the frequency of individual characters in one or more strings.
#' Returns a matrix with one row for every string in \code{x}.
#' @examples
#' segment.counts(c("asd", "aasd", "asdf"))
#' @export
segment.counts <- function(x, split=NULL) {
  chars <- segment.string(x, split)
  set <- unique(unlist(chars))
  structure(t(sapply(chars, function(s) {
    tabulate(match(s, set), nbins=length(set))
  })), dimnames=list(x, set))
}

#' Calculate the bag-of-characters similarity between strings.
#'
#' @param strings a vector or list of strings
#' @param split boundary sequency at which to segment the strings (default
#'   splits the string into all its constituent characters)
#' @param segmentcounts if custom segmentation is required, the pre-segmented
#'   strings can be passed as this argument (which is a list of lists)
#' @return a distance matrix
#' @examples orderinsensitivedists(c("xxxx", "asdf", "asd", "dsa"))
#' @seealso \code{\link[stats]{dist}}
#' @export
orderinsensitivedists <- function(strings=NULL, split=NULL, segmentcounts=segment.counts(strings, split))
  stats::as.dist(cbind(sapply(1:(length(strings)-1), function(i) {
    c(rep(0, i),
      colSums(abs(t(segmentcounts[(i+1):length(strings),,drop=FALSE]) - segmentcounts[i,])))
  }), 0))

#' Read a distance matrix from a file or data frame.
#'
#' @param data a filename, data frame or matrix
#' @param el1.column the column name or id specifying the first element
#' @param el2.column the column name or id specifying the second element
#' @param dist.columns the column name(s) or id(s) specifying the distance(s)
#'   between the two corresponding elements
#' @return a distance matrix (or list of distance matrixes when there is more
#'   than one \code{dist.columns}) of type \code{matrix}
#' @examples
#' read.dist(cbind(c(1,1,1,2,2,3), c(2,3,4,3,4,4), 1:6, 6:1), dist.columns=c(3,4))
#' @export
#' @importFrom utils read.csv
read.dist <- function(data, el1.column=1, el2.column=2, dist.columns=3) {
  if (is.character(data))
    data <- read.csv(data, header=TRUE)

  els <- sort(unique(c(data[,el1.column], data[,el2.column])))
  # create as many matrices as there are dist.columns
  d <- replicate(length(dist.columns), matrix(0, nrow=length(els), ncol=length(els)))
  el1s <- match(data[,el1.column], els)
  el2s <- match(data[,el2.column], els)
  indices <- cbind(repmatrix(cbind(el1s, el2s), each.row=length(dist.columns)), 1:length(dist.columns))
  d[indices] <- as.vector(t(data[,dist.columns]))
  if (length(dist.columns) == 1) {
    return(d[,,1])
  } else {
    # turn 3d array into named list
    d <- lapply(1:length(dist.columns), function(i)d[,,i])
    names(d) <- dist.columns
    return(d)
  }
}

#' Check or fix a distance matrix.
#' 
#' Checks or fixes the given distance matrix specification and returns an
#' equivalent, symmetric \code{matrix} object with 0s in the diagonal.
#' 
#' If the argument is a matrix, check whether it is a valid specification of a
#' distance matrix and return it, making it symmetric if it isn't already.
#' 
#' If the argument is a list, calls \code{check.dist} on every of its elements
#' and returns a list of the results.
#' 
#' For all other object types, attempts to coerce the argument to a \code{dist}
#' object and return the corresponding distance matrix (see above).
#' 
#' @param x an object (or list of objects) specifying a distance matrix
#' @return a symmetric \code{matrix} object (or list of such objects) of the
#'   same dimension as \code{x}
#' @seealso \code{\link[stats]{dist}}
#' @export
check.dist <- function(x)
  UseMethod("check.dist")

#' @export
check.dist.dist <- function(x)
  as.matrix(x)

#' @export
check.dist.default <- function(x)
  as.matrix(tryCatch(stats::as.dist(x), warning=function(w) stop(w)))

#' @export
check.dist.matrix <- function(x) {
  if (any(diag(x) != 0)) {
    stop("Not a valid distance matrix: nonzero diagonal entries")
  } else if (any(x[upper.tri(x)] != 0, na.rm = TRUE) && !isSymmetric(x)) {
    if (any(x[lower.tri(x)] != 0, na.rm = TRUE)) {
      stop("Not a valid distance matrix: distances are not symmetric. The matrix either has to be symmetric or have one of its triangles unspecified (0 or NA).")
    } else {
      x <- t(x)
    }
  }
  check.dist.default(x)
}

#' @export
check.dist.list <- function(x)
  lapply(x, check.dist)

#' Permute the rows and columns of a square matrix.
#'
#' Returns the given matrix with rows and columns permuted in the same order.
#'
#' @param m a matrix with an equal number of rows and columns
#' @param perm vector of indices specifying the new order of rows/columns
#' @return a matrix of the same size as \code{m}
#' @export
shuffle.locations <- function(m, perm = sample.int(dim(m)[1]))
  m[perm, perm]
# https://stat.ethz.ch/R-manual/R-devel/library/Matrix/html/pMatrix-class.html
Any scripts or data that you put into this service are public.
cultevo documentation built on May 2, 2019, 12:02 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
cultevo
Tools, Measures and Statistical Tests for Cultural Evolution

R/dists.R
In cultevo: Tools, Measures and Statistical Tests for Cultural Evolution

Try the cultevo package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cultevo Tools, Measures and Statistical Tests for Cultural Evolution

R/dists.R In cultevo: Tools, Measures and Statistical Tests for Cultural Evolution

Try the cultevo package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cultevo
Tools, Measures and Statistical Tests for Cultural Evolution

R/dists.R
In cultevo: Tools, Measures and Statistical Tests for Cultural Evolution