R/TDA.R


#' pssk
#' 
#' Compute the persistence scale-space kernel on persistence diagrams. 
#' Reference: Jan Reininghaus, Stefan Huber, Ulrich Bauer, and Roland Kwitt. A stable multi-scale kernel for topological machine learning. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pages 4741–4748, 2015.
#'
#' @param Dg1 a persistence diagram as an n1 x 3 matrix where each row is a topological feature
#' and the columns are dimension, birth and death of the feature.
#' @param Dg2 another persistence diagram as an n2 x 3 matrix
#' @param sigma kernel bandwidth
#' @param dimensions vector of the dimensions of the topological features to consider; if NULL (default), all available dimensions are used
#' @return kernel value
#' @examples 
#' D1 <- matrix(c(0, 0, 0, 1, 1, 0, 0, 0, 1.5, 3.5, 2, 2.5, 3, 4, 6), ncol = 3, byrow = FALSE)
#' D2 <- matrix(c(0, 0, 1, 1, 0, 0, 1.2, 2, 1.4, 3.2, 4.6, 6.5), ncol = 3, byrow = FALSE)
#' K <- pssk(Dg1 = D1, Dg2 = D2, sigma = 1)
#' @export

pssk <- function(Dg1 = NULL, Dg2 = NULL, sigma = NULL, dimensions = NULL) {
  if (sigma <= 0) {
    stop("ERROR: Parameter sigma must be strictly positive.\n")
  }
  if(!is.null(dimensions)) {
    Dg1 <- Dg1[Dg1[,1] %in% dimensions,, drop = FALSE]
    Dg2 <- Dg2[Dg2[,1] %in% dimensions,, drop = FALSE]
  }
  dims <- unique(c(Dg1[,1], Dg2[,1]))
  K <- 0
  for(d in dims) {
    PD1 <- Dg1[Dg1[,1] == d, 2:3, drop = FALSE]
    PD2 <- Dg2[Dg2[,1] == d, 2:3, drop = FALSE]
    n1 <- nrow(PD1)
    n2 <- nrow(PD2)
    if(n1==0 || n2==0) { next }
    k <- 0
    # Note: nested loops 25% faster than using apply()
    for(i in 1:n1) {
      p <- PD1[i,]
      for(j in 1:n2) {
        q <- PD2[j,]
        qbar <- rev(q)
        d1 <- (p[1]-q[1])^2+(p[2]-q[2])^2
        d2 <- (p[1]-qbar[1])^2+(p[2]-qbar[2])^2
        k <- k + exp(-d1/(8*sigma)) - exp(-d2/(8*sigma))
      }
    }
    k <- k/(8*pi*sigma)
    K <- K + k
  }
  return(K)
}
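
## Illustrative sketch (not part of the package API): build a small Gram matrix
## from a list of persistence diagrams with pssk(). This is what
## get_kernel_matrix(method = "pssk") computes, without the parallel machinery;
## the helper name pssk_gram_sketch is purely hypothetical.
pssk_gram_sketch <- function(diagrams, sigma = 1, dimensions = NULL) {
  n <- length(diagrams)
  K <- matrix(0, nrow = n, ncol = n)
  for (i in seq_len(n)) {
    for (j in i:n) {
      # The kernel is symmetric, so only the upper triangle needs computing
      K[i, j] <- K[j, i] <- pssk(diagrams[[i]], diagrams[[j]], sigma, dimensions)
    }
  }
  K
}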

#' sliced_Wd
#' 
#' Compute sliced Wasserstein distance or kernel. 
#' Reference: Mathieu Carriere, Marco Cuturi, and Steve Oudot. Sliced Wasserstein kernel for persistence diagrams. In Proceedings of the 34th International Conference on Machine Learning, volume 70 of Proceedings of Machine Learning Research, pages 664–673, 2017.
#'
#' @param Dg1 a persistence diagram as an n1 x 3 matrix where each row is a topological feature
#' and the columns are dimension, birth and death of the feature.
#' @param Dg2 another persistence diagram as an n2 x 3 matrix
#' @param M number of slices (default: 10)
#' @param sigma kernel bandwidth (default: 1)
#' @param dimensions vector of the dimensions of the topological features to consider; if NULL (default), all available dimensions are used
#' @param return.dist logical (default: FALSE); if TRUE, return the sliced Wasserstein distance instead of the kernel value.
#' @return kernel or distance value
#' @examples 
#' D1 <- matrix(c(0, 0, 0, 1, 1, 0, 0, 0, 1.5, 3.5, 2, 2.5, 3, 4, 6), ncol = 3, byrow = FALSE)
#' D2 <- matrix(c(0, 0, 1, 1, 0, 0, 1.2, 2, 1.4, 3.2, 4.6, 6.5), ncol = 3, byrow = FALSE)
#' K <- sliced_Wd(Dg1 = D1, Dg2 = D2, M = 10, sigma = 1, return.dist = TRUE)
#' @export

sliced_Wd <- function(Dg1, Dg2, M = 10, sigma = 1, dimensions = NULL, return.dist = FALSE) {
 
  if(!is.null(dimensions)) {
    Dg1 <- Dg1[Dg1[,1] %in% dimensions,, drop = FALSE]
    Dg2 <- Dg2[Dg2[,1] %in% dimensions,, drop = FALSE]
  }
  dims <- unique(c(Dg1[,1], Dg2[,1]))
  SW <- 0
  for(d in dims) {
    PD1 <- Dg1[Dg1[,1] == d, 2:3, drop = FALSE]
    PD2 <- Dg2[Dg2[,1] == d, 2:3, drop = FALSE]
    n1 <- nrow(PD1)
    n2 <- nrow(PD2)
    if(n1==0 || n2==0) { next }
    # Project each diagram's points onto the diagonal and append the projections to the other diagram
    L <- c(2,2) # Vector along the diagonal (only its direction matters for the projection)
    p.PD1 <- matrix(0, ncol = 2, nrow = n1)
    for(i in 1:n1) {
      p <- PD1[i,]
      s <- (L %*% p)/(L%*%L)
      p.PD1[i,] <- s[1,1] * L
    }
    p.PD2 <- matrix(0, ncol = 2, nrow = n2)
    for(i in 1:n2) {
      p <- PD2[i,]
      s <- (L %*% p)/(L%*%L)
      p.PD2[i,] <- s[1,1] * L
    }
    PD1 <- rbind(PD1, p.PD2)
    PD2 <- rbind(PD2, p.PD1)
    
    theta <- -pi/2
    s <- pi/M
    K <- 0
    V1 <- numeric(n1+n2)
    V2 <- numeric(n1+n2)
    for(i in 1:M) {
      L <- c(cos(theta), sin(theta))
      V1 <- apply(PD1, 1, function(x) { x %*% L })
      V1 <- sort(V1)
      V2 <- apply(PD2, 1, function(x) { x %*% L })
      V2 <- sort(V2)
      K <- K + s * (sum(abs(V1 - V2)))
      theta <- theta + s
    }
    K <- K/pi
    SW <- SW + K
  }
  if (!return.dist) {
    return(exp(-SW/sigma))
  } else {
    return(SW)
  }
}
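
## Illustrative sketch (hypothetical helper, not in the package): with
## return.dist = TRUE, sliced_Wd() returns the sliced Wasserstein distance SW;
## with return.dist = FALSE it returns the kernel value exp(-SW/sigma).
## This wrapper just makes that relationship explicit.
sliced_Wd_kernel_sketch <- function(Dg1, Dg2, M = 10, sigma = 1, dimensions = NULL) {
  SW <- sliced_Wd(Dg1, Dg2, M = M, sigma = sigma,
                  dimensions = dimensions, return.dist = TRUE)
  exp(-SW / sigma)
}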

#' get_persistence_diagrams
#' 
#' Compute persistence diagrams for a list of point sets.
#' By default, compute persistent homology from the Vietoris-Rips filtration.
#' If use.dtm is TRUE, compute instead the persistent homology of the sublevel
#' set of the distance to measure evaluated over a grid.
#' 
#' @importFrom foreach %dopar%
#' @param point.sets list of point sets, each as a data frame with columns x,y,z
#' @param maxdimension maximum dimension of the homological features to be computed
#' @param maxscale limit of the Vietoris-Rips filtration
#' @param use.dtm logical (default: FALSE), whether to use the distance to measure function
#' @param m0 mass parameter (between 0 and 1) for the dtm function
#' @param grid.by vector of grid spacing along each dimension for the dtm function
#' @param ncpu number of parallel threads to use for computation
#' @return a list of persistence diagrams as n x 3 matrices. Each row is a topological feature
#'  and the columns are dimension, birth and death of the feature.
#' @examples 
#' PS <- list(data.frame(x = c(2.4,-6.9,4.6,-0.7,-3.3,-4.9,-3.5,-3.5,4.2,-7),
#'                       y = c(5.7,1.9,4.8,3.4,-3,-2.1,7.2,1.8,6.1,-1.6),
#'                       z = c(2.7,-0.1,-0.7,-0.6,0.4,-1.5,-0.6,-0.9,2.2,0.7)),
#'            data.frame(x = c(0,0,3.1,-5.6,-5,-7.4,-0.7,-7.7,-6.7,4,4.2,0.2,5.8,3.9,3.9),
#'                       y = c(6.3,-6.1,-3.5,4.6,-4.1,0.3,8.8,-2.3,2.9,3.7,-1.4,-3.9,5.5,-1.2,-6.7),
#'                       z = c(-1.5,1.7,-0.4,-1.4,1.8,1.7,-0.9,-1.8,-0.5,1.7,1.3,0.5,-1.4,1.6,-0.1)))
#' Diags <- get_persistence_diagrams(point.sets = PS, maxdimension = 1, maxscale = 5, ncpu = 1)
#' @export

get_persistence_diagrams <- function(point.sets = NULL, 
                                     maxdimension = NULL, 
                                     maxscale = NULL, 
                                     use.dtm = FALSE,
                                     m0 = NULL,
                                     grid.by = NULL,
                                     ncpu = 1) {
  Diag <- list()
  i <- NULL
  # Set up cluster for parallelization
  cluster <- parallel::makeCluster(ncpu)
  doParallel::registerDoParallel(cluster)
  n <- length(point.sets)
  if(!use.dtm) {
    Diag <- foreach::foreach(i = c(1:n), .packages = "LOMAR") %dopar% {
      ripsDiag(X = point.sets[[i]],
               maxdimension,
               maxscale,
               printProgress = FALSE)[["diagram"]]
    }
  } else {
    point.sets <- lapply(point.sets, function(x) {scale(x, center = TRUE, scale = FALSE)})
    min.dims <- apply(sapply(point.sets, function(x) {apply(x, 2, min)}), 1, min)
    max.dims <- apply(sapply(point.sets, function(x) {apply(x, 2, max)}), 1, max)
    lims <- NULL
    for(l in 1:length(max.dims)) {
      lims <- cbind(lims, c(min.dims[l], max.dims[l]))
    }
    Diag <- foreach::foreach(i = c(1:n), .packages = "LOMAR") %dopar% {
      gridDiag(X = point.sets[[i]], 
               FUN = dtm, 
               m0 = m0, 
               lim = lims,
               by = grid.by, 
               sublevel = TRUE,
               maxdimension = maxdimension,
               printProgress = FALSE )$diagram
    }
  }
  on.exit(parallel::stopCluster(cluster))
  return(Diag)
}
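
## Illustrative sketch (assumed parameter values): when use.dtm = TRUE, both m0
## and grid.by must be supplied; maxscale is only used by the Vietoris-Rips
## branch. The values m0 = 0.1 and grid.by = c(1, 1, 1) below are arbitrary
## choices for 3D point sets, not package defaults.
get_dtm_diagrams_sketch <- function(point.sets, maxdimension = 1, ncpu = 1) {
  get_persistence_diagrams(point.sets = point.sets,
                           maxdimension = maxdimension,
                           use.dtm = TRUE,
                           m0 = 0.1,
                           grid.by = c(1, 1, 1),
                           ncpu = ncpu)
}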

#' get_kernel_matrix
#' 
#' Compute kernel/distance matrix between persistence diagrams.
#' 
#' @importFrom foreach %dopar%
#' @importFrom foreach %:%
#' @param Diag list of persistence diagrams as n x 3 matrices
#' @param method which kernel or distance to compute. One of sWd (for sliced Wasserstein kernel) or pssk (for the persistence scale-space kernel)
#' @param sigma kernel bandwidth
#' @param return.dist logical (default: FALSE); for method sWd, whether to return the sliced Wasserstein distance matrix instead of the kernel matrix.
#' @param M number of slices for the sliced Wasserstein kernel
#' @param dimensions vector of the dimensions of the topological features to consider; if NULL (default), all available dimensions are used
#' @param ncpu number of parallel threads to use for computation
#' @return a matrix
#' @examples 
#' PS <- list(data.frame(x = c(2.4,-6.9,4.6,-0.7,-3.3,-4.9,-3.5,-3.5,4.2,-7),
#'                       y = c(5.7,1.9,4.8,3.4,-3,-2.1,7.2,1.8,6.1,-1.6),
#'                       z = c(2.7,-0.1,-0.7,-0.6,0.4,-1.5,-0.6,-0.9,2.2,0.7)),
#'            data.frame(x = c(0,0,3.1,-5.6,-5,-7.4,-0.7,-7.7,-6.7,4,4.2,0.2,5.8,3.9,3.9),
#'                       y = c(6.3,-6.1,-3.5,4.6,-4.1,0.3,8.8,-2.3,2.9,3.7,-1.4,-3.9,5.5,-1.2,-6.7),
#'                       z = c(-1.5,1.7,-0.4,-1.4,1.8,1.7,-0.9,-1.8,-0.5,1.7,1.3,0.5,-1.4,1.6,-0.1)),
#'            data.frame(x = c(-9.8,-5.2,12.5,2.5,4.5,1.3,-0.2,0.4,9.3,-1.4,0.5,-1.1,-7.7),
#'                       y = c(-4.2,1.5,-0.5,12,-3,-7.2,10.9,6.7,-1.3,10,6.7,-6.2,2.9),
#'                       z = c(3.4,-3.8,-1.4,1.8,3.5,2.5,2.6,-4.8,-3.8,3.9,4.1,-3.6,-4)))
#' Dgs <- get_persistence_diagrams(point.sets = PS, maxdimension = 1, maxscale = 5, ncpu = 1)
#' K <- get_kernel_matrix(Diag = Dgs, method = 'sWd', dimensions = c(0,1), M = 10, sigma = 5)
#' @export

get_kernel_matrix <- function(Diag = NULL, method = c("sWd", "pssk"), dimensions = NULL, return.dist = FALSE, M = NULL, sigma = NULL, ncpu = 1) {
  method <- match.arg(method) # default to the first option if method is not specified
  n <- length(Diag)
  K <- matrix(NA, nrow = n, ncol = n)
  i <- j <- NULL
  cluster <- parallel::makeCluster(ncpu)
  doParallel::registerDoParallel(cluster)
  if(method == "sWd") {
    # Compute pairwise sliced Wasserstein distances
    K <- foreach::foreach(j = c(1:n), .combine='cbind') %:%
      foreach::foreach(i = c(1:n), .combine = 'c') %dopar% {
        Di <- Diag[[i]]
        Dj <- Diag[[j]]
        sliced_Wd(Di, Dj, M, sigma, dimensions, return.dist)
      }
  } else if (method == "pssk") {
    K <- foreach::foreach(j = c(1:n), .combine='cbind') %:%
      foreach::foreach(i = c(1:n), .combine = 'c') %dopar% {
        Di <- Diag[[i]]
        Dj <- Diag[[j]]
        pssk(Di, Dj, sigma, dimensions)
      }
  } else {
    stop(paste0("Unknown kernel method: ", method,".\n"))
  }
  on.exit(parallel::stopCluster(cluster))
  return(K)  
}
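
## Illustrative sketch (hypothetical helper, not part of the package): one common
## downstream use of the Gram matrix returned by get_kernel_matrix() is kernel
## PCA. The kernel matrix is double-centred and the leading eigenvectors, scaled
## by the square root of their eigenvalues, give low-dimensional coordinates.
kernel_pca_sketch <- function(K, n.comp = 2) {
  n <- nrow(K)
  H <- diag(n) - matrix(1/n, n, n)     # centring matrix
  Kc <- H %*% K %*% H                  # double-centred kernel
  eig <- eigen(Kc, symmetric = TRUE)
  ev <- pmax(eig$values[seq_len(n.comp)], 0)
  sweep(eig$vectors[, seq_len(n.comp), drop = FALSE], 2, sqrt(ev), "*")
}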



### Functions copied from package TDA ###

ripsDiag <- function(X, maxdimension, maxscale, location = FALSE, printProgress = FALSE) {
    
    if (!is.numeric(X) && !is.data.frame(X)) {
      stop("X should be a matrix of coordinates")
    }
    if (!is.numeric(maxdimension) ||
        length(maxdimension) != 1 || maxdimension < 0) {
      stop("maxdimension should be a nonnegative integer")
    }
    if (!is.numeric(maxscale) || length(maxscale) != 1) {
      stop("maxscale should be a number")
    }
    
    X <- as.matrix(X)
    
    max_num_pairs <- 5000  # to be added as an option
    
    ripsOut <- RipsDiag(X = X, maxdimension = maxdimension,
                        maxscale = maxscale, dist = 'euclidean', libraryFiltration = 'Dionysus',
                        libraryDiag = 'Dionysus', location = location,
                        printProgress = printProgress)
    
    if (location == TRUE) {
      BirthLocation <- X[ripsOut[[2]][, 1], ]
      DeathLocation <- X[ripsOut[[2]][, 2], ]
    }
    
    Diag <- ripsOut[[1]]
    if (NROW(Diag) > 0) {
      ## change Inf values to maxscale
      Diag[which(Diag[, 3] == Inf), 3] <- maxscale  
    }
    
    colnames(Diag) <- c("dimension", "Birth", "Death")
    class(Diag) <- "diagram"
    attributes(Diag)[["maxdimension"]] <- max(Diag[, 1])
    attributes(Diag)[["scale"]] <- c(0, maxscale)
    attributes(Diag)[["call"]] <- match.call()
    
    out <- list("diagram" = Diag)
  
    return (out)
  }
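
## Illustrative sketch (toy data, not part of the package): the "diagram" element
## returned by ripsDiag() is an n x 3 matrix with columns dimension, Birth and
## Death; points sampled from a circle should yield one prominent 1-dimensional
## feature.
rips_diag_sketch <- function(n.points = 30) {
  theta <- seq(0, 2 * pi, length.out = n.points)
  X <- cbind(cos(theta), sin(theta))
  ripsDiag(X, maxdimension = 1, maxscale = 2)$diagram
}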


gridDiag <- function(X = NULL, FUN = NULL, lim = NULL, by = NULL, FUNvalues = NULL,
           maxdimension = max(NCOL(X), length(dim(FUNvalues))) - 1,
           sublevel = TRUE, location = FALSE,
           printProgress = FALSE, diagLimit = NULL, ...) {
    
    if (!xor(is.null(X) || is.null(FUN), is.null(FUNvalues))) {
      stop("either values of X, FUN should be set, or a value of FUNvalues should be set, but not both")
    }
    if (!is.null(X) && !is.null(FUN)) {
      if (!is.numeric(X) && !is.data.frame(X)) {
        stop("X should be a matrix of coordinates")
      }
      if (!is.function(FUN)) {
        stop("FUN should be a function")
      }
      if (is.null(lim) || is.null(by)) {
        stop("When X and FUN are set, lim and by should also be set")
      }
    }
    if (!is.null(lim) && !is.null(by)) {
      if (!is.numeric(lim) || length(lim) %% 2 != 0) {
        stop("lim should be either a numeric matrix or a numeric vector of even length")
      }
      if (!is.numeric(by) || any(by <= 0)) {
        stop("by should be positive")
      }
    }
    if (!is.null(X) && !is.null(FUN) && !is.null(lim) && !is.null(by)) {
      if (2 * NCOL(X) != length(lim)) {
        stop("dimension of X does not match with lim")
      }
      if (length(by) != 1 && length(by) != NCOL(X)) {
        stop("by should be either a number or a vector of length equals dimension of grid")
      }
    }
    if (!is.null(FUNvalues)) {
      if (!is.numeric(FUNvalues) && !is.data.frame(FUNvalues)) {
        stop("FUNvalues should be an array")
      }
    }
    if (!is.numeric(maxdimension) || length(maxdimension) != 1 ||
        maxdimension < 0) {
      stop("maxdimnsion should be a nonnegative integer")
    }
    if (!is.logical(sublevel)) {
      stop("sublevel should be logical")
    }
    if (!is.logical(location)) {
      stop("location should be logical")
    }
    if(location == TRUE && (is.null(lim) || is.null(by))) {
      stop("If location is TRUE, both lim and by should be set")
    }
    if (!is.logical(printProgress)) {
      stop("printProgress should be logical")
    }
    if ((!is.numeric(diagLimit) || length(diagLimit) != 1) &&
        !is.null(diagLimit)) {
      stop("diagLimit should be a positive number")
    }
    
    if (!is.null(X)) {
      X <- as.matrix(X)
    }
    if (!is.null(lim) && !is.null(by)) {
      Grid <- gridBy(lim = lim, by = by)
    }
    if (is.null(FUNvalues)) {
      FUNvalues <- FUN(X, Grid[["grid"]], ...)
      gridDim <- Grid[["dim"]]
    } else {
      if (is.data.frame(FUNvalues)) {
        FUNvalues <- as.matrix(FUNvalues)
      } else {
        FUNvalues <- as.array(FUNvalues)
      }
      gridDim <- dim(FUNvalues)
    }
    
    if (!is.null(lim) && !is.null(by) &&
        length(Grid[["dim"]]) != length(gridDim)) {
      stop("dimension of FUNvalues does not match with lim and by")
    }
    
    maxdimension <- min(maxdimension, length(gridDim) - 1)
    if (sublevel == FALSE) {
      FUNvalues <- -FUNvalues
    }
    
    # compute persistence diagram of function values over a grid
    if (length(gridDim) <= 3) {
      gridOut <- GridDiag(FUNvalues = FUNvalues, gridDim = as.integer(gridDim),
        maxdimension = as.integer(maxdimension), decomposition = "5tetrahedra",
        library = 'Dionysus', location = location, printProgress = printProgress)
    } else {
      gridOut <- GridDiag(FUNvalues = FUNvalues, gridDim = as.integer(gridDim),
        maxdimension = as.integer(maxdimension), decomposition = "barycenter",
        library = 'Dionysus', location = location, printProgress = printProgress)
    }
    
    if (location == TRUE) {
      BirthLocation <- Grid[["grid"]][gridOut[[2]][, 1], ]
      DeathLocation <- Grid[["grid"]][gridOut[[2]][, 2], ]
    }
    
    Diag <- gridOut[[1]]
    if (NROW(Diag) > 0) {
      Diag[1, 3] <- ifelse(is.null(diagLimit), max(FUNvalues), diagLimit) 
    }
    if (sublevel == FALSE) {
      colnames(Diag) <- c("dimension", "Death", "Birth")
      Diag[, 2:3] <- -Diag[, 3:2]
    } else {
      colnames(Diag) <- c("dimension", "Birth", "Death")
    }
    
    class(Diag) <- "diagram"
    attributes(Diag)[["maxdimension"]] <- max(Diag[, 1])
    nonInf <- which(Diag[, 2] != Inf & Diag[, 3] != Inf)
    attributes(Diag)[["scale"]] <-
      c(min(Diag[nonInf, 2:3]), max(Diag[nonInf, 2:3]))
    attributes(Diag)[["call"]] <- match.call()
    
    out <- list("diagram" = Diag)
    
    return (out)
  }

gridBy <- function(lim = c(0, 1), by = (lim[2] - lim[1]) / 10) {
  gridlist <- list()
  for (idx in 1:(length(lim)/2)) {
    if (length(by) == 1) {
      gridlist[[idx]] <- seq(lim[2 * idx - 1], lim[2 * idx], by = by)
    } else {
      gridlist[[idx]] <- seq(lim[2 * idx - 1], lim[2 * idx], by = by[idx])
    }
  }
  grid <- as.matrix(expand.grid(gridlist))
  colnames(grid) <- NULL
  dim <- sapply(gridlist, length)
  out <- list("dim" = dim, "lim" = lim, "grid" = grid)
  return(out)
}
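
## Illustrative sketch: gridBy() turns flattened per-dimension limits into the
## evaluation grid used by gridDiag(); "dim" holds the number of grid points per
## axis and "grid" holds the grid coordinates, one point per row.
grid_by_sketch <- function() {
  g <- gridBy(lim = c(0, 1, 0, 2), by = 0.5)
  # g$dim is c(3, 5): 3 points along the first axis, 5 along the second
  # g$grid is a 15 x 2 matrix of grid coordinates
  g
}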

dtm <- function(X, Grid, m0, r = 2, weight = 1) {

  if (!is.numeric(X) && !is.data.frame(X)) {
    stop("X should be a matrix of coordinates")
  }
  if (!is.numeric(Grid) && !is.data.frame(Grid)) {
    stop("Grid should be a matrix of coordinates")
  }
  if (NCOL(X) != NCOL(Grid)) {
    stop("dimensions of X and Grid do not match")
  }
  if (!is.numeric(m0) || length(m0) != 1 || m0 < 0 || m0 > 1) {
    stop("m0 should be a number between 0 and 1")
  }
  if (!is.numeric(r) || length(r) != 1 || r < 1) {
    stop("r should be a number greater than or equal to 1")
  }
  if (!is.numeric(weight) || 
      (length(weight) != 1 && length(weight) != NROW(X))) {
    stop("weight should be either a number or a vector of length equals the number of sample")
  }

  # without weight
  if (length(weight) == 1) {
    X <- as.matrix(X)
    weightBound <- m0 * NROW(X)
    knnDistance <- FNN::knnx.dist(
      data = X, query = as.matrix(Grid), k = ceiling(weightBound),
      algorithm = c("kd_tree"))
    return (Dtm(knnDistance = knnDistance, weightBound = weightBound, r = r))

  # with weight
  } else {
    X0 <- as.matrix(X[weight != 0, , drop = FALSE]) 
    weight0 <- weight[weight != 0]
    weight0sort <- sort(weight0)
    weightBound <- m0 * sum(weight0)
    weightSumTemp <- 0
    for (k0 in seq(along = weight0)) {
      weightSumTemp <- weightSumTemp + weight0sort[k0]
      if (weightSumTemp >= weightBound) {
        break
      }
    }
    knnDistanceIndex <- FNN::get.knnx(
      data = X0, query = as.matrix(Grid), k = k0, algorithm = c("kd_tree"))
    return (DtmWeight(
      knnDistance = knnDistanceIndex[["nn.dist"]], weightBound = weightBound,
      r = r, knnIndex = knnDistanceIndex[["nn.index"]], weight = weight0))
  }
}
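
## Illustrative sketch (random toy data): evaluate the distance to measure of a
## small 2D point cloud over a grid, which is what gridDiag() does internally
## when get_persistence_diagrams() is called with use.dtm = TRUE.
dtm_sketch <- function() {
  set.seed(1)
  X <- cbind(runif(50), runif(50))             # sample points in the unit square
  G <- gridBy(lim = c(0, 1, 0, 1), by = 0.1)   # evaluation grid
  dtm(X = X, Grid = G[["grid"]], m0 = 0.1)     # one DTM value per grid point
}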
