R/umap_naive.R

Defines functions umap.naive umap.naive.predict naive.simplicial.set.embedding naive.optimize.embedding naive.fuzzy.simplicial.set smooth.knn.dist

Documented in naive.fuzzy.simplicial.set naive.optimize.embedding naive.simplicial.set.embedding smooth.knn.dist umap.naive umap.naive.predict

## package umap
## a from-scratch implementation of the UMAP algorithm
##
## This implementation is called naive because it is a rather direct translation
## of the original implementation by Leland McInnes. This implementation does,
## however, have some slight modifications.
##
## The original implementation is available at https://github.com/lmcinnes/umap.
##
## The original implementation was published under a BSD license.
##


##' Create a umap embedding
##'
##' This implementation is called naive because it is a rather straightforward
##' translation of the original python code.
##'
##' @keywords internal
##' @param d data object
##' @param config list with settings
##'
##' @return list, one element of which is matrix with embedding coordinates
umap.naive = function(d, config) {
  
  verbose = config$verbose
  
  ## prep parameters
  message.w.date("starting umap", verbose)
  if (is.na(config$a) | is.na(config$b)) {
    config[c("a", "b")] = find.ab.params(config$spread, config$min_dist)
  }
  if (is.na(config$random_state)) {
    config$random_state= as.integer(stats::runif(1, 0, 2^30))
  }
  
  ## perhaps extract knn from input
  knn = NULL
  if ("knn" %in% names(config)) {
    if (class(config$knn) == "umap.knn") {
      message.w.date("using supplied nearest neighbors", verbose)
      knn = config$knn
    }
  }
  if (is.null(knn)) {
    message.w.date("creating graph of nearest neighbors", verbose)
    knn =  knn.info(d, config)
  }
  
  ## create a graph representation
  graph = naive.fuzzy.simplicial.set(knn, config)
  message.w.date("creating initial embedding", verbose)
  embedding = make.initial.embedding(graph$n.elements, config, graph)
  message.w.date("optimizing embedding", verbose)
  embedding = naive.simplicial.set.embedding(graph, embedding, config)
  embedding = center.embedding(embedding)
  message.w.date("done", verbose)
  
  list(layout=embedding, data=d, knn=knn, config=config)
}




##' predict embedding of new data given an existing umap object
##'
##' @keywords internal
##' @param umap object of class umap
##' @param data matrix with new data
##'
##' @return matrix with embedding coordinates
umap.naive.predict = function(umap, data) {

  ## check that data and knn are available
  missing = setdiff(c("knn", "data", "config"), names(umap))
  if (length(missing)>0) {
    umap.error("missing components: ", paste(missing, collapse=", "))
  }

  ## create a configuration for the prediction
  config = umap$config
  config$n_epochs = floor(config$n_epochs/3)
  V = nrow(umap$layout)
  verbose = umap$config$verbose
  
  ## obtain nearest neighbors
  message.w.date("creating graph of nearest neighbors", verbose)
  spectator.knn = spectator.knn.info(data, umap$data, config)
  knn = list(indexes=rbind(umap$knn$indexes, spectator.knn$indexes),
             distances=rbind(umap$knn$distances, spectator.knn$distances))  
  
  ## create graph representation of primary and spectator data together
  graph = naive.fuzzy.simplicial.set(knn, config)
  message.w.date("creating initial embedding", verbose)
  embedding = make.initial.spectator.embedding(umap$layout, spectator.knn$indexes)
  embedding = rbind(umap$layout, embedding)
  message.w.date("optimizing embedding", verbose)
  embedding = naive.simplicial.set.embedding(graph, embedding, config,
                                             fix.observations=V)  
  
  ## extract coordinates for just the spectator data
  embedding = embedding[V+(1:nrow(data)),,drop=FALSE]
  message.w.date("done", verbose)
  
  embedding
}




## ############################################################################
## Details of the implementation specific to umap.naive


##' create an embedding of graph into a low-dimensional space
##'
##' @keywords internal
##' @param g matrix, graph connectivity as coo
##' @param embedding matrix, coordinates for an initial graph embedding
##' @param config list with settings
##' @param fix.observations integer, number of points to avoid moving in optimization
##'
##' @return matrix with embedding,
##' nrows is from g, ncols determined from config
naive.simplicial.set.embedding = function(g, embedding, config, fix.observations=NULL) {

  if (config$n_epochs==0) {
    return(embedding)
  }
  
  total.weight = sum(g$coo[, "value"])
  
  ## simplify graph a little bit
  gmax = max(g$coo[, "value"])
  g$coo[g$coo[, "value"] < gmax/config$n_epochs, "value"] = 0
  g = reduce.coo(g)
  
  ## create an epochs-per-sample. Keep track of it together with the graph coo
  eps = cbind(g$coo,
              eps=make.epochs.per.sample(g$coo[, "value"], config$n_epochs))
  if (!is.null(fix.observations)) {
    eps = eps[eps[, "from"]>fix.observations,]
  }
  
  result = naive.optimize.embedding(embedding, config, eps)
  rownames(result) = g$names
  
  result
}




##' modify an existing embedding 
##' 
##' @keywords internal
##' @param embedding matrix holding an initial embedding
##' @param config list with settings
##' @param eps matrix with connectivity coo graph and epochs per sample;
##' this must be a matrix with columns "from", "to", "value", and "eps"
##'
##' @return matrix of same dimension as initial embedding
naive.optimize.embedding = function(embedding, config, eps) {

  ## number of vertices in embedding
  V = nrow(embedding)
  ## transpose to get observations in columns
  embedding = t(embedding)

  ## define some vectors for book-keeping
  ## integer matrix with pairs of data
  eps.pairs = matrix(as.integer(eps[, c("from", "to")]), ncol=2)-1
  eps.val = eps[, "eps"]
  ## epns is short for "epochs per negative sample"
  epns = eps.val/config$negative_sample_rate
  ## eon2s is short for "epochs of next negative sample"
  eon2s = epns
  ## eons is short for "epochs of next sample"
  eons = eps.val
  ## nns is next negative sample
  nns = rep(0, nrow(eps))

  ## infer if some points should remain fixed
  fix.observations = min(eps.pairs[,1])>1
  ## extract some variables from config
  abg = c(config$a, config$b, config$gamma, as.numeric(fix.observations))
  
  for (n in seq_len(config$n_epochs)) {
    ## set the learning rate for this epoch
    alpha = config$alpha * (1 - ((n-1)/config$n_epochs))
    
    ## occasional message or output
    if (config$verbose) {
      message.w.date(paste0("epoch: ", n), (n %% config$verbose) == 0)
      if (!is.null(config$save)) {
        embedding.in.progress = t(embedding)
        save(embedding.in.progress, file=paste0(config$save, ".", n, ".Rda"))
      }
    }
    
    ## identify links in graph that require attention, then process those in loop
    adjust = eons<=n
    ihits = which(adjust)
    nns[ihits] = floor((n-eon2s[ihits])/epns[ihits])
    embedding = optimize_epoch(embedding, eps.pairs,
                               as.integer(adjust), nns, abg, alpha)
    
    ## prepare for next epoch
    eons[ihits] = eons[ihits] + eps.val[ihits]
    eon2s[ihits] = eon2s[ihits] + (nns[ihits]*epns[ihits])    
  } 
  
  t(embedding)
}




##' create a simplicial set from a distance object
##'
##' @keywords internal
##' @param knn list with inform about nearest neighbors (output of knn.info)
##' @param config list with settings
##'
##' @return matrix 
naive.fuzzy.simplicial.set = function(knn, config) {
  
  ## prepare constants
  V = nrow(knn$indexes)
  mix.ratio = config$set_op_mix_ratio
  bandwidth = config$bandwidth
  nk = config$n_neighbors
  connectivity = config$local_connectivity

  ## construct a smooth map to non-integer neighbors
  n.smooth = smooth.knn.dist(knn$distance, nk,
                             local.connectivity=connectivity,
                             bandwidth=bandwidth)

  ## construct a weighted connectivity matrix (manually)
  conn = base::vector("list", V)
  for (i in 1:V) {
    conn.i = rep(0, nk)
    nearest.i = n.smooth$nearest[i]
    smooth.i = n.smooth$distances[i]
    coo.i = cbind(from=i, to=knn$indexes[i,], value=0)
    for (j in 1:nk) {
      distance.ij = knn$distances[i,j]
      if (knn$indexes[i,j]==i) {
        val = 0
      } else if (distance.ij-nearest.i<=0) {
        val = 1
      } else {
        val = exp(-(distance.ij-nearest.i)/(smooth.i*bandwidth))
      }
      coo.i[j, "value"] = val
    }
    conn[[i]] = coo.i
  }
  conn = do.call(rbind, conn)
  conn = list(coo=conn, names=rownames(knn$indexes), n.elements=V)
  class(conn) = "coo"
  
  connT = t.coo(conn)
  ## here product is indended as a element-by-element product
  connP = multiply.coo(conn, connT)
  
  ## formula with standard matrix objects:
  ##result = mix.ratio*(conn+connT-connP) + (1-mix.ratio)*(connP)
  result = add.coo(add.coo(conn, connT), connP, b=-1)
  result = add.coo(result, connP, a=mix.ratio, b=(1-mix.ratio))
  
  reduce.coo(result)
}




##' compute a "smooth" distance to the kth neighbor and approximate first neighbor
##'
##' @keywords internal
##' @param k.dist matrix with distances to k neighbors
##' @param neighbors numeric, number of neighbors to approximate for
##' @param iterations integers, number of iterations
##' @param local.connectivity iteger
##' @param bandwidth numeric
##' @param tolerance numeric, for numeric precision
##' @param min.dist.scale numeric, minimum distance to nearest neighbor (for display)
##'
##' @return list with two vectors, distances to the kth neighbor,
##' and distance to the first nearest neighbor 
smooth.knn.dist = function(k.dist, neighbors,
                           iterations=64,
                           local.connectivity=1,
                           bandwidth=1,
                           tolerance = 1e-5,
                           min.dist.scale=1e-3) {
  
  target = log2(neighbors)*bandwidth
  result = rho = rep(0, nrow(k.dist))
  
  ## for numeric interpolation
  local.int = floor(local.connectivity)
  interpolation = local.connectivity-local.int
  
  ## precompute constants used within loop
  k.dist.mean = mean(k.dist)
  k.dist.cols = 1:ncol(k.dist)
  
  for (i in 1:nrow(k.dist)) {
    ## identify distances to ith point
    i.dist = k.dist[i,]
    i.nonzero = i.dist[i.dist!=0]
    i.num.nonzero = length(i.nonzero)
    if (i.num.nonzero>local.connectivity) {
      if (local.int>0) {
        rho[i] = i.nonzero[local.int] +
          interpolation*(i.nonzero[local.int+1]-i.nonzero[local.int])
      } else {
        rho[i] = interpolation*i.nonzero[1]
      }
    } else if (i.num.nonzero>0 & i.num.nonzero<local.connectivity)  {
      rho[i] = max(i.nonzero)
    } 

    lo = 0
    hi = Inf
    mid = 1
    
    ## iterate to get a normalization factor
    for (n in 2:iterations) {
      val = sapply(k.dist[i,2:ncol(k.dist)]-rho[i], max, 0)
      val = sum(exp(-val/mid))
      if (abs(val-target)<tolerance) {
        break
      }
      if (val>target) {
        hi = mid
        mid = mean(c(lo, hi))
      } else {
        lo = mid
        if (!is.finite(hi)) {
          mid = mid*2
        } else {
          mid = mean(c(lo, hi))
        }
      }
    }
    result[i] = mid
    
    if (rho[i]>0) {
      result[i] = max(result[i], min.dist.scale*mean(i.dist))
    } else {
      result[i] = max(result[i], min.dist.scale*k.dist.mean)
    }
  }
  
  list(distances=result, nearest=rho)
}
donelsonsmith/umap_R documentation built on Nov. 4, 2019, 10:58 a.m.