smmtools: Single-cell multi-omics sequencing data tools

Documented in runKNN runLeiden

#' Simplified runKNN in SnapATAC
#'
#' Builds an unweighted k-nearest-neighbour (KNN) adjacency matrix from the
#' neighbour indices returned by \code{RANN::nn2}.
#'
#' We have checked this function:
#' it returns the same KNN mat as SnapATAC does.
#'
#' @param smat dense matrix or matrix, cell by feature
#' @param k integer, max number of nearest neighbours, should be between
#' 10 and 50. If the number of cells is smaller than \code{k}, \code{k} is
#' reset to the number of cells minus one.
#' @param treetype string, "kd" or "bd", "bd" is useful for larger point sets
#' and local clusters in the dataset, which could reduce the depth of the tree.
#' Default is "kd".
#' NOTE: "bd" may have bugs on Linux (Ubuntu) but not on MacOS. When I use bd,
#' my task is always killed no matter how much memory I use (I even used 200GB
#' for a small dataset: < 60,000 points).
#' This does not happen on MacOS. But "kd" works on Linux.
#' @param searchtype string, "standard", "priority", or "radius".
#' Default is "standard". No radius is forwarded to \code{RANN::nn2}, so
#' "radius" runs with whatever default radius RANN uses; neighbours that RANN
#' reports as not found (index 0) are left out of the matrix.
#' @param nn_eps Error bound when performing nearest neighbor search using
#' RANN. Default of 0.0 implies exact nearest neighbor search.
#' @return sparseMatrix, KNN matrix, ncell by ncell, value is 1 (unweighted),
#' including the diagonal part. Row names are copied from \code{smat} when it
#' has any.
#' @export
runKNN <- function(smat, k = 20, treetype = "kd",
                   searchtype = "standard",
                   nn_eps = 0.0) {
  message(paste("Generate KNN with", k))
  ncell <- nrow(smat)
  if (ncell < k) {
    message(paste("Ncell", ncell, "is smaller than K nearst neighbor", k))
    k <- ncell - 1
    message("Set k as Ncell - 1.")
  }

  nn_idx <- RANN::nn2(
    data = smat, k = k,
    treetype = treetype,
    searchtype = searchtype,
    eps = nn_eps
  )$nn.idx

  # nn_idx has one row per cell; flatten it row by row so that the neighbours
  # of a cell are contiguous and line up with the repeated row index.
  col_idx <- as.vector(t(nn_idx))
  row_idx <- rep(seq_len(nrow(nn_idx)), each = ncol(nn_idx))

  # RANN fills nn.idx with 0 where fewer than k neighbours were found (radius
  # search). 0 is not a valid 1-based column index for sparseMatrix, so those
  # placeholder entries are dropped instead of aborting the whole call.
  found <- col_idx > 0
  kmat <- Matrix::sparseMatrix(
    i = row_idx[found],
    j = col_idx[found],
    x = rep(1, sum(found)),
    dims = c(ncell, ncell)
  )
  if (!is.null(rownames(smat))) {
    rownames(kmat) <- rownames(smat)
  }
  kmat
}

#' Run Leiden algorithm as graph-based clustering.
#'
#' Hands the KNN matrix to the \code{leiden} function of the Python module
#' \code{smmuty} through reticulate, prints a table of cluster sizes and
#' returns the cluster labels as a factor.
#'
#' @param kmat sparseMatrix, KNN matrix generated by runKNN.
#' @param path_to_python string, Python location passed to
#' \code{reticulate::use_python} with \code{required = TRUE}. Default is NULL,
#' in which case no interpreter is selected explicitly.
#' @param reso double, resolution param in Leiden default is 0.8
#' @param seed integer, used for Leiden, default is 10
#' @param partitionType string, used for Leiden, default is "RB"
#' @return factor, cluster index for cells
#' @import reticulate
#' @export
runLeiden <- function(kmat,
                      path_to_python = NULL,
                      reso = 0.8, seed = 10,
                      partitionType = "RB") {
  message(paste(
    "Run Leiden for clustering with resolution", reso,
    "and partitionType", partitionType
  ))
  if (!is.null(path_to_python)) {
    reticulate::use_python(path_to_python, required = TRUE)
    message("Use the Python located in: ", path_to_python)
  }
  # Lift any CPU / elapsed time limit set on the session before the call into
  # Python. NOTE(review): presumably because clustering can run for a long
  # time; the limit is not restored afterwards.
  setSessionTimeLimit(cpu = Inf, elapsed = Inf)

  # convert = FALSE keeps the result as a Python object until the explicit
  # py_to_r() call below.
  ld <- reticulate::import(module = "smmuty", convert = FALSE)
  py_labels <- ld$leiden(
    knn = reticulate::r_to_py(kmat),
    reso = reso,
    seed = seed,
    opt = partitionType
  )
  ldCluster <- as.factor(reticulate::py_to_r(py_labels))

  message("Summary of clustering:")
  print(table(ldCluster))
  ldCluster
}