R/graph.R

Defines functions generateNetwork calculate_knn_graph

Documented in calculate_knn_graph generateNetwork

#' Isolation forest based KNN graph
#'
#' Calculate a KNN graph from a feature matrix and
#' prune the graph using isolation forest.
#'
#' @param feature_mat A feature matrix based on which KNN graph is computed
#' @param k Number of neighbours in KNN graph.
#' @param hardwareSetup Hardware setup for parallelising isolation forest
#'
#' @return edgeList
#'
#' @importFrom stats sd predict
#' @importFrom isotree isolation.forest
calculate_knn_graph <- function(feature_mat, k, hardwareSetup) {

    cell_list  <- rownames(feature_mat)
    ## For each cell, calculate the distance to save memory
    start_time <- Sys.time()
    edgeList   <- list()

    for (i in seq(dim(feature_mat)[1])) {
        #if ((i %% 10000) == 0)
        #    message(sprintf("Start pruning %i th cell. Cost %f seconds...", i, Sys.time() - start_time))
        # Calculate Euclidean distance for i-th sample
        dist_array <- array(
            apply(feature_mat, 1, function(x)
                sqrt(sum((feature_mat[i, ] - x)**2))
                )
            )

        nn  <- order(dist_array)[1:(k + 1)]
        nn  <- nn[nn != i] ## Excluding self in nearest neighbours
        iso <- isotree::isolation.forest(data     = feature_mat[nn, ],
                                         ntrees   = 100, ## as in original isolation forest paper
                                         nthreads = hardwareSetup$coresUage) ## parallelising
        preds     <- stats::predict(iso, feature_mat[nn, ])
        pruned    <- nn[which(preds <= 0.5)] ## Use 0.5 cutoff for outliers
        new_edges <- sapply(pruned, simplify = FALSE, function(x) return(c(i, x)))
        ## check if repetitively adding existing edges
        if (length(edgeList) > 0) {
            exist_edges <- vapply(new_edges, FUN.VALUE = logical(1), function(x) {
                hasEdge <- vapply(edgeList, FUN.VALUE = logical(1), function(y) {
                    return(setequal(x, y)) ## compare a fixed new edge with all existing edges
                })
                if (any(hasEdge)){
                    return(TRUE)
                } else {
                    return(FALSE)
                    }
            })

            ## Drop existing edges from new edges
            if (any(exist_edges))
                new_edges <- new_edges[-which(exist_edges)]
        }

        edgeList  <- c(edgeList, new_edges)
        ## Old KNN method consider mean + 1 std away as outliers
        #k_dist     <- dist_array[nn]
        #for (j in seq(k)) {
        #    if (dist_array[nn[j]] <= boundary) {
        #        weight <- 1.0
        #    } else {
        #        weight <- 0.0
        #    }
        #    edgeList[[i]] <- c(i, nn[j], weight)
        #}
    }

    edgeList <- do.call(rbind, edgeList)
    edgeList <- as.data.frame(edgeList)
    edgeList[, 1] <- cell_list[edgeList[, 1]]
    edgeList[, 2] <- cell_list[edgeList[, 2]]
    #colnames(edgeList) <- c("V1", "V2", "weight") # removed 0 weighted edges

    return(edgeList)
}

#' Infer a cell-cell network from the encoded space using one-stat KNN
#'
#' Compute a cell network from a feature matrix using KNN.
#'
#' @param feature_mat An encoded matrix generated by the feature auto-encoder.
#' @param k Number of neighbours in KNN graph. If k is not provided,
#'     then the best heuristic k = floor(sqrt(N)) will be used,
#'     where N is the sample size.
#' @param hardwareSetup Hardware setup for running isolation forest
#'
#' @return An igraph object to plot the cell network.
#'
#' @references
#' \insertRef{scGNN}{scRGNet}
#' \insertRef{isotree}{scRGNet}
#' \insertRef{isolationForest}{scRGNet}
#' \insertRef{stats}{scRGNet}
#' \insertRef{defaultK}{scRGNet}
#' \insertRef{igraph}{scRGNet}
#'
#' @examples
#' # Example 1:
#' # Tested examples. Not run for fast package compiling
#' \dontrun{
#' # Accessing the demo gene_counts_small dataset available with the package
#' inputCountsPath <- system.file("extdata", "GSE138852_small.csv", package = "scRGNet")
#' # Preprocess the raw counts
#' counts <- preprocessCSV(path = inputCountsPath)
#' ltmg <- runLTMG(counts)
#' hyperParams <- setHyperParams(regu_epochs = 5L)
#' hardwareSetup <- setHardware(coresUsage = 1L)
#' z <- runFeatureAE(scDataset = counts, LTMG_mat = ltmg, hyperParams, hardwareSetup)
#' net <- generateNetwork(z)
#'}
#'
#' @export
#' @importFrom Rdpack reprompt
#' @importFrom igraph graph_from_data_frame
generateNetwork <- function(feature_mat, k = NULL, hardwareSetup = NULL) {

    N <- dim(feature_mat)[1]
    if (is.null(k)) {
        k <- floor(sqrt(N)) ## Best heuristic for k
    } else {
        if ((!is.numeric(k)) | k %% 1 != 0 | k < 1 | k > N) ## Lazy evaluation :)
            stop("Invalid argument for k: Must be a positive integer greater than 0.")
    }

    if (is.null(hardwareSetup))
        hardwareSetup <- setHardware()

    cell_list <- rownames(feature_mat)
    edgeList  <- calculate_knn_graph(feature_mat, k, hardwareSetup)
    graph     <- igraph::graph_from_data_frame(edgeList, directed = FALSE)

    #adj   <- igraph::as_adj(graph, attr = "weight")
    return(graph)
}

# [END]
ff98li/scRGNet documentation built on Jan. 14, 2022, 4:58 a.m.