SINTER: Single-cell genomic data integration

Documented in eval_neighbor_test neighbor_test neighbor_test_fast

#' @title Evaluate the spacial discribution of the mixed single cells from different data types
#' @description This function is used for testing whether the single cells from different data types are mixed well.
#' For each cell in the input data, a fisher's extract test will be performed to test whether the ratio of input cells to reference cells in the given region is the same as the ratio of the total number of input cells to the total number of reference cells.
#' @param input_data Low dimensional representation of single cell from one data type as the input for matching (e.g., PCs from scRNA-seq data).
#' @param ref_data Low dimensional representation of single cell from another data type as the reference for matching (e.g., PCs from scATAC-seq data).
#' @param dist_scale Scale used to define the radius of the region for testing.
#' @param print_message Flag to print the radius used for the testing.
#' @return
#'  \item{input_count}{The number of cells in the input data for each test.}
#'  \item{ref_count}{The number of cells in the reference data for each test.}
#'  \item{pval}{P-values from fisher's extract tests.}
#' @keywords spacial test
#' @examples
#' \dontrun{
#' neighbor_test_p <- neighbor_test(input_data,ref_data,dist_scale=10)$pval
#' }
#' @export
neighbor_test <- function(input_data,ref_data,dist_scale=10,print_message=TRUE){
  
  dist_search <- max(max(dist(input_data)),max(dist(ref_data)))/dist_scale
  
  if(print_message){
    print(paste0("Using radius ",dist_search))
  }
  
  input_expect <- nrow(input_data)
  ref_expect <- nrow(ref_data)
  
  dist_all <- as.matrix(dist(rbind(input_data,ref_data)))
  input_dist_all <- dist_all[c(1:nrow(input_data)),c(1:nrow(input_data))]
  ref_dist_all <- dist_all[c(1:nrow(input_data)),-c(1:nrow(input_data))]
  
  input_count <- rowSums(input_dist_all < dist_search)
  ref_count <- rowSums(ref_dist_all < dist_search)
  
  fisher_pval <- sapply(1:nrow(input_data),function(i){
    ##self included
    test_table <- matrix(c(input_count[i],ref_count[i],input_expect,ref_expect),
                         nrow = 2,dimnames = list(c("input", "ref"),c("observe", "expect")))
    fisher.test(test_table)$p.value
  })
  
  return(list(input_count=input_count,ref_count=ref_count,pval=fisher_pval))
}


#' @title Evaluate the spacial discribution of the mixed single cells from different data types. (Fast version of neighbor_test)
#' @description This function is used for testing whether the single cells from different data types are mixed well.
#' A fisher's extract test will be performed to test whether the average ratio of input cells to reference cells in the given region is the same as the ratio of the total number of input cells to the total number of reference cells.
#' @param input_data Low dimensional representation of single cell from one data type as the input for matching (e.g., PCs from scRNA-seq data).
#' @param ref_data Low dimensional representation of single cell from another data type as the reference for matching (e.g., PCs from scATAC-seq data).
#' @param dist_scale Scale used to define the radius of the region for testing.
#' @param print_message Flag to print the radius used for the testing.
#' @return
#'  \item{input_count}{The number of cells in the input data for each test.}
#'  \item{ref_count}{The number of cells in the reference data for each test.}
#'  \item{pval}{P-values from fisher's extract test.}
#' @keywords spacial test
#' @examples
#' \dontrun{
#' neighbor_test_p <- neighbor_test_fast(input_data,ref_data,dist_scale=10)$pval
#' }
#' @export
neighbor_test_fast <- function(input_data,ref_data,dist_scale=10,print_message=TRUE){
  
  dist_search <- max(max(dist(input_data)),max(dist(ref_data)))/dist_scale
  
  if(print_message){
    print(paste0("Using radius ",dist_search))
  }
  
  input_expect <- nrow(input_data)
  ref_expect <- nrow(ref_data)
  
  dist_all <- as.matrix(dist(rbind(input_data,ref_data)))
  input_dist_all <- dist_all[c(1:nrow(input_data)),c(1:nrow(input_data))]
  ref_dist_all <- dist_all[c(1:nrow(input_data)),-c(1:nrow(input_data))]
  
  input_count <- rowSums(input_dist_all < dist_search)
  ref_count <- rowSums(ref_dist_all < dist_search)
  
  input_ref_ratio <- median(input_count/(ref_count+1))
  ref_count_test <- median(ref_count+1)
  input_count_test <- round(input_ref_ratio*median(ref_count_test))
  
  test_table <- matrix(c(input_count_test,ref_count_test,input_expect,ref_expect),
                       nrow = 2,dimnames = list(c("input", "ref"),c("observe", "expect")))
  fisher_pval <- fisher.test(test_table)$p.value
  
  return(list(input_count=input_count,ref_count=ref_count,pval=fisher_pval))
}


#' @title Evaluate the spacial discribution of the mixed single cells from different data types while given the cell type labels
#' @description This function is used for evaluating whether single cells from the same cell type are mixed well.
#' For each cell in the input data, a fisher's extract test will be performed to test whether the ratio of input cells to reference cells in the given region is the same as the ratio of the total number of input cells to the total number of reference cells while only single cells from the same cell type are considered.
#' @param input_data Low dimensional representation of single cell from one data type as the input for matching (e.g., PCs from scRNA-seq data).
#' @param ref_data Low dimensional representation of single cell from another data type as the reference for matching (e.g., PCs from scATAC-seq data).
#' @param input_mem Cell type label for the input_data.
#' @param ref_mem Cell type label for the ref_data.
#' @param dist_scale Scale used to define the radius of the region for testing.
#' @param print_message Flag to print the radius used for the testing.
#' @return
#'  \item{test_stat}{P-values from fisher's extract tests.}
#' @keywords spacial test evaluation
#' @examples
#' \dontrun{
#' test_stat <- eval_neighbor_test(input_data,ref_data,input_mem,ref_mem,dist_scale=10,print_message=TRUE)
#' }
#' @export
eval_neighbor_test <- function(input_data,ref_data,input_mem,ref_mem,dist_scale=10,print_message=TRUE){
  
  dist_search <- max(max(dist(input_data)),max(dist(ref_data)))/dist_scale
  test_stat <- rep(NA,nrow(input_data))
  input_expect <- nrow(input_data)
  ref_expect <- nrow(ref_data)
  
  if(print_message){
    print(paste0("Using radius ",dist_search))
  }
  
  for(i in 1:nrow(input_data)){
    ref_dist <- sapply(1:nrow(ref_data),function(x){
      sqrt(sum((input_data[i,] - ref_data[x,]) ^ 2))})
    input_dist <- sapply(1:nrow(input_data),function(x){
      sqrt(sum((input_data[i,] - input_data[x,]) ^ 2))})
    ref_count <- length(intersect(which(ref_dist < dist_search),which(ref_mem == input_mem[i])))
    input_count <- length(intersect(which(input_dist < dist_search),which(input_mem == input_mem[i])))
    
    test_table <- matrix(c(input_count,ref_count,input_expect,ref_expect),
                         nrow = 2,dimnames = list(c("input", "ref"),c("observe", "expect")))
    test_stat[i] <- fisher.test(test_table)$p.value
  }
  return(test_stat)
}