R/util.R

Defines functions filter_by_size pair_cor logCPM cpm sample_cells calc_tau sparse_cor get_cl_prop get_cl_medians get_cl_means get_cl_sums get_cl_mat convert_pair_matrix_str convert_pair_matrix get_pairs set_pair_matrix get_pair_matrix get_pair_matrix_coor

Documented in calc_tau convert_pair_matrix convert_pair_matrix_str cpm get_cl_mat get_cl_means get_cl_medians get_cl_prop get_cl_sums get_pair_matrix get_pair_matrix_coor get_pairs logCPM pair_cor sample_cells set_pair_matrix sparse_cor

# Function call map
# function_1()
#   called_by_function_1() called_function_file.R
#
# get_pair_matrix_coor()
#
# get_pair_matrix()
#   get_pair_matrix_coor() util.R
#
# set_pair_matrix()
#   get_pair_matrix_coor() util.R
#
# get_pairs()
#
# convert_pair_matrix()
#   get_pairs() util.R
#   set_pair_matrix() util.R
#
# convert_pair_matrix_str()
#
# get_cl_mat()
#
# get_cl_sums()
#   get_cl_mat() util.R
#
# get_cl_means()
#   get_cl_sums() util.R
#
# get_cl_medians()
#
# get_cl_prop()
#   get_cl_mat() util.R
#
# sparse_cor()
#
# calc_tau()
#
# sample_cells()
#
# cpm()
#
# logCPM()
#   cpm() util.R
# 
# pair_cor()
#

#' Convert matrix row/column positions to vector position
#' 
#' @param m a matrix object
#' @param rows row positions, either as indices or matches to row names
#' @param cols row positions, either as indices or matches to row names
#' 
#' @return a vector containing the numeric position at [rows,cols]
#' 
get_pair_matrix_coor <- function(m, 
                                 rows, 
                                 cols) {
  
  if(!is.numeric(rows)){
    rows <- match(rows, rownames(m))
  }
  
  if(!is.numeric(cols)){
    cols <- match(cols, colnames(m))
  }
  
  coor <- (cols - 1L) * nrow(m) + rows
  
  return(coor)
}

#' Subset a matrix as a vector using row and column positions
#' 
#' @param m a matrix object
#' @param rows row positions, either as indices or matches to row names
#' @param cols row positions, either as indices or matches to row names
#' 
#' @return a vector with values extracted from m at [rows/cols]
#' 
get_pair_matrix <- function(m, 
                            rows, 
                            cols) {
  v <- as.vector(m)
  coor <- get_pair_matrix_coor(m, rows, cols)
  return(v[coor])
}


#' Update a matrix with values from a 1d vector using row and column positions
#' 
#' @param m a matrix object
#' @param rows row positions, either as indices or matches to row names
#' @param cols row positions, either as indices or matches to row names
#' @param vals values to insert at [rows/cols]
#' 
#' @return a matrix with updated values
#' 
set_pair_matrix <- function(m, 
                            rows, 
                            cols, 
                            vals) {
  
  coor <- get_pair_matrix_coor(m, rows, cols)
  m[coor] <- vals
  
  return(m)
}

#' Convert underscore_separated pair names to a data.frame
#' 
#' @param pairs.str a character vector with underscore_separated values
#' 
#' @return a data.frame with columns "P1" and "P2" containing the separated pair names
#' 
#' @export
#' 
get_pairs <- function(pairs.str) {
  
  pairs_split <- strsplit(pairs.str, "_")
  pairs_mat <- do.call("rbind", pairs_split)
  pairs_df <- as.data.frame(pairs_mat,
                            stringsAsFactors = FALSE)
  
  row.names(pairs_df) <- pairs.str
  colnames(pairs_df) <- c("P1", "P2")
  
  return(pairs_df)
}


#' Convert paired cluster comparison values to a matrix
#' 
#' @param pair.num a named numeric vector of values. Names correspond to compared elements separted by "_", e.g. "c1_c2", "c23_c59"
#' @param l labels for columns. Default is NULL, which will compute them from names(pair.num)
#' @param directed If FALSE (default), the first value in each pair will specify used as columns, with the second as rows. 
#' If TRUE, first values will be rows, and second will be columns.
#' 
#' @return a matrix containing values from pair.num, and named for each element separated by "_" in names(pair.num)
#'  
#' @examples
#' 
#' pair_values <- seq(1,27,3)
#' names(pair_values) <- paste(rep(letters[1:3], each = 3), rep(letters[1:3], 3), sep = "_")
#' pair_values
#' 
#' pair_matrix <- convert_pair_matrix(pair_values, directed = FALSE)
#' pair_matrix
#' 
#' pair_matrix <- convert_pair_matrix(pair_values, directed = TRUE)
#' pair_matrix
#' 
convert_pair_matrix <- function(pair.num, 
                                l = NULL,
                                directed = FALSE) {
  
  pairs <- get_pairs(names(pair.num))
  
  if(is.null(l)){
    l <- sort(unique(c(pairs[,1], pairs[,2])))
  }
  
  n.cl <- length(l)
  
  pair.num.mat <- matrix(0, nrow = n.cl, ncol = n.cl)
  rownames(pair.num.mat) <- l
  colnames(pair.num.mat) <- l
  
  pair.num.mat <- set_pair_matrix(pair.num.mat, pairs[,1], pairs[,2], pair.num)
  
  if(!directed) {
    pair.num.mat <- set_pair_matrix(pair.num.mat, pairs[,2], pairs[,1], pair.num)
  }
  
  return(pair.num.mat)
}

#' Convert paired cluster comparison values to a matrix
#' 
#' @param pair.num a vector of values with names of compared elements separted by "_", e.g. "c1_c2", "c23_c59"
#' @param l labels for columns. Default is NULL, which will compute them from names(pair.num)
#' @param directed If FALSE (default), the first value in each split will be used as columns, with the second as rows. 
#' If TRUE, first values will be rows, and second will be columns.
#' 
#' @return a matrix containing values from pair.num, and named for each element separated by "_" in names(pair.num)
#'  
#' @examples
#' 
#' pair_values <- seq(1,27,3)
#' names(pair_values) <- paste(rep(letters[1:3], each = 3), rep(letters[1:3], 3), sep = "_")
#' pair_values
#' 
#' pair_matrix <- convert_pair_matrix(pair_values, directed = FALSE)
#' pair_matrix
#' 
#' pair_matrix <- convert_pair_matrix(pair_values, directed = TRUE)
#' pair_matrix
#' 
convert_pair_matrix_str <- function(pair.str, 
                                    l = NULL,
                                    directed = FALSE) {
  
  pairs <- do.call("rbind", strsplit(names(pair.str),"_"))
  
  if(is.null(l)){
    l <- sort(unique(as.vector(pairs)))
  }
  
  n.cl <- length(l)
  
  pair.str.mat <- matrix("", nrow = n.cl, ncol = n.cl)
  rownames(pair.str.mat) <- l
  colnames(pair.str.mat) <- l
  
  for(i in 1:nrow(pairs)){
    pair.str.mat[pairs[i,1], pairs[i,2]] <- pair.str[i]
    if(!directed) {
      pair.str.mat[pairs[i,2], pairs[i,1]] <- pair.str[i]
    }
  }
  return(pair.str.mat)
}


#' Generate a sparse matrix one-hot representation of clusters x samples
#' 
#' @param cl a cluster factor object
#' 
#' @return a sparse, one-hot matrix indicating which cluster(columns) each sample (rows) belongs to.
#' @export
#' 
get_cl_mat <- function(cl) {
  
  if(!is.factor(cl)){
    cl <- as.factor(cl)
  }
  cl <- droplevels(cl)
  
  cl.mat <- Matrix::sparseMatrix(i = 1:length(cl),  
                                 j = as.integer(cl), 
                                 x = 1)
  
  rownames(cl.mat) <- names(cl)
  colnames(cl.mat) <- levels(cl)
  
  return(cl.mat)
}

#' Compute cluster sums for each row in a matrix
#' 
#' @param mat A gene (rows) x samples (columns) sparse matrix
#' @param cl A cluster factor object
#' 
#' @return a matrix of genes (rows) x clusters (columns) with sums for each cluster
#' @export
#' 
get_cl_sums <- function(mat, 
                        cl) {
  
  cl.mat <- get_cl_mat(cl)
  if(all(names(cl) %in% colnames(mat))){
    cl.sums <- Matrix::tcrossprod(mat[,rownames(cl.mat)], Matrix::t(cl.mat))
  }
  else{
    cl.sums <- Matrix::crossprod(mat[rownames(cl.mat),], cl.mat)
  }
  cl.sums <- as.matrix(cl.sums)
  return(cl.sums)
}

#' Compute cluster means for each row in a matrix
#' 
#' @param mat A gene (rows) x samples (columns) sparse matrix
#' @param cl A cluster factor object
#' 
#' @return a matrix of genes (rows) x clusters (columns) with means for each cluster
#' @export
#' 
get_cl_means <- function(mat, 
                         cl) {
  
  cl.sums <- get_cl_sums(mat, cl)
  
  cl.size <- table(cl)
  
  cl.means <- as.matrix(Matrix::t(Matrix::t(cl.sums)/as.vector(cl.size[colnames(cl.sums)])))
  
  return(cl.means)
}

#' Compute cluster medians for each row in a matrix
#' 
#' @param mat A gene (rows) x samples (columns) sparse matrix
#' @param cl A cluster factor object
#' 
#' @return a matrix of genes (rows) x clusters (columns) with medians for each cluster
#' @export
#' 
get_cl_medians <- function(mat, cl)
{
  library(Matrix)
  library(matrixStats)
  
  cl.med <- do.call("cbind",
                    tapply(names(cl), 
                           cl, 
                           function(x){
                             matrixStats::rowMedians(as.matrix(mat[,x]))
                           }
                    )
  )
  
  rownames(cl.med) <- rownames(mat)
  
  return(cl.med)
}


#' Compute cluster proportions for each row in a matrix
#'
#' @param mat a gene (rows) x samples(columns) sparse matrix
#' @param cl A cluster factor object
#' @param threshold The minimum expression value used to binarize the results
#'
#' @return a matrix of genes (rows) x cluster(columns) with proportions for each cluster
#' @export
#' 
get_cl_prop <- function(mat, 
                        cl, 
                        threshold = 1) {
  
  cl.mat <- get_cl_mat(cl)
  
  cl.prop <- Matrix::tcrossprod(mat[,rownames(cl.mat)] > threshold, Matrix::t(cl.mat))
  
  cl.prop <- as.matrix(cl.prop) / Matrix::colSums(cl.mat)[col(cl.prop)]
  
  return(cl.prop)
}


#' Compute correlation scores for columns of a sparse matrix
#' 
#' @param m a sparse matrix, preferrably dgCMatrix
#' 
#' @return a matrix of correlation values between each column of m
#'  
sparse_cor <- function(m) {
  #library(Matrix)
  
  n_rows <- nrow(m)
  n_cols <- ncol(m)
  
  ii <- unique(m@i) + 1 # rows with a non-zero element
  
  Ex <- Matrix::colMeans(m)
  
  nozero <- as.vector(m[ii,]) - rep(Ex, each = length(ii))        # colmeans
  
  covmat <- (crossprod(matrix(nozero, ncol = n_cols)) +
               crossprod(t(Ex)) * (n_rows - length(ii))
  ) / (n_rows - 1)
  
  sdvec <- sqrt(diag(covmat))
  
  cormat <- covmat / crossprod(t(sdvec))
  
  return(cormat)
}

#' Calculate Tau scores for each gene
#' 
#' @param m Matrix of expression values
#' @param byRow if TRUE, treats genes as row values and samples as columns.
#' 
#' @return a vector of Tau scores
#' 
calc_tau <- function(m, 
                     byRow = TRUE) {
  if(!byRow){
    m <- t(m)
  }
  
  row_maxes <- apply(m, 1, max)
  
  m <- m / row_maxes
  tau <- Matrix::rowSums(1 - m) / (ncol(m) - 1)
  tau[is.na(tau)] <- 0
  return(tau)
}

#' Downsample cells from each cluster
#' 
#' @param cl A cluster factor object
#' @param sample.size A maximum number of cells to take from each cluster, or a named numeric object with the number of cells to be sampled and names matching cluster levels
#' @param weights A named numeric vector with weights for each cell, passed to the prob parameter of the sample() function. Default is NULL.
#' @param seed A seed value for random sampling. If NULL (default), will be randomized.
#' 
#' @return A cluster factor object containing sampled cells
#' 
sample_cells <- function(cl, 
                         sample.size, 
                         weights = NULL,
                         seed = NULL) {
  
  n_cl <- unique(cl)
  
  if(length(sample.size) == 1) {
    
    sample.size <- setNames(rep(sample.size, length(n_cl)), n_cl)
    
  }
  
  cl.cells <- split(names(cl), cl)
  
  sampled.cells <- sapply(names(cl.cells), 
                          function(x) {
                            cells <- cl.cells[[x]]
                            
                            if(sample.size[[x]] >= length(cells)){
                              return(cells)
                            }
                            
                            to.sample <- pmin(sample.size[[x]], length(cells))
                            
                            if(!is.null(weights)){
                              set.seed(seed)
                              
                              sampled <- sample(cells, to.sample, prob = weights[cells])
                              
                            } else{
                              set.seed(seed)
                              
                              sampled <- sample(cells, to.sample)
                            }
                            
                            sampled
                            
                          }, simplify = FALSE)
  
  sampled.cells <- unlist(sampled.cells)
  
  return(sampled.cells)
}

#' Convert a matrix of raw counts to a matrix of Counts per Million values
#' 
#' The input can be a base R matrix or a sparse matrix from the Matrix package.
#' 
#' This function expects that columns correspond to samples, and rows to genes.
#' 
#' @param counts a matrix, dgCMatrix, or dgTMatrix of count values.
#' 
#' @return a matrix, dgCMatrix, or dgTMatrix of CPM values (matching input)
#' 
#' @export
#' 
cpm <- function(counts) {
  
  sf <- Matrix::colSums(counts) / 1e6
  
  if(is.matrix(counts)){    
    return(t(t(counts) / sf))
  } else if(class(counts) == "dgCMatrix") {
    sep <- counts@p
    sep <- sep[-1] - sep[-length(sep)]
    j <- S4Vectors::Rle(1:length(sep), sep)
    counts@x <- counts@x / sf[as.integer(j)]
  }
  else if(class(counts)=="dgTMatrix"){
    j = counts@j
    counts@x = counts@x/sf[j+1]
  }
  else{
    stop(paste("cpm function for", class(counts)[1], "not supported"))
  }
  return(counts)
}

#' Convert a matrix of raw counts to a matrix of log2(Counts per Million + 1) values
#' 
#' The input can be a base R matrix or a sparse matrix from the Matrix package.
#' 
#' This function expects that columns correspond to samples, and rows to genes.
#' 
#' @param counts a matrix, dgCMatrix, or dgTMatrix of count values.
#' 
#' @return a matrix, dgCMatrix, or dgTMatrix of log2(CPM + 1) values (matching input)
#' 
#' @export
#' 
logCPM <- function(counts) {
  
  norm.dat <- cpm(counts)
  
  if(is.matrix(norm.dat)){
    norm.dat <- log2(norm.dat + 1)
  } else {
    norm.dat@x <- log2(norm.dat@x + 1)
  }
  
  norm.dat
}

#' Compute correlations between each matching row or column of two matrices
#' 
#' e.g. row 1 of mat1 will be correlated with row 1 of mat2; row 2 of mat 1 with row 2 of mat2, etc.
#' 
#' The matrices must have the same number of rows or columns
#' 
#' @param mat1 a numeric matrix for correlation
#' @param mat2 a second numeric matrix for correlation
#' @param margin 1 for rows, 2 for columns (as for the MARGIN parameter of apply())
#' 
#' @return a numeric vector with paired correlation values
#' 
#' @export
#' 
pair_cor <- function(mat1, 
                     mat2,
                     margin = 1) {
  
  if(!margin %in% c(1,2)) {
    stop("margin must be either 1 (rows) or 2 (columns).")
  }
  
  if(margin == 1) {
    
    if(nrow(mat1) != nrow(mat2)) {
      stop("mat1 and mat2 must have an equal number of rows when margin = 1.")
    }
    
    
  } else if(margin == 2) {
    
    if(ncol(mat1) != ncol(mat2)) {
      stop("mat1 and mat2 must have an equal number of columns when margin = 2.")
    }
    
    mat1 <- t(mat1)
    mat2 <- t(mat2)
  }
  
  mat1 <- mat1 - rowMeans(mat1)
  mat2 <- mat2 - rowMeans(mat2)
  sd1 <- rowSds(mat1)
  sd2 <- rowSds(mat2)
  cors <- rowSums(mat1 * mat2) / ((ncol(mat1) - 1) * sd1 * sd2)  
  return(cors)
}


filter_by_size <- function(cat, min.size)
  {
    cat.size = table(cat)
    select.cat = names(cat.size)[cat.size >= min.size]
    cat = cat[cat %in% select.cat]
    if(is.factor(cat)){
      cat = droplevels(cat)
    }
  }
AllenInstitute/scrattch.hicat documentation built on Oct. 20, 2023, 6:55 a.m.