R/module_scores.R

Defines functions plot_genes_module plot_scores_heatmap module_score max_scores run_iscore run_sscore run_zscore run_rsscore filter_mat_by_cpm rescale_vector normalize_mat_by_gene calc_boot_score bin_genes get_binned_module get_smooth_module get_boot_scores scale_rows scale_rows_01 scale_rows_rank simple_score boot_seurat boot_zval boot_pval softmax

# module scores 
library(matrixStats)

# scoring methods are from Igor and Teo

# scoring qc - silhouette

# compare scoring 

# scoring plots

# heatmap of scores

#heatmap of each gene in module list


plot_genes_module <- function(module_scores, data, module_tbl, id = ""){
  
  #iterate through celltypes
  for(i in 1:length(unique(module_tbl$celltype))){
    current_celltype <- unique(module_tbl$celltype)[i]
    # get genes in that celltype   grepl(paste0("^", current_celltype), module_tbl$celltype)
    #current_genes <- module_tbl$gene[which(module_tbl$celltype %in% current_celltype)]
    current_genes <- module_tbl$gene[grepl(paste0("^", current_celltype), module_tbl$celltype)] 
    # get counts
    current_data <- as.matrix(data[which(rownames(data) %in% current_genes),])
    # get scores as dataframe
    module_scores_annotation <- as.data.frame(do.call(cbind,lapply(module_scores, function(x){
      position_of_celltype <- grepl(paste0("^", current_celltype), colnames(x))
      out <- x %>% 
        rownames_to_column('cluster') %>% 
        select("cluster", colnames(x)[position_of_celltype]) %>% 
        column_to_rownames('cluster')
    })))
    #colnames(module_scores_annotation) <- make.names(colnames(module_scores_annotation), unique = TRUE)
    
    if(nrow(current_data) == 1){next}
    
    sd_cols <- apply(current_data, 2, sd)
    if(length(which(sd_cols == 0)) != 0){
      current_data <- current_data[, -which(sd_cols == 0)]
    }
    sd_rows <- apply(current_data, 1, sd)
    if(length(which(sd_rows == 0)) != 0){
      current_data <- current_data[-which(sd_rows == 0),]
    }
    
    # generate the heatmap
    
    png(glue("{id}.{current_celltype}.genes.heatmap.png"), width = 10, height = 10, units = "in", res = 300)
    pheatmap(
      current_data, scale = "row", annotation_col= module_scores_annotation, 
      fontsize = 10, fontsize_row = 8, fontsize_col = 12, show_colnames = TRUE)
    dev.off()
    Sys.sleep(1)
  }
  # and one all together
  current_data <- as.matrix(data[which(rownames(data) %in% module_tbl$gene),])
  sd_cols <- apply(current_data, 2, sd)
  if(length(which(sd_cols == 0)) != 0){
    current_data <- current_data[, -which(sd_cols == 0)]
  }
  sd_rows <- apply(current_data, 1, sd)
  if(length(which(sd_rows == 0)) != 0){
    current_data <- current_data[-which(sd_rows == 0),]
  }
  module_scores_annotation <- as.data.frame(do.call(cbind,lapply(module_scores, function(x){
    position_of_celltype <- grepl(paste0("^", current_celltype), colnames(x))
    out <- x %>% 
      rownames_to_column('cluster') %>% 
      select("cluster", ends_with("module")) %>% 
      column_to_rownames('cluster')
  })))
  colnames(module_scores_annotation) <- make.names(colnames(module_scores_annotation), unique = TRUE)
  # generate the heatmap
  png(glue("{id}.all.genes.heatmap.png"), width = 30, height = 30, units = "in", res = 300)
  pheatmap(
    current_data, scale = "row", annotation_col= module_scores_annotation, 
    fontsize = 10, fontsize_row = 8, fontsize_col = 12, show_colnames = TRUE)
  dev.off()
  Sys.sleep(1)
}

plot_scores_heatmap <- function(module_scores, id = ""){
  
  for(i in 1:length(module_scores)){
    
    current_mod <- module_scores[[i]]
    
    main <- current_mod %>% 
      select(ends_with("score")) %>% 
      as.matrix()
    
    label <- current_mod %>% 
      select(ends_with("module")) %>% 
      rename(!!names(module_scores)[i] := ends_with("module")) %>% 
      as.data.frame()
    
    # generate the heatmap
    
    png(glue("{names(module_scores)[i]}.{id}.score.heatmap.png"), width = 10, height = 10, units = "in", res = 300)
    pheatmap(
      main, scale = "none", annotation_row = label, 
      fontsize = 10, fontsize_row = 8, fontsize_col = 12, show_colnames = TRUE)
    dev.off()
    Sys.sleep(1)
  }
}

module_score <- function(module_tbl, counts_norm = NULL, counts_raw = NULL, 
                         method = c("iscore", "sscore","zscore","rsscore")) {
  # returns module scores in a uniform way for the method specified
  # 
  # Args:
  #   module_tbl: a tibble in long format with celltype and gene
  #   counts_norm: normalized counts table for use with Teo's scores
  #   count_raw: raw counts table for use with Igor's scores
  #   method: character vector with the score 
  #
  # Results:
  #   list of dataframes of module scores
  
  scoring_methods <- c(iscore = "run_iscore",
                       sscore = "run_sscore",
                       zscore = "run_zscore",
                       rsscore = "run_rsscore")
  
  current_methods <- scoring_methods[method]
  
  module_scores <- lapply(current_methods, function(x){
    scores <- eval(as.name(x))(module_tbl= module_tbl, 
                                             counts_norm = counts_norm,
                                             counts_raw = counts_raw)
    
    scores_max <- max_scores(scores = scores, method = names(x))
  })
}

max_scores <- function(scores, method, threshold = 0) {
  # gets max pos or zero 
  # idk how to do Igors yet
  scores$unknown <- rep(threshold, nrow(scores))
  scores <- scores %>% 
    column_to_rownames("cell")
  scores$module <- colnames(scores)[apply(scores,1,which.max)]
  scores <- scores %>% 
    separate(module, c("module", NA))
  var <- sym(paste(eval(method), "_module", sep = ""))
  scores <- scores %>% 
    rename(!!var := module) %>% 
    select(-contains("unknown"))
  return(scores)
}

run_iscore <- function(module_tbl, counts_norm, counts_raw = NULL) {
  # module tbl should be in long format with celltype and gene
  
  module_tbl <- module_tbl %>% 
    filter(.$gene %in% rownames(counts_norm)) 
  
  counts_sub <- counts_norm[which(rownames(counts_norm) %in% module_tbl$gene),]
  counts_sub <- scale_rows(counts_sub)

  iscores <- module_tbl %>% 
    group_by(celltype) %>% 
    do(
      s <- simple_score(counts_sub, .$gene)
    ) %>% 
    spread(celltype, scores) %>% 
    rename_at(vars(-contains("cell")), list(~paste0(., "_Iscore")))
  
  return(iscores)
}

run_sscore <- function(module_tbl, counts_norm, counts_raw = NULL) {
  # module tbl should be in long format with celltype and gene

  module_list <- module_tbl %>%
    filter(.$gene %in% rownames(counts_norm)) %>% 
    with(split(.$gene, celltype))
  
  celltypes <- unique(module_tbl$celltype)
  
  rmodule_binned <- lapply(module_list, get_binned_module, 
                           binned_genes = bin_genes(counts_norm, nbins = 25L))
  
  score = vapply(celltypes, calc_boot_score, numeric(ncol(counts_norm)), score = boot_seurat, rmodule = rmodule_binned)
  sscore <- score %>% 
    as.data.frame() %>% 
    rownames_to_column("cell") %>% 
    rename_at(vars(-contains("cell")), list(~paste0(., "_Sscore")))
  return(sscore)
}

run_zscore <- function(module_tbl, counts_norm, counts_raw = NULL) {
  # module tbl should be in long format with celltype and gene
  
  module_list <- module_tbl %>%
    filter(.$gene %in% rownames(counts_norm)) %>% 
    with(split(.$gene, celltype))
  
  celltypes <- unique(module_tbl$celltype)
  
  rmodule_smooth = lapply(module_list, get_smooth_module, ave = rowMeans(counts_norm))
  
  score = vapply(celltypes, calc_boot_score, numeric(ncol(counts_norm)), score = boot_zval, rmodule = rmodule_smooth)
  zscore <- score %>% 
    as.data.frame() %>% 
    rownames_to_column("cell") %>% 
    rename_at(vars(-contains("cell")), list(~paste0(., "_Zscore")))
  return(zscore)
}

run_rsscore = function(module_tbl, counts_raw, min_cpm = 0, limit_pct = 1, counts_norm = NULL) {
  # perform the cell type enrichment calculation based on rescaled values
  
  module_list <- module_tbl %>%
    filter(.$gene %in% rownames(counts_raw)) %>% 
    with(split(.$gene, celltype))
  
  if (class(counts_raw) != "matrix") { stop("expression matrix is not a matrix") }
  if (max(counts_raw) < 100) { stop("expression values appear to be log-scaled") }
  
  # filter matrix for expressed genes only
  counts_raw = filter_mat_by_cpm(counts_raw = counts_raw, min_cpm = min_cpm)
  
  # rescale matrix for expressed genes only
  counts_raw_subs = normalize_mat_by_gene(counts_raw = counts_raw[unlist(module_list), ], limit_pct = limit_pct)
  

  # check if enough genes pass filter
  if (min(lengths(module_list)) < 3) { stop("too few genes per celltype") }
  
  # calculate average z-score per celltype
  celltype_scores_tbl = tibble()
  for (ct in names(module_list)) {
    celltype_scores_tbl =
      bind_rows(
        celltype_scores_tbl,
        tibble(
          cell = colnames(counts_raw_subs),
          celltype = ct,
          score = colMeans(counts_raw_subs[module_list[[ct]], ])
        )
      )
    ct_scores = colnames(counts_raw_subs)
  }

  celltype_scores_tbl <- celltype_scores_tbl %>% 
    spread(celltype, score) %>% 
    rename_at(vars(-contains("cell")), list(~paste0(., "_RSscore")))
  
  return(celltype_scores_tbl)
}

filter_mat_by_cpm = function(counts_raw, min_cpm = 0) {
  # filter matrix by a specified CPM value (higher in at least one sample/column for each gene/row)
  
  if (class(counts_raw) != "matrix") { stop("expression matrix is not a matrix") }
  #if (max(counts_raw) < 100) { stop("expression values appear to be log-scaled") }
  if (nrow(counts_raw) < 10000) { stop("expression matrix has too few genes") }
  
  # expression level equivalent to 1 CPM (1 for 1m, 0.01 for 10k)
  exp_cpm1 = (1 / 1000000) * median(colSums(counts_raw))
  # expression level equivalent to the requested CPM
  min_exp = exp_cpm1 * min_cpm
  # filtered expression matrix
  counts_raw = counts_raw[matrixStats::rowMaxs(counts_raw) > min_exp, ]
  
  return(counts_raw)
  
}

rescale_vector = function(x, limit_pct = 1) {
  x / quantile(x, limit_pct)
}

normalize_mat_by_gene = function(counts_raw, limit_pct = 1) {
  
  if (limit_pct > 1) { stop("percentile should be expressed as a fraction") }
  if (class(counts_raw) != "matrix") { stop("expression matrix is not a matrix") }
  #if (max(counts_raw) < 100) { stop("expression values appear to be log-scaled") }
  
  counts_raw = apply(counts_raw, MARGIN = 1, FUN = rescale_vector, limit_pct = limit_pct)
  counts_raw = t(counts_raw)
  counts_raw[counts_raw > 1] = 1
  
  return(counts_raw)
  
}
calc_boot_score = function(celltype, score, rmodule) 
  score(counts_norm, module_list[[celltype]], rmodule[[celltype]])

bin_genes = function(mat, nbins = 25L) {
  # given a matrix it splits the genes into `bins`
  # and also returns a vector (`gene2bin`) mapping genes to bins
  ave = rowMeans(mat)
  ave_bins = as.integer(Hmisc::cut2(ave, m = round(nrow(mat) / nbins)))
  names(ave_bins) = rownames(mat)
  list(bins = split(rownames(mat), ave_bins), gene2bin = ave_bins)
}

get_binned_module = function(module, binned_genes) {
  # given a list of binned genes and a gene-module, it returns a function
  # that generates `n` "similar" random modules
  pools = with(binned_genes, bins[gene2bin[module]])
  pools = lapply(pools, setdiff, module)
  function(n) 
    vapply(pools, sample, rep("", n), size = n, replace = TRUE, USE.NAMES = FALSE)
}
get_smooth_module = function(module, ave, p = 2, scale. = 1) {
  # given a gene `module` and a named vector of gene averages (`ave`)
  # it returns a function that generates `n` "similar" random modules
  # `p` is used to scale the negative exponential distances
  # p = 2 -> gaussian kernel around each gene, p = 1 -> laplacian kernel etc
  ix = names(ave) %in% module
  module_mean = ave[ix]
  ave = ave[!ix]
  genes = names(ave)
  function(n)
    vapply(module_mean, function(x) {
      d = abs(x - ave) / scale.
      prob = exp(-d^p)
      sample(genes, n, prob = prob, replace = TRUE)
    }, rep("", n), USE.NAMES = FALSE)
}
get_boot_scores = function(mat, rmodule, boot_size = 100L) 
  apply(rmodule(boot_size), 1L, function(x) colMeans(mat[x, , drop = FALSE]))

scale_rows = function(mat, epsilon = 1e-5) (mat - rowMeans(mat)) / (rowSds(mat) + epsilon)
scale_rows_01 = function(mat) mat / rowMaxs(mat)
scale_rows_rank = function(mat) t(apply(mat[module_genes,], 1L, rank))

simple_score = function(mat, module) {
  # assuming mat is scaled by row
  ix = rownames(mat) %in% module
  out <- as.data.frame(colMeans(mat[ix, , drop = FALSE]) - colMeans(mat[!ix, , drop = FALSE]))
  out$cell <- rownames(out)
  colnames(out) <- c("scores", "cell")
  return(out)
}
boot_seurat = function(mat, module, rmodule, boot_size = 100L, seed = 1234L) {
  set.seed(seed)
  boot = colMeans(mat[which(rownames(mat) %in% module), ]) - get_boot_scores(mat, rmodule, boot_size)
  rowMeans(boot)
}
boot_zval = function(mat, module, rmodule, boot_size = 100L, seed = 1234L) {
  set.seed(seed)
  boot = cbind(colMeans(mat[module, ]), get_boot_scores(mat, rmodule, boot_size))
  boot = scale_rows(boot)
  boot[, 1L]
}
boot_pval = function(mat, module, rmodule, boot_size = 100L, seed = 1234L) {
  set.seed(seed)
  boot = colMeans(mat[module, ]) > get_boot_scores(mat, rmodule, boot_size)
  rowMeans(boot)
}
softmax = function(x, mask = NULL) {
  y = exp(x)
  if (!is.null(mask))
    y[x < mask] = 0
  mass = rowSums(y)
  mass[mass == 0] = 1  # avoid division by 0
  y / mass
}
ayeaton/scRNAscripts documentation built on Nov. 3, 2019, 2:05 p.m.