R/analyze_clustering.R

Defines functions analyze_clustering

Documented in analyze_clustering

#' This function analyzes ST data based on clustering generated by cluster_counts_OL. 
#' It calculates a list of genes specific to a certain cluster and identifies differentually expressed genes
#' across clusters
#'
#' @param counts non-negative numeric matrix containing gene counts, 
#' rows correspond to spots, columns correspond to genes
#' @param ids data frame or matrix assigning spatial coordinates to the spots (see process_input() for details)
#' @param clustering list containing clustering information as returned by cluster_counts_OL (numeric vector assigning spots to clusters)
#' @param sig.level significance level for the t-test between clusters; genes with p-values above this threshold
#' will be removed from output; default 0.05
#' @param lasso.data output of build_lassos function
#' @param deg.weight numeric, weight of the ranking of genes by differential gene expression in the
#' overall ranking; default 2
#' @param lls.weight numeric, weight of the ranking of genes by lls fit in the
#' overall ranking; default 3
#' @param entropy.weightnumeric, weight of the ranking of genes by entropy in the
#' overall ranking; default 1
#' @param build.lasso logical, should lasso models be calculated? only used if lasso.data is NULL, default FALSE
#' @param gamma numeric, gamma value passed to build_lassos. Determines ration of sparsity and smoothness in the model, default 3
#' @param ncores integer, number of cores used for lasso calculation; default 4
#' @param reduce logical, default TRUE. Return only genes passing certain thresholds in each ranking metric? Large runtime improvement in case build.lasso is TRUE.
#' @param normalize logical, should counts be normalized with sctransform prior to differential expression testing? default FALSE
#' @param verbose logical, default TRUE
#' @return list with 2 entries:\cr
#' 1) differential genes - data frame containing information (name, ckuster, p-value, up-/downregulation) for differentially expressed genes. Ordered by gene rank.\cr
#' 2) lasso_data
#' @export

analyze_clustering <- function(counts, ids, clustering, sig.level = 0.001, lasso.data = NULL, deg.weight = 2, lls.weight = 3, entropy.weight = 1, build.lasso = FALSE, gamma = 3, ncores = 4, reduce = TRUE, normalize = FALSE, verbose = TRUE){
    # parameter checks  
  
    if(verbose) cat("calculating gene-wise entropy...\t")
    # find cluster-specific genes by calculating total RNA per cluster and gene
    # use the lasso model if available
    if(is.null(lasso.data)){
      cluster.libs <- matrix(0, ncol=ncol(counts), nrow = length(unique(clustering)))
      colnames(cluster.libs) <- colnames(counts)
      rownames(cluster.libs) <- unique(clustering)

      for(cl in rownames(cluster.libs)){
          cluster.libs[cl,] <- colSums(counts[which(clustering == as.numeric(cl)),,drop=F])
      }
    }else{
      cluster.libs <- matrix(0, ncol=ncol(lasso.data$counts), nrow = length(unique(clustering)))
      colnames(cluster.libs) <- colnames(lasso.data$counts)
      rownames(cluster.libs) <- unique(clustering)

      # clustering is by default in the same order as columns in counts
      # lasso order is probably different
      for(cl in rownames(cluster.libs)){
  	      names(clustering) <- rownames(counts)
          cluster.libs[cl,] <- colSums(lasso.data$counts[which(clustering[rownames(lasso.data$counts)] == as.numeric(cl)),,drop=F])
      }
    }

    # cannot handle negative expression for entropy
    # shift expressions if necessary
    if(any(cluster.libs < 0)){
	    cluster.libs <- cluster.libs - min(cluster.libs)
    }
    # reduce to genes that are not 0 overall
    cluster.libs <- cluster.libs[, which(colSums(cluster.libs) > 0)]
    # scale to 1
    cluster.libs <- sweep(cluster.libs, 2, colSums(cluster.libs), "/")

    # create entropy vector (entropy for each spot)
    specific <- sapply(colnames(cluster.libs), function(x){
        if(! sum(cluster.libs[,x]) > 0) return(NA)
        # if there are 0 entries for a gene, these need to be excluded
        if(any(cluster.libs[,x] == 0)){
            entropy <- -sum(cluster.libs[-which(cluster.libs[,x] == 0), x] * log2(cluster.libs[-which(cluster.libs[,x] == 0), x]))
        }else{
            entropy <- -sum(cluster.libs[, x] * log2(cluster.libs[, x]))
        }
        return(entropy)
    })
    if(verbose) cat("done\n")
    names(specific) <- colnames(cluster.libs)
    
    # remove NA genes
    if(any(is.na(specific))){
    	specific <- specific[-which(is.na(specific))]
    }
    
    pdf("entropies.pdf")
    hist(specific, breaks = 100)
    dev.off()
    #if(reduce){
    #  # keep only genes with entropy smaller or equal to the mean value
    #  specific <- specific[which(specific <= summary(specific)[4])]
    #  if(verbose) cat(length(specific), " genes passed entropy threshold\n", sep = "")
    #}else{
    #  if(verbose) cat(length(which(specific <= summary(specific)[3])), " genes below mean entropy\n", sep = "")
    #}


    if(verbose) cat("testing for differential gene expression...\t")
    # implement testing with multtest for differentially expressed genes
    # test each cluster against all other clusters at the same time
    
    # normalize counts if required
    if(normalize && !any(counts < 0)){
	    deg.counts <- normalize_counts(counts)
    }else{
	    deg.counts <- counts
    }
    
    test.results <- list()
    
        p.vals <- sapply(colnames(deg.counts), function(g, count.mat = deg.counts, labs = as.factor(clustering)){
    		summary(aov(count.mat[, g] ~ labs))[[1]][5][1,1]
    	}) 
        # if test not possible. set the genes p-value to 1 (max)
        if(any(is.na(p.vals))){
            p.vals[is.na(p.vals)] <- 1
        }
        if(any(is.nan(p.vals))){
            p.vals[is.nan(p.vals)] <- 1
        }
        # sort and store
        p.vals <- sort(p.vals)
    
    if(verbose) cat("done\n")

    # right now all.genes is actually all genes, but if a significance cutoff
    # were used above, this would be needed
    #all.genes <- as.vector(sapply(test.results, function(x){names(x)}))
    all.genes <- names(p.vals)
    if(any(is.na(all.genes)))
    all.genes <- all.genes[!is.na(all.genes)]
    all.genes <- unique(all.genes)

    if(verbose) {
      if(reduce) cat("Assign genes to clusters and remove insignificant genes")
      else cat("Assign genes to clusters")
    }
    # sort the clustering vector to have consistent order
    clusters <- sort(unique(clustering))
    
    if(reduce){
    	p.vals <- p.vals[p.vals <= sig.level]
    }

    # reduced information to minimum p-value, corresponding cluster, gene name and up/downregulation info
    gene.info <- c()
    for(i in 1:length(p.vals)){
        g <- names(p.vals)[i]
        p <- p.vals[i]
  	
        # assign genes to downregulated clusters only if that gene's mean expression in the 
        # remaining clusters is closer to the expression in the upregulated than in the 
        # downregulated cluster
        regulations <- sapply(clusters, function(cl){mean(counts[which(clustering == cl),g]) - mean(counts[which(clustering != cl), g])})
      	
      	 cl <- clusters[which.max(regulations)]
    	
        reg <- 1	
	gene.info <- rbind(gene.info,
                	   c(g, cl, p, reg)
        	          )
    }
    # coerce to data frame, name columns and adjust variable types
    gene.info <- as.data.frame(gene.info)
    colnames(gene.info) <- c("gene", "cluster", "pVal", "regulation")
    gene.info$pVal <- as.numeric(as.character(gene.info$pVal))
    gene.info$regulation <- as.numeric(as.character(gene.info$regulation))
    rownames(gene.info) <- as.character(gene.info$gene)

    if(verbose){
      cat("done\n")
      if(reduce) cat(nrow(gene.info), " genes passed the significance threshold\n", sep = "")
      else cat(sum(gene.info$pVal < sig.level), " genes below significance threshold\n", sep = "")
    }

    gene.info <- filter_genes(
      gene.info, 
      specific, 
      lasso.data, 
      deg.weight = deg.weight, 
      lls.weight = lls.weight, 
      entropy.weight = entropy.weight,
      build.lasso = build.lasso, 
      ncores = ncores, 
      gamma = gamma, 
      counts = counts, 
      ids = ids,
      reduce = reduce,
      verbose = verbose
      )
    if(is.null(gene.info)) return(NULL)

    return(list(
        differential_genes = gene.info$diff.genes,
        lasso_data = gene.info$lasso
	))
}
tmirus/TTT documentation built on April 17, 2021, 11:04 p.m.