R/mc.r

Defines functions mcell_mc_split_by_color_group mc_restrict mcell_mc_match_graph mcell_mc_reorder_hc mc_reorder mcell_mc_add_color mcell_mc_add_annot mc_compute_n_bc mc_compute_cov_gc mc_compute_e_gc mc_compute_fp mc_set_outlier_mc mc_update_stats mcell_new_mc

Documented in mc_compute_cov_gc mc_compute_e_gc mc_compute_fp mc_compute_n_bc mcell_mc_add_annot mcell_mc_add_color mcell_mc_match_graph mcell_mc_reorder_hc mcell_mc_split_by_color_group mcell_new_mc mc_reorder mc_restrict mc_set_outlier_mc mc_update_stats

#' Meta cell cover
#'
#' Representing a meta cell cover of a given cell graph (or more generally of a scRNA data matrix)
#'
#' @slot mc assignment of cells to metacell id
#' @slot cell_names names of cells (all other objects use running integres relating to these names)
#' @slot mc_fp a matrix showing for each gene (row) the relative enrichment of umis
#' @slot e_gc a matrix showin
#'
#'
#'
#'
#'
#'                                                                                                                                                                                                                                     g for each gene (row) the mean umi count (without normalizing for cell size)
#' @slot cov_gc a matrix showing for each gene (row) the fraction of cells in the meta cell that are non zero for the umi
#' @slot n_bc a matrix detemrining for each batch (row) the meta cell breakdown
#' @slot annots names of metacells (oridnal numbers by default)
#' @slot colors colors of metacells
#' @slot color_key data.frame defining color per markers genes
#'
#' @export tgMCCov
#' @exportClass tgMCCov
tgMCCov <- setClass(
   "tgMCCov",
	slots = c(
	  mc = "vector",
	  outliers = "vector",
	  cell_names = "vector",
	  mc_fp = "matrix",
	  e_gc = "matrix",
	  cov_gc = "matrix",
	  n_bc = "matrix",
	  annots = "vector",
	  colors = "vector",
	  color_key = "data.frame")
)

#' Construct a meta cell object
#'
#' This constructs a meta cell cover object. It gets an MC assignment (cell->MC_ID), and a matrix, and call standard api of this class to compute the footprints.
#'
#' @param mc assignment of metacell id to cell
#' @param scmat a single cell RNA matrix object
#' @export

setMethod(
  "initialize",
  signature = "tgMCCov",
  definition =
    function(.Object, mc, outliers = character(length = 0), scmat) {
		all_cells = colnames(scmat@mat)
		.Object@mc = mc
		.Object@outliers = outliers
		.Object@cell_names = all_cells
		mc_cells = names(mc)
		if(length(intersect(outliers, mc_cells)) > 0) {
			stop("MC-ERR non zero intersect of outliers and mc assigned cells")
		}
		if(length(all_cells) != (length(mc_cells)+length(outliers))
		| length(setdiff(all_cells, c(mc_cells, outliers))) > 0) {
			stop("matrix cell names and mc assignments+outliers differ in tgMCCov initialization", paste(head(setdiff(all_cells, c(mc_cells, outliers))),collapse=" "))
		}
		max_clust = max(mc)
		.Object@colors = rep("white", n=max_clust)
		.Object@annots= 1:max_clust
		.Object = mc_update_stats(.Object, scmat)
		.Object@color_key = data.frame(group=c(), gene=c(), color=c(), priority=c(), T_fold=c())
      return(.Object)
    }
)

#' Generate a new metacell in scdb
#'
#' This constructs a meta cell cover object and puts it into scdb. It gets an MC assignment (cell->MC_ID), and a matrix, and call standard api of this class to compute the footprints.
#'
#' @param mc_id id of scdb meta cell object ot be added
#' @param mc assignment of metacell id to cell
#' @param outliers the list of outliers 
#' @param scmat a single cell RNA matrix object
#' @export
mcell_new_mc = function(mc_id, mc, outliers, scmat)
{
	scdb_add_mc(mc_id, tgMCCov(mc, outliers, scmat))
}

#' Compute stats over metacell and update the object
#'
#' This compute statistics per metacell given an sc matirx.
#'
#' @param mc a metacell object
#' @param scmat a matrix object to draw statisics from
#' @export
mc_update_stats = function(mc, scmat)
{
	us = scmat@mat[, names(mc@mc)]
	message("add batch counts")
	mc@n_bc = mc_compute_n_bc(mc, scmat)
	message("compute footprints")
	mc@mc_fp = mc_compute_fp(mc, us)
	message("compute absolute ps")
	mc@e_gc= mc_compute_e_gc(mc, us)
	message("compute coverage ps")
	mc@cov_gc = mc_compute_cov_gc(mc, us)
	return(mc)
}

#' Move all cels from specific metacells to the outliers
#'
#' This can be used to treat doublets metacells or other artifacts.
#'
#' @param mc mc object
#' @param mc_ids list of metacells to eliminate
#'
#' @export
mc_set_outlier_mc = function(mc, mc_ids)
{
	miss_mc = setdiff(mc_ids, colnames(mc@mc_fp))
	if(length(miss_mc) != 0) {
		stop("trying to remove non existing mcs, ids ", paste(miss_mc, collapse=" "))
	}
	N = ncol(mc@mc_fp)
	newN = N - length(mc_ids)
	id_map = rep(-1,N)
	id_map[(1:N)[-mc_ids]] = 1:newN
	ocells = names(mc@mc)[which(mc@mc %in% mc_ids)]
	mc@outliers = c(mc@outliers, ocells)
	gcells = setdiff(mc@cell_names, mc@outliers)
	mc@mc = mc@mc[gcells]
	mc@mc = id_map[mc@mc]
	names(mc@mc) = gcells
	drop_f = !(colnames(mc@mc_fp) %in% mc_ids)
	mc@mc_fp = mc@mc_fp[,drop_f]
	colnames(mc@mc_fp) = 1:newN
	mc@e_gc = mc@e_gc[,drop_f]
	colnames(mc@e_gc) = 1:newN
	mc@cov_gc = mc@cov_gc[,drop_f]
	colnames(mc@cov_gc) = 1:newN

#handle cases of single batch (vector.matrix issues)
	tb = mc@n_bc[,drop_f]
	n_bc = matrix(tb, ncol=sum(drop_f))
	rownames(n_bc) = rownames(mc@n_bc)
	mc@n_bc = n_bc
	colnames(mc@n_bc) = 1:newN

	mc@annots = mc@annots[drop_f]
	names(mc@annots) = 1:newN
	mc@colors= mc@colors[drop_f]
	names(mc@colors) = 1:newN
	return(mc)
}


#' Compute metacell gene footprint
#'
#' The footprint is defined as the size-normalized geometric mena of the number of umis per metacells, dvided by the median over all metacells.
#'
#' @param mc a metacell object
#' @param us umi matrix
#' @param norm_by_mc_size normalize by mean total umis and then multiply by median mean mc size (or at least 1000). This means umis per 1000 molecules.
#' @param min_total_umi consider genes with at least min_total_umi total umis
#'
#' @export
mc_compute_fp = function(mc, us, norm_by_mc_size=T, min_total_umi=10)
{
	f_g_cov = rowSums(us) > min_total_umi

	if(0) {
		mc_cores = get_param("mc_cores")
		doMC::registerDoMC(mc_cores)
		all_gs = rownames(us[f_g_cov,])
		n_g = length(all_gs)
		g_splts = split(all_gs, 1+floor(mc_cores*(1:n_g)/(n_g+1)))
		fnc = function(gs) {
						.row_stats_by_factor(us[gs,],
										mc@mc,
										function(y) {exp(rowMeans(log(1+y)))-1}) }

		clust_geomean = do.call(rbind, mclapply(g_splts, fnc, mc.cores = mc_cores))
	}
	clust_geomean = t(tgs_matrix_tapply(us[f_g_cov,], mc@mc,
									  function(y) {exp(mean(log(1+y)))-1}))
	rownames(clust_geomean) = rownames(us)[f_g_cov]

#	clust_geomean = .row_stats_by_factor(us[f_g_cov,],
#									mc@mc,
#									function(y) {exp(rowMeans(log(1+y)))-1})

	if (norm_by_mc_size) {
		mc_meansize = tapply(colSums(us), mc@mc, mean)
		ideal_cell_size = pmin(1000, median(mc_meansize))
		g_fp = t(ideal_cell_size*t(clust_geomean)/as.vector(mc_meansize))
	}
	else {
		g_fp = clust_geomean
	}
	#normalize each gene
	fp_reg = 0.1
	#0.1 is defined here because 0.1*mean_num_of_cells_in_cluster
	#is epxected to be 3-7, which means that we regulairze
	#umicount in the cluster by 3-7.
	g_fp_n = (fp_reg+g_fp)/apply(fp_reg+g_fp, 1, median)

	return(g_fp_n)
}

#' Compute metacell absolute mean umi per cell
#'
#' This compute the genometric mean of the number of umis per cells for each metacell
#'
#' @param mc a metacell object
#' @param us umi matrix
#' @param norm_by_mc_meansize normalize metacells by mean total cell umis
#' @export
mc_compute_e_gc= function(mc, us, norm_by_mc_meansize=T)
{
	f_g_cov = rowSums(us) > 10

	e_gc = t(tgs_matrix_tapply(us[f_g_cov,], mc@mc, function(y) {exp(mean(log(1+y)))-1}))
	rownames(e_gc) = rownames(us)[f_g_cov]

	if (norm_by_mc_meansize) {
		mc_meansize = tapply(colSums(us), mc@mc, mean)
		e_gc = t(t(e_gc)/as.vector(mc_meansize))
	}
	return(e_gc)
}

#' Compute fraction of non zero expressing cells per gene and mc
#'
#'
#' @param mc a metacell object
#' @param us umi matrix
#' @export
mc_compute_cov_gc= function(mc, us)
{
	f_g_cov = rowSums(us) > 10
	cov_gc = t(tgs_matrix_tapply(us[f_g_cov,], mc@mc, function(x) mean(x>0)))
	rownames(cov_gc) = rownames(us)[f_g_cov]
	return(cov_gc)
}

#' Compute distribution of cells over batches and metacell
#'
#' @param mc a metacell object
#' @param scmat scamt object (for the metadata)
#' @export
mc_compute_n_bc= function(mc, scmat)
{
	tb = table(scmat@cell_metadata[names(mc@mc), "amp_batch_id"], mc@mc)
	n_bc = matrix(tb, ncol=dim(tb)[2])
	rownames(n_bc) = dimnames(tb)[[1]]
	return(n_bc)
}

#' Update metacell annotation
#'
#' @param mc a metacell object
#' @param annot annotation vectors
#' @export
mcell_mc_add_annot = function(mc_id, annots)
{
	mc = scdb_mc(mc_id)
	if(is.null(mc)) {
		stop("MC-ERR: missing mc_id when updating annotation id = ", mc_id)
	}
	mc@annots = annots
	scdb_add_mc(mc_id, mc)
}

#' Update metacells  colors
#'
#' @param mc a metacell object
#' @param colors a vector of colors per metacell
#' @export
mcell_mc_add_color= function(mc_id, colors)
{
	mc = scdb_mc(mc_id)
	if(is.null(mc)) {
		stop("MC-ERR: missing mc_id when updating annotation id = ", mc_id)
	}
	mc@colors = colors
	scdb_add_mc(mc_id, mc)
}

#' Reorder metacell data given defined order
#'
#' @param mc metacell object
#' @param ord new order metacells
#'
#' @export

mc_reorder = function(mc, ord)
{
	if(length(ord) != ncol(mc@mc_fp)) {
		stop("MC-ERR: reordering metacells with an order vector shorter than the number of MC in the object")
	}
	ord_map = rep(NA, ncol(mc@mc_fp))
	ord_map[ord] = 1:ncol(mc@mc_fp)
	if(sum(is.na(ord_map)) > 0) {
		stop("MC-ERR: reordering metacells with an order vector lacking some of the ids - generate a new subset metacell object if this is what you wanted to do")
	}
	nms = names(mc@mc)
	mc@mc = ord_map[mc@mc]
	names(mc@mc) = nms
	mc@mc_fp = mc@mc_fp[,ord]
	colnames(mc@mc_fp) = 1:length(ord)
	mc@e_gc = mc@e_gc[,ord]
	colnames(mc@e_gc) = 1:length(ord)
	mc@cov_gc = mc@cov_gc[,ord]
	colnames(mc@cov_gc) = 1:length(ord)
	bnms = rownames(mc@n_bc)
	mc@n_bc = matrix(mc@n_bc[,ord], ncol=length(ord))	#as.matrix to avoid casting on 1 batch data
	rownames(mc@n_bc) = bnms
	colnames(mc@n_bc) = 1:length(ord)
	if(!is.null(mc@annots)) {
		mc@annots = mc@annots[ord]
		names(mc@annots) = 1:length(mc@annots)
	}
	if(!is.null(mc@colors)) {
		mc@colors = mc@colors[ord]
		names(mc@colors) = 1:length(mc@colors)
	}
	return(mc)
}

#' Reorder metacells using hierarchical clustering
#'
#' MEtacells are reorder according to their footprint similarity based on hclust and the reordering using two select antagonistic markers (that can be selected automatically)
#'
#' @param mc_id id of metacell object
#' @param gene_left gene for reordering toward the left side (null by default)
#' @param gene_right gene for reordering toward the right side (null by default)
#' @param gset_blist_id id of gene set to blacklist while ordering (e.g. cell cycle genes)
#'
#' @export

mcell_mc_reorder_hc = function(mc_id,
							gene_left = NULL, gene_right = NULL,
							gset_blist_id = NULL)
{
	mc = scdb_mc(mc_id)
	if(is.null(mc)) {
		stop("MC-ERR: missing mc_id when updating annotation id = ", mc_id)
	}
	gene_folds = mc@mc_fp

	marks = rownames(gene_folds)

	if(!is.null(gset_blist_id)) {
		blist = scdb_gset(gset_blist_id)
		if(is.null(blist)) {
			stop("MC-ERR unknown geneset id ", gset_blist_id, " when blacklisting markers for ordering metacells")
			marks = setdiff(marks, names(blist@gene_set))
		}
	}
	max_gcov = apply(mc@cov_gc[marks,],1,max)
	cov_marks = rownames(mc@cov_gc)[max_gcov > 0.25]
	if(length(intersect(marks, cov_marks)) > ncol(gene_folds)) {
		marks = intersect(marks, cov_marks)
	}

	gene_folds = gene_folds[marks,]

	good_marks = unique(as.vector(unlist(
			apply(gene_folds,
				2,
				function(x)  {
				   names(head(sort(-x[x>0.5]),n=10)) })
		     )))

	if(is.null(good_marks) | length(good_marks) < 4) {
		good_marks= rownames(gene_folds)
	}
	feat = log2(gene_folds[good_marks,])

	hc = hclust(dist(cor(feat)), "ward.D2")

	if(!is.null(gene_right) & !is.null(gene_left)) {
		if(!gene_right %in% marks | !gene_left %in% marks) {
			stop("genes for order polarization ", gene_right, " ", gene_left, " are not in markers")
		}
	}
	if(is.null(gene_right) | is.null(gene_left)) {
		g_ncover = apply(feat > 1, 1, sum)
		main_mark = names(g_ncover)[which.max(g_ncover)]
		f = feat[main_mark,] < 0.25
		if(sum(f) > 0.2*ncol(feat)) {
			g_score = apply(feat[,f]>1, 1, sum)
		} else {
			g_score = -apply(feat, 1, cor, feat[main_mark,])
		}
		second_mark = names(g_score)[which.max(g_score)]
		message("reorder on ", main_mark, " vs ", second_mark)
	}

	d = reorder(as.dendrogram(hc),
				feat[gene_right,]-feat[gene_left,],
				agglo.FUN=mean)
	hc2 = as.hclust(d)

	scdb_add_mc(mc_id, mc_reorder(mc, hc2$order))
}

#' TEst if a graph object cover all cells in the mc
#'
#' mc_id mc object
#' graph_id graph object
#'
#' @export
#'
mcell_mc_match_graph = function(mc_id, graph_id)
{
	mc = scdb_mc(mc_id)
	graph = scdb_cgraph(graph_id)
	cov = intersect(graph@cell_names, mc@cell_names)
	return(length(cov) == length(mc@cell_names))
}

#' Create a metacell object on a subset of the MCs, with all other cells becoming outliers. There is no re-normalization of mc_fp.
#'
#' @param mc metacell object
#' @param foc_mcs list of metacell to keep in the object
#'
#' @export

mc_restrict = function(mc, foc_mcs)
{
	foc_mcs = intersect(as.character(foc_mcs), as.character(1:ncol(mc@mc_fp)))
	all_nms = names(mc@mc)
	foc_nms = names(mc@mc)[mc@mc %in% foc_mcs]
	message("restrict mc from ", length(mc@mc), " to ", length(foc_nms), "cells")
	mc@outliers = union(mc@outliers, setdiff(all_nms, foc_nms))
	mc@mc = mc@mc[foc_nms]
	mc@mc_fp = mc@mc_fp[,foc_mcs]
	mc@e_gc = mc@e_gc[,foc_mcs]
	mc@cov_gc = mc@cov_gc[,foc_mcs]
	colnames(mc@n_bc) = 1:ncol(mc@n_bc)
	mc@n_bc = mc@n_bc[,foc_mcs]
	if(!is.null(mc@annots)) {
		mc@annots = mc@annots[foc_mcs]
	}
	if(!is.null(mc@colors)) {
		names(mc@colors) = 1:length(mc@colors)
		mc@colors = mc@colors[foc_mcs]
	}
	return(mc)
}

#' Splits input metacell object into sub-objects by color group, naming the new metacells <mc_id>_submc_<group>
#'
#' @param mc_id input mc object
#' @param mat_id mat object corresponsing to mc_id
#' @param min_cells_per_sub_mc minimum number of cells per group required to create a new metacell
#' @param col2grp mapping of mc colors to groups, by default, use the color_key slot
#'
#' @export
#'
mcell_mc_split_by_color_group = function(mc_id, mat_id, min_cells_per_sub_mc=500, col2grp=NULL)
{
	mat = scdb_mat(mat_id)
	if(is.null(mat)) {
		stop("MC-ERR: missing mat_id when splitting by color group = ", mat_id)
	}

	mc = scdb_mc(mc_id)
	if(is.null(mc)) {
		stop("MC-ERR: missing mc_id when splitting by color group = ", mc_id)
	}

	if (is.null(col2grp)) {
		ucolkey =  unique(mc@color_key[, c('group', 'color')])
		col2grp = ucolkey$group
		names(col2grp) = ucolkey$color
	}

	cg = split(names(mc@mc), col2grp[mc@colors[mc@mc]])
	for (gr in names(cg)) {
		nms = cg[[gr]]
		if (length(nms) >= min_cells_per_sub_mc) {
			message(sprintf("splitting %s, creating %s_submc_%s with %d cells", mc_id, mc_id, gr, length(nms)))
			mc_map = 1:length(unique(mc@mc[nms]))
			names(mc_map) = names(table(mc@mc[nms]))

			dst_mc = mc_map[as.character(mc@mc[nms])]
			names(dst_mc) = nms

			new_id = paste(mc_id, "submc", gr, sep="_")

			mcell_mat_ignore_cells(new_id, mat_id, nms, reverse=T)
			mcell_add_gene_stat(new_id, new_id)

			mcell_new_mc(mc_id = new_id,
									 mc = dst_mc,
									 outliers = character(0), #setdiff(c(mc@outliers, names(mc@mc)), nms),
									 scmat = scdb_mat(new_id))
		}
	}
}
tanaylab/metacell documentation built on Oct. 19, 2023, 1:01 p.m.