## ------------------------------------------------------------------------
#' Replace missing values in the metadata, issuing a warning if changes are made.
#'
#' @param object Seurat object. Each column of \code{object@data.info} is scanned for NA's.
#' @param filler Value substituted for each NA. Default is the string "NA".
#'
#' @return The same Seurat object, with every NA in \code{@data.info} replaced by \code{filler}.
#'
#' @export
#'
FillNA = function( object, filler = "NA" ){
  varnames = object@data.info %>% names
  missing_list = character(0)
  # Replace the NA entries of a vector with `filler`.
  na2filler = function(x){
    x[is.na(x)] = filler
    return(x)
  }
  for( var in varnames ){
    if( any( is.na( object@data.info[[var]] ) ) ){
      missing_list %<>% c(var)
      object@data.info[[var]] %<>% na2filler
    }
  }
  # Warn once, listing every affected variable, so the caller knows data were altered.
  if( length( missing_list ) > 0 ){
    warning( paste0( "Missing values found in these variables: \n",
                     paste0( missing_list, collapse = "\n" ),
                     "\nReplacing with ", filler, " .\n\n" ) )
  }
  return( object )
}
#' Get available variable names (genes, identity classes, PCA embeddings, etc)
#'
#' @param object Seurat object
#' @return Returns a character vector of all eligible inputs to the `vars.all` argument of `FetchData`.
#' @export
AvailableData = function( object ){
  # Collect candidate names category by category, then flatten into one
  # de-duplicated character vector (equivalent to union-ing the categories).
  candidate_sets = list( metadata = names( object@data.info ),
                         PCs      = names( object@pca.x ),
                         tsne     = names( object@tsne.rot ),
                         ICs      = names( object@ica.rot ),
                         genes    = rownames( object@data ),
                         ident    = "ident" )
  all_names = unique( unname( unlist( candidate_sets ) ) )
  return( all_names )
}
#' FetchData but with zeroes for unavailable genes
#'
#' @export
#' @param dge Seurat object
#' @param vars.all List of all variables to fetch. Missing entries are ignored.
#' @param warn If TRUE (default), warn when any requested variable is unavailable.
#' @param ... Other arguments to pass to FetchData
#'
#' @return A data.frame with one column per entry of vars.all (in the requested
#' order); unavailable variables become all-zero columns.
#'
#' @details This function is stupid: if you ask for "PC1" and it's not available,
#' it will think you're asking for a non-expressed gene, so it will return zeroes.
FetchDataZeroPad = function( dge, vars.all, warn = TRUE, ... ){
  vars.all = vars.all[complete.cases(vars.all)]
  # Compute availability once and split the request into fetchable vs zero-padded.
  available_data = AvailableData( dge )
  avail   = intersect( vars.all, available_data )
  unavail = setdiff(   vars.all, available_data )
  if( warn && length(unavail) > 0 ){
    warning("Some variables are not available. Returning zeroes.\n")
  }
  to_return = FetchData( dge, avail, ... )
  # One all-zero column per unavailable variable, rownames matching the fetched cells.
  pad = as.data.frame( matrix( 0,
                               nrow = nrow( to_return ),
                               ncol = length( unavail ),
                               dimnames = list( rownames( to_return ),
                                                unavail ) ) )
  to_return = cbind( to_return, pad )
  # BUGFIX: are_equal() merely RETURNS a logical; the old code discarded it, so the
  # check never fired. Wrap it in assert_that so a mismatch actually errors.
  assertthat::assert_that( assertthat::are_equal( sort( vars.all ),
                                                  sort( colnames( to_return ) ) ) )
  # Restore the caller's requested column order.
  to_return = to_return[ , vars.all, drop = FALSE]
  assertthat::assert_that( is.data.frame( to_return ) )
  return( to_return )
}
#' Subset data flexibly from a Seurat object.
#'
#' @param dge Seurat object
#' @param vars.use Variables to fetch for use in `predicate`.
#' @param predicate String to be parsed into an R expression and evaluated as an arg to base::subset.
#' @param preserve_raw If TRUE, then it will not exclude any cells from the @raw.data slot.
#' By default, it leaves cells in @raw.data only if they satisfy the given predicate.
#' @param show_on_tsne If true, save a tSNE showing which cells are kept.
#' @param results_path Where to save the tSNE.
#' @param ... Extra params passed to tsne_colored.
#' @details Calls FetchData, subsets it as a dataframe using base::subset, and
#' subsets the Seurat object correspondingly (using the df rownames.)
#'
#' @export
#'
SubsetDataFlex = function( dge, vars.use, predicate, preserve_raw = FALSE,
                           results_path = NULL, show_on_tsne = !is.null(results_path), ... ){
  if( typeof(predicate) != "character"){
    # Surface misuse as a warning rather than print(): warnings reach logs and
    # can be escalated via options(warn = 2); parse(text=) may still coerce.
    warning("predicate should be a character vector. It will be parsed into `subset` as an R expression.")
  }
  df = FetchData(dge, vars.use) %>% as.data.frame
  # Cell names (rownames) satisfying the predicate.
  cu = df %>% subset(eval(parse(text=predicate))) %>% rownames
  if( show_on_tsne ){
    # Mark kept cells and save a diagnostic tSNE before subsetting.
    dge@data.info$included = (rownames(dge@data.info) %in% cu) %>% as.character
    tsne_colored( dge, results_path, colour = "included", ... )
  }
  dge = SubsetData(dge, cells.use = cu)
  if( !preserve_raw ){
    # Shrink @raw.data to the surviving cells as well.
    dge@raw.data = deseuratify_raw_data(dge)
  }
  return( dge )
}
#' Merge two Seurat objects.
#'
#' By default, preserves any column of @data.info shared between both objects.
#' You can also specify what variables to keep. They will be added to data.info in
#' the output, warning and padding with zeroes if either object lacks any var in vars.keep.
#'
#' @param dge1 Seurat object.
#' @param dge2 Seurat object.
#' @param preserve_ident If TRUE, carry each cell's identity class into the merged object.
#' @param vars.keep Metadata variables to carry over; defaults to those present in both objects.
#'
#' @export
#'
SeuratMerge = function( dge1, dge2, preserve_ident = F,
                        vars.keep = intersect(names(dge1@data.info),
                                              names(dge2@data.info) ) ){
  # Save on memory
  dge1@scale.data = matrix()
  dge2@scale.data = matrix()
  gc()
  # Do merging
  dge_all = list( dge1 = deseuratify_raw_data( dge1 ),
                  dge2 = deseuratify_raw_data( dge2 ) ) %>%
    dge_merge_list %>% seuratify_thy_data
  characterize_factor = function(x){ if(is.factor(x)) as.character(x) else x }
  all_factors_to_char = function(X) data.frame(lapply(X, characterize_factor), stringsAsFactors=FALSE)
  if( length(vars.keep) > 0 ){
    # Convert factors to character BEFORE rbind so mismatched factor levels
    # between the two objects cannot corrupt the values.
    preserved_metadata = rbind( FetchDataZeroPad( dge1, vars.keep ) %>% all_factors_to_char,
                                FetchDataZeroPad( dge2, vars.keep ) %>% all_factors_to_char )
    preserved_metadata %<>% as.data.frame
    rownames(preserved_metadata) = c( rownames(dge1@data.info), rownames(dge2@data.info))
    # BUGFIX: this dataframe was previously computed but never attached, so
    # vars.keep silently vanished from the merged object despite the roxygen
    # promise above.
    dge_all %<>% AddMetaData( metadata = preserved_metadata )
  }
  if(preserve_ident){
    new_ident = c( characterize_factor(dge1@ident),
                   characterize_factor(dge2@ident) )
    names(new_ident) = c(names(dge1@ident), names(dge2@ident))
    dge_all %<>% SetIdent(new_ident, cells.use = names(new_ident) )
  }
  return(dge_all)
}
#' Make small-multiple pie charts.
#'
#' @param dge Seurat object
#' @param ident.use Becomes the categories in the pie chart
#' @param facet_by Each small multiple contains cases at one level of this variable.
#' @param do.test If TRUE, each ident.use category is tested for a quadratic trend in its
#' percentage over "eday", and log10 p-values are appended to the legend labels.
#' Defaults to TRUE whenever more than one distinct eday is present.
#' @param col Optional colorscale.
#' @param label Logical. If TRUE, percentages are added.
#' @param main Plot title.
#' @param drop_levels If TRUE, omit facets that would be empty.
#'
#' @export
#'
SeuratPie = function( dge, ident.use = "cell_type", facet_by = "eday",
                      do.test = FetchDataZeroPad(dge, "eday")[[1]] %>% unique %>% length %>% is_greater_than(1),
                      col = NULL, label = F,
                      main = "Sample makeup by day", drop_levels = F ){
  cell_types = FetchData(dge, ident.use)[[1]] %>% unique
  if( length(cell_types) < 2 ){
    stop("\n SeuratPie cannot handle cases with only one level in ident.use.\n")
  }
  #### Testing
  # Test each cluster for a quadratic trend in pct by eday, weighted by the number of cells at each eday.
  if(do.test){
    # Assemble percentages for testing: one row per (category, sample) cell of the
    # contingency table, with counts converted to within-sample percentages.
    pct_for_testing = FetchData( dge, c( ident.use, "orig.ident" )) %>%
      table %>%
      apply(2, percentify) %>%
      (reshape2::melt) %>%
      plyr::rename(c("value" = "percent"))
    ncells = FetchData( dge, "orig.ident" ) %>% table
    # Fill in eday based on sample id
    map_to_eday = setNames(get_metadata()$eday,
                           get_metadata()$Sample_ID %>% as.character )
    map_to_eday %<>% na.omit
    # Round-trip through as.numeric while preserving names (na.omit leaves
    # attributes behind that would otherwise persist).
    nm = names( map_to_eday )
    map_to_eday %<>% as.numeric
    names(map_to_eday) = nm
    pct_for_testing$eday = map_to_eday[pct_for_testing$orig.ident %>% as.character]
    pct_for_testing$eday_sq = pct_for_testing$eday ^ 2
    # Quadratic fit, weighted by day, tested against null model
    pvals = rep(1, length(cell_types))
    names(pvals) = cell_types
    for( cl in cell_types ){
      this_cluster_data = subset( pct_for_testing, eval(parse(text = ident.use)) == cl )
      mod = lm( data = this_cluster_data, formula = percent~eday + eday_sq, weights = ncells )
      # Joint test that both trend coefficients are zero (i.e. no trend at all).
      test_result = car::linearHypothesis(mod, c("eday = 0", "eday_sq = 0" ))
      pvals[cl] = test_result$`Pr(>F)`[[2]]
    }
  }
  #### Plotting
  # Get percentages by facet
  X = FetchData( dge, c( ident.use, facet_by )) %>%
    table %>%
    apply(2, percentify) %>%
    (reshape2::melt) %>%
    plyr::rename(c("value" = "percent"))
  if( drop_levels ){
    X %<>% subset( percent != 0 )
  }
  facet_values = FetchData( dge, facet_by )[[1]]
  if(is.factor(facet_values) & !drop_levels){
    # Restore the original (ordered) factor levels so empty facets still appear.
    X[[facet_by]] %<>% as.character %>% factor(levels = levels(facet_values), ordered = T)
  }
  # Position percentages decently: place each label at the midpoint of its wedge.
  X$at = 0
  X = X[order(X[[ident.use]]), ]
  for(facet_level in unique(X[[facet_by]])){
    idx = (X[[facet_by]] == facet_level)
    X[idx, "at"] = 100 - ( cumsum(X[idx, "percent"]) - X[idx, "percent"]/2 )
  }
  # Pie charts require stat=identity and x=constant
  p = ggplot(X) + ggtitle( main) +
    geom_bar( aes_string( y = "percent", x = "factor('')", fill = ident.use ),
              position = "stack", stat='identity' ) +
    coord_polar(theta = "y") + xlab("") + ylab("") +
    facet_wrap(facet_by, nrow = 1) + theme(axis.ticks = element_blank(),
                                           axis.text.y = element_blank(),
                                           axis.text.x = element_blank())
  # NOTE(review): a second scale_fill_manual is added unconditionally below and
  # will replace this one, making this line appear redundant -- confirm intent.
  if(!is.null(col)){p = p + scale_fill_manual( values = col ) }
  if( label ) { p = p + geom_text( aes( y = at, x = 1.5, label = percent ) ) }
  if(do.test){
    pval_text = paste0(" (", round(log10(pvals), 1), ")")
  } else {
    pval_text = ""
  }
  # Legend labels carry the log10 p-values when do.test is TRUE.
  p = p +
    scale_fill_manual( name="Cell type (Log10 p)",
                       values = col,
                       breaks=cell_types,
                       labels=paste0(cell_types, pval_text))
  return(p)
}
#' Test for markers flexibly from a Seurat object.
#'
#' Calls FindMarkers with extra features.
#'
#' @param object Seurat object.
#' @param ident.use Fetched via FetchData to define the groups being tested. Should obey
#' @param ident.1 Level(s) of ident.use forming the first group; multiple levels are
#' collapsed into one group (names joined with underscores).
#' @param ident.2 Level(s) forming the comparison group; defaults to every level not in ident.1.
#' @param order_by_var Column of the result table used to sort the output (decreasing).
#' @param thresh.use Minimum absolute difference in group means for a gene to be tested.
#' @param test.use Passed into FindMarkers unless it is "binomial_batch", in which case
#' it uses approximate p-values based on a binomial glmm with a random effect for batch (1|orig.ident).
#' @param genes.use Genes eligible for testing; intersected with AvailableData(object).
#' @param min.pct A gene is tested only if expressed in at least this fraction of cells
#' in one of the two groups.
#'
#' All other parameters are passed into FindMarkers unless test.use=="binomial_batch", in
#' which case I attempt to match the behavior of FindMarkers.
#'
#' Output contains an extra column for q-values from p.adjust(..., method="fdr").
#'
#' @export
#'
FindMarkersFlex = function( object,
                            ident.use, ident.1,
                            ident.2 = object %>% FetchData(ident.use) %>% extract2(1) %>% unique %>% setdiff(ident.1),
                            order_by_var = "avg_diff",
                            thresh.use = 0.25,
                            test.use = "binomial_batch",
                            genes.use = object@data %>% rownames,
                            min.pct = 0.1, ... ){
  # This chunk handles ident.1 or .2 of length greater than 1 by collapsing them both with underscores.
  new_ident = FetchData( object, ident.use )[[1]] %>% as.character
  names(new_ident) = names(object@ident)
  new_ident[new_ident %in% ident.1] = paste0(ident.1, collapse = "_")
  new_ident[new_ident %in% ident.2] = paste0(ident.2, collapse = "_")
  ident.1 = paste0(ident.1, collapse = "_")
  ident.2 = paste0(ident.2, collapse = "_")
  object %<>% AddMetaData(metadata = new_ident, col.name = ident.use)
  # To interface with Seurat, the @ident slot gets overwritten with the groups for the expression test.
  object %<>% Seurat::SetIdent(ident.use = new_ident)
  # Slice the object down to just the relevant cells, to save time and reduce code complexity downstream.
  predicate = paste0(ident.use, " %in% c( '", ident.1, "', '", paste0(ident.2, collapse = "', '"), "' )")
  object %<>% SubsetDataFlex(vars.use = ident.use, predicate)
  genes.use %<>% intersect(AvailableData(object))
  if (test.use == "binomial_batch") {
    cat(" \n Computing summaries... \n")
    x = data.frame( gene = genes.use, stringsAsFactors = F )
    rownames( x ) = x$gene
    # Per-group mean expression and fraction of expressing cells, genes in rows.
    group_means = aggregate_nice( x = FetchData(object, genes.use),
                                  by = FetchData(object, ident.use),
                                  FUN = mean ) %>% t
    group_pcts = aggregate_nice( x = FetchData(object, genes.use),
                                 by = FetchData(object, ident.use),
                                 FUN = prop_nz ) %>% t
    x$avg_diff = group_means[, ident.1] - group_means[, ident.2]
    x$pct.1 = group_pcts[, ident.1]
    x$pct.2 = group_pcts[, ident.2]
    # Pre-filter genes by effect size and detection rate, mirroring FindMarkers.
    x = subset(x, abs(avg_diff) > thresh.use & ( pct.1 > min.pct | pct.2 > min.pct ) )
    cat(" Computing p-values... \n")
    # Binomial GLMM on detection (expression > 0) with a random intercept per
    # batch (orig.ident); the p-value tests the group-membership coefficient.
    get_p = function( gene ) {
      data = FetchData(object, c(gene, ident.use, "orig.ident"))
      data[[gene]] %<>% is_greater_than(0)
      data[[ident.1]] = data[[ident.use]] %in% ident.1
      # make.names so gene symbols with special characters survive as formula terms.
      colnames(data) = make.names(colnames(data))
      mod = lme4::glmer(formula = paste0( colnames(data)[1], " ~ (1|orig.ident) + ", colnames(data)[4] ) ,
                        family = "binomial", data = data )
      mod_p = car::linearHypothesis( mod, hypothesis.matrix = paste0( make.names(ident.1), "TRUE = 0" ) )
      cat(".")
      return( mod_p$`Pr(>Chisq)`[[2]] )
    }
    # Fit in parallel; any model failure is recorded as NA rather than aborting.
    x$p.value = parallel::mclapply( x$gene,
                                    function(s) {
                                      tryCatch(get_p(s), error = function(e) NA)
                                    }) %>% simplify2array()
    failures = is.na(x$p.value)
    cat(" ", sum( failures ), " failed tests out of ", nrow(x),
        ". Setting failures to 1 for conservative FDR control. \n" )
    x$p.value[failures] = 1
  } else {
    x = Seurat::FindMarkers( object, ident.1 = ident.1, ident.2 = ident.2,
                             test.use = test.use,
                             genes.use = genes.use,
                             thresh.use = thresh.use,
                             min.pct = min.pct, ... )
  }
  # FindMarkers names its p-value column p_val; standardize on p.value.
  if(is.null(x$p.value)){x$p.value = x$p_val}
  if( !is.null( x$p.value ) ){
    x$q.value = p.adjust( x$p.value, method = "fdr" )
  }
  x = x[order(x[[order_by_var]], decreasing = T), ]
  x$gene = rownames(x)
  return( x )
}
#' Sanitize gene names via `make.names`
#'
#' @param dge Seurat object. Gene names in @raw.data, @data, @scale.data, and
#' @var.genes are passed through make.names so they stay consistent with each other.
#'
#' @export
#'
SanitizeGenes = function( dge ){
  rownames( dge@raw.data )   %<>% make.names
  rownames( dge@data )       %<>% make.names
  rownames( dge@scale.data ) %<>% make.names
  # BUGFIX: @var.genes is itself a character vector of gene names. The old code
  # sanitized names(dge@var.genes) -- normally NULL -- leaving @var.genes out of
  # sync with the sanitized rownames above (and erroring on unnamed vectors).
  dge@var.genes %<>% make.names
  return( dge )
}
## ------------------------------------------------------------------------
#' Make a FACS-like plot from a single-cell rna-seq dataset.
#'
#' @param dge Seurat object
#' @param gene1 Horizontal axis on plot mimics this gene. Character, usually length 1 but possibly longer.
#' @param gene2 Vertical axis on plot mimics this gene. Character, usually length 1 but possibly longer.
#' @param cutoffs If given, divide plot into four quadrants and annotate with percentages. Numeric vector of length 2.
#' @param return_val If "all", returns a list with several internal calculations revealed.
#' If "plot", returns just a ggplot object. If "seurat", returns a Seurat object with gene scores added.
#' @param density If TRUE, plot contours instead of points.
#' @param facet_by Optional variable (fetched via FetchData) used to facet the plot.
#' @param include_panel_with_all If TRUE, prepend one extra facet containing all cells.
#' @param facet_levels Display order of facet panels; defaults to the factor levels of
#' facet_by, preceded by "all" when include_panel_with_all is TRUE.
#' @param col Named color vector, one entry per facet level (the "all" panel gets black).
#' @param num_genes_add Each axis shows an average of similar genes. This is how many (before removing overlap). Integer.
#' @param genesets_predetermined If FALSE, plot the average of many genes similar to gene1 instead of gene1 alone (same
#' for gene2). See ?get_similar_genes. If TRUE, plot the average of only the genes given.
#' @param dge_reference Seurat object. This function relies on gene-gene correlation. If your dataset is perturbed in a way
#' that would substantially alter gene-gene correlations, for example if different time points are present or certain
#' cell types are mostly depleted, you can feed in a reference dge, and TACS will choose axes based on the reference data.
#' @param ... Extra params for stat_density2d.
#'
#' This function is based on a simple scheme: choose genes similar to the ones specified
#' and average them to reduce the noise.
#'
#' @export
#'
TACS = function( dge, gene1, gene2, cutoffs = NULL,
                 return_val = "plot", density = F,
                 facet_by = NULL,
                 include_panel_with_all = FALSE,
                 facet_levels =
                   FetchData(dge, facet_by)[[1]] %>% factor %>% levels %>%
                   c(rep("all", include_panel_with_all), .),
                 col = stats::setNames( scales::hue_pal()( length( facet_levels ) - include_panel_with_all ),
                                        facet_levels[ ( include_panel_with_all + 1 ) : length( facet_levels )] ),
                 num_genes_add = 100, genesets_predetermined = F, dge_reference = dge, ... ){
  # Get gene sets to average
  if(genesets_predetermined){
    g1_similar = gene1
    g2_similar = gene2
  } else {
    # Expand each axis to its num_genes_add most similar genes (computed on the
    # reference object), then drop genes shared by both axes so the two scores
    # remain distinguishable.
    g1_similar = get_similar_genes(dge_reference, gene1, num_genes_add) %>% c( gene1, . )
    g2_similar = get_similar_genes(dge_reference, gene2, num_genes_add) %>% c( gene2, . )
    shared = intersect(g1_similar, g2_similar)
    g1_similar %<>% setdiff(shared)
    g2_similar %<>% setdiff(shared)
  }
  # Average gene sets to get per-cell scores (zero-padded for absent genes).
  g1_score = rowMeans(FetchDataZeroPad(dge, g1_similar))
  g2_score = rowMeans(FetchDataZeroPad(dge, g2_similar))
  g1_score_name = paste0(gene1[1], "_score")
  g2_score_name = paste0(gene2[1], "_score")
  # Add scores as metadata. Extract with faceting var into plotting data.
  dge %<>% AddMetaData(g1_score, col.name = g1_score_name)
  dge %<>% AddMetaData(g2_score, col.name = g2_score_name)
  plot_df = FetchData(dge, c(g1_score_name, g2_score_name, facet_by))
  # Augment data to form extra panel with everything
  if( include_panel_with_all ){
    plot_df_all = plot_df
    plot_df_all[[facet_by]] = "all"
    plot_df = rbind(plot_df, plot_df_all)
    col = c(col, "all"="black")
  }
  # Prepare to facet
  if(!is.null(facet_by)) {
    plot_df[[facet_by]] %<>% factor(levels = facet_levels, ordered = T) %>% droplevels
  }
  # Form plot
  p = ggplot(plot_df)
  if(density){
    p = p + stat_density2d( aes_string( x = g1_score_name, y = g2_score_name,
                                        colour = facet_by, alpha = "..level.." ), bins = 50 ) +
      scale_alpha_continuous( range = c(0.4, 1) ) +
      scale_color_manual(values = col)
  } else {
    p = p + geom_point( aes_string( x=g1_score_name, y=g2_score_name ) )
  }
  # Anchor both axes at zero, as on a FACS plot.
  p = p + expand_limits(y=0, x=0)
  # Facet if desired
  if(!is.null(facet_by)) {
    p = p + facet_wrap(as.formula(paste0("~", facet_by)))
  }
  # Add quadrants and percentages
  if( !is.null(cutoffs)){
    p %<>% add_quadrants(g1_score_name = g1_score_name,
                         g2_score_name = g2_score_name,
                         cutoffs = cutoffs,
                         facet_by = facet_by)
  }
  # Return everything or just a plot or just a seurat object
  if( return_val == "all" ){
    return( list( plot = p,
                  dge = dge,
                  genes = list( gene1 = gene1, gene2 = gene2 ),
                  score_names = c( g1_score_name, g2_score_name ),
                  genesets = list( g1_similar, g2_similar ),
                  cutoffs = cutoffs,
                  plot_df = plot_df ) )
  } else if( return_val == "seurat" ){
    return(dge)
  } else if( return_val == "plot" ){
    return( p )
  } else {
    stop(" return_val should be 'all', 'seurat', or 'plot'. ")
  }
}
#' Split a scatterplot (or similar) into quadrants and label percentages in each quadrant.
#'
#' @param p ggplot object whose $data contains the two score columns named below.
#' @param g1_score_name Name of the column plotted on the horizontal axis.
#' @param g2_score_name Name of the column plotted on the vertical axis.
#' @param cutoffs numeric vector of length 2. Where to delineate the quadrants.
#' @param facet_by optional facetting variable. Percents are calculated separately for each facet.
#'
#' This is a helper for TACS, but it's exported in case it becomes useful.
#'
#' @export
#'
add_quadrants = function(p, g1_score_name, g2_score_name, cutoffs, facet_by = NULL){
  # Draw the quadrant boundaries.
  p = p + geom_vline(data = data.frame(xint=cutoffs[1]), aes(xintercept=xint))
  p = p + geom_hline(data = data.frame(yint=cutoffs[2]), aes(yintercept=yint))
  # Tabulate cells by (above cutoff 1, above cutoff 2[, facet]) and convert
  # counts to percentages -- within each facet when faceting.
  percentages = p$data[c(g1_score_name, g2_score_name, facet_by)]
  percentages[[g1_score_name]] %<>% is_greater_than(cutoffs[1])
  percentages[[g2_score_name]] %<>% is_greater_than(cutoffs[2])
  percentages %<>% table %>% (reshape2::melt)
  if(!is.null(facet_by)) {
    percentages = percentages[order(percentages[[facet_by]]), ]
    for( facet_level in unique(p$data[[facet_by]])){
      percentages[percentages[[facet_by]] == facet_level, "value"] %<>% percentify()
    }
  } else {
    percentages$value %<>% percentify()
  }
  # Form annotation DF with correct facet and attempt sensible placement of percentages
  for( i in seq_along(percentages$value)){
    # Skip empty quadrants entirely.
    if(percentages$value[i]==0){next}
    # Heuristic label placement: beyond the cutoff for "high" quadrants, a
    # fraction of the cutoff for "low" quadrants.
    annot_df = data.frame(
      x = ifelse( percentages[i, g1_score_name], cutoffs[1]*2, cutoffs[1]*0.35) ,
      y = ifelse( percentages[i, g2_score_name], cutoffs[2]*2, cutoffs[2]*0.25) ,
      label = paste0( round(percentages$value[i], 1), "%") )
    if(!is.null(facet_by)) {
      annot_df[[facet_by]] = percentages[i, facet_by]
    }
    p = p + geom_text( data = annot_df, aes(x=x,y=y,label=label) )
  }
  return(p)
}
# (Removed documentation-site scraper boilerplate that had been pasted into this
# source file; it was prose, not valid R code.)