## ------------------------------------------------------------------------
#' Remove intra-TEC doublets by finding cells with both a high cTEC signature and a high mTEC signature.
#'
#' @param dge Seurat object.
#' @param results_path Plots are saved here.
#' @param mTEC_thresh Cells are excluded if they exceed both this and \code{cTEC_thresh}.
#' If NULL (default), it is selected automatically; see Details.
#' @param cTEC_thresh See \code{mTEC_thresh}.
#' @param pure_fraction_cTEC Fraction of cells (top-ranked by cTEC signature) treated as
#' nearly-pure cTECs when choosing \code{mTEC_thresh} automatically. Default 0.2.
#' @param pure_fraction_mTEC As \code{pure_fraction_cTEC}, but ranking on the mTEC signature
#' to choose \code{cTEC_thresh}. Default 0.02.
#' @param reject_rate Tolerated fraction of nearly-pure cells falling above an automatically
#' selected threshold. Default 0.05.
#' @return A list with names "dge", "mTEC_thresh", and "cTEC_thresh".
#'
#' @details For threshold selection, we isolate some nearly-pure cTECs by ranking all cells on
#' the cTEC signature and retaining the top pure_fraction_cTEC (default 20%).
#' Among those cells, we take the (1 - reject_rate) quantile of the mTEC signature (the 95th
#' percentile under the default reject_rate of 0.05) as the mTEC rejection threshold.
#' The same process is used for cTEC threshold selection, but pure_fraction_mTEC defaults to just 2%.
#'
#' @export
remove_TEC_doublets = function( dge,
results_path,
mTEC_thresh = NULL,
cTEC_thresh = NULL,
pure_fraction_mTEC = 0.02,
pure_fraction_cTEC = 0.2,
reject_rate = 0.05 ) {
dir.create.nice( results_path )
# Score every cell on both signatures (adds mTEC_signature / cTEC_signature fields).
dge %<>% add_cTEC_mTEC_signatures
X = FetchData(dge, c("mTEC_signature", "cTEC_signature"))
# Automatic mTEC threshold: how much mTEC signature do nearly-pure cTECs carry?
if( is.null( mTEC_thresh ) ){
cTEC_purity_cutoff = quantile( X$cTEC_signature, p = (1 - pure_fraction_cTEC) )
pure_cTECs = subset(X, cTEC_signature > cTEC_purity_cutoff )
mTEC_thresh = quantile( pure_cTECs$mTEC_signature, (1-reject_rate) )
}
# Automatic cTEC threshold: mirror image of the above, using nearly-pure mTECs.
if( is.null( cTEC_thresh ) ){
mTEC_purity_cutoff = quantile( X$mTEC_signature, p = (1 - pure_fraction_mTEC) )
pure_mTECs = subset(X, mTEC_signature > mTEC_purity_cutoff )
cTEC_thresh = quantile( pure_mTECs$cTEC_signature, (1-reject_rate) )
}
# Keep any cell strictly below at least one threshold; only cells at or above BOTH
# thresholds are removed as putative doublets.
df_keep = subset( X, cTEC_signature < cTEC_thresh | mTEC_signature < mTEC_thresh )
# Diagnostic plot: x = mTEC signature, y = cTEC signature, so the vertical line marks
# mTEC_thresh and the horizontal line marks cTEC_thresh.
p = custom_feature_plot( dge, colour = NULL,
axes = c( "mTEC_signature", "cTEC_signature" ) ) +
ggtitle( paste0( "TEC doublet removal (", nrow( df_keep ), " of ", nrow( X ), " remain)" ) ) +
geom_hline( data = data.frame(cTEC_thresh), aes( yintercept = cTEC_thresh ), col = "red" ) +
geom_vline( data = data.frame(mTEC_thresh), aes( xintercept = mTEC_thresh ), col = "red" )
print( p )
ggsave( file.path( results_path, "tec_dub_cutoffs.pdf" ), plot = p, width = 7, height = 7 )
cu = rownames( df_keep )
dge = SubsetData( dge, cells.use = cu )
return(list( dge = dge,
mTEC_thresh = mTEC_thresh,
cTEC_thresh = cTEC_thresh ) )
}
## ------------------------------------------------------------------------
#' Given a named vector x with counts of various cell types, returns expected doublet quantities for each possible pairing.
#'
#' @param x Named numeric vector of cell counts, one element per cell type.
#' @param rate Assumed doublet rate. Default 0.05. Your mileage (and flow rates) may vary.
#'
#' @details Make sure you only feed this one replicate at a time! You can't get doublets across replicates.
#'
#' Output is a named numeric vector of expected cell counts.
#' For names, every combination of names(x) should be present once in the output.
#' Order doesn't matter, so labels get alphabetized and concatenated with '_'.
#' Within-cluster doublets are included.
#' E.g. you get BLD_END and BLD_BLD but not END_BLD.
#'
#' @export
expected_doublet_counts = function( x, rate = 0.05 ){
# Expected doublet load for pair (i, j) is proportional to x_i * x_j.
my_mat = matrix(x, nrow = length(x)) %*% matrix(x, ncol = length(x))
my_mat = my_mat / sum( x )
my_mat = my_mat*rate
# Count BLD_TEC and TEC_BLD together: fold each unordered pair into the upper
# triangle (doubling it) and zero out the lower triangle.
# Adding this tiny bit is a dumb hack to prevent zeroes from disappearing
# during summary(Matrix(, sparse = TRUE)) below.
eps = 0.000001
my_mat[upper.tri(my_mat)] = 2*my_mat[upper.tri(my_mat)] + eps
diag(my_mat) = diag(my_mat) + eps
my_mat[lower.tri(my_mat)] = 0
colnames(my_mat) = names( x )
rownames(my_mat) = names( x )
# summary() of a sparse Matrix yields one row per retained entry: (i, j, value),
# in column-major order. Zeroed lower-triangle entries are dropped.
dubs = summary( Matrix( my_mat, sparse = TRUE ) )
colnames(dubs) = c("cell_type_1", "cell_type_2", "n_dubs")
# Translate matrix indices back into cell-type labels.
dubs$cell_type_1 = names( x )[dubs$cell_type_1]
dubs$cell_type_2 = names( x )[dubs$cell_type_2]
dubs$n_dubs = round(dubs$n_dubs, 3)
# Alphabetize each pair and join with '_' so e.g. END_BLD becomes BLD_END.
postprocess = function( s1_s2 ){ paste( sort( s1_s2), collapse = "_" ) }
rownames( dubs ) = apply( dubs[, 1:2], 1, postprocess )
dubs = setNames( dubs$n_dubs, nm = rownames(dubs))
return( dubs )
}
#' Classify cells from one Seurat object in terms of another Seurat object's identity field, with a "reject option" for unfamiliar cells.
#'
#' @param dge_train Cells to train classifier on. Seurat object.
#' @param dge_test Cells to be classified. Seurat object.
#' @param ident.use Identity variable to use for training labels.
#' @param vars.all List of raw genes/features to use. If possible, will be accessed
#' through `FetchData`; in this case, should be numeric. For others, zeroes are filled in.
#' If NULL, uses variable genes from both `dge_train` and `dge_test`.
#' @param my_transform NULL, character, or function.
#' If `is.null(my_transform)`, then `my_transform` is the identity.
#' If `my_transform` has the form "PCA_<integer>" (default "PCA_20"), then `my_transform` is an
#' unscaled <integer>-dimensional PCA based on the training data. This option triggers special
#' behavior for quantifying classifier badness, because NN will perform badly in a principal subspace.
#' If a function is given, `my_transform` should accept and return matrices where rows are cells.
#' @param badness Either "pc_dist" or "neighbor_dist" or `NULL`. If `NULL`, default depends on
#' `my_transform`. You can't use "pc_dist" unless `my_transform` has the form "PCA_<integer>".
#' @param k Number of nearest neighbors to use. Default 25.
#' @param reject_prop Expected rate of false rejections you're willing to tolerate
#' on held-out training instances. Default is 0, i.e. no cells are rejected. This is not honest
#' if `my_transform` is chosen using the training data, and it cannot account for batch effects.
#' @return Seurat object identical to `dge_test` but with new/modified fields for
#' - `classifier_ident` (predicted class)
#' - `classifier_badness` (lower means higher confidence)
#' - `classifier_probs_<each identity class from trainset>` (predicted class probabilities)
#' @details Using k-nearest neighbors, classify cells from `dge_test` in terms of
#' the options in `unique(FetchData(dge_train, ident.use))`, plus a reject option.
#' Rejection happens when the badness (usually distance to the nearest neighbors)
#' falls above a threshold (see `reject_prop`). Badness gets adjusted by cluster,
#' because some clusters naturally are less concentrated on the principal subspace
#' or the coordinates of interest.
#' @export
knn_classifier = function( dge_train, dge_test, ident.use = "ident",
vars.all = NULL, my_transform = "PCA_20", badness = NULL,
k = 25, reject_prop = 0 ){
if( is.null( vars.all ) ){
vars.all = union( dge_train@var.genes, dge_test@var.genes )
}
# # Retrieve data, padding test data with zeroes as needed
coords_train_orig = FetchDataZeroPad( dge_train, vars.all, warn = F )
coords_test_orig = FetchDataZeroPad( dge_test, vars.all, warn = F )
# # set the transform (and badness)
if( is.null( my_transform ) ){ my_transform = function(x) x }
if( is.character( my_transform ) ) {
# Parse "PCA_<n>"; atae/atat are project assertion helpers.
pca_n = strsplit( my_transform, "_", fixed = T )[[1]]
atae( "PCA", pca_n[1] )
atat( !is.na( as.numeric( pca_n[2] ) ) )
cat("Training transform: unscaled PCA of dimension", pca_n[2], " ...\n")
# Fit truncated PCA on training data only; `w` is captured by the closures below.
w = irlba::prcomp_irlba( x = coords_train_orig, n = as.numeric( pca_n[2] ),
retx = F,
center = colMeans( coords_train_orig ),
scale = F )
my_transform = function(x){
sweep( x,
STATS = w$center,
FUN = "-",
MARGIN = 2 ) %*% w$rotation
}
if( is.null( badness ) ){ badness = "pc_dist" }
} else {
if( is.null( badness ) ){ badness = "neighbor_dist" }
}
if ( badness == "pc_dist" ) {
cat(" Using distance to principal subspace as badness. \n")
# NOTE(review): this closure reads `w`, which only exists when my_transform was
# "PCA_<n>". The roxygen forbids pc_dist otherwise, but there is no explicit check.
get_badness = function( nn_dists, x ){
z = sweep( x,
STATS = w$center,
FUN = "-",
MARGIN = 2 )
# Project the centered data onto the principal subspace and back.
zproj = z %*% w$rotation %*% t( w$rotation )
square = function( x ) x^2
# Per-cell Euclidean norm of the residual: apply(., 1, .) transposes its result,
# so the subsequent apply(., 2, sum) sums over each cell's squared residuals.
( z-zproj ) %>% apply( 1, square ) %>% apply( 2, sum ) %>% sapply( sqrt )
}
} else {
cat(" Using average distance to neighbors as badness. \n")
get_badness = function( nn_dists, x ) { rowMeans( nn_dists )}
}
cat("Transforming data...\n")
coords_train_trans = my_transform( as.matrix( coords_train_orig ) ) %>% as.data.frame
coords_test_trans = my_transform( as.matrix( coords_test_orig ) ) %>% as.data.frame
# # Find nearest neighbors & classify
cat("Finding nearest neighbors...\n")
nn_out = FNN::get.knnx( data = coords_train_trans,
query = coords_test_trans,
k=k, algorithm=c( "cover_tree" ) )
train_labels = FetchData( dge_train, ident.use )[[1]]
get_label = function(idx) train_labels[idx]
# Empirical class distribution among a cell's k nearest training neighbors.
empir_prob = function(x) factor( x, levels = unique( train_labels ) ) %>% table %>% div_by_sum
classifier_probs = apply( apply( nn_out$nn.index, 2, get_label ), 1, FUN = empir_prob ) %>% t %>% as.data.frame
classifier_ident = apply( classifier_probs, 1, function(x) names( which.max( x ) ) )
# # Get badness
cat("Calculating badness for each point...\n")
# Same machinery applied to the training set itself, to calibrate the reject threshold.
nn_out_self = FNN::get.knn( data = coords_train_trans,
k=k, algorithm=c( "cover_tree" ) )
classifier_prob_self = apply( apply( nn_out_self$nn.index, 2, get_label ),
1, FUN = empir_prob ) %>% t %>% as.data.frame
classifier_badness = get_badness( nn_dists = nn_out$nn.dist, x = as.matrix( coords_test_orig ) )
classifier_badness_self = get_badness( nn_dists = nn_out_self$nn.dist, x = as.matrix( coords_train_orig ) )
# # Adjust badness via regression: some clusters are naturally less dense or farther from PC axes
# Regress self-badness on the class-probability columns (no intercept), then
# subtract the cluster-predicted badness from both train and test badness.
model_badness_by_cluster = lm( classifier_badness_self ~ . + 0,
data = cbind( classifier_badness_self, classifier_prob_self ) )
classifier_badness_self = classifier_badness_self - predict( object = model_badness_by_cluster )
classifier_badness = classifier_badness - predict( object = model_badness_by_cluster,
newdata = classifier_probs )
# Standardize both using the training-set spread (self sd first, before it is rescaled).
classifier_badness = classifier_badness / sd(classifier_badness_self)
classifier_badness_self = classifier_badness_self / sd(classifier_badness_self)
# # Set threshold and label rejects
if( reject_prop > 0 ){
cat("Labeling rejects with attempted controls on false rejection rate ... \n")
threshold = quantile( classifier_badness_self, 1 - reject_prop )
# Overlay train (blue) and test (red) badness histograms with the cutoff.
# NOTE(review): main/xlab passed alongside add = T are ignored by hist(), since
# no new plot is started; the y = 100 label positions assume counts reach that high.
hist( classifier_badness_self, breaks = 80, col = scales::alpha("blue", 0.5))
hist( classifier_badness, breaks = 80, col = scales::alpha("red", 0.5),
add = T,
main = "Held-out set badness and threshold",
xlab = "Average distance to neighbors (train = blue, test = red)" )
abline( v = threshold )
text( x = threshold*1.05, y = 100, labels = "Reject", srt = 90 )
text( x = threshold*0.95, y = 100, labels = "Classify", srt = 90 )
classifier_ident[classifier_badness >= threshold] = "reject"
}
# # Save data and return
to_add = cbind( classifier_ident,
classifier_badness,
classifier_probs )
colnames( to_add ) = c( "classifier_ident",
"classifier_badness" ,
"classifier_probs_" %>% paste0( colnames( classifier_probs ) ) )
rownames( to_add ) = rownames( coords_test_orig )
dge_test %<>% AddMetaData( to_add )
cat("Done.\n")
return( dge_test )
}
#' Train and save a penalized logistic regression classifier.
#'
#' Class labels come from Seurat::FetchData( training_dge, vars.all = ident.use ).
#' When `do.save` is TRUE, the fitted `glmnet` object and the training data
#' (Seurat object) are saved into a "classifier" subdirectory of `results_path`.
#' @export
train_save_classifier = function(training_dge, results_path, ident.use = "cell_type", genes.use, do.save = F ){
# # Labels: one factor level per cell type
class_labels = Seurat::FetchData( training_dge, vars.all = ident.use ) %>%
vectorize_preserving_rownames %>%
factor
# # Features: cells-by-genes expression matrix, restricted to genes present in the data
genes.use = intersect( genes.use, rownames( training_dge@data ) )
feature_matrix = t( training_dge@data[ genes.use, ] )
# # Fit penalized multinomial logistic regression.
# # alpha = 0 gives ridge (L2); alpha = 1 gives LASSO; in between, elastic net.
print("Training classifier...")
mlr_mod = glmnet::cv.glmnet( x = feature_matrix,
y = class_labels,
family = "multinomial",
alpha = 0 )
print("... classifier trained.")
# # Optionally persist model and training data, then return the model
if( do.save ){
dir.create.nice( file.path( results_path, "classifier" ) )
print("Saving classifier...")
saveRDS( mlr_mod, file.path( results_path, "classifier", "glmnet_object.data" ) )
saveRDS( training_dge, file.path( results_path, "classifier", "training_data.data" ) )
}
return( mlr_mod )
}
#' Extract coefficients from a glmnet multiclass logistic regression model.
#'
#' @param mlr_mod A cv.glmnet multinomial model (e.g. from `train_save_classifier`).
#' @return Data frame with one column per class and one row per coefficient
#' (including the intercept), evaluated at s = "lambda.min".
get_classifier_coefs = function( mlr_mod ){
# Extract the coefficient list once. The original called coefficients() three
# times and passed a spurious `newx = features` argument referencing an
# undefined variable (harmless only because of lazy evaluation).
cf = coefficients( mlr_mod, s = "lambda.min" )
cell_types = names( cf )
genes = rownames( cf[[1]] )
coeffs = data.frame( matrix( NA, nrow = length(genes), ncol = length( cell_types ) ) )
colnames( coeffs ) = cell_types
rownames( coeffs ) = make.unique( genes )
for( cell_type in cell_types ){
coeffs[[cell_type]] = as.vector( cf[[cell_type]] )
}
return( coeffs )
}
#' Apply a machine learning classifier (from `train_save_classifier`) to new data
#'
#' @param dge a Seurat object
#' @param mlr_mod a glmnet multiclass logistic regression model.
#' You can feed the output of `train_save_classifier` into the `mlr_mod` argument.
#' @return Predicted class probabilities (glmnet type = "response" at s = "lambda.min").
#' @export
classify_mlr = function( dge, mlr_mod ){
# Recover the feature names the model was trained on from the dimnames of its
# coefficients. `down_idx` is a project helper; presumably it extracts the first
# element of a list -- TODO confirm.
genes_used = ( mlr_mod %>% coef %>% down_idx %>% attributes )$Dimnames %>% down_idx
genes_used = setdiff( genes_used, "(Intercept)" )
# Fetch those genes for the new cells, filling zeroes for any that are absent.
# NOTE(review): glmnet's `newx` expects observations (cells) in rows, matching the
# cells-by-genes matrix used at training time; this `t` only makes sense if
# FetchDataZeroPad returns genes in rows -- verify its orientation.
features = FetchDataZeroPad( dge, vars.all = genes_used ) %>% t
predictions = predict( mlr_mod, newx = features, s = "lambda.min", type = "response" )[, , 1]
return( predictions )
}
#' Visualize probabilistic classification results
#'
#' @param dge a Seurat object
#' @param results_path folder to place output in
#' @param mlr_mod a glmnet multiclass logistic regression model. Give this or `class_labels`, not both.
#' You can feed the output of `train_save_classifier` into the `mlr_mod` argument.
#' @param class_labels atomic character vector naming columns in the Seurat object that contain
#' class probabilities. Give this or `mlr_mod`, not both. If names(class_labels) is filled in,
#' then that's how the spokes get labeled.
#' @param fig_name filename for output (also used as the plot title).
#' @param facet_by Variable in Seurat object to facet resulting plots by; default is none
#' @param colour_by Variable in Seurat object to map color to; default is none
#' @param style If "points", plot each cell as a dot.
#' If "density", then instead of plotting points, plot 2d density contours.
#' If "hexagons", do AWESOME HEXAGON BINNING YEAHHHHHHH HEXAGONS.
#' @param wheel_order Deprecated; use `class_labels`.
#' @param do.density Deprecated; use `style = "density"`.
#' @return The ggplot object (also saved as <fig_name>.pdf under `results_path`).
#' @export
wheel_plot = function( dge, results_path, mlr_mod = NULL, class_labels = NULL, fig_name,
colour_by = NULL, facet_by = NULL, style = "points",
wheel_order = NULL, do.density = NULL ){
# # Handle stupid old input
if(!is.null(wheel_order)){
warning( "`wheel_order` arg is deprecated. Use `class_labels` instead." )
if( is.null( class_labels ) ){ class_labels = wheel_order}
}
if( !is.null(do.density) && do.density){
warning("do.density is deprecated. Use ` style = \"density\" ` instead.")
style = "density"
}
# # Handle regular, non-stupid input
# Exactly one of mlr_mod / class_labels decides where predictions come from;
# if both are given, mlr_mod wins and class_labels only orders the spokes.
mlr_mod_given = !is.null( mlr_mod )
labels_given = !is.null( class_labels )
if( !mlr_mod_given & !labels_given ){
stop("Please specify either `mlr_mod` or `class_labels`")
} else if( !mlr_mod_given & labels_given ) {
cat( "Fetching stored predictions from Seurat object.\n" )
predictions = FetchData( dge, vars.all = class_labels ) %>% as.matrix
} else if( mlr_mod_given & !labels_given ) {
cat( "Making predictions from glmnet object.\n" )
predictions = classify_mlr( dge, mlr_mod )
class_labels = colnames( predictions )
} else if( mlr_mod_given & labels_given ) {
cat( "Making predictions from glmnet object.\n" )
warning( "Since `mlr_mod` was given, using `class_labels`
only to order wheel spokes, not to fetch stored predictions." )
predictions = classify_mlr( dge, mlr_mod )
class_labels = colnames( predictions )
}
# # Make wheel
# Place one spoke per class, evenly spaced on the unit circle.
lp1 = length( class_labels ) + 1
if( is.null( names( class_labels ) ) ){ names( class_labels ) = class_labels }
unwrapped_circle = seq( 0, 2*pi, length.out = lp1 )
wheel_spokes = data.frame( z = names( class_labels ),
x = cos( unwrapped_circle[-lp1] ),
y = sin( unwrapped_circle[-lp1] ) )
# # Process data
cat("Processing data.\n")
# Each cell is the probability-weighted average of the spoke positions.
cell_positions = predictions %*% as.matrix( wheel_spokes[, c("x", "y")] )
cell_positions = as.data.frame( cell_positions ); colnames( cell_positions ) = c( "x", "y" )
if( !is.null( colour_by ) ){
cell_positions[[colour_by]] = Seurat::FetchData(dge, vars.all = colour_by)[ rownames( predictions ), ]
}
if( !is.null( facet_by ) ){
cell_positions[[facet_by]] = Seurat::FetchData(dge, vars.all = facet_by )[ rownames( predictions ), ]
}
# # Avoid possible namespace conflict between class_labels in model parameters and class_labels in test data
# NOTE(review): wheel_spokes only has columns z/x/y, so wheel_spokes$class_labels is
# NULL and this rename is a no-op; perhaps cell_positions was intended -- confirm.
if( "class_labels" %in% c( facet_by, colour_by ) ){
wheel_spokes$class_labels_in_model = wheel_spokes$class_labels
wheel_spokes$class_labels = NULL
}
# # Add wheel & label spokes
wheel_plot = ggplot() + geom_path ( data = wheel_spokes, mapping = aes( x, y ) )
wheel_plot = wheel_plot + geom_text ( data = wheel_spokes, mapping = aes( 1.2*x, 1.2*y, label = z ) )
# # Add data & facet
if( style == "density"){
cat( "Estimating density.\n" )
wheel_plot = wheel_plot + geom_density_2d( data = cell_positions,
mapping = aes_string( "x", "y", colour = colour_by ) )
} else if( style == "hexagons"){
if( !is.null( colour_by ) ) {
# NOTE(review): "hegaxons" typo below is in a runtime string, left untouched here.
warning( "Won't use color with style==\"hegaxons\" because fill is mapped to bin count. " )
}
wheel_plot = wheel_plot + geom_hex( data = cell_positions, mapping = aes_string( "x", "y" ) )
wheel_plot = wheel_plot + scale_fill_gradient(trans = "log")
} else {
wheel_plot = wheel_plot + geom_point( data = cell_positions,
mapping = aes_string( "x", "y", colour = colour_by ), alpha = 0.5 )
}
wheel_plot = wheel_plot + ggtitle( fig_name )
if( !is.null( colour_by ) && is.numeric( cell_positions[[colour_by]] ) ){
wheel_plot = wheel_plot + scale_color_gradientn( colours = blue_gray_red )
}
if ( !is.null( facet_by ) ){
wheel_plot = wheel_plot + facet_wrap( as.formula( paste( "~", facet_by ) ) )
}
# # Save & return
cat( "Done. Saving and returning plot.\n" )
ggsave( filename = file.path( results_path, paste0( fig_name, ".pdf" ) ),
plot = wheel_plot,
width = 12, height = 10)
return( wheel_plot )
}
## (Removed non-code page footer -- "Add the following code to your website..." --
## left over from a documentation-site scrape; it was not valid R.)