##
# @title STenrich_old
# @description Test for spatial enrichment of gene expression sets in ST data sets
# @details The function performs a randomization test to assess if the sum of
# distances between cells/spots with high expression of a gene set is lower than
# the sum of distances of randomly selected cells/spots. The cells/spots are
# considered as having high gene set expression if the average expression of genes in a
# set is higher than the average expression plus a `num_sds` times the standard deviation.
# Control over the size of regions with high expression is provided by setting the
# minimum number of cells or spots (`min_units`). This method is a modification of
# the method devised by Hunter et al. 2021 (zebrafish melanoma study)
#
# @param x an STlist with transformed gene expression
# @param samples a vector with sample names or indexes to run analysis
# @param gene_sets a named list of gene sets to test. The names of the list should
# identify the gene sets to be tested
# @param reps the number of random samples to be extracted. Default is 1000 replicates
# @param num_sds the number of standard deviations to set the minimum gene set
# expression threshold. Default is one (1) standard deviation
# @param min_units Minimum number of spots with high expression of a pathway for
# that gene set to be considered in the analysis. Defaults to 20 spots or cells
# @param min_genes the minimum number of genes of a gene set present in the data set
# for that gene set to be included. Default is 5 genes
# @param pval_adj_method the method for multiple comparison adjustment of p-values.
# Options are the same as that of `p.adjust`. Default is 'BH'
# @param seed the seed number for the selection of random samples. Default is 12345
# @param cores the number of cores used during parallelization. If NULL (default),
# the number of cores is defined automatically
# @return a list of data frames with the results of the test
#
# @export
#
#' @importFrom magrittr %>%
#' @importFrom stats p.adjust
#
#
STenrich_old = function(x=NULL, samples=NULL, gene_sets=NULL, reps=1000, num_sds=1, min_units=20, min_genes=5, pval_adj_method='BH', seed=12345, cores=NULL){
# Record time
zero_t = Sys.time()
cat("Running STenrich...\n")
reps = as.integer(reps)
num_sds = as.double(num_sds)
min_units = as.integer(min_units)
min_genes = as.integer(min_genes)
# Define samples using names (convert indexes to names if necessary)
if(is.null(samples)){
samples = names(x@spatial_meta)
} else{
if(is.numeric(samples)){
samples = as.vector(na.omit(names(x@spatial_meta)[samples]))
} else{
samples = samples[samples %in% names(x@spatial_meta)]
}
# Verify that sample names exist
if(length(samples) == 0 | !any(samples %in% names(x@spatial_meta))){
stop('None of the requested samples are present in the STlist.')
}
}
# Define number of cores for parallelization of tests
if(is.null(cores)){
cores = count_cores(length(samples))
} else{
cores = ceiling(cores)
}
# Loop through samples in STlist
result_dfs = parallel::mclapply(samples, function(i){
system(sprintf('echo "%s"', paste0("\tSample: ", i, "...")))
# Extract spots to be used in analysis
# This selection implemented proactively as analysis might later be applied to tissue niches within samples
tissue_spots = x@spatial_meta[[i]][['libname']]
# Extract gene expression
exp = expandSparse(x@tr_counts[[i]])
exp = exp[, tissue_spots]
# Extract spot coordinates and match order of spots
coords_df = x@spatial_meta[[i]][, c('libname', 'xpos', 'ypos')]
coords_df = coords_df[match(colnames(exp), coords_df[['libname']]), ]
coords_df = coords_df %>% tibble::column_to_rownames(var='libname')
# Calculate distances for the sample
distances_spots = as.matrix(stats::dist(coords_df[, c('xpos', 'ypos')], method='euclidean'))
rm(coords_df) # Clean env
# Perform tests in parallel
pval_df = tibble::tibble()
for(pw in 1:length(gene_sets)){
pw_genes = gene_sets[[pw]]
pw_genes = pw_genes[pw_genes %in% rownames(exp)] # Filter genes that aren't in the expression matrix.
# Test if genes in data set are enough
if(length(pw_genes) >= min_genes){
# Average expression of genes within pathway for each spot
pw_avg_exp = apply(exp[pw_genes, ], 2, mean)
# Find spots that highly express this pathway (mean + stdev_t*std in this case)
avg = mean(pw_avg_exp)
std = sd(pw_avg_exp)
exp_thresh = avg + (num_sds*std)
# Extract spots with average expression above sd threshold
high_spots_bc = names(which(pw_avg_exp >= exp_thresh))
# Are there at least 'min_units' number of cells/spots?
if(length(high_spots_bc) >= min_units){
# Compute the distance between these spots' XY coordinates with high expression
distances_high_spots = distances_spots[high_spots_bc, high_spots_bc]
distances_high_spots[upper.tri(distances_high_spots)] = 0 # Make upper half zero to avoid sum of distances twice
sum_high_distances = sum(distances_high_spots)
rm(distances_high_spots) # Clean env
# Compute random distance permutations
set.seed(seed)
sum_rand_distances = c()
for(rep in 1:reps){
rand_idx = sample(x=colnames(distances_spots), size=length(high_spots_bc))
rand_dist = distances_spots[rand_idx, rand_idx]
rand_dist[upper.tri(rand_dist)] = 0 # Make upper half zero to avoid sum of distances twice
sum_rand_distances = append(sum_rand_distances, sum(rand_dist))
rm(rand_idx, rand_dist) # Clean env
}
# Compute p-value
# Ho: sum of observed distances is larger than sum of random distances
# Ha: sum of observed distances is smaller than sum of random distances
count_test = sum(sum_rand_distances < sum_high_distances) # Times observed dist was higher than random dists
p_val = count_test / reps
rm(high_spots_bc) # Clean env
} else{
p_val=NA
} # Close test minimum spots
} else{
p_val=NA
} # Close test minimum genes
# Get results in data frame form
pval_tmp = tibble::tibble(gene_set=names(gene_sets)[pw],
size_test=length(pw_genes),
size_gene_set=length(gene_sets[[pw]]),
p_value=p_val)
# Compile results in a single data frame
pval_df = dplyr::bind_rows(pval_df,
pval_tmp %>%
tibble::add_column(sample_name=i, .before=1))
}
# Adjust p-values
pval_df[['adj_p_value']] = p.adjust(pval_df[['p_value']], method=pval_adj_method)
pval_df = pval_df[order(pval_df[['adj_p_value']]), ]
return(pval_df)
}, mc.cores=cores)
names(result_dfs) = samples
# Print time
verbose = 1L
end_t = difftime(Sys.time(), zero_t, units='min')
if(verbose > 0L){
cat(paste0('STenrich completed in ', round(end_t, 2), ' min.\n'))
}
return(result_dfs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.