# devtools::install_github('immunogenomics/presto')
#' filter out genes with low expressions
#'
#' @description I am not really sure the most efficient way to do this given
#' that Seurat object does not have rowData. Therefore this function is quite
#' slow.
#' @param x a Seurat object
#' @param quantile_val filter away genes whose expressions are below a certain
#' percentile
#' @export
filter_genes <- function (x, quantile_val, assay='RNA', slot_data='data',
method='quantile'){
exp_mat <- as.matrix (Seurat::GetAssayData (x, assay=assay, slot=slot_data) )
mean_gene <- rowMeans (exp_mat)
if (method == 'quantile'){
filter_thres <- stats::quantile (mean_gene, quantile_val)
}else if (method == 'abs_value'){filter_thres <- quantile_val
}
filter_genes <- mean_gene > filter_thres
print (paste ('filtering away', sum (!filter_genes), 'genes'))
return (x [filter_genes, ])
}
# ----------------
# DE gene analysis
# ----------------
#' Find DE genes across a particular feature
#'
#' @param x a Seurat object
#' @param group.by a column in the meta data to group the cell types
#' @param directory where to save the computed markers
#' @param filename the name of the data file. If NULL, it will be generated
#' automatically in the format of '$directory/marker_list_$group.by_$label.csv'
#' @param method either 'group' to run DE gene of one group with respect to the
#' rest of the dataset using `presto::wilcoxauc`, or 'pairwise' to run DE gene
#' between any possible pairs of groups using `find_DE_pairwise`. The pairwise
#' option can be very computationally expensive.
#' @importFrom Seurat Idents<-
#' @export
find_DE_genes <- function (x, directory=NULL, group.by='seurat_clusters',
filename= NULL, label='all', method='group',
assay='RNA', slot_data='data'){
if (is.null (filename)){
filename <- paste ('marker_cluster_', group.by, '_', label, '.csv', sep='')
}
if (is.null (directory)){proceed <- T
}else{
file_name <- paste (directory, filename, sep='/')
if (!file.exists (file_name)){proceed <- T
}else{proceed <- F}
}
if (proceed){
Idents (x) <- x@meta.data [, group.by]
# Don't use the `FindMarkers` function in Seurat which is too
# slow
if (method == 'group'){
markers <- presto::wilcoxauc (x, group.by,
assay= slot_data, seurat_assay=assay)
}else if (method == 'pairwise'){
markers <- find_DE_pairwise (x, group.by,
slot_data= slot_data, assay=assay)
}
if (!is.null (directory)) {utils::write.csv (markers, file_name)}
}else{
print ('loading precomputed DE genes')
# use data table to read csv is much faster for large files
markers <- data.frame (data.table::fread(file_name))
}
return (markers)
}
find_DE_onepair <- function (x, group.by, group1, group2, assay='RNA', slot_data='data'){
sel_x <- x [, x@meta.data [, group.by] %in% c(group1, group2)]
onepair_marker <- presto::wilcoxauc (sel_x, group.by, assay=slot_data,
seurat_assay=assay)
compare_vec <- onepair_marker$group == group1
onepair_marker$compare_group <- data.table::fifelse (compare_vec, group2, group1)
return (onepair_marker)
}
#' Find pairwise DE gene expression
#'
#' @param x a Seurat object
#' @param group.by a column in the meta data to group the cell types
#' @importFrom magrittr %>%
find_DE_pairwise <- function (x, group.by, assay='RNA', slot_data='data'){
all_types <- x@meta.data [, group.by] %>% unique () %>% as.character ()
N <- length (all_types)
all_pairs <- matrix (0, N, N)
# do not select the diagonal terms
all_pairs [lower.tri (all_pairs, diag=F)] <- 1
all_list <- list ()
k <- 1
for (i in 1:N){
for (j in 1:N){
if (all_pairs [i, j] == 1){
print (paste ('comparing', all_types[i], all_types[j] ))
all_list [[k]] <- find_DE_onepair (x, group.by,
all_types[i], all_types[j], assay=assay,
slot_data=slot_data)
k <- k+ 1
}
}
}
return (do.call (rbind, all_list))
}
# ---------------
# Select DE genes
# ---------------
#' Select the top DE genes from each cluster
#'
#' @description If 1 DE gene appears in 2 clusters, the cluster with the
#' highest avg_logFC will be chosen.
#' Then the user can specify the option in `iterative` whether the other DE
#' genes from the group will be chosen to ensure at the end the number of DE
#' genes in each group is the same.
#' NB: only upregulated genes will be selected
#'
#' @param DE_genes a dataframe
#' @param top_number how many DE genes per cluster/group to choose
#' @param gene which column has the gene symbols
#' @param weighting which column has the gene rankings
#' @param cluster which column has the cluster names/numbers
#' @param iterative how many iterations to run until the number of DE genes in
#' each group is the same. By default, no iteration will be performed. I
#' recommend choosing a moderate size e.g. 10~20 to ensure the end result. I
#' could have implemented a while loop but until I consider all possible
#' scenarios, I would not do so.
#' @importFrom magrittr %>%
#' @export
unique_DE_genes <- function (DE_genes, top_number, gene='feature',
weighting='logFC', cluster='group', iterative=NULL){
if (is.null(iterative)){iterative<-1}
top_num <- top_number
for (i in 1:iterative){
DE_frame <- unique_DE_genes_one_iter (DE_genes, top_num,
gene, weighting, cluster)
DE_frame %>% dplyr::group_by (!!as.symbol (cluster)) %>%
dplyr::slice_max (!!as.symbol(weighting), n=top_number) -> DE_frame
count_mean <- DE_frame %>% dplyr::count (!!as.symbol (cluster)) %>%
tibble::deframe () %>% mean ()
if (count_mean != top_number){
top_num <- top_num + 1
}else{break}
}
return (DE_frame)
}
#' @importFrom magrittr %>%
unique_DE_genes_one_iter <- function (DE_genes, top_number, gene, weighting,
cluster){
DE_genes %>% dplyr::group_by (!!as.symbol (cluster) ) %>%
dplyr::top_n (n=top_number, wt=!!as.symbol (weighting) ) %>%
dplyr::ungroup () %>%
dplyr::arrange (desc (!!as.symbol (weighting) ) ) %>%
dplyr::distinct (!!as.symbol (gene), .keep_all=T )
}
list_DE_genes <- function (x, directory, group.by, top_number=8, label='all',
gene='feature', cluster='group'){
DE_genes <- find_DE_genes (x, directory, group.by, label=label)
top_markers <- as.data.frame (unique_DE_genes (DE_genes, top_number))
DE_genes <- as.vector (top_markers [, gene])
names (DE_genes) <- top_markers[,cluster]
return (DE_genes)
}
#' Merge DE genes and known lineage markers
#'
#' @export
DE_lineage_genes <- function (x, directory, top_number=3, label='all',
group.by='revised', lineage_markers=NULL){
if (is.null (lineage_markers)){data (lineage_markers, package='TBdev') }
DE_genes <- list_DE_genes (x, directory, group.by=group.by,
top_number=top_number, label=label)
all_genes <- c(lineage_markers, DE_genes)
all_genes <- all_genes [ gtools::mixedorder (names (all_genes)) ]
return (all_genes)
}
# ----------------------------------
# Exporting DE genes for publication
# ----------------------------------
#' Save Differentially Expressed Genes
#'
#' @description This function intends to save DE genes for presentation and
#' publication but not for computing. For downstream analysis, the
#' `find_DE_genes` function would be able to save the raw output.
#' @param x a Seurat object
#' @param save_dir where to save the results
#' @param group.by a column in the meta data to group the cell types
#' @param label the file name would be: '$save_dir/DE_$group.by_$label.$suffix'
#' @param save_format can be 'excel' or 'gene_table' for average expression only
#' @param weighting which column to arrange the genes from top to bottom
#' @param gene which column stores the gene names
#' @param value which column stores the average expression
#' @param show_num how many DE genes to save per group. If 'all', everything
#' will be saved
#' @param AP a list that contains the `cell_order` entry to arrange the cells
#' @param organism_db a genome database, e.g. org.Hs.eg.db. It is optional
#' unless you would like to add a ensemble ID column
#' @seealso `find_DE_genes`
#' @importFrom magrittr %>%
#' @export
save_DE_genes <- function (x, save_dir, markers=NULL,
# save path related
group.by='revised',
label='all',
save_format='excel',
# dataframe columns
weighting='logFC',
cluster='group',
gene='feature',
# other options
show_num=30,
value='avgExpr',
AP=NULL,
organism_db=NULL
){
AP <- return_aes_param (AP)
if (is.null(markers)){
markers <- find_DE_genes (x, save_dir, group.by=group.by, label=label)
}
if (show_num != 'all'){
markers %>% dplyr::group_by (!!as.symbol (cluster) ) %>%
dplyr::top_n (n=show_num, wt=!!as.symbol (weighting) ) %>%
dplyr::ungroup () %>% as.data.frame () -> top_markers
}else{top_markers <- markers}
cell_types <- unique (top_markers [, cluster])
cell_types <- sort (partial_relevel (cell_types, AP$cell_order))
save_name <- paste ('DE_', group.by, '_', label, '.xlsx', sep='')
save_path <- paste (save_dir, save_name, sep='/')
if (save_format == 'excel'){
save_DE_excel (top_markers, cell_types, weighting, cluster, save_path)
}else{
save_path <- gsub ('.xlsx', '_sim.csv', save_path)
save_DE_gene_table (top_markers, value, cluster, gene, save_path, AP, organism_db)
}
}
#' @importFrom magrittr %>%
save_DE_excel <- function (top_markers, cell_types, weighting, cluster, save_path){
for (i in 1:length (cell_types)){
if (i==1) {append_sheet <- F}else{append_sheet <- T}
top_markers [top_markers [, cluster ] == as.character (cell_types[i]),] %>%
dplyr::arrange (desc (!!as.symbol (weighting) )) %>%
xlsx::write.xlsx (file=save_path,
sheetName = as.character (cell_types [i]), append=append_sheet)
}
}
#' @importFrom magrittr %>%
save_DE_gene_table <- function (top_markers, value, cluster, gene, save_path, AP, organism_db){
data (TF, package='TBdev')
top_markers %>% dplyr::select (dplyr::all_of (c(value, cluster, gene)) ) %>%
magrittr::set_colnames(c('avgExpr', 'cluster', 'gene')) %>%
dplyr::mutate (cluster =partial_relevel (cluster, AP$cell_order)) %>%
reshape2::dcast (gene ~ cluster, mean, value.var='avgExpr') %>%
dplyr::mutate (transcription_factor = gene %in% TF) -> final_df
if (!is.null (organism_db)) {
final_df$ensemble_ID <- AnnotationDbi::mapIds (organism_db,
keys=as.character (final_df$gene), keytype='SYMBOL',
column='ENSEMBL')
}
utils::write.csv (final_df, save_path)
}
#' Save pairwise DE gene in excel files
#'
#' @description As many xlsx files will be created as there are comparison
#' groups. Each file contains the comparison groups
#' @param ... arguments passing to `save_DE_genes`
#' @importFrom magrittr %>%
save_DE_pairwise <- function (DE_df, group.by, directory, ...){
all_types <- DE_df$compare_group %>% unique () %>% as.character ()
if (!dir.exists (directory)){dir.create (directory)}
for (i in all_types){
DE_df [DE_df$compare_group == i,] %>%
dplyr::select (!compare_group) -> sel_DE
save_DE_genes (x=NULL, save_dir=directory, group.by=group.by,
markers=sel_DE, label=i,...)
}
}
# --------
# Plotting
# --------
#' Visualise DE genes
#'
#' @param x Seurat object
#' @param markers DE gene marker dataframe computed by
#' `FindAllMarkers`. Alternatively, one can parse a vector of marker
#' genes.
#' @param top_gene how many genes to show per group
#' @param by_group which group of features in which the top DE genes
#' are selected
#' @param show_group which group of features in which the expression of
#' top DE genes are plotted against
#' @param show_heatmap whether to show heatmap or violin plot
#' @export
plot_DE_genes <- function (x, markers, top_gene=1, by_group='cluster',
show_group='Type', show_heatmap=TRUE){
if (class (markers) == "data.frame"){
top_markers <- unique_DE_genes (markers, top_gene)
feature_plot <- as.character (top_markers$gene)
}else{feature_plot <- as.character (markers)}
print (feature_plot)
if (!show_heatmap){
plot_data <- Seurat::VlnPlot(x, features = feature_plot , group.by=show_group, ncol
= length(feature_plot)%/%3)
}else{
plot_data <- Seurat::DoHeatmap(x, features = feature_plot, group.by=show_group) +
Seurat::NoLegend()
}
return (plot_data)
}
#' Similar to Seurat VlnPlot, with more graphic controls
#'
#' @description This function was called 'seurat_violin' originally because of
#' re-implementation of violin plot in seurat. Its functionality has now
#' expanded to include scatter plots and boxplots.
#' @param x a Seurat object
#' @param features genes to plot, which will be split over by facets
#' @param group.by by which cell feature
#' @param num_col in how many columns the facets are arranged
#' @param plot_type can be 'box', 'violin' or 'scatter'.
#' @param color_by only effective when plot_type is 'scatter'
#' @param ... additional arguments for `custom_tick`
#' @importFrom magrittr %>%
#' @importFrom ggplot2 aes aes_string
#' @export
seurat_violin <- function (x, features, group.by, assay='RNA',
slot_data='data', num_col=NULL, free_xy='fixed',
AP=NULL, plot_type='box', color.by=NULL, ...){
AP <- return_aes_param (AP)
if (is.null (color.by)){color.by <- group.by}
if (plot_type != 'scatter'){color.by <- group.by; rota <- 90
}else{rota <- 0}
x@meta.data[, group.by] <- partial_relevel (x@meta.data[, group.by], AP$cell_order)
x %>% Seurat::FetchData (vars=c(features, group.by, color.by)) %>%
magrittr::set_colnames (c(features, 'feature', 'colorby')) %>%
tidyr::gather ( 'gene', 'expr_val', -feature, -colorby) %>%
tidyr::drop_na () %>%
dplyr::mutate ( gene = factor(gene, levels=features) ) -> plot_data
if ('pseudotime' %in% c(features, group.by, color.by)){plot_data %>%
dplyr::filter (!colorby %in% c('PE', 'EPI', 'EPI1', 'EPI2') ) -> plot_data }
ggplot2::ggplot (plot_data, aes ( x= feature, y=expr_val, fill=colorby) ) +
ggplot2::facet_wrap (.~gene, ncol=num_col, scales=free_xy) +
ggplot2::labs (fill='')+ ggplot2::xlab ('')+
ggplot2::ylab (expression (paste (italic ('mRNA levels')))) +
theme_TB ('dotplot', feature_vec=plot_data$colorby,
color_fill=T, aes_param=AP, rotation=rota)+
ggplot2::guides( fill= ggplot2::guide_legend(override.aes = list(alpha=1)))+
custom_tick (min_prec=1,...) -> plot_ob
if (plot_type=='box'){plot_ob <- plot_ob + ggplot2::geom_boxplot()
}else if (plot_type=='scatter'){plot_ob <- plot_ob +
ggplot2::geom_point (shape=AP$normal_shape, size=AP$pointsize,
color=AP$point_edge_color, stroke=AP$edge_stroke)
}else{plot_ob <- plot_ob +ggplot2::geom_jitter (shape=AP$normal_shape,
size=AP$pointsize, height=0, color=AP$point_edge_color,
stroke=AP$edge_stroke)}
return (plot_ob)
}
#' violin plot on genes that contribute to latent dimensions
strong_gene_violin <- function (x, group.by, dim_num=1, DR='pca', assay='RNA',
slot_data='data'){
feature_load <- x@reductions[[DR]]@feature.loadings
data (TF, package='TBdev')
strong_gene <- data.frame (feature_load [rownames (feature_load) %in% TF [,1], dim_num])
abs (strong_gene) %>% dplyr::top_n (20) -> gene_plot
vln_plot <- seurat_violin (x, rownames (gene_plot), 'species_type', assay, slot_data)
return (vln_plot + ggplot2::ggtitle (colnames (feature_load)[dim_num] ) )
}
#' Deseq2 style Volcano plot
#'
#' @param markers a dataframe, result generated from `find_DE_genes`
#' @param group1 comparison group1
#' @param group2 comparison group2
#' @param group1_col a column in `markers` where `group1` information is stored
#' @param group2_col a column in `markers` where `group2` information is stored
#' @param label_genes which genes to show in the volcano plot
#' @param weighting the x axis, i.e. log fold change
#' @param pval the p value for the y axis
#' @param gene where the gene names are stored
#' @param weight_thres above which value in `weighting` are the points colored
#' according to their comparison groups
#' @param AP aesthetic parameter for controlling the plot
#' @param ... arguments for `enrich_arrow`, including `nudge_x`, `nudge_y` and
#' `band_thickness`, `length_ratio`. For description of their roles, see
#' `enrich_bar`
#' @return a ggplot object
#' @importFrom magrittr %>%
#' @importFrom ggplot2 aes aes_string
#' @export
seurat_volcano <- function (markers, group1, group2, group1_col = 'group',
group2_col = 'compare_group', label_genes=NULL,
weighting='logFC', pval='padj', gene='feature',
weight_thres=0.25, show_gene_num=30, logy=T, AP=NULL, ...){
AP <- return_aes_param (AP)
# select appropriate data
markers %>% dplyr::filter (!!as.symbol (group1_col) == group1 ) %>%
dplyr::filter (!!as.symbol (group2_col) == group2 ) -> sel_mark
if (logy){
sel_mark %>% dplyr::mutate (logp = - log10 (!!as.symbol (pval) )) -> plot_data
y_lab <- expression ('-log[10] P')
}else{
sel_mark %>% dplyr::mutate (logp = !!as.symbol (pval)) -> plot_data
y_lab <- pval
}
# color the significant points
plot_data$groupby <- 'unknown'
plot_data$groupby [plot_data [, weighting] > weight_thres] <- group1
plot_data$groupby [plot_data [, weighting] < -weight_thres] <- group2
# remove the Inf logp values
plot_data %>% dplyr::filter (logp!=Inf) %>% dplyr::pull (logp) %>% max () -> max_p
plot_data$logp [plot_data$logp==Inf] <- max_p
# basic volcano plot
ggplot2::ggplot (plot_data, aes_string (x=weighting, y='logp') )+
ggplot2::geom_point (aes (fill=groupby), shape=AP$normal_shape,
color=AP$point_edge_color, show.legend=F,
size=AP$pointsize) +
# construct the threshold line
ggplot2::geom_vline (xintercept=c(-weight_thres, weight_thres),
linetype='dashed') +
# add arrow sign
enrich_arrow (plot_data, AP, yval='logp', xval=weighting,
group1='group', char_y=F, ...)+
ggplot2::ylab (y_lab) -> plot_ob
# add label genes
if (!is.null (label_genes)){
# remove those genes that are not significant
sel_mark %>% dplyr::filter (abs (!!as.symbol (weighting)) > weight_thres) %>%
pull (!!as.symbol (gene)) -> sig_lab
label_genes <- label_genes [label_genes %in% sig_lab]
}
if (is.null (label_genes) & !is.null (show_gene_num) ){
sel_mark %>% dplyr::mutate (pos_neg = ifelse (!!as.symbol (
weighting) > 0, 'pos', 'neg') ) %>%
dplyr::group_by (pos_neg) %>%
dplyr::slice_max (abs (!!as.symbol (weighting)), n=show_gene_num/2) %>%
dplyr::pull (!!as.symbol(gene)) -> label_genes
}
if (!is.null (label_genes)){
plot_data %>% dplyr::filter (!!as.symbol (gene) %in% label_genes) -> text_df
if (!is.null (names (label_genes) )){
text_df$color_by <- names (label_genes) [match (text_df[, gene], label_genes)]
text_df$color_by <- partial_relevel (text_df$color_by, AP$cell_order)
}else{
data.table::fifelse (text_df [, weighting] > 0, group1,
group2) -> text_df$color_by
}
aes_arg_list <- list (label=gene, color='color_by')
aes_arg <- do.call (aes_string, aes_arg_list)
plot_ob <- plot_ob +
ggrepel::geom_text_repel(aes_arg, data=text_df,
size=AP$point_fontsize, family=AP$font_fam, show.legend=F)
feature_vec <- text_df$color
}else{feature_vec <- NULL}
plot_ob + theme_TB ('dotplot', rotation=0, aes_param=AP, color_fill=F,
feature_vec=feature_vec)+
add_custom_color (feature_vec=plot_data$groupby, aes_param=AP, color_fill=T)+
ggplot2::coord_cartesian (clip='off')
}
#' Gene expression in two categories
#'
#' @description Calculate the mean expression of each gene in two categories
#' @param x a Seurat object
#' @param compare.by which features to be compared
#' @param group.by perform the analysis separately for different groups of
#' cells
#' @return a dataframe
#' @importFrom magrittr %>%
gene_gene_mat <- function (x, compare.by, group.by, assay='RNA', slot_data='data'){
datExpr <- as.matrix (Seurat::GetAssayData (x, assay=assay, slot=slot_data) )
group_compare <- paste ( x@meta.data [, compare.by], x@meta.data [, group.by], sep='__')
unique_group <- unique (group_compare)
mean_expr <- lapply ( as.list (unique_group), function (inp) {
rowMeans (datExpr [, group_compare == inp, drop=F] ) } )
mean_expr <- do.call (cbind, mean_expr)
colnames (mean_expr) <- unique_group
t (mean_expr) %>% data.frame () %>%
tibble::add_column (compare.group=unique_group) %>%
tidyr::separate (compare.group, c('compare_item', 'group_item'), sep='__') %>%
tidyr::gather ('gene', 'expr_val', -compare_item, -group_item) %>%
tidyr::spread (compare_item, expr_val) %>%
tidyr::drop_na ()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.