# pathway analysis using module scores
#' Obtain the gene in a particular pathway
#'
#' @param pathway a dataframe with `pathway` column being the comon pathway
#' names, and the `kegg_id` column being the KEGG ID for the pathways
#' @param path_name the common name of a pathway
#' @export
get_path_genes <- function (pathway, path_name, species='human') {
species_id <- get_kegg (species)
pathway$kegg_id <- gsub ('^hsa', species_id, pathway$kegg_id)
if (path_name %in% pathway$pathway){
path_genes <- KEGGREST::keggGet(pathway [ pathway$pathway == path_name,
'kegg_id'])[[1]]$GENE
print (paste ('found', length (path_genes), 'genes'))
if (length (path_genes) != 0 ){
path_genes <- path_genes [seq ( 2, length (path_genes), 2 ) ]
one_path <- sapply (path_genes, function (x) {strsplit (x, ';') [[1]][1] })
return (as.character (one_path))
}
}else{
print ('queried pathway not in the compiled list')
print ('choose one from below: ')
print (paste (pathway$pathway, collapse=', ') )
}
}
heat_high_not <- function (x, color_row, highlight_cells, group.by=c('revised', 'date')){
if ( is.null (highlight_cells) ){
print (seurat_heat (x, color_row=color_row, group.by = group.by,
slot='data', column_rotation=90))
}else{
print (seurat_heat_highlight (x, highlight_cells, color_row, group.by, average=T))
}
}
#' Save the PCA and heatmap of genes in a pathway
#'
#' @description The results will be stored in a pdf file. For each pathway, a
#' PC plot using the genes in that pathway is shown, followed by expression of
#' genes in that pathway in heatmap format.
#'
#' @param cutoff If the `gene_list` argument is longer than a cutoff length,
#' then the `gene_list` will be truncated into lists of genes, each of which is
#' no longer than the cutoff length. Then each heatmap will be generated for
#' each sublist
pathway_heat_pca <- function (x, gene_list, save_dir, cutoff=70,
highlight_cells=NULL, group.by=c('revised', 'date'),
DE_dir=NULL, DE_label=NULL){
x <- Seurat::ScaleData (x, features=gene_list)
y <- Seurat::RunPCA (x, features=gene_list)
pdf (save_dir, width=20, height=10)
if (is.null (highlight_cells)){
print (plot_dim_red (y, by_group= group.by, DR='pca', all_labels=T))
}else{
print (gg_plot_dim_red (y, by_group = group.by, DR='pca' ,
size_highlight=highlight_cells, highlight_font=2, dims=c(1,2)))
}
if (!is.null (DE_dir) & !is.null (DE_label) ){
print (plot_gene_PC (y, directory=DE_dir, label=DE_label, color_by='revised'))
}else{print ( plot_gene_PC (y) )}
rm (y)
heat_high_not (x, gene_list, highlight_cells)
if (length (gene_list) > cutoff){
print ('making separate heatmaps')
for (i in seq (1, length(gene_list), cutoff ) ){
if (i < length (gene_list) ){
short_list <- gene_list [i:(i+cutoff)]
heat_high_not (x, short_list, highlight_cells)
}
}
}
dev.off ()
}
#' Draw the heatmap and PCA on genes of different pathways
#'
#' @param x a Seurat object
#' @param save_dir where outputs will be saved. For each pathway, one pdf file
#' will be made, including the PCA in the first page, heatmap the second, and
#' truncated heatmaps if the heatmap is too dense
#' @param all_path a dataframe with 2 columns: 'pathway' for the common names
#' of the pathway that is used for naming the output file, and 'kegg_id' for
#' the KEGG ID for that pathway. This ID may be obtained from:
#' https://www.genome.jp/kegg/pathway.html
#' @param highlight_cells which cells to highlight in the PCA and heatmap
#' @export
all_pathways_heat_pca <- function (x, save_dir, all_path=NULL,
highlight_cells=NULL, DE_dir=NULL,
DE_label=NULL, species='human'){
if (is.null(all_path)){data (KeggID, package='TBdev'); all_path <- KeggID
}
if (!dir.exists (save_dir)){dir.create (save_dir)}
for (one_pathway in all_path$pathway){
print (paste ('analysing', one_pathway, 'pathway') )
one_path <- get_path_genes (all_path, one_pathway, species=species)
if (length (one_path) !=0 ){
pathway_heat_pca ( x, one_path, paste (save_dir,
paste (one_pathway, '.pdf', sep=''), sep='/'),
highlight_cells=highlight_cells, DE_dir=DE_dir,
DE_label=DE_label)
}
}
}
#' Draw the heatmap for the eigengene for single cell
#'
#' @description Due to the problem with the sign of eigenvector, this function
#' is not recommendd to visualise pathway activity. Instead, you may use the
#' `get_module_score` function and plot the results using `seurat_heat`
#' @param x a Seurat object
#' @param threshold This function assigns the cells with the highest total gene
#' expressions in a particular pathway to have positive eigen values. This
#' `threshold` argument sets the range of cells to calculate the total
#' expression. For example, a threshold of 0.02 means that the total expression
#' will be calculated for the bottom 2% and top 2% of the cells
#' @examples
#' path_eigen (all_data)
#' @export
path_eigen <- function (x, all_path=NULL, threshold=0.02,
select_cells=NULL, return_mat=F){
if (is.null(all_path)){data (KeggID, package='TBdev'); all_path <- KeggID
}
eigengene <- list()
for (i in 1:length (all_path$pathway) ){
print (paste ('analysing', all_path$pathway[i], 'pathway') )
pgenes <- get_path_genes (all_path, all_path$pathway [i] )
if (length (pgenes) != 0){
y <- ScaleData (x [pgenes, ], features=pgenes)
y <- RunPCA (y, features=pgenes)
eig_gene <- y@reductions[['pca']]@cell.embeddings [, 1]
# set up the sign of eigengene
thres <- quantile (eig_gene, c(threshold, 1.- threshold))
down_eig <- which (eig_gene < thres [1])
down_exp <- sum ( as.matrix (y [['RNA']]@data [, down_eig] ))
up_eig <- which (eig_gene > thres [2])
up_exp <- sum ( as.matrix (y [['RNA']]@data [, up_eig]) )
if (down_exp < up_exp){
eigengene [[i]] <- eig_gene
}else{
print ('flipping the sign of the eigengene')
eigengene [[i]] <- -eig_gene
}
rm (y)
}else{eigengene [[i]] <- rep (NA, dim (x)[2] )}
}
eigengene <- do.call (cbind, eigengene)
na_columns <- apply ( eigengene, 2, function (x) {sum (is.na (x)) > 0 } )
colnames (eigengene) <- all_path$pathway
eigengene <- eigengene [, !na_columns]
metadata <- x@meta.data [ match ( rownames (eigengene), colnames (x) ), ]
eigen_seurat <- Seurat::CreateSeuratObject ( t(eigengene), meta.data=metadata )
if (return_mat){return (eigen_seurat)
}else{
if (is.null (select_cells) ){
seurat_heat (eigen_seurat, color_row=rownames (eigen_seurat), group.by
= c('revised', 'date'), slot='data', column_rotation=90,
color_scale=c('blue', 'gray', 'red'))
}else{
seurat_heat_highlight (eigen_seurat, select_cells, rownames (eigen_seurat),
c('revised', 'date'), average=T,
color_scale = c('blue', 'gray', 'red'))
}
}
}
#' Calculate the module score of KEGG pathway
#'
#' @description It is extremely important to set seed for this function
#' @param x a Seurat object
#' @param save_path where to save to output dataframe. It is recommended to
#' save the results because this function tends to be time consuming.
#' @param all_path a dataframe with `pathway` column being the comon pathway
#' names, and the `kegg_id` column being the KEGG ID for the pathways
#' @param pgenes a named list of vectors of genes
#' @param append_meta if TRUE, a Seurat object will be created with the
#' expression matrix being the module scores and metadata inherited from the
#' input Seurat object. This output will be more convenient for doing violin
#' plot and heatmap
#' @param regexp remove a part of cell ID names that may arise because of some
#' tidyverse manipulations. If NULL, no changes will happen
#' @return a matrix or a Seurat object whose rows are gene module names in
#' `pgenes` and whose columns are cell names
#' @export
get_module_score <- function (x, save_path, all_path=NULL, pgenes=NULL,
append_meta=F, regexp='^X', species='human',
control_num=50){
if (is.null(all_path)){data (KeggID, package='TBdev'); all_path <- KeggID
}
if ( file.exists (save_path) ){
print ('loading saved module scores')
module_scores <- data.table::fread(save_path) %>% data.frame() %>%
tibble::column_to_rownames ('V1')
}else{
if (is.null (pgenes)){
pgenes <- list ()
for (i in 1:length (all_path$pathway) ){
print (paste ('geting genes from', all_path$pathway[i], 'pathway') )
pgenes [[i]] <- get_path_genes (all_path,
all_path$pathway [i], species=species)
}
names (pgenes) <- all_path$pathway
# remove modules with 0 genes
module_num <- sapply (pgenes, length)
pgenes <- pgenes [ module_num != 0 ]
}
names (pgenes) <- paste (names (pgenes), '_', sep='')
# calculate module scores
x <- Seurat::AddModuleScore(x, features = pgenes, name = names (
pgenes), ctrl=control_num)
# this function tends to append numbers after gene module names, need
# to clean them using regexp
module_names <- paste (names (pgenes), 1:length(pgenes), sep='')
module_names <- gsub ('-', '.', module_names)
module_scores <- t(x@meta.data [, module_names])
rownames (module_scores) <- gsub ('_[0-9]+$', '', rownames (module_scores))
utils::write.csv (module_scores, save_path)
}
# e.g. PI3K-Akt signaling may be changed to PI3K.Akt, need to
# change it back
rownames (module_scores) <- gsub ('\\.', '-', rownames (module_scores) )
if (!is.null (regexp)){
colnames (module_scores) <- gsub (regexp, '', colnames (module_scores))
}
if (!append_meta){return (module_scores)
}else{
metadata <- x@meta.data
module_scores <- module_scores [, match (rownames (metadata), colnames (module_scores) ) ]
path_seurat <- Seurat::CreateSeuratObject (module_scores, meta.data=metadata)
return (path_seurat)
}
}
#' Pathview
#'
#' @description Plot the average expression of a list of genes in one cell type
#' on a pathview plot
#' @param gene a vector of gene names
#' @param markers a dataframe of gene expression. It can be generated from
#' `find_DE_genes`
#' @param cell_type which cell type to analyse
#' @param organism_db the database for the organism to be investigated
#' @param pathname which pathway to view
#' @param save_dir where to save the results
#' @param pathway_db a dataframe with 2 columns: 'pathway' for the common names
#' of the pathway that is used for naming the output file, and 'kegg_id' for
#' the KEGG ID for that pathway. This ID may be obtained from:
#' https://www.genome.jp/kegg/pathway.html
get_pathway_view <- function (genes, markers, cell_type, organism_db, pathname,
save_dir, pathway_db=NULL, cluster_col='feature',
organism_name='human'){
if (is.null (pathway_db)){data(KeggID, package='TBdev') }
# setting up saving diretory
final_dir <- paste (save_dir, cell_type, sep='/')
if (!dir.exists (final_dir) ){dir.create (final_dir) }
path_ID <- KeggID [KeggID$pathway==pathname, 'kegg_id']
# obtain a vector of entrez ID
sel_markers <- markers [markers [, cluster_col] %in% genes, ]
geneList <- get_gene_list (sel_markers, cell_type, org.Hs.eg.db)
kegg_name <- get_kegg (organism_name)
pathview::pathview(gene.data = geneList,
pathway.id = path_ID,
species = kegg_name,
kegg.dir = final_dir
)
# move the file to `save_dir`
old_name <- paste (path_ID, '.pathview.png', sep='')
new_name <- paste ('Path_', pathname, '.png', sep='')
new_name <- paste (final_dir, new_name, sep='/')
file.rename (from=old_name, to=new_name)
}
#' Pathview
#'
#' @description Plot the average expression of a list of genes in one cell type
#' on pathview plots for the major pathways in embryological development.
#' @param gene a vector of gene names
#' @param markers a dataframe of gene expression. It can be generated from
#' `find_DE_genes`
#' @param cell_type which cell type to analyse
#' @param organism_db the database for the organism to be investigated
#' @param save_dir where to save the results
#' @param pathway_db a dataframe with 2 columns: 'pathway' for the common names
#' of the pathway that is used for naming the output file, and 'kegg_id' for
#' the KEGG ID for that pathway. This ID may be obtained from:
#' https://www.genome.jp/kegg/pathway.html
#' @export
get_all_path_view <- function (genes, markers, cell_type, organism_db,save_dir,
pathway_db=NULL, cluster_col='feature',
organism_name='human', ...){
if (is.null (pathway_db)){data(KeggID, package='TBdev'); pathway_db <- KeggID}
for (i in 1:nrow (pathway_db)){
get_pathway_view (genes, markers, cell_type, organism_db,
pathway_db$pathway[i], save_dir, ...)
}
junk <- dir (path = paste(save_dir, cell_type, sep='/'), pattern='hsa*.png')
file.remove (junk)
junk <- dir (path = paste(save_dir, cell_type, sep='/'), pattern='hsa*.xml')
file.remove (junk)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.