In neurogenomics/scNLP: NLP for Single-cell Omics.

knitr::opts_chunk$set(echo = T, fig.width = 7, fig.height = 5, root.dir=here::here())
knitr::opts_knit$set(root.dir=here::here())
print(here::here())

library(scNLP)
library(dplyr)

set.seed(2020)

library(future)
plan(strategy = "multicore", workers = future::availableCores()-1)
options(future.globals.maxSize = 8000 * 1024^2)

### Set conda env
python_path <- file.path(reticulate::miniconda_path(),"envs/onclass/bin/python")
reticulate::use_python(python_path, required = TRUE)
# reticulate::use_condaenv( file.path(reticulate::miniconda_path(),"envs/onclass"))

Intro

Cell-type labels are often provided by authors, but these labels are usually not harmonised across datasets. Rather than re-identified all each cell's identity based on its marker genes all over again, some NLP methods have been developed to align each existing cell-type label to a controlled ontology, such as the Cell Ontology.

This makes it so you can compare multiple datasets without having to manually map cell-type labels. scNLP provides easy-to-use wrappers for several of these kinds of tools.

Import data

Pseudo-bulk data

Here we use datasets with >100 pseudo-bulk "cells".

library(scNLP) # remotes::install_github('neurogenomics/scNLP')
obj <- scNLP::pseudo_seurat
obj$Celltypes <- obj$celltype
obj <- subset(obj, batch %in% c("LaManno2020","Raj2020","Aerts2020"))
seurat_list <- Seurat::SplitObject(obj, split.by = "batch") 

dplyr::group_by(obj@meta.data, batch) %>% dplyr::count()
# remove(obj)

Raw data

# obj <- SeuratObject::RenameAssays(obj, RNA="integrated")

# adat_files <- c("/Desktop/Aerts2018.orth.h5ad",
#                 "/Desktop/Aerts2021.orth.h5ad")
# library(scKirby)
# 
# seurat_list <- lapply(adat_files, function(x){
#   message(x)
#   seurat <- scKirby::ingest_data(x, output_type = "seurat") 
#   seurat$batch <- gsub(".orth.h5ad","",basename(x))
#   return(seurat)
# })

Human Primary cell Atlas

library(SingleCellExperiment)
library(celldex) # BiocManager::install("celldex")
hpca_se = celldex::HumanPrimaryCellAtlasData() 
SummarizedExperiment:::assay(hpca_se,"counts") <-  exp(SummarizedExperiment:::assay(hpca_se))

hpca_seurat <- scKirby::ingest_data(obj = as(hpca_se,"SingleCellExperiment"),
                                              output_type = "seurat", 
                                              save_output = FALSE)
hpca_seurat$Celltypes <- hpca_seurat$label.fine
hpca_seurat$batch <- "HPCA"
hpca_seurat$dataset <- "HPCA"
hpca_seurat$species <- "human"
#### Subset to only genes in seurat_list
hpca_seurat <- hpca_seurat[rownames(seurat_list[[1]]),]
hpca_seurat <- SeuratObject::RenameAssays(object = hpca_seurat, originalexp="RNA") 

seurat_list[["HPCA"]] <- hpca_seurat
# remove(hpca_se, hpca_seurat)

ELeFHAnt

Integrate

library(ELeFHAnt) # remotes::install_github('praneet1988/ELeFHAnt') 

seurat_integrated <- ELeFHAnt::LabelHarmonization(seurat.objects = seurat_list, 
                                                  classification.method = "Ensemble", 
                                                  downsample = TRUE)
saveRDS(seurat_integrated, "/Desktop/pseudo_bulk_ELeFHAnt.rds")

Visualise harmonised results

gp <- Seurat::DimPlot(seurat_integrated, 
                      group.by = c("batch","species",
                                   "HarmonizedLabels_UsingRF",
                                   "HarmonizedLabels_UsingSVM",
                                   "HarmonizedLabels_UsingEnsemble",
                                   "Celltypes")) +
  Seurat::NoLegend()



plotly::ggplotly(gp)

CelltypeAnnotation

library(SeuratObject)
out = ELeFHAnt::CelltypeAnnotation(reference = hpca_seurat,
                                   query = seurat_integrated,
                                   downsample = T,
                                   classification.method = "Ensemble")
gp <- Seurat::DimPlot(out, 
                      group.by = c("batch","species",
                                   "PredictedCelltype_UsingRF",
                                   "PredictedCelltype_UsingSVM",
                                   "PredictedCelltype_UsingEnsemble",
                                   "Celltypes")) +
  Seurat::NoLegend()

DeduceRelationship

reference2 <- subset(obj, batch!="HPCA")
reference2$Celltypes <- rownames(reference2@meta.data)

relate <- ELeFHAnt::DeduceRelationship(reference1 = hpca_seurat,
                                       reference2 = reference2,
                                       classification.method ="Ensemble")

print(relate)
sig_res <- data.table::data.table(relate$data[relate$data$Cells==1,])

saveRDS(relate, "/Desktop/pseudo_bulk_ELeFHAnt_relate.rds")

mat <- data.table::data.table(relate$data) %>%
  data.table::dcast.data.table(formula = reference1~ reference2, 
                               fun.aggregate = mean, 
                               value.var = "Cells", 
                               fill = 0, 
                               na.rm=TRUE) %>% 
  tibble::column_to_rownames("reference1") %>%
  as.matrix() 
rowmax <- DelayedArray::rowMaxs(mat, na.rm = T)

heatmap(mat[rowmax>0,])

OnClass

Sometimes pip install OnClass gets killed midway through installation. To get past this, run :

pip install OnClass --no-cache-dir

OnClass uses BERT to map ucontrolled celltype annotations to controlled celltype annotations.
Check mappings using OLS Ontology Search.

library(anndata)
tmp_file <- file.path(tempdir(),"pseudo_seurat.h5ad")
adat <- sceasy::convertFormat(obj = scNLP::pseudo_seurat, 
                              from = "seurat", to="anndata")
adat$write_h5ad(filename = tmp_file)

from os import chdir, getcwd
getcwd() # Must be in main scNLP dir to be able to import inst code

import anndata as ad
from scipy import stats, sparse
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import sys
from collections import Counter
from OnClass.OnClassModel import OnClassModel 
from OnClass.OnClass_utils import read_cell_type_nlp_network, read_h5ad,fine_nearest_co_using_nlp
from inst.OnClass_code.utils import read_ontology_file, read_data
from inst.OnClass_code.config import ontology_data_dir, scrna_data_dir, model_dir, Run_scanorama_batch_correction, NHIDDEN, MAX_ITER


train_file = r.tmp_file 
# root_dir = "/Desktop/" 
# train_file = root_dir+"DescartesHuman.orth.h5ad"
# train_file = root_dir+"Aerts2021.orth.h5ad"
adat1 = read_h5ad(train_file)
# adat2  = read_h5ad(test_file)
train_label = "celltype"
# train_label = 'level2'
# test_label = 'level1'

cell_type_nlp_emb_file, cell_type_network_file, cl_obo_file = read_ontology_file('cell ontology', "/Desktop/scNLP/inst/OnClass_code/Ontology_data/")

OnClass_train_obj = OnClassModel(cell_type_nlp_emb_file = cell_type_nlp_emb_file, cell_type_network_file = cell_type_network_file)


train_features, train_genes, train_labels, tissues, adat_sub  = read_data(feature_file=train_file, cell_ontology_ids = OnClass_train_obj.cell_ontology_ids,
        exclude_non_leaf_ontology = False,
        tissue_key = 'species', 
        AnnData_label_key = train_label, 
        filter_key = {},
        nlp_mapping = True, 
        cl_obo_file = cl_obo_file, 
        cell_ontology_file = cell_type_network_file, 
        co2emb = OnClass_train_obj.co2vec_nlp)

id_dict = dict(zip(train_labels, [x[0] for x in adat1.obs[[train_label]].values]))        
id_dict_rev = dict(zip([x[0] for x in adat1.obs[[train_label]].values], train_labels))
adat_sub.obs["cell_ontology_id"] = train_labels        


adat_sub.write("/Desktop/pseudo_bulk_OnClass.h5ad")


# train_label="cell_ontology_id"
# embed = OnClass_train_obj.EmbedCellTypes(train_Y_str=["Astrocyte-like","astrocyte","neuron"])
# 
# embed = pd.read_csv(cell_type_nlp_emb_file, sep="\t", index_col=0, header=None)
# 
# import umap 
# reducer = umap.UMAP()
# embedding = reducer.fit_transform(embed)
# embedding = pd.DataFrame(embedding, index=embed.index, columns=["UMAP1","UMAP2"])

Map back onto celltype names

Map Cell Ontology IDs back onto interpretable celltype names.

library(ontoProc)
adat = anndata::read_h5ad("/Desktop/pseudo_bulk_OnClass.h5ad")
# cl = ontoProc::getCellOnto()
# cl = ontoProc::getHCAOnto() ### This seems to work way better than cellOnto

cl <- ontologyIndex::get_OBO("inst/OnClass_code/Ontology_data/cl.obo")
adat$obs[["cell_ontology_name"]] = cl$name[adat$obs$cell_ontology_id] 

adat$write_h5ad("/Desktop/pseudo_bulk_OnClass.h5ad")

ontologyIndex

library(reticulate)
umap_df = py$embedding

obo <- ontologyIndex::get_OBO("inst/OnClass_code/Ontology_data/cl.obo")

umap_df$celltype <- obo$name[rownames(umap_df)]
ancest1 <- lapply(rownames(umap_df), function(x){ontologyIndex::get_term_property(ontology=obo, property="ancestors", term=x, as_names=TRUE)[1]}) %>%
         rownames(umap_df)

library(ggplot2)

u

gp <- ggplot(data = umap_df, aes(x=UMAP1, y=UMAP2, label=celltype)) +
  geom_point()

plotly::ggplotly(gp)

Session Info

utils::sessionInfo()

neurogenomics/scNLP documentation built on Oct. 8, 2024, 5:30 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

neurogenomics/scNLP
NLP for Single-cell Omics.

In neurogenomics/scNLP: NLP for Single-cell Omics.

Intro

Import data

Pseudo-bulk data

Raw data

Human Primary cell Atlas

ELeFHAnt

Integrate

CelltypeAnnotation

DeduceRelationship

OnClass

Map back onto celltype names

Session Info

R Package Documentation

Browse R Packages

We want your feedback!

neurogenomics/scNLP NLP for Single-cell Omics.

In neurogenomics/scNLP: NLP for Single-cell Omics.

Intro

Import data

Pseudo-bulk data

Raw data

Human Primary cell Atlas

ELeFHAnt

Integrate

CelltypeAnnotation

DeduceRelationship

OnClass

Map back onto celltype names

Session Info

R Package Documentation

Browse R Packages

We want your feedback!

neurogenomics/scNLP
NLP for Single-cell Omics.