knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warning = FALSE )
Set up python environment to call the script for GPLVM. Because the script for GPLVM is written in python, to call this script in R, it is necessary to install the reticulate package. Next, a python virtual environment needs to be created as followed:
```{bash, eval= FALSE} conda create -n ptime python=3.6 conda activate ptime conda install -c conda-forge tensorflow==1.13.1 conda install pip pip install gpflow==1.0.0 conda install -c bioconda anndata
```r reticulate::use_condaenv("ptime") # where the python files are located within the package: script_root <- system.file("python/", package='TBdev') reticulate::source_python (paste (script_root, 'GPLVM/GrandPrix.py', sep='/')) reticulate::source_python (paste (script_root, 'STREAM/fitting_prob.py', sep='/')) library (TBdev)
Load a seurat object that has been saved in the directory data
.
root <- '.' data_dir <- paste (root, 'data', sep='/') all_data <- get (load (paste (data_dir, 'final_merged_tb.Robj', sep='/') )) AP <- list (pointsize=3, legend_point_size=3, fontsize=11, point_fontsize=4, font_fam= 'sans') plot_dim_red (all_data, group.by = c('broad_type', 'date'), DR='pca', dims=c(1,2,3), all_theta=50, AP=AP )
Select the cells in the trophoblast lineage
# save from cMor data (CT) tb_data <- all_data [, ! (all_data$revised %in% c(CT$in_vitro_cells, CT$non_emb_lineage, 'Oocyte', 'Zy', '2C', '4C', '8C', 'PE', 'EPI', 'PSA-EPI' ))] exp_mat <- save_to_csv (tb_data, select_genes=4000) # scale the data as required by GPLVM Y <- scale(t(exp_mat))
It is highly recommended to run the python script on a GPU.
That is, save the expression matrix and then run inst/python/GPLVM/gplvm_tb.py
.
However, for illustration in this R markdown, the code below is run on a CPU
for 100 epochs of iteration only. For this dataset, convergence is usually
reached after 50~100 epochs.
It may be better to set a higher number of epochs e.g. 1000 to ensure
convergence for other datasets in general.
reticulate::py_set_seed (100) Grand_out = fit_model (data=Y, # rows are cells and columns are genes n_latent_dims=3L, #here 3 latent dimensions are computed n_inducing_points=10L, #number of inducing points latent_prior_var=1., maxitr=100L, #maximum epoch of iteration disp=T, return_pred=T, kernel=list ('name'='RBF', 'ls'=1.0, 'var'=1.0) # Use RBF kernel )
rm (Y)
Visualise the result
tb_data <- RunGPLVM (tb_data, Grand_out[[1]][[1]]) plot_dim_red (tb_data, group.by = c('broad_type', 'date'), DR='gplvm', dims=c(1,2,3), all_theta=50, AP=AP )
Another benefit of GPLVM is infer the gene expression distribution at given pseudotime location. We can determine the probability of a cell whose expression fits into this distribution. To illustrate, we will compare the human trophoblast stem cells (hTSC) with those along the trophoblast lineage. Human embryonic stem cells (hESC) will also be compared.
in_vitro <- all_data [rownames (exp_mat),all_data$broad_type %in% c('htsc_okae', 'htsc_turco', 'hesc')] in_vitro_mat <- as.matrix (seurat::getassaydata (in_vitro, assay='rna', slot='data')) in_vitro_met <- in_vitro@meta.data fit_chance <- fitting_likelihood (data.frame (exp_mat), new_x = data.frame (in_vitro_mat), new_meta = in_vitro_met, group_by = 'broad_type', pt_mean = data.frame (grand_out[[2]][[1]]), pt_var = data.frame (grand_out[[2]][[2]]) )
Visualise the results on the latent dimension inferred by GPLVM
meta <- tb_data@meta.data tb_data@meta.data <- cbind (meta [, !colnames (meta) %in% colnames (fit_chance)], fit_chance) # shorten the name a bit for plotting colnames (tb_data@meta.data) <- gsub ('^hTSC_', '', colnames (tb_data@meta.data)) plot_dim_red (tb_data, group.by = c('broad_type', 'hESC', 'OKAE', 'TURCO'), DR='gplvm', dims=c(1,2,3), all_theta=50, AP=AP, label_col='broad_type' #label the cell types in all subplots )
rm (list = ls ())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.