examples/code/gastrulationE8.25_Pijuan-Sala2019/2.2_E8.25_mesoderm_runBioTP_different.clustering.R

# This code doing:
# 1) Run BioTIP on 1362 cells clustered with different clustering methods, respectively.
## The 1362 cells cover the major HEP processes (i.e., cells of the original C3, C13, C10, C6, C15, C7),
# generated by the code BioTIP_E8.25_mesoderm_add.clusterIDs.R.
# The different clustering methods are:
# SNNGraph clusters (scran), soft-thresholding clusters with or without the transition cells (QUANTC), 
## consensus clusters with two different parameters (SC3), 
## and Leiden clusters with 4 resolution settings (Seurat4).
#
# Note that the original CTS identification used the clustering of 7k cells (by SNNGraph) 
# 
# last update 1/31/2022
# by Holly yang

setwd('F:/projects/scRNA/results/GSE87038_gastrulation/uncorrected/E8.25_mesoderm_robustness/')
source(file='F:/projects/scRNA/source/GSE87038_gastrulation/GSE87038_git_copy/stability_score.R') 

# the following funciton has been updated on github, but before updating the package to Bioconductor, you need to call it locally to test
source(file='F:/projects/scRNA/source/GSE87038_gastrulation/GSE87038_git_copy/ForJennifer/optimize.sd_selection.R') 
source(file='F:/projects/scRNA/source/GSE87038_gastrulation/GSE87038_git_copy/BioTIP.wrap.R')


# normalized data were downloaded and reanalyzed
# refer to GSE87038_E8.25_normalize.R
library(BioTIP)
library(dplyr)
library(SingleCellExperiment)

library(ggplot2)
library(gridExtra)  # required by grid.arrange()
library(ggpubr)  # required by p-values on bar


###################################################
## load the R object ##
## focusing on 1.3k cells of the HEP processes
## refer to BioTIP_E8.25_mesoderm_add.clusters.R
###################################################

load('sce_E8.25_HEP.RData')
sce
# sceclass: SingleCellExperiment 
# dim: 3073 1362 
# metadata(0):
#   assays(2): counts logcounts
# rownames(3073): Phlda2 Myl7 ... Hmcn1 Tfdp2
# rowData names(2): ENSEMBL SYMBOL
# colnames(1362): cell_63240 cell_63242 ... cell_95715 cell_95717
# colData names(21): cell barcode ... C_Leiden_0.8 C_Leiden_1.2
# reducedDimNames(5): pca.corrected umap TSNE UMAP force
# altExpNames(0):

# dim(dec.pois)
# # [1]  10938     7

load(file='CTS_ShrinkM_E8.25_HEP.RData')
dim(M) #3073 3073
all(rownames(sce)==rownames(M))  #TRUE
  
levels(sce$label)  # '3', '13', '10','6','15', '7'
table(sce$C_QuanTC_k4, sce$C_QuanTC_k6)
#     C1  C2  C3  C4  C5  C6  TC
# C1   0   0 235   0   0   0   5
# C2 161   0   0   0   0   0   0
# C3   0   0   0   0 181 174 162
# C4   0 111   0 254   0   0  14
# TC   0   0   0   0   0   0  65

############################################################
## 1) running BioTIP on different cell clustering outputs ##
############################################################

######### 1) setting parameters,   ######################
localHVG.preselect.cut = 0.1 # A positive numeric value setting how many propotion of global HVG to be selected per cell cluster
getNetwork.cut.fdr = 0.2  # to construct RW network and extract co-expressed gene moduels
getTopMCI.gene.minsize = 30  # min number of genes in an identified CTS
getTopMCI.n.states = 5  # A number setting the number of states to check the significance of DNB score (i.e. MCI) 
                        # This parameter can be enlarge when inputting more clusters, usually the expected number of transition states +1
MCIbottom = 2  # A number setting the threshold to prioritize the initially selected CTS candidates.
# In our experiment, a number between 2 and 4 generated expected resutls.


############### all new clustering outputs  ##################
x <- grep("C_", colnames(colData(sce)))
colnames(colData(sce))[x]
# [1] "C_Soft_k4"        "C_Soft_k6"        "C_consensus_ks4"  "C_consensus_ks5" 
# [5] "C_consensus_ks6"  "C_consensus_ks7"  "C_consensus_ks8"  "C_consensus_ks9" 
# [9] "C_consensus_ks10" "C_Leiden_0.4"     "C_Leiden_0.8"      "C_Leiden_1.2" 

#set.seed(102020)
for(i in 1:length(x)){
  samplesL <- split(rownames(colData(sce)), f = colData(sce)[x[i]])
  (tmp = lengths(samplesL))
  
 # getTopMCI.n.states = ifelse(length(samplesL)<10, 3, 4)
  res <- BioTIP.wrap(sce, samplesL, subDir=colnames(colData(sce))[x[i]],
                     getTopMCI.n.states=getTopMCI.n.states, 
                     getTopMCI.gene.minsize=getTopMCI.gene.minsize, 
                     MCIbottom=MCIbottom,
                     localHVG.preselect.cut=localHVG.preselect.cut, 
                     getNetwork.cut.fdr=getNetwork.cut.fdr, 
                     M=M,
                     verbose=FALSE, plot=TRUE)           
  save(res, file=paste0(colnames(colData(sce))[x[i]],'/BioTIP.res.RData'))  
  
}



############### for k=4, soft clustering generate 5 clusters  ##################
samplesL <- split(rownames(colData(sce)), f = colData(sce)$C_Soft_k4)
(tmp = lengths(samplesL))
#  C1  C2  C3  C4  TC 
# 240 161 517 379  65 

# getTopMCI.n.states = ifelse(length(samplesL)<10, 3, 4) # this parameter is larger when inputting more clusters
## 2nd run: without TC 
samplesL = samplesL[1:4]
res <- BioTIP.wrap(sce, samplesL, subDir='C_Soft_k4.wo.TC',
                   getTopMCI.n.states=getTopMCI.n.states, 
                   getTopMCI.gene.minsize=getTopMCI.gene.minsize, 
                   MCIbottom=MCIbottom,
                   localHVG.preselect.cut=localHVG.preselect.cut, 
                   getNetwork.cut.fdr=getNetwork.cut.fdr, 
                   M=M,
                   verbose=FALSE, plot=TRUE)           

save(res, file='C_Soft_k4.wo.TC/BioTIP.res.RData')  


############### for k=6, soft clustering generate 7 clusters  ##################
## first run: with TC 
samplesL <- split(rownames(colData(sce)), f = colData(sce)$C_Soft_k6)
(tmp = lengths(samplesL))
# C1  C2  C3  C4  C5  C6  TC 
# 161 111 235 254 181 174 246

## 2nd run: without TC 
samplesL = samplesL[1:6]
res <- BioTIP.wrap(sce, samplesL, subDir='C_Soft_k6.wo.TC',
                   getTopMCI.n.states=getTopMCI.n.states, 
                   getTopMCI.gene.minsize=getTopMCI.gene.minsize, 
                   MCIbottom=MCIbottom,
                   localHVG.preselect.cut=localHVG.preselect.cut, 
                   getNetwork.cut.fdr=getNetwork.cut.fdr, 
                   M=M,
                   verbose=FALSE, plot=TRUE)           

save(res, file='C_Soft_k6.wo.TC/BioTIP.res.RData')  

## additional run for all published SNNGraph but 1.3k cells
samplesL <- split(rownames(colData(sce)), f = colData(sce)$label)
(tmp = lengths(samplesL))
#   3  13  10   6  15   7
# 381 223 278 283  60 137

res <- BioTIP.wrap(sce, samplesL, subDir='C_SNNGraph',
                   getTopMCI.n.states=getTopMCI.n.states, 
                   getTopMCI.gene.minsize=getTopMCI.gene.minsize,
                   MCIbottom=MCIbottom,
                   localHVG.preselect.cut=localHVG.preselect.cut, 
                   getNetwork.cut.fdr=getNetwork.cut.fdr,
                   M=M,
                   verbose=FALSE, plot=TRUE)

save(res, file='C_SNNGraph/BioTIP.res.RData')



## additional re-run for all published 7240 cells  
getTopMCI.n.states = 10  ## enlarge the scanning range of clusters with relatively higehr CTS cores !!!
load('../E8.25_mesoderm/sce_E8.25_uncorrected.RData')
sce # 10938 7240 
sce <- sce[rownames(M),]

samplesL <- split(rownames(colData(sce)), f = colData(sce)$label)
(tmp = lengths(samplesL))
#   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19
# 845  337  381 1025  743  283  137  800  534  278  183  466  223  397   60  175  107  203   63

# For degubing purpose, to speedup the progress, using the previously selected local HVG (which runs ~ 1hour on this big dataset)
# load('original_run/optimized_test_sd_selection_E8.25.RData')
# class(testres)
# # [1] "list"
# all(names(testres) %in% names(samplesL))  #TRUE
# dim(testres[[1]])
# # [1] 282 845
# dim(testres[[2]])
# # [1] 288 337
set.seed(2020)
res <- BioTIP.wrap(sce, samplesL, subDir='C_SNNGraph_allcells',
                   getTopMCI.n.states=getTopMCI.n.states, 
                   getTopMCI.gene.minsize=getTopMCI.gene.minsize, 
                   MCIbottom=MCIbottom,
                   localHVG.preselect.cut=localHVG.preselect.cut, 
                   getNetwork.cut.fdr=getNetwork.cut.fdr, 
                   M=M, 
                   IC.rank=2,  ## adjusted for the similar clusters C10 and C13
                   verbose=TRUE, plot=TRUE)           

save(res, file='C_SNNGraph_allcells/BioTIP.res.RData')  



#############
## manually copy the original BioTIP results
# load('../E8.25_mesoderm/BioTIP_top0.1FDR0.2/4CTS.Lib_GRanges.mm9.RData')
# lengths(gr)
# # 15  6 16 13 
# # 67 90 79 60 
# CTS <- lapply(gr, names)
# save(CTS, file='original_run/CTS.RData')

## manually copy the original QuanTC results, only for the HEP->EP bifurcation that best overlap with our eHEP signal at C13
CTS <- list()
tmp <- read.table('../../QuanTC/QuanTC_HEP/k4/C1start/transition_gene_name1.1.txt')
CTS[[1]] <- tmp[,1]
tmp <- read.table('../../QuanTC/QuanTC_HEP/k4/C1start/transition_gene_name1.2.txt')
CTS[[1]] <- c(CTS[[1]], tmp[,1])
length(CTS[[1]])  # 40
CTS[[1]] <- unique(CTS[[1]])
length(CTS[[1]]) # 40
save(CTS, file='QuanTC.k4_run/CTS.RData')

CTS <- list()
tmp <- read.table('../../QuanTC/QuanTC_HEP/k6/C6start/transition_gene_name2.1.txt')
CTS[['C6.start']]  <- tmp[,1]
tmp <- read.table('../../QuanTC/QuanTC_HEP/k6/C6start/transition_gene_name2.2.txt')
CTS[['C6.start']] <- c(CTS[['C6.start']], tmp[,1])

tmp <- read.table('../../QuanTC/QuanTC_HEP/k6/C3start/transition_gene_name3.1.txt')
CTS[['C3.start']] <- tmp[,1]
tmp <- read.table('../../QuanTC/QuanTC_HEP/k6/C3start/transition_gene_name3.2.txt')
CTS[['C3.start']] <- c(CTS[['C3.start']], tmp[,1])

lengths(CTS)
# C6.start C3.start 
# 33        9 

save(CTS, file='QuanTC.k6_run/CTS.RData')
xyang2uchicago/NPS documentation built on June 30, 2024, 10:15 p.m.