Cas9 lineage barcodes

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(devtools)
#devtools::load_all("~/TedSim/TedSim_1.00/TedSim")
library(TedSim)
library(dplyr)
library(Seurat)
library(ggplot2)
library(reshape2)
library(ctc)

Fit the state tree and the branching possibilities

# Load the real dataset
data(larvae4_sub)
p1 <- DimPlot(larvae4_sub , reduction = "umap", pt.size = 0.5) + ggtitle(label = "UMAP")
p1

#Calculate cell type specific properties of the dataset
box_data_cell <- data.frame(celltype = character(),UMI_per_cell = numeric(),zeros_per_cell = numeric())
for (cluster in c("Epidermal cells A","Neutrophils","Macrophages","Keratinocytes A","Cone cells","Erythrocytes A","Endothelial cells","Chondrocytes A","Hepatocytes A","Lymphocytes A")){
  larvae4_ct <- subset(x = larvae4_sub, idents = c(cluster))
  counts_ct <- as.matrix(GetAssayData(object = larvae4_ct, slot = "counts"))
  #temp_data_gene <- data.frame(celltype = rep(cluster,length(rowMeans(counts_sub))),UMI_per_gene =  rowMeans(counts_sub),zeros_per_gene = rowMeans(counts_sub == 0))
  temp_data_cell <- data.frame(celltype = rep(cluster,length(colSums(counts_ct))),UMI_per_cell =  colMeans(counts_ct),zeros_per_cell = colMeans(counts_ct == 0),fill = rep("larvae4",length(colMeans(counts_ct))))
  #box_data_gene <- rbind(box_data_gene,temp_data_gene)
  box_data_cell <- rbind(box_data_cell,temp_data_cell)
}

zero_means <- aggregate(box_data_cell$zeros_per_cell, list(box_data_cell$celltype), mean)
UMI_means <- aggregate(box_data_cell$UMI_per_cell, list(box_data_cell$celltype), mean)


#Get state tree by hierachical clustering on the state means
ct_means <- AverageExpression(larvae4_sub,slot = 'scale.data')
ct_tree <- hclust(dist(t(ct_means$RNA)),method = 'ward.D')
phyla <- hc2Newick(ct_tree, flat=TRUE)
phyla <- read.tree(text = phyla)
phyla$edge.length <- ceiling(phyla$edge.length)
phyla$edge.length[phyla$edge.length == 0] <- 1
cell_table <- as.data.frame(table(Idents(larvae4_sub)))
cell_table$Var1 <- gsub(" ", "", cell_table$Var1, fixed = TRUE)

p_edge <- BranchCal(phyla,cell_table)

plot(phyla)
nodelabels()

Simulate datasets using the state tree and the branching possibilities

#20 datasets are generated (Cell Identity Factors), and the datasets are merged to generate true counts and observed counts together
n_samples <-20

#parameter settings
ncells <- 1024
N_nodes <- 2*ncells-2
ngenes <- 500
p_a <- 1
n_cif <- 30
n_diff <- 20
cif_step <- 1
p_d <- 0
N_char <- 32
max_walk <- 10
seeds <- sample(1:9999,n_samples)

#Initialize data and cell meta
states_leaves_combined <- data.frame(parent = numeric(), cluster = character(),depth = numeric(),cellID = numeric())
counts_combined <- c()
cell_meta_combined <- c()
scale_s_combined <- c()
cifs_combined <- lapply(c(1:3),function(parami){
  matrix(ncol = n_cif,nrow = n_samples*ncells)
})

#Simulate shared State Identity Factors for all datasets
SIF_res <- SIFGenerate(phyla,n_diff,step = cif_step)

#Generate Cell Identity Factors of 20 lineages/datasets
for (i in 1:n_samples){
  set.seed(seeds[i])
  print(seeds[i])
  cifs <- SimulateCIFs(ncells,phyla,p_a = p_a,n_CIF = n_cif,n_diff = n_diff,step = cif_step,p_d = p_d, Sigma = 0.5, N_char = 32, max_walk = max_walk, p_edge = p_edge, SIF_res = SIF_res, unif_on = FALSE)
  for (parami in c(1:3)){
    cif_leaves_all <- cifs[[1]][[parami]][c(1:ncells),]
    cifs_combined[[parami]][((i-1)*ncells + 1):(i*ncells),] <- cif_leaves_all
  }
  cell_meta_combined <- rbind(cell_meta_combined,cifs[[2]][1:ncells,])
  #cif_res <- list(cifs_combined,cifs[[2]])
  states <- cifs[[2]]
  states <- states[1:N_nodes,]
  states_leaves <- states[1:ncells,]

  #Fit scale_s to specific cell type's based on their UMI_means
  states_uniq <- c(5,2,1,4,3,7,6,9,10,8,11:19)
  scale_s_states <- c(0.03 * UMI_means$x,rep(0.001,9))
  scale_s <- states_leaves[,2]
  scale_s[scale_s %in% states_uniq] <- scale_s_states[match(scale_s, states_uniq, nomatch = 0)]
  #true_counts_res <- CIF2Truecounts(ngenes = 500,ncif = n_cif,ge_prob = 0.3,ncells = ncells, cif_res = cif_res, prop_hge = 0.03, mean_hge = 6,scale_s = scale_s)

  #assign cell types
  state_intermediate <- c(5,2,1,4,3,7,6,9,10,8,seq(11,19))
  state_merge_intermediate <- c(c("Epidermal cells A","Neutrophils","Macrophages","Keratinocytes A","Cone cells","Erythrocytes A","Endothelial cells","Chondrocytes A","Hepatocytes A","Lymphocytes A"),rep("intermediate",9))
  new_states <- states_leaves[,2]
  new_states[new_states %in% state_intermediate] <- state_merge_intermediate[match(new_states, state_intermediate, nomatch = 0)]
  states_leaves[,2] <- new_states

  scale_s_combined <- c(scale_s_combined,scale_s)

  states_leaves_combined <- rbind(states_leaves_combined,states_leaves)
}
cifs_combined <- list(cifs_combined,cell_meta_combined)

true_counts_res <- CIF2Truecounts(ngenes = 500,ncif = n_cif,ge_prob = 0.3,ncells = ncells * n_samples, cif_res = cifs_combined, prop_hge = 0.03, mean_hge = 6,scale_s = scale_s_combined)
umap_true_counts <- PlotUmap(meta=states_leaves_combined, data=log2(true_counts_res[[1]]+1), n_pc=30, label='cluster', saving = F, plotname="Differentiating population (true counts)")
umap_true_counts[[2]] + ggtitle("Continuous population (true counts)") + xlab("UMAP1") + ylab("UMAP 2") +  theme(axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20))

Compare the properties of simulated datasets and the real dataset

# Read the pre-computed data (true counts)
states_leaves_combined <- read.table(file = "./cell_meta_larvae4_20.txt",sep = "\t")
counts_combined <- read.table(file = "./counts_larvae4_20.txt",sep = "\t",header = TRUE)
observed_counts <- read.table(file = "./observed_counts_larvae4_20.txt",sep = "\t",header = TRUE)

# Calculate and compare cell type composition with the real data
cell_type_decomp_tedsim <- table(states_leaves_combined[,2])
cell_type_decomp_tedsim <- as.data.frame(cell_type_decomp_tedsim/sum(cell_type_decomp_tedsim))
cell_type_decomp_tedsim <- cell_type_decomp_tedsim[cell_type_decomp_tedsim$Var1 != "intermediate",]
cell_type_decomp_tedsim$fill <- rep("TedSim",length(cell_type_decomp_tedsim$Var1))
cell_type_decomp_larvae <- table(Idents(larvae4_sub))
cell_type_decomp_larvae <- as.data.frame(cell_type_decomp_larvae/sum(cell_type_decomp_larvae))
cell_type_decomp_larvae$fill <- rep("Larvae4",length(cell_type_decomp_larvae$Var1))
cell_type_decomp <- rbind(cell_type_decomp_tedsim,cell_type_decomp_larvae)
p <- ggplot(cell_type_decomp, aes(Var1, y = Freq, fill = fill))
p <- p + geom_bar(aes(x=Var1,y=Freq,fill = fill),stat="identity",position = 'dodge') + labs(fill = " ")+ ylab("Percentage of cells")
p + ylim(0,1) + theme(axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20))
#ggsave('./output/cell_type_compare.png',width = 2300,height = 700)

# Test different technical noise levels on the datasets
data(gene_len_pool)
gene_len <- sample(gene_len_pool, ngenes, replace = FALSE)
observed_counts <- True2ObservedCounts(true_counts=true_counts_res[[1]], meta_cell=true_counts_res[[3]], protocol="UMI", alpha_mean=0.1, alpha_sd=0.1, gene_len=gene_len, depth_mean=1e5, depth_sd=3e3)
observed_counts <- observed_counts$counts

box_data_gene <- data.frame(celltype = character(),UMI_per_gene = numeric(),zeros_per_gene = numeric())
box_data_cell <- data.frame(celltype = character(),UMI_per_cell = numeric(),zeros_per_cell = numeric())
cell_Types <- c("Epidermal cells A","Neutrophils","Macrophages","Keratinocytes A","Cone cells","Erythrocytes A","Endothelial cells","Chondrocytes A","Hepatocytes A","Lymphocytes A")
for (cluster in cell_Types){
  observed_counts_sub <- observed_counts[,states_leaves_combined[,2] == cluster]
  observed_counts_sub <- log2(observed_counts_sub+1)
  temp_data_gene <- data.frame(celltype = rep(cluster,length(rowMeans(observed_counts_sub))),UMI_per_gene =  rowMeans(observed_counts_sub),zeros_per_gene = rowMeans(observed_counts_sub == 0))
  temp_data_cell <- data.frame(celltype = rep(cluster,length(colMeans(observed_counts_sub))),UMI_per_cell =  colMeans(observed_counts_sub),zeros_per_cell = colMeans(observed_counts_sub == 0))
  box_data_gene <- rbind(box_data_gene,temp_data_gene)
  box_data_cell <- rbind(box_data_cell,temp_data_cell)
}
box_data_cell$fill <- rep("TedSim",length(box_data_cell$UMI_per_cell))
box_data_gene$fill <- rep("TedSim",length(box_data_gene$UMI_per_gene))

for (cluster in c("Epidermal cells A","Neutrophils","Macrophages","Keratinocytes A","Cone cells","Erythrocytes A","Endothelial cells","Chondrocytes A","Hepatocytes A","Lymphocytes A")){
  larvae4_ct <- subset(x = larvae4_sub, idents = c(cluster))
  counts_sub <- as.matrix(GetAssayData(object = larvae4_ct, slot = "counts"))
  counts_sub <- log2(counts_sub+1)
  temp_data_gene <- data.frame(celltype = rep(cluster,length(rowMeans(counts_sub))),UMI_per_gene =  rowMeans(counts_sub),zeros_per_gene = rowMeans(counts_sub == 0),fill = rep("larvae4",length(rowMeans(counts_sub))))
  temp_data_cell <- data.frame(celltype = rep(cluster,length(colSums(counts_sub))),UMI_per_cell =  colMeans(counts_sub),zeros_per_cell = colMeans(counts_sub == 0),fill = rep("larvae4",length(colMeans(counts_sub))))
  box_data_gene <- rbind(box_data_gene,temp_data_gene)
  box_data_cell <- rbind(box_data_cell,temp_data_cell)
}

p <- ggplot(box_data_cell, aes(celltype, UMI_per_cell,fill = fill))
p <- p + geom_boxplot(aes(fill = box_data_cell$fill)) + labs(fill="") + ylab("log transformed count per cell")
p+ theme(axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20))
#ggsave('./tedsim_plots_paper/larvae_4/counts_per_gene_larvae4_0.1.png',width = 2200,height = 700, limitsize = FALSE)
p <- ggplot(box_data_cell, aes(celltype, zeros_per_cell,fill = fill))
p <- p + geom_boxplot(aes(fill = box_data_cell$fill)) + labs(fill="") + ylab("zero percentage per cell")
p + ylim(0,1) +  theme(axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20))
#ggsave('./output/zeros_larvae4_0.1.png',width = 2200,height = 700)

p <- ggplot(box_data_gene, aes(celltype, UMI_per_gene,fill = fill))
p <- p + geom_boxplot(aes(fill = box_data_gene$fill)) + labs(fill="") + ylab("log transformed count per gene")
p+ theme(axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20))
#ggsave('./tedsim_plots_paper/larvae_4/counts_per_gene_larvae4_0.1.png',width = 2200,height = 700, limitsize = FALSE)
p <- ggplot(box_data_gene, aes(celltype, zeros_per_gene,fill = fill))
p <- p + geom_boxplot(aes(fill = box_data_gene$fill)) + labs(fill="") + ylab("zero percentage per gene")
p + ylim(0,1) +  theme(axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20))


#ggsave('./tedsim_plots_paper/larvae_4/umap_true_counts_larvae4.png',width = 1000,height = 700)

# Compare cross-cluster distances between simulated datasets and the real dataset
average_expression <- c()
average_expression_real <- c()
cluster_list <- c("Epidermal cells A","Neutrophils","Macrophages","Keratinocytes A","Cone cells","Erythrocytes A","Endothelial cells","Chondrocytes A","Hepatocytes A","Lymphocytes A")
for (cluster in cluster_list){
  observed_counts_sub <- observed_counts[,states_leaves_combined[,2] == cluster]
  average_expression <- rbind(average_expression, rowMeans(observed_counts_sub))

  larvae4_ct <- subset(x = larvae4_sub, idents = c(cluster))
  counts_sub <- as.matrix(GetAssayData(object = larvae4_ct, slot = "counts"))
  average_expression_real <- rbind(average_expression_real, rowMeans(counts_sub))
}

rownames(average_expression) <- cluster_list
cell_type_cor <- round(cor(t(average_expression)),2)
cell_type_cor_melt <- melt(cell_type_cor)
p <- ggplot(data = cell_type_cor_melt, aes(x=Var1,y=Var2,fill=value))
p <- p + geom_tile() + xlab("") +ylab("")
p+  theme(axis.text.x = element_text(angle = 90),axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20), legend.title = element_text(size = 25))


rownames(average_expression_real) <- cluster_list
cell_type_cor_real <- round(cor(t(average_expression_real)),2)
cell_type_cor_melt_real <- melt(cell_type_cor_real)
p <- ggplot(data = cell_type_cor_melt_real, aes(x=Var1,y=Var2,fill=value))
p <- p + geom_tile() + xlab("") +ylab("")
p+  theme(axis.text.x = element_text(angle = 90),axis.text = element_text(size = 20), axis.title = element_text(size = 30),legend.text = element_text(size = 20), legend.title = element_text(size = 25))

Galaxeee/TedSim documentation built on Oct. 2, 2022, 1:25 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Galaxeee/TedSim
Temporal Dynamics Simulation of scRNA-seq and CRISR/Cas9 lineage barcodes

In Galaxeee/TedSim: Temporal Dynamics Simulation of scRNA-seq and CRISR/Cas9 lineage barcodes

Fit the state tree and the branching possibilities

Simulate datasets using the state tree and the branching possibilities

Compare the properties of simulated datasets and the real dataset

R Package Documentation

Browse R Packages

We want your feedback!

Galaxeee/TedSim Temporal Dynamics Simulation of scRNA-seq and CRISR/Cas9 lineage barcodes

In Galaxeee/TedSim: Temporal Dynamics Simulation of scRNA-seq and CRISR/Cas9 lineage barcodes

Fit the state tree and the branching possibilities

Simulate datasets using the state tree and the branching possibilities

Compare the properties of simulated datasets and the real dataset

R Package Documentation

Browse R Packages

We want your feedback!

Galaxeee/TedSim
Temporal Dynamics Simulation of scRNA-seq and CRISR/Cas9 lineage barcodes