Nothing
## ----setup, eval = TRUE, echo = FALSE-----------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----installation, eval = FALSE-----------------------------------------------
# # To install scGPS from github (Depending on the configuration of the local
# # computer or HPC, possible custom C++ compilation may be required - see
# # installation trouble-shootings below)
# devtools::install_github("IMB-Computational-Genomics-Lab/scGPS")
#
# # for C++ compilation trouble-shooting, manual download and installation can be
# # done from github
#
# git clone https://github.com/IMB-Computational-Genomics-Lab/scGPS
#
# # then check in scGPS/src if any of the precompiled (e.g. those with *.so and
# # *.o) files exist and delete them before recompiling
#
# # then with the scGPS as the R working directory, manually install and load
# # using devtools functionality
# # Install the package
# devtools::install()
# #load the package to the workspace
# library(scGPS)
#
## ----scGPS_object, warning = FALSE, message = FALSE---------------------------
# load mixed population 1 (loaded from day_2_cardio_cell_sample dataset, named it as day2)
library(scGPS)
day2 <- day_2_cardio_cell_sample
mixedpop1 <- new_scGPS_object(ExpressionMatrix = day2$dat2_counts,
GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters)
# load mixed population 2 (loaded from day_5_cardio_cell_sample dataset, named it as day5)
day5 <- day_5_cardio_cell_sample
mixedpop2 <- new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters)
## ----prediction, warning = FALSE, message = FALSE-----------------------------
# select a subpopulation
c_selectID <- 1
# load gene list (this can be any lists of user selected genes)
genes <- training_gene_sample
genes <- genes$Merged_unique
# load cluster information
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
#run training (running nboots = 3 here, but recommend to use nboots = 50-100)
LSOLDA_dat <- bootstrap_prediction(nboots = 3, mixedpop1 = mixedpop1,
mixedpop2 = mixedpop2, genes = genes, c_selectID = c_selectID,
listData = list(), cluster_mixedpop1 = cluster_mixedpop1,
cluster_mixedpop2 = cluster_mixedpop2, trainset_ratio = 0.7)
names(LSOLDA_dat)
## ----summarise_results, warning = FALSE, message = FALSE----------------------
# summary results LDA
sum_pred_lda <- summary_prediction_lda(LSOLDA_dat = LSOLDA_dat, nPredSubpop = 4)
# summary results Lasso to show the percent of cells
# classified as cells belonging
sum_pred_lasso <- summary_prediction_lasso(LSOLDA_dat = LSOLDA_dat,
nPredSubpop = 4)
# plot summary results
plot_sum <-function(sum_dat){
sum_dat_tf <- t(sum_dat)
sum_dat_tf <- na.omit(sum_dat_tf)
sum_dat_tf <- apply(sum_dat[, -ncol(sum_dat)],1,
function(x){as.numeric(as.vector(x))})
sum_dat$names <- gsub("ElasticNet for subpop","sp", sum_dat$names )
sum_dat$names <- gsub("in target mixedpop","in p", sum_dat$names)
sum_dat$names <- gsub("LDA for subpop","sp", sum_dat$names )
sum_dat$names <- gsub("in target mixedpop","in p", sum_dat$names)
colnames(sum_dat_tf) <- sum_dat$names
boxplot(sum_dat_tf, las=2)
}
plot_sum(sum_pred_lasso)
plot_sum(sum_pred_lda)
# summary accuracy to check the model accuracy in the leave-out test set
summary_accuracy(object = LSOLDA_dat)
# summary maximum deviance explained by the model
summary_deviance(object = LSOLDA_dat)
## ----CORE, warning = FALSE, message = FALSE-----------------------------------
# find clustering information in an expresion data using CORE
day5 <- day_5_cardio_cell_sample
cellnames <- colnames(day5$dat5_counts)
cluster <-day5$dat5_clusters
cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames)
mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames)
CORE_cluster <- CORE_clustering(mixedpop2, remove_outlier = c(0), PCA=FALSE)
# to update the clustering information, users can ...
key_height <- CORE_cluster$optimalClust$KeyStats$Height
optimal_res <- CORE_cluster$optimalClust$OptimalRes
optimal_index = which(key_height == optimal_res)
clustering_after_outlier_removal <- unname(unlist(
CORE_cluster$Cluster[[optimal_index]]))
corresponding_cells_after_outlier_removal <- CORE_cluster$cellsForClustering
original_cells_before_removal <- colData(mixedpop2)[,2]
corresponding_index <- match(corresponding_cells_after_outlier_removal,
original_cells_before_removal )
# check the matching
identical(as.character(original_cells_before_removal[corresponding_index]),
corresponding_cells_after_outlier_removal)
# create new object with the new clustering after removing outliers
mixedpop2_post_clustering <- mixedpop2[,corresponding_index]
colData(mixedpop2_post_clustering)[,1] <- clustering_after_outlier_removal
## ----SCORE with bagging, warning = FALSE, message = FALSE---------------------
# find clustering information in an expresion data using SCORE
day5 <- day_5_cardio_cell_sample
cellnames <- colnames(day5$dat5_counts)
cluster <-day5$dat5_clusters
cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames)
mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames )
SCORE_test <- CORE_bagging(mixedpop2, remove_outlier = c(0), PCA=FALSE,
bagging_run = 20, subsample_proportion = .8)
## ----visualisation, dev='CairoPDF'--------------------------------------------
dev.off()
##3.2.1 plot CORE clustering
p1 <- plot_CORE(CORE_cluster$tree, CORE_cluster$Cluster,
color_branch = c("#208eb7", "#6ce9d3", "#1c5e39", "#8fca40", "#154975",
"#b1c8eb"))
p1
#extract optimal index identified by CORE
key_height <- CORE_cluster$optimalClust$KeyStats$Height
optimal_res <- CORE_cluster$optimalClust$OptimalRes
optimal_index = which(key_height == optimal_res)
#plot one optimal clustering bar
plot_optimal_CORE(original_tree= CORE_cluster$tree,
optimal_cluster = unlist(CORE_cluster$Cluster[optimal_index]),
shift = -2000)
##3.2.2 plot SCORE clustering
#plot all clustering bars
plot_CORE(SCORE_test$tree, list_clusters = SCORE_test$Cluster)
#plot one stable optimal clustering bar
plot_optimal_CORE(original_tree= SCORE_test$tree,
optimal_cluster = unlist(SCORE_test$Cluster[SCORE_test$optimal_index]),
shift = -100)
## ----compare_clustering, fig.width=5, fig.height=5----------------------------
t <- tSNE(expression.mat=assay(mixedpop2))
p2 <-plot_reduced(t, color_fac = factor(colData(mixedpop2)[,1]),
palletes =1:length(unique(colData(mixedpop2)[,1])))
p2
## ----find_markers, warning = FALSE, message = FALSE, fig.width=7--------------
#load gene list (this can be any lists of user-selected genes)
genes <-training_gene_sample
genes <-genes$Merged_unique
#the gene list can also be objectively identified by differential expression
#analysis cluster information is requied for find_markers. Here, we use
#CORE results.
#colData(mixedpop2)[,1] <- unlist(SCORE_test$Cluster[SCORE_test$optimal_index])
suppressMessages(library(locfit))
suppressMessages(library(DESeq))
DEgenes <- find_markers(expression_matrix=assay(mixedpop2),
cluster = colData(mixedpop2)[,1],
selected_cluster=unique(colData(mixedpop2)[,1]))
#the output contains dataframes for each cluster.
#the data frame contains all genes, sorted by p-values
names(DEgenes)
#you can annotate the identified clusters
DEgeneList_1vsOthers <- DEgenes$DE_Subpop1vsRemaining$id
#users need to check the format of the gene input to make sure they are
#consistent to the gene names in the expression matrix
#the following command saves the file "PathwayEnrichment.xlsx" to the
#working dir
#use 500 top DE genes
suppressMessages(library(DOSE))
suppressMessages(library(ReactomePA))
suppressMessages(library(clusterProfiler))
genes500 <- as.factor(DEgeneList_1vsOthers[seq_len(500)])
enrichment_test <- annotate_clusters(genes, pvalueCutoff=0.05, gene_symbol=TRUE)
#the enrichment outputs can be displayed by running
clusterProfiler::dotplot(enrichment_test, showCategory=10, font.size = 6)
## ----scGPS_prediction, warning = FALSE, message = FALSE-----------------------
#select a subpopulation, and input gene list
c_selectID <- 1
#note make sure the format for genes input here is the same to the format
#for genes in the mixedpop1 and mixedpop2
genes = DEgenes$DE_Subpop1vsRemaining$id[1:500]
#run the test bootstrap with nboots = 2 runs
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
LSOLDA_dat <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1,
mixedpop2 = mixedpop2, genes = genes, c_selectID = c_selectID,
listData = list(),
cluster_mixedpop1 = cluster_mixedpop1,
cluster_mixedpop2 = cluster_mixedpop2)
## ----summarise_prediction-----------------------------------------------------
#get the number of rows for the summary matrix
row_cluster <-length(unique(colData(mixedpop2)[,1]))
#summary results LDA to to show the percent of cells classified as cells
#belonging by LDA classifier
summary_prediction_lda(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster )
#summary results Lasso to show the percent of cells classified as cells
#belonging by Lasso classifier
summary_prediction_lasso(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster)
# summary maximum deviance explained by the model during the model training
summary_deviance(object = LSOLDA_dat)
# summary accuracy to check the model accuracy in the leave-out test set
summary_accuracy(object = LSOLDA_dat)
## ----prediction_one_sample, warning = FALSE, message = FALSE, fig.width=5-----
#run prediction for 3 clusters
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
#cluster_mixedpop2 <- as.numeric(as.vector(colData(mixedpop2)[,1]))
c_selectID <- 1
#top 200 gene markers distinguishing cluster 1
genes = DEgenes$DE_Subpop1vsRemaining$id[1:200]
LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop2,
mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(),
cluster_mixedpop1 = cluster_mixedpop2,
cluster_mixedpop2 = cluster_mixedpop2)
c_selectID <- 2
genes = DEgenes$DE_Subpop2vsRemaining$id[1:200]
LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(),
cluster_mixedpop1 = cluster_mixedpop2,
cluster_mixedpop2 = cluster_mixedpop2)
c_selectID <- 3
genes = DEgenes$DE_Subpop3vsRemaining$id[1:200]
LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(),
cluster_mixedpop1 = cluster_mixedpop2,
cluster_mixedpop2 = cluster_mixedpop2)
c_selectID <- 4
genes = DEgenes$DE_Subpop4vsRemaining$id[1:200]
LSOLDA_dat4 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(),
cluster_mixedpop1 = cluster_mixedpop2,
cluster_mixedpop2 = cluster_mixedpop2)
#prepare table input for sankey plot
LASSO_C1S2 <- reformat_LASSO(c_selectID=1, mp_selectID = 2,
LSOLDA_dat=LSOLDA_dat1,
nPredSubpop = length(unique(colData(mixedpop2)[,1])),
Nodes_group ="#7570b3")
LASSO_C2S2 <- reformat_LASSO(c_selectID=2, mp_selectID =2,
LSOLDA_dat=LSOLDA_dat2,
nPredSubpop = length(unique(colData(mixedpop2)[,1])),
Nodes_group ="#1b9e77")
LASSO_C3S2 <- reformat_LASSO(c_selectID=3, mp_selectID =2,
LSOLDA_dat=LSOLDA_dat3,
nPredSubpop = length(unique(colData(mixedpop2)[,1])),
Nodes_group ="#e7298a")
LASSO_C4S2 <- reformat_LASSO(c_selectID=4, mp_selectID =2,
LSOLDA_dat=LSOLDA_dat4,
nPredSubpop = length(unique(colData(mixedpop2)[,1])),
Nodes_group ="#00FFFF")
combined <- rbind(LASSO_C1S2,LASSO_C2S2,LASSO_C3S2, LASSO_C4S2 )
combined <- combined[is.na(combined$Value) != TRUE,]
nboots = 2
#links: source, target, value
#source: node, nodegroup
combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)],
Links=combined[,c((nboots+2):(nboots+1),ncol(combined))])
library(networkD3)
Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source)))
Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target)))
Node_all <-unique(c(Node_source, Node_target))
#assign IDs for Source (start from 0)
Source <-combined_D3obj$Links$Source
Target <- combined_D3obj$Links$Target
for(i in 1:length(Node_all)){
Source[Source==Node_all[i]] <-i-1
Target[Target==Node_all[i]] <-i-1
}
#
combined_D3obj$Links$Source <- as.numeric(Source)
combined_D3obj$Links$Target <- as.numeric(Target)
combined_D3obj$Links$LinkColor <- combined$NodeGroup
#prepare node info
node_df <-data.frame(Node=Node_all)
node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1)))
suppressMessages(library(dplyr))
Color <- combined %>% count(Node, color=NodeGroup) %>% select(2)
node_df$color <- Color$color
suppressMessages(library(networkD3))
p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df,
Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor", NodeID="Node",
Source="Source", Target="Target", fontSize = 22 )
p1
#saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html'))
## ----prediction_two_samples,warning = FALSE, message = FALSE, fig.width=5-----
#run prediction for 3 clusters
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
row_cluster <-length(unique(colData(mixedpop2)[,1]))
c_selectID <- 1
#top 200 gene markers distinguishing cluster 1
genes = DEgenes$DE_Subpop1vsRemaining$id[1:200]
LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1,
mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(),
cluster_mixedpop1 = cluster_mixedpop1,
cluster_mixedpop2 = cluster_mixedpop2)
c_selectID <- 2
genes = DEgenes$DE_Subpop2vsRemaining$id[1:200]
LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1,
mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(),
cluster_mixedpop1 = cluster_mixedpop1,
cluster_mixedpop2 = cluster_mixedpop2)
c_selectID <- 3
genes = DEgenes$DE_Subpop3vsRemaining$id[1:200]
LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1,
mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(),
cluster_mixedpop1 = cluster_mixedpop1,
cluster_mixedpop2 = cluster_mixedpop2)
#prepare table input for sankey plot
LASSO_C1S1 <- reformat_LASSO(c_selectID=1, mp_selectID = 1,
LSOLDA_dat=LSOLDA_dat1, nPredSubpop = row_cluster, Nodes_group = "#7570b3")
LASSO_C2S1 <- reformat_LASSO(c_selectID=2, mp_selectID = 1,
LSOLDA_dat=LSOLDA_dat2, nPredSubpop = row_cluster, Nodes_group = "#1b9e77")
LASSO_C3S1 <- reformat_LASSO(c_selectID=3, mp_selectID = 1,
LSOLDA_dat=LSOLDA_dat3, nPredSubpop = row_cluster, Nodes_group = "#e7298a")
combined <- rbind(LASSO_C1S1,LASSO_C2S1,LASSO_C3S1)
nboots = 2
#links: source, target, value
#source: node, nodegroup
combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)],
Links=combined[,c((nboots+2):(nboots+1),ncol(combined))])
combined <- combined[is.na(combined$Value) != TRUE,]
library(networkD3)
Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source)))
Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target)))
Node_all <-unique(c(Node_source, Node_target))
#assign IDs for Source (start from 0)
Source <-combined_D3obj$Links$Source
Target <- combined_D3obj$Links$Target
for(i in 1:length(Node_all)){
Source[Source==Node_all[i]] <-i-1
Target[Target==Node_all[i]] <-i-1
}
combined_D3obj$Links$Source <- as.numeric(Source)
combined_D3obj$Links$Target <- as.numeric(Target)
combined_D3obj$Links$LinkColor <- combined$NodeGroup
#prepare node info
node_df <-data.frame(Node=Node_all)
node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1)))
suppressMessages(library(dplyr))
n <- length(unique(node_df$Node))
getPalette = colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
Color = getPalette(n)
node_df$color <- Color
suppressMessages(library(networkD3))
p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df,
Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor",
NodeID="Node", Source="Source", Target="Target", fontSize = 22)
p1
#saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html'))
## ----session_info, include=TRUE, echo=TRUE, results='markup'------------------
devtools::session_info()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.