library(knitr)
opts_chunk$set(fig.align = 'center', fig.width = 6, fig.height = 5, dev = 'png')

Introduction

'FastSC3' is an extension of the 'SC3' package to very large datasets. While using some approximations, it allows a user to cluster the dataset without using the hybrid (SVM) strategy and to work on the total number of cells.

'FastSC3' depends on 'SC3' so that all the 'SC3' methods are accessible from 'FastSC3'.

Quick Start

FastSC3 Input

The input for FastSC3 is an object of class scater. Here we will create it from the dataset ('treutlein') that comes with the package:

Macosko

library(scater)
library(FastSC3)
library(pheatmap)

# macosko <- readRDS("~/Dropbox/scRNASeq-public-datasets/drosophila-retina/intermediate/macosko.rds")
# macosko <- macosko[!duplicated(unlist(lapply(strsplit(rownames(macosko), ":"), "[[", 3))), ]
# rownames(macosko) <- unlist(lapply(strsplit(rownames(macosko), ":"), "[[", 3))
# cell_info <- data.frame(cell_id = as.numeric(colnames(macosko)))
# cell_info$cell_type <- "rods"
# cell_info$cell_type[cell_info$cell_id == 1] <- "horizontal"
# cell_info$cell_type[cell_info$cell_id == 2] <- "ganglion"
# cell_info$cell_type[cell_info$cell_id %in% 3:23] <- "amacrine"
# cell_info$cell_type[cell_info$cell_id == 25] <- "cones"
# cell_info$cell_type[cell_info$cell_id %in% 26:33] <- "bipolar"
# cell_info$cell_type[cell_info$cell_id == 34] <- "muller"
# cell_info$cell_type[cell_info$cell_id == 35] <- "astrocytes"
# cell_info$cell_type[cell_info$cell_id == 36] <- "fibroblasts"
# cell_info$cell_type[cell_info$cell_id == 37] <- "vascular_endothelium"
# cell_info$cell_type[cell_info$cell_id == 38] <- "pericytes"
# cell_info$cell_type[cell_info$cell_id == 39] <- "microglia"
# cell_inds <- paste("Cell", 1:ncol(macosko), sep = "_")
# rownames(cell_info) <- cell_inds
# cell_exprs <- macosko
# colnames(cell_exprs) <- cell_inds
# pd <- new("AnnotatedDataFrame", data = cell_info)
# sceset <- newSCESet(countData = cell_exprs, phenoData = pd)
# sceset <- calculateQCMetrics(sceset)
# sceset@featureData@data$feature_symbol <- featureNames(sceset)
# saveRDS(sceset, file = "~/Dropbox/scRNASeq-public-datasets/drosophila-retina/macosko_sceset.rds")

sceset <- readRDS("~/Dropbox/scRNASeq-public-datasets/drosophila-retina/macosko_sceset.rds")
gc()
ptm <- proc.time()
# prepare for k prediction
sceset <- sc3_prepare(sceset, ks = 38:40, svm_max = Inf)
sceset <- fsc3_get_hyperplanes(sceset, n_genes = 100, suppress_plot = F)
sceset <- fsc3_get_signatures(sceset)
sceset <- fsc3_get_buckets(sceset, common_bits = 75)
sceset <- fsc3_get_buckets_signatures(sceset)

sceset <- fsc3_select_repr_cells(sceset)
# run normal SC3 pipeline on the selected cells
sceset <- sc3_calc_dists(sceset)
sceset <- sc3_calc_transfs(sceset)
sceset <- sc3_kmeans(sceset)
sceset <- sc3_calc_consens(sceset)
# infer cluster indexes from buckets
sceset <- fsc3_get_clusters(sceset, k = k)
# summarise results
sceset <- sc3_summarise_results(sceset, k = k)
mclust::adjustedRandIndex(sceset@sc3$results$clusters[,1], as.numeric(pData(sceset)[,1]))
proc.time() - ptm
gc()

Shekhar

library(scater)
library(FastSC3)
library(pheatmap)
# 
# shekhar <- readRDS("~/Dropbox/scRNASeq-public-datasets/drosophila-retina/intermediate/shekhar.rds")
# cell_info <- readRDS("~/Dropbox/scRNASeq-public-datasets/drosophila-retina/intermediate/shekhar_ann.rds")
# cell_info$cell_type <- "unknown"
# cell_info$cell_type[cell_info$louvain_clusts %in% c(1, 3:15)] <- "bipolar"
# cell_info$cell_type[cell_info$louvain_clusts == 2] <- "muller"
# cell_info$cell_type[cell_info$louvain_clusts == 16] <- "amacrine"
# cell_info$cell_type[cell_info$louvain_clusts == 20] <- "rods"
# cell_info$cell_type[cell_info$louvain_clusts == 22] <- "cones"
# pd <- new("AnnotatedDataFrame", data = cell_info)
# sceset <- newSCESet(countData = shekhar, phenoData = pd)
# sceset <- calculateQCMetrics(sceset)
# sceset@featureData@data$feature_symbol <- featureNames(sceset)
# saveRDS(sceset, file = "~/Dropbox/scRNASeq-public-datasets/drosophila-retina/shekhar_sceset.rds")

sceset <- readRDS("~/Dropbox/scRNASeq-public-datasets/drosophila-retina/shekhar_sceset.rds")
gc()
ptm <- proc.time()
# prepare for k prediction
sceset <- sc3_prepare(sceset, ks = 2:26, svm_max = Inf)
sceset <- fsc3_get_features(sceset)
sceset <- fsc3_get_signatures(sceset)

sceset <- fsc3_get_buckets(sceset, common_bits = 61, runs = 10)


sceset <- fsc3_get_buckets_signatures(sceset)
mclust::adjustedRandIndex(sceset@sc3$buckets, as.numeric(pData(sceset)[,1]))
sceset <- fsc3_select_repr_cells(sceset)
# run normal SC3 pipeline on the selected cells
sceset <- sc3_calc_dists(sceset)
sceset <- sc3_calc_transfs(sceset)
sceset <- sc3_kmeans(sceset)
sceset <- sc3_calc_consens(sceset)
# infer cluster indexes from buckets
sceset <- fsc3_get_clusters(sceset, k = 10)
# summarise results
sceset <- sc3_summarise_results(sceset, k = 10)
mclust::adjustedRandIndex(sceset@sc3$results$clusters[,1], as.numeric(pData(sceset)[,1]))
proc.time() - ptm
gc()

Pollen

library(scater)
library(FastSC3)

# pollen2 <- readRDS("~/Dropbox/scRNASeq-public-datasets/pollen2.rds")
# cell_info <- data.frame(cell_id = colnames(pollen2))
# cell_inds <- paste("Cell", 1:ncol(pollen2), sep = "_")
# rownames(cell_info) <- cell_inds
# cell_exprs <- pollen2
# colnames(cell_exprs) <- cell_inds
# pd <- new("AnnotatedDataFrame", data = cell_info)
# sceset <- newSCESet(tpmData = cell_exprs, phenoData = pd)
# is_exprs(sceset) <- exprs(sceset) > 0
# sceset <- calculateQCMetrics(sceset)
# saveRDS(sceset, file = "~/Dropbox/scRNASeq-public-datasets/pollen2_sceset.rds")

sceset <- readRDS("~/Dropbox/scRNASeq-public-datasets/pollen2_sceset.rds")
# prepare for k prediction
sceset <- sc3_prepare(sceset, ks = 8:14)
# prepare for M3Drop + LSH
sceset <- fsc3_get_hyperplanes(sceset, n_genes = 100, suppress_plot = F)
sceset <- fsc3_get_signatures(sceset)
sceset <- fsc3_get_buckets(sceset, common_bits = 75)
mclust::adjustedRandIndex(sceset@sc3$buckets, as.numeric(pData(sceset)[,1]))
sceset <- fsc3_get_buckets_signatures(sceset)
sceset <- fsc3_select_repr_cells(sceset)
# run normal SC3 pipeline on the selected cells
sceset <- sc3_calc_dists(sceset)
sceset <- sc3_calc_transfs(sceset)
sceset <- sc3_kmeans(sceset)
sceset <- sc3_calc_consens(sceset)
# infer cluster indexes from buckets
sceset <- fsc3_get_clusters(sceset, k = 11)
# summarise results
sceset <- sc3_summarise_results(sceset, k = 11)
mclust::adjustedRandIndex(sceset@sc3$results$clusters[,1], as.numeric(pData(sceset)[,1]))
plot_sankey(as.numeric(pData(sceset)[,1]), sceset@sc3$results$clusters[,1])

Zeisel

library(scater)
library(FastSC3)

# zeisel <- readRDS("~/Dropbox/scRNASeq-public-datasets/zeisel.rds")
# cell_info <- data.frame(cell_id = colnames(zeisel))
# cell_inds <- paste("Cell", 1:ncol(zeisel), sep = "_")
# rownames(cell_info) <- cell_inds
# cell_exprs <- zeisel
# colnames(cell_exprs) <- cell_inds
# pd <- new("AnnotatedDataFrame", data = cell_info)
# sceset <- newSCESet(countData = cell_exprs, phenoData = pd)
# sceset <- calculateQCMetrics(sceset)
# saveRDS(sceset, file = "~/Dropbox/scRNASeq-public-datasets/zeisel_sceset.rds")

sceset <- readRDS("~/Dropbox/scRNASeq-public-datasets/zeisel_sceset.rds")
# prepare for k prediction
sceset <- sc3_prepare(sceset)
# sceset <- sc3_estimate_k(sceset)
k <- 5:11
sceset <- sc3_set_ks(sceset, ks = k)
# prepare for M3Drop + LSH
sceset <- fsc3_get_hyperplanes(sceset, n_genes = 10, suppress_plot = F)
sceset <- fsc3_get_signatures(sceset)
sceset <- fsc3_get_buckets(sceset, common_bits = 7)
# mclust::adjustedRandIndex(sceset@sc3$buckets, as.numeric(pData(sceset)[,1]))
sceset <- fsc3_get_buckets_signatures(sceset)
sceset <- fsc3_select_repr_cells(sceset)
# run normal SC3 pipeline on the selected cells
sceset <- sc3_calc_dists(sceset)
sceset <- sc3_calc_transfs(sceset)
sceset <- sc3_kmeans(sceset)
sceset <- sc3_calc_consens(sceset)
# infer cluster indexes from buckets
sceset <- fsc3_get_clusters(sceset, k = 8)
# summarise results
sceset <- sc3_summarise_results(sceset, k = 8)
mclust::adjustedRandIndex(sceset@sc3$results$clusters[,1], as.numeric(pData(sceset)[,1]))
plot_sankey(as.numeric(pData(sceset)[,1]), sceset@sc3$results$clusters[,1])

Klein

library(scater)
library(FastSC3)

# klein <- readRDS("~/Dropbox/scRNASeq-public-datasets/klein.rds")
# cell_info <- data.frame(cell_id = colnames(klein))
# cell_inds <- paste("Cell", 1:ncol(klein), sep = "_")
# rownames(cell_info) <- cell_inds
# cell_exprs <- klein
# colnames(cell_exprs) <- cell_inds
# pd <- new("AnnotatedDataFrame", data = cell_info)
# sceset <- newSCESet(countData = cell_exprs, phenoData = pd)
# sceset <- calculateQCMetrics(sceset)
# saveRDS(sceset, file = "~/Dropbox/scRNASeq-public-datasets/klein_sceset.rds")

sceset <- readRDS("~/Dropbox/scRNASeq-public-datasets/klein_sceset.rds")
# prepare for k prediction
sceset <- sc3_prepare(sceset)
# sceset <- sc3_estimate_k(sceset)
k <- 3:6
sceset <- sc3_set_ks(sceset, ks = k)
# prepare for M3Drop + LSH
sceset <- fsc3_get_hyperplanes(sceset, n_genes = 100, suppress_plot = F)
sceset <- fsc3_get_signatures(sceset)
sceset <- fsc3_get_buckets(sceset, common_bits = 65, runs = 10)
mclust::adjustedRandIndex(sceset@sc3$buckets, as.numeric(pData(sceset)[,1]))
sceset <- fsc3_get_buckets_signatures(sceset)
sceset <- fsc3_select_repr_cells(sceset)
# run normal SC3 pipeline on the selected cells
sceset <- sc3_calc_dists(sceset)
sceset <- sc3_calc_transfs(sceset)
sceset <- sc3_kmeans(sceset)
sceset <- sc3_calc_consens(sceset)
# infer cluster indexes from buckets
sceset <- fsc3_get_clusters(sceset, k = 4)
# summarise results
sceset <- sc3_summarise_results(sceset, k = 4)
mclust::adjustedRandIndex(sceset@sc3$results$clusters[,1], as.numeric(pData(sceset)[,1]))

Kolodziejczyk

library(scater)
library(FastSC3)

# kolodziejczyk <- readRDS("~/Dropbox/scRNASeq-public-datasets/kolodziejczyk.rds")
# cell_info <- data.frame(cell_id = colnames(kolodziejczyk))
# cell_inds <- paste("Cell", 1:ncol(kolodziejczyk), sep = "_")
# rownames(cell_info) <- cell_inds
# cell_exprs <- kolodziejczyk
# colnames(cell_exprs) <- cell_inds
# pd <- new("AnnotatedDataFrame", data = cell_info)
# sceset <- newSCESet(tpmData = cell_exprs, phenoData = pd)
# is_exprs(sceset) <- exprs(sceset) > 0
# sceset <- calculateQCMetrics(sceset)
# saveRDS(sceset, file = "~/Dropbox/scRNASeq-public-datasets/kolodziejczyk_sceset.rds")

sceset <- readRDS("~/Dropbox/scRNASeq-public-datasets/kolodziejczyk_sceset.rds")
# prepare for k prediction
sceset <- sc3_prepare(sceset)
# sceset <- sc3_estimate_k(sceset)
k <- 3:5
sceset <- sc3_set_ks(sceset, ks = k)
# prepare for M3Drop + LSH
sceset <- fsc3_get_hyperplanes(sceset, n_genes = 100, suppress_plot = F)
sceset <- fsc3_get_signatures(sceset)
sceset <- fsc3_get_buckets(sceset, common_bits = 70)
mclust::adjustedRandIndex(sceset@sc3$buckets, as.numeric(pData(sceset)[,1]))
sceset <- fsc3_get_buckets_signatures(sceset)
sceset <- fsc3_select_repr_cells(sceset)
# run normal SC3 pipeline on the selected cells
sceset <- sc3_calc_dists(sceset)
sceset <- sc3_calc_transfs(sceset)
sceset <- sc3_kmeans(sceset)
sceset <- sc3_calc_consens(sceset)
# infer cluster indexes from buckets
sceset <- fsc3_get_clusters(sceset, k = 3)
# summarise results
sceset <- sc3_summarise_results(sceset, k = 3)
mclust::adjustedRandIndex(sceset@sc3$results$clusters[,1], as.numeric(pData(sceset)[,1]))

FJLT

sceset <- readRDS("~/Dropbox/scRNASeq-public-datasets/pollen2_sceset.rds")
# prepare for k prediction
sceset <- sc3_prepare(sceset)
# sceset <- sc3_estimate_k(sceset)
k <- 11
sceset <- sc3_set_ks(sceset, ks = k)
# prepare for FJLT + LSH
sceset <- fsc3_fjlt(sceset)
sceset <- fsc3_get_signatures_fjlt(sceset)
sceset <- fsc3_get_buckets(sceset, common_bits = 80)

sc3_prepare

# Note that n.cores = 1 is required for compilation of this vignette.
# Please remove this parameter when running on your computer:
# sceset <- sc3_prepare(sceset)
sceset <- sc3_prepare(sceset, n.cores = 1)
sceset <- sc3_estimate_k(sceset)
sceset <- sc3_set_ks(sceset, ks = 5)
sceset <- fsc3_fjlt(sceset)
pheatmap::pheatmap(sceset@sc3$fjlt)
sceset <- sc3_calc_dists(sceset)
sceset <- fsc3_calc_transfs(sceset)
sceset <- fsc3_calc_eigenv(sceset)
sceset <- sc3_kmeans(sceset)
sceset <- sc3_calc_consens(sceset)
sceset <- sc3_summarise_results(sceset, k = 5)

Out of sample extension

Drosophila retina

shekhar <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-retina/shekhar_sceset.rds")
shekhar <- fsc3_get_features(shekhar)

macosko <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-retina/macosko_sceset.rds")
macosko <- fsc3_get_features(macosko)

shekh <- featureNames(shekhar)[fData(shekhar)$fsc3_features]
mac <- featureNames(macosko)[fData(macosko)$fsc3_features]
upset(fromList(list(Shekhar = shekh, Macosko = mac)), order.by = "freq", nsets = 2)

# Assignments based on Shekhar
common_features <- get_common_fsc3_features(list(shekhar))
macosko <- fsc3_set_features(macosko, common_features)
# update common_features
common_features <- featureNames(macosko)[fData(macosko)$fsc3_features]
shekhar <- fsc3_set_features(shekhar, common_features)

shekhar <- fsc3_get_signatures(shekhar)
macosko <- fsc3_get_signatures(macosko)

p_data <- pData(shekhar)
p_data$fsc3_buckets <- p_data$cell_type
pData(shekhar) <- new("AnnotatedDataFrame", data = p_data)
shekhar <- fsc3_get_buckets_signatures(shekhar, threshold = 0.6)

macosko <- fsc3_assign_signatures(macosko, shekhar, threshold = 0.6)
if(is.factor(pData(macosko)$cell_type)) {
    labs_orig <- levels(pData(macosko)$cell_type)[pData(macosko)$cell_type]
} else {
    labs_orig <- pData(macosko)$cell_type
}
labs_new <- pData(macosko)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)

# Assignments based on Macosko
macosko <- fsc3_get_features(macosko)
common_features <- get_common_fsc3_features(list(macosko))
shekhar <- fsc3_set_features(shekhar, common_features)
# update common_features
common_features <- featureNames(shekhar)[fData(shekhar)$fsc3_features]
macosko <- fsc3_set_features(macosko, common_features)

shekhar <- fsc3_get_signatures(shekhar)
macosko <- fsc3_get_signatures(macosko)

p_data <- pData(macosko)
p_data$fsc3_buckets <- p_data$cell_type
pData(macosko) <- new("AnnotatedDataFrame", data = p_data)
macosko <- fsc3_get_buckets_signatures(macosko, threshold = 0.6)

shekhar <- fsc3_assign_signatures(shekhar, macosko, threshold = 0.6)
if(is.factor(pData(shekhar)$cell_type)) {
    labs_orig <- levels(pData(shekhar)$cell_type)[pData(shekhar)$cell_type]
} else {
    labs_orig <- pData(shekhar)$cell_type
}
labs_new <- pData(shekhar)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)



common_features <- get_common_fsc3_features(list(shekhar, macosko))

shekhar <- fsc3_set_features(shekhar, common_features)
macosko <- fsc3_set_features(macosko, common_features)

shekhar <- fsc3_get_signatures(shekhar)
macosko <- fsc3_get_signatures(macosko)

# Assignments based on Shekhar

p_data <- pData(shekhar)
p_data$fsc3_buckets <- p_data$cell_type
pData(shekhar) <- new("AnnotatedDataFrame", data = p_data)
shekhar <- fsc3_get_buckets_signatures(shekhar, threshold = 0.8)

macosko <- fsc3_assign_signatures(macosko, shekhar, threshold = 0.7)
if(is.factor(pData(macosko)$cell_type)) {
    labs_orig <- levels(pData(macosko)$cell_type)[pData(macosko)$cell_type]
} else {
    labs_orig <- pData(macosko)$cell_type
}
labs_new <- pData(macosko)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
# plot_sankey(labs_orig, labs_new, colors = c("#FF0000","#FFA500","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#008000","#57A5D6","#57A5D6","#0000FF","#0000FF","#0000FF","#0000FF","#0000FF","#0000FF","#0000FF","#0000FF","#800080","#800080","#800080","#800080","#800080","#800080"))
irr::kappa2(cbind(labs_orig, labs_new))
mclust::adjustedRandIndex(labs_orig, labs_new)

# Assignments based on Macosko

p_data <- pData(macosko)
p_data$fsc3_buckets <- p_data$cell_type
pData(macosko) <- new("AnnotatedDataFrame", data = p_data)
macosko <- fsc3_get_buckets_signatures(macosko, threshold = 0.8)

shekhar <- fsc3_assign_signatures(shekhar, macosko, threshold = 0.7)
if(is.factor(pData(shekhar)$cell_type)) {
    labs_orig <- levels(pData(shekhar)$cell_type)[pData(shekhar)$cell_type]
} else {
    labs_orig <- pData(shekhar)$cell_type
}
labs_new <- pData(shekhar)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
# plot_sankey(labs_orig, labs_new, colors = c("#0000FF", "#800080", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#0000FF", "#008000", "#000000", "#000000", "#000000", "#57A5D6", "#000000", "#57A5D6", "#000000", "#000000", "#000000", "#000000"))
irr::kappa2(cbind(labs_orig, labs_new))
mclust::adjustedRandIndex(labs_orig, labs_new)

Embryo development

Deng

# d <- readRDS("~/Dropbox/scRNASeq-public-datasets/deng.rds")
# #  1   1-4 - zygote
# #  2   5-12 - early2cell
# #  3   13-24 - mid2cell
# #  4   25-34 - late2cell
# #  5   35-48 - 4cell
# #  6   49-85 - 8cell
# #  7   86-135 - 16cell
# #  8   136-178 - earlyblast
# #  9   179-238 - midblast
# #  10   239-268 - lateblast
# labs <- colnames(d)
# labs[labs == "1" | labs == "2"] = "zygote"
# labs[labs == "3" | labs == "4"] = "2cell"
# labs[labs == "5"] = "4cell"
# labs[labs == "6"] = "8cell"
# labs[labs == "7"] = "16cell"
# labs[labs == "8" | labs == "9" | labs == "10"] = "blast"
# ann <- data.frame(stage = labs)
# rownames(ann) <- paste("cell", 1:ncol(d), sep = "_")
# pd <- new("AnnotatedDataFrame", data = ann)
# d <- d[!duplicated(rownames(d)), ]
# colnames(d) <- rownames(ann)
# deng <- newSCESet(fpkmData = d, phenoData = pd)
# is_exprs(deng) <- exprs(deng) > 0
# deng <- calculateQCMetrics(deng)
# # use gene names as feature symbols
# deng@featureData@data$feature_symbol <- featureNames(deng)
# saveRDS(deng, "~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/deng_rpkm_sceset.rds")

Biase

# d <- read.table("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/intermediate/GSE57249_fpkm_ZHONG.txt", header = T)
# d <- d[!duplicated(d$ID),]
# rownames(d) <- d$ID
# d <- d[,2:ncol(d)]
# 
# # options(stringsAsFactors = F)
# 
# labs <- read.table("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/intermediate/GSE57249_labels_ZHONG.txt", stringsAsFactors = F)
# labs <- as.character(labs)
# 
# labs[grep("TE",labs)] = "blast"
# labs[grep("ICM",labs)] = "blast"
# 
# ann <- data.frame(stage = labs)
# rownames(ann) <- colnames(d)
# 
# colnames(d) <- rownames(ann)
# pd <- new("AnnotatedDataFrame", data = ann)
# biase <- newSCESet(fpkmData = as.matrix(d), phenoData = pd)
# is_exprs(biase) <- exprs(biase) > 0
# biase <- calculateQCMetrics(biase)
# # use gene names as feature symbols
# biase@featureData@data$feature_symbol <- featureNames(biase)
# saveRDS(biase, "~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/biase_fpkm.rds")

Goolam

# d <- read.table("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/intermediate/Goolam_et_al_2015_count_table.tsv", header = T)
# 
# labs <- colnames(d)
# labs[grep("4cell",labs)] = "4cell"
# labs[grep("8cell",labs)] = "8cell"
# labs[grep("16cell",labs)] = "16cell"
# labs[grep("32cell",labs)] = "blast"
# labs[grep("2cell",labs)] = "2cell"
# 
# ann <- data.frame(stage = labs)
# rownames(ann) <- colnames(d)
# colnames(d) <- rownames(ann)
# pd <- new("AnnotatedDataFrame", data = ann)
# goolam <- newSCESet(countData = as.matrix(d), phenoData = pd)
# goolam <- calculateQCMetrics(goolam, feature_controls = grep("ERCC-", rownames(d)))
# # convert ensembl ids into gene names
# goolam <- getBMFeatureAnnos(
#     goolam, filters="ensembl_gene_id",
#     biomart="ensembl", dataset="mmusculus_gene_ensembl")
# saveRDS(goolam, "~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/goolam_counts.rds")

Fan

# d <- read.table("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/intermediate/GSE53386_matrix_fpkms.tsv")
# 
# labs <- colnames(d)
# labs[grep("ocyte",labs)] = "zygote"
# labs[grep("zygote",labs)] = "zygote"
# labs[grep("a.AM",labs)] = "zygote"
# labs[grep("2.cell",labs)] = "2cell"
# labs[grep("4.cell",labs)] = "4cell"
# labs[grep("8.cell",labs)] = "8cell"
# labs[grep("morula",labs)] = "16cell"
# labs[grep("ICM",labs)] = "blast"
# labs[grep("TE",labs)] = "blast"
# labs[grep("blast",labs)] = "blast"
# 
# ann <- data.frame(stage = labs)
# rownames(ann) <- colnames(d)
# 
# colnames(d) <- rownames(ann)
# pd <- new("AnnotatedDataFrame", data = ann)
# fan <- newSCESet(fpkmData = as.matrix(d), phenoData = pd)
# is_exprs(fan) <- exprs(fan) > 0
# fan <- calculateQCMetrics(fan)
# # use gene names as feature symbols
# fan@featureData@data$feature_symbol <- featureNames(fan)
# saveRDS(fan, "~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/fan_fpkm.rds")
deng_rpkm <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/deng_rpkm_sceset.rds")
deng_rpkm <- fsc3_get_features(deng_rpkm)
biase_fpkm <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/biase_fpkm.rds")
biase_fpkm <- fsc3_get_features(biase_fpkm)
goolam_counts <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/goolam_counts.rds")
goolam_counts <- fsc3_get_features(goolam_counts)
fan_fpkm <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/fan_fpkm.rds")
fan_fpkm <- fsc3_get_features(fan_fpkm)


deng <- featureNames(deng_rpkm)[fData(deng_rpkm)$fsc3_features]
biase <- featureNames(biase_fpkm)[fData(biase_fpkm)$fsc3_features]
goolam <- featureNames(goolam_counts)[fData(goolam_counts)$fsc3_features]
fan <- featureNames(fan_fpkm)[fData(fan_fpkm)$fsc3_features]
upset(fromList(list(Deng = deng, Biase = biase, Goolam = goolam, Fan = fan)), order.by = "freq", nsets = 4)

common_features <- get_common_fsc3_features(list(deng_rpkm, goolam_counts, biase_fpkm, fan_fpkm))

deng_rpkm <- fsc3_set_features(deng_rpkm, common_features)
biase_fpkm <- fsc3_set_features(biase_fpkm, common_features)
goolam_counts <- fsc3_set_features(goolam_counts, common_features)
fan_fpkm <- fsc3_set_features(fan_fpkm, common_features)

deng_rpkm <- fsc3_get_signatures(deng_rpkm)
biase_fpkm <- fsc3_get_signatures(biase_fpkm)
goolam_counts <- fsc3_get_signatures(goolam_counts)
fan_fpkm <- fsc3_get_signatures(fan_fpkm)

# Assignments based on Deng

p_data <- pData(deng_rpkm)
p_data$fsc3_buckets <- p_data$stage
pData(deng_rpkm) <- new("AnnotatedDataFrame", data = p_data)
deng_rpkm <- fsc3_get_buckets_signatures(deng_rpkm, threshold = 0.9)


goolam_counts <- fsc3_assign_signatures(goolam_counts, deng_rpkm, threshold = 0.7)
if(is.factor(pData(goolam_counts)$stage)) {
    labs_orig <- levels(pData(goolam_counts)$stage)[pData(goolam_counts)$stage]
} else {
    labs_orig <- pData(goolam_counts)$stage
}
labs_new <- pData(goolam_counts)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))
mclust::adjustedRandIndex(labs_orig, labs_new)

biase_fpkm <- fsc3_assign_signatures(biase_fpkm, deng_rpkm, bucket_threshold = 0.8, assign_threshold = 0.7)
if(is.factor(pData(biase_fpkm)$stage)) {
    labs_orig <- levels(pData(biase_fpkm)$stage)[pData(biase_fpkm)$stage]
} else {
    labs_orig <- pData(biase_fpkm)$stage
}
labs_new <- pData(biase_fpkm)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))
mclust::adjustedRandIndex(labs_orig, labs_new)

fan_fpkm <- fsc3_assign_signatures(fan_fpkm, deng_rpkm)
if(is.factor(pData(fan_fpkm)$stage)) {
    labs_orig <- levels(pData(fan_fpkm)$stage)[pData(fan_fpkm)$stage]
} else {
    labs_orig <- pData(fan_fpkm)$stage
}
labs_new <- pData(fan_fpkm)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))
mclust::adjustedRandIndex(labs_orig, labs_new)

# Compare expressions

p <- fsc3_plot_sig_expression(deng_rpkm, fColumn = "stage")
fsc3_plot_sig_expression(goolam_counts, fColumn = "stage", hc = p$tree_row)
fsc3_plot_sig_expression(fan_fpkm, fColumn = "stage", hc = p$tree_row)
fsc3_plot_sig_expression(biase_fpkm, fColumn = "stage", hc = p$tree_row)

# Compare with Tallulah's genes

ta_genes <- read.table("~/Dropbox/scRNASeq-public-datasets/mouse-embryo-devel/intermediate/FourDatasets_Stage_Expression_DB_M3Drop_Genes.txt")
ta_genes <- rownames(ta_genes)

table(fData(deng_rpkm)$feature_symbol[fData(deng_rpkm)$fsc3_features] %in% ta_genes)
table(fData(goolam_counts)$feature_symbol[fData(goolam_counts)$fsc3_features] %in% ta_genes)
table(fData(biase_fpkm)$feature_symbol[fData(biase_fpkm)$fsc3_features] %in% ta_genes)
table(fData(fan_fpkm)$feature_symbol[fData(fan_fpkm)$fsc3_features] %in% ta_genes)




deng_rpkm <- fsc3_get_features(deng_rpkm)
common_features <- get_common_fsc3_features(list(deng_rpkm))
biase_fpkm <- fsc3_set_features(biase_fpkm, common_features)
# update common_features
common_features <- featureNames(biase_fpkm)[fData(biase_fpkm)$fsc3_features]
deng_rpkm <- fsc3_set_features(deng_rpkm, common_features)

deng_rpkm <- fsc3_get_signatures(deng_rpkm)
biase_fpkm <- fsc3_get_signatures(biase_fpkm)

p_data <- pData(deng_rpkm)
p_data$fsc3_buckets <- p_data$stage
pData(deng_rpkm) <- new("AnnotatedDataFrame", data = p_data)
deng_rpkm <- fsc3_get_buckets_signatures(deng_rpkm, threshold = 0.6)

biase_fpkm <- fsc3_assign_signatures(biase_fpkm, deng_rpkm, threshold = 0.6)
if(is.factor(pData(biase_fpkm)$stage)) {
    labs_orig <- levels(pData(biase_fpkm)$stage)[pData(biase_fpkm)$stage]
} else {
    labs_orig <- pData(biase_fpkm)$stage
}
labs_new <- pData(biase_fpkm)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)


deng_rpkm <- fsc3_get_features(deng_rpkm)
common_features <- get_common_fsc3_features(list(deng_rpkm))
fan_fpkm <- fsc3_set_features(fan_fpkm, common_features)
# update common_features
common_features <- featureNames(fan_fpkm)[fData(fan_fpkm)$fsc3_features]
deng_rpkm <- fsc3_set_features(deng_rpkm, common_features)

deng_rpkm <- fsc3_get_signatures(deng_rpkm)
fan_fpkm <- fsc3_get_signatures(fan_fpkm)

p_data <- pData(deng_rpkm)
p_data$fsc3_buckets <- p_data$stage
pData(deng_rpkm) <- new("AnnotatedDataFrame", data = p_data)
deng_rpkm <- fsc3_get_buckets_signatures(deng_rpkm, threshold = 0.6)

fan_fpkm <- fsc3_assign_signatures(fan_fpkm, deng_rpkm, threshold = 0.6)
if(is.factor(pData(fan_fpkm)$stage)) {
    labs_orig <- levels(pData(fan_fpkm)$stage)[pData(fan_fpkm)$stage]
} else {
    labs_orig <- pData(fan_fpkm)$stage
}
labs_new <- pData(fan_fpkm)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)




deng_rpkm <- fsc3_get_features(deng_rpkm)
common_features <- get_common_fsc3_features(list(deng_rpkm))
goolam_counts <- fsc3_set_features(goolam_counts, common_features)
# update common_features
common_features <- fData(goolam_counts)$feature_symbol[fData(goolam_counts)$fsc3_features]
deng_rpkm <- fsc3_set_features(deng_rpkm, common_features)

deng_rpkm <- fsc3_get_signatures(deng_rpkm)
goolam_counts <- fsc3_get_signatures(goolam_counts)

p_data <- pData(deng_rpkm)
p_data$fsc3_buckets <- p_data$stage
pData(deng_rpkm) <- new("AnnotatedDataFrame", data = p_data)
deng_rpkm <- fsc3_get_buckets_signatures(deng_rpkm, threshold = 0.6)

goolam_counts <- fsc3_assign_signatures(goolam_counts, deng_rpkm, threshold = 0.6)
if(is.factor(pData(goolam_counts)$stage)) {
    labs_orig <- levels(pData(goolam_counts)$stage)[pData(goolam_counts)$stage]
} else {
    labs_orig <- pData(goolam_counts)$stage
}
labs_new <- pData(goolam_counts)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)

Hematopoietic cells

Nestorowa

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/Nestorowa.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
nestorowa <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(nestorowa) <- exprs(nestorowa) > 0
nestorowa <- calculateQCMetrics(nestorowa)
# use gene names as feature symbols
nestorowa@featureData@data$feature_symbol <- featureNames(nestorowa)
saveRDS(nestorowa, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/nestorowa.rds")

Olsson

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/Olsson.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
olsson <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(olsson) <- exprs(olsson) > 0
olsson <- calculateQCMetrics(olsson)
# use gene names as feature symbols
olsson@featureData@data$feature_symbol <- featureNames(olsson)
saveRDS(olsson, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/olsson.rds")

Kowalczyk

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/GSE59114_C57BL6.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
kowalczyk <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(kowalczyk) <- exprs(kowalczyk) > 0
kowalczyk <- calculateQCMetrics(kowalczyk)
# use gene names as feature symbols
kowalczyk@featureData@data$feature_symbol <- featureNames(kowalczyk)
saveRDS(kowalczyk, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/kowalczyk_bl6.rds")
d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/GSE59114_DBA.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
kowalczyk <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(kowalczyk) <- exprs(kowalczyk) > 0
kowalczyk <- calculateQCMetrics(kowalczyk)
# use gene names as feature symbols
kowalczyk@featureData@data$feature_symbol <- featureNames(kowalczyk)
saveRDS(kowalczyk, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/kowalczyk_dba.rds")

Grover

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/Grover.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
grover <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(grover) <- exprs(grover) > 0
grover <- calculateQCMetrics(grover)
# use gene names as feature symbols
grover@featureData@data$feature_symbol <- featureNames(grover)
saveRDS(grover, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/grover.rds")

Grover

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/Drissen.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
drissen <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(drissen) <- exprs(drissen) > 0
drissen <- calculateQCMetrics(drissen)
# use gene names as feature symbols
drissen@featureData@data$feature_symbol <- featureNames(drissen)
saveRDS(drissen, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/drissen.rds")

Zhou

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/GSE67120.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
zhou <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(zhou) <- exprs(zhou) > 0
zhou <- calculateQCMetrics(zhou)
# use gene names as feature symbols
zhou@featureData@data$feature_symbol <- featureNames(zhou)
saveRDS(zhou, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/zhou.rds")

Schlitzer

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/GSE60781.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
sceset <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(sceset) <- exprs(sceset) > 0
sceset <- calculateQCMetrics(sceset)
# use gene names as feature symbols
sceset@featureData@data$feature_symbol <- featureNames(sceset)
saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/schlitzer.rds")

Wilson

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/GSE61533.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
sceset <- newSCESet(countData = as.matrix(dataset), phenoData = pd)
sceset <- calculateQCMetrics(sceset)
# use gene names as feature symbols
sceset@featureData@data$feature_symbol <- featureNames(sceset)
saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/wilson.rds")

Pereira

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/GSE54574.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
sceset <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(sceset) <- exprs(sceset) > 0
sceset <- calculateQCMetrics(sceset)
# use gene names as feature symbols
sceset@featureData@data$feature_symbol <- featureNames(sceset)
saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/pereira.rds")

Grun

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/E-GEOD-76983.rds")
dataset <- d[[1]]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
sceset <- newSCESet(fpkmData = as.matrix(dataset), phenoData = pd)
is_exprs(sceset) <- exprs(sceset) > 0
sceset <- calculateQCMetrics(sceset)
# use gene names as feature symbols
sceset@featureData@data$feature_symbol <- featureNames(sceset)
saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/grun.rds")

Paul

d <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/intermediate/E-GEOD-72857.rds")
dataset <- d[[1]]
dataset <- dataset[!duplicated(rownames(dataset)), ]
ann <- d[[2]]
rownames(ann) <- colnames(dataset)
pd <- new("AnnotatedDataFrame", data = ann)
sceset <- newSCESet(countData = as.matrix(dataset), phenoData = pd)
sceset <- calculateQCMetrics(sceset)
# use gene names as feature symbols
sceset@featureData@data$feature_symbol <- featureNames(sceset)
saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/paul.rds")
nestorowa <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/nestorowa.rds")
nestorowa <- fsc3_get_features(nestorowa)
olsson <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/olsson.rds")
olsson <- fsc3_get_features(olsson)
kowalczyk_bl6 <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/kowalczyk_bl6.rds")
kowalczyk_bl6 <- fsc3_get_features(kowalczyk_bl6)
kowalczyk_dba <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/kowalczyk_dba.rds")
kowalczyk_dba <- fsc3_get_features(kowalczyk_dba)
grover <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/grover.rds")
grover <- fsc3_get_features(grover)
drissen <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/drissen.rds")
drissen <- fsc3_get_features(drissen)
zhou <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/zhou.rds")
zhou <- fsc3_get_features(zhou)
schlitzer <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/schlitzer.rds")
schlitzer <- fsc3_get_features(schlitzer)
wilson <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/wilson.rds")
wilson <- fsc3_get_features(wilson)
pereira <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/pereira.rds")
pereira <- fsc3_get_features(pereira)
grun <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/grun.rds")
grun <- fsc3_get_features(grun)
paul <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-bone-marrow/paul.rds")
paul <- fsc3_get_features(paul)

nest <- featureNames(nestorowa)[fData(nestorowa)$fsc3_features]
ols <- featureNames(olsson)[fData(olsson)$fsc3_features]
kowbl6 <- featureNames(kowalczyk_bl6)[fData(kowalczyk_bl6)$fsc3_features]
kowdba <- featureNames(kowalczyk_dba)[fData(kowalczyk_dba)$fsc3_features]
grov <- featureNames(grover)[fData(grover)$fsc3_features]
dris <- featureNames(drissen)[fData(drissen)$fsc3_features]
zho <- featureNames(zhou)[fData(zhou)$fsc3_features]
schlitz <- featureNames(schlitzer)[fData(schlitzer)$fsc3_features]
wils <- featureNames(wilson)[fData(wilson)$fsc3_features]
per <- featureNames(pereira)[fData(pereira)$fsc3_features]
gru <- featureNames(grun)[fData(grun)$fsc3_features]
pau <- featureNames(paul)[fData(paul)$fsc3_features]
upset(fromList(list(Nestorowa = nest, Olsson = ols, Kowalczyk_BL6 = kowbl6, Kowalczyk_DBA = kowdba, Grover = grov, Drissen = dris, Zhou = zho, Schlitzer = schlitz, Wilson = wils, Pereira = per, Grun = gru, Paul = pau)), order.by = "freq", nsets = 12)




nestorowa <- fsc3_get_features(nestorowa)
common_features <- get_common_fsc3_features(list(nestorowa))
olsson <- fsc3_set_features(olsson, common_features)
# update common_features
common_features <- featureNames(olsson)[fData(olsson)$fsc3_features]
nestorowa <- fsc3_set_features(nestorowa, common_features)

nestorowa <- fsc3_get_signatures(nestorowa)
olsson <- fsc3_get_signatures(olsson)

p_data <- pData(nestorowa)
p_data$fsc3_buckets <- p_data$Type
pData(nestorowa) <- new("AnnotatedDataFrame", data = p_data)
nestorowa <- fsc3_get_buckets_signatures(nestorowa, threshold = 0.6)

olsson <- fsc3_assign_signatures(olsson, nestorowa, threshold = 0.6)
if(is.factor(pData(olsson)$Type)) {
    labs_orig <- levels(pData(olsson)$Type)[pData(olsson)$Type]
} else {
    labs_orig <- pData(olsson)$Type
}
labs_new <- pData(olsson)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)


nestorowa <- fsc3_get_features(nestorowa)
common_features <- get_common_fsc3_features(list(nestorowa))
paul <- fsc3_set_features(paul, common_features)
# update common_features
common_features <- featureNames(paul)[fData(paul)$fsc3_features]
nestorowa <- fsc3_set_features(nestorowa, common_features)

nestorowa <- fsc3_get_signatures(nestorowa)
paul <- fsc3_get_signatures(paul)

p_data <- pData(nestorowa)
p_data$fsc3_buckets <- p_data$Type
pData(nestorowa) <- new("AnnotatedDataFrame", data = p_data)
nestorowa <- fsc3_get_buckets_signatures(nestorowa, threshold = 0.6)

paul <- fsc3_assign_signatures(paul, nestorowa, threshold = 0.6)
if(is.factor(pData(paul)$Type)) {
    labs_orig <- levels(pData(paul)$Type)[pData(paul)$Type]
} else {
    labs_orig <- pData(paul)$Type
}
labs_new <- pData(paul)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)



kowalczyk_bl6 <- fsc3_get_features(kowalczyk_bl6)
common_features <- get_common_fsc3_features(list(kowalczyk_bl6))
kowalczyk_dba <- fsc3_set_features(kowalczyk_dba, common_features)
# update common_features
common_features <- featureNames(kowalczyk_dba)[fData(kowalczyk_dba)$fsc3_features]
kowalczyk_bl6 <- fsc3_set_features(kowalczyk_bl6, common_features)

kowalczyk_bl6 <- fsc3_get_signatures(kowalczyk_bl6)
kowalczyk_dba <- fsc3_get_signatures(kowalczyk_dba)

p_data <- pData(kowalczyk_bl6)
p_data$fsc3_buckets <- p_data$Type
pData(kowalczyk_bl6) <- new("AnnotatedDataFrame", data = p_data)
kowalczyk_bl6 <- fsc3_get_buckets_signatures(kowalczyk_bl6, threshold = 0.6)

kowalczyk_dba <- fsc3_assign_signatures(kowalczyk_dba, kowalczyk_bl6, threshold = 0.6)
if(is.factor(pData(kowalczyk_dba)$Type)) {
    labs_orig <- levels(pData(kowalczyk_dba)$Type)[pData(kowalczyk_dba)$Type]
} else {
    labs_orig <- pData(kowalczyk_dba)$Type
}
labs_new <- pData(kowalczyk_dba)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)



kowalczyk_dba <- fsc3_get_features(kowalczyk_dba)
common_features <- get_common_fsc3_features(list(kowalczyk_dba))
kowalczyk_bl6 <- fsc3_set_features(kowalczyk_bl6, common_features)
# update common_features
common_features <- featureNames(kowalczyk_bl6)[fData(kowalczyk_bl6)$fsc3_features]
kowalczyk_dba <- fsc3_set_features(kowalczyk_dba, common_features)

kowalczyk_dba <- fsc3_get_signatures(kowalczyk_dba)
kowalczyk_bl6 <- fsc3_get_signatures(kowalczyk_bl6)

p_data <- pData(kowalczyk_dba)
p_data$fsc3_buckets <- p_data$Type
pData(kowalczyk_dba) <- new("AnnotatedDataFrame", data = p_data)
kowalczyk_dba <- fsc3_get_buckets_signatures(kowalczyk_dba, threshold = 0.6)

kowalczyk_bl6 <- fsc3_assign_signatures(kowalczyk_bl6, kowalczyk_dba, threshold = 0.6)
if(is.factor(pData(kowalczyk_bl6)$Type)) {
    labs_orig <- levels(pData(kowalczyk_bl6)$Type)[pData(kowalczyk_bl6)$Type]
} else {
    labs_orig <- pData(kowalczyk_bl6)$Type
}
labs_new <- pData(kowalczyk_bl6)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)

Mouse brain

Tasic

# # download data from http://casestudies.brain-map.org/celltax
# # and run this code
# d <- read.csv("~/Downloads/data_download/genes_rpkm.csv")
# rownames(d) <- d[,1]
# d <- d[,2:ncol(d)]
# # then do this:
# ann_col <- read.csv("~/Downloads/data_download/cell_metadata.csv")
# 
# cell_classification <- read.csv("~/Downloads/data_download/cell_classification.csv")
# cluster_metadata <- read.csv("~/Downloads/data_download/cluster_metadata.csv")
# cluster_metadata <- cluster_metadata[,1:(ncol(cluster_metadata) - 1)]
# 
# ann_col <- cbind(ann_col, cell_classification[,2:3])
# colnames(ann_col)[length(ann_col)] <- "cluster_id"
# 
# ann_col <- merge(ann_col, cluster_metadata, by = "cluster_id")
# ann_col <- ann_col[order(ann_col$long_name), ]
# rownames(ann_col) <- ann_col$long_name
# ann_col <- ann_col[,c(1, 3:ncol(ann_col))]
# 
# pd <- new("AnnotatedDataFrame", data = ann_col)
# sceset <- newSCESet(fpkmData = as.matrix(d), phenoData = pd)
# is_exprs(sceset) <- exprs(sceset) > 0
# sceset <- calculateQCMetrics(sceset)
# # use gene names as feature symbols
# sceset@featureData@data$feature_symbol <- featureNames(sceset)
# saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-brain/tasic.rds")

10x-genomics

# # download data from http://support.10xgenomics.com/single-cell/datasets/neuron_9k
# # and run this code
# d <- as.matrix(Seurat::Read10X("~/Downloads/filtered_gene_bc_matrices/mm10"))
# sceset <- newSCESet(countData = d)
# sceset <- calculateQCMetrics(sceset)
# # use gene names as feature symbols
# sceset@featureData@data$feature_symbol <- featureNames(sceset)
# saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-brain/10x-genomics/neuron_9k.rds")

VIS

# # download Primary Visual Cortex (VISp) from here: http://celltypes.brain-map.org/rnaseq
# # and run this code
# d <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/VIS_gene_expression_matrix_2016-10-27/fpkm_table.csv")
# rownames(d) <- d[,1]
# d <- d[,2:ncol(d)]
# colnames(d) <- as.numeric(sapply(colnames(d), function(x)substring(x, 2)))
# # then do this:
# ann_col <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/VIS_gene_expression_matrix_2016-10-27/columns-samples.csv")
# rownames(ann_col) <- ann_col$rnaseq_profile_id
# ann_col <- ann_col[,2:ncol(ann_col)]
# 
# ann_row <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/VIS_gene_expression_matrix_2016-10-27/rows-genes.csv")
# rownames(ann_row) <- ann_row$gene_id
# ann_row <- ann_row[,2:ncol(ann_row)]
# 
# pd <- new("AnnotatedDataFrame", data = ann_col)
# fd <- new("AnnotatedDataFrame", data = ann_row)
# sceset <- newSCESet(fpkmData = as.matrix(d), phenoData = pd, featureData = fd)
# is_exprs(sceset) <- exprs(sceset) > 0
# sceset <- calculateQCMetrics(sceset)
# # use gene names as feature symbols
# sceset@featureData@data$feature_symbol <- sceset@featureData@data$gene_symbol
# saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-brain/allen-atlas-vis.rds")

ALM

# # download anterior lateral motor cortical area (ALM) from here: http://celltypes.brain-map.org/rnaseq
# # and run this code
# d <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/ALM_gene_expression_matrix_2016-10-27/fpkm_table.csv")
# rownames(d) <- d[,1]
# d <- d[,2:ncol(d)]
# colnames(d) <- as.numeric(sapply(colnames(d), function(x)substring(x, 2)))
# # then do this:
# ann_col <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/ALM_gene_expression_matrix_2016-10-27/columns-cells.csv")
# rownames(ann_col) <- ann_col$rnaseq_profile_id
# ann_col <- ann_col[,2:ncol(ann_col)]
# 
# ann_row <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/ALM_gene_expression_matrix_2016-10-27/rows-genes.csv")
# rownames(ann_row) <- ann_row$gene_id
# ann_row <- ann_row[,2:ncol(ann_row)]
# 
# pd <- new("AnnotatedDataFrame", data = ann_col)
# fd <- new("AnnotatedDataFrame", data = ann_row)
# sceset <- newSCESet(fpkmData = as.matrix(d), phenoData = pd, featureData = fd)
# is_exprs(sceset) <- exprs(sceset) > 0
# sceset <- calculateQCMetrics(sceset)
# # use gene names as feature symbols
# sceset@featureData@data$feature_symbol <- sceset@featureData@data$gene_symbol
# saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-brain/allen-atlas-alm.rds")

dLGN

# # download dorsal lateral geniculate nucleus (dLGN) of the thalamus from here: http://celltypes.brain-map.org/rnaseq
# # and run this code
# d <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/gene_expression_matrix_2016-03-03/fpkm_table.csv")
# rownames(d) <- d[,1]
# d <- d[,2:ncol(d)]
# colnames(d) <- as.numeric(sapply(colnames(d), function(x)substring(x, 2)))
# # then do this:
# ann_col <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/gene_expression_matrix_2016-03-03/columns-cells.csv")
# rownames(ann_col) <- ann_col$rnaseq_profile_id
# ann_col <- ann_col[,2:ncol(ann_col)]
# 
# ann_row <- read.csv("~/Dropbox/scRNASeq-public-datasets/mouse-brain/intermediate/gene_expression_matrix_2016-03-03/rows-genes.csv")
# rownames(ann_row) <- ann_row$gene_id
# ann_row <- ann_row[,2:ncol(ann_row)]
# 
# pd <- new("AnnotatedDataFrame", data = ann_col)
# fd <- new("AnnotatedDataFrame", data = ann_row)
# sceset <- newSCESet(fpkmData = as.matrix(d), phenoData = pd, featureData = fd)
# is_exprs(sceset) <- exprs(sceset) > 0
# sceset <- calculateQCMetrics(sceset)
# # use gene names as feature symbols
# sceset@featureData@data$feature_symbol <- sceset@featureData@data$gene_symbol
# saveRDS(sceset, "~/Dropbox/scRNASeq-public-datasets/mouse-brain/allen-atlas-dlgn.rds")
vis <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-brain/allen-atlas-vis.rds")
vis <- fsc3_get_features(vis)
alm <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-brain/allen-atlas-alm.rds")
alm <- fsc3_get_features(alm)
dlgn <- readRDS("~/Dropbox/scRNASeq-public-datasets/mouse-brain/allen-atlas-dlgn.rds")
dlgn <- fsc3_get_features(dlgn)

vi <- as.character(fData(vis)$feature_symbol[fData(vis)$fsc3_features])
al <- as.character(fData(alm)$feature_symbol[fData(alm)$fsc3_features])
dlg <- as.character(fData(dlgn)$feature_symbol[fData(dlgn)$fsc3_features])
upset(fromList(list(VIS = vi, ALM = al, dLGN = dlg)), order.by = "freq", nsets = 3)




vis <- fsc3_get_features(vis)
common_features <- get_common_fsc3_features(list(vis))
alm <- fsc3_set_features(alm, common_features)
# update common_features
common_features <- as.character(fData(alm)$feature_symbol)[fData(alm)$fsc3_features]
vis <- fsc3_set_features(vis, common_features)

vis <- fsc3_get_signatures(vis)
alm <- fsc3_get_signatures(alm)

p_data <- pData(vis)
p_data$fsc3_buckets <- p_data$genotype_driver
pData(vis) <- new("AnnotatedDataFrame", data = p_data)
vis <- fsc3_get_buckets_signatures(vis, threshold = 0.6)

alm <- fsc3_assign_signatures(alm, vis, threshold = 0.6)
if(is.factor(pData(alm)$genotype_driver)) {
    labs_orig <- levels(pData(alm)$genotype_driver)[pData(alm)$genotype_driver]
} else {
    labs_orig <- pData(alm)$genotype_driver
}
labs_new <- pData(alm)$fsc3_buckets_assigned
plot_sankey(labs_orig, labs_new)
irr::kappa2(cbind(labs_orig, labs_new))$value
mclust::adjustedRandIndex(labs_orig, labs_new)


hemberg-lab/FastSC3 documentation built on May 17, 2019, 3:42 p.m.