Split into Test & Train

For each entry in Cell by Gene count matrix, split read counts using binomial.

# for each entry in raw_data, split read counts using binomial

raw_data <- readRDS("../data/merged-mouse_counts_tsne-QC_low-mt.rds")

# create train data
set.seed(562393)
raw_data_train <- apply(raw_data, 2, function(y) sapply(y, function(x) rbinom(1, size = x, prob = 0.8)))

raw_data_train2 <- Matrix(raw_data_train, sparse = TRUE)

rm(raw_data_train)

# create test data
raw_data_test <- raw_data - raw_data_train2

# force compression
raw_data_test <- Matrix(as.matrix(raw_data_test), sparse = TRUE)

# sanity check
sum(raw_data < raw_data_train2)

saveRDS(raw_data_train2, "../data/imputation/raw_data_train.rds")
saveRDS(raw_data_test,"../data/imputation/raw_data_test.rds")

Compute counts recovery info

Calculate information required to untransorm the data back to counts. We need the standard deviations we divided by and the library sizes.

raw_data <- readRDS("../data/raw_data_train.rds")
dge <- t(raw_data)

library(Matrix)
library_size <- Matrix::colSums(dge)
#gene_sizes <- rowSums(dge)
lib_size_correction_factor <- library_size/median(sqrt(library_size))

dge <- t(t(dge)/lib_size_correction_factor)
cell_subset <- library_size >= 200
gene_mean <- Matrix::rowMeans(dge[, cell_subset])
gene_subset <- data.table(gene_mean, names(gene_mean))[order(-gene_mean)][1:round(length(gene_mean) * 2/3)]$V2
dge <- dge[gene_subset, cell_subset]
dge <- sqrt(10000 * dge)

dge <- t(dge)

colSdColMeans <- function(x, na.rm = TRUE) {
            if (na.rm) {
                n <- Matrix::colSums(!is.na(x))
            }
            else {
                n <- nrow(x)
            }
            colVar <- Matrix::colMeans(x * x, na.rm = na.rm) - (Matrix::colMeans(x, 
                na.rm = na.rm))^2
            return(sqrt(colVar * n/(n - 1)))
        }

sds <- colSdColMeans(dge)

dge <- t(t(dge)/colSdColMeans(dge))

save(gene_subset, sds, lib_size_correction_factor, cell_subset, file = "../data/count_dge_recovery_information_train.rds")

# this part can't be reversed as throwing information away
# dge2 <- copy(dge)
# sum(dge2 > 10)
# dge2[dge2 > 10] <- 10
# dge2[dge2 < (-10)] <- (-10)

# check reverse_normalisation works
# tmp <- reverse_normalisation(dge)

# identical(round(tmp), raw_data[cell_subset, gene_subset])
# TRUE

Normalise data

data <- readRDS("../data/imputation/raw_data_train.rds")

normalised_data <- dropsim::normaliseDGE(t(data),
                     center = FALSE,
                     scale = TRUE,
                     threshold = 10,
                     min_library_size = 200,
                     gene_subset = (2/3))

saveRDS(normalised_data,"../data/imputation/train_0.8_normalised.rds")

run SDA

library(SDAtools)
export_data(as.matrix(normalised_data), name = "merged_mouse_V3_SDA_train_0.8.data", path="")

run_SDA(sda_location = "../sda",
        out = "../results/conradV3_sda_train_0.8_1",
        data = "merged_mouse_V3_SDA_train_0.8.data",
        num_comps = 50,
        max_iter = 10000,
        save_freq = 1000,
        set_seed = "79151 17351",
        N = 20036,
        eigen_parallel = TRUE,
        num_openmp_threads = 5,
        num_blocks = 5)

Load SDA results & calculate prediction (imputed values)

library(SDAtools)

# predicted with mask
results_train <- load_results(results_folder = "../data/SDA/conradV3_sda_train_0.8_1/", data_path = "../data/count_matrices/") #data/SDA #data/count_matrices/
rownames(results_train$loadings[[1]]) <- paste0("V",1:50)
rownames(results_train$pips[[1]]) <- paste0("V",1:50)

predicted_train <- results_train$scores %*% results_train$loadings[[1]]

str(results_train)
str(predicted_train)

run PCA

library(flashpcaR)
pca_training <- flashpca(as.matrix(normalised_data), ndim=50, stan="none", verbose=T, seed=42, do_loadings=T, divisor = "none")


rownames(pca_training$projection) <- rownames(normalised_data)
rownames(pca_training$loadings) <- colnames(normalised_data)


saveRDS(pca_training,"../data/imputation/pca_training.rds")

Load PCA results & calculate prediction (imputed values)

pca_training <- readRDS("../data/imputation/pca_training.rds")

predicted_train_PCA <- pca_training$projection %*% t(pca_training$loadings)

str(predicted_train_PCA)

run ICA

normalised_data <- readRDS("../data/imputation/train_0.8_normalised.rds")

a <- Sys.time()
ICA_decomp <- fastICA::fastICA(as.matrix(normalised_data), n.comp = 50, verbose = TRUE)
b <- Sys.time()

b-a

# 10 hours

saveRDS(ICA_decomp,"../data/imputation/ICA_decomp_training.rds")
colnames(ICA_decomp$A) <- colnames(ICA_decomp$X)
ICA_decomp$X_Center <- attributes(ICA_decomp$X)[3][[1]]
saveRDS(ICA_decomp[-1],"../data/imputation/ICA_decomp_training_sansX.rds")


######


normalised_data <- readRDS("../data/imputation/train_0.8_normalised.rds")

a <- Sys.time()
ICA_decomp <- fastICA::fastICA(as.matrix(normalised_data), n.comp = 50, verbose = TRUE, method="C")
b <- Sys.time()

Load ICA results & calculate prediction (imputed values)

ica_training <- readRDS("../data/imputation/ICA_decomp_training_sansX.rds")

# X = SA

predicted_train_ica <- ica_training$S %*% ica_training$A

# add center back, which ICA removes automatically
predicted_train_ica <- sweep(predicted_train_ica, 2, ica_training$X_Center, "+")

str(predicted_train_ica)

run NNMF

normalised_data <- readRDS("../data/imputation/train_0.8_normalised.rds")

NNMF_training <- NNLM::nnmf(as.matrix(normalised_data), 50, rel.tol = 1e-5)

saveRDS(NNMF_training,"../data/imputation/NNMF_training.rds")

Load NNMF results & calculate prediction (imputed values)

NNMF_training <- readRDS("../data/imputation/NNMF_training.rds")

predicted_train_NNMF <- NNMF_training$W %*% NNMF_training$H

str(predicted_train_NNMF)

MAGIC

normalised <- readRDS("../data/imputation/train_0.8_normalised.rds")

library(Rmagic)
MAGIC_data <- Rmagic::magic(as.matrix(normalised), n.jobs=3)

MAGIC_data$params$data <- NULL
saveRDS(MAGIC_data, file = "../data/imputation/MAGIC_data.rds")

MAGIC_AUCs <- calculateAUCs(as.matrix(MAGIC_data$result))

saveRDS(MAGIC_AUCs[[1]], file = "../data/imputation/cellAUCs_MAGIC.rds")

save(MAGIC_AUCs, file = "../data/imputation/imputed_ranks_MAGIC.rds")


# MAGIC expression jumps around quite a lot...
ggplot(cell_data[sda_predict("Prdm9")], aes(-PseudoTime, Prdm9)) + geom_point(stroke=0, alpha=0.3, size=0.5)
ggplot(cell_data[nmf_predict("Prdm9")], aes(-PseudoTime, Prdm9)) + geom_point(stroke=0, alpha=0.3, size=0.5)
ggplot(cell_data[magic_predict("Prdm9",MAGIC_data)], aes(-PseudoTime, Prdm9)) + geom_point(stroke=0, alpha=0.3, size=0.5)

ggplot(cell_data[magic_predict("Ssxb1",MAGIC_data)], aes(-PseudoTime, Ssxb1)) + geom_point(stroke=0, alpha=0.3, size=0.5)
ggplot(cell_data[nmf_predict("Ssxb1")], aes(-PseudoTime, Ssxb1)) + geom_point(stroke=0, alpha=0.3, size=0.5)
ggplot(cell_data[sda_predict("Ssxb1")], aes(-PseudoTime, Ssxb1)) + geom_point(stroke=0, alpha=0.3, size=0.5)

Data Compression

MAGIC_data <- readRDS("../data/imputation/MAGIC_data.rds")

results_train2 <- results_train
results_train2$loadings[[1]] <- as.numeric(results_train2$pips[[1]]>0.5) * results_train2$loadings[[1]]
results_train2$loadings[[1]] <- Matrix(results_train2$loadings[[1]], sparse = TRUE)

format(object.size(MAGIC_data$result), units="Mb", digits=1)
format(object.size(results_train[c("scores","loadings")]), units="Mb", digits=1)
format(object.size(data), units="Mb", digits=1)

format(object.size(results_train2[c("scores","loadings")]), units="Mb", digits=1)

format(object.size(results_train[c("loadings")]), units="Mb", digits=1)
format(object.size(results_train2[c("loadings")]), units="Mb", digits=1)

DCA

# pip3 install dca --user wells
# pip3 install tensorflow --user wells
# 
# from dca.api import dca
# import anndata
# 
# adata = anndata.read_text("../data/conradV3/V3_SDAmerged_mouse_V3_SDA.data")
# 
# adata.obs_names = cellinfo.Cell
# adata.var_names = geneinfo.Gene

raw_data <- readRDS("../data/imputation/raw_data_train.rds")
normalised <- readRDS("../data/imputation/train_0.8_normalised.rds")

write.csv(as.matrix(normalised), "../data/imputation/counts_train_normalised.csv")

write.csv(as.matrix(t(raw_data[rownames(normalised), colnames(normalised)])), "../data/imputation/counts_train.csv")

library(data.table)

write.csv(as.matrix(t(raw_data[rownames(normalised), colnames(normalised)])), "../data/imputation/counts_train.csv")

str(fread("../data/imputation/counts_train.csv"))



#  ~.local/bin/dca ../data/imputation/counts_train.csv ../data/imputation/DCAresults


dcapredict <- fread("../data/imputation/DCAresults/mean.tsv")

dcapredict_m <- t(as.matrix(dcapredict[,-c("V1"),with=F]))

colnames(dcapredict_m) <- dcapredict$V1

# Not sure this will work due to different normalisation within DCA
DCA_AUCs <- calculateAUCs(dcapredict_m)


saveRDS(DCA_AUCs[[1]], file = "../data/imputation/cellAUCs_DCA.rds")

save(DCA_AUCs, file = "../data/imputation/imputed_ranks_DCA.rds")

Not sure calculateAUC will work due to different normalisation within DCA

Function to calculate ranks & AUCs

# NB this function wastes time by recaculating train and average AUCs

calculateAUCs <- function(predicted_train, 
                          raw_data_test_file="../data/imputation/raw_data_test.rds",
                          raw_data_train_file="../data/imputation/raw_data_train.rds",
                          count_recovery_file="../data/imputation/count_dge_recovery_information.rds",
                          shift=F){
  # Reverse Normalisation of training data

  load(count_recovery_file)

  cell_subset <- rownames(predicted_train)

  predicted_train <- reverse_normalisation(predicted_train, sign = TRUE, cells=cell_subset, lib_correction=lib_size_correction_factor, standard_dev=sds)

  mean(predicted_train<0)

  if(shift){
    predicted_train <- predicted_train + abs(quantile(predicted_train, 1/1e3))
  }


  predicted_train[predicted_train < 0] <- 0

  mean(predicted_train==0)

  # Normalise each cell by total
  # row = cell, col = gene
  predicted_train <- predicted_train/rowSums(predicted_train)

  str(predicted_train)
  # rm(predicted_train_counts_0)
  # rm(predicted_train)

  # Load raw test & train data

  raw_data_test <- readRDS(raw_data_test_file)
  raw_data_test <- raw_data_test[rownames(predicted_train), colnames(predicted_train)]

  # row = cell, gene = col
  raw_data_train <- readRDS(raw_data_train_file)
  raw_data_train <- raw_data_train[rownames(predicted_train), colnames(predicted_train)]



  # AUC
  # randomise ordering, because gene order is from average high to low by default, leaking information
  set.seed(42)
  reorder = sample(1:ncol(raw_data_train))

  # row = cell, gene = col
  av_exp_order <- names(sort(Matrix::colSums(raw_data_train), T))

  predicted_train = predicted_train[,reorder]
  raw_data_test2 = raw_data_test[,reorder]
  raw_data_train2 = raw_data_train[,reorder]



  cumsum_predictF <- function(i){
    testvec = raw_data_test2[i,]
    cumsum(testvec[order(predicted_train[i,], decreasing=T)]) / sum(testvec)
  }

  cumsum_trainF <- function(i){
    testvec = raw_data_test2[i,]
    cumsum(testvec[order(raw_data_train2[i,], decreasing=T)]) / sum(testvec)
  }

  cumsum_meanF <- function(i){
    testvec = raw_data_test2[i,]
    cumsum(testvec[av_exp_order]) / sum(testvec) #order(reorder)
  }


  cumsum_predict <- sapply(1:nrow(raw_data_train), cumsum_predictF)
  cumsum_train <- sapply(1:nrow(raw_data_train), cumsum_trainF)
  cumsum_mean <- sapply(1:nrow(raw_data_train), cumsum_meanF)


  cellAUCs <- data.table(cell = rownames(raw_data_train),
                         predict = colSums(cumsum_predict) /ncol(raw_data_train),
                         train = colSums(cumsum_train) /ncol(raw_data_train),
                         mean = colSums(cumsum_mean) /ncol(raw_data_train))

  return(list(cellAUCs, cumsum_predict, cumsum_train, cumsum_mean))

}

Calculate & Save AUCs / ranks

NB do this part on a server with lots of memory >16GB

SDA_AUCs <- calculateAUCs(predicted_train)


saveRDS(SDA_AUCs[[1]], file = "../data/imputation/cellAUCs_SDA.rds")

save(SDA_AUCs, file = "../data/imputation/imputed_ranks.rds")




SDA_AUCs_shifted <- calculateAUCs(predicted_train, shift=T)

saveRDS(SDA_AUCs_shifted[[1]], file = "../data/imputation/cellAUCs_SDA_shifted.rds")

save(SDA_AUCs_shifted, file = "../data/imputation/imputed_ranks_SDA_shifted.rds")




PCA_AUCs <- calculateAUCs(predicted_train_PCA)

saveRDS(PCA_AUCs[[1]], file = "../data/imputation/cellAUCs_PCA.rds")

save(PCA_AUCs, file = "../data/imputation/imputed_ranks_PCA.rds")



ICA_AUCs <- calculateAUCs(predicted_train_ica)

saveRDS(ICA_AUCs[[1]], file = "../data/imputation/cellAUCs_ICA.rds")

save(ICA_AUCs, file = "../data/imputation/imputed_ranks_ICA.rds")



NNMF_AUCs <- calculateAUCs(predicted_train_NNMF)

saveRDS(NNMF_AUCs[[1]], file = "../data/imputation/cellAUCs_NNMF.rds")

save(NNMF_AUCs, file = "../data/imputation/imputed_ranks_NNMF.rds")

Plot AUC for each cell

cellAUCs <- readRDS("../data/imputation/cellAUCs_SDA.rds")

density_auc <- get_density(cellAUCs$mean, cellAUCs$predict, kern=0.025)
density_auc_t <- get_density(cellAUCs$train, cellAUCs$predict, kern=0.025)

plot_train <- ggplot(cbind(cellAUCs, density_auc_t), aes(train, predict, color=density_auc_t)) +
  geom_point(size=0.1) +
  geom_abline(colour="red") +
  scale_colour_viridis(option="inferno") + 
  theme_minimal() +
  theme(legend.position = "none") +
  labs(colour="Cell density", x="Cellwise Training Expression AUC", y="Imputed Expression AUC")

plot_mean <- ggplot(cbind(cellAUCs, density_auc), aes(mean, predict, color=density_auc)) +
  geom_point(size=0.1) +
  geom_abline(colour="red") +
  scale_colour_viridis(option="inferno") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(colour="Cell density", x="Average Training Expression AUC", y="Imputed Expression AUC")

plot_train
plot_mean

vs PCA

cellAUCs <- readRDS("../data/imputation/cellAUCs_SDA.rds")
cellAUCs_SDA_shifted <- readRDS("../data/imputation/cellAUCs_SDA_shifted.rds")
cellAUCs_PCA <- readRDS("../data/imputation/cellAUCs_PCA.rds")
cellAUCs_ICA <- readRDS("../data/imputation/cellAUCs_ICA.rds")
cellAUCs_NNMF <- readRDS("../data/imputation/cellAUCs_NNMF.rds")
cellAUCs_MAGIC <- readRDS("../data/imputation/cellAUCs_MAGIC.rds")
#cellAUCs_DCA <- readRDS("../data/imputation/cellAUCs_DCA.rds")

cellAUCs <- merge(cellAUCs, cellAUCs_PCA, by="cell", suffixes = c("",".PCA"))
cellAUCs <- merge(cellAUCs, cellAUCs_ICA, by="cell", suffixes = c("",".ICA"))
cellAUCs <- merge(cellAUCs, cellAUCs_NNMF, by="cell", suffixes = c("",".NNMF"))
cellAUCs <- merge(cellAUCs, cellAUCs_MAGIC, by="cell", suffixes = c("",".MAGIC"))
#cellAUCs <- merge(cellAUCs, cellAUCs_SDA_shifted, by="cell", suffixes = c("",".SDA_shifted"))

cellAUCs <- cellAUCs[,c("cell","predict","train","mean","predict.PCA","predict.ICA","predict.NNMF","predict.MAGIC"),with=F]

saveRDS(cellAUCs, "../data/imputation/cellAUCs.rds")

ggplot(cellAUCs, aes(train, predict)) + geom_hex(bins=150) + scale_fill_viridis(option="inferno") + geom_abline(colour="red") + ylim(0.8,1)
#ggplot(cellAUCs, aes(train, predict.SDA_shifted)) + geom_hex(bins=150) + scale_fill_viridis(option="inferno") + geom_abline(colour="red") + ylim(0.8,1)
# fraction cells better or not vs SDA

cellAUCs[,mean(predict>predict.PCA)]
# 0.74
cellAUCs[,mean(predict<predict.PCA)]
# 0.26

cellAUCs[,mean(predict>predict.NNMF)]
# 0.21
cellAUCs[,mean(predict<predict.NNMF)]
# 0.79


# X vs Y plot
ggplot(cellAUCs, aes(predict.PCA, predict)) + geom_point(stroke=0, size=1, alpha=0.3) + geom_abline(intercept = 0,slope=1,col='red')

ggplot(cellAUCs, aes(predict.ICA, predict)) + geom_point(stroke=0, size=1, alpha=0.3) + geom_abline(intercept = 0,slope=1,col='red')

ggplot(cellAUCs, aes(predict.NNMF, predict)) + geom_point(stroke=0, size=1, alpha=0.3) + geom_abline(intercept = 0,slope=1,col='red')

# density difference plot
ggplot(cellAUCs, aes(predict.PCA - predict)) + geom_density()
ggplot(cellAUCs, aes(predict.ICA - predict)) + geom_density()
ggplot(cellAUCs, aes(predict.NNMF - predict)) + geom_density()

# Overlaping Densities plot
tmp <- melt(cellAUCs, id="cell", variable.name = "Method", value.name = "AUC")[Method %in% c("predict","train","mean","predict.PCA","predict.ICA","predict.NNMF","predict.MAGIC")]

tmp[,Method := gsub("predict\\.?","",Method)]
tmp[,Method := gsub("train","Unimputed",Method)]
tmp[,Method := gsub("mean","Mean_Cell",Method)]
tmp[Method=="",Method := "SDA"]
tmp[, Method := factor(Method, levels=c("Unimputed","Mean_Cell","SDA","NNMF","ICA","PCA","MAGIC"))]

compare_imputation <- ggplot(tmp,
       aes(AUC, colour=Method)) +
  geom_density() +
  scale_color_manual(values=c("#E41A1C", "black",brewer.pal(7, "Set1")[-c(6,1)])) + 
  theme(legend.position = "none") +
  facet_zoom(xlim = c(0.91,0.94), zoom.size = 1)

saveRDS(compare_imputation, "../data/plots/compare_imputation.rds")

pdf("../results/other_factorisations/compare_imputation.pdf")
compare_imputation
dev.off()

which cells are better worse in NNMF vs SDA

ggplot(cell_data[cellAUCs], aes(Tsne1_QC1, Tsne2_QC1, colour=predict.NNMF/predict < 0.99)) +
  geom_jitter(size=0.5, stroke=0, height = 0.25, width = 0.25) +
  scale_color_manual(values=c("lightgrey","red")) + theme_minimal()

ggplot(cell_data[cellAUCs], aes(Tsne1_QC1, Tsne2_QC1, colour=predict.NNMF/predict > 1.01)) +
  geom_jitter(size=0.5, stroke=0, height = 0.25, width = 0.25) +
  scale_color_manual(values=c("lightgrey","red")) + theme_minimal()

Compare imputed expression of example gene

All methods are similar

example_gene = "Smok2b"

pred_exp_example <- data.table("Unimputed"=(normalised[,c(example_gene)]), cell=rownames(normalised))
setkey(pred_exp_example, cell)
pred_exp_example <- pred_exp_example[sda_predict(example_gene, results_train, ".SDA")][nmf_predict(example_gene, NNMF_training, ".NNMF")][ica_predict(example_gene,  ica_training, ".ICA")][pca_predict(example_gene, pca_training, ".PCA")][magic_predict(example_gene, MAGIC_data,".MAGIC")]

setnames(pred_exp_example, names(pred_exp_example), gsub(paste0(example_gene,"."),"",names(pred_exp_example)))

ex_imputed <- ggplot(melt(pred_exp_example, id.vars = "cell", value.name = example_gene, variable.name = "Method")[cell_data, on="cell"][!is.na(Method)],
       aes(-PseudoTime, get(example_gene), colour=Method)) + 
  geom_point(stroke=0, size=0.3, alpha=0.5)+
  scale_colour_manual(values = brewer.pal(7, "Set1")[-6])+
  facet_wrap(~Method, ncol=1, scales = "free_y") +
  ylab(paste0(example_gene, " Imputed Expression")) +
  theme(legend.position = "none")

Example Curves

extract a few cells for examples

set.seed(42)
sample_cells <- c(5749, sample(1:20036, size = 99, replace = F))

load(file = "../data/imputation/imputed_ranks_NNMF.rds")
NNMF_cumsum_pred <- NNMF_AUCs[[2]][,sample_cells]
load(file = "../data/imputation/imputed_ranks_ICA.rds")
ICA_cumsum_pred <- ICA_AUCs[[2]][,sample_cells]
load(file = "../data/imputation/imputed_ranks_PCA.rds")
PCA_cumsum_pred <- cumsum_predict[,sample_cells]
load(file = "../data/imputation/imputed_ranks.rds")
SDA_cumsum_pred <- cumsum_predict[,sample_cells]
load(file = "../data/imputation/imputed_ranks_MAGIC.rds")
MAGIC_cumsum_pred <- MAGIC_AUCs[[2]][,sample_cells]

saveRDS(list("Unimputed" = cumsum_train[,sample_cells],
             "Mean_Cell" = cumsum_mean[,sample_cells],
             "NNMF" = NNMF_AUCs[[2]][,sample_cells],
             "ICA" = ICA_AUCs[[2]][,sample_cells],
             "PCA" = PCA_cumsum_pred,
             "SDA" = SDA_cumsum_pred,
             "MAGIC" = MAGIC_cumsum_pred), "../data/imputation/imputed_ranks_sample_all_methods.rds")
imputed_ranks <- readRDS("../data/imputation/imputed_ranks_sample_all_methods.rds")

plotCellAUC(1, imputed_ranks)
plotCellAUC(4, imputed_ranks)
plotCellAUC(6, imputed_ranks)

tSNE

Which cells are better with / without imputation

vs the mean cell apprach

setkey(cellAUCs, cell)

# which cells are best
c <- ggplot(merge(cell_data, cellAUCs)[predict/mean>1], aes(Tsne1_QC1, Tsne2_QC1, colour=predict/mean)) +
  geom_point(size=0.3, stroke=0) + 
  scale_color_viridis(direction = -1) +
  theme_classic(base_size=8) +
  theme(legend.position = "top",
        legend.justification = "right",
        legend.margin=margin(t = 0, unit='cm'),
        axis.line=element_blank(),
      axis.text.x=element_blank(),
      axis.text.y=element_blank(),
      axis.ticks=element_blank(),
      axis.title.x=element_blank(),
      axis.title.y=element_blank(),
      rect = element_rect(fill = "transparent"),
      panel.background = element_rect(fill = "transparent"), # bg of the panel
      plot.background = element_rect(fill = "transparent", color = NA)) +
  guides(colour=guide_colourbar(barwidth=5, barheight=0.5, title.position="top")) +
  labs(x="", y="", colour="Ratio of Imputed to\n Average Expression AUC")

c2 <- ggplot(merge(cell_data, cellAUCs)[predict/mean>1], aes(Tsne1_QC1, Tsne2_QC1, colour=predict/mean)) +
  geom_point(size=1, stroke=0) + 
  scale_color_viridis(direction = -1) +
  labs(x="tSNE1", y="tSNE2", colour="Ratio of Imputed to\n Average Expression AUC") +
  theme(legend.position = "bottom") +
  guides(colour=guide_colourbar(barwidth=15, barheight=0.5, title.position="top"))

c

vs the no imputation

density_lib <- get_density(log(merge(cell_data, cellAUCs)$library_size), merge(cell_data, cellAUCs)[,predict/train], kern=0.07)

ggplot(merge(cell_data, cellAUCs), aes(log(library_size), predict/train, colour=density_lib)) +
  geom_point(size=0.1)+
  scale_colour_viridis() +
  theme_minimal() +
  geom_hline(yintercept = 1, colour="red") +
  labs(y="Ratio of SDA Imputed AUC\n to Unimputed AUC", x="Log Library Size (Total UMI Counts)")

lib_size_plot <- ggplot(merge(cell_data, cellAUCs), aes(log10(library_size), predict/train, colour=-PseudoTime)) +
  geom_point(size=0.1)+
  scale_colour_viridis(guide_colorbar(title="PseudoTime")) +
  theme_minimal() +
  scale_y_continuous(breaks=c(1.0,1.4,1.8,2.2))+
  guides(colour=guide_colourbar(barwidth=10,label.position="bottom")) +
  theme(legend.position = "bottom") +
  geom_hline(yintercept = 1, colour="black") +
  labs(y="Ratio of SDA Imputed AUC\n to Unimputed AUC", x="Log10 Library Size (Total UMI Counts)")

lib_size_plot

80:20

predict80 <- apply(cumsum_predict, 2, function(x) min(which(x>0.8)/nrow(cumsum_predict)))
train80 <- apply(cumsum_train, 2, function(x) min(which(x>0.8)/nrow(cumsum_predict)))
mean80 <- apply(cumsum_mean, 2, function(x) min(which(x>0.8)/nrow(cumsum_predict)))

frac80 <- data.table(mean80, train80, predict80)

density_frac80 <- get_density(frac80$mean80, frac80$predict80, kern=0.025)
density_frac80_t <- get_density(frac80$train80, frac80$predict80, kern=0.025)

ggplot(cbind(frac80, density_frac80_t), aes(train80, predict80, color=density_frac80_t)) +
  geom_point(size=0.1) +
  geom_abline(colour="red") +
  scale_colour_viridis() + 
  theme_minimal() +
  theme(legend.position = "none") +
  labs(colour="Cell density", x="Cellwise Training Fraction of Genes for 80% Cumsum", y="Imputed Fraction of Genes for 80% Cumsum")

ggplot(cbind(cellAUCs, density_frac80), aes(mean80, predict80, color=density_frac80)) +
  geom_point(size=0.1) +
  geom_abline(colour="red") +
  scale_colour_viridis() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(colour="Cell density", x="Average Training Fraction of Genes for 80% Cumsum", y="Imputed Fraction of Genes for 80% Cumsum")
# add tSNE subplot
plot_mean_2 <- ggdraw() +
  draw_plot(plot_mean, 0, 0, 1, 1) +
  draw_plot(c, 0.5, 0.07, 0.5, 0.5)

save figure

genes_tmp <- c("Zbtb16","Prdm9","Piwil1","Ccna1","Ssxb1","Lrrd1","Gapdhs")

imputation_plot <- imputed_vs_raw(genes_tmp)
imputation_plot

compare_imputation <- readRDS("../data/plots/compare_imputation.rds")

right_panelA <- plot_grid(plotCellAUC(1, imputed_ranks[c(1:2,6,3:5,7)]) + scale_colour_manual(values=c("#E41A1C", "black",brewer.pal(7, "Set1")[-c(6,1)])),
                         compare_imputation, ncol=2, labels=c("B","C"))
right_panelB <- plot_grid(gonia_SDA_zoom, gonia_NNMF, ncol=2, rel_widths = c(3,4.5), labels = c("D",""))
right_panelC <- plot_grid(Rhox2h_prediction, blood_SDA_vs_NNMF, ncol=2, rel_widths = c(3,2), labels = c("E","F"))

right_panel <- plot_grid(right_panelA, right_panelB, right_panelC, ncol=1)


pdf(width = 13, height = 11, file = "../results/imputation/imputation.pdf")
plot_grid(imputation_plot,
          right_panel,
          labels = c("A",""),
          ncol=2)
dev.off()

#pdf(width = 4250, height = 2500, res = 300, file = "../results/imputation/imputationAUC2.png")
heatmapgrob <- readRDS("../data/plots/heatmapgrob.rds")

right <- plot_grid(heatmapgrob, plot_grid(NULL, lib_size_plot),ncol=1, rel_heights = c(3,1), labels = c("B","C"))

pdf("../results/Imputation_Supp.pdf", height = 12, width=15)
plot_grid(ex_imputed, right, ncol=2, rel_widths = c(1,3), labels = c("A",""))
dev.off()


MyersGroup/testisAtlas documentation built on Nov. 29, 2020, 9:21 p.m.