knitr::read_chunk("../../analysis/chunks.R")



library(ggplot2)
library(ggrastr)
library(dplyr)
library(parallel)
library(dropestr)
library(dropEstAnalysis)
library(Matrix)

theme_set(theme_base)

Load data

kDataPath <- '../../data/dropest/'
kTablesPath <- '../../output/tables/'
k10xSubfolders <- c(poisson='est_01_14_precise/', real='est_01_14_barcodes/',
                    unmerged='est_01_14_unmerged/', merge_all='est_01_16_merge_all/')

kDropSeqSubolders <- c(poisson='est_01_16_precise/', unmerged='est_01_16_unmerged/',
                       merge_all='est_01_16_merge_all/')
kDataFiles <- list(
  `10x`=paste0(kDataPath, '10x/hgmm_6k/', k10xSubfolders, "hgmm6k.rds") %>%
    setNames(names(k10xSubfolders)),
  drop_seq=paste0(kDataPath, 'dropseq/thousand/', kDropSeqSubolders, "thousand.rds") %>%
    setNames(names(kDropSeqSubolders))
)
holders <- mclapply(kDataFiles, function(paths) mclapply(paths, readRDS, mc.cores=4),
                    mc.cores=2)

validation_data <- mclapply(holders, function(hs) list(
  merge_targets = lapply(hs, function(holder) unlist(holder$merge_targets)),
  cms_raw = lapply(hs, `[[`, 'cm_raw'),
  cms = lapply(hs, `[[`, 'cm')
), mc.cores=8)


validation_data$`10x`$cms_raw <- lapply(validation_data$`10x`$cms_raw,
                                        function(cm) cm[grep("^[^;]+$", rownames(cm)),])
validation_data$`10x`$cms <- lapply(validation_data$`10x`$cms,
                                    function(cm) cm[grep("^[^;]+$", rownames(cm)),])

rm(holders)
invisible(gc())
# saveRDS(validation_data, paste0(kDataPath, 'human_mouse_mixture_validation_data.rds'))
# validation_data <- readRDS(paste0(kDataPath, 'human_mouse_mixture_validation_data.rds'))

10x

umis_per_cb <- Matrix::colSums(validation_data$`10x`$cms$unmerged) %>% sort(decreasing=T)
real_cbs <- names(umis_per_cb)[1:6000]
PlotCellsNumberLine(umis_per_cb[1:10000])
GeneSpeciesFromMixture <- function(cm, org1.marker, org1.name, org2.name) {
  res <- ifelse(substr(rownames(cm), 1, nchar(org1.marker)) == org1.marker, org1.name, org2.name)
  return(as.factor(res))
}

CellSpeciesFromMixture <- function(gene.species, cm) {
  res <- levels(gene.species) %>%
    lapply(function(l) cm[gene.species == l,] %>% Matrix::colSums())

  res <- levels(gene.species)[as.integer(res[[1]] < res[[2]]) + 1] %>% 
    setNames(colnames(cm)) %>% as.factor()

  return(res)
}
gene_species <- GeneSpeciesFromMixture(validation_data$`10x`$cms_raw$unmerged, 
                                       'hg', 'Human', 'Mouse')
cell_species <- CellSpeciesFromMixture(gene_species, 
                                       validation_data$`10x`$cms_raw$unmerged)

table(cell_species[real_cbs])
table(cell_species) / sum(table(cell_species))
merge_targets <- lapply(validation_data$`10x`$merge_targets, 
                        function(mt) mt[mt %in% real_cbs])
comparison_10x <- MergeComparisonSummary(merge_targets, cell_species, dataset="10x hgmm6k")
comparison_10x$`Merge type` <- c('Poisson', 'Known barcodes', 'Simple')
comparison_10x

Drop-seq

umis_per_cb <- Matrix::colSums(validation_data$drop_seq$cms$unmerged) %>% 
  sort(decreasing=T)
real_cbs <- names(umis_per_cb)[1:1000]
PlotCellsNumberLine(umis_per_cb[1:5000])
gene_species <- GeneSpeciesFromMixture(validation_data$drop_seq$cms_raw$unmerged, 
                                       'HUMAN', 'Human', 'Mouse')
cell_species <- CellSpeciesFromMixture(gene_species, 
                                       validation_data$drop_seq$cms_raw$unmerged)

table(cell_species[real_cbs])
table(cell_species)
merge_targets <- lapply(validation_data$drop_seq$merge_targets, 
                        function(mt) mt[mt %in% real_cbs])
comparison_drop_seq <- MergeComparisonSummary(merge_targets, cell_species, 
                                              dataset='Drop-seq thousand')
comparison_drop_seq$`Merge type` <- c('Poisson', 'Simple')
comparison_drop_seq
complete_table <- rbind(comparison_10x, comparison_drop_seq)
write.csv(complete_table, paste0(kTablesPath, 'merge_comparison.csv'), row.names=F)

Session information




VPetukhov/dropEstAnalysis documentation built on Dec. 28, 2019, 8:16 p.m.