In ajabadi/BIRSBIO2020.scNMTseq.PLS: scNMTseq challenge analysis using a PLS approach

library(knitr)
knitr::opts_chunk$set(collapse = TRUE, 
                      comment = "#>", 
                      fig.width = 8, 
                      cache = FALSE)

Note: These analyses are different from the ones presented in the hackathon due to updated preprocessing if data in order to harmonize the analyses for publication. For original analyses see https://github.com/ajabadi/scNMT_seq_gastrulation.

Load the required packages:

sapply(list.files('../R', full.names = TRUE), source)
local.lib <- '../lib'
dir.create(local.lib)
.libPaths(local.lib)
## uncomment this if you are not building the vignette along with the package
# remotes::install_github('mixOmicsTeam/mixOmics@MultiAssayExperiment', upgrade = 'never')

library(BIRSBIO2020.scNMTseq.PLS)
library(MultiAssayExperiment)
library(scater)
library(scran)
library(mixOmics)
library(ggplot2)
library(magrittr)
library(reshape2)
library(uwot)
library(impute)
library(nipals)

nipals_maxiter <- params$nipals_params['maxiter']
nipals_ncomp <- params$nipals_params['ncomp']
nipals_nhvr <- params$nipals_params['nhvr']

## create directories to save the figures and output data:
if (!dir.exists('figures')) {
  cat('creating "figures" folder ...\n')
  dir.create('figures')
}

if (!dir.exists('savedata')) {
   cat('creating "savedata" folder ...\n')
  dir.create('savedata')
}

Data

Details of the hackathon data and preprocessing steps: https://github.com/BIRSBiointegration/Hackathon/tree/master/scNMT-seq

Load the MultiAssayExperiment object from Cloudstor:

cat('loading data from Cloudstor ...\n')
gastru.mae_path <- url('https://cloudstor.aarnet.edu.au/plus/s/jsW7nh4YwThw8Q5/download')

cat('loading data from local folder ...\n')
gastru.mae_path <- 'savedata/scnmtseq_gastrulation_mae-sce.rds'

nipals_maxiter <- 10
nipals_ncomp <- 3
nipals_nhvr <- 200
cat('loading mini data from Cloudstor ...\n')
gastru.mae_path <- url('https://cloudstor.aarnet.edu.au/plus/s/tppSiI9GPuw9DUI/download')

gastru.mae <- readRDS(gastru.mae_path)

Filter MAE object

Keep cells which pass QC metrics and exclude putative extra-embryonic cells as they are starkly different in methylation patterns and will drive most of variation in the relatively small population of matching cells, not allowing to fully explore cell to cell variation intended for the purpose of this hackathon:

## subset RNA expression and DNA methylation modalities
keep_assays <- grep("rna|met",names(assays(gastru.mae)))
gastru.mae <- gastru.mae[,,keep_assays]
## remove putative extraembryonic cells
cat(sprintf('dropping lineages %s, plus the unassigned lineages.\n', paste(params$drop_lineages, collapse = ', ')))
filter_cells <- gastru.mae$lineage %in% params$drop_lineages
filter_cells <- filter_cells | is.na(gastru.mae$lineage)
## outlier cell to be filtered
filter_cells <- filter_cells | rownames(colData(gastru.mae)) == 'E6.75_Plate2_H10'
gastru.mae <- gastru.mae[,!filter_cells,]
## keep cells which pass RNA QC
gastru.mae <- gastru.mae[,gastru.mae$pass_rnaQC==TRUE,]
## Keep full rna SCE for UMAP
rna.sce <- gastru.mae@ExperimentList$rna
## keep cells that also pass QC for DNA methylation
gastru.mae <- gastru.mae[,gastru.mae$pass_metQC==TRUE,]
## rna SCE for cells passing met and rna QC
rna.sce.matching <- gastru.mae@ExperimentList$rna

Replace the rna SCE with normalised counts in MAE object as the integration wrapper requires matrices in assays:

## rna is an SCE object
gastru.mae@ExperimentList
## replace SCE with logcounts
gastru.mae@ExperimentList$rna <- logcounts(gastru.mae@ExperimentList$rna)
## rna updated to matrix of logcounts
gastru.mae@ExperimentList

Breakdown of the number of cells in each stage:

table(gastru.mae$stage) %>% as.data.frame() %>% t() %>% 
  set_rownames(c('stage', '# of cells')) %>% kable()

Feature detection

Create density plots of the feature detection rate across all cells for all modalities:

# get the methylation assays
met_assays <- grep(names(gastru.mae), pattern = '^met', value = TRUE)
# add dimensions to labels for ggplot
dims <- lapply(experiments(gastru.mae[,,met_assays]), dim)
dims <- sapply(dims, function(x) sprintf(' (%s, %s)', x[2], x[1]))
names(met_assays) <- paste0(met_assays, dims) %>% as.list()
# calculate the feature detection in a data.frame for methylation assays
coverages <- lapply(met_assays, function(assay_name) {
  mat <- assay(gastru.mae, assay_name)
  NAs <- rowSums(!is.na(mat))/dim(mat)[2]*100
  data.frame(pct_NAs=NAs)
})
# create a long data.frame containing the assay name for plot
coverages <- rbindListWithNames(coverages)
coverages$dataset <- factor(coverages$dataset, levels = unique(coverages$dataset), ordered = TRUE)

cov_plot <- ggplot(coverages, aes(x = pct_NAs)) + geom_density(fill = 'lightblue', show.legend = FALSE) +
  geom_vline(aes(xintercept=mean(pct_NAs)),
             color="blue", linetype="dashed", size=0.5) +
  labs(x = '% of cells detecting the feature') + facet_wrap(.~dataset, nrow = 2) +
  theme_bw() + theme(strip.text.x = element_text(size = 10, face = 'bold', color = 'purple'))

cov_plot

Density plots for methylation data show that shorter genomic regions tend to have less feature coverage. Dashed blue line indicates the average across all modalities.

ggsave(cov_plot, filename = 'figures/covplots.pdf', width = 8, height = 4)

RNA

Run PCA and then UMAP using PCs:

Cells passing rna QC (n = `r dim(rna.sce)[2]`)

npcs <- 15
## PCA first: retrieve npcs PCs
rna.sce <- runPCA(rna.sce,  ncomponents = npcs, name = 'PCA')

## UMAP parameters used:
cat(sprintf('Running UMAP with parameters %s\n', paste(names(params$umap_params), ':',params$umap_params, collapse = ', ')))

## run UMAP
set.seed(params$umap_params['run.seed'])
rna.sce <- runUMAP(rna.sce, dimred="PCA", 
                   ncomponents = params$umap_params['n_components'], 
                   n_neighbors = params$umap_params['n_neighbors'], 
                   min_dist = params$umap_params['min_dist'])

Cells passing both rna and methylation QC (n = `r dim(rna.sce.matching)[2]`)

decomp <- modelGeneVar(rna.sce.matching)
## filter by mean expression and significance of biological variation signal
hvgs <- rownames(decomp)[decomp$p.value<0.05 & decomp$mean > 0.01]
length(hvgs)

npcs <- 15
## PCA first: retrieve npcs PCs
rna.sce.matching <- runPCA(rna.sce.matching, ncomponents = npcs, subset_row=hvgs, name = 'PCA')

## UMAP parameters used:
cat(sprintf('Running UMAP with parameters %s\n', paste(names(params$umap_params), ':',params$umap_params, collapse = ', ')))

## run UMAP
set.seed(params$umap_params['run.seed'])
rna.sce.matching <- runUMAP(rna.sce.matching, dimred="PCA", 
                   ncomponents = params$umap_params['n_components'], 
                   n_neighbors = params$umap_params['n_neighbors'], 
                   min_dist = params$umap_params['min_dist'])

UMAP

Plot the first two UMAP components:

# Create colour palettes for stages:
stages <- c("E4.5", "E5.5", "E6.5", "E7.5")
# col_palette <- dput(viridisLite::viridis(n = length(stages)))
col_palette <- c("#440154FF", "#31688EFF", "#35B779FF", "#FDE725FF")
names(col_palette) <- stages

all cells

gg_rna_all <- plot_reducedDim(sce = rna.sce, reducedDim = 'UMAP', dims = c(1,2), col_palette = col_palette)
gg_rna_all

UMAP plot of all cells passing rna QC controls separates the cell populations from early-stage, mid-stage, and late-stage cells. This structure is also apparent in the matching cells' local embeddings but less strongly:

matching cells

gg_rna_matching <- plot_reducedDim(sce = rna.sce.matching, reducedDim = 'UMAP', dims = c(1,2), col_palette = col_palette)
gg_rna_matching

DNA

EMPCA

We perform PCA using an Expectation Maximization algorithm (NIPALS) for all methylation measurements using highly variable regions:

met_assays <- names(gastru.mae@ExperimentList)[-1]
nipals_comps <- list()
for (M in met_assays) {
  d <- t(gastru.mae@ExperimentList[[M]])
  nhvr <- nipals_nhvr
  d <- d[,order(-colVars(d, na.rm = TRUE))<=nhvr]
  cat('Performing nipals on:', M, '\n')
  nipals_comps[[M]] <-
    suppressWarnings({
      nipals::nipals(d, 
                     ncomp = nipals_ncomp, 
                     center = TRUE,
                     scale = FALSE,
                     nipals_maxiter)
    })

}
# cat('\nRunning nipals on', parallel::detectCores(), ' cpus...\n')
# nipals_comps <- mclapply(named_list(met_assays), FUN = function(M){
#   nipals::nipals(t(gastru.mae@ExperimentList[[M]]), 
#                  ncomp = 2, center = TRUE, scale = FALSE, 
#                  maxiter = ifelse(params$mini_run, 10, 500))
# }, mc.cores = parallel::detectCores())

nipals_comps <- readRDS('savedata/nipals_comps.rds')

saveRDS(nipals_comps, file = 'savedata/nipals_comps.rds')

Score plots

Plot PCs:

plot_nipals(nipals_comps, cell_order = colnames(gastru.mae@ExperimentList[[1]]), 
            stage = gastru.mae$stage, red_dim = 'PC', col_palette = col_palette, 
            dims = c(1, 2), facet_ncol = 2)

The first two components from Expectation Maximization PCA. Whereas these PCs mainly distinguish early-stage from late-stage cells in all methylation views, an observation in agreement with the transcriptome UMAP projection, the stages are better resolved in some views (e.g. met_cgi) than others.

Integration

PLS

Integration of all modalities with feature selection.

Here we select for 50 features on each component in multimodal_sPLS_wrapper:

## helper function to format runtime
format_runtime <- function(dsec) {
  hours <- floor(dsec / 3600)
  minutes <- floor((dsec - 3600 * hours) / 60)
  seconds <- dsec - 3600*hours - 60*minutes
  paste0(
    sapply(c(hours, minutes, seconds), function(x) {
      formatC(x, width = 2, format = "d", flag = "0")
    }), collapse = ":")
}

## number of components
ncomp <- params$pls_ncomp
## feature scaling
scale <- FALSE

cat(sprintf('Running PLS with %s components performing variable selection\n', ncomp))
st <- system.time({
  mmspls <-
    multimodal_sPLS_wrapper(mae = gastru.mae, study_assays = NULL,
                            ncomp = ncomp, scale = scale, design = 'null', lineages = NULL,
                            stages = NULL, DA = NULL, keepX = NULL, save = FALSE)
})['elapsed']
mmspls$stage <- gastru.mae$stage
mmspls$lineage<- gastru.mae$lineage
cat('\nPLS run finished. Runtime: ', format_runtime(st), '\n')

saveRDS(mmspls, file = 'savedata/MultiModalSparsePLS-All.rds')
# mmspls <- readRDS('savedata/MultiModalSparsePLS-All.rds')

score plots

Plot PLS components for cells across all views colored by embryonic stages:

pls_no.impute <- plot_pls(pls_obj = mmspls, stage = mmspls$stage, col_palette = col_palette)
pls_no.impute

ggsave(filename = 'figures/plotIndiv-MultiModalSparsePLS-stage.pdf')

The latent components which seek to maximise the sum of concordant variation between rna and other modalities separate early-stage cells from late-stage cells. It indicates coordinated changes across the methylomes associated with the distinct transcriptional states of cells in the regions of study during gastrulation.

Imputation

Impute the missing methylation data using a nearest neighbours algorithm:

# sum(!gastru.mae$pass_metQC)
#> 0
gastru.mae.imputed <- gastru.mae
met_assays <- grep(pattern = '^met', names(experiments(gastru.mae.imputed)), value = TRUE)
for (assay_type in met_assays) {
  assay_values <- gastru.mae.imputed@ExperimentList[[assay_type]]
  cat('\nimputing values for:', assay_type, '\n')
  gastru.mae.imputed@ExperimentList[[assay_type]] <- 
    impute.knn(data = assay_values, 
               k = 30, 
               rowmax = 0.99, 
               colmax = 0.99,  
               rng.seed=42)$data
}

saveRDS(gastru.mae.imputed, file = 'savedata/gastru.mae.imputed.rds')
# gastru.mae.imputed <- readRDS('savedata/gastru.mae.imputed.rds')

Data integration using imputed methylation data:

cat(sprintf('Running PLS on imputed data with %s components performing variable selection\n', ncomp))
st <- system.time({
  mmspls.imputed <-
    multimodal_sPLS_wrapper(mae = gastru.mae.imputed, study_assays = NULL,
                            ncomp = ncomp, scale = scale, design = 'null', lineages = NULL,
                            stages = NULL, DA = NULL, keepX = NULL, save = FALSE)
})['elapsed']

cat('\nPLS run finished. Runtime: ', format_runtime(st), '\n')

ggsave(filename = 'figures/plotIndiv-MultiModalSparsePLS-stage.pdf')

saveRDS(mmspls.imputed, file = 'savedata/MultiModalSparsePLS-All-imputed.rds')
# mmspls.imputed <- readRDS('savedata/MultiModalSparsePLS-All-imputed.rds')

score plots

Plot PLS components for cells across all views:

pls_impute <- plot_pls(pls_obj = mmspls.imputed, stage = gastru.mae.imputed$stage, col_palette = col_palette)
pls_impute

ggsave(filename = 'figures/plotIndiv-MultiModalSparsePLS-imputed-stage.pdf')

The met_DHS view has clearly improved in separating different stages.

Predictive performance of PLS and PCA components

We finally calculate the balanced prediction accuracy of a kmeans clustering algorithm using the first 2 components from PCA components from individual assays as well as using PLS variates from integration of original and imputed measurements:

rna_pca <- rna.sce.matching@int_colData$reducedDims@listData$PCA
all_scores <- c(list(rna = rna_pca), lapply(nipals_comps, function(x) x$scores[rownames(rna_pca),]))
# rna_pca <- rna_pca[rownames(mmspls$variates$rna),]
set.seed(427)
## kmeans clustering balanced accuracy with PCA components from RNA and DNA data:
df <- data.frame(
  PCA = sapply(all_scores, FUN = function(x) {
        kmeans_accuracy(variates = x[,1:2], labels = mmspls$stage)$BAR
    }),
  ## without imputation
    PLS = sapply(mmspls$variates, FUN = function(x) {
        kmeans_accuracy(variates = x[,1:2], labels = mmspls$stage)$BAR
    }),
    ## with imputation
    PLS_impute = sapply(mmspls.imputed$variates, FUN = function(x) {
        kmeans_accuracy(variates = x[,1:2], labels = mmspls$stage)$BAR
    })
)

kable(round(t(df), 2))

Surprisingly, For normalised gene expression data, the PLS components are more predictive of the embryonic stage than PCA components. Noticeably, without imputation, the components from all assays are nearly equally predictive with the exception of genebody methylome which underperforms the rest. After imputation, the accuracy of the prediction using PLS components is improved considerably for genebody methylation.

Looking at individual stages:

## helpers and function to create a data.frame of form:
##         Stage Method Modality Accuracy
##          E4.5    PCA      rna     0.50
##          E5.5    PLS      rna     0.35
##          E6.5    PCA      rna     0.75
.get_class_accuracy <- function(scores) {
  DF <- lapply(scores, FUN = function(x) {
  kmeans_accuracy(variates = x[,1:2], labels = mmspls$stage)$classAccuracy
})
  DF <- round(data.frame(DF),2)
  DF$Stage <- rownames(DF)
  rownames(DF) <- NULL
  DF
}

get_class_accuracy <- function(scores_list) {
  df_class <- lapply(scores_list, function(scores) {
    .get_class_accuracy(scores)
  })
  df_class <- rbindListWithNames(df_class, new_col = 'Method')
  df_class <- reshape2::melt(df_class, measure.vars = 1:5, 
                             value.name = 'Accuracy', 
                             variable.name = 'Modality')
  df_class
}

Get per class accuracy:

accuracy_per_class <- 
  get_class_accuracy(scores_list = 
                       list(PCA = all_scores, 
                            PLS = mmspls$variates,
                            PLS_impute = mmspls.imputed$variates))
head(accuracy_per_class)

Column plots of balanced accuracy rates for all modalities and stages:

ggplot(accuracy_per_class, aes(Stage, Accuracy)) + facet_wrap(.~Modality, ncol=2) +
  geom_col(aes(fill = Method), position = 'dodge') +
  theme_classic() +
  labs(y = 'Balanced Accuracy')

Views differ in their predictive performance for different stages. For instance, using PLS components the genebody methylation more predictive of embryonic stages in E4.5 and E5.5 cells compared to CGI methylation, whereas the opposite holds for late-stage cells.

purl('index.Rmd')

Conclusion

Multi-modal PLS derives the common variation between transcriptome and methylome in different genomic regions for r dim(gastru.mae@ExperimentList$rna)[2] single cells during mouse gastrulation. The used data types vary in dimensions and and biological variation. The common axes of variation are driven partly by the stages of embryo, which mainly separates early-stage and late-stage cells. The measurements include varying levels of masked values which are ignored by PLS algorithm during integration. Imputation of missing values using a nearest neighbours imputation method enabled the method to capture more cross-modality covariation relevant to embryonic stages in some data views. A classification of cells based on either PCA or PLS components revealed that the projected components of shared variation are more predictive of the embryonic stages than the Principal Components of the transcriptome. It also showed that different views resolve different phenotypic dynamics which highlights the importance of data integration to better understand the cellular behaviour.

ajabadi/BIRSBIO2020.scNMTseq.PLS documentation built on May 24, 2021, 4:19 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ajabadi/BIRSBIO2020.scNMTseq.PLS
scNMTseq challenge analysis using a PLS approach

In ajabadi/BIRSBIO2020.scNMTseq.PLS: scNMTseq challenge analysis using a PLS approach

Data

Filter MAE object

Feature detection

RNA

Cells passing rna QC (n = `r dim(rna.sce)[2]`)

Cells passing both rna and methylation QC (n = `r dim(rna.sce.matching)[2]`)

UMAP

all cells

matching cells

DNA

EMPCA

Score plots

Integration

PLS

Integration of all modalities with feature selection.

score plots

Imputation

Data integration using imputed methylation data:

score plots

Predictive performance of PLS and PCA components

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

ajabadi/BIRSBIO2020.scNMTseq.PLS scNMTseq challenge analysis using a PLS approach

In ajabadi/BIRSBIO2020.scNMTseq.PLS: scNMTseq challenge analysis using a PLS approach

Data

Filter MAE object

Feature detection

RNA

Cells passing rna QC (n = r dim(rna.sce)[2])

Cells passing both rna and methylation QC (n = r dim(rna.sce.matching)[2])

UMAP

all cells

matching cells

DNA

EMPCA

Score plots

Integration

PLS

Integration of all modalities with feature selection.

score plots

Imputation

Data integration using imputed methylation data:

score plots

Predictive performance of PLS and PCA components

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

ajabadi/BIRSBIO2020.scNMTseq.PLS
scNMTseq challenge analysis using a PLS approach

Cells passing rna QC (n = `r dim(rna.sce)[2]`)

Cells passing both rna and methylation QC (n = `r dim(rna.sce.matching)[2]`)