Simulation results

Figure 2

knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE)

Get the compensated data from the CATALYST package to serve as the basis for the true expression values in the simulation study.

colnames(comped_nnls_exprs)
# the compensated data: flowFrame comped_nnls, with expression matrix comped_nnls_exprs

cols_metal <- c("Pr141Di", "Nd142Di", "Nd143Di", "Nd144Di", "Nd145Di", "Nd146Di", "Sm147Di",
"Nd148Di", "Sm149Di", "Nd150Di", "Eu151Di", "Sm152Di", "Eu153Di", "Sm154Di", "Gd155Di",
"Gd156Di", "Gd158Di", "Tb159Di", "Gd160Di", "Dy161Di")

Simulate the spillover matrix

Some helper functions

# get_spill_cols from the CATALYST package: for each channel, find the channels it can
# spill into (+/-1 Da neighbours, +16 oxides, and isotopic impurities of the same metal)
get_spill_cols <- function(ms, mets, l=CATALYST::isotope_list) {
  ms <- as.numeric(ms)
  spill_cols <- vector("list", length(ms))
  for (i in seq_along(ms)) {
    p1 <- m1 <- ox <- iso <- NULL
    if ((ms[i] + 1)  %in% ms) p1 <- which(ms == (ms[i] + 1))
    if ((ms[i] - 1)  %in% ms) m1 <- which(ms == (ms[i] - 1))
    if ((ms[i] + 16) %in% ms) ox <- which(ms == (ms[i] + 16))
    iso <- l[[mets[i]]]
    iso <- which(ms %in% iso[iso != ms[i]])
    spill_cols[[i]] <- unique(c(m1, p1, iso, ox))
  }
  spill_cols
}
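
As a quick illustration (a hypothetical three-channel mini-panel, not part of the analysis), get_spill_cols flags the +/-1 Da neighbours, +16 oxides and isotopic impurities of each channel as candidate spillover targets:

# hypothetical mini-panel, for illustration only
chs_demo  <- c("Nd142Di", "Nd143Di", "Gd158Di")
ms_demo   <- as.numeric(regmatches(chs_demo, regexpr("[0-9]+", chs_demo)))
mets_demo <- gsub("[[:digit:]]+Di", "", chs_demo)
get_spill_cols(ms_demo, mets_demo)
# Nd142Di may spill into Nd143Di (+1 Da, Nd isotope) and Gd158Di (+16 oxide);
# Nd143Di may spill into Nd142Di (-1 Da, Nd isotope); Gd158Di has no target here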

spill_cols_data <- function(data){
  ## get ms and mets from the channel names
  chs <- colnames(data)
  #### metal mass numbers, e.g. 141, 142, ...
  ms <- as.numeric(regmatches(chs, gregexpr("[0-9]+", chs)))
  #### metal names
  mets <- gsub("[[:digit:]]+Di", "", chs)
  ## get the spillover columns for each channel
  spill_cols <- get_spill_cols(ms, mets)
  return(spill_cols)
}

# generate n simulated spillover matrices for the channels of data
sm_simulation <- function(data, n, ...){
  ## get ms and mets from the channel names
  chs <- colnames(data)
  #### metal mass numbers, e.g. 141, 142, ...
  ms <- as.numeric(regmatches(chs, gregexpr("[0-9]+", chs)))
  #### metal names
  mets <- gsub("[[:digit:]]+Di", "", chs)
  ## get the spillover columns for each channel
  spill_cols <- get_spill_cols(ms, mets)

  sm <- vector("list",n)
  for (i in 1:n){
    m = diag(length(spill_cols))
    for (j in seq_along(spill_cols)){
      for (k in spill_cols[[j]]){
        m[j,k] <- runif(1, min=0, max=0.1)
      }
    }
    sm[[i]] <- m
  }
  return(sm)
}
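
A short usage sketch (using only the channel names defined above): each simulated spillover matrix keeps a unit diagonal and draws its off-diagonal effects uniformly from [0, 0.1] at the allowed spillover positions.

# sketch: simulate one spillover matrix from the 20 metal channel names
dummy <- matrix(0, nrow = 1, ncol = length(cols_metal), dimnames = list(NULL, cols_metal))
sm_demo <- sm_simulation(data = dummy, n = 1)[[1]]
all(diag(sm_demo) == 1)             # unit diagonal
range(sm_demo - diag(ncol(dummy)))  # off-diagonal effects lie within [0, 0.1]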

# generate simulated (spillover-affected) data by mixing the true signal through each spillover matrix
data_simulation <- function(signal, sm, ...){
  data <- vector("list",length(sm))
  for (i in seq_along(sm)){
    data[[i]] = signal%*%sm[[i]]
    colnames(data[[i]]) = colnames(signal)
  }
  return(data)
}

simulate_true_signal <- function(){
  set.seed(1)
  # randomly choose 10 channels with a single positive population; the other 10 get two
  single <- sort(sample(c(1:20), 10))
  double <- c(1:20)[-single]
  mean_single <- runif(10, min = 100, max = 600)
  mean_double1 <- runif(10, min = 100, max = 600)
  mean_double2 <- runif(10, min = 100, max = 600)

  data <- matrix(nrow = 100000, ncol = 20)

  j = 1
  for (i in single){
    data[,i] <- sample(c(rnorm(60000, mean = mean_single[j], sd = mean_single[j]/5),
                         rep(0, 40000)), 100000)
    j = j + 1
  }

  j = 1
  for (i in double){
    data[,i] <- sample(c(rnorm(30000, mean = mean_double1[j], sd = mean_double1[j]/5),
                         rnorm(30000, mean = mean_double2[j], sd = mean_double2[j]/5),
                         rep(0, 40000)), 100000)
    j = j + 1
  }

  colnames(data) <- cols_metal
  return(data)
}

NMF helper functions

library(tensorflow)
tf$compat$v1$disable_v2_behavior()

# binary mask of allowed spillover positions: 1 where channel i can spill into channel j
get_H_mask <- function(spill_cols){
  n_cols <- length(spill_cols)
  H_mask <- matrix(0, n_cols, n_cols)
  for (i in 1:n_cols){
    if (!is.null(spill_cols[[i]])){
      for (j in spill_cols[[i]]){
        H_mask[i,j] = 1
      }
    }
  }
  return(H_mask)
}
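
A tiny illustration of the resulting mask (a hypothetical two-channel case in which only channel 1 is allowed to spill into channel 2):

get_H_mask(list(2L, integer(0)))
#      [,1] [,2]
# [1,]    0    1
# [2,]    0    0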


# mask of spillover-dominated entries: 1 where a value is below the channel cutoff but non-zero
get_A_mask <- function(data, cutoffs){
  A_mask <- matrix(0, dim(data)[1], dim(data)[2])
  for (i in 1:dim(data)[2]){
    A_mask[which(data[,i] < cutoffs[i]), i] = 1
    A_mask[which(data[,i] == 0), i] = 0
  }
  return(A_mask)
}
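
A toy example (hypothetical values): entries strictly between zero and the channel cutoff are flagged as spillover-dominated (mask = 1), while exact zeros and above-cutoff entries are excluded.

# toy data: column 1 holds 0, 5, 50 and column 2 holds 0, 2, 80, with cutoffs of 10
toy <- matrix(c(0, 5, 50, 0, 2, 80), ncol = 2)
get_A_mask(toy, cutoffs = c(10, 10))
#      [,1] [,2]
# [1,]    0    0
# [2,]    1    1
# [3,]    0    0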

train_NMF <- function(data, spill_cols, cutoffs){
  #down sampling if necessary
  # data <- data[sample(nrow(data),5000),]
  tf$compat$v1$disable_v2_behavior()

  #get masks for A and H
  A_mask <- get_A_mask(data, cutoffs)
  H_mask <- get_H_mask(spill_cols)

  A = tf$constant(data)
  shape = dim(data)

  #initializing W,H
  # temp_H = matrix(rnorm(shape[2]*shape[2]),ncol=shape[2])
  # temp_H = temp_H/max(temp_H)
  #
  # temp_W = matrix(rnorm(shape[1]*shape[2]),ncol=shape[2])
  # temp_W = temp_W/max(temp_W)

  # temp_H = matrix(0.05, shape[2], shape[2]) * H_mask
  temp_H = matrix(runif(shape[2]*shape[2],min=0,max=0.1), shape[2], shape[2]) * H_mask
  temp_W = data - data*A_mask

  # W = tf$Variable(temp_W)
  W = tf$constant(temp_W)
  H =  tf$Variable(temp_H)

  A_mask = tf$constant(A_mask)
  H_mask = tf$constant(H_mask)
  H_masked = tf$multiply(H,H_mask)

  WH = tf$matmul(W, H_masked)
  # WH = tf$matmul(W, H)
  # cost: squared Frobenius norm of the masked residual (A - W %*% H - W)
  # cost = tf$reduce_sum(tf$pow(tf$multiply(A,A_mask) -  tf$multiply(WH,A_mask) - tf$multiply(W,A_mask), 2))
  cost = tf$reduce_sum(tf$multiply(tf$pow(A - WH - W, 2),A_mask))
  # cost = tf$reduce_sum(tf$norm(tf$multiply(A,A_mask) -  tf$multiply(WH,A_mask)))
  # Learning rate
  lr = 0.001
  # Number of steps
  steps = 1000
  # train_step = tf$train$GradientDescentOptimizer(lr)$minimize(cost)
  train_step = tf$compat$v1$train$GradientDescentOptimizer(lr)$minimize(cost)

  # Clipping operations. These keep the learnt H within [0, 0.1]
  # clip_W = W$assign(tf$maximum(tf$zeros_like(W), W))
  clip_H1 = H$assign(tf$maximum(tf$zeros_like(H), H))
  H_clip2 <- matrix(0.1, shape[2], shape[2])
  clip_H2 = H$assign(tf$minimum(H_clip2, H))
  # clip = tf$group(clip_W, clip_H)
  # clip = tf$group(clip_W,clip_H1, clip_H2)
  clip = tf$group(clip_H1, clip_H2)

  # Launch the graph and initialize the variables.
  sess = tf$compat$v1$Session()
  sess$run(tf$compat$v1$global_variables_initializer())

  for (step in 1:steps) {
    sess$run(train_step)
    sess$run(clip)
    if (step %% 100 == 0){
      cat("\nCost:",sess$run(cost), "\n")
      print("********************************")
    }
  }
  #
  # learnt_W = sess$run(W)
  # learnt_H = sess$run(H)
  # learnt_H_masked = learnt_H*get_H_mask(spill_cols)
  learnt_H_masked = sess$run(H_masked)
  return(learnt_H_masked)
}
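
For reference, the objective minimized above can be written in base R as the following sketch (A: observed data, W: the fixed matrix with spillover-dominated entries zeroed out, H: the masked spillover coefficients, A_mask: the mask of spillover-dominated entries); only the masked entries contribute to the squared error between the observed data and the spillover model W + W %*% H.

# base-R form of the masked least-squares objective used in train_NMF (a sketch)
masked_cost <- function(A, W, H, A_mask) {
  sum(((A - W %*% H - W)^2) * A_mask)
}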

Data simulation

set.seed(1)

true_signal <- simulate_true_signal()
sm_simulated <- sm_simulation(data = true_signal, n = 20)
data_simulated <- data_simulation(signal = true_signal, sm = sm_simulated)
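
A quick sanity check on the simulated data (a sketch): each simulated dataset is the true signal mixed through its spillover matrix, so the dimensions and channel names are preserved.

length(data_simulated)                             # 20 simulated datasets
all(dim(data_simulated[[1]]) == dim(true_signal))
all.equal(data_simulated[[1]], true_signal %*% sm_simulated[[1]], check.attributes = FALSE)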

Get the estimated spillover matrices based on the simulated data

library(CytoSpill)

# estimate spillover matrices with both the SQP-based CytoSpill routine and the NMF model
estimate_sm <- function(data, spill_cols){
  my_sm = vector("list",length(data))
  my_sm_nmf = vector("list",length(data))

  for (i in seq_along(data)){
    cutoffs_sim <- .DeriveCutoffs(data=data[[i]], cols=c(1:ncol(data[[i]])), n = 10000, flexrep = 5)
    threshold <- 0.1
    model <- .EstimateSpill(data=data[[i]], cutoffs=cutoffs_sim, cols=c(1:ncol(data[[i]])), upperbound = threshold, neighbor = 1)
    estimates <- model[[1]]
    xcols <- model[[2]]
    spillmat <- diag(length(xcols))
    for (m in 1:length(xcols)) {
      if (!is.na(xcols[[m]][1])) {
        for (j in 1:length(xcols[[m]])) {
          if (!is.na(estimates[[m]][j])){
            spillmat[xcols[[m]][j],m] <- min(estimates[[m]][j],threshold)
          }
        }
      }
    }
    sm_sim <- spillmat
    my_sm[[i]] <- sm_sim

    nmf_sim <- train_NMF(data[[i]], spill_cols, cutoffs = cutoffs_sim)
    my_sm_nmf[[i]] <- nmf_sim
  }
  return(list(my_sm, my_sm_nmf))
}

# estimate_sm_nmf <- function(data, spill_cols) {
#   my_sm_nmf = vector("list",length(data))
#   for (i in seq_along(data)){
#     cutoffs_sim <- .DeriveCutoffs(data=data[[i]], cols=c(1:ncol(data[[i]])), n = 5000)
#     nmf_sim <- train_NMF(data[[i]], spill_cols, cutoffs = cutoffs_sim)
#     my_sm_nmf[[i]] <- nmf_sim
#   }
#   return(my_sm_nmf)
# }

set.seed(1)
estimate_sm_simulated <- estimate_sm(data = data_simulated, spill_cols = spill_cols_data(true_signal))

##simulated_results.Rdata
#number of simulated spillover entries in each spillover matrix
spill_cols <- spill_cols_data(true_signal)
sum(sapply(spill_cols, length))
# truncate NMF-estimated spillover effects (clip values < 0 or > 0.1) and add the
# diagonal to form spillover matrices comparable to the SQP results
nmf_estimate_sm <- estimate_sm_simulated[[2]]
for (i in seq_along(nmf_estimate_sm)){
  nmf_estimate_sm[[i]][which(nmf_estimate_sm[[i]]<0)] <- 0
  nmf_estimate_sm[[i]][which(nmf_estimate_sm[[i]]>0.1)] <- 0.1
  nmf_estimate_sm[[i]] <- nmf_estimate_sm[[i]] + diag(20)
}
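
As a check (sketch), after this truncation every NMF estimate is a proper spillover matrix: a unit diagonal with off-diagonal effects confined to [0, 0.1].

# sketch: verify the unit diagonals and the [0, 0.1] range of the off-diagonal effects
all(sapply(nmf_estimate_sm, function(m) all(diag(m) == 1)))
range(sapply(nmf_estimate_sm, function(m) range(m - diag(nrow(m)))))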

Get comparison data

library(ggplot2)
# collect (simulated, estimated) spillover value pairs at every allowed spillover position
sm_comparison <- function(x, y, spill_cols){
  # errorlog = NULL
  comparison = NULL
  # errorpoint = NULL
  for (i in seq_along(x)){
    single <- NULL
    # errorcount = 0
    # errorcount2 = 0
    for (j in seq_along(spill_cols)){
      for (k in spill_cols[[j]]){
        single <- rbind(single,c(x[[i]][j,k],y[[i]][j,k]))
        # if (y[[i]][j,k]>0.049 & y[[i]][j,k]<0.05 ) {
        #   errorcount = errorcount+1
        #   errorpoint = rbind(errorpoint, c(j,k))
        # }
        # if (y[[i]][j,k]==0) errorcount2 = errorcount2+1
      }
    }
    # errorlog <- rbind(errorlog,c(errorcount,errorcount2))
    comparison <- rbind(comparison,single)
  }
  colnames(comparison) <- c('simulation','my_method')
  # return(list(comparison,errorlog,errorpoint))
  return(comparison)
}

slsqp_comparison <- sm_comparison(sm_simulated,estimate_sm_simulated[[1]],spill_cols = spill_cols)
nmf_comparison <- sm_comparison(sm_simulated,nmf_estimate_sm,spill_cols = spill_cols)
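
As a consistency check (sketch), each comparison matrix should contain one row per allowed spillover position and per simulated replicate:

# sketch: rows = (number of allowed spillover positions) x (number of simulated matrices)
nrow(slsqp_comparison) == sum(sapply(spill_cols, length)) * length(sm_simulated)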

ggplot(as.data.frame(slsqp_comparison), aes(x=my_method, y=simulation)) +
  geom_point(size=0.1) + theme_classic() +
  xlab('Sequential quadratic programming estimated spillover effects') +
  ylab('Simulated spillover effects') + coord_fixed()

ggplot(as.data.frame(nmf_comparison), aes(x=my_method, y=simulation)) +
  geom_point(size=0.1) + theme_classic() +
  xlab('NMF estimated spillover effects') +
  ylab('Simulated spillover effects') + coord_fixed()
# squared correlation (R^2) between simulated and estimated spillover effects
cor(slsqp_comparison[,1],slsqp_comparison[,2])^2  # SQP
cor(nmf_comparison[,1],nmf_comparison[,2])^2      # NMF
sessionInfo()

