Simulation results

Figure 2

knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE)

Get the compensated data from the CATALYST package to serve as the basis for the true expression values in the simulation study.

colnames(comped_nnls_exprs)
# the compensated data: flowFrame comped_nnls, with expression matrix comped_nnls_exprs

cols_metal <- c("Pr141Di", "Nd142Di", "Nd143Di", "Nd144Di", "Nd145Di", "Nd146Di", "Sm147Di",
"Nd148Di", "Sm149Di", "Nd150Di", "Eu151Di", "Sm152Di", "Eu153Di", "Sm154Di", "Gd155Di",
"Gd156Di", "Gd158Di", "Tb159Di", "Gd160Di", "Dy161Di")

Simulate the spillover matrix

Some helper functions

# get_spill_cols from the CATALYST package: for each channel, find the channels it can
# spill into (+/-1 Da neighbours, +16 oxides, and isotopic impurities of the same metal)
get_spill_cols <- function(ms, mets, l=CATALYST::isotope_list) {
  ms <- as.numeric(ms)
  spill_cols <- vector("list", length(ms))
  for (i in seq_along(ms)) {
    p1 <- m1 <- ox <- iso <- NULL
    if ((ms[i] + 1)  %in% ms) p1 <- which(ms == (ms[i] + 1))
    if ((ms[i] - 1)  %in% ms) m1 <- which(ms == (ms[i] - 1))
    if ((ms[i] + 16) %in% ms) ox <- which(ms == (ms[i] + 16))
    iso <- l[[mets[i]]]
    iso <- which(ms %in% iso[iso != ms[i]])
    spill_cols[[i]] <- unique(c(m1, p1, iso, ox))
  }
  spill_cols
}
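
As a quick illustration (a hypothetical three-channel mini-panel, not part of the analysis), get_spill_cols flags the +/-1 Da neighbours, +16 oxides and isotopic impurities of each channel as candidate spillover targets:

# hypothetical mini-panel, for illustration only
chs_demo  <- c("Nd142Di", "Nd143Di", "Gd158Di")
ms_demo   <- as.numeric(regmatches(chs_demo, regexpr("[0-9]+", chs_demo)))
mets_demo <- gsub("[[:digit:]]+Di", "", chs_demo)
get_spill_cols(ms_demo, mets_demo)
# Nd142Di may spill into Nd143Di (+1 Da, Nd isotope) and Gd158Di (+16 oxide);
# Nd143Di may spill into Nd142Di (-1 Da, Nd isotope); Gd158Di has no target here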

spill_cols_data <- function(data){
  ## get ms and mets from the channel names
  chs <- colnames(data)
  #### metal mass numbers, e.g. 141, 142, ...
  ms <- as.numeric(regmatches(chs, gregexpr("[0-9]+", chs)))
  #### metal names
  mets <- gsub("[[:digit:]]+Di", "", chs)
  ## get the spillover columns for each channel
  spill_cols <- get_spill_cols(ms, mets)
  return(spill_cols)
}

# generate n simulated spillover matrices for the channels of data
sm_simulation <- function(data, n, ...){
  ## get ms and mets from the channel names
  chs <- colnames(data)
  #### metal mass numbers, e.g. 141, 142, ...
  ms <- as.numeric(regmatches(chs, gregexpr("[0-9]+", chs)))
  #### metal names
  mets <- gsub("[[:digit:]]+Di", "", chs)
  ## get the spillover columns for each channel
  spill_cols <- get_spill_cols(ms, mets)

  sm <- vector("list",n)
  for (i in 1:n){
    m = diag(length(spill_cols))
    for (j in seq_along(spill_cols)){
      for (k in spill_cols[[j]]){
        m[j,k] <- runif(1, min=0, max=0.1)
      }
    }
    sm[[i]] <- m
  }
  return(sm)
}
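
A short usage sketch (using only the channel names defined above): each simulated spillover matrix keeps a unit diagonal and draws its off-diagonal effects uniformly from [0, 0.1] at the allowed spillover positions.

# sketch: simulate one spillover matrix from the 20 metal channel names
dummy <- matrix(0, nrow = 1, ncol = length(cols_metal), dimnames = list(NULL, cols_metal))
sm_demo <- sm_simulation(data = dummy, n = 1)[[1]]
all(diag(sm_demo) == 1)             # unit diagonal
range(sm_demo - diag(ncol(dummy)))  # off-diagonal effects lie within [0, 0.1]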

# generate simulated (spillover-affected) data by mixing the true signal through each spillover matrix
data_simulation <- function(signal, sm, ...){
  data <- vector("list",length(sm))
  for (i in seq_along(sm)){
    data[[i]] = signal%*%sm[[i]]
    colnames(data[[i]]) = colnames(signal)
  }
  return(data)
}

simulate_true_signal <- function(){
  set.seed(1)
  # randomly choose 10 channels with a single positive population; the other 10 get two
  single <- sort(sample(c(1:20), 10))
  double <- c(1:20)[-single]
  mean_single <- runif(10, min = 100, max = 600)
  mean_double1 <- runif(10, min = 100, max = 600)
  mean_double2 <- runif(10, min = 100, max = 600)

  data <- matrix(nrow = 100000, ncol = 20)

  j = 1
  for (i in single){
    data[,i] <- sample(c(rnorm(60000, mean = mean_single[j], sd = mean_single[j]/5),
                         rep(0, 40000)), 100000)
    j = j + 1
  }

  j = 1
  for (i in double){
    data[,i] <- sample(c(rnorm(30000, mean = mean_double1[j], sd = mean_double1[j]/5),
                         rnorm(30000, mean = mean_double2[j], sd = mean_double2[j]/5),
                         rep(0, 40000)), 100000)
    j = j + 1
  }

  colnames(data) <- cols_metal
  return(data)
}

NMF helper functions

library(tensorflow)
tf$compat$v1$disable_v2_behavior()

# binary mask of allowed spillover positions: 1 where channel i can spill into channel j
get_H_mask <- function(spill_cols){
  n_cols <- length(spill_cols)
  H_mask <- matrix(0, n_cols, n_cols)
  for (i in 1:n_cols){
    if (!is.null(spill_cols[[i]])){
      for (j in spill_cols[[i]]){
        H_mask[i,j] = 1
      }
    }
  }
  return(H_mask)
}
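
A tiny illustration of the resulting mask (a hypothetical two-channel case in which only channel 1 is allowed to spill into channel 2):

get_H_mask(list(2L, integer(0)))
#      [,1] [,2]
# [1,]    0    1
# [2,]    0    0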


# mask of spillover-dominated entries: 1 where a value is below the channel cutoff but non-zero
get_A_mask <- function(data, cutoffs){
  A_mask <- matrix(0, dim(data)[1], dim(data)[2])
  for (i in 1:dim(data)[2]){
    A_mask[which(data[,i] < cutoffs[i]), i] = 1
    A_mask[which(data[,i] == 0), i] = 0
  }
  return(A_mask)
}
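
A toy example (hypothetical values): entries strictly between zero and the channel cutoff are flagged as spillover-dominated (mask = 1), while exact zeros and above-cutoff entries are excluded.

# toy data: column 1 holds 0, 5, 50 and column 2 holds 0, 2, 80, with cutoffs of 10
toy <- matrix(c(0, 5, 50, 0, 2, 80), ncol = 2)
get_A_mask(toy, cutoffs = c(10, 10))
#      [,1] [,2]
# [1,]    0    0
# [2,]    1    1
# [3,]    0    0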

train_NMF <- function(data, spill_cols, cutoffs){
  #down sampling if necessary
  # data <- data[sample(nrow(data),5000),]
  tf$compat$v1$disable_v2_behavior()

  #get masks for A and H
  A_mask <- get_A_mask(data, cutoffs)
  H_mask <- get_H_mask(spill_cols)

  A = tf$constant(data)
  shape = dim(data)

  #initializing W,H
  # temp_H = matrix(rnorm(shape[2]*shape[2]),ncol=shape[2])
  # temp_H = temp_H/max(temp_H)
  #
  # temp_W = matrix(rnorm(shape[1]*shape[2]),ncol=shape[2])
  # temp_W = temp_W/max(temp_W)

  # temp_H = matrix(0.05, shape[2], shape[2]) * H_mask
  temp_H = matrix(runif(shape[2]*shape[2],min=0,max=0.1), shape[2], shape[2]) * H_mask
  temp_W = data - data*A_mask

  # W = tf$Variable(temp_W)
  W = tf$constant(temp_W)
  H =  tf$Variable(temp_H)

  A_mask = tf$constant(A_mask)
  H_mask = tf$constant(H_mask)
  H_masked = tf$multiply(H,H_mask)

  WH = tf$matmul(W, H_masked)
  # WH = tf$matmul(W, H)
  # cost: squared Frobenius norm of the masked residual (A - W %*% H - W)
  # cost = tf$reduce_sum(tf$pow(tf$multiply(A,A_mask) -  tf$multiply(WH,A_mask) - tf$multiply(W,A_mask), 2))
  cost = tf$reduce_sum(tf$multiply(tf$pow(A - WH - W, 2),A_mask))
  # cost = tf$reduce_sum(tf$norm(tf$multiply(A,A_mask) -  tf$multiply(WH,A_mask)))
  # Learning rate
  lr = 0.001
  # Number of steps
  steps = 1000
  # train_step = tf$train$GradientDescentOptimizer(lr)$minimize(cost)
  train_step = tf$compat$v1$train$GradientDescentOptimizer(lr)$minimize(cost)

  # Clipping operations. These keep the learnt H within [0, 0.1]
  # clip_W = W$assign(tf$maximum(tf$zeros_like(W), W))
  clip_H1 = H$assign(tf$maximum(tf$zeros_like(H), H))
  H_clip2 <- matrix(0.1, shape[2], shape[2])
  clip_H2 = H$assign(tf$minimum(H_clip2, H))
  # clip = tf$group(clip_W, clip_H)
  # clip = tf$group(clip_W,clip_H1, clip_H2)
  clip = tf$group(clip_H1, clip_H2)

  # Launch the graph and initialize the variables.
  sess = tf$compat$v1$Session()
  sess$run(tf$compat$v1$global_variables_initializer())

  for (step in 1:steps) {
    sess$run(train_step)
    sess$run(clip)
    if (step %% 100 == 0){
      cat("\nCost:",sess$run(cost), "\n")
      print("********************************")
    }
  }
  #
  # learnt_W = sess$run(W)
  # learnt_H = sess$run(H)
  # learnt_H_masked = learnt_H*get_H_mask(spill_cols)
  learnt_H_masked = sess$run(H_masked)
  return(learnt_H_masked)
}
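
For reference, the objective minimized above can be written in base R as the following sketch (A: observed data, W: the fixed matrix with spillover-dominated entries zeroed out, H: the masked spillover coefficients, A_mask: the mask of spillover-dominated entries); only the masked entries contribute to the squared error between the observed data and the spillover model W + W %*% H.

# base-R form of the masked least-squares objective used in train_NMF (a sketch)
masked_cost <- function(A, W, H, A_mask) {
  sum(((A - W %*% H - W)^2) * A_mask)
}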

Data simulation

set.seed(1)

true_signal <- simulate_true_signal()
sm_simulated <- sm_simulation(data = true_signal, n = 20)
data_simulated <- data_simulation(signal = true_signal, sm = sm_simulated)
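
A quick sanity check on the simulated data (a sketch): each simulated dataset is the true signal mixed through its spillover matrix, so the dimensions and channel names are preserved.

length(data_simulated)                             # 20 simulated datasets
all(dim(data_simulated[[1]]) == dim(true_signal))
all.equal(data_simulated[[1]], true_signal %*% sm_simulated[[1]], check.attributes = FALSE)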

Get the estimated spillover matrices based on the simulated data

library(CytoSpill)

# estimate spillover matrices with both the SQP-based CytoSpill routine and the NMF model
estimate_sm <- function(data, spill_cols){
  my_sm = vector("list",length(data))
  my_sm_nmf = vector("list",length(data))

  for (i in seq_along(data)){
    cutoffs_sim <- .DeriveCutoffs(data=data[[i]], cols=c(1:ncol(data[[i]])), n = 10000, flexrep = 5)
    threshold <- 0.1
    model <- .EstimateSpill(data=data[[i]], cutoffs=cutoffs_sim, cols=c(1:ncol(data[[i]])), upperbound = threshold, neighbor = 1)
    estimates <- model[[1]]
    xcols <- model[[2]]
    spillmat <- diag(length(xcols))
    for (m in 1:length(xcols)) {
      if (!is.na(xcols[[m]][1])) {
        for (j in 1:length(xcols[[m]])) {
          if (!is.na(estimates[[m]][j])){
            spillmat[xcols[[m]][j],m] <- min(estimates[[m]][j],threshold)
          }
        }
      }
    }
    sm_sim <- spillmat
    my_sm[[i]] <- sm_sim

    nmf_sim <- train_NMF(data[[i]], spill_cols, cutoffs = cutoffs_sim)
    my_sm_nmf[[i]] <- nmf_sim
  }
  return(list(my_sm, my_sm_nmf))
}

# estimate_sm_nmf <- function(data, spill_cols) {
#   my_sm_nmf = vector("list",length(data))
#   for (i in seq_along(data)){
#     cutoffs_sim <- .DeriveCutoffs(data=data[[i]], cols=c(1:ncol(data[[i]])), n = 5000)
#     nmf_sim <- train_NMF(data[[i]], spill_cols, cutoffs = cutoffs_sim)
#     my_sm_nmf[[i]] <- nmf_sim
#   }
#   return(my_sm_nmf)
# }

set.seed(1)
estimate_sm_simulated <- estimate_sm(data = data_simulated, spill_cols = spill_cols_data(true_signal))

##simulated_results.Rdata
#number of simulated spillover entries in each spillover matrix
spill_cols <- spill_cols_data(true_signal)
sum(sapply(spill_cols, length))
# truncate NMF-estimated spillover effects (clip values < 0 or > 0.1) and add the
# diagonal to form spillover matrices comparable to the SQP results
nmf_estimate_sm <- estimate_sm_simulated[[2]]
for (i in seq_along(nmf_estimate_sm)){
  nmf_estimate_sm[[i]][which(nmf_estimate_sm[[i]]<0)] <- 0
  nmf_estimate_sm[[i]][which(nmf_estimate_sm[[i]]>0.1)] <- 0.1
  nmf_estimate_sm[[i]] <- nmf_estimate_sm[[i]] + diag(20)
}
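
As a check (sketch), after this truncation every NMF estimate is a proper spillover matrix: a unit diagonal with off-diagonal effects confined to [0, 0.1].

# sketch: verify the unit diagonals and the [0, 0.1] range of the off-diagonal effects
all(sapply(nmf_estimate_sm, function(m) all(diag(m) == 1)))
range(sapply(nmf_estimate_sm, function(m) range(m - diag(nrow(m)))))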

Get comparison data

library(ggplot2)
# collect (simulated, estimated) spillover value pairs at every allowed spillover position
sm_comparison <- function(x, y, spill_cols){
  # errorlog = NULL
  comparison = NULL
  # errorpoint = NULL
  for (i in seq_along(x)){
    single <- NULL
    # errorcount = 0
    # errorcount2 = 0
    for (j in seq_along(spill_cols)){
      for (k in spill_cols[[j]]){
        single <- rbind(single,c(x[[i]][j,k],y[[i]][j,k]))
        # if (y[[i]][j,k]>0.049 & y[[i]][j,k]<0.05 ) {
        #   errorcount = errorcount+1
        #   errorpoint = rbind(errorpoint, c(j,k))
        # }
        # if (y[[i]][j,k]==0) errorcount2 = errorcount2+1
      }
    }
    # errorlog <- rbind(errorlog,c(errorcount,errorcount2))
    comparison <- rbind(comparison,single)
  }
  colnames(comparison) <- c('simulation','my_method')
  # return(list(comparison,errorlog,errorpoint))
  return(comparison)
}

slsqp_comparison <- sm_comparison(sm_simulated,estimate_sm_simulated[[1]],spill_cols = spill_cols)
nmf_comparison <- sm_comparison(sm_simulated,nmf_estimate_sm,spill_cols = spill_cols)
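
As a consistency check (sketch), each comparison matrix should contain one row per allowed spillover position and per simulated replicate:

# sketch: rows = (number of allowed spillover positions) x (number of simulated matrices)
nrow(slsqp_comparison) == sum(sapply(spill_cols, length)) * length(sm_simulated)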

ggplot(as.data.frame(slsqp_comparison), aes(x=my_method, y=simulation)) +
  geom_point(size=0.1) + theme_classic() +
  xlab('Sequential quadratic programming estimated spillover effects') +
  ylab('Simulated spillover effects') + coord_fixed()

ggplot(as.data.frame(nmf_comparison), aes(x=my_method, y=simulation)) +
  geom_point(size=0.1) + theme_classic() +
  xlab('NMF estimated spillover effects') +
  ylab('Simulated spillover effects') + coord_fixed()
# squared correlation (R^2) between simulated and estimated spillover effects
cor(slsqp_comparison[,1],slsqp_comparison[,2])^2  # SQP
cor(nmf_comparison[,1],nmf_comparison[,2])^2      # NMF
sessionInfo()

