R/StabilityFunctions.R

Defines functions stability.sim comp.trees.levels comp.trees comp.models.levels get.tree.levels comp.models rank.cor.dist euclidian.dist L2.norm cosin.dist L1.dist kullback.leibler Pval.dist

Documented in comp.models comp.models.levels comp.trees comp.trees.levels cosin.dist euclidian.dist get.tree.levels kullback.leibler L1.dist L2.norm Pval.dist rank.cor.dist stability.sim

################################################################################
##  FUNCTIONS FOR STABILITY ANALYSIS OF THE ONCOGENETIC TREES MIXTURE MODEL   ##
################################################################################
## Function for calculating the P value of a given similarity measure.
## What is the probability for obtaining the same or smaller
## value in a random vector of similarity values.
Pval.dist <- function(dist.val, random.vals) {
  return((sum(random.vals <= dist.val) + 1) /(length(random.vals) + 1))
}
######################################################################
## Function for calculating the Kullback-Leibler divergence between
## two discrete probability distributions p and q.
kullback.leibler <- function(p, q) {
  if(length(p) != length(q))
    stop("Error: The distribution vectors have different lengths!")  
  return(sum(p * log(p / q)))
}
######################################################################
## Function for calculating the L1 distance between
## two discrete probability distributions p and q.
L1.dist <- function(p, q) {
  if(length(p) != length(q))
    stop("Error: The distribution vectors have different lengths!")

  return(sum(abs(p - q)))
}
######################################################################
## Function for calculating the cosine distance between
## two discrete probability distributions p and q.
cosin.dist <- function(p, q) {
  if(length(p) != length(q))
    stop("Error: The distribution vectors have different lengths!")

  return(1 - (sum (p * q) / (L2.norm(p) * L2.norm(q)) ))
}
######################################################################
## Function for calculating the L2 norm of a given vector x.
L2.norm <- function(x) {
  return(sqrt(sum(x * x)))
}
######################################################################
## Function for calculating the Euclidian distance between
## two vectors x and y.
euclidian.dist <- function(x, y) {
  if(length(x) != length(y))
    stop("Error: The vectors have different lengths!")

  return(sqrt(drop((x - y) %*% (x - y))))
}
######################################################################
## Function for calculating the rank-correlation distance between
## two vectors x and y.
rank.cor.dist <- function(x, y) {
  if(length(x) != length(y))
    stop("Error: The vectors have different length!")

  return(1 - rcorr(x, y, type = "spearman")[[1]][1, 2])
}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the trees of two mixture models mixture1 and mixture2.
## It returns a value that uses the number of different edges for caracterizing
## the difference between the models.
## For the comparisson it is necessary that the two models have the same
## number of tree components build on the same number of genetic events.
## It is assumed that the mixtures have at least two components (the first one is the noise).
comp.models <- function(mixture1, mixture2) {
  if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel"))
    stop("The specified mixture models should be of type RtreemixModel!")
  
  if(numTrees(mixture1) != numTrees(mixture2))
    stop("The two specified mixture models don't have the same number of trees!")
  
  if(eventsNum(mixture1) != eventsNum(mixture2))
    stop("The two specified mixture models don't have the same number of events!")
  no.trees <- numTrees(mixture1)
  if (no.trees == 1)
    stop("The mixture models should have at least two tree components, where the first one is the noise component!")
  
  no.events <- eventsNum(mixture1)
  
  true.order <- list() ## list specifying the edges in the components of mixture1
  fit.order <- list() ## list specifying the edges in the components of mixture2

  ## Build the vectors that characterize the edges of the components of the mixture models.
  ## The noise components are ignored since they always have the same (star) topology.
  for(k in 2:no.trees) {
    true.vec <- character(0)
    fit.vec <- character(0)
    for(l in 2:no.events) {
      ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l])))
      true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))       
      ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l])))
      fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))        
    }
    names(true.vec) <- nodes(getTree(mixture1, k))[-1]
    true.order <- c(true.order, list(true.vec))
    names(fit.vec) <- nodes(getTree(mixture2, k))[-1]
    fit.order <- c(fit.order, list(fit.vec))
  }
  ## Build the comparisson matrix.
  ## The (i, j) element is the number of distinct edges of the
  ## (i + 1)-th and (j + 1)-th component of the models
  ## mixture1 and mixture2 respectively.  
  comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1)
  for(i in 1:(no.trees - 1))
    for(j in 1:(no.trees - 1)) {
      comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]])
    }
  ## Form the similarity pairs and calculate the similarity of the models
  ## as a sum of the number of different edges of the trees in the similarity pairs:
  if(no.trees > 2) {
    rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE)
    diff.sum <- min(comp.mat)
    row.index <- c(rc[1, 1]) ## get the row index of the minimum
    col.index <- c(rc[1, 2]) ## get the column index of the minimum

    for(m in 1:(nrow(comp.mat) - 1)) {
      rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE)
      diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)])
      row.index <- c(row.index, rc[1, 1])
      col.index <- c(col.index, rc[1, 2])    
    }
  } else {
    diff.sum <- comp.mat[1, 1]
  }
  ## Return a result between 0 and 1.    
  return(diff.sum/((no.trees - 1) * (no.events - 1)))  

}
######################################################################
## Function that assignes to each node the level at which that node is
## in a specific tree (tree.num) of the trees mixture model.
## The start.val is the maximum depth of the tree with which the tree
## specified with tree.num will be compared.
get.tree.levels <- function(mixture, tree.num, start.val) {
  root <- nodes(getTree(mixture, tree.num))[1]
  levels <- acc(getTree(mixture, tree.num), root)
  ## vec <- rep(max(levels[[1]]), eventsNum(mixture) - 1)
  vec <- rep(start.val, eventsNum(mixture) - 1)
  names(vec) <- nodes(getTree(mixture, tree.num))[-1]
  
  vec[names(levels[[1]])] <- levels[[1]]

  return(vec)
}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the trees of two mixture models mixture1 and mixture2.
## It returns a value that uses the number of different edges and the depth at
## which the events are, for caracterizing the difference between the models.
## This measure is more application specific, since the depth at which the
## events are in a tree is very important for disease progression.
## For the comparisson it is necessary that the two models have the same
## number of tree components build on the same number of genetic events.
## It is assumed that the mixtures have at least two components (the first one is the noise).
comp.models.levels <- function(mixture1, mixture2) {
  if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel"))
    stop("The specified mixture models should be of type RtreemixModel!")
  
  if(numTrees(mixture1) != numTrees(mixture2))
    stop("The two specified mixture models don't have the same number of trees!")

  if(eventsNum(mixture1) != eventsNum(mixture2))
    stop("The two specified mixture models don't have the same number of events!")
  no.trees <- numTrees(mixture1)
  if (no.trees == 1)
    stop("The mixture models should have at least two tree components, where the first one is the noise component!")  
  
  no.events <- eventsNum(mixture1)
  
  true.order <- list() ## list specifying the edges in the components of mixture1
  fit.order <- list() ## list specifying the edges in the components of mixture2

  start.vals1 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture1 (without the noise component) 
  start.vals2 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture2 (without the noise component)

  ## Build the vectors that characterize the edges of the components of the mixture models.
  ## The noise components are ignored since they always have the same (star) topology.  
  for(k in 2:no.trees) {
    true.vec <- character(0)
    fit.vec <- character(0)
    for(l in 2:no.events) {
      ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l])))
      true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))       
      ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l])))
      fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))        
    }
    names(true.vec) <- nodes(getTree(mixture1, k))[-1]
    true.order <- c(true.order, list(true.vec))
    names(fit.vec) <- nodes(getTree(mixture2, k))[-1]
    fit.order <- c(fit.order, list(fit.vec))

    ## Assign the (maximal depths + 1) for each tree in the mixtures
    start.vals1[k - 1] <- max(acc(getTree(mixture1, k), nodes(getTree(mixture1, k))[1])[[1]] + 1, na.remove = TRUE)
    start.vals2[k - 1] <- max(acc(getTree(mixture2, k), nodes(getTree(mixture2, k))[1])[[1]] + 1, na.remove = TRUE)                          
  }
  ## Build the comparisson matrix.
  ## The (i, j) element is the number of distinct edges of the
  ## (i + 1)-th and (j + 1)-th component of the models
  ## mixture1 and mixture2 respectively.    
  comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1)
  for(i in 1:(no.trees - 1))
    for(j in 1:(no.trees - 1)) {
      comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]])
    }
  ## Form the similarity pairs and calculate the similarity of the models
  ## as a sum of the number of different edges of the trees in the similarity pairs
  ## and their corresponding L1-distance of the levels of the events:
  if(no.trees > 2) {
    rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE)
    diff.sum <- min(comp.mat) +
      L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]),
              get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]]))
    row.index <- c(rc[1, 1]) ## get the row index of the minimum
    col.index <- c(rc[1, 2]) ## get the column index of the minimum

    for(m in 1:(nrow(comp.mat) - 1)) {
      rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE)
      diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)]) +
        L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]),
                get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]]))
      row.index <- c(row.index, rc[1, 1])
      col.index <- c(col.index, rc[1, 2])   
    }
  } else {
    if(no.trees == 2) {
      diff.sum <- comp.mat[1, 1] +
        L1.dist(get.tree.levels(mixture1, 2, start.vals2[1]),
                get.tree.levels(mixture2, 2, start.vals1[1]))
    } else {
      stop("The specified mixture models must have at least two tree components, where the first one is the noise component!")
    }
  }
    
  return(diff.sum)  

}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the nontrivial tree components of a specified mixture model.
## It returns a value that uses the number of different edges for caracterizing
## the difference of the trees in the model.
## For the comparisson it is necessary that the model has at least two
## nontrivial components.
comp.trees <- function(model) {
  no.trees <- numTrees(model)
  if(no.trees <= 2)
    stop("The specified mixture model should have at least two nontrivial tree components.")
  
  no.events <- eventsNum(model)
  true.order <- list() ## list specifying the edges in the nontrivial components of the model  
  ## Build the vectors that characterize the
  ## edges of the nontrivial components of the mixture model.
  for(k in 2:no.trees) {
    true.vec <- character(0)
    for(l in 2:no.events) {
      ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l])))
      true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))             
    }
    names(true.vec) <- nodes(getTree(model, k))[-1]
    true.order <- c(true.order, list(true.vec))    
  }
  ## Calculate the similarity of the nontrivial components of the model 
  ## as a sum of the number of different edges of all combinations of
  ## two different nontrivial trees:
  diff.sum <- 0
  for(k1 in 2:(no.trees - 1)) 
    for(k2 in (k1 + 1):no.trees) 
      diff.sum <- diff.sum + sum(true.order[[k1 - 1]] != true.order[[k2 - 1]])
    
  ## Return a result between 0 and 1.
  return(diff.sum / (choose((no.trees - 1), 2) * (no.events - 1)))
}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the nontrivial tree components of a specified mixture model.
## It returns a value that uses the  number of different edges and the depth at
## which the events are, for caracterizing the difference of the trees in the model.
## For the comparisson it is necessary that the model has at least two
## nontrivial components.
comp.trees.levels <- function(model) {
  no.trees <- numTrees(model)
  if(no.trees <= 2)
    stop("The specified mixture model should have at least two nontrivial tree components.")
  
  no.events <- eventsNum(model)
  start.vals <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the nontrivial tree components in the model 
  true.order <- list() ## list specifying the edges in the nontrivial components of the model 
  
  ## Build the vectors that characterize the
  ## edges of the nontrivial components of the mixture model.  
  for(k in 2:no.trees) {
    true.vec <- character(0)
    for(l in 2:no.events) {
      ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l])))
      true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))             
    }
    names(true.vec) <- nodes(getTree(model, k))[-1]
    true.order <- c(true.order, list(true.vec))
    
    ## Assign the maximal depths + 1 for each tree in the mixtures
    start.vals[k - 1] <- max(acc(getTree(model, k), nodes(getTree(model, k))[1])[[1]] + 1, na.remove = TRUE)    
  }
  ## Calculate the similarity of the models as a sum of the number of
  ## different edges of all possible combinations of two different
  ## nontrivial trees in the model and their corresponding L1-distance
  ## of the levels of the events:
  diff.sum <- 0
  for(k1 in 2:(no.trees - 1)) {
    for(k2 in (k1 + 1):no.trees) {
      diff.sum <- diff.sum + (sum(true.order[[k1 - 1]] != true.order[[k2 - 1]])
                              +  L1.dist(get.tree.levels(model, k1, start.vals[k2 - 1]),
                                         get.tree.levels(model, k2, start.vals[k1 - 1])))
    }
  }
  return(diff.sum)
}
######################################################################
## Stability analysis of the oncogenetic trees mixture model.
## This includes stability analysis on different levels of the mixture
## model: GPS values, encoded probability distribution, tree topologies.
## The function outputs a list of analysis for the mentioned features.
## Each analysis contains the values of different similarity measures
## with their corresponding p-values.
## The models should have at least two components (the first one is the noise).
stability.sim <- function(no.trees = 3, ## number of tree components
                          no.events = 9, ## number of genetic events
                          prob = c(0.2, 0.8), ## interval for the edgeweights of the random mixture models 
                          no.draws = 300, ## number of samples drawn from the random models used for learning a mixture model
                          no.rands = 100, ## number of rands for calculatin the p-values
                          no.sim = 1 ## number of simulation iterations
                          ) {
  ## Set the true types.
  no.trees <- as.integer(no.trees)
  no.events <- as.integer(no.events)
  no.draws <- as.integer(no.draws)
  no.rands <- as.integer(no.rands)
  no.sim <- as.integer(no.sim)
  ## Check if the necessary parameters are provided and have the correct form.
  if(no.trees < 2)
    stop("The specified mixture model should have at least two
tree components (where the first one is the noise).")
  if (no.events < 1)
    stop("The number of events must be an integer greater than zero!")
  if(no.draws < 1)
    stop("The number of draws (number of samples) must be an integer greater than zero!")  
  if (no.rands < 1)
    stop("The number of random models for the p-values  must be an integer greater than zero!")   
  if (no.sim < 1)
    stop("The number of iterations for the waiting time simulation
must be an integer greater than zero!")
  if(!missing(prob)) {
    if(!is.numeric(prob) || (length(prob) != 2))
      stop("Specify the boundaries of the conditional probabilities as a numeric vector
of length two = c(min, max)!")  
    if(prob[2] < prob[1])
      stop("In the probability vector the lower boundary must be smaller than the
upper boundary!")
  }
  
  simGPS <- numeric(0) ## results from the stability analysis of the GPS values
  topo.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models
  topo.levels.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models.levels
  result.distr <- numeric(0) ## results from the stability analysis of the probability distribution encoded by the model
  
  mat.true.gps <- numeric(0) ## matrix containing the true GPS values from each simulation iteration
  mat.fit.gps <- numeric(0) ## matrix containing the corresponding fitted GPS values from each simulation iteration

  list.true.models <- list() ## list containing the true models from each simulation iteration
  list.fit.models <- list() ## list containing the corresponding fitted models from each simulation iteration
  ## Simulation iterations:
  for(i in 1:as.integer(no.sim)) {
    print(i) ## simulation iteration
    ## Pick a true model from the space of random models and draw data from it.
    true.m <- generate(K = no.trees, no.events = no.events, prob = prob)
    Weights(true.m) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1)))
    sim.data <- sim(model = true.m, no.draws = no.draws)    
    list.true.models <- c(list.true.models, true.m)
    ## Calculate the GPS and the distribution encoded by true.m
    true.gps <- GPS(gps(model = true.m, data = sim.data, no.sim = 10000))
    mat.true.gps <- cbind(mat.true.gps, true.gps)
    true.distr <- distribution(model = true.m)$probability
    ## Fit a mixture model to the data sim.data    
    fm <- fit(data = sim.data, K = no.trees, noise = TRUE, equal.edgeweights = TRUE, eps = 0.01)
    list.fit.models <- c(list.fit.models, fm)
    ## Calculate the GPS and the distribution encoded by fm
    fit.gps <- GPS(gps(model = fm, data = sim.data, no.sim = 10000))
    mat.fit.gps <- cbind(mat.fit.gps, fit.gps)
    fit.distr <- distribution(model = fm)$probability
    ## Compute different distances between the distributions induced
    ## by the true and fitted models:
    true.cosin <- cosin.dist(true.distr, fit.distr)
    true.l1 <- L1.dist(true.distr, fit.distr)
    true.kull <- kullback.leibler(true.distr, fit.distr)
    ## Compute different distances between the GPS values
    ## for the true and fitted models:
    true.euclGPS <- euclidian.dist(true.gps, fit.gps)
    true.rank.dist <-  rank.cor.dist(true.gps, fit.gps)
    ## Compute different similarity measures for comparing the
    ## topologies of the nontrivial components of the true and fitted models:
    true.topo.dif <- comp.models(true.m, fm)
    true.topo.levels.dif <- comp.models.levels(true.m, fm)
    ## Vectors for keeping the values for the different features
    ## of the random mixture models needed for the p-values calculation:
    rand.euclGPS <- numeric(0)
    rand.rankGPS <- numeric(0)

    rand.cos.distr <- numeric(0)
    rand.l1.distr <- numeric(0)
    rand.kull.distr <- numeric(0)

    rand.topo <- numeric(0)
    rand.topo.levels <- numeric(0)
    ## Create the vectors with random values for the p-values calculation:
    for(j in 1:(as.integer(no.rands) - 1)) {
      ## Generate random model and calculate the GPS and distribution:
      model <- generate(K = no.trees, no.events = no.events, prob = prob)
      Weights(model) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1)))      
      random.gps <- GPS(gps(model = model, data = sim.data, no.sim = 10000))
      random.distr <- distribution(model = model)$probability
      ## GPS:
      rand.euclGPS <- c(rand.euclGPS, euclidian.dist(true.gps, random.gps))      
      rand.rankGPS <- c(rand.rankGPS, rank.cor.dist(true.gps, random.gps))
      ## Distribution:
      rand.cos.distr <- c(rand.cos.distr, cosin.dist(true.distr, random.distr))
      rand.l1.distr <- c(rand.l1.distr, L1.dist(true.distr, random.distr))      
      rand.kull.distr <- c(rand.kull.distr, kullback.leibler(true.distr, random.distr))
      ## Tree topologies:
      rand.topo <- c(rand.topo, comp.models(true.m, model))
      rand.topo.levels <- c(rand.topo.levels, comp.models.levels(true.m, model))
    }
    ## Stability analysis of GPS values
    ## (using Euclidian distance and rank correlation distance as similarity measures):
    simGPS <- rbind(simGPS,
                    c(true.euclGPS, Pval.dist(true.euclGPS, rand.euclGPS),   
                      true.rank.dist, Pval.dist(true.rank.dist, rand.rankGPS)))
    ## Stability analysis of induced distributions
    ## (using cosine distance, L1 distance, Kullback-Leibler divergence as similarity measures):
    result.distr <- rbind(result.distr,
                          c(true.cosin, Pval.dist(true.cosin, rand.cos.distr),
                            true.l1, Pval.dist(true.l1, rand.l1.distr),
                            true.kull, Pval.dist(true.kull, rand.kull.distr)))
    ## Stability analysis of tree topologies
    ## (using comp.models and comp.models.levels as similarity measures):
    topo.dif <- rbind(topo.dif,
                      c(true.topo.dif, Pval.dist(true.topo.dif, rand.topo)))
    topo.levels.dif <- rbind(topo.levels.dif,
                             c(true.topo.levels.dif, Pval.dist(true.topo.levels.dif, rand.topo.levels)))
    
  }
  ## Organize the output:
  colnames(simGPS) <- c("eucl.dist", "p.val.eucl", "rank.cor.dist", "p.val.rank")
  colnames(result.distr) <- c("cos", "p.val.cos", "L1", "p.val.L1", "KL", "p.val.KL")
  colnames(topo.dif) <- c("topo.dif", "p.value")
  colnames(topo.levels.dif) <- c("topo.levels.dif", "p.value")
  colnames(mat.true.gps) <- c(1:no.sim)
  colnames(mat.fit.gps) <- c(1:no.sim)
  output <- list(simGPS, result.distr,
                 topo.dif, topo.levels.dif,
                 mat.true.gps, mat.fit.gps,
                 list.true.models, list.fit.models)
  names(output) <- c("GPS", "Distribution", "Tree topologies (distinct edges)",
                     "Tree topologies (distinct edges + levelsL1dist)", "True GPS",
                     "Fitted GPS", "True models", "Fitted models")
    
  return(output)  
}

Try the Rtreemix package in your browser

Any scripts or data that you put into this service are public.

Rtreemix documentation built on Nov. 8, 2020, 5:57 p.m.