R/clustering.R

Defines functions gaussian.bmm.filter.clusters binomial.bmm.filter.clusters bmm.filter.clusters bmm.calculate.pvalue calculate.self.overlap clusterWithGaussianBmm reorderGaussianClust clusterWithBinomialBmm reorderBinomialClust clusterWithBmm reorderBetaClust hardClusterAssignments clusterVafs

Documented in binomial.bmm.filter.clusters bmm.calculate.pvalue bmm.filter.clusters calculate.self.overlap clusterVafs clusterWithBinomialBmm clusterWithBmm clusterWithGaussianBmm gaussian.bmm.filter.clusters hardClusterAssignments reorderBetaClust reorderBinomialClust reorderGaussianClust

##--------------------------------------------------------------------
## hand off the data to the appropriate clustering algorithm
##
## Each clustering algorithm function should take as input
## an NxM matrix of vafs (with M = # of samples and N = # of points to cluster),
## an NxM matrix of variant read counts,
## and an NxM matrix of reference read counts.
## e.g., a beta mixture model works on the vafs, while a binomial mixture
## model operates on the variant and reference counts directly.  These
## two specifications are equivalent--we should probably drop the vaf input
## and have the method compute those if it needs them.
## It should return a list containing the following items
##   cluster.assignments = a vector of length N containing a numeric cluster assignment
##   cluster.means = a matrix of size M x number_of_clusters containing the mean values
##                   of a cluster's vafs in each sample
##   cluster.upper = same as cluster.means, but containing upper confidence bounds instead of mean
##   cluster.lower = same as cluster.means, but containing lower confidence bounds instead of mean
##   fit.x = a vector of length P, where P is an arbitrary number of points between 0 and 1
##           where the model fit was sampled (in each of the M dimensions)
##   fit.y = a matrix of size MxP, containing the corresponding Y value for each X
##           Y values should be scaled between 0 and 1
##   individual.fits.y = a list of length number_of_clusters, holding the individual fits for each of the models, each represented by an M x P matrix (as for fit.y)

clusterVafs <- function(vafs.merged, vafMatrix, varMatrix, refMatrix, maximumClusters, method="bmm", params=NULL, samples=1, plotIntermediateResults = 0, verbose=0){

  ##check for suitable method
  apply.uncertainty.self.overlap.condition <- TRUE
  apply.min.items.condition <- TRUE
  apply.overlapping.std.dev.condition <- TRUE
  if(!is.null(params)){
    if(grepl("no.uncertainty.overlap.detection",params)){
      cat("Disable uncertainty overlap detection\n")
      apply.uncertainty.self.overlap.condition <- FALSE      
    }
    if(grepl("no.min.items.condition",params)){
      cat("Disable min cluster size detection\n")
      apply.min.items.condition <- FALSE
    }
    if(grepl("no.apply.overlapping.std.dev.condition", params)) {
      cat("Disable overlapping std dev condition\n")
      apply.overlapping.std.dev.condition <- FALSE
    }
  }
  
  if(method == "bmm"){
    if(!is.null(params)){
      ##handle input params to clustering method - only supports two for now...
      params=strsplit(params,", *",perl=TRUE)[[1]]

      overlap.threshold=0.7
      apply.pvalue.outlier.condition <- TRUE
      outlier.pvalue.threshold=0.01
      alpha0 = 0.005
      mu0 = 1
      c0 = NULL
      
      if(grepl("overlap.threshold",params)){
        overlap.threshold = as.numeric(strsplit(params[grep("overlap.threshold",params)] ," *= *",perl=TRUE)[[1]][2])
      }
      if(grepl("alpha0",params)){
        alpha0 = as.numeric(strsplit(params[grep("alpha0",params)] ," *= *",perl=TRUE)[[1]][2])
        cat(paste("Using alpha0 = ", alpha0, "\n", sep=""))
      }
      if(grepl("mu0",params)){
        mu0 = as.numeric(strsplit(params[grep("mu0",params)] ," *= *",perl=TRUE)[[1]][2])
        cat(paste("Using mu0 = ", mu0, "\n", sep=""))
      }
      if(grepl("no.pvalue.outlier.detection",params)){
        cat("Disable pvalue outlier condition\n")
        apply.pvalue.outlier.condition <- FALSE
      }
      if(grepl("outlier.pvalue.threshold",params)){
        outlier.pvalue.threshold = as.numeric(strsplit(params[grep(" outlier.pvalue.threshold",params)] ," *= *",perl=TRUE)[[1]][2])
      }
      if(grepl("c0",params)){
        c0 = as.numeric(strsplit(params[grep("c0",params)] ," *= *",perl=TRUE)[[1]][2])
        cat(paste("Using c0 = ", c0, "\n", sep=""))
      }

      return(clusterWithBmm(vafs.merged, vafMatrix, varMatrix, refMatrix, samples=samples, plotIntermediateResults=plotIntermediateResults, verbose=0, overlap.threshold=overlap.threshold,apply.pvalue.outlier.condition=apply.pvalue.outlier.condition,initialClusters=maximumClusters, outlier.pvalue.threshold=outlier.pvalue.threshold,apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition,alpha0=alpha0,mu0=mu0,c0=c0))
    }
    return(clusterWithBmm(vafs.merged, vafMatrix, varMatrix, refMatrix, samples=samples, plotIntermediateResults=plotIntermediateResults, verbose=0,initialClusters=maximumClusters,apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition))


  } else if(method == "binomial.bmm"){
    if(!is.null(params)){
      ##handle input params to clustering method - only supports one for now...
      params=strsplit(params,", *",perl=TRUE)[[1]]
      if(grepl("overlap.threshold",params)){
        val = as.numeric(strsplit(params[grep("overlap.threshold",params)] ," *= *",perl=TRUE)[[1]][2])
        return(clusterWithBinomialBmm(vafs.merged, vafMatrix, varMatrix, refMatrix, samples=samples, plotIntermediateResults=plotIntermediateResults, verbose=0, overlap.threshold=val,initialClusters=maximumClusters,apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition))
      }
    }
    return(clusterWithBinomialBmm(vafs.merged, vafMatrix, varMatrix, refMatrix, samples=samples, plotIntermediateResults=plotIntermediateResults, verbose=0,initialClusters=maximumClusters,apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition))


  } else if(method == "gaussian.bmm"){
    W0 = NULL;
    beta0 = NULL;
    nu0 = NULL;
    c0 = NULL;

    if(!is.null(params)){
      if(grepl("W0",params)){
        W0 = as.numeric(strsplit(params[grep("W0",params)] ," *= *",perl=TRUE)[[1]][2])
        cat(paste("Using W0 = ", W0, "\n", sep=""))
      }
      if(grepl("beta0",params)){
        beta0 = as.numeric(strsplit(params[grep("beta0",params)] ," *= *",perl=TRUE)[[1]][2])
        cat(paste("Using beta0 = ", beta0, "\n", sep=""))
      }
      if(grepl("nu0",params)){
        nu0 = as.numeric(strsplit(params[grep("nu0",params)] ," *= *",perl=TRUE)[[1]][2])
        cat(paste("Using nu0 = ", nu0, "\n", sep=""))
      }
      if(grepl("c0",params)){
        c0 = as.numeric(strsplit(params[grep("c0",params)] ," *= *",perl=TRUE)[[1]][2])
        cat(paste("Using c0 = ", c0, "\n", sep=""))
      }
    }

    if(!is.null(params)){
      ##handle input params to clustering method
      if(grepl("overlap.threshold",params)){
        val = as.numeric(strsplit(params[grep("overlap.threshold",params)] ," *= *",perl=TRUE)[[1]][2])
        return(clusterWithGaussianBmm(vafs.merged, vafMatrix, varMatrix, refMatrix, samples=samples, plotIntermediateResults=plotIntermediateResults, verbose=0, overlap.threshold=val,initialClusters=maximumClusters,apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition,nu0=nu0, beta0=beta0, W0=W0, c0=c0))
      }
    }
    return(clusterWithGaussianBmm(vafs.merged, vafMatrix, varMatrix, refMatrix, samples=samples, plotIntermediateResults=plotIntermediateResults, verbose=0,initialClusters=maximumClusters,apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition,nu0=nu0, beta0=beta0, W0=W0, c0=c0))
  } else {
    print("Error: please choose a supported clustering method\n[bmm|gaussian.bmm|binomial.bmm]");
    return(0);
  }
}


##--------------------------------------------------------------------------
## Go from fuzzy probabilities to hard cluster assignments
##
hardClusterAssignments <- function(numPoints,cluster.names,probabilities) {
    # Any point that has been removed from the data set will have a
    # probability of NA and will be given an assignment of 0,
    # indicating that it is an outlier.
    assignments <- rep(0,numPoints);
    for(n in 1:numPoints) {
        max.cluster <- 0
        max.assignment <- -1
        # Prune any points that do not have an appreciable probability
        # of being in their respective cluster
        #max.assignment <- 1/numClusters
        #if(numClusters > 2) {
        #  max.assignment <- (4/3)/numClusters
        #}
        for(k in 1:length(cluster.names)) {
            if ( !is.na(probabilities[n,k]) & (probabilities[n,k] > max.assignment) ) {
                max.assignment <- probabilities[n,k]
                max.cluster <- cluster.names[k]
            }
        }
        assignments[n] <- max.cluster
    }
    return(assignments)
}


## ----------------------------------------
## Reorder the parameters to the Beta model based on the order specified
## in the second argument (see reorderClusters)
reorderBetaClust <- function(clust, ord) {

  mu=list()
  alpha=list()
  nu=list()
  beta=list()
  pi=list()

  for(i in 1:length(ord[,1])){
    oldnum = ord[i,1]
    mu[[i]] = clust$mu[,oldnum]
    alpha[[i]] = clust$alpha[,oldnum]
    nu[[i]] = clust$nu[,oldnum]
    beta[[i]] = clust$beta[,oldnum]
    pi[[i]] = clust$pi[oldnum]
  }

  for(i in 1:length(ord[,1])){
    clust$mu[,i] = mu[[i]]
    clust$alpha[,i] = alpha[[i]]
    clust$nu[,i] = nu[[i]]
    clust$beta[,i] = beta[[i]]
    clust$pi[i] = pi[[i]]
  }

  return(clust)
}

##--------------------------------------------------------------------------
## Do clustering with bmm (beta mixture model)
##
clusterWithBmm <- function(vafs.merged, vafs, vars, refs, initialClusters=10, samples=1, plotIntermediateResults=0,
                           verbose=TRUE, overlap.threshold=0.7, apply.pvalue.outlier.condition=TRUE,
                           outlier.pvalue.threshold=0.01, apply.uncertainty.self.overlap.condition = TRUE, apply.min.items.condition = TRUE, apply.overlapping.std.dev.condition = TRUE, alpha0=0.005, mu0=1, c0=NULL) {
  
    suppressPackageStartupMessages(library(bmm))

    initialClusters=initialClusters
    if(length(vafs[,1]) <= initialClusters){
      #print(paste("ERROR: only",length(vafs[,1])," points; reducing number of clusters from", initialClusters, "to", length(vafs[,1]),"\n"))
      #initialClusters <- length(vafs[,1])
      print(paste("ERROR: only",length(vafs[,1])," points - not enough points to cluster when using",initialClusters,"intialClusters. Provide more data or reduce your maximumClusters option"))
      return(list(NULL))
    }

    #replace any values of zero with a very small number to prevent errors
    # NB:  It seems OK to add .Machine$double.eps (without the factor of 100)
    # to 0 to avoid trouble, but subtracting from 1 is not sufficient.  Presumably,
    # there isn't enough accuracy to represent such a number and it is stored as 1.
    shift.by.machine.precision <- TRUE
    #shift.by.machine.precision <- FALSE
    delta <- 100 * .Machine$double.eps
    if(shift.by.machine.precision) {
      vafs[which(vafs <= delta)] = delta
    } else {
      num.dimensions <- ncol(vafs)
      for(m in 1:num.dimensions) {
        flag <- vafs[,m] <= delta
        # Replace 0 values by 1 / num reads
        min.vaf <- min(vafs[!flag,m])
        vafs[flag,m] <- unlist(lapply(refs[flag,m], function(x) min(min.vaf, 1/x)))
      }
    }

    #also replace any values of one to prevent errors
    vafs[which(vafs >= (1-delta))] = 1 - delta

    
    ## Initialize the hyperparameters of the Beta mixture model (bmm).
    hyperparams <- init.bmm.hyperparameters(vafs, initialClusters)

    if(!is.null(c0)) {
      hyperparams$c0 <- rep(c0, length(hyperparams$c0))
    }
    
    hyperparams$alpha0 <- matrix(data=alpha0, nrow=nrow(hyperparams$alpha0), ncol=ncol(hyperparams$alpha0))
    hyperparams$beta0 <- hyperparams$alpha0

    hyperparams$mu0 <- matrix(data=mu0, nrow=nrow(hyperparams$mu0), ncol=ncol(hyperparams$mu0))
    hyperparams$nu0 <- hyperparams$mu0

    
    ## Initialize the parameters of the bmm.
    params <- init.bmm.parameters(vafs, initialClusters, hyperparams$mu0, hyperparams$alpha0, hyperparams$nu0, hyperparams$beta0, hyperparams$c0, verbose=TRUE)

    ## Perform the clustering.
    ## Start with the provided number of clusters, but prune any with low probability
    bmm.results <- bmm.filter.clusters(vafs.merged, vafs, initialClusters, params$r, params$mu, params$alpha, params$nu, params$beta, params$c, params$E.pi, hyperparams$mu0, hyperparams$alpha0, hyperparams$nu0, hyperparams$beta0, hyperparams$c0, convergence.threshold = 10^-4, max.iterations = 10000, verbose = verbose, plotIntermediateResults=plotIntermediateResults, overlap.threshold=overlap.threshold,apply.pvalue.outlier.condition=apply.pvalue.outlier.condition, outlier.pvalue.threshold=outlier.pvalue.threshold, apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition)

    if(bmm.results$retVal != 0) {
        cat("WARNING: bmm failed to converge. No clusters assigned\n")
        return(NULL);
    }

    ##get the assignment of each point to a cluster
    probs = bmm.results$r
    numPoints = length(probs[,1])
    numClusters = length(probs[1,])
    clusters = hardClusterAssignments(numPoints,1:numClusters,probs);

    ## find confidence intervals around the means of the clusters
    intervals = bmm.narrowest.mean.interval.about.centers(bmm.results$mu, bmm.results$alpha, bmm.results$nu, bmm.results$beta, 0.68)
    means = intervals$centers
    if(verbose){
      print("Cluster Centers:");
      print(means);
    }
    lower = intervals$lb
    upper = intervals$ub

    if(dim(bmm.results$outliers)[1] > 0) {
      cat("Outliers:\n")
      print(bmm.results$outliers)
    }


    ## Generate (x,y) values of the posterior predictive density
    n <- 1000

    ## Don't evaluate at x=0 or x=1, which will blow up
    x <- seq(0, 1, 1/n)[2:n]

    ## create a num_dimensions x n matrix of y values
    n=n-1;
    y <- rep.int(0, n)
    y = t(matrix(rep(y,dim(vafs)[2]),ncol=dim(vafs)[2]))

    ##for each dimension
    yms = list()
    for (k in 1:numClusters) {
      yms[[k]] <- y
    }
    for(dim in 1:dim(vafs)[2]){
        ym <- matrix(data=0, nrow=numClusters, ncol=n)
        num.iterations <- 100
        for (k in 1:numClusters) {
            for (i in 1:n) {
                ## Evaluate posterior probability at x.
                ym[k,i] <- bmm.component.posterior.predictive.density(x[i], bmm.results$mu[dim,k], bmm.results$alpha[dim,k], bmm.results$nu[dim,k], bmm.results$beta[dim,k], bmm.results$E.pi[k], num.samples = num.iterations)
                yms[[k]][dim,i] <- ym[k,i]
                y[dim,i] <- y[dim,i] + ym[k,i]
            }
        }
        ##scale yvals between 0 and 1
        #for (k in 1:numClusters) {
        #  yms[[k]][dim,] <- yms[[k]][dim,]/max(yms[[k]][dim,])
        #}
        #y[dim,] = y[dim,]/max(y[dim,])
    }

    ##scale xvals between 1 and 100
    x = x*100

    cat("Converged on the following parameters:\n")
    cat("mu:\n")
    write.table(bmm.results$mu, col.names=FALSE, row.names=FALSE, quote=FALSE)
    cat("alpha:\n")
    write.table(bmm.results$alpha, col.names=FALSE, row.names=FALSE, quote=FALSE)
    cat("nu:\n")
    write.table(bmm.results$nu, col.names=FALSE, row.names=FALSE, quote=FALSE)
    cat("beta:\n")
    write.table(bmm.results$beta, col.names=FALSE, row.names=FALSE, quote=FALSE)    
    cat("pi:\n")
    cat(paste(bmm.results$E.pi, collapse="\t"))
    cat("\n")
    
    #return a list of info
    return(list(
        cluster.assignments = clusters,
        cluster.probabilities= probs,
        cluster.means = means,
        cluster.upper = upper,
        cluster.lower = lower,
        fit.x = x,
        fit.y = y,
        individual.fits.y = yms,
        mu = bmm.results$mu,
        alpha = bmm.results$alpha,
        nu = bmm.results$nu,
        beta = bmm.results$beta,
        pi = bmm.results$E.pi,
        cluster.method = "bmm"))
}

## ----------------------------------------
## Reorder the parameters to the Binomial model based on the order specified
## in the second argument (see reorderClusters)
reorderBinomialClust <- function(clust, ord) {

  a=list()
  b=list()
  pi=list()

  for(i in 1:length(ord[,1])){
    oldnum = ord[i,1]
    a[[i]] = clust$a[oldnum,]
    b[[i]] = clust$b[oldnum,]
    pi[[i]] = clust$pi[oldnum]
  }

  for(i in 1:length(ord[,1])){
    clust$a[i,] = a[[i]]
    clust$b[i,] = b[[i]]
    clust$pi[i] = pi[[i]]
  }

  return(clust)
}

##--------------------------------------------------------------------------
## Do clustering with binomial bmm (binomial bayesian mixture model)
##
clusterWithBinomialBmm <- function(vafs.merged, vafs, vars, refs, initialClusters=10, samples=1, plotIntermediateResults=0, verbose=TRUE, overlap.threshold=0.7,apply.uncertainty.self.overlap.condition=TRUE, apply.min.items.condition=TRUE, apply.overlapping.std.dev.condition = TRUE) { 
    suppressPackageStartupMessages(library(bmm))

    initialClusters=initialClusters
    if(length(vafs[,1]) <= initialClusters){
      #print(paste("ERROR: only",length(vafs[,1])," points; reducing number of clusters from", initialClusters, "to", length(vafs[,1]),"\n"))
      #initialClusters <- length(vafs[,1])
      print(paste("ERROR: only",length(vafs[,1])," points - not enough points to cluster when using",initialClusters,"intialClusters. Provide more data or reduce your maximumClusters option"))
      return(list(NULL))      
    }

    total.trials <- vars + refs

    ## Initialize the hyperparameters of the Binomial mixture model.
    hyperparams <- init.binomial.bmm.hyperparameters(vars, total.trials, initialClusters)

    ## Initialize the parameters of the bmm.
    params <- init.binomial.bmm.parameters(vars, total.trials, initialClusters, hyperparams$a0, hyperparams$b0, hyperparams$alpha0, verbose=TRUE)

    ## Perform the clustering.
    ## Start with the provided number of clusters, but prune any with low probability
    bmm.results <- binomial.bmm.filter.clusters(vafs.merged, vafs, vars, total.trials, initialClusters, params$r, params$a, params$b, params$alpha, hyperparams$a0, hyperparams$b0, hyperparams$alpha0, convergence.threshold = 10^-4, max.iterations = 10000, verbose = verbose, plotIntermediateResults=plotIntermediateResults, overlap.threshold=overlap.threshold, apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition)
    if(bmm.results$retVal != 0) {
        cat("WARNING: bmm failed to converge. No clusters assigned\n")
        return(NULL);
    }

    ##get the assignment of each point to a cluster
    probs = bmm.results$r
    numPoints = length(probs[,1])
    numClusters = length(probs[1,])
    clusters = hardClusterAssignments(numPoints,1:numClusters,probs);

    ## find confidence intervals around the means of the clusters
    intervals = binomial.bmm.narrowest.mean.interval.about.centers(bmm.results$a, bmm.results$b, 0.68)
    means = intervals$centers
    if(verbose){
      print("Cluster Centers:");
      print(means);
    }
    lower = intervals$lb
    upper = intervals$ub

    if(dim(bmm.results$outliers)[1] > 0) {
      cat("Outliers:\n")
      print(bmm.results$outliers)
    }


    ## Generate (x,y) values of the posterior predictive density

    ## We will evaluate these densities at the median total.trials
    ## in order to uniquely define proportions from count data.
    eta <- round(median(total.trials))

    x <- seq(0, eta, 1)
    n <- length(x)

    y <- rep.int(0, n)
    y = t(matrix(rep(y,dim(vafs)[2]),ncol=dim(vafs)[2]))

    ##for each dimension
    yms = list()
    for (k in 1:numClusters) {
      yms[[k]] <- y
    }
    for(dim in 1:dim(vafs)[2]){
        ym <- matrix(data=0, nrow=numClusters, ncol=n)
        num.iterations <- 100
        for (k in 1:numClusters) {
            for (i in 1:n) {
                ## Evaluate posterior probability at x.
                ym[k,i] <- binomial.bmm.component.posterior.predictive.density(x[i], eta, bmm.results$a[k,dim], bmm.results$b[k,dim], bmm.results$E.pi[k])
                yms[[k]][dim,i] <- ym[k,i]
                y[dim,i] <- y[dim,i] + ym[k,i]
            }
        }
    }

    x = ( x / eta ) * 100

    return(list(
        cluster.assignments = clusters,
        cluster.probabilities= probs,
        cluster.means = means,
        cluster.upper = upper,
        cluster.lower = lower,
        fit.x = x,
        fit.y = y,
        individual.fits.y = yms,
        a = bmm.results$a,
        b = bmm.results$b,
        pi = bmm.results$E.pi,
        cluster.method = "binomial.bmm"))
}

## ----------------------------------------
## Reorder the parameters to the gaussian model based on the order specified
## in the second argument (see reorderClusters)
reorderGaussianClust <- function(clust, ord) {

  m=list()
  alpha=list()
  beta=list()
  nu=list()
  W=list()
  L=list()
  pi=list()

  for(i in 1:length(ord[,1])){
    oldnum = ord[i,1]
    m[[i]] = clust$m[oldnum,]
    alpha[[i]] = clust$alpha[oldnum]
    beta[[i]] = clust$beta[oldnum]
    nu[[i]] = clust$nu[oldnum]
    W[[i]] = clust$W[oldnum]
    L[[i]] = clust$L[oldnum]
    pi[[i]] = clust$pi[oldnum]
  }

  for(i in 1:length(ord[,1])){
    clust$m[i,] = m[[i]]
    clust$alpha[i] = alpha[[i]]
    clust$beta[i] = beta[[i]]
    clust$nu[i] = nu[[i]]
    clust$W[i] = W[[i]]
    clust$L[i] = L[[i]]
    clust$pi[i] = pi[[i]]
  }

  return(clust)
}

##--------------------------------------------------------------------------
## Do clustering with gaussian bmm (gaussian bayesian mixture model)
##
clusterWithGaussianBmm <- function(vafs.merged, vafs, vars, refs, initialClusters=10, samples=1, plotIntermediateResults=0, verbose=TRUE, overlap.threshold=0.7,apply.uncertainty.self.overlap.condition=TRUE, apply.min.items.condition=TRUE, apply.overlapping.std.dev.condition = TRUE, nu0 = NULL, beta0 = NULL, W0 = NULL, c0 = NULL) {
    suppressPackageStartupMessages(library(bmm))

    initialClusters=initialClusters
    if(length(vafs[,1]) <= initialClusters){
      #print(paste("ERROR: only",length(vafs[,1])," points; reducing number of clusters from", initialClusters, "to", length(vafs[,1]),"\n"))
      #initialClusters <- length(vafs[,1])
      print(paste("ERROR: only",length(vafs[,1])," points - not enough points to cluster when using",initialClusters,"intialClusters. Provide more data or reduce your maximumClusters option"))
      return(list(NULL))
    }

    total.trials <- vars + refs

    ## Initialize the hyperparameters of the gaussian mixture model.
    hyperparams <- init.gaussian.bmm.hyperparameters(vafs, initialClusters)

    if(!is.null(nu0)) {
      hyperparams$nu0 <- rep(nu0, length(hyperparams$nu0))
    }

    if(!is.null(W0)) {
      for(k in 1:initialClusters) {
        hyperparams$W0[[k]] <- matrix(data=0, nrow=nrow(hyperparams$W0[[k]]), ncol=ncol(hyperparams$W0[[k]]))
      }

      for(d in 1:nrow(hyperparams$W0[[1]])) {
        for(k in 1:initialClusters) {
          hyperparams$W0[[k]][d,d] <- W0
        }
      }
    }
  
    if(!is.null(beta0)) {
      hyperparams$beta0 <- rep(beta0, length(hyperparams$beta0))
    }

    if(!is.null(c0)) {
      hyperparams$alpha0 <- rep(c0, length(hyperparams$alpha0))
    }

    
    ## Initialize the parameters of the model.
    params <- init.gaussian.bmm.parameters(vafs, initialClusters, hyperparams$m0, hyperparams$alpha0, hyperparams$beta0, hyperparams$nu0, hyperparams$W0, verbose=TRUE)

    ## Perform the clustering.
    ## Start with the provided number of clusters, but prune any with low probability
    bmm.results <- gaussian.bmm.filter.clusters(vafs.merged, vafs, vars, total.trials, initialClusters, params$r, params$m, params$alpha, params$beta, params$nu, params$W, hyperparams$m0, hyperparams$alpha0, hyperparams$beta0, hyperparams$nu0, hyperparams$W0, convergence.threshold = 10^-4, max.iterations = 10000, verbose = verbose, plotIntermediateResults=plotIntermediateResults, overlap.threshold=overlap.threshold,apply.uncertainty.self.overlap.condition=apply.uncertainty.self.overlap.condition,apply.min.items.condition=apply.min.items.condition,apply.overlapping.std.dev.condition=apply.overlapping.std.dev.condition)
    if(bmm.results$retVal != 0) {
        cat("WARNING: bmm failed to converge. No clusters assigned\n")
        return(NULL);
    }

    ##get the assignment of each point to a cluster
    probs = bmm.results$r
    numPoints = length(probs[,1])
    numClusters = length(probs[1,])
    clusters = hardClusterAssignments(numPoints,1:numClusters,probs);

    ## find confidence intervals around the means of the clusters
    intervals = gaussian.bmm.narrowest.mean.interval.about.centers(bmm.results$m, bmm.results$alpha, bmm.results$beta, bmm.results$nu, bmm.results$W, 0.68)
    means = intervals$centers
    if(verbose){
      print("Cluster Centers:");
      print(means);
    }
    lower = intervals$lb
    upper = intervals$ub

    if(dim(bmm.results$outliers)[1] > 0) {
      cat("Outliers:\n")
      print(bmm.results$outliers)
    }


    # Only generate the fit values in 1D--that's the only
    # place we use it.  Unlike the beta and binomial approaches,
    # the dimensions are not independent here, so we would either
    # have to generate the fits in the full dimensional space
    # or we would have to integrate out all but one dimension.
    # We should be able to do that integration analytically,
    # but I'm lazy, so won't attempt it now.
    x <- NULL
    y <- NULL
    yms <- NULL

    if(dim(vafs)[2]==1) {
      ## Generate (x,y) values of the posterior predictive density
      n <- 1000

      ## Don't evaluate at x=0 or x=1, which will blow up
      x <- seq(0, 1, 1/n)[2:n]

      ## create a num_dimensions x n matrix of y values
      n=n-1;
      y <- rep.int(0, n)
      y = t(matrix(rep(y,dim(vafs)[2]),ncol=dim(vafs)[2]))

      ##for each dimension
      yms = list()
      for (k in 1:numClusters) {
        yms[[k]] <- y
      }
      for(dim in 1:dim(vafs)[2]){
          ym <- matrix(data=0, nrow=numClusters, ncol=n)
          num.iterations <- 100
          for (k in 1:numClusters) {
              for (i in 1:n) {
                  ## Evaluate posterior probability at x.
                  ym[k,i] <- gaussian.bmm.component.posterior.predictive.density(x[i], k, bmm.results$m, bmm.results$alpha, bmm.results$beta, bmm.results$nu, bmm.results$W, bmm.results$L)
                  yms[[k]][dim,i] <- ym[k,i]
                  y[dim,i] <- y[dim,i] + ym[k,i]
              }
          }
          ##scale yvals between 0 and 1
          #for (k in 1:numClusters) {
          #  yms[[k]][dim,] <- yms[[k]][dim,]/max(yms[[k]][dim,])
          #}
          #y[dim,] = y[dim,]/max(y[dim,])
      }

      ##scale xvals between 1 and 100
      x = x*100
    }

    cat("Converged on the following parameters:\n")
    cat("m:\n")
    write.table(bmm.results$m, col.names=FALSE, row.names=FALSE, quote=FALSE)
    cat("alpha:\n")
    cat(paste(paste(bmm.results$alpha, collapse="\t"), "\n", sep=""))
    cat("beta:\n")
    cat(paste(paste(bmm.results$beta, collapse="\t"), "\n", sep=""))
    cat("nu:\n")
    cat(paste(paste(bmm.results$nu, collapse="\t"), "\n", sep=""))
    cat("pi:\n")
    cat(paste(paste(bmm.results$E.pi, collapse="\t"), "\n", sep=""))
    cat("\n")
    
    
    #return a list of info
    return(list(
        cluster.assignments = clusters,
        cluster.probabilities= probs,
        cluster.means = means,
        cluster.upper = upper,
        cluster.lower = lower,
        fit.x = x,
        fit.y = y,
        individual.fits.y = yms,
        m = bmm.results$m,
        alpha = bmm.results$alpha,
        beta = bmm.results$beta,
        nu = bmm.results$nu,
        W = bmm.results$W,
        L = bmm.results$L,
        pi = bmm.results$E.pi,
        cluster.method = "gaussian.bmm"))
}

# Calculate self overlap as defined by White and Shalloway; Phys Rev E 2009
calculate.self.overlap <- function(r) {
  N.c <- dim(r)[2]
  N <- dim(r)[1]
  overlaps <- rep(0, N.c)
  ones <- rep(1, N)
  for(k in 1:N.c) {
    overlaps[k] <- sum(r[,k] * r[,k], na.rm=TRUE) / sum(ones * r[,k], na.rm=TRUE)
  }
  return(overlaps)
}

## -----------------------------------------------------
## Calculate the pvalue of a vaf v being in cluster k
bmm.calculate.pvalue <- function(vaf, k, mu, alpha, nu, beta, pi) {
  num.samples <- 1000
  num.dimensions <- dim(mu)[1]
  proportions <- matrix(data=0, nrow=num.samples, ncol=num.dimensions)
  for(m in 1:num.dimensions){
    proportions[,m] <- sample.bmm.component.proportion(mu[m,k], alpha[m,k], nu[m,k], beta[m,k], num.samples)
  }

  probabilities <- rep(1, num.samples)
  for(n in 1:num.samples) {
    for(m in 1:num.dimensions){
      probabilities[n] <- probabilities[n] * bmm.component.posterior.predictive.density(proportions[n,m], mu[m,k], alpha[m,k], nu[m,k], beta[m,k], num.samples)
    }
  }

  vaf.prob <- 1
  for(m in 1:num.dimensions){
    vaf.prob <- vaf.prob * bmm.component.posterior.predictive.density(vaf[m], mu[m,k], alpha[m,k], nu[m,k], beta[m,k], num.samples)
  }

  pvalue <- (0+length((1:length(probabilities))[probabilities <= vaf.prob])) / length(probabilities)
  if(length((1:length(probabilities))[probabilities <= vaf.prob]) == 0) {
    # Don't return a zero pvalue.  It's just artifact of the (finite)
    # sampling
    pvalue <- 1 / length(probabilities)
  }

  return(pvalue)
}

## ##--------------------------------------------------------------------------
## ## The beta distribution clustering + filtering method
## ##
bmm.filter.clusters <- function(vafs.merged, X, N.c, r, mu, alpha, nu, beta, c, E.pi, mu0, alpha0, nu0, beta0, c0,
                                convergence.threshold = 10^-4, max.iterations = 10000, verbose = 0,
                                plotIntermediateResults = 0, overlap.threshold=0.7, apply.pvalue.outlier.condition = TRUE,
                                outlier.pvalue.threshold=0.01, apply.uncertainty.self.overlap.condition = TRUE, apply.min.items.condition=TRUE, apply.overlapping.std.dev.condition = TRUE){

    total.iterations <- 0
    N <- dim(X)[1]
    num.dimensions <- dim(X)[2]

    #effective.overlap.threshold = min(overlap.threshold^(1/num.dimensions), 0.8)
    effective.overlap.threshold = (overlap.threshold^(1/num.dimensions))
    cat("Using threshold: ", effective.overlap.threshold, "\n")
    

    # If we are plotting intermediate results, keep the cluster "names"/
    # numbers the same so that the coloring stays the same across iterations
    cluster.names <- 1:N.c

    x.colnames <- colnames(X)

    outliers <- matrix(data=0, nrow=0, ncol=dim(X)[2])
    colnames(outliers) <- colnames(X)

    E.pi.prev <- rep(0, N.c)
    E.pi.prev <- E.pi

    width <- as.double(erf(1.5/sqrt(2)))
    # width <- as.double(erf(1/sqrt(2)))
    if(plotIntermediateResults > 0) {

      probs <- r
      numClusters = length(probs[1,])
      numPoints = length(probs[,1])
      clusters = hardClusterAssignments(numPoints,cluster.names,probs);

      ellipse.width <- as.double(erf(1/sqrt(2)))

      # Calculate standard error of the means
      SEM.res <- bmm.narrowest.mean.interval.about.centers(mu, alpha, nu, beta, ellipse.width)
      SEM.centers <- 100 * t(SEM.res$centers)
      SEMs.lb <- 100 * t(SEM.res$lb)
      SEMs.ub <- 100 * t(SEM.res$ub)

      # Calculate standard errors
      std.dev.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, ellipse.width)
      std.dev.centers <- 100 * t(std.dev.res$centers)
      std.dev.lb <- 100 * t(std.dev.res$lb)
      std.dev.ub <- 100 * t(std.dev.res$ub)
      ellipse.metadata <- list(SEMs.lb = SEMs.lb, SEMs.ub = SEMs.ub, std.dev.lb = std.dev.lb, std.dev.ub = std.dev.ub)

      means = SEM.res$centers

      clust <- list(
        cluster.assignments = clusters,
        cluster.probabilities= probs,
        cluster.means = means,
        cluster.upper = means,
        cluster.lower = means,
        fit.x = NULL,
        fit.y = NULL,
        individual.fits.y = NULL)
      #clust=reorderClust(clust)

      vafs.with.assignments = cbind(vafs.merged,cluster=clust$cluster.assignments)
      vafs.with.assignments = cbind(vafs.with.assignments,cluster.prob=clust$cluster.probabilities)

      outputFile <- paste("iter.", total.iterations, ".pdf", sep="")
      # Determine the names of the samples by inferring from col names.
      sampleNames <- names(vafs.merged)
      sampleNames <- sampleNames[grepl(pattern=".ref", sampleNames)]
      for(s in 1:length(sampleNames)) {
        # Strip off the ".ref"
        sampleNames[s] <- substr(sampleNames[s], 1, nchar(sampleNames[s])-4)
      }
      positionsToHighlight <- NULL
      highlightsHaveNames <- FALSE
      overlayClusters <- TRUE

      sco <- new("scObject", dimensions=num.dimensions, sampleNames=sampleNames, vafs.merged=vafs.with.assignments)
      sc.plot2d(sco, outputFile, ellipse.metadata=ellipse.metadata, positionsToHighlight=positionsToHighlight, highlightsHaveNames=highlightsHaveNames)
    }
  
    # Outer while loop: following convergence of inner loop, apply
    # overlapping cluster condition to drop any overlapping clusters.
    while(TRUE) {

        if(plotIntermediateResults > 0) {
          max.iterations <- plotIntermediateResults
        }

        bmm.res <- bmm.fixed.num.components(X, N.c, r, mu, alpha, nu, beta, c, E.pi, mu0, alpha0, nu0, beta0, c0, convergence.threshold, max.iterations, verbose)
        if((bmm.res$retVal != 0) & (plotIntermediateResults == 0)) {
            stop("Failed to converge!\n")
        }
        mu <- bmm.res$mu
        alpha <- bmm.res$alpha
        nu <- bmm.res$nu
        beta <- bmm.res$beta
        c <- bmm.res$c
        E.pi <- bmm.res$E.pi
        ubar <- bmm.res$ubar
        vbar <- bmm.res$vbar
        r <- bmm.res$r

        total.iterations <- total.iterations + bmm.res$num.iterations

        ln.rho <- bmm.res$ln.rho
        E.lnu <- bmm.res$E.lnu
        E.lnv <- bmm.res$E.lnv
        E.lnpi <- bmm.res$E.lnpi
        E.quadratic.u <- bmm.res$E.quadratic.u
        E.quadratic.v <- bmm.res$E.quadratic.v

        if(plotIntermediateResults > 0) {

          probs <- r
          numPoints = length(probs[,1])
          numClusters = length(probs[1,])
          clusters = hardClusterAssignments(numPoints,cluster.names,probs);

          # Calculate standard error of the means
          SEM.res <- bmm.narrowest.mean.interval.about.centers(mu, alpha, nu, beta, ellipse.width)
          SEM.centers <- 100 * t(SEM.res$centers)
          SEMs.lb <- 100 * t(SEM.res$lb)
          SEMs.ub <- 100 * t(SEM.res$ub)

          # Calculate standard errors
          std.dev.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, ellipse.width)
          std.dev.centers <- 100 * t(std.dev.res$centers)
          std.dev.lb <- 100 * t(std.dev.res$lb)
          std.dev.ub <- 100 * t(std.dev.res$ub)
          ellipse.metadata <- list(SEMs.lb = SEMs.lb, SEMs.ub = SEMs.ub, std.dev.lb = std.dev.lb, std.dev.ub = std.dev.ub)

          means = SEM.res$centers

          clust <- list(
            cluster.assignments = clusters,
            cluster.probabilities= probs,
            cluster.means = means,
            cluster.upper = means,
            cluster.lower = means,
            fit.x = NULL,
            fit.y = NULL,
            individual.fits.y = NULL)
          #clust=reorderClust(clust)

          #print(ellipse.metadata)

          vafs.with.assignments = cbind(vafs.merged,cluster=clust$cluster.assignments)
          vafs.with.assignments = cbind(vafs.with.assignments,cluster.prob=clust$cluster.probabilities)

          #vafs.with.assignments = cbind(vafs.merged,cluster=clusters)
          #vafs.with.assignments = cbind(vafs.with.assignments,cluster.prob=probs)
          outputFile <- paste("iter.", total.iterations, ".pdf", sep="")

          positionsToHighlight <- NULL
          highlightsHaveNames <- FALSE
          overlayClusters <- TRUE

          sco <- new("scObject", dimensions=num.dimensions, sampleNames=sampleNames, vafs.merged=vafs.with.assignments)
          sc.plot2d(sco, outputFile, ellipse.metadata=ellipse.metadata, positionsToHighlight=positionsToHighlight, highlightsHaveNames=highlightsHaveNames)
        }

        #if((bmm.res$retVal != 0) & (plotIntermediateResults > 0)) {
        #  next
        #}


        do.inner.iteration <- FALSE

        # Remove any small clusters

        #apply.min.items.condition <- TRUE        
        # Just for diagnostic purposes, do not remove small clusters
        # if we are plot intermediate iterations.  This will not
        # effect results--there will just be a lot of empty clusters.
        #if(plotIntermediateResults > 0) {
        #  apply.min.items.condition <- FALSE
        #}
        
        #apply.uncertainty.self.overlap.condition <- FALSE
        apply.large.SEM.condition <- FALSE
        apply.overlapping.SEM.condition <- FALSE
        #apply.overlapping.std.dev.condition <- TRUE

        # Changes on Mar 25, 2013
        #apply.overlapping.std.dev.condition <- FALSE
        #apply.uncertainty.self.overlap.condition <- TRUE

        apply.outlier.condition <- FALSE
        #apply.pvalue.outlier.condition <- TRUE

        indices.to.keep <- rep(TRUE, N.c)
        remove.data <- FALSE

        remove.data.from.small.clusters <- FALSE

        # NB: This cluster naming is inconsistent with the above
        # clustering naming/numbering (in hardClusterAssignments using
        # cluster.names).  That's OK.  We understand that that these clusters
        # from 1 to N.c are incides to an N.c length vector cluster.names
        # given the non-mutable names of the clusters (i.e., that are not
        # changed/rearranged when a cluster is removed).
        clusters <- hardClusterAssignments(N,1:N.c,r)

        if((apply.min.items.condition == TRUE) & (N.c > 1)) {
            threshold.pts <- max(3, ceiling(.005*N))
            threshold.pts <- 3
            
            num.items.per.cluster <- rep(0, N.c)
            for(n in 1:N) {
                num.items.per.cluster[clusters[n]] <- num.items.per.cluster[clusters[n]] + 1
            }

            indices.above.threshold <- num.items.per.cluster >= threshold.pts
            if(any(indices.above.threshold == FALSE)) {
              dropped.clusters <- (1:N.c)[!indices.above.threshold]
              expected.means <- ubar / ( ubar + vbar )
              save.verbose <- verbose
              verbose <- TRUE

              for(k in dropped.clusters) {
                indices.to.keep[k] <- FALSE
                if(verbose){
                  cat("Dropped cluster", k, "with too few variants (", num.items.per.cluster[k], ") center: ")
                  for(n in 1:length(expected.means[,k])) {
                    cat(expected.means[n,k])
                    if(length(expected.means[,k]) > 1) { cat(", ") }
                  }
                  cat("\n")
                }
                break
              }
              verbose <- save.verbose
              
            }

            if(remove.data.from.small.clusters == TRUE) {
              # If we are remoing any small clusters ...
              if(any(indices.to.keep==FALSE)) {
                # Calculate the self-overlaps of all clusters
                overlaps <- calculate.self.overlap(r)
                # If any of the clusters to be removed have high self-overlap
                # (i.e., have items highly assigned to them) ...
                if(any(!is.na(overlaps[!indices.to.keep]) & (overlaps[!indices.to.keep] > 0.99))) {
                  # Drop the items as well as the clusters ...
                  remove.data <- TRUE
                  # But drop items (and clusters) for small clusters with highly
                  # assigned items.  We can drop other small clusters (with
                  # weakly assigned) later if they remain small.
                  indices.to.keep <- indices.to.keep | ( is.na(overlaps) | ( overlaps < 0.99 ) )
                }
              }
            }
        } # End apply.min.items.condition

        if(all(indices.to.keep==TRUE) & (apply.outlier.condition == TRUE)) {
          # Discard any points that are >= 3 std devs away from mean
          ellipse.width <- as.double(erf(1.5/sqrt(2)))

          # Calculate standard errors
          ellipse.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, ellipse.width)
          ellipse.centers <- t(ellipse.res$centers)
          ellipse.lb <- t(ellipse.res$lb)
          ellipse.ub <- t(ellipse.res$ub)

          removed.pt <- FALSE
          for(k in 1:N.c) {
            # Get all points belonging to cluster k
            if(verbose){
              cat("Cluster", k,"\n")
              print(ellipse.lb[k,])
              print(ellipse.ub[k,])
            }
            indices <- (1:N)[clusters==k]
            remove.i <- TRUE
            #remove.i <- FALSE
            for(i in indices) {
              for(m in 1:num.dimensions) {
                lower <- ellipse.lb[k,m]
                upper <- ellipse.ub[k,m]
                # This commented out code requires all dimenions to
                # be within the limits (to avoid being called an outlier)
                if(!(X[i,m] < lower) & !(X[i,m] > upper)) {
                  remove.i <- FALSE
                  break
                }
                # This code requires only one dimension to be within
                # the limits (to avoid being an outlier)
                if((X[i,m] < lower) | (X[i,m] > upper)) {
                  if(verbose){
                    cat("Point ")
                    for(n in 1:num.dimensions) {
                      cat(X[i,n], " ")
                    }
                    cat("is outside of range.\n")
                    print(ellipse.lb[k,])
                    print(ellipse.ub[k,])
                  }
                  removed.pt <- TRUE
                  do.inner.iteration <- TRUE

                  outliers <- rbind(outliers, X[i,,drop=F])
                  # To remove data from the data set, set its entries to NA
                  X[i,] <- NA
                  break
                }
              }
              if(remove.i) {
                removed.pt <- TRUE
                do.inner.iteration <- TRUE

                if(verbose){
                  cat("Point ")
                  for(n in 1:num.dimensions) {
                    cat(X[i,n], " ")
                  }
                  cat("is outside of range.\n")                  
                  print(ellipse.lb[k,])
                  print(ellipse.ub[k,])
                }

                outliers <- rbind(outliers, X[i,,drop=F])
                # To remove data from the data set, set its entries to NA
                X[i,] <- NA
              }
            }
          }
          if(removed.pt == TRUE) {
            next
          }
        }

        if(all(indices.to.keep==TRUE) & (apply.pvalue.outlier.condition == TRUE)) {
          # Remove any points with a p-value < 10^-2.  For efficiency,
          # only do the detailed computation for those points having
          # all dimensions outside 1 std.
          ellipse.width <- as.double(erf(0.75/sqrt(2)))

          # Calculate standard errors
          ellipse.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, ellipse.width)
          ellipse.centers <- t(ellipse.res$centers)
          ellipse.lb <- t(ellipse.res$lb)
          ellipse.ub <- t(ellipse.res$ub)

          removed.pt <- FALSE

          for(k in 1:N.c) {
            # Get all points belonging to cluster k
            scrutinize.i <- TRUE
            indices <- (1:N)[clusters==k]
            for(i in indices) {
              for(m in 1:num.dimensions) {
                lower <- ellipse.lb[k,m]
                upper <- ellipse.ub[k,m]
                # This commented out code requires all dimenions to
                # be within the limits (to avoid subsequent tests as
                # a potential outlier)
                if(!(X[i,m] < lower) & !(X[i,m] > upper)) {
                  scrutinize.i <- FALSE
                  break
                }
              }
              if(scrutinize.i) {

                num.samples <- 1000
                proportions <- matrix(data=0, nrow=num.samples, ncol=num.dimensions)
                for(m in 1:num.dimensions){
                  proportions[,m] <- sample.bmm.component.proportion(mu[m,k], alpha[m,k], nu[m,k], beta[m,k], num.samples)
                }

                probabilities <- rep(1, num.samples)
                for(n in 1:num.samples) {
                  for(m in 1:num.dimensions){
                    probabilities[n] <- probabilities[n] * bmm.component.posterior.predictive.density(proportions[n,m], mu[m,k], alpha[m,k], nu[m,k], beta[m,k], num.samples)
                  }
                }

                i.prob <- 1
                for(m in 1:num.dimensions){
                  i.prob <- i.prob * bmm.component.posterior.predictive.density(X[i,m], mu[m,k], alpha[m,k], nu[m,k], beta[m,k], num.samples)
                }


                pvalue <- length((1:length(probabilities))[probabilities <= i.prob]) / length(probabilities)

                if(verbose){
                  cat("Point (pvalue = ", pvalue, ") ", sep="")
                  for(n in 1:num.dimensions) {
                    cat(X[i,n], " ")
                  }
                  cat("\n")
                }

                if(pvalue < outlier.pvalue.threshold ) {
                  removed.pt <- TRUE
                  do.inner.iteration <- TRUE
                  if(verbose){
                    cat("Point ")
                    for(n in 1:num.dimensions) {
                      cat(X[i,n], " ")
                    }
                    cat(" has small pvalue = ", pvalue, "\n")
                  }
                  outliers <- rbind(outliers, X[i,,drop=F])
                  # To remove data from the data set, set its entries to NA
                  X[i,] <- NA
                }
              }
            }
          }
          if(removed.pt == TRUE) {
            next
          }
        }

        if(all(indices.to.keep==TRUE) & (apply.uncertainty.self.overlap.condition == TRUE) & (N.c > 1)) {

            # Just drop min overlap
            overlaps <- calculate.self.overlap(r)

            if(min(overlaps, na.rm=TRUE) < effective.overlap.threshold) {
                for(k in 1:N.c) {
                    # Small clusters will have undefined overlaps, just skip.
                    # We'll remove them later.
                    if(is.nan(overlaps[k])) { next }
                    if((overlaps[k] < effective.overlap.threshold) & (overlaps[k] == min(overlaps, na.rm=TRUE))) {
                        indices.to.keep[k] <- FALSE
                        if(verbose){
                          cat(sprintf("Condition (%dD): Remove cluster %d pi = %.3f self-overlap = %.3f", num.dimensions, k, E.pi[k], overlaps[k]))
                        }
                        expected.means <- ubar / ( ubar + vbar )
                        if(verbose){
                          cat(" center: ")
                          for(n in 1:length(expected.means[,k])) {
                            cat(expected.means[n,k])
                            if(length(expected.means[,k]) > 1) { cat(", ") }
                          }
                          cat("\n")
                        }
                        break
                    }
                }
            }

            if(verbose){
              for(k in 1:N.c) {
                cat(sprintf("Cluster %d pi = %.3f self-overlap = %.3f\n", k, E.pi[k], overlaps[k]))
              }
            }

        } # End apply.uncertainty.self.overlap.condition

        if(all(indices.to.keep==TRUE) &(apply.large.SEM.condition == TRUE) & (N.c > 1)) {

            # Calculate standard error of the means
            SEM.res <- bmm.narrowest.mean.interval.about.centers(mu, alpha, nu, beta, width)
            SEM.centers <- t(SEM.res$centers)
            SEMs.lb <- t(SEM.res$lb)
            SEMs.ub <- t(SEM.res$ub)

            # Calculate standard errors
            std.dev.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, width)
            std.dev.centers <- t(std.dev.res$centers)
            std.dev.lb <- t(std.dev.res$lb)
            std.dev.ub <- t(std.dev.res$ub)

            for(k in 1:N.c) {
                if (verbose) {
                    cat(sprintf("%dD: Cluster %d pi = %.3f: ", num.dimensions, k, E.pi[k]))
                }
                greater.than.30 <- TRUE
                greater.than.02 <- TRUE
                SEM.width.threshold <- 0.02
                #SEM.width.threshold <- 0.001
                for(m in 1:num.dimensions) {
                    center <- SEM.centers[k,m]
                    lower <- SEMs.lb[k,m]
                    upper <- SEMs.ub[k,m]
                    std.dev.width <- (std.dev.ub[k,m] - std.dev.lb[k,m])
                    SEM.width <- (SEMs.ub[k,m] - SEMs.lb[k,m])
                    #if((SEM.width/std.dev.width)>.3){ greater.than.30 <- TRUE }
                    #if(SEM.width>.02){ greater.than.02 <- TRUE }
                    if((SEM.width/std.dev.width)<=.3){ greater.than.30 <- FALSE }
                    if(SEM.width<=SEM.width.threshold){ greater.than.02 <- FALSE }
                    if (verbose) {
                        cat(sprintf("%.3f (%.3f, %.3f) [(%.3f) %.3f, %.3f] {%.3f} ", center, lower, upper, width, std.dev.lb[k,m], std.dev.ub[k,m], SEM.width/std.dev.width))
                        if(greater.than.30) { cat("* ") }
                        if(greater.than.02) { cat("**") }
                    }
                }
                if (verbose) {
                    cat("\n")
                }
                if(greater.than.30 & greater.than.02) { indices.to.keep[k] <- FALSE }
            }

            if ( any(indices.to.keep==FALSE) ) {
                remove.data <- TRUE
            }

        } # End apply.large.SEM.condition

        if(all(indices.to.keep==TRUE) & (apply.overlapping.SEM.condition == TRUE) & (N.c > 1)) {

            # Calculate standard error of the means
            SEM.res <- bmm.narrowest.mean.interval.about.centers(mu, alpha, nu, beta, width)
            SEM.centers <- t(SEM.res$centers)
            SEMs.lb <- t(SEM.res$lb)
            SEMs.ub <- t(SEM.res$ub)

            # Calculate standard errors
            std.dev.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, width)
            std.dev.centers <- t(std.dev.res$centers)
            std.dev.lb <- t(std.dev.res$lb)
            std.dev.ub <- t(std.dev.res$ub)

            # Determine if component i's center is contained within
            # component i2's std.dev
            pi.threshold = 10^-2

            for(i in 1:N.c){
                i.subsumed.by.another.cluster <- FALSE
                for(i2 in 1:N.c){
                    if(i == i2) { next }
                    if(indices.to.keep[i2] == FALSE) { next }
                    if(E.pi[i2] < pi.threshold) { next }
                    i.subsumed.by.another.cluster <- TRUE
                    for(l in 1:num.dimensions){
                        i.center <- std.dev.centers[i,l]
                        if((i.center < std.dev.lb[i2,l]) | (i.center > std.dev.ub[i2,l])) {
                            i.subsumed.by.another.cluster <- FALSE
                            break
                        }
                    }
                    if(i.subsumed.by.another.cluster==TRUE) {
                      if(verbose){
                        cat(sprintf("2. Dropping cluster with center: "))
                        for(l in 1:num.dimensions){
                          cat(sprintf("%.3f ", std.dev.centers[i,l]))
                        }
                        cat(sprintf("because it overlaps with: "))
                        for(l in 1:num.dimensions){
                          cat(sprintf("(%.3f, %.3f) ", std.dev.lb[i2,l], std.dev.ub[i2,l]))
                        }
                        cat("\n")
                        break
                      }
                    }
                }
                if(i.subsumed.by.another.cluster==TRUE) {
                    indices.to.keep[i] <- FALSE
                }
            }
        } # End apply.overlapping.SEM.condition

        if(all(indices.to.keep==TRUE) & (apply.overlapping.std.dev.condition == TRUE) & (N.c > 1)) {

            # Calculate standard error of the means
            SEM.res <- bmm.narrowest.mean.interval.about.centers(mu, alpha, nu, beta, width)
            SEM.centers <- t(SEM.res$centers)
            SEMs.lb <- t(SEM.res$lb)
            SEMs.ub <- t(SEM.res$ub)

            # Calculate standard errors
            std.dev.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, width)
            std.dev.centers <- t(std.dev.res$centers)
            std.dev.lb <- t(std.dev.res$lb)
            std.dev.ub <- t(std.dev.res$ub)

            # Determine how much component i's std dev overlaps component i2's std dev
            overlaps <- matrix(data = 1, nrow=N.c, ncol=N.c)
            std.dev.overlap.threshold <- 0
            for(i in 1:N.c){
                #cat("Cluster i = ", i, " overlaps: \n")
                for(i2 in 1:N.c){
                  if(verbose){
                    cat("   ")
                  }
                    for(l in 1:num.dimensions){
                        fraction <- 0
                        if((std.dev.lb[i,l] < std.dev.ub[i2,l]) & (std.dev.ub[i,l] > std.dev.lb[i2,l])) {
                            overlap <- min(std.dev.ub[i,l], std.dev.ub[i2,l]) - max(std.dev.lb[i,l], std.dev.lb[i2,l])
                            fraction <- overlap / ( std.dev.ub[i,l] - std.dev.lb[i,l] )
                        }
                        if((fraction > std.dev.overlap.threshold) & (overlaps[i,i2] != 0)) {
                            overlaps[i,i2] <- fraction
                        } else {
                            overlaps[i,i2] <- 0
                        }
                        #cat(sprintf("f = %.3f ", fraction))
                    }
                    #cat("\n")
                }
            }

            # Remove one of two overlapping clusters.  If both overlap
            # (NB: above is not symmetric), then only remove smaller of two.
            save.verbose <- verbose
            verbose <- TRUE
            for(i in 2:N.c){
                if(indices.to.keep[i] == FALSE) { next }
                for(i2 in 1:(i-1)){
                    if(indices.to.keep[i2] == FALSE) { next }
                    if(overlaps[i,i2] > 0) {
                        if((overlaps[i2,i] > 0) & (E.pi[i2] < E.pi[i])) {
                          if(verbose){cat("Condition (", num.dimensions, "D): Removing ", i2, " because of overlap (", overlaps[i2,i], ") with i = ", i, "\n")}
                            indices.to.keep[i2] <- FALSE
                        } else {
                            if(verbose){cat("Condition (", num.dimensions, "D): Removing ", i, " because of overlap (", overlaps[i2,i], ") with i2 = ", i2, "\n")}
                            indices.to.keep[i] <- FALSE
                        }
                    }
                }
            }
            verbose <- save.verbose

        } # End apply.overlapping.std.dev.condition


        if ( any(indices.to.keep==FALSE) ) {

          do.inner.iteration <- TRUE

          numeric.indices <- (1:N.c)

          cluster.names <- cluster.names[indices.to.keep]

          E.pi <- E.pi[indices.to.keep]
          E.lnpi <- E.lnpi[indices.to.keep]

          if (remove.data == TRUE) {
            cluster.indices.to.keep <- (1:N.c)[indices.to.keep]
            # clusters == 0 -> outlier already                            
            new.outliers <- matrix(X[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),], ncol=num.dimensions)
            outliers <- rbind(outliers, new.outliers)
            # To remove data from the data set, set its entries to NA
            X[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),] <- NA
          }

          colnames(X) <- x.colnames

          N.c <- length(E.pi)
          E.pi.prev <- E.pi.prev[indices.to.keep]
          c <- c[indices.to.keep]
          c0 <- c0[indices.to.keep]

          # Don't resize r and ln.rho matrices to accomodate removed
          # outliers, instead we will have set their rows to NA above.
          # r <- matrix(r[clusters %in% numeric.indices,indices.to.keep], nrow=N, ncol=N.c)
          # ln.rho <- matrix(ln.rho[clusters %in% numeric.indices,indices.to.keep], nrow=N, ncol=N.c)
          # But do drop any columns corresponding to dropped clusters
          r <- matrix(r[,indices.to.keep], nrow=N, ncol=N.c)
          ln.rho <- matrix(ln.rho[,indices.to.keep], nrow=N, ncol=N.c)
          # Need to renormalize r--do it gently.
          for(n in 1:N) {
            if(any(is.na(ln.rho[n,]))) {
              r[n,] <- rep(NA, N.c)
              next
            }

            row.sum <- log(sum(exp(ln.rho[n,] - max(ln.rho[n,])))) + max(ln.rho[n,])
            for(k in 1:N.c) { r[n,k] = exp(ln.rho[n,k] - row.sum) }
          }

          mu <- matrix(mu[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          nu <- matrix(nu[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          mu0 <- matrix(mu0[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          nu0 <- matrix(nu0[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          alpha <- matrix(alpha[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          beta <- matrix(beta[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          alpha0 <- matrix(alpha0[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          beta0 <- matrix(beta0[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          ubar <- matrix(ubar[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          vbar <- matrix(vbar[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          E.pi.prev <- E.pi

          E.lnpi <- E.lnpi[indices.to.keep]
          E.lnu <- E.lnu[indices.to.keep]
          E.lnv <- E.lnv[indices.to.keep]
          E.quadratic.u <- matrix(E.quadratic.u[,indices.to.keep], nrow=num.dimensions, ncol=N.c)
          E.quadratic.v <- matrix(E.quadratic.v[,indices.to.keep], nrow=num.dimensions, ncol=N.c)

        }
        if(do.inner.iteration == FALSE) { break }

    } # End outer while(TRUE)

    # Calculate standard error of the means
    SEM.res <- bmm.narrowest.mean.interval.about.centers(mu, alpha, nu, beta, width)
    SEM.centers <- t(SEM.res$centers)
    SEMs.lb <- t(SEM.res$lb)
    SEMs.ub <- t(SEM.res$ub)

    # Calculate standard errors
    std.dev.res <- bmm.narrowest.proportion.interval.about.centers(mu, alpha, nu, beta, width)
    std.dev.centers <- t(std.dev.res$centers)
    std.dev.lb <- t(std.dev.res$lb)
    std.dev.ub <- t(std.dev.res$ub)

    save.verbose <- verbose
    verbose <- TRUE
    for(k in 1:N.c) {
      if(verbose){cat(sprintf("Cluster %d pi = %.3f center =", k, E.pi[k]))}

        for(d in 1:num.dimensions) {
            center <- SEM.centers[k,d]
            if(verbose){cat(sprintf(" %.3f", center))}
        }
        if(verbose){cat(" SEM =")}
        for(d in 1:num.dimensions) {
            lb <- SEMs.lb[k,d]
            ub <- SEMs.ub[k,d]
            if(verbose){cat(sprintf(" (%.3f, %.3f)", lb, ub))}
        }
        if(verbose){cat(" sd =")}
        for(d in 1:num.dimensions) {
            lb <- std.dev.lb[k,d]
            ub <- std.dev.ub[k,d]
            if(verbose){cat(sprintf(" (%.3f, %.3f)", lb, ub))}
        }
        if(verbose){cat("\n")}
    }
    verbose <- save.verbose
    
    if(verbose){cat(sprintf('total iterations = %d\n', total.iterations))}

    retList <- list("retVal" = 0, "mu" = mu, "alpha" = alpha, "nu" = nu, "beta" = beta, "c" = c, "r" = r, "num.iterations" = total.iterations, "ln.rho" = ln.rho, "E.lnu" = E.lnu, "E.lnv" = E.lnv, "E.lnpi" = E.lnpi, "E.pi" = E.pi, "E.quadratic.u" = E.quadratic.u, "E.quadratic.v" = E.quadratic.v, "ubar" = ubar, "vbar" = vbar, "outliers" = outliers)

    return(retList)

} # End bmm.filter.clusters


## ##--------------------------------------------------------------------------
## ## The binomial distribution clustering + filtering method
## ##
binomial.bmm.filter.clusters <- function(vafs.merged, vafs, successes, total.trials, N.c, r, a, b, alpha, a0, b0, alpha0,
                                convergence.threshold = 10^-4, max.iterations = 10000, verbose = 0,
                                plotIntermediateResults = 0, overlap.threshold=0.7, apply.uncertainty.self.overlap.condition = TRUE, apply.min.items.condition = TRUE, apply.overlapping.std.dev.condition = TRUE) {


  total.iterations <- 0
  num.dimensions <- dim(successes)[2]

  #effective.overlap.threshold = min(overlap.threshold^(1/num.dimensions), 0.8)
  effective.overlap.threshold = (overlap.threshold^(1/num.dimensions))
  cat("Using threshold: ", effective.overlap.threshold, "\n")
  
  N <- dim(successes)[1]

  successes.colnames <- colnames(successes)
  total.colnames <- colnames(total.trials)

  outliers <- matrix(data=0, nrow=0, ncol=num.dimensions)
  colnames(outliers) <- colnames(vafs)

  E.pi.prev <- rep(0, N.c)

  width <- as.double(erf(1.5/sqrt(2)))

  while(TRUE) {

    if(verbose){
      print(r)
    }

    bmm.res <- binomial.bmm.fixed.num.components(successes, total.trials, N.c, r, a, b, alpha, a0, b0, alpha0, convergence.threshold, max.iterations = 10000, verbose = verbose)
    if(bmm.res$retVal != 0) {
      stop("Failed to converge!\n")
    }

    a <- bmm.res$a
    b <- bmm.res$b
    alpha <- bmm.res$alpha

    E.pi <- bmm.res$E.pi
    r <- bmm.res$r

    total.iterations <- total.iterations + bmm.res$num.iterations

    ln.rho <- bmm.res$ln.rho
    E.lnpi <- bmm.res$E.lnpi

    do.inner.iteration <- FALSE

    #apply.min.items.condition <- TRUE
    #apply.uncertainty.self.overlap.condition <- FALSE
    apply.large.SEM.condition <- FALSE
    apply.overlapping.SEM.condition <- FALSE
    #apply.overlapping.std.dev.condition <- TRUE

    # Changes on Mar 25, 2013
    #apply.overlapping.std.dev.condition <- FALSE
    #apply.uncertainty.self.overlap.condition <- TRUE

    # remove.data = TRUE iff we should remove points assigned to clusters
    # that we remove
    remove.data <- FALSE

    remove.data.from.small.clusters <- FALSE

    clusters <- hardClusterAssignments(N,1:N.c,r)

    indices.to.keep <- rep(TRUE, N.c)
    if((apply.min.items.condition == TRUE) & (N.c > 1)) {
      threshold.pts <- max(3, ceiling(.005*N))

      num.items.per.cluster <- rep(0, N.c)
      for(n in 1:N) {
        num.items.per.cluster[clusters[n]] <- num.items.per.cluster[clusters[n]] + 1
      }

      indices.to.keep <- num.items.per.cluster >= threshold.pts

      if(remove.data.from.small.clusters == TRUE) {
        # If we are remoing any small clusters ...
        if(any(indices.to.keep==FALSE)) {
          # Calculate the self-overlaps of all clusters
          overlaps <- calculate.self.overlap(r)
          # If any of the clusters to be removed have high self-overlap
          # (i.e., have items highly assigned to them) ...
          if(any(!is.na(overlaps[!indices.to.keep]) & (overlaps[!indices.to.keep] > 0.99))) {
            # Drop the items as well as the clusters ...
            remove.data <- TRUE
            # But drop items (and clusters) for small clusters with highly
            # assigned items.  We can drop other small clusters (with
            # weakly assigned) later if they remain small.
            indices.to.keep <- indices.to.keep | ( is.na(overlaps) | ( overlaps < 0.99 ) )
          }
        }
      }

      # indices.to.keep <- E.pi > pi.threshold
    } # End apply.min.items.condition

    if(all(indices.to.keep==TRUE) & (apply.uncertainty.self.overlap.condition == TRUE) & (N.c > 1)) {

      # Just drop min overlap
      overlaps <- calculate.self.overlap(r)

      if(min(overlaps, na.rm=TRUE) < effective.overlap.threshold) {
          for(k in 1:N.c) {
              # Small clusters will have undefined overlaps, just skip.
              # We'll remove them later.
              if(is.nan(overlaps[k])) { next }
              if((overlaps[k] < effective.overlap.threshold) & (overlaps[k] == min(overlaps, na.rm=TRUE))) {
                  indices.to.keep[k] <- FALSE
                  if(verbose){
                    cat(sprintf("Condition (%dD): Remove cluster %d pi = %.3f self-overlap = %.3f", num.dimensions, k, E.pi[k], overlaps[k]))
                    cat("\n")
                  }
                  break
              }
          }
      }

      if(verbose){
        for(k in 1:N.c) {
          cat(sprintf("Cluster %d pi = %.3f self-overlap = %.3f\n", k, E.pi[k], overlaps[k]))
        }

        means <- matrix(a/(a+b), nrow=num.dimensions, ncol=N.c)
        indices.to.drop <- (1:N.c)[!indices.to.keep]
        for(i in 1:length(indices.to.drop)) {
          index <- indices.to.drop[i]
          if(verbose){
            cat("Self-overlap condition: Dropping cluster with center: ")
            for(l in 1:num.dimensions) {
              cat(sprintf("%.3f ", means[l, index]))
            }
            cat("\n")
          }
        }
      }

    } # End apply.uncertainty.self.overlap.condition

    # Calculate standard error of the means
    SEM.res <- binomial.bmm.narrowest.mean.interval.about.centers(a, b, width)
    SEM.centers <- t(SEM.res$centers)
    SEMs.lb <- t(SEM.res$lb)
    SEMs.ub <- t(SEM.res$ub)

    if(all(indices.to.keep==TRUE) & (apply.large.SEM.condition == TRUE) & (N.c > 1)) {
      stop("Not implemented because std dev not implemented for binomial!\n")
    } # End apply.large.SEM.condition

    if(all(indices.to.keep==TRUE) & (apply.overlapping.SEM.condition == TRUE) & (N.c > 1)) {
      stop("Not implemented because std dev not implemented for binomial!\n")
    } # End apply.overlapping.SEM.condition

    if(all(indices.to.keep==TRUE) & (apply.overlapping.std.dev.condition == TRUE) & (N.c > 1)) {
      stop("Not implemented because std dev not implemented for binomial!\n")
    } # End apply.overlapping.std.dev.condition



    if ( any(indices.to.keep==FALSE) ) {

      do.inner.iteration <- TRUE

      numeric.indices <- (1:N.c)

      E.pi <- E.pi[indices.to.keep]
      E.lnpi <- E.lnpi[indices.to.keep]

      if(remove.data == TRUE) {
        cluster.indices.to.keep <- (1:N.c)[indices.to.keep]
        # clusters == 0 -> outlier already        
        new.outliers <- matrix(vafs[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),], ncol=num.dimensions)
        outliers <- rbind(outliers, new.outliers)
        # To remove data from the data set, set its entries to NA
        vafs[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),] <- NA
        successes[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),] <- NA
        total.trials[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),] <- NA
      }

      N.c <- length(E.pi)

      r <- matrix(r[,indices.to.keep], nrow=N, ncol=N.c)
      ln.rho <- matrix(ln.rho[,indices.to.keep], nrow=N, ncol=N.c)
      # Need to renormalize r--do it gently.
      for(n in 1:N) {
         if(any(is.na(ln.rho[n,]))) {
           r[n,] <- rep(NA, N.c)
           next
         }

         row.sum <- log(sum(exp(ln.rho[n,] - max(ln.rho[n,])))) + max(ln.rho[n,])
         for(k in 1:N.c) { r[n,k] = exp(ln.rho[n,k] - row.sum) }
      }
      a <- matrix(a[indices.to.keep,], nrow=N.c, ncol=num.dimensions)
      b <- matrix(b[indices.to.keep,], nrow=N.c, ncol=num.dimensions)
      alpha <- alpha[indices.to.keep,drop=FALSE]
      a0 <- matrix(a0[indices.to.keep,], nrow=N.c, ncol=num.dimensions)
      b0 <- matrix(b0[indices.to.keep,], nrow=N.c, ncol=num.dimensions)
      alpha0 <- alpha0[indices.to.keep,drop=FALSE]

      E.pi.prev <- E.pi
    }
    if(do.inner.iteration == FALSE) { break }

  }

  retList <- list("retVal" = 0, "a" = a, "b" = b, "alpha" = alpha, "r" = r, "num.iterations" = total.iterations, "ln.rho" = ln.rho, "E.lnpi" = E.lnpi, "E.pi" = E.pi, "outliers" = outliers)

  return(retList)

} # End binomial.bmm.filter.clusters


## ##--------------------------------------------------------------------------
## ## The binomial distribution clustering + filtering method
## ##
gaussian.bmm.filter.clusters <- function(vafs.merged, vafs, successes, total.trials, N.c, r, m, alpha, beta, nu, W, m0, alpha0, beta0, nu0, W0,
                                convergence.threshold = 10^-4, max.iterations = 10000, verbose = 0,
                                plotIntermediateResults = 0, overlap.threshold=0.7, apply.uncertainty.self.overlap.condition = TRUE, apply.min.items.condition = TRUE, apply.overlapping.std.dev.condition = TRUE) {

  total.iterations <- 0
  num.dimensions <- dim(successes)[2]

  #overlap.threshold = overlap.threshold.1d^(1/num.dimensions)
  #effective.overlap.threshold = min(overlap.threshold^(1/num.dimensions), 0.8)
  effective.overlap.threshold = (overlap.threshold^(1/num.dimensions))
  cat("Using threshold: ", effective.overlap.threshold, "\n")
  
  N <- dim(successes)[1]

  successes.colnames <- colnames(successes)
  total.colnames <- colnames(total.trials)

  outliers <- matrix(data=0, nrow=0, ncol=num.dimensions)
  colnames(outliers) <- colnames(vafs)

  E.pi.prev <- rep(0, N.c)

  width <- as.double(erf(1/sqrt(2)))

  while(TRUE) {

    if(verbose){
      print(r)
    }

    bmm.res <- gaussian.bmm.fixed.num.components(vafs, N.c, r, m, alpha, beta, nu, W, m0, alpha0, beta0, nu0, W0, convergence.threshold, max.iterations, verbose)
    if(bmm.res$retVal != 0) {
      stop("Failed to converge!\n")
    }

    m <- bmm.res$m
    alpha <- bmm.res$alpha
    beta <- bmm.res$beta
    nu <- bmm.res$nu
    W <- bmm.res$W

    E.pi <- bmm.res$E.pi
    r <- bmm.res$r

    total.iterations <- total.iterations + bmm.res$num.iterations

    ln.rho <- bmm.res$ln.rho
    E.lnpi <- bmm.res$E.lnpi

    do.inner.iteration <- FALSE

    #apply.min.items.condition <- TRUE
    #apply.uncertainty.self.overlap.condition <- FALSE
    apply.large.SEM.condition <- FALSE
    apply.overlapping.SEM.condition <- FALSE
    #apply.overlapping.std.dev.condition <- TRUE

    # Changes on Mar 25, 2013
    #apply.overlapping.std.dev.condition <- FALSE
    #apply.uncertainty.self.overlap.condition <- TRUE

    # remove.data = TRUE iff we should remove points assigned to clusters
    # that we remove
    remove.data <- FALSE

    remove.data.from.small.clusters <- FALSE

    clusters <- hardClusterAssignments(N,1:N.c,r)

    indices.to.keep <- rep(TRUE, N.c)

    if((apply.min.items.condition == TRUE) & (N.c > 1)) {
      threshold.pts <- max(3, ceiling(.005*N))

      num.items.per.cluster <- rep(0, N.c)
      for(n in 1:N) {
        num.items.per.cluster[clusters[n]] <- num.items.per.cluster[clusters[n]] + 1
      }

      indices.to.keep <- num.items.per.cluster >= threshold.pts

      if(remove.data.from.small.clusters == TRUE) {
        # If we are remoing any small clusters ...
        if(any(indices.to.keep==FALSE)) {
          # Calculate the self-overlaps of all clusters
          overlaps <- calculate.self.overlap(r)
          # If any of the clusters to be removed have high self-overlap
          # (i.e., have items highly assigned to them) ...
          if(any(!is.na(overlaps[!indices.to.keep]) & (overlaps[!indices.to.keep] > 0.99))) {
            # Drop the items as well as the clusters ...
            remove.data <- TRUE
            # But drop items (and clusters) for small clusters with highly
            # assigned items.  We can drop other small clusters (with
            # weakly assigned) later if they remain small.
            indices.to.keep <- indices.to.keep | ( is.na(overlaps) | ( overlaps < 0.99 ) )
          }
        }
      }

      # indices.to.keep <- E.pi > pi.threshold
    } # End apply.min.items.condition

    if(all(indices.to.keep==TRUE) & (apply.uncertainty.self.overlap.condition == TRUE) & (N.c > 1)) {

      # Just drop min overlap
      overlaps <- calculate.self.overlap(r)

      if(min(overlaps, na.rm=TRUE) < effective.overlap.threshold) {
          for(k in 1:N.c) {
              # Small clusters will have undefined overlaps, just skip.
              # We'll remove them later.
              if(is.nan(overlaps[k])) { next }
              if((overlaps[k] < effective.overlap.threshold) & (overlaps[k] == min(overlaps, na.rm=TRUE))) {
                  indices.to.keep[k] <- FALSE
                  if(verbose){
                    cat(sprintf("Condition (%dD): Remove cluster %d pi = %.3f self-overlap = %.3f", num.dimensions, k, E.pi[k], overlaps[k]))
                    cat("\n")
                  }
                  break
              }
          }
      }

      if(verbose){
        for(k in 1:N.c) {
          cat(sprintf("Cluster %d pi = %.3f self-overlap = %.3f\n", k, E.pi[k], overlaps[k]))
        }

        means <- t(m)
        indices.to.drop <- (1:N.c)[!indices.to.keep]
        for(i in 1:length(indices.to.drop)) {
          index <- indices.to.drop[i]
          if(verbose){
            cat("Self-overlap condition: Dropping cluster with center: ")
            for(l in 1:num.dimensions) {
              cat(sprintf("%.3f ", means[l, index]))
            }
            cat("\n")
          }
        }
      }

    } # End apply.uncertainty.self.overlap.condition

    # Calculate standard error of the means
    SEM.res <- gaussian.bmm.narrowest.mean.interval.about.centers(m, alpha, beta, nu, W, width)
    SEM.centers <- t(SEM.res$centers)
    SEMs.lb <- t(SEM.res$lb)
    SEMs.ub <- t(SEM.res$ub)

    # Calculate standard errors
    std.dev.res <- gaussian.bmm.narrowest.proportion.interval.about.centers(m, alpha, beta, nu, W, width)
    std.dev.centers <- t(std.dev.res$centers)
    std.dev.lb <- t(std.dev.res$lb)
    std.dev.ub <- t(std.dev.res$ub)

    if(all(indices.to.keep==TRUE) & (apply.large.SEM.condition == TRUE) & (N.c > 1)) {

      for(k in 1:N.c) {
        if (verbose) {
          cat(sprintf("%dD: Cluster %d pi = %.3f: ", num.dimensions, k, E.pi[k]))
        }
        greater.than.30 <- TRUE
        greater.than.02 <- TRUE
        SEM.width.threshold <- 0.02
        #SEM.width.threshold <- 0.001
        for(m in 1:num.dimensions) {
          center <- SEM.centers[k,m]
          lower <- SEMs.lb[k,m]
          upper <- SEMs.ub[k,m]
          std.dev.width <- (std.dev.ub[k,m] - std.dev.lb[k,m])
          SEM.width <- (SEMs.ub[k,m] - SEMs.lb[k,m])
          #if((SEM.width/std.dev.width)>.3){ greater.than.30 <- TRUE }
          #if(SEM.width>.02){ greater.than.02 <- TRUE }
          if((SEM.width/std.dev.width)<=.3){ greater.than.30 <- FALSE }
          if(SEM.width<=SEM.width.threshold){ greater.than.02 <- FALSE }
          if (verbose) {
            cat(sprintf("%.3f (%.3f, %.3f) [(%.3f) %.3f, %.3f] {%.3f} ", center, lower, upper, width, std.dev.lb[k,m], std.dev.ub[k,m], SEM.width/std.dev.width))
            if(greater.than.30) { cat("* ") }
            if(greater.than.02) { cat("**") }
          }
        }
        if (verbose) {
          cat("\n")
        }
        if(greater.than.30 & greater.than.02) { indices.to.keep[k] <- FALSE }
      }
      if ( any(indices.to.keep==FALSE) ) {
        remove.data <- TRUE
      }
    } # End apply.large.SEM.condition

    if(all(indices.to.keep==TRUE) & (apply.overlapping.SEM.condition == TRUE) & (N.c > 1)) {

      # Determine if component i's center is contained within
      # component i2's std.dev
      pi.threshold = 10^-2

      for(i in 1:N.c){
        i.subsumed.by.another.cluster <- FALSE
        for(i2 in 1:N.c){
          if(i == i2) { next }
          if(indices.to.keep[i2] == FALSE) { next }
          if(E.pi[i2] < pi.threshold) { next }
          i.subsumed.by.another.cluster <- TRUE
          for(l in 1:num.dimensions){
            i.center <- std.dev.centers[i,l]
            if((i.center < std.dev.lb[i2,l]) | (i.center > std.dev.ub[i2,l])) {
              i.subsumed.by.another.cluster <- FALSE
              break
            }
          }
          if(i.subsumed.by.another.cluster==TRUE) {
            if(verbose){
              cat(sprintf("2. Dropping cluster with center: "))
              for(l in 1:num.dimensions){
                cat(sprintf("%.3f ", std.dev.centers[i,l]))
              }
              cat(sprintf("because it overlaps with: "))
              for(l in 1:num.dimensions){
                cat(sprintf("(%.3f, %.3f) ", std.dev.lb[i2,l], std.dev.ub[i2,l]))
              }
              cat("\n")
              break
            }
          }
        }
        if(i.subsumed.by.another.cluster==TRUE) {
          indices.to.keep[i] <- FALSE
        }
      }

    } # End apply.overlapping.SEM.condition

    if(all(indices.to.keep==TRUE) & (apply.overlapping.std.dev.condition == TRUE) & (N.c > 1)) {

      # Determine how much component i's std dev overlaps component i2's std dev
      overlaps <- matrix(data = 1, nrow=N.c, ncol=N.c)
      std.dev.overlap.threshold <- 0
      for(i in 1:N.c){
        for(i2 in 1:N.c){
          if(verbose){
            cat("   ")
          }
          for(l in 1:num.dimensions){
            fraction <- 0
            if((std.dev.lb[i,l] < std.dev.ub[i2,l]) & (std.dev.ub[i,l] > std.dev.lb[i2,l])) {
              overlap <- min(std.dev.ub[i,l], std.dev.ub[i2,l]) - max(std.dev.lb[i,l], std.dev.lb[i2,l])
              fraction <- overlap / ( std.dev.ub[i,l] - std.dev.lb[i,l] )
            }
            if((fraction > std.dev.overlap.threshold) & (overlaps[i,i2] != 0)) {
              overlaps[i,i2] <- fraction
            } else {
              overlaps[i,i2] <- 0
            }
          }
        }
      }

      # Remove one of two overlapping clusters.  If both overlap
      # (NB: above is not symmetric), then only remove smaller of two.
      save.verbose <- verbose
      verbose <- TRUE
      for(i in 2:N.c){
        if(indices.to.keep[i] == FALSE) { next }
        for(i2 in 1:(i-1)){
          if(indices.to.keep[i2] == FALSE) { next }
          if(overlaps[i,i2] > 0) {
            if((overlaps[i2,i] > 0) & (E.pi[i2] < E.pi[i])) {
              if(verbose){cat("Condition (", num.dimensions, "D): Removing ", i2, " because of overlap (", overlaps[i2,i], ") with i = ", i, "\n")}
              indices.to.keep[i2] <- FALSE
            } else {
              if(verbose){cat("Condition (", num.dimensions, "D): Removing ", i, " because of overlap (", overlaps[i2,i], ") with i2 = ", i2, "\n")}
              indices.to.keep[i] <- FALSE
            }
          }
        }
      }
      verbose <- save.verbose
    } # End apply.overlapping.std.dev.condition



    if ( any(indices.to.keep==FALSE) ) {

      do.inner.iteration <- TRUE

      numeric.indices <- (1:N.c)

      E.pi <- E.pi[indices.to.keep]
      E.lnpi <- E.lnpi[indices.to.keep]

      if(remove.data == TRUE) {
        cluster.indices.to.keep <- (1:N.c)[indices.to.keep]
        # clusters == 0 -> outlier already                
        new.outliers <- matrix(vafs[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),], ncol=num.dimensions)
        outliers <- rbind(outliers, new.outliers)
        # To remove data from the data set, set its entries to NA
        vafs[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),] <- NA
        successes[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),] <- NA
        total.trials[!(clusters == 0) & !(clusters %in% cluster.indices.to.keep),] <- NA
      }

      N.c <- length(E.pi)

      r <- matrix(r[,indices.to.keep], nrow=N, ncol=N.c)
      ln.rho <- matrix(ln.rho[,indices.to.keep], nrow=N, ncol=N.c)
      # Need to renormalize r--do it gently.
      for(n in 1:N) {
         if(any(is.na(ln.rho[n,]))) {
           r[n,] <- rep(NA, N.c)
           next
         }

         row.sum <- log(sum(exp(ln.rho[n,] - max(ln.rho[n,])))) + max(ln.rho[n,])
         for(k in 1:N.c) { r[n,k] = exp(ln.rho[n,k] - row.sum) }
      }

      m <- matrix(m[indices.to.keep,], nrow=N.c, ncol=num.dimensions)
      alpha <- alpha[indices.to.keep,drop=FALSE]
      beta <- beta[indices.to.keep,drop=FALSE]
      nu <- nu[indices.to.keep,drop=FALSE]
      W <- W[indices.to.keep,drop=FALSE]

      m0 <- matrix(m0[indices.to.keep,], nrow=N.c, ncol=num.dimensions)
      alpha0 <- alpha0[indices.to.keep,drop=FALSE]
      beta0 <- beta0[indices.to.keep,drop=FALSE]
      nu0 <- nu0[indices.to.keep,drop=FALSE]
      W0 <- W0[indices.to.keep,drop=FALSE]

      E.pi.prev <- E.pi
    }

    if(do.inner.iteration == FALSE) { break }

  }

  L <- gaussian.bmm.calculate.posterior.predictive.precision(m, alpha, beta, nu, W)


  # Calculate standard error of the means
  SEM.res <- gaussian.bmm.narrowest.mean.interval.about.centers(m, alpha, beta, nu, W, width)
  SEM.centers <- t(SEM.res$centers)
  SEMs.lb <- t(SEM.res$lb)
  SEMs.ub <- t(SEM.res$ub)

  # Calculate standard errors
  std.dev.res <- gaussian.bmm.narrowest.proportion.interval.about.centers(m, alpha, beta, nu, W, width)
  std.dev.centers <- t(std.dev.res$centers)
  std.dev.lb <- t(std.dev.res$lb)
  std.dev.ub <- t(std.dev.res$ub)
  
  save.verbose <- verbose
  verbose <- TRUE
  for(k in 1:N.c) {
    if(verbose){cat(sprintf("Cluster %d pi = %.3f center =", k, E.pi[k]))}
    
    for(d in 1:num.dimensions) {
      center <- SEM.centers[k,d]
      if(verbose){cat(sprintf(" %.3f", center))}
    }
    if(verbose){cat(" SEM =")}
    for(d in 1:num.dimensions) {
      lb <- SEMs.lb[k,d]
      ub <- SEMs.ub[k,d]
      if(verbose){cat(sprintf(" (%.3f, %.3f)", lb, ub))}
    }
    if(verbose){cat(" sd =")}
    for(d in 1:num.dimensions) {
      lb <- std.dev.lb[k,d]
      ub <- std.dev.ub[k,d]
      if(verbose){cat(sprintf(" (%.3f, %.3f)", lb, ub))}
    }
    if(verbose){cat("\n")}
  }
  verbose <- save.verbose

  retList <- list("retVal" = 0, "m" = m, "alpha" = alpha, "beta" = beta, "nu" = nu, "W" = W, "L" = L, "r" = r, "num.iterations" = total.iterations, "ln.rho" = ln.rho, "E.lnpi" = E.lnpi, "E.pi" = E.pi, "outliers" = outliers)

  return(retList)

} # End gaussian.bmm.filter.clusters
genome/sciclone documentation built on Oct. 15, 2023, 5:09 a.m.