MBHC: Model-Based Hierarchical Clustering

Documented in mbhc.find.cut.off mbhc.find.cut.off.multiple mbhc.multiple.cut mbhc.prepdata mbhc.reduce.graph mbhc.run mbhc.single mbhc.var

get.var <-function(x, x.nb, s, weight=1) {
  a <- mean(x.nb[as.logical(x)]);
  b <- var(x.nb[as.logical(x)])*weight;
  c <- sum(as.logical(x));
  d <- s;
  return(c(a, b, c, d));
}
fit.variants.weight <- function(variants){   #to calculate the initial matrix
  weight <- variants[length(variants)]
  variants <- variants[-length(variants)]
  #fit<-optim(par1, y=variants, fn=optim.obj, weight=weight,method="Brent",lower=1.5, upper=1000)#,  control=list(factr=1e-3, maxit=0, trace=0)) #, lower=1.5, upper=1000, method='L-BFGS-B'
  fit1<-optimize(y=variants,weight=weight,exp.depth=exp.depth ,f=optim.obj, interval = c(1,199),tol=0.001)#,lower=1, upper=200, control=list(factr=1e-3, maxit=200, trace=0)) #, lower=1.5, upper=1000, method='L-BFGS-B'
   return(c(fit1$object,fit1$minimum))#,c(fit3$optim$bestval, fit3$optim$bestmem)))
}
fit.variants <- function(variants){   #to calculate the initial matrix
  return(fit.variants.weight(c(variants, -0.0001)));
}
fit.elements <- function(element.1, element.2, arg.data){
  data.1 <- arg.data[as.logical(element.1)]
  data.2 <- arg.data[as.logical(element.2)]
  data.merge <- c(data.1, data.2)
  fit <- fit.variants(data.merge)
  return(fit)
}
fit.elements.weight <- function(element.1, element.2, arg.data, arg.data.other){
  data.1 <- arg.data[as.logical(element.1)]
  data.2 <- arg.data[as.logical(element.2)]
  weight <- get.weights.within(element.1 + element.2, arg.data.other)
  data.merge <- c(data.1, data.2, 10*weight)
  fit <- fit.variants.weight(data.merge)
  print(data.merge)
  return(fit)
}
fit.elements.weight.reduce <- function(element.1, element.2, arg.data, arg.data.other){
  data.1 <- arg.data[as.logical(element.1)]
  data.2 <- arg.data[as.logical(element.2)]
  weight <- get.weights.within(element.1 + element.2, arg.data.other)
  data.merge <- c(data.1, data.2, 10*weight)
  fit <- fit.variants.weight(data.merge)
  #print(data.merge)
  return(fit)
}

get.weights.between <- function(x1, x2, arg.data) {
  xx <- x1;
  if(is.vector(x1)) {
    xx <- t(as.data.frame(x1));
  }
  weights <- apply(xx, 1, function(x) { x <- x + x2; return(get.weights.within(x, arg.data=arg.data)) } );
  return(weights)
}
get.weights.within <- function(x1, arg.data) {   #arg.data should be the remaining samples
  weights <- c()
  #print(arg.data)
  if(sum(x1)>1){
    weights <- apply(arg.data, 2, function(x){ return(max(dist(x[as.logical(x1)]))) })
  } else{
    weights <- 0
  }
  #print(weis)
  return(max(weights))
}

optim.obj <- function(par1,exp.depth=exp.depth, y, weight=-0.0001){

  R <-dbeta(y, shape1=par1, shape2=exp.depth-par1, log=T) #log should be F- because if Log=T we could have positive and negative values. We should add log
  #print(R)
  #R[which(R < 1e-10)] <- 0;
  #if(length(which(R == 0)) > 0) {
  #return(1000);
  #}
  # print(R)
  #	print(paste(y, shapes))
  # print(mean(R))
  v <- sd(y)
  range<- max(y)-min(y)
  #print(v)
  #print('v')
  #print(v)

  #print('range')
  #print(range)

  # print('v,meanR,logR, expWei')
  v <- ifelse(v<0.005, 0.005, v)
  range<- ifelse(range<0.005,0.005,range)

  #R <- R/v
  R <- R/(range*v)
  #	print(mean(R))
  #R <- log(R)
  # print('mean')
  #  print(mean(R))
  #print(exp(5*(weight+0.0001)))
  results <- -mean(R)/exp(5*(weight+0.0001))
  #print(paste(results, 'end'))
  return(results)
}  #MAY 6th version. Uses range- only one par.


#' MBHC preparing the data
#'
#' This function creates ID for the data and preprocess the data for the MBHC.
#' @param arg.data the input data. In the dataframe format.
#' @export
#' mbhc.prepdata()
mbhc.prepdata<- function(arg.data){

  ### arg data should have the frequencies in rows and columns for samples. for 3 samples, 100 mutation: 3 columns,100rows.
  arg.data <- as.matrix(arg.data)
  arg.data <- round(arg.data,3)
  arg.data <- ifelse(arg.data<1e-3, 1e-3, arg.data)
  arg.data <- ifelse(arg.data> 1-(1e-3), 1-(1e-3), arg.data)

  prep.data<- data.frame(arg.data, row.names=NULL)
  prep.data$ID<- c(1:dim(prep.data)[1])
  return(prep.data)
}




#' Model based hc
#'
#' This function performs the model-based hierarchical clustering
#'
#' @param arg.data,sampleNum Arg.data is the output of mbhc.prepdata function with IDs added. sampleNum for single sample should be 1
#' @export
#'
mbhc.single <- function(arg.data, sampleNum=1){
  if(is.null(arg.data$ID)){ return(print('Run mag.prep on the data'))}
  time0 <- as.numeric(Sys.time())
  arg.data<- round(arg.data,3)
  arg.data[,-dim(arg.data)[2]]<- ifelse(arg.data[,-dim(arg.data)[2]] < 1e-3, 1e-3, arg.data[,-dim(arg.data)[2]])
  arg.data[,-dim(arg.data)[2]]<- ifelse(arg.data[,-dim(arg.data)[2]] > 1-1e-3, 1-1e-3, arg.data[,-dim(arg.data)[2]])

  x.nb <- arg.data;
  if(!is.null(ncol(arg.data)) && ncol(arg.data) > 0) {
    x.nb <- arg.data[,sampleNum]
  }
  params<-c();  x.values<-c();  x.steps<-c();  row.loop<-c(); x.step<-c()
  x.elements.list<-list()
  x.elements<-data.frame(diag(length(x.nb)))
  x.el.first<- data.frame(diag(length(x.nb)))

  x.el.first.okay<- apply(x.el.first, 1, function(x) paste(x, collapse = '')) # april 15

  sort.x<-sort(arg.data[,sampleNum])
  #sort.x<- ifelse(sort.x < 1e-9, 1e-9, sort.x)
  #sort.x<- ifelse(sort.x > 1-1e-4, 1-1e-3, sort.x)


  #x.loop<-sort.x
  sort.arg.data<- arg.data[order(arg.data[,sampleNum]),]

  names(x.elements)<-sort.arg.data$ID
  names(x.el.first)<- sort.arg.data$ID

  l<-length(sort.x)
  mat.loop<-matrix(10^5, ncol=l, nrow=l)
  par.1.loop<-matrix(0, ncol=l, nrow=l)
  #par.2.loop<-matrix(0, ncol=l, nrow=l)

  s.pairs <- cbind(sort.x[1:(l-1)], sort.x[2:l])
  #print(s.pairs)
  s.likls <- t(apply(s.pairs, 1, fit.variants))  #calculated likelihoods on the pairs

  #mat.loop<-diag(0, nrow=l, ncol=l)
  for(i in 2:l){
    mat.loop[i-1,i] <- s.likls[i-1, 1]      ### the s.likls is the likelihood between i-1)th element and the i th.
    par.1.loop[i-1,i] <- s.likls[i-1, 2]
    # par.2.loop[i-1,i] <- s.likls[i-1, 3]
  }

  time1 <- as.numeric(Sys.time())
  var.mean<-c()
  s<-1
  while(nrow(x.elements)>= 2){
    # print("INJA")
    # print(s)
    #print(mat.loop)

    #print(min(mat.loop))
    x.values[s]<-min(mat.loop)
    el1<-which(mat.loop==min(mat.loop), arr.ind = T)[1,1]
    el2<-which(mat.loop==min(mat.loop), arr.ind = T)[1,2]
    el.rm<-c(el1,el2)
    # print("Remove ina")
    # print(el.rm)
    params<-rbind(params,par.1.loop[el1,el2])

    #what happenes at each step
    x.step <- x.elements[el.rm[1],] + x.elements[el.rm[2],]   ###this is very clever! I dont need to update the xloop.
    ###only use x.elements rows for each cluster!
    x.steps<-rbind(x.steps, x.step)
    #print(x.steps)
    x.elements<-rbind(x.elements[-el.rm,], x.step)
    x.elements.list[[s]]<-x.elements


    #### april 14 add "okay"
    temp<- t(apply(x.elements, 1, get.var, x.nb=sort.x, s=s ))          ####for oc.v9 I changed x.nb=x.nb to x.nb=
    #####
    #okay<- apply(x.elements, 1, function(x) nrow(merge(t(x), x.el.first)))               ################# APRIL 14 2018   takeeeesss a long time
    ####################### faster version? :
    okay<- apply(x.elements, 1, function(x) sum(paste(x, collapse = '')%in%x.el.first.okay))
    weights<- rep(0, length(okay))  ##### just to have weight  APRIL 15

    temp<- cbind(temp,weights ,okay)

    var.mean<- rbind(var.mean, temp)




    #####UPDATE MATRIX HERE
    mat.loop<-mat.loop[-el.rm, -el.rm]
    par.1.loop<-par.1.loop[-el.rm, -el.rm]
    #par.2.loop<-par.2.loop[-el.rm, -el.rm]

    mat.column.update<-c()
    par1.column.update<-c()
    #par2.column.update<-c()

    max<-max(which(x.step%in%1))
    min<-min(which(x.step%in%1))
    #		print(paste("min", min))
    nei<-c()
    if(min == 1 & max != l){
      neiIND<-which(x.elements[,max+1]==1)
      nei<-x.elements[neiIND,]
      #			print("if1")
    }else if(max == l & min != 1){
      neiIND<-which(x.elements[,min-1]==1)
      nei<-x.elements[neiIND,]
      #			print("if2")
    }else if( max!=l & min!=1){
      nei1<-which(x.elements[,min-1]==1)
      nei2<-which(x.elements[,max+1]==1)
      neiIND<-c(nei1,nei2)
      nei<-x.elements[neiIND,]
      #			print("if3")
    }

    if(length(nei) > 0){
      x.last <- x.elements[nrow(x.elements), ]
      #print('sortx')
      #print(sort.x)
      #print(x.last)
      #print(nei)
      #print('done')
      temp <- apply(nei, 1, fit.elements, arg.data=sort.x, element.2=x.last)

      ###try to only look at cluster neighbours
      ###instead of dim i will use sqrt of length. Because mat.loop is always square matrix   WORKS!!! SEP12
      l2<-sqrt(length(mat.loop))
      mat.column.update<-rep(10^5,l2)
      mat.column.update[neiIND]<-temp[1,]
      par1.column.update<-rep(0, l2)
      par1.column.update[neiIND]<-temp[2,]
      #par2.column.update<-rep(0, l2)
      #par2.column.update[neiIND]<-temp[3,]

      mat.loop<-cbind(mat.loop, mat.column.update )
      mat.loop<-rbind(mat.loop, rep(10^5,dim(mat.loop)[2]))    #just to keep mat.loop square.
      par.1.loop<-cbind(par.1.loop, par1.column.update)
      par.1.loop<-rbind(par.1.loop, rep(0, dim(par.1.loop)[2]))
      #par.2.loop<-cbind(par.2.loop, par2.column.update)
      #par.2.loop<-rbind(par.2.loop, rep(0, dim(par.2.loop)[2]))
    }

    s<-s+1
  }

  #var.mean <-cbind(var.mean, 1/(var.mean[,3]))
  colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step", "weights", 'okay')

  time2 <- as.numeric(Sys.time())
  cat("while took ", (time2 - time1), " seconds.\n")
  cat("total took ", (time2 - time0), " seconds.\n")

  result <- list(x.el=x.elements.list, params=params, var.mean=var.mean, data=arg.data, data.sorted=sort.arg.data,freq.s= sort.x, time=list(time0,time1, time2))
  return(result)
}

#' find cut off
#'
#' This function finds the optimal cluster assignment for the data.
#' @param mbhc.output The output of mbhc.single is the input of find.cut.off
#' @export
mbhc.find.cut.off <- function(mbhc.output){
  library(dplyr)
  time0 <- as.numeric(Sys.time())
  var.mean <- mbhc.output$var.mean
  var.mean <- data.frame(var.mean, row.names = NULL)
  var.mean[is.na(var.mean)] <- 0
  var.mean$var <- round(var.mean$var, 4)
  vars<- var.mean$var[var.mean$var>0]
  var.bound<- quantile(vars)[4]*1.5


  flag <- FALSE
  m <- c()
  str <- c()  #clone structure line from temp var.mean matrix
  str.f <- c()
  cutstr <- c()
  cutstr2 <- c()


  i <- max(var.mean$step)
  allPoints <- mbhc.output$x.el[[i]]
  ok.points <- allPoints - allPoints  ##### To generate clean slate
  n <- length(allPoints)
  #	print(allPoints)
  id.ok <-c() #clusters with okay variance!
  temp.mv <- c();
  temp.points <- c();
  temp.step <- c();
  temp.point.step <- c();
  while(flag == FALSE){
    #print(flag)
    print(i); flush.console();
    v <- i

    ############
    ## I need to use x.el to get the breaks, because using StepI would cause problem
    ## in identical clusters and mutations.
    ## TO FIX THIS I JUST KEPT MY ORIGINAL METHOD -
    ## I just duplicated the rows in the case of identical mutations
    ## because there should be onlytwo rows at each step
    ## and having only 1 row means identical clusters.
    ############
    stepI <- var.mean[var.mean$step==i,]
    stepI.x.el <- mbhc.output$x.el[[i]]
    stepI.x.el.step <- stepI.x.el[nrow(stepI.x.el), ]  #Only look at the most recent cluster

    stepII <- var.mean[var.mean$step==i-1, ]  ## previous step
    #stepII$var<-stepII$var,)
    stepII.x.el <- mbhc.output$x.el[[i-1]]
    print("II")
    #print(stepII.x.el)

    #if( !(id.sum%in%id.ok)){    ###this means if the division on this step is not happening on the "ok" clusters. so the okay clusters should still exist in the temp.
    #if(sum(abs(id.ok-id.sum)<0.1)==0){
    #newtemp<-temp[!(temp$id%in%id.ok),]
    #if(!id.I%in%id.ok){
    temp.points <- rbind(temp.points, c(i, ok.points));
    temp.step <- rbind(temp.step, c(i, stepI.x.el.step));
    temp.point.step <- rbind(temp.point.step, c(i, sum(ok.points & stepI.x.el.step)))


    print(ok.points)
    print(sum(ok.points & stepI.x.el.step))
    if(sum(ok.points & stepI.x.el.step) == 0){     ###meaning stepI.x.el is not in ok.points

      stepII.breaks <- setdiff(stepII[,-c(4,6)], stepI[,-c(4,6)])
      if(nrow(stepII.breaks)==1){
        print("why did this happen????")
        stepII.breaks <- rbind(stepII.breaks, stepII.breaks)
      }
      stepII.num <- stepII[, 4]
      stepII.breaks <- cbind(stepII.breaks, stepII[1:2,c(4,6)])   #just add the step number back to the matrix
      stepII.breaks.x.el <- setdiff(stepII.x.el, stepI.x.el)
           for(ii in 1:2){
         if(stepII.breaks[ii, 'okay'] ==1 | stepII.breaks$var[ii]<var.bound) {        #okay cluster   APRIL 15

          str.f <- c(str.f, stepII.breaks[ii,1])
          #print("here")
          #print(str)
          str <- rbind(str, stepII.breaks[ii,])
          print(stepII.breaks.x.el)
          cutstr2 <- rbind(cutstr2, stepII.breaks.x.el[ii,])
          #print(str)

          #id.ok<-c(id.ok, stepII.breaks$id[ii])
          ok.points <- ok.points + stepII.breaks.x.el[ii,]
          #print(ok.points)
        }
      }
    }

    #print("LAST PRINT")
    if(sum(ok.points>1)>0){
      print("ERROR")
    }
    if(sum(ok.points) == n){
      flag<-TRUE
    } else {
      i<-(v-1)
    }
    #if(v==i){iii<-TRUE}else{i<-(v-1)}
  }

  for( i in 1:nrow(str)){
    #print(i)
    xel <- mbhc.output$x.el[[str$step[i]]]
    #print(xel)
    for(j in 1:nrow(xel)){
      m1 <- mean(mbhc.output$freq.s[xel[j,]==1])    ##changed to sorted. after oc.9#### OCT 30: THis should be changed for OC11 since there is no sorting in that
      if(abs(m1-str[i,1]) < 0.001){
        cutstr<-rbind(cutstr, xel[j,])  ## what's the difference between cutstr and cutstr2????????
      }
    }
  }


  time1 <- as.numeric(Sys.time())
  cat("took ", (time1 - time0), " seconds.\n");
  #print(cutstr)
  colors<-colSums(cutstr*c(1:nrow(cutstr)))+1
  final.data<-cbind(mbhc.output$data.sorted,colors)
  results<-list(str=str, cutclust=cutstr, cutclust2=cutstr2, colors=colors, final.data=final.data)
  return(results)
}


#'
#'Reduce function
#'
mbhc.reduce.graph<- function(arg.data,x.el=data.frame() ,sampleNum=1, n=-100){
  #x.el<- reduce.out$x.el.red
  #data<- reduce.out$data
  #data.sorted<- reduce.out$data.sorted
  #IDs<- data.sorted[, dim(data.sorted)[2]]

  data<- arg.data
  data.sorted<- arg.data[order(arg.data[,sampleNum]),]
  IDs<- data.sorted[, dim(data.sorted)[2]]

  if(dim(x.el)[1]==0){
    l<- dim(data)[1]
    x.el<- data.frame(diag(l))
    names(x.el)<- IDs
  }


  mean.frqs<-t(apply( x.el, 1, function(x){mean(data.sorted[as.logical(x),sampleNum])}))
  x.el.sorted<-x.el[order(mean.frqs),]
  x.el.final<- data.frame()
  x.nb<- data.sorted[,sampleNum]
  ss.other<- data.sorted[,-c(sampleNum, dim(data.sorted)[2]), drop=F]

  size.threshold<-5

  l<-nrow(x.el)
  full<-floor(l/size.threshold)
  rem<-l%%size.threshold
  row.fold<-0

  if( rem < 5 ){ full<- full-1; rem<- rem+size.threshold}


  optim.value.bound<-fit.variants.weight(c(0.4,0.46,0.05))[1]




  ind.full<- c(1:size.threshold)
  combn.el<- t(combn(ind.full,2))

  if(full>0){
    for(i in 1:full){

      fold<- c((1+(i-1)*size.threshold):(i*size.threshold))
      x.el.fold<-x.el.sorted[fold, ]
      #print(i)
      initial.full<-apply(combn.el, 1, function(x){fit.elements.weight.reduce( x.el.fold[x[1],],
                                                                               x.el.fold[x[2],],
                                                                               x.nb, ss.other )})
      tempMatLoop<-matrix(1000, nrow=size.threshold, ncol=size.threshold)
      tempMatLoop.2<-lower.tri(diag(0, nrow=size.threshold, ncol=size.threshold))

      tempMatLoop[tempMatLoop.2] <- initial.full[1, ]
      mat.loop<-t(tempMatLoop)
      mat.loop.g<- mat.loop
      mat.loop.g[mat.loop <= optim.value.bound]<- 1
      mat.loop.g[mat.loop> optim.value.bound]<- 0
      sum(mat.loop.g==1)
      g1<-graph_from_adjacency_matrix(mat.loop.g, mode = 'undirected')
      complete_points<- max_cliques(g1, min=2)
      if(length(complete_points)>0){
        x.el.graph<-c()
        el.rm.graph<-c()
        for(i in 1:length(complete_points)){
          el.rm<- as.numeric(complete_points[[i]])
          el.rm<- el.rm[!el.rm%in%el.rm.graph]
          if(length(el.rm)>0){
            x.el.graph<- rbind(x.el.graph, colSums(x.el.fold[el.rm,]))
            el.rm.graph<- c(el.rm.graph, el.rm)
          }}
        x.el.fold<- rbind(x.el.fold[-el.rm.graph, , drop=F],x.el.graph)
        colSums(x.el.fold)}
      x.el.final<- rbind(x.el.final, x.el.fold)
      {  #while(  min(mat.loop)< optim.value.bound & nrow(mat.loop)>2){#

        # el.rm <- which(mat.loop==min(mat.loop), arr.ind=T)
        #  el.rm <- el.rm[1,]   # april 14

        #el.rm<- unique(as.numeric(el.rm))

        # mat.loop<- mat.loop[-el.rm, -el.rm, drop=F]
        #if(sum(dim(mat.loop))==0){mat.loop=matrix(c(0,0,0,0),nrow = 2)}
        #x.el.fold<- rbind( x.el.fold[-el.rm, , drop=F], colSums(x.el.fold[el.rm,]))
        #colnames(x.el.fold)<-row.names(data.sorted)
        #}
      }
    }}

  rems<- c((l-rem+1):l)
  #print(rems)
  x.el.rem<- x.el.sorted[rems,]
  ind.rem<- c(1:rem)
  combn.rem<- t(combn(ind.rem, 2))
  initial.rems<-apply(combn.rem, 1, function(x){fit.elements.weight.reduce( x.el.rem[x[1],],
                                                                            x.el.rem[x[2],],
                                                                            x.nb, ss.other )})

  ###may 6th
  tempMatLoop<-matrix(1000, nrow=rem, ncol=rem)
  tempMatLoop.2<-lower.tri(diag(0, nrow=rem, ncol=rem))
  tempMatLoop[tempMatLoop.2] <- initial.rems[1, ]
  mat.loop<-t(tempMatLoop)
  #diag(mat.loop) <- 1e5;
  mat.loop.g<- mat.loop
  mat.loop.g[mat.loop <= optim.value.bound]<- 1
  mat.loop.g[mat.loop> optim.value.bound]<- 0
  sum(mat.loop.g==1)
  g1<-graph_from_adjacency_matrix(mat.loop.g, mode = 'undirected')
  complete_points<- max_cliques(g1, min=2)
  x.el.graph<-c()
  el.rm.graph<-c()

  #print(complete_points)
  if(length(complete_points)>0){
    for(i in 1:length(complete_points)){
      el.rm<- as.numeric(complete_points[[i]])
      el.rm<- el.rm[!el.rm%in%el.rm.graph]
      if(length(el.rm)>0){
        x.el.graph<- rbind(x.el.graph, colSums(x.el.rem[el.rm,]))
        el.rm.graph<- c(el.rm.graph, el.rm)
      }}
    x.el.rem<- rbind(x.el.rem[-el.rm.graph, , drop=F],x.el.graph)
    colSums(x.el.rem)}
  x.el.final<- rbind(x.el.final, x.el.rem)
  {
    #while(  min(mat.loop)< -10 &nrow(mat.loop)>2){#
    #  el.rm<- which(mat.loop==min(mat.loop), arr.ind=T)
    #  el.rm<- unique(as.numeric(el.rm))

    # mat.loop<- mat.loop[-el.rm, -el.rm, drop=F]
    #  if(sum(dim(mat.loop))==0){mat.loop=matrix(c(0,0,0,0),nrow = 2)}
    #  x.el.rem<- rbind( x.el.rem[-el.rm, , drop=F], colSums(x.el.rem[el.rm,]))
    #  colnames(x.el.fold)<-row.names(data.sorted)
    #}
    #x.el.final<- rbind(x.el.final, x.el.rem)
  }

  results<- list(x.el.red=x.el.final, data=arg.data, data.sorted=data.sorted);
  nn <- nrow(results$x.el.red)
  print("jj")
  print(jj)
  jj<-jj+1
  if(nn < size.threshold | nn == n) {
    return(results)
  } else {
    #mbhc.reduce.2(results, sampleNum=sampleNum,n=nn)
    mbhc.reduce.graph(arg.data=data, sampleNum=sampleNum,x.el=x.el.final,n=nn)
  }



}

#'
#'MBHC multiple
#'
#' This function performs the clustering on multiple samples.
#' @export
#'
#' @param arg.data,x.el.cut,sampleNum Arg.data should be a dataframe with IDs in the last column. Each column correspond to a sample. X.el.cut is the output structure of mbhc.reduce which performs a prliminary clustering. it can be empty. SampleNum is the column of the primary sample that clustering needs to be run on.
mbhc.multiple.cut <- function(arg.data,x.el.cut=data.frame(), sampleNum=1) {

  if(sampleNum>dim(arg.data)[2]-1){print("stop the algorithm and check the sampleNum; run mbhc.prepdata() first")}

  time0<-as.numeric(Sys.time())
  params<-c();  x.values<-c();  x.steps<-c();  row.loop<-c();x.step<-c()
  x.elements.list <- list()

  weights.list <- list()
  weights.dist <- c()
  arg.data <- as.matrix(arg.data);
  arg.data.other <- c();

  if(ncol(arg.data) == 2){        ##### change '1' to '2' because the 2nd column is always id
    print('executing single sample clustering....'); flush.console();
    return(mag.single(arg.data[,1])) #feb 26: the second column is the IDs
  } else {
    arg.data.other<- arg.data[, -c(sampleNum,dim(arg.data)[2]), drop=F]    #other samples in the data     FEB26: aarg.datauming last column is ID
  }

  #feb 27: took these out of the if statement. So every version is done based on sorted data.
  order<-order(arg.data[,sampleNum])
  arg.data.other<-arg.data.other[order,,drop=F]
  data.sorted<- arg.data[order, ]
  x.nb<- arg.data[order, sampleNum]
  IDs<- arg.data[order,dim(arg.data)[2]] #FeB26

  if( sum(dim(x.el.cut))==0){
    print("Mag.reduce has not been run.")
    x.elements<-data.frame(diag(length(x.nb)))
    #names(x.elements)<-c(1:length(x.nb))   feb 26
    names(x.elements)<- IDs
  }else{
    x.elements<- x.el.cut
  }



  #print(class(arg.data.other))

  #x.elements<- x.el.cut


  #work on the target sample--- use other samples to find weight
  x.loop <- x.nb
  l <- length(x.loop)
  # generate pairs of the indexes... ----------------------------------------
  indx<-c(1:length(x.nb))
  combn.idx <- t(combn(indx, 2))
  weights <- apply(combn.idx, 1, function(x)
  { temp.elements <- rep(0, length(x.nb)); temp.elements[x] <- 1;
  return(get.weights.within(temp.elements, arg.data.other)) } );  ### Now I Have the initial weights
  combn.x <- t(combn(x.nb, 2))
  initial <- cbind(combn.x, weights)
  weights.list[[1]]<-initial
  #print(initial)


  l<- nrow(x.elements)

  #### new matloop with reduced x.el:
  indx.cut<-c(1:nrow(x.elements))
  combn.cut <- t(combn(indx.cut, 2))

  ###no need to run weights...
  cat('initial fitting ... '); flush.console();
  time1 <- as.numeric(Sys.time())
  initial.1<-apply(combn.cut, 1, function(x){fit.elements.weight( x.elements[x[1],], x.elements[x[2],],
                                                                  x.nb, arg.data.other )})
  time2 <- as.numeric(Sys.time())
  cat('took ', time2-time1, ' seconds.\n'); flush.console();

  tempMatLoop<-matrix(1000, nrow=l, ncol=l)  #may 6th
  tempMatLoop.2<-lower.tri(diag(0, nrow=l, ncol=l))
  tempMatLoop[tempMatLoop.2] <- initial.1[1, ]
  mat.loop<-t(tempMatLoop)
  #diag(mat.loop) <- 1e5;


  tempPar1Loop<-lower.tri(diag(0, nrow=l, ncol=l))
  #tempPar2Loop<-lower.tri(diag(0, nrow=l, ncol=l))
  tempPar1Loop[tempPar1Loop] <- initial.1[2, ]
  #tempPar2Loop[tempPar2Loop] <- initial.1[3, ]

  par.1.loop<-t(tempPar1Loop)
  #par.2.loop<-t(tempPar2Loop)
  weights.step<-c()
  var.mean <- c()


  time2<-as.numeric(Sys.time())
  s<-1
  while(nrow(x.elements) > 1){
    cat(s, '...', nrow(x.elements), '...\n'); flush.console();

    x.values[s]<-min(mat.loop)
    el1<-which(mat.loop==min(mat.loop), arr.ind = T)[1,1]
    el2<-which(mat.loop==min(mat.loop), arr.ind = T)[1,2]
    el.rm<-c(el1,el2)
    params<-c(params, par.1.loop[el1,el2])#, par.2.loop[el1,el2]))

    #what happenes at each step
    x.step <- x.elements[el.rm[1], ] + x.elements[el.rm[2],]   ###this is very clever! I dont need to update the xloop.
    ###only use x.elements rows for each cluster!
    x.steps <- rbind(x.steps, x.step)
    # print(x.steps)
    x.elements <- rbind(x.elements[-el.rm,], x.step)
    x.elements.list[[s]]<-x.elements

    var.mean.temp <- t(apply(x.elements, 1, get.var, x.nb=x.nb, s=s))
    weights.temp <- apply(x.elements, 1, get.weights.within, arg.data=arg.data.other)
    var.mean.temp <- cbind(var.mean.temp, weights.temp)
    var.mean <- rbind(var.mean, var.mean.temp)

    #####UPDATE MATRIX HERE
    mat.loop<-mat.loop[-el.rm, -el.rm, drop=F]
    par.1.loop<-par.1.loop[-el.rm, -el.rm, drop=F]


    mat.column.update<-c()
    par1.column.update<-c()
    #par2.column.update<-c()

    x1 <- x.elements[1:(nrow(x.elements)-1), ]
    x2 <- x.elements[nrow(x.elements), ]   #the ones that are put together
    #print(x1)
    #print(x2)

    fit.between <- t(apply(x1, 1, fit.elements.weight, element.2=x2, arg.data=x.loop, arg.data.other=arg.data.other))
    mat.column.update <- fit.between[, 1]
    par1.column.update <- fit.between[, 2]
    #par2.column.update <- fit.between[, 3]

    #print(mat.loop)
    if(nrow(mat.loop) != 0){
      mat.loop<-cbind(mat.loop, mat.column.update )
      mat.loop<-rbind(mat.loop, rep(1000,dim(mat.loop)[2]))    #just to keep mat.loop square.
      par.1.loop<-cbind(par.1.loop, par1.column.update)
      par.1.loop<-rbind(par.1.loop, rep(0, dim(par.1.loop)[2]))
      # par.2.loop<-cbind(par.2.loop, par2.column.update)
      # par.2.loop<-rbind(par.2.loop, rep(0, dim(par.2.loop)[2]))
      #print("MATLOOP UPDATE SHODE")
      #print(mat.loop)
      #print("END")
    }
    #print(mat.loop)

    s<-s+1
  }
  colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step", "weights")

  time3 <- as.numeric(Sys.time())
  cat('loop took ', time3 - time2, ' seconds.\n')
  cat('total took ', time3 - time0, ' seconds.\n')

  result <- list(x.el=x.elements.list,data.sorted=data.sorted, data=arg.data, params=params, var.mean=var.mean, wights=weights.list ,freq.s=x.nb ,freq=x.nb)
  return(result)
}



#' VAR in other samples
mbhc.var<- function(mbhc.out, sampleNum=1){

  #varmean is geneated based on sorted data
  x.el.list<- mbhc.out$x.el
  data<- data.frame(mbhc.out$data.sorted)
  #this also assumes that the data was ran on prep.data code so the last column has ID.
  num<- dim(data)[2]-1
  var.mean.list<-list()

  ss.other<- data[,-c(sampleNum,dim(data)[2]),drop=F]
  x.el.red<- x.el.list[[1]]

  x.el.red.okay<- apply(x.el.red, 1, function(x) paste(x, collapse = ''))  #april 15



  for ( i in 1:num){
    var.mean<-c()
    x.data<- data[,i]
    names(data)

    #s<-7
    for( s in 1:length(x.el.list)){
      x.el<- x.el.list[[s]]

      temp<- t(apply(x.el, 1, get.var, x.nb=x.data, s=s))
      weights.temp <- apply(x.el, 1, get.weights.within, arg.data=ss.other)
      #okay<- apply( x.el, 1, function(x) nrow(merge(t(x),x.el.red)))
      #april 16
      okay<- apply(x.el, 1, function(x) sum(paste(x,collapse = '')%in%x.el.red.okay))


      temp <- cbind(temp, weights.temp, okay)
      var.mean<- rbind(var.mean, temp)
    }

    colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step","weights","okay")
    var.mean.list[[i]]<- var.mean
    names(var.mean.list)[i]<-paste(names(data)[i],".var.mean",sep="")


  }

  return(list(var.mean.all=var.mean.list, mbhc.out=mbhc.out))
}



#'
mbhc.find.cut.off.single <- function(var.mean, data, x.el.list){
  #detach('package:plyr')
  library(dplyr)

  time0 <- as.numeric(Sys.time())
  #print(dim(mag.output))
  #var.mean <- mag.output$var.mean
  var.mean <- data.frame(var.mean, row.names = NULL)
  #var.mean$id<-round(var.mean$mean*var.mean$NumberOfPoints,5)
  var.mean[is.na(var.mean)] <- 0
  var.mean.first <- var.mean
  #print(var.mean)
  var.mean$var <- round(var.mean$var, 3)
  #var.mean$var <- var.mean$var + 1e-7
  #id.max<-max(var.mean$id)
  ##################### add   okay structure to the var mean:
  #var.mean$okay<- 0
  #var.mean[var.mean$step==1,'okay']<- 1
  vars<- var.mean$var[var.mean$var>0]
  var.bound<- quantile(vars)[4]*1.5




  flag <- FALSE
  m <- c()
  str <- c()  #clone structure line from temp var.mean matrix
  str.f <- c()
  cutstr <- c()
  cutstr2 <- c()

  #pointsMax<- max(rowSums(x.el.list[[1]]))


  i <- max(var.mean$step)
  # allPoints <- mag.output$x.el[[i]]
  allPoints<- x.el.list[[i]]   #feb28
  ok.points <- allPoints - allPoints  ##### To generate clean slate
  n <- length(allPoints)
  #	print(allPoints)
  id.ok <-c() #clusters with okay variance!
  temp.mv <- c();
  temp.points <- c();
  temp.step <- c();
  temp.point.step <- c();
  while(flag == FALSE){
    #print(flag)
    print(i); flush.console();
    v <- i

    ############
    ## I need to use x.el to get the breaks, because using StepI would cause problem
    ## in identical clusters and mutations.
    ## TO FIX THIS I JUST KEPT MY ORIGINAL METHOD -
    ## I just duplicated the rows in the case of identical mutations
    ## because there should be onlytwo rows at each step
    ## and having only 1 row means identical clusters.
    ############
    stepI <- var.mean[var.mean$step==i,]
    stepI.x.el <- x.el.list[[i]]
    stepI.x.el.step <- stepI.x.el[nrow(stepI.x.el), ]  #Only look at the most recent cluster

    stepII <- var.mean[var.mean$step==i-1, ]  ## previous step
    #stepII$var<-stepII$var,)
    stepII.x.el <- x.el.list[[i-1]]

    #if( !(id.sum%in%id.ok)){    ###this means if the division on this step is not happening on the "ok" clusters. so the okay clusters should still exist in the temp.
    #if(sum(abs(id.ok-id.sum)<0.1)==0){
    #newtemp<-temp[!(temp$id%in%id.ok),]
    #if(!id.I%in%id.ok){
    temp.points <- rbind(temp.points, c(i, ok.points));
    temp.step <- rbind(temp.step, c(i, stepI.x.el.step));
    temp.point.step <- rbind(temp.point.step, c(i, sum(ok.points & stepI.x.el.step)))


    print('here')
    #print(ok.points)
    #print(stepI.x.el.step)
    #print(sum( ok.points & stepI.x.el.step)==0)

    if(sum(ok.points & stepI.x.el.step) == 0){     ###meaning stepI.x.el is not in ok.points

      #stepII.breaks<- stepII[!stepII$id%in%stepI,]
      print("stepI ")
      #print(stepI )
      #print("stepII ")
      #print(stepII )
      #stepII.breaks <- setdiff(stepII[,-c(4,6)], stepI[,-c(4,6)])
      stepII.breaks <- setdiff(stepII[,-4], stepI[,-4])
      if(nrow(stepII.breaks)==1){
        print("why did this happen????")
        stepII.breaks <- rbind(stepII.breaks, stepII.breaks)
      }
      stepII.num <- unique(stepII[, 4])
      # stepII.breaks <- cbind(stepII.breaks, stepII[1:2,c(4,6)])   #just add the step number back to the matrix        #############MARCh 14 add "okay"
      stepII.breaks <- cbind(stepII.breaks, step=stepII.num)   #MAY 9th
      stepII.breaks.x.el <- setdiff(stepII.x.el, stepI.x.el)

      #for( ii in 1:dim(newtemp)[1]){              ###stepII.breaks always has 2 rows
      for(ii in 1:2){
        #sample.sim <- cluster.sim(edep=depth, efrq=stepII.breaks[ii,1], shape=shape, n=10000)
        #var.sim <- c()
        #size <- ifelse(stepII.breaks$NumberOfPoints[ii]<5, 5, stepII.breaks$NumberOfPoints[ii])
        #print(size)
        #for(j in 1:500){
         # sample.sim.2 <- sample.sim[sample(1:10000, size=size),]   #temp[1,3]
          #var.sim <- c(var.sim, sd(sample.sim.2[, 4]))
        #}
       # m <- mean(var.sim) + 1e-6
      #  vv <- sd(var.sim)
       # vmax <- max(var.sim)
      #  temp.mv <- rbind(temp.mv, c(i, ii, m, vv, vmax, stepII.breaks[ii,2]))



        #if(stepII.breaks[ii, 'okay'] ==1 | sqrt(stepII.breaks[ii,2])*(1+stepII.breaks[ii, "weights"]) < m+vv) {        #okay cluster
        #if(stepII.breaks[ii, 'okay'] ==1 | sqrt(stepII.breaks[ii,2])*(1+stepII.breaks[ii, "weights"]) < m+vv) {        #okay cluster    ####change april 11
        if(stepII.breaks[ii, 'okay'] ==1 | stepII.breaks$var[ii]<var.bound) {        #okay cluster    ####change 6/2/2018

          str.f <- c(str.f, stepII.breaks[ii,1])
          #print("here")
          #print(str)
          str <- rbind(str, stepII.breaks[ii,])
          cutstr2 <- rbind(cutstr2, stepII.breaks.x.el[ii,])
          #print(str)

          #id.ok<-c(id.ok, stepII.breaks$id[ii])
          ok.points <- ok.points + stepII.breaks.x.el[ii,]
          #print('OKAY LAST')
          #print(sum(ok.points))
          #print(ok.points)

        }
      }
    }

    ###stop condition
    #if(abs(sum(str$id)-id.max)<0.000001){flag<-TRUE}else{i<-(v-1)}
    #print(i)      ###new condition: sum of id.okay== id.max

    #if(abs(sum(str$id)-id.max)<0.000001){flag<-TRUE}else{i<-(v-1)}

    ####new condition    oct 31
    #print("LAST PRINT")
    if(sum(ok.points>1)>0){
      print("ERROR")
    }
    if(sum(ok.points) == n){
      flag<-TRUE
    } else {
      i<-(v-1)
    }
    #if(v==i){iii<-TRUE}else{i<-(v-1)}
  }

  for( i in 1:nrow(str)){
    #print(i)
    xel <- x.el.list[[str$step[i]]]     #feb 28
    #print(xel)
    for(j in 1:nrow(xel)){
      m1 <- mean(data[xel[j,]==1])   #feb 28
      ##changed to sorted. after oc.9#### OCT 30: THis should be changed for OC11 since there is no sorting in that
      if(abs(m1-str[i,1]) < 0.001){
        cutstr<-rbind(cutstr, xel[j,])  ## what's the difference between cutstr and cutstr2????????
      }
    }
  }


  time1 <- as.numeric(Sys.time())
  cat("took ", (time1 - time0), " seconds.\n");
  #print(cutstr)
  colors<-colSums(cutstr*c(1:nrow(cutstr)))+1
  final.data<-cbind(data,colors)
  results<-list(str=str, cutclust=cutstr, cutclust2=cutstr2, colors=colors, final.data=final.data)
  return(results)
}
#' find cut off multiple
mbhc.find.cut.off.multiple <- function(mbhc.var){
  library(dplyr)
  if(is.null(mbhc.var[['var.mean.all']])) return('incomplte, run mbhc.var() first');

  final.data.clusters<- mbhc.var$mbhc.out$data.sorted
  cuts<-list()
  colors.all<-c()
  x.el<- mbhc.var$mbhc.out$x.el

  for(i in 1:length(mbhc.var[['var.mean.all']])) {
    var.mean<- mbhc.var$var.mean.all[[i]]
    data<- mbhc.var$mbhc.out$data.sorted[,i,drop=F]

    #print(shape)
    #print(depth)
    #shape=shape
    #depth=depth
    cut.temp<-mbhc.find.cut.off.single(var.mean = var.mean, data = data,x.el.list = x.el)
    cuts[[i]]<-cut.temp
    name.temp<-paste(colnames(data),'.color',sep = "")

    colors.all<-cbind(colors.all, cut.temp$colors*10^(2*i-2))
    final.data.clusters<-cbind(final.data.clusters, cut.temp$colors)

    colnames(colors.all)[dim(colors.all)[2]]<- name.temp
    colnames(final.data.clusters)[dim(final.data.clusters)[2]]<-name.temp
  }

  #print('colorsall')
  #print(colors.all)

  colors.all<-data.frame(colors.all)
  colors.all$final<- rowSums(colors.all)
  #colors.all$final
  library(plyr)
  #print(colors.all$final)
  updated.colors<-mapvalues(colors.all$final, from = unique(colors.all$final), to=c(1:length(unique(colors.all$final))))
  colors.all$colors.final<- updated.colors
  detach('package:plyr')
  final.data.assignments<- cbind(final.data.clusters, cluster=updated.colors)
  final.data.clusters<- cbind( mbhc.var$mbhc.out$data.sorted, cluster=updated.colors)

  return(list( final.data.clusters=final.data.clusters,final.data.all.samples=final.data.assignments, final.colors=colors.all))

}

#' MBHC Run
#'
#' @param data,name,sampleFreqs
mbhc.run<- function(data,name="",sampleFreqs= NA,sampleNum=1, depth=200){
  if(length(data$ID)==0){return(print("RUN mbhc.prepdata"))}
  if(sum(is.na(sampleFreqs))>0){ data=data }else{data=cbind(data[,sampleFreqs], ID=data$ID)}

  #data.prep<- mbhc.prepdata(data)
  # data should have id.
  data.prep<- data
  dim<-dim(data)[2]
  s<- sampleNum

  if(dim==2){
    print('Running Single Sample Clustering')
    mbhc.out.single<- mbhc.single(data.prep, sampleNum=s)
    mbhc.cut.single<- mbhc.find.cut.off(mbhc.out.single, depth=depth)
    results<- list(final.dataframe<- mbhc.cut.single$final.data, mbhc.output<- mbhc.out.single,
                   data.prep<- data.prep)
    names(results)<- c(paste(name,".final.data", sep = ""),paste(name,".mbhc.single.output", sep = ""),
                       paste(name,".data.prep",sep = ""))

  }

  if(dim>2){
    print('Running Multiple Sample Clustering')
    mbhc.red<- mbhc.reduce.graph(data.prep)
    mbhc.out<- mbhc.multiple.cut(data.prep, x.el.cut = mbhc.red$x.el.red, sampleNum=s)
    mbhc.mv <- mbhc.var(mbhc.out,sampleNum = s)
    mbhc.cut<- mbhc.find.cut.off.multiple(mbhc.mv, shape=shape, depth=depth)
    results <- list(final.dataframe<-mbhc.cut$final.data.clusters ,data.prep<- data.prep,
                   reduce.output<- mbhc.red, mbhc.output<- mbhc.out, mbhc.cutoff<- mbhc.cut, mbhc.mvar= mbhc.mv )
    names(results)<- c(paste(name,".final.data", sep = ""), paste(name,".data.prep",sep = ""),
                       paste(name,".reduce.output", sep = ""),
                       paste(name,".mbhc.out", sep = ""), paste(name,".mbhc.cut", sep = ""),
                       paste(name,".mbhc.vars", sep=""))

  }
  return(results)

}