R/MAGOS.R

Defines functions mag.cn.block mag.cn.call.tcn mag.multiple.run mag.single.run cut.off.single mag.single cut.off.multiple mag.var mag.multiple mag.reduce.graph mag.prepdata mag.exp.var.v3 get.var.v3.1 fit.elements.v3 fit.elements.reduce.v3 mag.dist.v3

Documented in cut.off.multiple cut.off.single mag.multiple mag.multiple.run mag.prepdata mag.reduce.graph mag.single mag.single.run mag.var

### April 2020.
#Added mag.multiple.run with fold
#mag.exp.var.v3....comment edep=mean(edep.vec)...

### packages:
library(igraph)
library(dplyr)
library(Matrix)


mag.dist.v3<- function(vafs, depths){
  vaf<- mean(vafs[vafs>0.001])
  if(sum(vafs>0.001)==0){
    vaf<- mean(vafs)
  }
  depth<- mean(depths)

  s1<- depth*vaf
  s2<- depth-s1

  #s1/(s1+s2)
  #R<- dbeta(vafs, shape1=s1, shape2=s2, log=T)
  R <-c(dbeta(vafs[vafs!=0.001], shape1=s1, shape2=s2, log=T),
        dbeta(vafs[vafs==0.001], shape1=s1, shape2=s2, log=T)) #log should be F- because if Log=T we could have positive and negative values. We should add log
  #mean(R)
  #plot(c(vafs[vafs!=0.001], vafs[vafs==0.001]),R)
  #rbind(vafs[vafs!=0.001], R[1:58])
  #dbeta(vafs, shape1=s1, shape2=s2, log=T)
  #v<- mad(vafs, center=mean(vaf.test), constant = 1)
  v<-sum((vafs-vaf)^2)/length(vafs)
  #v<- sqrt(v)
  range<- max(vafs)-min(vafs)
  #range<- sqrt(range)
  #range<- range^2
  #cat(v, ' ',range,' ',v*range,'\n')
  #print(range)
  #print(v*range)
  v <- ifelse(v<0.0005, 0.0005, v)
  range<- ifelse(range<0.005,0.005,range)

  if(mean(R)>0){
    R <- R/(range*v)#;mean(R)
  }
  if(mean(R)<0){
    R<- -R*range*v
  }
  dist <- -mean(R)#/exp(5*(weight+0.0001))
  #dist<- ifelse(dist>0, -0.001,dist)

  results<- c(dist=dist, s1=s1, s2=s2)
  #print(paste(results, 'end'))
  return(results)
}



#mARCH 1st
fit.elements.reduce.v3<- function(elements,vaf.data){#, depth.data){
  #  fit.obj<- c()
  # fit.minval<- c()
  #dist.val<-c()
  #dist.shape<- c()
  result <- c();
  for(i in 1:(dim(vaf.data)[2]-1)){
    #print(i)
    variants<- vaf.data[elements, i]
    #depths<- depth.data[elements, i]

    #dist<- mag.dist.v3(variants,depths)
    dist<- max(variants)- min(variants)
    result <- rbind(result, dist)
    # dist.val<- c(dist.val, dist['dist'])
    # dist.shape<- c(dist.shape, dist['s1'])

    #fit1<-optimize(y=variants,exp.depth=exp.depth ,f=optim.obj, interval = c(1,exp.depth-1),tol=0.001)
    #fit.obj<- c(fit.obj, fit1$object)
    #fit.minval<- c(fit.minval, fit1$minimum)
  }

  #return(c(max(fit.obj),fit.minval[which.max(fit.obj)]))
  #return(c(max(dist.val),dist.shape[which.max(dist.val)]))
  #return(result[which.max(result[,'dist']), c('dist', 's1')])
  return(max(result))
}
fit.elements.v3<- function(elements,vaf.data, depth.data){
  #  fit.obj<- c()
  # fit.minval<- c()
  #dist.val<-c()
  #dist.shape<- c()
  result <- c();
  for(i in 1:(dim(vaf.data)[2]-1)){
    #print(i)
    variants<- vaf.data[elements, i]
    depths<- depth.data[elements, i]

    dist<- mag.dist.v3(variants,depths)
    result <- rbind(result, dist)
    # dist.val<- c(dist.val, dist['dist'])
    # dist.shape<- c(dist.shape, dist['s1'])

    #fit1<-optimize(y=variants,exp.depth=exp.depth ,f=optim.obj, interval = c(1,exp.depth-1),tol=0.001)
    #fit.obj<- c(fit.obj, fit1$object)
    #fit.minval<- c(fit.minval, fit1$minimum)
  }

  #return(c(max(fit.obj),fit.minval[which.max(fit.obj)]))
  #return(c(max(dist.val),dist.shape[which.max(dist.val)]))
  return(result[which.max(result[,'dist']), c('dist', 's1')])
}
get.var.v3.1 <-function(x, x.nb, s) {
  x.nb<- x.nb[as.logical(x)]
  a <- mean(x.nb);
  b <- var(x.nb);
  c <- sum(as.logical(x));
  d <- s;
  return(c(a, b, c, d));
}
mag.exp.var.v3<-function(efrq.vec, edep.vec, num, n=500, type='M'){
  expt.var<-c()
  depth.keep<- c()
  vafs<- c()
  #depth.range<- max(edep.vec)-min(edep.vec)
  #if(sd(edep.vec)<10){edep.vec<- mean(edep.vec)}
  a<-quantile(edep.vec,0.1)
  b<-quantile(edep.vec,0.9)
  lowerf.1 <- quantile(efrq.vec,0.15)
  higherf.1 <- quantile(efrq.vec,0.85);


  if(length(edep.vec)>5){
    edep.vec<- edep.vec[edep.vec>=a&
                          edep.vec<=b]
  } else {
    edep.vec<-c(mean(edep.vec), mean(edep.vec))
  }
  if(length(efrq.vec)>5){
    efrq.vec<- efrq.vec[efrq.vec>=lowerf.1&#quantile(efrq.vec,0.05)&
                          efrq.vec<=higherf.1]#quantile(efrq.vec,0.95)]
  } else {
    efrq.vec<-c(mean(efrq.vec), mean(efrq.vec))
  }
  #print('asjdahskdjhasjkd')
  #print(length(edep.vec))
  for(i in 1:n){
    edep<- sample(edep.vec,1, replace=T)
    #edep=mean(edep.vec)
    #efrq<- sample((efrq.vec),1, replace=T)
    efrq<- mean(efrq.vec)
    s1<-efrq*edep
    s2<-(1-efrq)*edep
    x<-rbeta(num,shape1 = s1,shape2 = s2)
    #hist(x, 50)
    #var(x)
    vafs<- c(vafs, s1/(s1+s2))
    expt.var<- c(expt.var, var(x))
    depth.keep<- c(depth.keep, edep)
    i<-i+1
  }
  return(list(exp.vat=(expt.var), depths=depth.keep, vafs=vafs))
}


########################################
########################################          Prep data and other functions

mag.prepdata<- function(arg.data){
  num<- dim(arg.data)[2]/2   # number of samples.     ### in the format of ref, alt, ref, alt
  cat('Number of samples: ', num)
  arg.data<- data.frame(arg.data)
  vaf.data<- c()
  depth.data<- c()
  for(i in 1:num){
    depth<-arg.data[,i*2]+arg.data[,i*2-1]
    vaf<- arg.data[,i*2]/depth

    depth.data<- cbind(depth.data, depth)
    colnames(depth.data)[i]<-paste("depth.", i, sep='')
    vaf.data<- cbind(vaf.data, vaf)
    colnames(vaf.data)[i]<-paste("vaf.", i, sep='')
  }

  ### arg data should have the frequencies in rows and columns for samples. for 3 samples, 100 mutation: 3 columns,100rows.
  vaf.data <- as.matrix(vaf.data)
  vaf.data <- round(vaf.data,3)
  vaf.data <- ifelse(vaf.data<1e-3, 1e-3, vaf.data)
  vaf.data <- ifelse(vaf.data> 1-(1e-3), 1-(1e-3), vaf.data)


  counts.data<- data.frame(arg.data, row.names=NULL)
  vaf.data<- data.frame(vaf.data, row.names=NULL)
  depth.data<- data.frame(depth.data, row.names=NULL)

  ID<- c(1:dim(counts.data)[1])

  counts.data$ID<- ID
  vaf.data$ID<- ID
  depth.data$ID<- ID

  results<- list(counts=counts.data,vafs=vaf.data, depths=depth.data)
  return(results)
}







########################################
########################################          MAG Multiple V3.1


mag.reduce.graph<-function(prep.data,x.el=matrix(), n=-100, threshold=0.03){

  time0<- as.numeric(Sys.time())
  #sampleNum<-1  #added oct 11 2018

  order.var<- apply(prep.data$vafs[,-which(names(prep.data$vafs)=='ID')],1,prod)
  order.ind<- order(order.var)
  sorted.order.var<- order.var[order.ind]

  vaf.data<- prep.data$vafs
  depth.data<- prep.data$depths

  vaf.sorted<- vaf.data[order.ind,]
  depth.sorted<- depth.data[order.ind,]

  IDs<- vaf.sorted[, 'ID']


  # if(dim(x.el)[1]==0){
  #   l<- dim(vaf.data)[1]
  #   x.el<- data.frame(diag(l))
  #   names(x.el)<- IDs
  # }
  #FEB 21 2019
  if(dim(x.el)[1]==1 & dim(x.el)[2]==1){
    l<- dim(vaf.data)[1]
    x.el<- data.frame(diag(l))
    x.el<- sapply(x.el, as.logical)
    colnames(x.el)<- IDs
  }


  mean.frqs<-t(apply( x.el, 1, function(x){mean(sorted.order.var[x])}))
  x.el.sorted<-x.el[order(mean.frqs),]
  x.el.final<- c()


  #x.nb<- data.sorted[,sampleNum]
  #ss.other<- data.sorted[,-c(sampleNum, dim(data.sorted)[2]), drop=F]

  size.threshold<-10

  l<-nrow(x.el)
  full<-floor(l/size.threshold)
  rem<-l%%size.threshold
  row.fold<-0

  if( rem < 5 ){ full<- full-1; rem<- rem+size.threshold}


  # optim.value.bound<-fit.variants.weight(c(0.4,0.46,0.05))[1]


  #optim.value.bound<-fit.elements.v2(c(TRUE,TRUE,TRUE), cbind(c(0.4,0.46, 0.43),c(1,2,3)))[1]


  #depth.bound<- min(apply(depth.sorted[,-dim(depth.sorted)[2]],2,mean))



  # optim.value.bound<- fit.elements.v3(c(TRUE,TRUE), cbind(c(0.4,0.43),c(1,2)),
  #                                    cbind(c(depth.bound,depth.bound),c(1,2)))[1]
  optim.value.bound<-threshold



  ind.full<- c(1:size.threshold)
  combn.el<- t(combn(ind.full,2))

  if(full>0){
    for(i in 1:full){

      fold<- c((1+(i-1)*size.threshold):(i*size.threshold))
      x.el.fold<-x.el.sorted[fold, ]
      #print(i)
      #initial.full<-apply(combn.el, 1, function(x){fit.elements.weight.reduce( x.el.fold[x[1],],
      #                                                                        x.el.fold[x[2],],
      #                                                                       x.nb, ss.other )})

      #### march 1st:
      #size.fold<- ifelse(min(apply(x.el.fold,1,sum))<2, 2,
      #  min(apply(x.el.fold, 1,sum)))
      vafs.fold<- vaf.sorted[apply(x.el.fold, 2, function(x) sum(x)==1),]
      #depths.fold<- depth.sorted[apply(x.el.fold, 2, function(x) sum(x)==1),]


      initial.full<- apply(combn.el, 1, function(x){
        #fit.elements.v2(as.logical(x.el.fold[x[1],]+x.el.fold[x[2],]),data.sorted)})
        #fit.elements.v3(as.logical(x.el.fold[x[1],]+x.el.fold[x[2],]),vaf.sorted, depth.sorted)})
        #fit.elements.v3(x.el.fold[x[1],]|x.el.fold[x[2],],vaf.sorted, depth.sorted)})
        #MARCH 1st
        fit.elements.reduce.v3(x.el.fold[x[1],]|x.el.fold[x[2],],vaf.sorted)})


      tempMatLoop<-matrix(100, nrow=size.threshold, ncol=size.threshold)
      tempMatLoop.2<-lower.tri(diag(100, nrow=size.threshold, ncol=size.threshold))

      #MARCH 1st
      #tempMatLoop[tempMatLoop.2] <- initial.full[1, ]
      tempMatLoop[tempMatLoop.2] <- initial.full
      cat('**********')
      cat(class(tempMatLoop))
      mat.loop<-Matrix::t(tempMatLoop)
      mat.loop.g<- mat.loop
      mat.loop.g[mat.loop <= optim.value.bound]<- 1
      mat.loop.g[mat.loop> optim.value.bound]<- 0
      sum(mat.loop.g==1)
      g1<-graph_from_adjacency_matrix(mat.loop.g, mode = 'undirected')
      complete_points<- max_cliques(g1, min=2)
      if(length(complete_points)>0){
        x.el.graph<-c()
        el.rm.graph<-c()
        for(i in 1:length(complete_points)){
          el.rm<- as.numeric(complete_points[[i]])
          el.rm<- el.rm[!el.rm%in%el.rm.graph]
          if(length(el.rm)>0){
            x.el.graph<- rbind(x.el.graph, as.logical(colSums(x.el.fold[el.rm,, drop=F])))
            el.rm.graph<- c(el.rm.graph, el.rm)
          }}
        x.el.fold<- rbind(x.el.fold[-el.rm.graph, , drop=F],x.el.graph)
      }
      x.el.final<- rbind(x.el.final, x.el.fold)
      {  #while(  min(mat.loop)< optim.value.bound & nrow(mat.loop)>2){#

        # el.rm <- which(mat.loop==min(mat.loop), arr.ind=T)
        #  el.rm <- el.rm[1,]   # april 14

        #el.rm<- unique(as.numeric(el.rm))

        # mat.loop<- mat.loop[-el.rm, -el.rm, drop=F]
        #if(sum(dim(mat.loop))==0){mat.loop=matrix(c(0,0,0,0),nrow = 2)}
        #x.el.fold<- rbind( x.el.fold[-el.rm, , drop=F], colSums(x.el.fold[el.rm,]))
        #colnames(x.el.fold)<-row.names(data.sorted)
        #}
      }
    }}

  rems<- c((l-rem+1):l)
  #print(rems)
  x.el.rem<- x.el.sorted[rems,]
  ind.rem<- c(1:rem)
  combn.rem<- t(combn(ind.rem, 2))
  # initial.rems<-apply(combn.rem, 1, function(x){fit.elements.weight.reduce( x.el.rem[x[1],],
  #                                                                           x.el.rem[x[2],],
  #                                                                           x.nb, ss.other )})
  initial.rems<- apply(combn.rem, 1, function(x){
    #fit.elements.v2(as.logical(x.el.rem[x[1],]+x.el.rem[x[2],]),arg.data)})
    #fit.elements.v3(as.logical(x.el.rem[x[1],] +x.el.rem[x[2],]),vaf.sorted, depth.sorted)})
    #fit.elements.v3(x.el.rem[x[1],]|x.el.rem[x[2],],vaf.sorted, depth.sorted)})
    fit.elements.reduce.v3(x.el.rem[x[1],]|x.el.rem[x[2],],vaf.sorted)})



  ###may 6th
  tempMatLoop<-matrix(100, nrow=rem, ncol=rem)
  tempMatLoop.2<-lower.tri(diag(100, nrow=rem, ncol=rem))
  #tempMatLoop[tempMatLoop.2] <- initial.rems[1, ]
  tempMatLoop[tempMatLoop.2] <- initial.rems
  cat('**********')                                 
  cat(class(tempMatLoop))
  mat.loop<-Matrix::t(tempMatLoop)
  #diag(mat.loop) <- 1e5;
  mat.loop.g<- mat.loop
  mat.loop.g[mat.loop <= optim.value.bound]<- 1
  mat.loop.g[mat.loop> optim.value.bound]<- 0
  sum(mat.loop.g==1)
  g1<-graph_from_adjacency_matrix(mat.loop.g, mode = 'undirected')
  complete_points<- max_cliques(g1, min=2)
  x.el.graph<-c()
  el.rm.graph<-c()

  #print(complete_points)
  if(length(complete_points)>0){
    for(i in 1:length(complete_points)){
      el.rm<- as.numeric(complete_points[[i]])
      el.rm<- el.rm[!el.rm%in%el.rm.graph]
      if(length(el.rm)>0){
        x.el.graph<- rbind(x.el.graph,as.logical(colSums(x.el.rem[el.rm,,drop=F])))
        el.rm.graph<- c(el.rm.graph, el.rm)
      }}
    x.el.rem<- rbind(x.el.rem[-el.rm.graph, , drop=F],x.el.graph)
    colSums(x.el.rem)}
  x.el.final<- rbind(x.el.final, x.el.rem)
  {
    #while(  min(mat.loop)< -10 &nrow(mat.loop)>2){#
    #  el.rm<- which(mat.loop==min(mat.loop), arr.ind=T)
    #  el.rm<- unique(as.numeric(el.rm))

    # mat.loop<- mat.loop[-el.rm, -el.rm, drop=F]
    #  if(sum(dim(mat.loop))==0){mat.loop=matrix(c(0,0,0,0),nrow = 2)}
    #  x.el.rem<- rbind( x.el.rem[-el.rm, , drop=F], colSums(x.el.rem[el.rm,]))
    #  colnames(x.el.fold)<-row.names(data.sorted)
    #}
    #x.el.final<- rbind(x.el.final, x.el.rem)
  }

  results<- list(x.el.red=x.el.final, vaf.data=vaf.data,vaf.sorted=vaf.sorted,
                 depth.data=depth.data,depth.sorted=depth.sorted);
  nn <- nrow(results$x.el.red)
  print("nn")
  print(nn)
  time1<- as.numeric(Sys.time())
  print('TIME:')
  time<- time1-time0
  print(time)
  #jj<-jj+1
  if(nn < size.threshold | nn == n) {
    return(results)
  } else {
    #mag.reduce.2(results, sampleNum=sampleNum,n=nn)
    mag.reduce.graph(prep.data = prep.data,x.el=x.el.final,n=nn, threshold = threshold)
  }



}




### added okay calculation to mag.mult; Also, added the first x.el to x.el.list as the first element, so s should start from 2.
mag.multiple<- function(prep.data,x.el.cut=data.frame()) {


  if(1>dim(prep.data$vafs)[2]-1){print("stop the algorithm and check the sampleNum; run mag.prepdata() first")}

  time0<-as.numeric(Sys.time())
  params<-c();  x.values<-c();  x.steps<-c();  row.loop<-c();x.step<-c()
  x.elements.list <- list()


  ## Sep2019:
  okay.list=list()



  if(ncol(prep.data$vafs) == 2){        ##### change '1' to '2' because the 2nd column is always id
    print('executing single sample clustering....'); flush.console();

  }

  order.var<- apply(prep.data$vafs[,-which(names(prep.data$vafs)=='ID')],1,prod)
  order.ind<- order(order.var)
  sorted.order.var<- order.var[order.ind]

  vaf.data<- prep.data$vafs
  depth.data<- prep.data$depths

  vaf.sorted<- vaf.data[order.ind,]
  depth.sorted<- depth.data[order.ind,]

  IDs<- vaf.sorted[, 'ID']






  #oct 18 2018    I WILL KEEP THE SORTED VERSION.

  n<- dim(vaf.data)[1]


  if( sum(dim(x.el.cut))==0){
    print("Mag.reduce has not been run.")
    x.elements<-data.frame(diag(n))
    #names(x.elements)<-c(1:length(x.nb))   feb 26
    #names(x.elements)<- IDs     APRIL 11 2019
    x.elements<- sapply(x.elements, as.logical)  #APRIL 11 2019
    colnames(x.elements)<- IDs
  }else{
    x.elements<- x.el.cut
  }

  x.elements.list[[1]]=x.elements


  ### Sep 2019
  okay.id= c(1:nrow(x.elements))
  okay.list[[1]]= okay.id



  l<- nrow(x.elements)

  #### new matloop with reduced x.el:
  indx.cut<-c(1:nrow(x.elements))
  combn.cut <- t(combn(indx.cut, 2))

  ###no need to run weights...
  cat('initial fitting ... '); flush.console();
  time1 <- as.numeric(Sys.time())
  #initial.1<-apply(combn.cut, 1, function(x){fit.elements.weight( x.elements[x[1],], x.elements[x[2],],
  #                                                               x.nb, arg.data.other )})
  # x.elements<- sapply(x.elements, as.logical)
  #x.elements <- matrix(as.logical(as.matrix(x.elements)), ncol=ncol(x.elements), byrow=T)

  initial.1<- apply(combn.cut, 1, function(x){
    fit.elements.v3(c(x.elements[x[1],]|x.elements[x[2],]),vaf.sorted, depth.sorted)})

  time2 <- as.numeric(Sys.time())
  cat('took ', time2-time1, ' seconds.\n'); flush.console();

  tempMatLoop<-Matrix(1000000, nrow=l, ncol=l, sparse = T)  #may 6th   DEC 2018: added sparse
  tempMatLoop.2<-lower.tri(diag(1000000, nrow=l, ncol=l))
  tempMatLoop[tempMatLoop.2] <- initial.1[1, ]
  cat('**********')
  cat(class(tempMatLoop))
  mat.loop<-Matrix::t(tempMatLoop)
  #diag(mat.loop) <- 1e5;


  tempPar1Loop<-lower.tri(diag(1000000, nrow=l, ncol=l))
  #tempPar2Loop<-lower.tri(diag(0, nrow=l, ncol=l))
  tempPar1Loop[tempPar1Loop] <- initial.1[2, ]
  #tempPar2Loop[tempPar2Loop] <- initial.1[3, ]

  par.1.loop<-t(tempPar1Loop)
  #par.2.loop<-t(tempPar2Loop)
  #weights.step<-c()
  #var.mean <- c()


  time2<-as.numeric(Sys.time())

  s<-2
  while(nrow(x.elements) > 1){
    cat(s, '...', nrow(x.elements), '...\n'); flush.console();

    x.values[s]<-min(mat.loop)
    el1<-Matrix::which(mat.loop==min(mat.loop), arr.ind = T)[1,1]
    el2<-Matrix::which(mat.loop==min(mat.loop), arr.ind = T)[1,2]
    el.rm<-c(el1,el2)
    params<-c(params, par.1.loop[el1,el2])#, par.2.loop[el1,el2]))

    #what happenes at each step

    ### FEB 19 2019
    #x.step <- x.elements[el.rm[1], ] + x.elements[el.rm[2],]   ###this is very clever! I dont need to update the xloop.
    x.step <- x.elements[el.rm[1], ]|x.elements[el.rm[2],]   ###this is very clever! I dont need to update the xloop.



    ###only use x.elements rows for each cluster!
    x.steps <- rbind(x.steps, x.step)
    x.elements <- rbind(x.elements[-el.rm,], x.step)
    x.elements.list[[s]]<-x.elements


    ### Sep2019:
    #update.okay:
    okay.id=c(okay.id[-el.rm],0)
    okay.list[[s]]=okay.id



    #####UPDATE MATRIX HERE
    mat.loop<-mat.loop[-el.rm, -el.rm, drop=F]
    par.1.loop<-par.1.loop[-el.rm, -el.rm, drop=F]


    mat.column.update<-c()
    par1.column.update<-c()
    #par2.column.update<-c()

    x1 <- x.elements[1:(nrow(x.elements)-1), ,drop=F]
    x2 <- x.elements[nrow(x.elements), , drop=F]   #the ones that are put together
    print('here')
    dim(x.elements)
    print(dim(x1))
    print(dim(x2))
    print('there')
    #fit.between <- t(apply(x1, 1, fit.elements.weight, element.2=x2,
    #                      arg.data=x.loop, arg.data.other=arg.data.other))

    #oct 2018
    fit.between<- apply(x1, 1, function(x){
      #fit.elements.v2(as.logical(x+x2), data.sorted)} )
      #fit.elements.v3(as.logical(x+x2), vaf.sorted, depth.sorted)} )
      #FEB 2019 19

      fit.elements.v3(x|x2, vaf.sorted, depth.sorted)} )






    mat.column.update <- fit.between[1,]
    par1.column.update <- fit.between[2,]
    #par2.column.update <- fit.between[, 3]

    #print(mat.loop)
    if(nrow(mat.loop) != 0){
      mat.loop<-cbind(mat.loop, mat.column.update )
      mat.loop<-rbind(mat.loop, rep(1000000,dim(mat.loop)[2]))    #just to keep mat.loop square.
      par.1.loop<-cbind(par.1.loop, par1.column.update)
      par.1.loop<-rbind(par.1.loop, rep(1000000, dim(par.1.loop)[2]))
      # par.2.loop<-cbind(par.2.loop, par2.column.update)
      # par.2.loop<-rbind(par.2.loop, rep(0, dim(par.2.loop)[2]))
      #print("MATLOOP UPDATE SHODE")
      #print(mat.loop)
      #print("END")
    }
    # print(mat.loop)

    s<-s+1
  }
  # colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step", "weights")

  time3 <- as.numeric(Sys.time())
  cat('loop took ', time3 - time2, ' seconds.\n')
  cat('total took ', time3 - time0, ' seconds.\n')

  result <- list(x.el=x.elements.list,vaf.sorted=vaf.sorted,depth.sorted=depth.sorted,
                 prep.data=prep.data, okay.list=okay.list)
  return(result)
}






###Updated SEP2019: okay calculation from MAG
mag.var<- function(mag.out){

  #varmean is geneated based on sorted data
  x.el.list<- mag.out$x.el
  data<- data.frame(mag.out$vaf.sorted)
  #this also assumes that the data was ran on prep.data code so the last column has ID.
  num<- dim(data)[2]-1
  var.mean.list<-list()

  okay.list=mag.out$okay.list



  # SEP 2019: no need used okay.2 instead
  # x.el.red<- x.el.list[[1]]
  #  x.el.red.okay<- apply(x.el.red, 1, function(x) paste(x, collapse = ''))  #april 15
  #
  #
  #
  # SEP19: the x.el.red.IDs should be number of rows, not number of columns.
  # x.el.red.IDs= c(1:nrow(x.el.red))
  #

  for ( i in 1:num){
    var.mean<-c()
    x.data<- data[,i]
    names(data)
    depth.sorted.each<- mag.out$depth.sorted[,i]
    #s<-7
    for( s in 2:length(x.el.list)){
      x.el<- x.el.list[[s]]

      #sep19:
      okay.2=okay.list[[s]]

      temp<- t(apply(x.el, 1, get.var.v3.1, x.nb=x.data, s=s))

      # okay<- apply(x.el, 1, function(x){y=paste(x,collapse = ''); ind=x.el.red.okay%in%y;
      # res=ifelse(sum(ind)==0,0,x.el.red.IDs[ind])
      # return(res)})



      #print(cbind(okay.2, okay))  # SEP 2019: TESTED PERFECTLY! CAN JUST USE OKAY.2

      okay=okay.2

      depths<- apply(x.el,1, function(x) round(mean(depth.sorted.each[as.logical(x)])))



      #temp <- cbind(temp, weights.temp, okay)
      temp <- cbind(temp, depths ,okay)
      var.mean<- rbind(var.mean, temp)
    }

    #colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step","weights","okay")
    colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step","depth","okay")
    var.mean.list[[i]]<- var.mean
    names(var.mean.list)[i]<-paste(names(data)[i],".var.mean",sep="")


  }

  return(list(var.mean.all=var.mean.list, mag.out=mag.out))
}





####add all depths:


### UPDATED PROBABILIES.
cut.off.multiple  <-function(mag.var){
  time0 <- as.numeric(Sys.time())
  #var.mean <- mag.output$var.mean
  mag.var.list<-mag.var$var.mean.all

  ### FEB 13:
  depths.data<- mag.var$mag.out$prep.data$depths
  vafs.data<- mag.var$mag.out$prep.data$vafs

  for( i in 1:length(mag.var.list)){
    temp<- mag.var.list[[i]]
    temp<- data.frame(temp, row.names = NULL)
    temp[is.na(temp)]<-0
    temp$var<- round(temp$var, 6)
    temp$var<- temp$var+1e-7
    mag.var.list[[i]]<-temp
  }
  names(mag.var.list)<- names(mag.var$var.mean.all)
  x.el.list<- mag.var$mag.out$x.el


  flag <- FALSE
  m <- c()
  str <- c()  #clone structure line from temp var.mean matrix
  str.f <- c();  cutstr <- c();cutstr2 <- c()

  i <- max(mag.var.list[[1]]$step)
  # allPoints <- mag.output$x.el[[i]]
  allPoints<- x.el.list[[i]]   #feb28
  ok.points <- allPoints - allPoints  ##### To generate clean slate
  n <- length(allPoints)
  #	print(allPoints)
  id.ok <-c() #clusters with okay variance!
  temp.mv <- c(); temp.points <- c(); temp.step <- c();temp.point.step <- c();

  while(flag == FALSE){
    print(i); flush.console();
    ############
    ## I need to use x.el to get the breaks, because using StepI would cause problem
    ## in identical clusters and mutations.
    ## TO FIX THIS I JUST KEPT MY ORIGINAL METHOD -
    ## I just duplicated the rows in the case of identical mutations
    ## because there should be onlytwo rows at each step
    ## and having only 1 row means identical clusters.
    ############
    v <- i
    stepI.x.el <- x.el.list[[i]]
    stepI.x.el.step <- stepI.x.el[nrow(stepI.x.el), ]  #Only look at the most recent cluster
    stepII.x.el <- x.el.list[[i-1]]
    # stepII.breaks.x.el <- dplyr::setdiff(stepII.x.el, stepI.x.el)
    stepII.breaks.x.el <- dplyr::setdiff(as.data.frame(stepII.x.el), as.data.frame(stepI.x.el))

    temp.points <- rbind(temp.points, c(i, ok.points));
    temp.step <- rbind(temp.step, c(i, stepI.x.el.step));
    temp.point.step <- rbind(temp.point.step, c(i, sum(ok.points & stepI.x.el.step)))

    #print(sum( ok.points & stepI.x.el.step)==0)

    keep<- c(0,0)
    for( j in 1:length(mag.var.list)){
      var.mean<- mag.var.list[[j]]
      stepI <- var.mean[var.mean$step==i,]
      stepII <- var.mean[var.mean$step==i-1, ]  ## previous step

      if(sum(ok.points & stepI.x.el.step) == 0){     ###meaning stepI.x.el is not in ok.points
        #stepII.breaks <- setdiff(stepII[,-c(4,6)], stepI[,-c(4,6)])
        stepII.breaks <- dplyr::setdiff(stepII[,-4], stepI[,-4])
        if(nrow(stepII.breaks)==1){
          # print("why did this happen????")
          stepII.breaks <- rbind(stepII.breaks, stepII.breaks)
        }
        stepII.num <- unique(stepII[, 4])
        stepII.breaks <- cbind(stepII.breaks, step=stepII.num)   #MAY 9th
        # stepII.breaks <- cbind(stepII.breaks, stepII[1:2,c(4,6)])   #just add the step number back to the matrix        #############MARCh 14 add "okay"
        #for( ii in 1:dim(newtemp)[1]){              ###stepII.breaks always has 2 rows
        for(ii in 1:2){
          #sample.sim <- cluster.sim(edep=depth, efrq=stepII.breaks[ii,1], shape=shape, n=10000)
          var.sim <- c()
          size <- ifelse(stepII.breaks$NumberOfPoints[ii]<5, 5, stepII.breaks$NumberOfPoints[ii])
          depth.vec<-depths.data[depths.data$ID%in%names(stepII.breaks.x.el[ii,as.logical(stepII.breaks.x.el[ii,]),drop=F]),j]

          vaf.vec<-vafs.data[vafs.data$ID%in%names(stepII.breaks.x.el[ii,as.logical(stepII.breaks.x.el[ii,]),drop=F]),j]

          #var.sim<- mag.exp.var.v2.1(efrq = stepII.breaks[ii,1], edep.vec =depth.vec , num=size)
          #var.sim<- var.sim$exp.vat

          if(mean(depth.vec)>150){type<-'L'}
          if(mean(depth.vec)<=150){type<-'L'}
          if(mean(depth.vec)<=40){type<-'L'}

          #print(type)

          var.sim<- mag.exp.var.v3(efrq.vec = vaf.vec, edep.vec =depth.vec , num=size, type=type)
          var.sim<- var.sim$exp.vat
          #print('done mag')
          # print(sd(var.sim))


          m <- mean(var.sim) + 1e-6
          vv <- sd(var.sim)
          vmax <- max(var.sim)

          if(length(vaf.vec)>50 & max(vaf.vec)-min(vaf.vec)>0.05){
            ### added the second condition to ignore the clusters with all equal vafs.
            var.check<- var(vaf.vec[-which.max(vaf.vec)[1]])
          }else{
            var.check<- stepII.breaks[ii,2]
          }


          var.threshold<- m+2*vv
          #print(var.threshold)
          if(type=='H'){var.threshold<- m+3*vv}
          #   if(var(vaf.vec)!=0){
          #  test<-varTest(vaf.vec,sigma.squared = m+3*vv)
          # }
          #if(var(vaf.vec)==0){
          #  test<- list(p.value=1)
          #}

          var.threshold<- m+3*vv
          if(var.threshold>0.001){
            var.threshold<- round(var.threshold,3)
          }

          if(stepII.breaks[ii, 'okay'] >0 |
             #floor(var.check*10^4)/10^4 <  ceiling((round(m+3*vv,4)+0.00001)*10^4)/10^4) {
             floor(var.check*10^4)/10^4< m+3*vv  ) {

            keep[ii]<-keep[ii]+1
          }#changed to vmax on oct 14 2018
        }
      }
    }

    for(ii in 1:2){
      if(keep[ii]==length(mag.var.list)){
        cutstr2 <- rbind(cutstr2, stepII.breaks.x.el[ii,])
        #id.ok<-c(id.ok, stepII.breaks$id[ii])
        str <- rbind(str, stepII.breaks[ii,])
        ok.points <- ok.points + stepII.breaks.x.el[ii,]}
    }

    if(sum(ok.points>1)>0){
      print("ERROR")
    }
    if(sum(ok.points) == n){
      flag<-TRUE
    } else {
      i<-(v-1)
    }
    #if(v==i){iii<-TRUE}else{i<-(v-1)}
  }

  time1 <- as.numeric(Sys.time())
  cat("took ", (time1 - time0), " seconds.\n");
  #print(cutstr)
  colors<-colSums(cutstr2*c(1:nrow(cutstr2)))+1
  data<- mag.var$mag.out$vaf.sorted
  final.data<-cbind(data,colors)


  #PROBS:
  t=final.data

  t.1=c()
  for(i in 1:(ncol(t)-2)){
    temp=c()
    for(j in unique(t$colors)){
      #print(j)
      m.1=mean(t[t$colors==j,i])
      v.1=var(t[t$colors==j,i])

      a <- ((1 - m.1) / v.1 - 1 / m.1) * m.1 ^ 2
      b <- a * (1 / m.1 - 1)

      temp=rbind(temp,c(j, a, b))
    }
    colnames(temp)=c('colors',paste0('alpha.',i), paste0('beta.',i))
    t.1=cbind(t.1,temp[,-1])
  }
  t.1=cbind(unique(t$colors), t.1)
  colnames(t.1)[1]='colors'

  #head(t)

  t_means= t%>%select(contains(c('vaf','color')))%>%group_by(colors)%>%summarise_all(mean)
  i=1

  prob.all=c()
  for(i in 1:nrow(t)){
    vf=t[i,1:(ncol(t)-2)]
    #t.1
    prob.s=c()
    #j=1
    for( j in 1:length(vf)){
      #j is the sample number

      probs=c()
      prob=c()
      for(ii in unique(t$colors)){
        v=vf[,j]
        shape1 = t.1[t.1[,1]==ii,1+(2*j-1)]
        shape2=t.1[t.1[,1]==ii, 1+2*j]
        clust_mean= t_means[t_means$colors==ii, j+1]%>%pull()

        prob_flag= ifelse(v<clust_mean, TRUE, FALSE )

        pp=pbeta(v, shape1, shape2 , lower.tail = prob_flag)

        prob= c(prob, round(pp,3))
        #prob=c(prob,dbeta(vf[,j], shape1 = t.1[t.1[,1]==ii,1+(2*j-1)], shape2=t.1[t.1[,1]==ii, 1+2*j]))

      }
      prob.s=rbind(prob.s, prob)
      prob.s=apply(prob.s, 2,min )
    }
    prob.all=rbind(prob.all, prob.s)
  }
  colnames(prob.all)=paste0('colors.', unique(t$colors))
  rownames(prob.all)=NULL
  final.probs=cbind(t, prob.all)






  results<-list(str=str, cutclust=cutstr, cutclust2=cutstr2, colors=colors,probs=final.probs ,final.data=final.data)
  return(results)
}


########################################
########################################          MAG SINGLE V3
mag.single<- function(prep.data){

  if(is.null(prep.data$vafs$ID)){ return(print('Run mag.prepdata on the data'))}
  time0 <- as.numeric(Sys.time())


  vaf.data<- prep.data$vafs
  vaf.data<- round(vaf.data,3)    #change 1 (aug6,2018)
  vaf.data[,-dim(vaf.data)[2]]<- ifelse(vaf.data[,-dim(vaf.data)[2]] < 1e-3, 1e-3, vaf.data[,-dim(vaf.data)[2]])
  vaf.data[,-dim(vaf.data)[2]]<- ifelse(vaf.data[,-dim(vaf.data)[2]] > 1-1e-3, 1-1e-3, vaf.data[,-dim(vaf.data)[2]])

  depth.data<- prep.data$depths


  x.nb <- vaf.data;
  if(!is.null(ncol(vaf.data)) && ncol(vaf.data) > 0) {
    x.nb <- vaf.data$vaf.1
  }
  params<-c();  x.values<-c();  x.steps<-c();  row.loop<-c(); x.step<-c()
  x.elements.list<-list()
  x.elements<-data.frame(diag(length(x.nb)))



  sort.x<-sort(vaf.data$vaf.1)

  #vaf.depth<- merge(prep.data$vafs,prep.data$depths, by="ID")

  #x.loop<-sort.x
  sort.vaf.data <- vaf.data[order(vaf.data$vaf.1),]
  sort.depth.data<- depth.data[order(vaf.data$vaf.1),]


  names(x.elements)<-sort.vaf.data$ID
  #names(x.el.first)<- sort.vaf.data$ID

  ### APRIL 11 2019
  x.elements<- sapply(x.elements, as.logical)
  colnames(x.elements)<- sort.vaf.data$ID


  x.elements.preprc<- c()
  for(i in unique(sort.x)){
    #temp<-colSums(x.elements[sort.x==i,])
    temp<- as.logical(apply(x.elements[sort.x==i,, drop=F],2, sum ))  # APRIL 11 2019
    x.elements.preprc<- rbind(x.elements.preprc, temp)

  }

  #names(x.elements.preprc)<- sort.vaf.data$ID
  colnames(x.elements.preprc)<- sort.vaf.data$ID   # APRIL 11 2019

  ## names checked! correct!



  #### LOOK HERE:       THIS CHANGES TO X.element.preprc
  #x.el.first<- data.frame(diag(length(x.nb)))
  #x.el.first.okay<- apply(x.el.first, 1, function(x) paste(x, collapse = '')) # april 15


  # SEP 2019 commented:
  #x.el.first.okay<- apply(x.elements.preprc, 1, function(x) paste(as.numeric(x), collapse = '')) # april 15
  #####   Sep 2019: I dont even need x.el.first okay. I just need the initial ids. Then use el.rm to update it.
  #####   sep 2019: c

  x.el.first.okay.id<- c(1:dim(x.elements.preprc)[1])
  l<- dim(x.elements.preprc)[1]

  mat.loop<-matrix(1000000, ncol=l, nrow=l)
  par.1.loop<-matrix(100000, ncol=l, nrow=l)

  s.pairs <- cbind(1:(l-1), 2:l)
  s.likls <- t(apply(s.pairs, 1, function(x){
    fit.elements.v3(x.elements.preprc[x[1],]|x.elements.preprc[x[2],], sort.vaf.data, sort.depth.data)    #### APRIL 11 2019
  }))
  #mat.loop<-diag(0, nrow=l, ncol=l)
  for(i in 2:l){
    mat.loop[i-1,i] <- s.likls[i-1, 1]      ### the s.likls is the likelihood between i-1)th element and the i th.
    par.1.loop[i-1,i] <- s.likls[i-1, 2]
    # par.2.loop[i-1,i] <- s.likls[i-1, 3]
  }

  time1 <- as.numeric(Sys.time())
  var.mean<-c()

  ### aug12 add the step 0 to var.mean
  #temp<- t(apply(x.elements.preprc, 1, get.var, x.nb=sort.x, s=0 ))
  temp<- t(apply(x.elements.preprc, 1, get.var.v3.1, x.nb=sort.x, s=0 ))
  #okay<- apply(x.elements.preprc, 1, function(x) sum(paste(x, collapse = '')%in%x.el.first.okay))

  # okay<- apply(x.elements.preprc, 1, function(x) #dec 6 2018
  # {y=paste(as.numeric(x), collapse = '');  #APRIL 11 2019 added as.numeric
  # ind=x.el.first.okay%in%y;
  # res=ifelse(sum(ind)==0, 0, x.el.first.okay.id[ind])
  # return(res)
  # })
  okay=x.el.first.okay.id

  #weights<- rep(0, length(okay))  ##### just to have weight  APRIL 15

  ###add depth to var.mean    dec 6 2018
  #  depths<- apply(x.elements.preprc,1, function(x) round(mean(sort.depth.data[as.logical(x),'depth.1'])))

  depths<- apply(x.elements.preprc,1, function(x) round(mean(sort.depth.data[x,'depth.1'])))   # APRIL 11 2019


  temp<- cbind(temp,depths ,okay)

  var.mean<- rbind(var.mean, temp)
  s<-1
  x.elements<- x.elements.preprc

  m1=as.matrix(x.elements)
  m.preprc=Matrix(m1, sparse = T)   ###just to save the preproces matrix
  #x.elements.list[[1]]<-m2
  ###need to create new "n' for number of mutations. to find neighbours
  n<-length(sort.x)
  while(nrow(x.elements)>= 2){
    # print("INJA")
    # print(s)
    #print(mat.loop)

    #print(min(mat.loop))
    x.values[s]<-min(mat.loop)
    el1<-Matrix::which(mat.loop==min(mat.loop), arr.ind = T)[1,1]
    el2<-Matrix::which(mat.loop==min(mat.loop), arr.ind = T)[1,2]
    el.rm<-c(el1,el2)
    # print("Remove ina")
    # print(el.rm)
    params<-rbind(params,par.1.loop[el1,el2])

    #what happenes at each step
    #x.step <- x.elements[el.rm[1],] +x.elements[el.rm[2],]   ###this is very clever! I dont need to update the xloop.
    x.step <- x.elements[el.rm[1],] |x.elements[el.rm[2],]  # APRIL 11 2019



    ###only use x.elements rows for each cluster!
    x.steps<-rbind(x.steps, x.step)
    #print(x.steps)
    x.elements<-rbind(x.elements[-el.rm,], x.step)


    ########## change to sparse matrix saving:

    m1<- as.matrix(x.elements)
    m2<- Matrix(m1, sparse = T)
    x.elements.list[[s]]<- m2

    #### april 14 add "okay"
    temp<- t(apply(x.elements, 1, get.var.v3.1, x.nb=sort.x, s=s ))          ####for oc.v9 I changed x.nb=x.nb to x.nb=
    #####
    #okay<- apply(x.elements, 1, function(x) nrow(merge(t(x), x.el.first)))               ################# APRIL 14 2018   takeeeesss a long time
    ####################### faster version? :
    # okay<- apply(x.elements, 1, function(x) sum(paste(x, collapse = '')%in%x.el.first.okay))
    #print(s)
    # print('1.8')
    # SEP 19: commented the next 7 lines! I just need one line and use el.rm to update the okay.
    #okay<- apply(x.elements, 1, function(x) #dec 6 2018
    #{y=paste(as.numeric(x), collapse = '');   # APRIL 11 2019 added the numeric
    #ind=x.el.first.okay%in%y;
    #res=ifelse(sum(ind)==0, 0, x.el.first.okay.id[ind])
    #res=ifelse(y%in%x.el.first.okay,x.el.first.okay.id[which(x.el.first.okay==y)],0)
    #return(res)
    #})
    okay= c(okay[-el.rm],0)
    #print('1.9')
    # cbind(okay, c(okay.org[-el.rm],0))
    # okay.org=okay
    # weights<- rep(0, length(okay))  ##### just to have weight  APRIL 15


    ###add depth to var.mean    dec 6 2018
    depths<- apply(x.elements,1, function(x) round(mean(sort.depth.data[as.logical(x),'depth.1'])))

    #temp<- cbind(temp,depths,weights ,okay)
    temp<- cbind(temp,depths ,okay)   # removed the weigths APRIL 11 2019

    var.mean<- rbind(var.mean, temp)

    #####UPDATE MATRIX HERE
    mat.loop<-mat.loop[-el.rm, -el.rm]
    par.1.loop<-par.1.loop[-el.rm, -el.rm]
    #par.2.loop<-par.2.loop[-el.rm, -el.rm]

    mat.column.update<-c()
    par1.column.update<-c()
    #par2.column.update<-c()

    max<-max(which(x.step%in%1))
    min<-min(which(x.step%in%1))
    #		print(paste("min", min))
    nei<-c()
    if(min == 1 & max != n){
      neiIND<-which(x.elements[,max+1]==1)
      nei<-x.elements[neiIND,, drop=F]
      #			print("if1")
    }else if(max == n & min != 1){
      neiIND<-which(x.elements[,min-1]==1)
      nei<-x.elements[neiIND,, drop=F]
      #			print("if2")
    }else if( max!=n & min!=1){
      nei1<-which(x.elements[,min-1]==1)
      nei2<-which(x.elements[,max+1]==1)
      neiIND<-c(nei1,nei2)
      nei<-x.elements[neiIND,, drop=F]
      #			print("if3")
    }
    #print(nei)

    if(length(nei) > 0){
      x.last <- x.elements[nrow(x.elements), , drop=F]

      temp <- apply(nei, 1, function(x){
        #fit.elements.v3(as.logical(x+x.last), vaf.data=sort.vaf.data,depth.data=sort.depth.data)
        fit.elements.v3(x|x.last, vaf.data=sort.vaf.data,depth.data=sort.depth.data)   # APRIL 11 2019

      })

      ###try to only look at cluster neighbours
      ###instead of dim i will use sqrt of length. Because mat.loop is always square matrix   WORKS!!! SEP12
      l2<-sqrt(length(mat.loop))
      mat.column.update<-rep(100000,l2)
      mat.column.update[neiIND]<-temp[1,]
      par1.column.update<-rep(100000, l2)
      par1.column.update[neiIND]<-temp[2,]
      #par2.column.update<-rep(0, l2)
      #par2.column.update[neiIND]<-temp[3,]

      mat.loop<-cbind(mat.loop, mat.column.update )
      mat.loop<-rbind(mat.loop, rep(100000,dim(mat.loop)[2]))    #just to keep mat.loop square.
      par.1.loop<-cbind(par.1.loop, par1.column.update)
      par.1.loop<-rbind(par.1.loop, rep(100000, dim(par.1.loop)[2]))
      #par.2.loop<-cbind(par.2.loop, par2.column.update)
      #par.2.loop<-rbind(par.2.loop, rep(0, dim(par.2.loop)[2]))
    }

    s<-s+1
  }

  #var.mean <-cbind(var.mean, 1/(var.mean[,3]))
  colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step","depth", 'okay')

  time2 <- as.numeric(Sys.time())
  cat("while took ", (time2 - time1), " seconds.\n")
  cat("total took ", (time2 - time0), " seconds.\n")

  result <- list(x.el=x.elements.list,x.el.preprocess=m.preprc,
                 params=params, var.mean=var.mean, prep.data=prep.data,
                 vaf.sorted=sort.vaf.data,depth.sorted= sort.depth.data,
                 freq.s= sort.x, time=list(time0,time1, time2))
  return(result)
}


cut.off.single <- function(mag.output){
  time0 <- as.numeric(Sys.time())
  #print(dim(mag.output))
  var.mean <- mag.output$var.mean
  head(var.mean)
  var.mean <- data.frame(var.mean, row.names = NULL)
  #var.mean$id<-round(var.mean$mean*var.mean$NumberOfPoints,5)
  var.mean[is.na(var.mean)] <- 0
  #var.mean.first <- var.mean
  #print(var.mean)
  var.mean$var <- round(var.mean$var, 6)
  var.mean$var <- var.mean$var + 1e-7
  #id.max<-max(var.mean$id)
  flag <- FALSE
  m <- c()
  str <- c()  #clone structure line from temp var.mean matrix
  str.f <- c()
  cutstr <- c()
  cutstr2 <- c()

  i <- max(var.mean$step)
  ##added aug 12:
  if(i==1){
    flag=T
    str<-var.mean
  }
  #print(flag)
  allPoints <- mag.output$x.el[[i]]   ### added +1 with the AUG6th2018 version, because the x.element.list has an extra element.
  allPoints<- as.data.frame(as.matrix(allPoints))
  ok.points <- allPoints - allPoints  ##### To generate clean slate
  n <- length(allPoints)
  #	print(allPoints)
  id.ok <-c() #clusters with okay variance!
  temp.mv <- c();
  temp.points <- c();
  temp.step <- c();
  temp.point.step <- c();



  var.sim.zero<- mag.exp.var.v3(efrq.vec = mag.output$vaf.sorted$vaf.1, edep.vec =mag.output$depth.sorted$depth.1 , num=n)    #### APRIL 11 2019
  var.sim.zero<- var.sim.zero$exp.vat

  m <- mean(var.sim.zero, na.rm=T) + 1e-6
  vv <- sd(var.sim.zero, na.rm=T)
  if(var(mag.output$vaf.sorted$vaf.1) < m+3*vv ){
    flag<-TRUE
    print('Only one clone detected')
    str<- var.mean[var.mean$step==i,]
    colors<- rep(1, n)
  }



  {
    while(flag == FALSE){
      #print(flag)
      # print('umad?')
      # print(flag)
      #print('enter')
      #print(i); flush.console();
      v <- i

      ############
      ## I need to use x.el to get the breaks, because using StepI would cause problem
      ## in identical clusters and mutations.
      ## TO FIX THIS I JUST KEPT MY ORIGINAL METHOD -
      ## I just duplicated the rows in the case of identical mutations
      ## because there should be onlytwo rows at each step
      ## and having only 1 row means identical clusters.
      ############

      stepI <- var.mean[var.mean$step==i,]
      stepI.x.el <- mag.output$x.el[[i]]
      stepI.x.el<- as.data.frame(as.matrix(stepI.x.el))

      stepI.x.el.step <- stepI.x.el[nrow(stepI.x.el), ]  #Only look at the most recent cluster

      stepII <- var.mean[var.mean$step==i-1, ]  ## previous step
      #stepII$var<-stepII$var,)
      if(i>1){
        stepII.x.el <- mag.output$x.el[[i-1]]
        stepII.x.el<- as.data.frame(as.matrix(stepII.x.el))
      }
      if(i==1){
        stepII.x.el <- mag.output$x.el.preprocess
        stepII.x.el<- as.data.frame(as.matrix(stepII.x.el))

      }
      #print("II")
      #print(stepII.x.el)

      #if( !(id.sum%in%id.ok)){    ###this means if the division on this step is not happening on the "ok" clusters. so the okay clusters should still exist in the temp.
      #if(sum(abs(id.ok-id.sum)<0.1)==0){
      #newtemp<-temp[!(temp$id%in%id.ok),]
      #if(!id.I%in%id.ok){
      temp.points <- rbind(temp.points, c(i, ok.points));
      temp.step <- rbind(temp.step, c(i, stepI.x.el.step));
      temp.point.step <- rbind(temp.point.step, c(i, sum(ok.points & stepI.x.el.step)))


      #print(ok.points)
      #print(sum(ok.points & stepI.x.el.step))
      if(sum(ok.points & stepI.x.el.step) == 0){     ###meaning stepI.x.el is not in ok.points

        #stepII.breaks<- stepII[!stepII$id%in%stepI,]
        #print("stepI ")
        #print(stepI )
        #print("stepII ")
        #print(stepII )
        #stepII.breaks <- setdiff(stepII[,-c(4,7)], stepI[,-c(4,7)])
        stepII.breaks <- dplyr::setdiff(stepII[,-4], stepI[,-4])   #dec 6 2018
        if(nrow(stepII.breaks)==1){
          #print("why did this happen????")
          stepII.breaks <- rbind(stepII.breaks, stepII.breaks)
        }
        stepII.num <- stepII[, 4]
        stepII.breaks <- cbind(stepII.breaks, stepII[1:2,4])   #just add the step number back to the matrix
        stepII.breaks.x.el <- dplyr::setdiff(stepII.x.el, stepI.x.el)

        #for( ii in 1:dim(newtemp)[1]){              ###stepII.breaks always has 2 rows
        for(ii in 1:2){
          #sample.sim <- cluster.sim(edep=depth, efrq=stepII.breaks[ii,1], shape=shape, n=10000)
          var.sim <- c()
          size <- ifelse(stepII.breaks$NumberOfPoints[ii]<5, 5, stepII.breaks$NumberOfPoints[ii])
          #print(size)
          #for(j in 1:500){
          # sample.sim.2 <- sample.sim[sample(1:10000, size=size),]   #temp[1,3]
          #  var.sim <- c(var.sim, var(sample.sim.2[, 4]))
          #}
          #var.sim<- mag.exp.var.v2(efrq=stepII.breaks[ii,1], edep=stepII.breaks[ii,'depth'], num=size)

          #print(stepII.breaks.x.el)

          vafs<- mag.output$vaf.sorted$vaf.1[as.matrix(stepII.breaks.x.el[ii,])]                 #### APRIL 11 2019
          depths<- mag.output$depth.sorted$depth.1[as.matrix(stepII.breaks.x.el[ii,])]

          var.sim<- mag.exp.var.v3(efrq.vec = (vafs) , edep.vec = (depths) , num=size)    #### APRIL 11 2019
          var.sim<- var.sim$exp.vat
          #print(var.sim)

          ###MAY 2019:
          if(length(vafs)>20 & max(vafs)-min(vafs)>0.05){
            #print('UMAD TOOOOOOOOO')
            ### added the second condition to ignore the clusters with all equal vafs.
            var.check<- var(vafs[-which.max(vafs)[1]])
          }else{
            var.check<- stepII.breaks[ii,2]}




          m <- mean(var.sim, na.rm=T) + 1e-6
          vv <- sd(var.sim, na.rm=T)
          vmax <- max(var.sim, na.rm=T)
          #temp.mv <- rbind(temp.mv, c(i, ii, m, vv, vmax, stepII.breaks[ii,2]))

          #if(stepII.breaks[ii, 'NumberOfPoints'] < 11 | stepII.breaks[ii,2]*(1+stepII.breaks[ii, "weights"]) < m+vv) {        #okay cluster
          #if(stepII.breaks[ii, 'okay'] ==1 | (stepII.breaks[ii,2])*(1+stepII.breaks[ii, "weights"]) < m+vv) {        #okay cluster   APRIL 15
          # print(stepII.breaks)
          #print(vv)
          if(stepII.breaks[ii, 'okay'] >0 |
             # stepII.breaks[ii,2] < m+3*vv ) {   #dec 6 2018
             var.check<m+3*vv){
            str.f <- c(str.f, stepII.breaks[ii,1])
            #print("here")
            #print(str)
            str <- rbind(str, stepII.breaks[ii,])
            #print(stepII.breaks.x.el)
            cutstr2 <- rbind(cutstr2, stepII.breaks.x.el[ii,])
            #print(str)

            #id.ok<-c(id.ok, stepII.breaks$id[ii])
            ok.points <- ok.points + stepII.breaks.x.el[ii,]
            #print(ok.points)
          }
        }
      }

      ###stop condition
      #if(abs(sum(str$id)-id.max)<0.000001){flag<-TRUE}else{i<-(v-1)}
      #print(i)      ###new condition: sum of id.okay== id.max

      #if(abs(sum(str$id)-id.max)<0.000001){flag<-TRUE}else{i<-(v-1)}

      ####new condition    oct 31
      #print("LAST PRINT")
      if(sum(ok.points>1)>0){
        print("ERROR")
      }
      if(sum(ok.points) == n){
        flag<-TRUE
      } else {
        i<-(v-1)
      }
      #if(v==i){iii<-TRUE}else{i<-(v-1)}
    }

    for( i in 1:nrow(str)){
      #print(i)
      if(str$step[i]!=0){
        xel <- mag.output$x.el[[str$step[i]]]}
      if(str$step[i]==0){
        xel <- mag.output$x.el.preprocess}

      #print(xel)
      for(j in 1:nrow(xel)){
        m1 <- mean(mag.output$freq.s[xel[j,]==1])    ##changed to sorted. after oc.9#### OCT 30: THis should be changed for OC11 since there is no sorting in that
        if(abs(m1-str[i,1]) < 0.001){
          cutstr<-rbind(cutstr, xel[j,])  ## what's the difference between cutstr and cutstr2????????
        }
      }
    }


    time1 <- as.numeric(Sys.time())
    cat("took ", (time1 - time0), " seconds.\n");
    #print(cutstr)
    colors<-colSums(cutstr*c(1:nrow(cutstr)))+1
  }

  final.data<-cbind(mag.output$vaf.sorted,colors)
  #print('hi')
  #  print(final.data)

  #PROBS:
  t=final.data


  t.1=c()
  for(i in 1:(ncol(t)-2)){
    temp=c()
    for(j in unique(t$colors)){
      #print(j)
      m.1=mean(t[t$colors==j,i])
      v.1=var(t[t$colors==j,i])

      a <- ((1 - m.1) / v.1 - 1 / m.1) * m.1 ^ 2
      b <- a * (1 / m.1 - 1)

      temp=rbind(temp,c(j, a, b))
    }
    colnames(temp)=c('colors',paste0('alpha.',i), paste0('beta.',i))
    t.1=cbind(t.1,temp[,-1, drop=F])
  }
  t.1=cbind(unique(t$colors), t.1)
  colnames(t.1)[1]='colors'

  #head(t)

  t_means= t%>%select(contains(c('vaf','color')))%>%group_by(colors)%>%summarise_all(mean)
  i=1

  prob.all=c()
  for(i in 1:nrow(t)){
    vf=t[i,1:(ncol(t)-2), drop=F]
    #t.1
    prob.s=c()
    #j=1
    for( j in 1:length(vf)){
      #j is the sample number

      probs=c()
      prob=c()
      for(ii in unique(t$colors)){
        v=vf[,j]
        shape1 = t.1[t.1[,1]==ii,1+(2*j-1)]
        shape2=t.1[t.1[,1]==ii, 1+2*j]
        clust_mean= t_means[t_means$colors==ii, j+1]%>%pull()

        prob_flag= ifelse(v<clust_mean, TRUE, FALSE )

        pp=pbeta(v, shape1, shape2 , lower.tail = prob_flag)

        prob= c(prob, round(pp,3))
        #prob=c(prob,dbeta(vf[,j], shape1 = t.1[t.1[,1]==ii,1+(2*j-1)], shape2=t.1[t.1[,1]==ii, 1+2*j]))

      }
      prob.s=rbind(prob.s, prob)
      prob.s=apply(prob.s, 2,min )
    }
    prob.all=rbind(prob.all, prob.s)
  }
  colnames(prob.all)=paste0('colors.', unique(t$colors))
  rownames(prob.all)=NULL
  final.probs=cbind(t, prob.all)





  results<-list(str=str, cutclust=cutstr, cutclust2=cutstr2, colors=colors, final.data=final.data, probs=final.probs)
  return(results)
}






####################################
####################################        MAG RUN

mag.single.run<- function(input.data, fold=F){
  data.prep<- mag.prepdata(input.data)
  purity<- 1
  mag<- mag.single(data.prep)
  cut<- cut.off.single(mag)
  temp<- merge(cut$final.data, data.prep$depths, by='ID')
  #plot(temp$vaf.1, temp$depth.1, col=temp$colors, pch=19)
  sum1<-temp%>%group_by(colors)%>%summarize(meanVAF=mean(vaf.1))
  purity<- 2*max(sum1$meanVAF)
  if(purity>1 & fold==F){purity='There are clusters with frequency higher than 0.5- consider folding.'}
  if(fold==T & sum(sum1$meanVAF>0.5)>0){
    fold.colors<- sum1$colors[sum1$meanVAF>0.5]
    data.folded<- temp
    data.folded$vaf.1[data.folded$colors%in%fold.colors]<- 1-data.folded$vaf.1[data.folded$colors%in%fold.colors]
    data.prep.fold<- data.prep
    data.prep.fold$vafs$vaf.1<- data.folded$vaf.1
    mag<- mag.single(data.prep.fold)
    cut<- cut.off.single(mag)
    temp<- merge(cut$final.data, data.prep$depths, by='ID')
    #plot(temp$vaf.1, temp$depth.1, col=temp$colors, pch=19)
    sum2<- temp%>%group_by(colors)%>%summarize(meanVAF=mean(vaf.1))
    purity<- 2*max(sum2$meanVAF)
  }
  temp<- temp[,c(2,4,3,1)]
  res<- list(mag=mag, cut=cut, results=temp, fold=fold, purity=purity)
  return(res)
}

# APRIL 2020: added fold.
mag.multiple.run<- function(input.data, reduce=T, threshold=0.03, fold= F){
  cat('*******');flush.console()
  prep= mag.prepdata(input.data)
  purity=c()

  if(reduce){
    red=mag.reduce.graph(prep, threshold = threshold)
    mag=mag.multiple(prep, x.el.cut = red$x.el.red)
    mv=mag.var(mag)
    cut=cut.off.multiple(mv)
  }
  if(!reduce){
    #red=mag.reduce.graph(prep)
    mag=mag.multiple(prep)#, x.el.cut = red$x.el.red)
    mv=mag.var(mag)
    cut=cut.off.multiple(mv)
  }

  temp=cut$final.data
  #plot(temp$vaf.1, temp$vaf.2, col=temp$colors, xlim=c(0,1), ylim=c(0,1))
  sum1=temp%>%group_by(colors)%>%summarise_all(mean)
  purity=apply(sum1, 2, max)*2
  purity=purity[c(-1, -length(purity))]

  fs=which(sum1[,-c(1, ncol(sum1))]>0.5, arr.ind = T)
  print('what')
  print(purity)
  if(fold==F & any(purity>1)){purity='There are clusters with frequency higher than 0.5- consider folding.'
  print(purity)}

  if(fold==T & nrow(fs)>0){
    fold.prep= prep
    for( i in 1:nrow(fs)){
      bad_col= as.numeric(sum1[fs[i,1],1 ])
      bad_sam= fs[i,2]

      bad_ID=temp$ID[temp$colors==bad_col]
      head(temp)

      fold.prep$vafs[fold.prep$vafs$ID%in%bad_ID,bad_sam]=1-fold.prep$vafs[fold.prep$vafs$ID%in%bad_ID,bad_sam]
    }

    #plot(fold.prep$vafs$vaf.1, fold.prep$vafs$vaf.2, xlim=c(0,1), ylim=c(0,1))


    if(reduce){
      red=mag.reduce.graph(fold.prep, threshold = threshold)
      mag=mag.multiple(fold.prep, x.el.cut = red$x.el.red)
      mv=mag.var(mag)
      cut=cut.off.multiple(mv)
    }
    if(!reduce){
      #red=mag.reduce.graph(prep)
      mag=mag.multiple(fold.prep)#, x.el.cut = red$x.el.red)
      mv=mag.var(mag)
      cut=cut.off.multiple(mv)
    }
  }
  temp=cut$final.data
  #plot(temp$vaf.1, temp$vaf.2, col=temp$colors, xlim=c(0,1), ylim=c(0,1))
  sum1=temp%>%group_by(colors)%>%summarise_all(mean)
  purity=apply(sum1, 2, max)*2
  purity=purity[c(-1, -length(purity))]

  res=list(results=temp, purity=purity, reduce=reduce, mag=mag, cut=cut)
  return(res)

}





####################################
####################################        CNV ASSIGNMENT




## determine the best cluster of each SNV
## input:   cl.cnv is the output from mag.cn.block()
##
mag.cn.call.tcn=function(cut, cn.input, cl.cnv, upper=10){
  cn.output=c()
  temp=c()
  temp2=data.frame(cut$final.data%>%group_by(colors)%>%summarise_all(funs(mean)))

  for(m in 1:nrow(cn.input[[1]])){  ## each snv in CNV region
    res=c()
    for(c in unique(temp2$colors)){  ## each cluster containing a snv
      for(s in 1:length(cn.input)){  ## each sample
        cn.sample=cn.input[[s]][m, ]
        cn.tmp=cn.sample$cn
        cn.vaf=cn.sample$vaf
        cn.blk=cn.sample$block
        blk.cl = cl.cnv[which(cl.cnv$block == cn.blk), 'cl.cnv']
        vaf.tcn=temp2[temp2$colors==blk.cl, s+1]
        for(k in 1:max(min(upper, cn.tmp), 1)){
          vaf.sc=temp2[temp2$colors==c, s+1]
          exp.vaf=2*k*vaf.sc/(2*vaf.tcn*cn.tmp+(1-2*vaf.tcn)*2)
          res=rbind(res,c(abs(cn.vaf-exp.vaf),s,c,blk.cl,k))
        }
      }
    }
    colnames(res)=c('error', 'sample', 'cl.snv', 'cl.cnv', 'k')
    agg = aggregate(error ~ cl.snv + cl.cnv + k, data=res, 'sum')
    best = agg[which.min(agg$error), ]
    cn.output = rbind(cn.output, best)
  }
  if(nrow(cut$final.data)<nrow(cn.input[[1]])){
    print('Warning: number of CNV events is greater than SNVs. CNV assignment may not be reliable.')}
  return(data.frame(cn.output))
}

## determine the best cluster a CNV belongs to
## it is the majority vote of all SNVs in a CNV block
## input: cut is from mag
## input: cn.input is a list of data frames.
## each data frame has 3 columns: vaf, cn that is tcn, and block that is the CNV block ID.
## input: upper is the max cnv it searches
## output: a data frame with 3 columns: cluster a cnv belongs to, block ID of a CNV, number of SNVs that support the CNV cluster assignment
mag.cn.block=function(cut, cn.input, upper=10){
  cn.output=c()
  temp=c()
  temp2=data.frame(cut$final.data%>%group_by(colors)%>%summarise_all(funs(mean)))

  for(m in 1:nrow(cn.input[[1]])){  ## each snv in CNV region
    res=c()
    for(c in unique(temp2$colors)){  ## each cluster containing a snv
      for(r in unique(temp2$colors)){  ## each cluster containing a cnv region
        for(s in 1:length(cn.input)){  ## each sample
          cn.sample=cn.input[[s]][m, ]
          cn.tmp=cn.sample$cn
          cn.blk=cn.sample$block
          cn.vaf=cn.sample$vaf
          for(k in 1:max(min(upper, cn.tmp), 1)){
            vaf.sc=temp2[temp2$colors==c, s+1]
            vaf.tcn=temp2[temp2$colors==r, s+1]
            exp.vaf=2*k*vaf.sc/(2*vaf.tcn*cn.tmp+(1-2*vaf.tcn)*2)
            res=rbind(res,c(abs(cn.vaf-exp.vaf),s,c,r,k,cn.blk))
          }
        }
      }
    }
    colnames(res)=c('error', 'sample', 'cl.snv', 'cl.cnv', 'k', 'block')
    agg = aggregate(error ~ cl.snv + cl.cnv + k + block, data=res, 'sum')
    best = agg[which.min(agg$error), ]
    cn.output = rbind(cn.output, best)
  }
  block.best.cl = do.call(rbind, lapply(split(cn.output, cn.output$block), function(x) {
    a = aggregate(cl.snv ~ cl.cnv + block, data=x, 'length')
    a[which.max(a$cl.snv), ]
  }))

  if(nrow(cut$final.data)<nrow(cn.input[[1]])){
    print('Warning: number of CNV events is greater than SNVs. CNV assignment may not be reliable.')}
  return(block.best.cl)
}
liliulab/magos documentation built on June 1, 2024, 3:31 a.m.