get.var <-function(x, x.nb, s, weight=1) {
a <- mean(x.nb[as.logical(x)]);
b <- var(x.nb[as.logical(x)])*weight;
c <- sum(as.logical(x));
d <- s;
return(c(a, b, c, d));
}
fit.variants.weight <- function(variants){ #to calculate the initial matrix
weight <- variants[length(variants)]
variants <- variants[-length(variants)]
#fit<-optim(par1, y=variants, fn=optim.obj, weight=weight,method="Brent",lower=1.5, upper=1000)#, control=list(factr=1e-3, maxit=0, trace=0)) #, lower=1.5, upper=1000, method='L-BFGS-B'
fit1<-optimize(y=variants,weight=weight,exp.depth=exp.depth ,f=optim.obj, interval = c(1,199),tol=0.001)#,lower=1, upper=200, control=list(factr=1e-3, maxit=200, trace=0)) #, lower=1.5, upper=1000, method='L-BFGS-B'
return(c(fit1$object,fit1$minimum))#,c(fit3$optim$bestval, fit3$optim$bestmem)))
}
fit.variants <- function(variants){ #to calculate the initial matrix
return(fit.variants.weight(c(variants, -0.0001)));
}
fit.elements <- function(element.1, element.2, arg.data){
data.1 <- arg.data[as.logical(element.1)]
data.2 <- arg.data[as.logical(element.2)]
data.merge <- c(data.1, data.2)
fit <- fit.variants(data.merge)
return(fit)
}
fit.elements.weight <- function(element.1, element.2, arg.data, arg.data.other){
data.1 <- arg.data[as.logical(element.1)]
data.2 <- arg.data[as.logical(element.2)]
weight <- get.weights.within(element.1 + element.2, arg.data.other)
data.merge <- c(data.1, data.2, 10*weight)
fit <- fit.variants.weight(data.merge)
print(data.merge)
return(fit)
}
fit.elements.weight.reduce <- function(element.1, element.2, arg.data, arg.data.other){
data.1 <- arg.data[as.logical(element.1)]
data.2 <- arg.data[as.logical(element.2)]
weight <- get.weights.within(element.1 + element.2, arg.data.other)
data.merge <- c(data.1, data.2, 10*weight)
fit <- fit.variants.weight(data.merge)
#print(data.merge)
return(fit)
}
get.weights.between <- function(x1, x2, arg.data) {
xx <- x1;
if(is.vector(x1)) {
xx <- t(as.data.frame(x1));
}
weights <- apply(xx, 1, function(x) { x <- x + x2; return(get.weights.within(x, arg.data=arg.data)) } );
return(weights)
}
get.weights.within <- function(x1, arg.data) { #arg.data should be the remaining samples
weights <- c()
#print(arg.data)
if(sum(x1)>1){
weights <- apply(arg.data, 2, function(x){ return(max(dist(x[as.logical(x1)]))) })
} else{
weights <- 0
}
#print(weis)
return(max(weights))
}
optim.obj <- function(par1,exp.depth=exp.depth, y, weight=-0.0001){
R <-dbeta(y, shape1=par1, shape2=exp.depth-par1, log=T) #log should be F- because if Log=T we could have positive and negative values. We should add log
#print(R)
#R[which(R < 1e-10)] <- 0;
#if(length(which(R == 0)) > 0) {
#return(1000);
#}
# print(R)
# print(paste(y, shapes))
# print(mean(R))
v <- sd(y)
range<- max(y)-min(y)
#print(v)
#print('v')
#print(v)
#print('range')
#print(range)
# print('v,meanR,logR, expWei')
v <- ifelse(v<0.005, 0.005, v)
range<- ifelse(range<0.005,0.005,range)
#R <- R/v
R <- R/(range*v)
# print(mean(R))
#R <- log(R)
# print('mean')
# print(mean(R))
#print(exp(5*(weight+0.0001)))
results <- -mean(R)/exp(5*(weight+0.0001))
#print(paste(results, 'end'))
return(results)
} #MAY 6th version. Uses range- only one par.
#' MBHC preparing the data
#'
#' This function creates ID for the data and preprocess the data for the MBHC.
#' @param arg.data the input data. In the dataframe format.
#' @export
#' mbhc.prepdata()
mbhc.prepdata<- function(arg.data){
### arg data should have the frequencies in rows and columns for samples. for 3 samples, 100 mutation: 3 columns,100rows.
arg.data <- as.matrix(arg.data)
arg.data <- round(arg.data,3)
arg.data <- ifelse(arg.data<1e-3, 1e-3, arg.data)
arg.data <- ifelse(arg.data> 1-(1e-3), 1-(1e-3), arg.data)
prep.data<- data.frame(arg.data, row.names=NULL)
prep.data$ID<- c(1:dim(prep.data)[1])
return(prep.data)
}
#' Model based hc
#'
#' This function performs the model-based hierarchical clustering
#'
#' @param arg.data,sampleNum Arg.data is the output of mbhc.prepdata function with IDs added. sampleNum for single sample should be 1
#' @export
#'
mbhc.single <- function(arg.data, sampleNum=1){
if(is.null(arg.data$ID)){ return(print('Run mag.prep on the data'))}
time0 <- as.numeric(Sys.time())
arg.data<- round(arg.data,3)
arg.data[,-dim(arg.data)[2]]<- ifelse(arg.data[,-dim(arg.data)[2]] < 1e-3, 1e-3, arg.data[,-dim(arg.data)[2]])
arg.data[,-dim(arg.data)[2]]<- ifelse(arg.data[,-dim(arg.data)[2]] > 1-1e-3, 1-1e-3, arg.data[,-dim(arg.data)[2]])
x.nb <- arg.data;
if(!is.null(ncol(arg.data)) && ncol(arg.data) > 0) {
x.nb <- arg.data[,sampleNum]
}
params<-c(); x.values<-c(); x.steps<-c(); row.loop<-c(); x.step<-c()
x.elements.list<-list()
x.elements<-data.frame(diag(length(x.nb)))
x.el.first<- data.frame(diag(length(x.nb)))
x.el.first.okay<- apply(x.el.first, 1, function(x) paste(x, collapse = '')) # april 15
sort.x<-sort(arg.data[,sampleNum])
#sort.x<- ifelse(sort.x < 1e-9, 1e-9, sort.x)
#sort.x<- ifelse(sort.x > 1-1e-4, 1-1e-3, sort.x)
#x.loop<-sort.x
sort.arg.data<- arg.data[order(arg.data[,sampleNum]),]
names(x.elements)<-sort.arg.data$ID
names(x.el.first)<- sort.arg.data$ID
l<-length(sort.x)
mat.loop<-matrix(10^5, ncol=l, nrow=l)
par.1.loop<-matrix(0, ncol=l, nrow=l)
#par.2.loop<-matrix(0, ncol=l, nrow=l)
s.pairs <- cbind(sort.x[1:(l-1)], sort.x[2:l])
#print(s.pairs)
s.likls <- t(apply(s.pairs, 1, fit.variants)) #calculated likelihoods on the pairs
#mat.loop<-diag(0, nrow=l, ncol=l)
for(i in 2:l){
mat.loop[i-1,i] <- s.likls[i-1, 1] ### the s.likls is the likelihood between i-1)th element and the i th.
par.1.loop[i-1,i] <- s.likls[i-1, 2]
# par.2.loop[i-1,i] <- s.likls[i-1, 3]
}
time1 <- as.numeric(Sys.time())
var.mean<-c()
s<-1
while(nrow(x.elements)>= 2){
# print("INJA")
# print(s)
#print(mat.loop)
#print(min(mat.loop))
x.values[s]<-min(mat.loop)
el1<-which(mat.loop==min(mat.loop), arr.ind = T)[1,1]
el2<-which(mat.loop==min(mat.loop), arr.ind = T)[1,2]
el.rm<-c(el1,el2)
# print("Remove ina")
# print(el.rm)
params<-rbind(params,par.1.loop[el1,el2])
#what happenes at each step
x.step <- x.elements[el.rm[1],] + x.elements[el.rm[2],] ###this is very clever! I dont need to update the xloop.
###only use x.elements rows for each cluster!
x.steps<-rbind(x.steps, x.step)
#print(x.steps)
x.elements<-rbind(x.elements[-el.rm,], x.step)
x.elements.list[[s]]<-x.elements
#### april 14 add "okay"
temp<- t(apply(x.elements, 1, get.var, x.nb=sort.x, s=s )) ####for oc.v9 I changed x.nb=x.nb to x.nb=
#####
#okay<- apply(x.elements, 1, function(x) nrow(merge(t(x), x.el.first))) ################# APRIL 14 2018 takeeeesss a long time
####################### faster version? :
okay<- apply(x.elements, 1, function(x) sum(paste(x, collapse = '')%in%x.el.first.okay))
weights<- rep(0, length(okay)) ##### just to have weight APRIL 15
temp<- cbind(temp,weights ,okay)
var.mean<- rbind(var.mean, temp)
#####UPDATE MATRIX HERE
mat.loop<-mat.loop[-el.rm, -el.rm]
par.1.loop<-par.1.loop[-el.rm, -el.rm]
#par.2.loop<-par.2.loop[-el.rm, -el.rm]
mat.column.update<-c()
par1.column.update<-c()
#par2.column.update<-c()
max<-max(which(x.step%in%1))
min<-min(which(x.step%in%1))
# print(paste("min", min))
nei<-c()
if(min == 1 & max != l){
neiIND<-which(x.elements[,max+1]==1)
nei<-x.elements[neiIND,]
# print("if1")
}else if(max == l & min != 1){
neiIND<-which(x.elements[,min-1]==1)
nei<-x.elements[neiIND,]
# print("if2")
}else if( max!=l & min!=1){
nei1<-which(x.elements[,min-1]==1)
nei2<-which(x.elements[,max+1]==1)
neiIND<-c(nei1,nei2)
nei<-x.elements[neiIND,]
# print("if3")
}
if(length(nei) > 0){
x.last <- x.elements[nrow(x.elements), ]
#print('sortx')
#print(sort.x)
#print(x.last)
#print(nei)
#print('done')
temp <- apply(nei, 1, fit.elements, arg.data=sort.x, element.2=x.last)
###try to only look at cluster neighbours
###instead of dim i will use sqrt of length. Because mat.loop is always square matrix WORKS!!! SEP12
l2<-sqrt(length(mat.loop))
mat.column.update<-rep(10^5,l2)
mat.column.update[neiIND]<-temp[1,]
par1.column.update<-rep(0, l2)
par1.column.update[neiIND]<-temp[2,]
#par2.column.update<-rep(0, l2)
#par2.column.update[neiIND]<-temp[3,]
mat.loop<-cbind(mat.loop, mat.column.update )
mat.loop<-rbind(mat.loop, rep(10^5,dim(mat.loop)[2])) #just to keep mat.loop square.
par.1.loop<-cbind(par.1.loop, par1.column.update)
par.1.loop<-rbind(par.1.loop, rep(0, dim(par.1.loop)[2]))
#par.2.loop<-cbind(par.2.loop, par2.column.update)
#par.2.loop<-rbind(par.2.loop, rep(0, dim(par.2.loop)[2]))
}
s<-s+1
}
#var.mean <-cbind(var.mean, 1/(var.mean[,3]))
colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step", "weights", 'okay')
time2 <- as.numeric(Sys.time())
cat("while took ", (time2 - time1), " seconds.\n")
cat("total took ", (time2 - time0), " seconds.\n")
result <- list(x.el=x.elements.list, params=params, var.mean=var.mean, data=arg.data, data.sorted=sort.arg.data,freq.s= sort.x, time=list(time0,time1, time2))
return(result)
}
#' find cut off
#'
#' This function finds the optimal cluster assignment for the data.
#' @param mbhc.output The output of mbhc.single is the input of find.cut.off
#' @export
mbhc.find.cut.off <- function(mbhc.output){
library(dplyr)
time0 <- as.numeric(Sys.time())
var.mean <- mbhc.output$var.mean
var.mean <- data.frame(var.mean, row.names = NULL)
var.mean[is.na(var.mean)] <- 0
var.mean$var <- round(var.mean$var, 4)
vars<- var.mean$var[var.mean$var>0]
var.bound<- quantile(vars)[4]*1.5
flag <- FALSE
m <- c()
str <- c() #clone structure line from temp var.mean matrix
str.f <- c()
cutstr <- c()
cutstr2 <- c()
i <- max(var.mean$step)
allPoints <- mbhc.output$x.el[[i]]
ok.points <- allPoints - allPoints ##### To generate clean slate
n <- length(allPoints)
# print(allPoints)
id.ok <-c() #clusters with okay variance!
temp.mv <- c();
temp.points <- c();
temp.step <- c();
temp.point.step <- c();
while(flag == FALSE){
#print(flag)
print(i); flush.console();
v <- i
############
## I need to use x.el to get the breaks, because using StepI would cause problem
## in identical clusters and mutations.
## TO FIX THIS I JUST KEPT MY ORIGINAL METHOD -
## I just duplicated the rows in the case of identical mutations
## because there should be onlytwo rows at each step
## and having only 1 row means identical clusters.
############
stepI <- var.mean[var.mean$step==i,]
stepI.x.el <- mbhc.output$x.el[[i]]
stepI.x.el.step <- stepI.x.el[nrow(stepI.x.el), ] #Only look at the most recent cluster
stepII <- var.mean[var.mean$step==i-1, ] ## previous step
#stepII$var<-stepII$var,)
stepII.x.el <- mbhc.output$x.el[[i-1]]
print("II")
#print(stepII.x.el)
#if( !(id.sum%in%id.ok)){ ###this means if the division on this step is not happening on the "ok" clusters. so the okay clusters should still exist in the temp.
#if(sum(abs(id.ok-id.sum)<0.1)==0){
#newtemp<-temp[!(temp$id%in%id.ok),]
#if(!id.I%in%id.ok){
temp.points <- rbind(temp.points, c(i, ok.points));
temp.step <- rbind(temp.step, c(i, stepI.x.el.step));
temp.point.step <- rbind(temp.point.step, c(i, sum(ok.points & stepI.x.el.step)))
print(ok.points)
print(sum(ok.points & stepI.x.el.step))
if(sum(ok.points & stepI.x.el.step) == 0){ ###meaning stepI.x.el is not in ok.points
stepII.breaks <- setdiff(stepII[,-c(4,6)], stepI[,-c(4,6)])
if(nrow(stepII.breaks)==1){
print("why did this happen????")
stepII.breaks <- rbind(stepII.breaks, stepII.breaks)
}
stepII.num <- stepII[, 4]
stepII.breaks <- cbind(stepII.breaks, stepII[1:2,c(4,6)]) #just add the step number back to the matrix
stepII.breaks.x.el <- setdiff(stepII.x.el, stepI.x.el)
for(ii in 1:2){
if(stepII.breaks[ii, 'okay'] ==1 | stepII.breaks$var[ii]<var.bound) { #okay cluster APRIL 15
str.f <- c(str.f, stepII.breaks[ii,1])
#print("here")
#print(str)
str <- rbind(str, stepII.breaks[ii,])
print(stepII.breaks.x.el)
cutstr2 <- rbind(cutstr2, stepII.breaks.x.el[ii,])
#print(str)
#id.ok<-c(id.ok, stepII.breaks$id[ii])
ok.points <- ok.points + stepII.breaks.x.el[ii,]
#print(ok.points)
}
}
}
#print("LAST PRINT")
if(sum(ok.points>1)>0){
print("ERROR")
}
if(sum(ok.points) == n){
flag<-TRUE
} else {
i<-(v-1)
}
#if(v==i){iii<-TRUE}else{i<-(v-1)}
}
for( i in 1:nrow(str)){
#print(i)
xel <- mbhc.output$x.el[[str$step[i]]]
#print(xel)
for(j in 1:nrow(xel)){
m1 <- mean(mbhc.output$freq.s[xel[j,]==1]) ##changed to sorted. after oc.9#### OCT 30: THis should be changed for OC11 since there is no sorting in that
if(abs(m1-str[i,1]) < 0.001){
cutstr<-rbind(cutstr, xel[j,]) ## what's the difference between cutstr and cutstr2????????
}
}
}
time1 <- as.numeric(Sys.time())
cat("took ", (time1 - time0), " seconds.\n");
#print(cutstr)
colors<-colSums(cutstr*c(1:nrow(cutstr)))+1
final.data<-cbind(mbhc.output$data.sorted,colors)
results<-list(str=str, cutclust=cutstr, cutclust2=cutstr2, colors=colors, final.data=final.data)
return(results)
}
#'
#'Reduce function
#'
mbhc.reduce.graph<- function(arg.data,x.el=data.frame() ,sampleNum=1, n=-100){
#x.el<- reduce.out$x.el.red
#data<- reduce.out$data
#data.sorted<- reduce.out$data.sorted
#IDs<- data.sorted[, dim(data.sorted)[2]]
data<- arg.data
data.sorted<- arg.data[order(arg.data[,sampleNum]),]
IDs<- data.sorted[, dim(data.sorted)[2]]
if(dim(x.el)[1]==0){
l<- dim(data)[1]
x.el<- data.frame(diag(l))
names(x.el)<- IDs
}
mean.frqs<-t(apply( x.el, 1, function(x){mean(data.sorted[as.logical(x),sampleNum])}))
x.el.sorted<-x.el[order(mean.frqs),]
x.el.final<- data.frame()
x.nb<- data.sorted[,sampleNum]
ss.other<- data.sorted[,-c(sampleNum, dim(data.sorted)[2]), drop=F]
size.threshold<-5
l<-nrow(x.el)
full<-floor(l/size.threshold)
rem<-l%%size.threshold
row.fold<-0
if( rem < 5 ){ full<- full-1; rem<- rem+size.threshold}
optim.value.bound<-fit.variants.weight(c(0.4,0.46,0.05))[1]
ind.full<- c(1:size.threshold)
combn.el<- t(combn(ind.full,2))
if(full>0){
for(i in 1:full){
fold<- c((1+(i-1)*size.threshold):(i*size.threshold))
x.el.fold<-x.el.sorted[fold, ]
#print(i)
initial.full<-apply(combn.el, 1, function(x){fit.elements.weight.reduce( x.el.fold[x[1],],
x.el.fold[x[2],],
x.nb, ss.other )})
tempMatLoop<-matrix(1000, nrow=size.threshold, ncol=size.threshold)
tempMatLoop.2<-lower.tri(diag(0, nrow=size.threshold, ncol=size.threshold))
tempMatLoop[tempMatLoop.2] <- initial.full[1, ]
mat.loop<-t(tempMatLoop)
mat.loop.g<- mat.loop
mat.loop.g[mat.loop <= optim.value.bound]<- 1
mat.loop.g[mat.loop> optim.value.bound]<- 0
sum(mat.loop.g==1)
g1<-graph_from_adjacency_matrix(mat.loop.g, mode = 'undirected')
complete_points<- max_cliques(g1, min=2)
if(length(complete_points)>0){
x.el.graph<-c()
el.rm.graph<-c()
for(i in 1:length(complete_points)){
el.rm<- as.numeric(complete_points[[i]])
el.rm<- el.rm[!el.rm%in%el.rm.graph]
if(length(el.rm)>0){
x.el.graph<- rbind(x.el.graph, colSums(x.el.fold[el.rm,]))
el.rm.graph<- c(el.rm.graph, el.rm)
}}
x.el.fold<- rbind(x.el.fold[-el.rm.graph, , drop=F],x.el.graph)
colSums(x.el.fold)}
x.el.final<- rbind(x.el.final, x.el.fold)
{ #while( min(mat.loop)< optim.value.bound & nrow(mat.loop)>2){#
# el.rm <- which(mat.loop==min(mat.loop), arr.ind=T)
# el.rm <- el.rm[1,] # april 14
#el.rm<- unique(as.numeric(el.rm))
# mat.loop<- mat.loop[-el.rm, -el.rm, drop=F]
#if(sum(dim(mat.loop))==0){mat.loop=matrix(c(0,0,0,0),nrow = 2)}
#x.el.fold<- rbind( x.el.fold[-el.rm, , drop=F], colSums(x.el.fold[el.rm,]))
#colnames(x.el.fold)<-row.names(data.sorted)
#}
}
}}
rems<- c((l-rem+1):l)
#print(rems)
x.el.rem<- x.el.sorted[rems,]
ind.rem<- c(1:rem)
combn.rem<- t(combn(ind.rem, 2))
initial.rems<-apply(combn.rem, 1, function(x){fit.elements.weight.reduce( x.el.rem[x[1],],
x.el.rem[x[2],],
x.nb, ss.other )})
###may 6th
tempMatLoop<-matrix(1000, nrow=rem, ncol=rem)
tempMatLoop.2<-lower.tri(diag(0, nrow=rem, ncol=rem))
tempMatLoop[tempMatLoop.2] <- initial.rems[1, ]
mat.loop<-t(tempMatLoop)
#diag(mat.loop) <- 1e5;
mat.loop.g<- mat.loop
mat.loop.g[mat.loop <= optim.value.bound]<- 1
mat.loop.g[mat.loop> optim.value.bound]<- 0
sum(mat.loop.g==1)
g1<-graph_from_adjacency_matrix(mat.loop.g, mode = 'undirected')
complete_points<- max_cliques(g1, min=2)
x.el.graph<-c()
el.rm.graph<-c()
#print(complete_points)
if(length(complete_points)>0){
for(i in 1:length(complete_points)){
el.rm<- as.numeric(complete_points[[i]])
el.rm<- el.rm[!el.rm%in%el.rm.graph]
if(length(el.rm)>0){
x.el.graph<- rbind(x.el.graph, colSums(x.el.rem[el.rm,]))
el.rm.graph<- c(el.rm.graph, el.rm)
}}
x.el.rem<- rbind(x.el.rem[-el.rm.graph, , drop=F],x.el.graph)
colSums(x.el.rem)}
x.el.final<- rbind(x.el.final, x.el.rem)
{
#while( min(mat.loop)< -10 &nrow(mat.loop)>2){#
# el.rm<- which(mat.loop==min(mat.loop), arr.ind=T)
# el.rm<- unique(as.numeric(el.rm))
# mat.loop<- mat.loop[-el.rm, -el.rm, drop=F]
# if(sum(dim(mat.loop))==0){mat.loop=matrix(c(0,0,0,0),nrow = 2)}
# x.el.rem<- rbind( x.el.rem[-el.rm, , drop=F], colSums(x.el.rem[el.rm,]))
# colnames(x.el.fold)<-row.names(data.sorted)
#}
#x.el.final<- rbind(x.el.final, x.el.rem)
}
results<- list(x.el.red=x.el.final, data=arg.data, data.sorted=data.sorted);
nn <- nrow(results$x.el.red)
print("jj")
print(jj)
jj<-jj+1
if(nn < size.threshold | nn == n) {
return(results)
} else {
#mbhc.reduce.2(results, sampleNum=sampleNum,n=nn)
mbhc.reduce.graph(arg.data=data, sampleNum=sampleNum,x.el=x.el.final,n=nn)
}
}
#'
#'MBHC multiple
#'
#' This function performs the clustering on multiple samples.
#' @export
#'
#' @param arg.data,x.el.cut,sampleNum Arg.data should be a dataframe with IDs in the last column. Each column correspond to a sample. X.el.cut is the output structure of mbhc.reduce which performs a prliminary clustering. it can be empty. SampleNum is the column of the primary sample that clustering needs to be run on.
mbhc.multiple.cut <- function(arg.data,x.el.cut=data.frame(), sampleNum=1) {
if(sampleNum>dim(arg.data)[2]-1){print("stop the algorithm and check the sampleNum; run mbhc.prepdata() first")}
time0<-as.numeric(Sys.time())
params<-c(); x.values<-c(); x.steps<-c(); row.loop<-c();x.step<-c()
x.elements.list <- list()
weights.list <- list()
weights.dist <- c()
arg.data <- as.matrix(arg.data);
arg.data.other <- c();
if(ncol(arg.data) == 2){ ##### change '1' to '2' because the 2nd column is always id
print('executing single sample clustering....'); flush.console();
return(mag.single(arg.data[,1])) #feb 26: the second column is the IDs
} else {
arg.data.other<- arg.data[, -c(sampleNum,dim(arg.data)[2]), drop=F] #other samples in the data FEB26: aarg.datauming last column is ID
}
#feb 27: took these out of the if statement. So every version is done based on sorted data.
order<-order(arg.data[,sampleNum])
arg.data.other<-arg.data.other[order,,drop=F]
data.sorted<- arg.data[order, ]
x.nb<- arg.data[order, sampleNum]
IDs<- arg.data[order,dim(arg.data)[2]] #FeB26
if( sum(dim(x.el.cut))==0){
print("Mag.reduce has not been run.")
x.elements<-data.frame(diag(length(x.nb)))
#names(x.elements)<-c(1:length(x.nb)) feb 26
names(x.elements)<- IDs
}else{
x.elements<- x.el.cut
}
#print(class(arg.data.other))
#x.elements<- x.el.cut
#work on the target sample--- use other samples to find weight
x.loop <- x.nb
l <- length(x.loop)
# generate pairs of the indexes... ----------------------------------------
indx<-c(1:length(x.nb))
combn.idx <- t(combn(indx, 2))
weights <- apply(combn.idx, 1, function(x)
{ temp.elements <- rep(0, length(x.nb)); temp.elements[x] <- 1;
return(get.weights.within(temp.elements, arg.data.other)) } ); ### Now I Have the initial weights
combn.x <- t(combn(x.nb, 2))
initial <- cbind(combn.x, weights)
weights.list[[1]]<-initial
#print(initial)
l<- nrow(x.elements)
#### new matloop with reduced x.el:
indx.cut<-c(1:nrow(x.elements))
combn.cut <- t(combn(indx.cut, 2))
###no need to run weights...
cat('initial fitting ... '); flush.console();
time1 <- as.numeric(Sys.time())
initial.1<-apply(combn.cut, 1, function(x){fit.elements.weight( x.elements[x[1],], x.elements[x[2],],
x.nb, arg.data.other )})
time2 <- as.numeric(Sys.time())
cat('took ', time2-time1, ' seconds.\n'); flush.console();
tempMatLoop<-matrix(1000, nrow=l, ncol=l) #may 6th
tempMatLoop.2<-lower.tri(diag(0, nrow=l, ncol=l))
tempMatLoop[tempMatLoop.2] <- initial.1[1, ]
mat.loop<-t(tempMatLoop)
#diag(mat.loop) <- 1e5;
tempPar1Loop<-lower.tri(diag(0, nrow=l, ncol=l))
#tempPar2Loop<-lower.tri(diag(0, nrow=l, ncol=l))
tempPar1Loop[tempPar1Loop] <- initial.1[2, ]
#tempPar2Loop[tempPar2Loop] <- initial.1[3, ]
par.1.loop<-t(tempPar1Loop)
#par.2.loop<-t(tempPar2Loop)
weights.step<-c()
var.mean <- c()
time2<-as.numeric(Sys.time())
s<-1
while(nrow(x.elements) > 1){
cat(s, '...', nrow(x.elements), '...\n'); flush.console();
x.values[s]<-min(mat.loop)
el1<-which(mat.loop==min(mat.loop), arr.ind = T)[1,1]
el2<-which(mat.loop==min(mat.loop), arr.ind = T)[1,2]
el.rm<-c(el1,el2)
params<-c(params, par.1.loop[el1,el2])#, par.2.loop[el1,el2]))
#what happenes at each step
x.step <- x.elements[el.rm[1], ] + x.elements[el.rm[2],] ###this is very clever! I dont need to update the xloop.
###only use x.elements rows for each cluster!
x.steps <- rbind(x.steps, x.step)
# print(x.steps)
x.elements <- rbind(x.elements[-el.rm,], x.step)
x.elements.list[[s]]<-x.elements
var.mean.temp <- t(apply(x.elements, 1, get.var, x.nb=x.nb, s=s))
weights.temp <- apply(x.elements, 1, get.weights.within, arg.data=arg.data.other)
var.mean.temp <- cbind(var.mean.temp, weights.temp)
var.mean <- rbind(var.mean, var.mean.temp)
#####UPDATE MATRIX HERE
mat.loop<-mat.loop[-el.rm, -el.rm, drop=F]
par.1.loop<-par.1.loop[-el.rm, -el.rm, drop=F]
mat.column.update<-c()
par1.column.update<-c()
#par2.column.update<-c()
x1 <- x.elements[1:(nrow(x.elements)-1), ]
x2 <- x.elements[nrow(x.elements), ] #the ones that are put together
#print(x1)
#print(x2)
fit.between <- t(apply(x1, 1, fit.elements.weight, element.2=x2, arg.data=x.loop, arg.data.other=arg.data.other))
mat.column.update <- fit.between[, 1]
par1.column.update <- fit.between[, 2]
#par2.column.update <- fit.between[, 3]
#print(mat.loop)
if(nrow(mat.loop) != 0){
mat.loop<-cbind(mat.loop, mat.column.update )
mat.loop<-rbind(mat.loop, rep(1000,dim(mat.loop)[2])) #just to keep mat.loop square.
par.1.loop<-cbind(par.1.loop, par1.column.update)
par.1.loop<-rbind(par.1.loop, rep(0, dim(par.1.loop)[2]))
# par.2.loop<-cbind(par.2.loop, par2.column.update)
# par.2.loop<-rbind(par.2.loop, rep(0, dim(par.2.loop)[2]))
#print("MATLOOP UPDATE SHODE")
#print(mat.loop)
#print("END")
}
#print(mat.loop)
s<-s+1
}
colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step", "weights")
time3 <- as.numeric(Sys.time())
cat('loop took ', time3 - time2, ' seconds.\n')
cat('total took ', time3 - time0, ' seconds.\n')
result <- list(x.el=x.elements.list,data.sorted=data.sorted, data=arg.data, params=params, var.mean=var.mean, wights=weights.list ,freq.s=x.nb ,freq=x.nb)
return(result)
}
#' VAR in other samples
mbhc.var<- function(mbhc.out, sampleNum=1){
#varmean is geneated based on sorted data
x.el.list<- mbhc.out$x.el
data<- data.frame(mbhc.out$data.sorted)
#this also assumes that the data was ran on prep.data code so the last column has ID.
num<- dim(data)[2]-1
var.mean.list<-list()
ss.other<- data[,-c(sampleNum,dim(data)[2]),drop=F]
x.el.red<- x.el.list[[1]]
x.el.red.okay<- apply(x.el.red, 1, function(x) paste(x, collapse = '')) #april 15
for ( i in 1:num){
var.mean<-c()
x.data<- data[,i]
names(data)
#s<-7
for( s in 1:length(x.el.list)){
x.el<- x.el.list[[s]]
temp<- t(apply(x.el, 1, get.var, x.nb=x.data, s=s))
weights.temp <- apply(x.el, 1, get.weights.within, arg.data=ss.other)
#okay<- apply( x.el, 1, function(x) nrow(merge(t(x),x.el.red)))
#april 16
okay<- apply(x.el, 1, function(x) sum(paste(x,collapse = '')%in%x.el.red.okay))
temp <- cbind(temp, weights.temp, okay)
var.mean<- rbind(var.mean, temp)
}
colnames(var.mean)<-c("mean", "var","NumberOfPoints", "step","weights","okay")
var.mean.list[[i]]<- var.mean
names(var.mean.list)[i]<-paste(names(data)[i],".var.mean",sep="")
}
return(list(var.mean.all=var.mean.list, mbhc.out=mbhc.out))
}
#'
mbhc.find.cut.off.single <- function(var.mean, data, x.el.list){
#detach('package:plyr')
library(dplyr)
time0 <- as.numeric(Sys.time())
#print(dim(mag.output))
#var.mean <- mag.output$var.mean
var.mean <- data.frame(var.mean, row.names = NULL)
#var.mean$id<-round(var.mean$mean*var.mean$NumberOfPoints,5)
var.mean[is.na(var.mean)] <- 0
var.mean.first <- var.mean
#print(var.mean)
var.mean$var <- round(var.mean$var, 3)
#var.mean$var <- var.mean$var + 1e-7
#id.max<-max(var.mean$id)
##################### add okay structure to the var mean:
#var.mean$okay<- 0
#var.mean[var.mean$step==1,'okay']<- 1
vars<- var.mean$var[var.mean$var>0]
var.bound<- quantile(vars)[4]*1.5
flag <- FALSE
m <- c()
str <- c() #clone structure line from temp var.mean matrix
str.f <- c()
cutstr <- c()
cutstr2 <- c()
#pointsMax<- max(rowSums(x.el.list[[1]]))
i <- max(var.mean$step)
# allPoints <- mag.output$x.el[[i]]
allPoints<- x.el.list[[i]] #feb28
ok.points <- allPoints - allPoints ##### To generate clean slate
n <- length(allPoints)
# print(allPoints)
id.ok <-c() #clusters with okay variance!
temp.mv <- c();
temp.points <- c();
temp.step <- c();
temp.point.step <- c();
while(flag == FALSE){
#print(flag)
print(i); flush.console();
v <- i
############
## I need to use x.el to get the breaks, because using StepI would cause problem
## in identical clusters and mutations.
## TO FIX THIS I JUST KEPT MY ORIGINAL METHOD -
## I just duplicated the rows in the case of identical mutations
## because there should be onlytwo rows at each step
## and having only 1 row means identical clusters.
############
stepI <- var.mean[var.mean$step==i,]
stepI.x.el <- x.el.list[[i]]
stepI.x.el.step <- stepI.x.el[nrow(stepI.x.el), ] #Only look at the most recent cluster
stepII <- var.mean[var.mean$step==i-1, ] ## previous step
#stepII$var<-stepII$var,)
stepII.x.el <- x.el.list[[i-1]]
#if( !(id.sum%in%id.ok)){ ###this means if the division on this step is not happening on the "ok" clusters. so the okay clusters should still exist in the temp.
#if(sum(abs(id.ok-id.sum)<0.1)==0){
#newtemp<-temp[!(temp$id%in%id.ok),]
#if(!id.I%in%id.ok){
temp.points <- rbind(temp.points, c(i, ok.points));
temp.step <- rbind(temp.step, c(i, stepI.x.el.step));
temp.point.step <- rbind(temp.point.step, c(i, sum(ok.points & stepI.x.el.step)))
print('here')
#print(ok.points)
#print(stepI.x.el.step)
#print(sum( ok.points & stepI.x.el.step)==0)
if(sum(ok.points & stepI.x.el.step) == 0){ ###meaning stepI.x.el is not in ok.points
#stepII.breaks<- stepII[!stepII$id%in%stepI,]
print("stepI ")
#print(stepI )
#print("stepII ")
#print(stepII )
#stepII.breaks <- setdiff(stepII[,-c(4,6)], stepI[,-c(4,6)])
stepII.breaks <- setdiff(stepII[,-4], stepI[,-4])
if(nrow(stepII.breaks)==1){
print("why did this happen????")
stepII.breaks <- rbind(stepII.breaks, stepII.breaks)
}
stepII.num <- unique(stepII[, 4])
# stepII.breaks <- cbind(stepII.breaks, stepII[1:2,c(4,6)]) #just add the step number back to the matrix #############MARCh 14 add "okay"
stepII.breaks <- cbind(stepII.breaks, step=stepII.num) #MAY 9th
stepII.breaks.x.el <- setdiff(stepII.x.el, stepI.x.el)
#for( ii in 1:dim(newtemp)[1]){ ###stepII.breaks always has 2 rows
for(ii in 1:2){
#sample.sim <- cluster.sim(edep=depth, efrq=stepII.breaks[ii,1], shape=shape, n=10000)
#var.sim <- c()
#size <- ifelse(stepII.breaks$NumberOfPoints[ii]<5, 5, stepII.breaks$NumberOfPoints[ii])
#print(size)
#for(j in 1:500){
# sample.sim.2 <- sample.sim[sample(1:10000, size=size),] #temp[1,3]
#var.sim <- c(var.sim, sd(sample.sim.2[, 4]))
#}
# m <- mean(var.sim) + 1e-6
# vv <- sd(var.sim)
# vmax <- max(var.sim)
# temp.mv <- rbind(temp.mv, c(i, ii, m, vv, vmax, stepII.breaks[ii,2]))
#if(stepII.breaks[ii, 'okay'] ==1 | sqrt(stepII.breaks[ii,2])*(1+stepII.breaks[ii, "weights"]) < m+vv) { #okay cluster
#if(stepII.breaks[ii, 'okay'] ==1 | sqrt(stepII.breaks[ii,2])*(1+stepII.breaks[ii, "weights"]) < m+vv) { #okay cluster ####change april 11
if(stepII.breaks[ii, 'okay'] ==1 | stepII.breaks$var[ii]<var.bound) { #okay cluster ####change 6/2/2018
str.f <- c(str.f, stepII.breaks[ii,1])
#print("here")
#print(str)
str <- rbind(str, stepII.breaks[ii,])
cutstr2 <- rbind(cutstr2, stepII.breaks.x.el[ii,])
#print(str)
#id.ok<-c(id.ok, stepII.breaks$id[ii])
ok.points <- ok.points + stepII.breaks.x.el[ii,]
#print('OKAY LAST')
#print(sum(ok.points))
#print(ok.points)
}
}
}
###stop condition
#if(abs(sum(str$id)-id.max)<0.000001){flag<-TRUE}else{i<-(v-1)}
#print(i) ###new condition: sum of id.okay== id.max
#if(abs(sum(str$id)-id.max)<0.000001){flag<-TRUE}else{i<-(v-1)}
####new condition oct 31
#print("LAST PRINT")
if(sum(ok.points>1)>0){
print("ERROR")
}
if(sum(ok.points) == n){
flag<-TRUE
} else {
i<-(v-1)
}
#if(v==i){iii<-TRUE}else{i<-(v-1)}
}
for( i in 1:nrow(str)){
#print(i)
xel <- x.el.list[[str$step[i]]] #feb 28
#print(xel)
for(j in 1:nrow(xel)){
m1 <- mean(data[xel[j,]==1]) #feb 28
##changed to sorted. after oc.9#### OCT 30: THis should be changed for OC11 since there is no sorting in that
if(abs(m1-str[i,1]) < 0.001){
cutstr<-rbind(cutstr, xel[j,]) ## what's the difference between cutstr and cutstr2????????
}
}
}
time1 <- as.numeric(Sys.time())
cat("took ", (time1 - time0), " seconds.\n");
#print(cutstr)
colors<-colSums(cutstr*c(1:nrow(cutstr)))+1
final.data<-cbind(data,colors)
results<-list(str=str, cutclust=cutstr, cutclust2=cutstr2, colors=colors, final.data=final.data)
return(results)
}
#' find cut off multiple
mbhc.find.cut.off.multiple <- function(mbhc.var){
library(dplyr)
if(is.null(mbhc.var[['var.mean.all']])) return('incomplte, run mbhc.var() first');
final.data.clusters<- mbhc.var$mbhc.out$data.sorted
cuts<-list()
colors.all<-c()
x.el<- mbhc.var$mbhc.out$x.el
for(i in 1:length(mbhc.var[['var.mean.all']])) {
var.mean<- mbhc.var$var.mean.all[[i]]
data<- mbhc.var$mbhc.out$data.sorted[,i,drop=F]
#print(shape)
#print(depth)
#shape=shape
#depth=depth
cut.temp<-mbhc.find.cut.off.single(var.mean = var.mean, data = data,x.el.list = x.el)
cuts[[i]]<-cut.temp
name.temp<-paste(colnames(data),'.color',sep = "")
colors.all<-cbind(colors.all, cut.temp$colors*10^(2*i-2))
final.data.clusters<-cbind(final.data.clusters, cut.temp$colors)
colnames(colors.all)[dim(colors.all)[2]]<- name.temp
colnames(final.data.clusters)[dim(final.data.clusters)[2]]<-name.temp
}
#print('colorsall')
#print(colors.all)
colors.all<-data.frame(colors.all)
colors.all$final<- rowSums(colors.all)
#colors.all$final
library(plyr)
#print(colors.all$final)
updated.colors<-mapvalues(colors.all$final, from = unique(colors.all$final), to=c(1:length(unique(colors.all$final))))
colors.all$colors.final<- updated.colors
detach('package:plyr')
final.data.assignments<- cbind(final.data.clusters, cluster=updated.colors)
final.data.clusters<- cbind( mbhc.var$mbhc.out$data.sorted, cluster=updated.colors)
return(list( final.data.clusters=final.data.clusters,final.data.all.samples=final.data.assignments, final.colors=colors.all))
}
#' MBHC Run
#'
#' @param data,name,sampleFreqs
mbhc.run<- function(data,name="",sampleFreqs= NA,sampleNum=1, depth=200){
if(length(data$ID)==0){return(print("RUN mbhc.prepdata"))}
if(sum(is.na(sampleFreqs))>0){ data=data }else{data=cbind(data[,sampleFreqs], ID=data$ID)}
#data.prep<- mbhc.prepdata(data)
# data should have id.
data.prep<- data
dim<-dim(data)[2]
s<- sampleNum
if(dim==2){
print('Running Single Sample Clustering')
mbhc.out.single<- mbhc.single(data.prep, sampleNum=s)
mbhc.cut.single<- mbhc.find.cut.off(mbhc.out.single, depth=depth)
results<- list(final.dataframe<- mbhc.cut.single$final.data, mbhc.output<- mbhc.out.single,
data.prep<- data.prep)
names(results)<- c(paste(name,".final.data", sep = ""),paste(name,".mbhc.single.output", sep = ""),
paste(name,".data.prep",sep = ""))
}
if(dim>2){
print('Running Multiple Sample Clustering')
mbhc.red<- mbhc.reduce.graph(data.prep)
mbhc.out<- mbhc.multiple.cut(data.prep, x.el.cut = mbhc.red$x.el.red, sampleNum=s)
mbhc.mv <- mbhc.var(mbhc.out,sampleNum = s)
mbhc.cut<- mbhc.find.cut.off.multiple(mbhc.mv, shape=shape, depth=depth)
results <- list(final.dataframe<-mbhc.cut$final.data.clusters ,data.prep<- data.prep,
reduce.output<- mbhc.red, mbhc.output<- mbhc.out, mbhc.cutoff<- mbhc.cut, mbhc.mvar= mbhc.mv )
names(results)<- c(paste(name,".final.data", sep = ""), paste(name,".data.prep",sep = ""),
paste(name,".reduce.output", sep = ""),
paste(name,".mbhc.out", sep = ""), paste(name,".mbhc.cut", sep = ""),
paste(name,".mbhc.vars", sep=""))
}
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.