#K is fixed.
#Returns a list of two items: membership and W
#W will be used for plotting when choosing K.
Klines<-function(x,y,
K,num_init,mc.cores){
if (K==1){
result<-Klines_1(x,y)
}else{
result<-Klines_not_1(x,y,
K,num_init,mc.cores)
}
return(result)
}
#K is fixed (K=1).
#Returns a list of two items: membership and W
Klines_1<-function(x,y){
n<-length(x)
membership<-rep(1,n)
#Get perpDistances. Compare to update100Times().
k<-1
idx<-which(membership==k)
x_k<-x[idx]
y_k<-y[idx]
if (var(y_k)>0){
suppressMessages(reg_res<-lmodel2::lmodel2(y_k~x_k)$regression.results)
a<-reg_res$Slope[reg_res$Method=="MA"]
b<-reg_res$Intercept[reg_res$Method=="MA"]
}else{ #Edge case
a<-0
b<-y_k[1]
}
perpDistances<-abs(a*x-y+b)/sqrt(a^2+1)
#Get W. Compare to Klines_each.
W<-1/n*sum(perpDistances^2)
toReturn<-list(membership=membership,W=W)
return(toReturn)
}
#K is fixed (K is not 1).
#Finds the line centers that minimizes W (average squared perpendicular distance) with num_init initializations
#Returns a list of two items: membership and W
Klines_not_1<-function(x,y,
K,num_init,mc.cores){
#Run Klines_each num_init times
results<-parallel::mclapply(1:num_init,FUN=function(i){
return(Klines_each(x,y,K))
},mc.cores=mc.cores) #results is a list (length is num_init) of lists (length is 2)
Ws<-sapply(results,FUN=function(result){
return(result$W)
})
idx<-which.min(Ws)
return(results[[idx]])
}
#K is fixed (K is not 1).
#Finds the line centers that minimizes W (average squared perpendicular distance) with 1 initialization
#Returns a list of two items: membership and W
Klines_each<-function(x,y,K){
n<-length(x)
#Randomly initialize
membership<-sample(1:K,size=n,replace=TRUE)
while (length(unique(membership))<K | any(table(membership)<3)){ #Edge case
membership<-sample(1:K,size=n,replace=TRUE)
}
#Update 100 times
result<-update100Times(x,y,K,membership)
membership<-result$membership
#perpDistances<-result$perpDistances #Don't need to store perpDistances the first time update100Times is called
#EM update 100 times
membership<-EMUpdate100Times(x,y,K,membership)
#Update 100 times
result<-update100Times(x,y,K,membership)
membership<-result$membership
perpDistances<-result$perpDistances
#Calculate W
WEntires<-sapply(1:n,FUN=function(i){
return(perpDistances[i,membership[i]])
})
W<-1/n*sum(WEntires^2)
return(list(membership=membership,W=W))
}
#Given the current membership assignment, update the assignment 100 times
#Returns a list of two items: membership, perpDistances
update100Times<-function(x,y,K,membership){
n<-length(x)
for (i in 1:100){
#Step 1, calculate the line centers based on the current membership assignment:
lineCenters<-sapply(1:K,FUN=function(k){
idx<-which(membership==k)
x_k<-x[idx]
y_k<-y[idx]
if(var(y_k)>0){
suppressMessages(reg_res<-lmodel2::lmodel2(y_k~x_k)$regression.results)
a<-reg_res$Slope[reg_res$Method=="MA"]
b<-reg_res$Intercept[reg_res$Method=="MA"]
}else{ #Edge case
a<-0
b<-y_k[1]
}
return(c(a,b))
}) #2*K
#Step 2, update membership assignment:
#Get the matrix of perpendicular distances
perpDistances<-apply(lineCenters,2,FUN=function(beta){ #2 means by column
a<-beta[1]
b<-beta[2]
return(abs(a*x-y+b)/sqrt(a^2+1)) #n*1
}) #n*K
#Store the current membership in membershipOld
membershipOld<-membership
#Assign the points to the line centers
membership<-max.col(-perpDistances,"first")
while(length(unique(membership))<K | any(table(membership)<3)){ #Edge case
noise<-rnorm(n*K,mean=0,sd=sd(perpDistances))
noiseMatrix<-matrix(noise,nrow=n)
perpDistances<-perpDistances-noise
membership<-max.col(-perpDistances,ties.method="first")
}
if(identical(membershipOld,membership)){
break
}
}
return(list(membership=membership,perpDistances=perpDistances))
} #End of update100Times
#Given the current membership assignment, update the assignment 100 times
#Returns membership
EMUpdate100Times<-function(x,y,K,membership){
n<-length(x)
data<-cbind(x,y)
#Calculate a set of initial parameters
parameters<-lapply(1:K,FUN=function(k){
idx<-which(membership==k)
p_k<-length(idx)/n
data_k<-data[idx,]
mu_k<-colMeans(data_k)
Sigma_k<-cov(data_k)
parameters_k<-list(p_k,mu_k,Sigma_k)
return(parameters_k)
}) #A list of K lists, each of length 3
for(i in 1:100){
#E-step
#Calculate the n*K density matrix
densityMatrix<-sapply(parameters,FUN=function(parameters_k){
p_k<-parameters_k[[1]]
mu_k<-parameters_k[[2]]
Sigma_k<-parameters_k[[3]]
density_k<-mvtnorm::dmvnorm(data,mean=mu_k,sigma=Sigma_k)
density_k<-p_k*density_k
return(density_k)
}) #n*K
#Get the n*K weight matrix kxi
kxi<-densityMatrix/rowSums(densityMatrix) #n*K
kxi[is.na(kxi)]<-0 #Edge case
parametersOld<-parameters
#M-step
kxiColSums<-colSums(kxi) #Vector of length K
p<-colSums(kxi)/n #Vector of length K
mu<-t(kxi)%*%data/kxiColSums #Kx2
parameters<-lapply(1:K,FUN=function(k){
p_k<-p[k]
mu_k<-mu[k,]
sigma2_Xk<-kxi[,k]%*%(x-mu[k,1])^2/kxiColSums[k]
sigma2_Yk<-kxi[,k]%*%(y-mu[k,2])^2/kxiColSums[k]
sigma_XYk<-kxi[,k]%*%((x-mu[k,1])*(y-mu[k,2]))/kxiColSums[k]
Sigma_k<-rbind(c(sigma2_Xk,sigma_XYk),c(sigma_XYk,sigma2_Yk))
parameters_k<-list(p_k,mu_k,Sigma_k)
return(parameters_k)
}) #A list of K lists, each of length 3
diff<-max(abs(unlist(parametersOld)-unlist(parameters)))
if(is.na(diff)){ #Edge case
diff<-0
}
if(diff<=0.001){
break
}
} #End of for loop
membership<-apply(kxi,1,FUN=function(kxiRow){ #1 means by row
return(which.max(kxiRow))
})
while(length(unique(membership))<K | any(table(membership)<3)){ #Edge case
noise<-rnorm(n*K,mean=0,sd=sd(kxi))
noiseMatrix<-matrix(noise,nrow=n)
kxi<-kxi-noise
membership<-max.col(-kxi,ties.method="first")
}
return(membership)
} #End of EMUpdate100Times
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.