R/5_Klines.R

Defines functions Klines Klines_1 Klines_not_1 Klines_each update100Times EMUpdate100Times

#K is fixed.
#Returns a list of two items: membership and W
#W will be used for plotting when choosing K.
Klines<-function(x,y,
                 K,num_init,mc.cores){
  if (K==1){
    result<-Klines_1(x,y)
    }else{
    result<-Klines_not_1(x,y,
                         K,num_init,mc.cores)
    }
  return(result)
}

#K is fixed (K=1).
#Returns a list of two items: membership and W
Klines_1<-function(x,y){
  n<-length(x)
  membership<-rep(1,n)
  
  #Get perpDistances. Compare to update100Times().
  k<-1
  idx<-which(membership==k)
  x_k<-x[idx]
  y_k<-y[idx]
  if (var(y_k)>0){
    suppressMessages(reg_res<-lmodel2::lmodel2(y_k~x_k)$regression.results)
    a<-reg_res$Slope[reg_res$Method=="MA"]
    b<-reg_res$Intercept[reg_res$Method=="MA"]
  }else{ #Edge case
    a<-0
    b<-y_k[1]
  }
  
  perpDistances<-abs(a*x-y+b)/sqrt(a^2+1)

  #Get W. Compare to Klines_each.
  W<-1/n*sum(perpDistances^2)

  toReturn<-list(membership=membership,W=W)
  return(toReturn)
}

#K is fixed (K is not 1).
#Finds the line centers that minimizes W (average squared perpendicular distance) with num_init initializations
#Returns a list of two items: membership and W
Klines_not_1<-function(x,y,
                       K,num_init,mc.cores){
  #Run Klines_each num_init times
  results<-parallel::mclapply(1:num_init,FUN=function(i){
    return(Klines_each(x,y,K))
  },mc.cores=mc.cores) #results is a list (length is num_init) of lists (length is 2)
  Ws<-sapply(results,FUN=function(result){
    return(result$W)
    })
  idx<-which.min(Ws)
  return(results[[idx]])
}

#K is fixed (K is not 1).
#Finds the line centers that minimizes W (average squared perpendicular distance) with 1 initialization
#Returns a list of two items: membership and W
Klines_each<-function(x,y,K){
  n<-length(x)
  
  #Randomly initialize
  membership<-sample(1:K,size=n,replace=TRUE)
  while (length(unique(membership))<K | any(table(membership)<3)){ #Edge case
    membership<-sample(1:K,size=n,replace=TRUE)
  }
  
  #Update 100 times
  result<-update100Times(x,y,K,membership)
  membership<-result$membership
  #perpDistances<-result$perpDistances #Don't need to store perpDistances the first time update100Times is called
  
  #EM update 100 times
  membership<-EMUpdate100Times(x,y,K,membership)
  
  #Update 100 times
  result<-update100Times(x,y,K,membership)
  membership<-result$membership
  perpDistances<-result$perpDistances
  
  #Calculate W
  WEntires<-sapply(1:n,FUN=function(i){
    return(perpDistances[i,membership[i]])
  })
  W<-1/n*sum(WEntires^2)
  
  return(list(membership=membership,W=W))
}

#Given the current membership assignment, update the assignment 100 times
#Returns a list of two items: membership, perpDistances
update100Times<-function(x,y,K,membership){
  n<-length(x)
  
  for (i in 1:100){
    #Step 1, calculate the line centers based on the current membership assignment:
    lineCenters<-sapply(1:K,FUN=function(k){
      idx<-which(membership==k)
      x_k<-x[idx]
      y_k<-y[idx]
      if(var(y_k)>0){
        suppressMessages(reg_res<-lmodel2::lmodel2(y_k~x_k)$regression.results)
        a<-reg_res$Slope[reg_res$Method=="MA"]
        b<-reg_res$Intercept[reg_res$Method=="MA"]
      }else{ #Edge case
        a<-0
        b<-y_k[1]
      }
      return(c(a,b))
    }) #2*K
    #Step 2, update membership assignment:
    #Get the matrix of perpendicular distances
    perpDistances<-apply(lineCenters,2,FUN=function(beta){ #2 means by column
      a<-beta[1]
      b<-beta[2]
      return(abs(a*x-y+b)/sqrt(a^2+1)) #n*1
    }) #n*K
    #Store the current membership in membershipOld
    membershipOld<-membership
    #Assign the points to the line centers
    membership<-max.col(-perpDistances,"first")
    while(length(unique(membership))<K | any(table(membership)<3)){ #Edge case
      noise<-rnorm(n*K,mean=0,sd=sd(perpDistances))
      noiseMatrix<-matrix(noise,nrow=n)
      perpDistances<-perpDistances-noise
      membership<-max.col(-perpDistances,ties.method="first")
    }
    
    if(identical(membershipOld,membership)){
      break
    }
  }
  
  return(list(membership=membership,perpDistances=perpDistances))
} #End of update100Times

#Given the current membership assignment, update the assignment 100 times
#Returns membership
EMUpdate100Times<-function(x,y,K,membership){
  n<-length(x)
  data<-cbind(x,y)
  
  #Calculate a set of initial parameters
  parameters<-lapply(1:K,FUN=function(k){
    idx<-which(membership==k)
    p_k<-length(idx)/n
    data_k<-data[idx,]
    mu_k<-colMeans(data_k)
    Sigma_k<-cov(data_k)
    parameters_k<-list(p_k,mu_k,Sigma_k)
    return(parameters_k)
  }) #A list of K lists, each of length 3
  
  for(i in 1:100){
    #E-step
    #Calculate the n*K density matrix
    densityMatrix<-sapply(parameters,FUN=function(parameters_k){
      p_k<-parameters_k[[1]]
      mu_k<-parameters_k[[2]]
      Sigma_k<-parameters_k[[3]]
      density_k<-mvtnorm::dmvnorm(data,mean=mu_k,sigma=Sigma_k)
      density_k<-p_k*density_k
      return(density_k)
    }) #n*K
    
    #Get the n*K weight matrix kxi
    kxi<-densityMatrix/rowSums(densityMatrix) #n*K
    kxi[is.na(kxi)]<-0 #Edge case
    
    parametersOld<-parameters
    
    #M-step
    kxiColSums<-colSums(kxi) #Vector of length K
    p<-colSums(kxi)/n #Vector of length K
    mu<-t(kxi)%*%data/kxiColSums #Kx2
    parameters<-lapply(1:K,FUN=function(k){
      p_k<-p[k]
      mu_k<-mu[k,]
      sigma2_Xk<-kxi[,k]%*%(x-mu[k,1])^2/kxiColSums[k]
      sigma2_Yk<-kxi[,k]%*%(y-mu[k,2])^2/kxiColSums[k]
      sigma_XYk<-kxi[,k]%*%((x-mu[k,1])*(y-mu[k,2]))/kxiColSums[k]
      Sigma_k<-rbind(c(sigma2_Xk,sigma_XYk),c(sigma_XYk,sigma2_Yk))
      parameters_k<-list(p_k,mu_k,Sigma_k)
      return(parameters_k)
    }) #A list of K lists, each of length 3
    
    diff<-max(abs(unlist(parametersOld)-unlist(parameters)))
    if(is.na(diff)){ #Edge case
      diff<-0
    }
    if(diff<=0.001){
      break
    }
  } #End of for loop
  
  membership<-apply(kxi,1,FUN=function(kxiRow){ #1 means by row
    return(which.max(kxiRow))
  })
  while(length(unique(membership))<K | any(table(membership)<3)){ #Edge case
    noise<-rnorm(n*K,mean=0,sd=sd(kxi))
    noiseMatrix<-matrix(noise,nrow=n)
    kxi<-kxi-noise
    membership<-max.col(-kxi,ties.method="first")
  }
  
  return(membership)
} #End of EMUpdate100Times
heatherjzhou/gR2.2 documentation built on Nov. 14, 2019, 12:14 a.m.