Manuscript_files/20190708_Exp_experiments_R1.R

#################################################################
##                            Exp1                            ##
#################################################################

# Load libraries
library('MixSim')
library('mclust')

# Load source files for minibatch EM algorithms
source('https://raw.githubusercontent.com/hiendn/StoEMMIX/master/Manuscript_files/20190708_ExpPois_functions.R')

# Set memory limit
Sys.setenv('R_MAX_VSIZE'=10000000000000)

## Extract various required variables
# Get the number of subpopulations
g <- 3
## Estimate mixture model parameters
# Proportions
Pi <- c(0.2,0.1,0.7)
# Lambda
Lambda <- c(1,9,15)
# True matrix
True_matrix <- matrix(NA,g,2)
for (ii in 1:g) {
  True_matrix[ii,] <- c(Pi[ii],Lambda[ii])
}

## Setup parameters
# Number of observations to simulation
NN <- 10^6
# Set the number repetitions
Rep <- 100
# Number of components to fit
Groups <- 3
# Set a random seed
set.seed(20190708)
# Construct a matrix to store the results
Results <- matrix(NA,100,9)
Timing <- matrix(NA,100,5)
ARI_results <- matrix(NA,100,9)
SE_results <- matrix(NA,100,9)

# Conduct simulation study
for (rr in 1:Rep) {
  
  # Simulate data
  IDs <- sample(1:3,NN,replace=TRUE,prob=Pi)
  Data <- matrix(NA,nrow=NN,ncol=1)
  Data[IDs==1] <- rexp(sum(IDs==1),Lambda[1])
  Data[IDs==2] <- rexp(sum(IDs==2),Lambda[2])
  Data[IDs==3] <- rexp(sum(IDs==3),Lambda[3])
  
  # Randomly generate labels for initialization
  Samp <- sample(1:Groups,NN,replace = T)
  
  # Initialize parameters
  Pi_int <- table(Samp)/NN
  Lambda_int <- 1/c(mean(Data[Samp==1]),mean(Data[Samp==2]),mean(Data[Samp==3]))
  # Run batch EM algorithm 
  Tick <- proc.time()[3]
  MC <- EMExponential_pol(data=Data,pi_r=Pi_int,lambda_r = Lambda_int,maxit_r = 10,groups_r = g)
  Timing[rr,1] <- proc.time()[3]-Tick
  # Get likelihood value for batch EM algorithm
  Results[rr,1] <-MC$`reg_log-likelihood`
  # Get parameter estimates
  MC_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    MC_matrix[ii,] <- c(MC$reg_proportions[ii],MC$reg_lambda[ii])
  }
  SE_results[rr,1] <- sum(apply((as.matrix(dist(rbind(MC_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI
  ARI_results[rr,1] <- adjustedRandIndex(IDs,Exp_clust(Data,MC$reg_proportions,MC$reg_lambda))
  
  # Run minibatch algorithm with batch size 10000
  Tick <- proc.time()[3]
  Sto <- stoExponential_pol(Data, Pi_int, Lambda_int,
                      10*NN/10000,Groups,0.6,1-10^-10,10000)
  Results[rr,2] <- Sto$`reg_log-likelihood`
  Results[rr,3] <- Sto$`pol_log-likelihood`
  Timing[rr,2] <- proc.time()[3]-Tick
  # Get parameter estimates
  Reg_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Reg_matrix[ii,] <- c(Sto$reg_proportions[ii],Sto$reg_lambda[ii])
  }
  SE_results[rr,2] <- sum(apply((as.matrix(dist(rbind(Reg_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (reg)
  ARI_results[rr,2] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$reg_proportions,Sto$reg_lambda))
  
  # Get parameter estimates (polyak)
  Pol_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Pol_matrix[ii,] <- c(Sto$pol_proportions[ii],Sto$pol_lambda[ii])
  }
  SE_results[rr,3] <- sum(apply((as.matrix(dist(rbind(Pol_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (pol)
  ARI_results[rr,3] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$pol_proportions,Sto$pol_lambda))
  
  # Run minibatch algorithm with batch size 20000
  Tick <- proc.time()[3]
  Sto <- stoExponential_pol(Data, Pi_int, Lambda_int,
                      10*NN/20000,Groups,0.6,1-10^-10,20000)
  Results[rr,4] <- Sto$`reg_log-likelihood`
  Results[rr,5] <- Sto$`pol_log-likelihood`
  Timing[rr,3] <- proc.time()[3]-Tick
  # Get parameter estimates (regular)
  Reg_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Reg_matrix[ii,] <- c(Sto$reg_proportions[ii],Sto$reg_lambda[ii])
  }
  SE_results[rr,4] <- sum(apply((as.matrix(dist(rbind(Reg_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (reg)
  ARI_results[rr,4] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$reg_proportions,Sto$reg_lambda))
  
  # Get parameter estimates (polyak)
  Pol_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Pol_matrix[ii,] <- c(Sto$pol_proportions[ii],Sto$pol_lambda[ii])
  }
  SE_results[rr,5] <- sum(apply((as.matrix(dist(rbind(Pol_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (pol)
  ARI_results[rr,5] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$pol_proportions,Sto$pol_lambda))
  
  # Save and print outputs
  save(Results,file='./Exp1.rdata')
  print(c(rr,Results[rr,]))
  save(Timing,file='./Exp1timing.rdata')
  save(ARI_results,file='./Exp1ARI.rdata')
  save(SE_results,file='./Exp1SE.rdata')
  print(c(rr,Timing[rr,]))
  print(c(rr,ARI_results[rr,]))
  print(c(rr,SE_results[rr,]))
  
  # Also sink results to a text file
  sink('./Exp1.txt',append = TRUE)
  cat(rr,Results[rr,],'\n')
  sink()
  sink('./Exp1timing.txt',append = TRUE)
  cat(rr,Timing[rr,],'\n')
  sink()
  sink('./Exp1ARI.txt',append = TRUE)
  cat(rr,ARI_results[rr,],'\n')
  sink()
  sink('./Exp1SE.txt',append = TRUE)
  cat(rr,SE_results[rr,],'\n')
  sink()
}

#################################################################
##                           Exp2                             ##
#################################################################

# Load libraries
library('MixSim')
library('mclust')

# Load source files for minibatch EM algorithms
source('https://raw.githubusercontent.com/hiendn/StoEMMIX/master/Manuscript_files/20190708_ExpPois_functions.R')

# Set memory limit
Sys.setenv('R_MAX_VSIZE'=10000000000000)

## Extract various required variables
# Get the number of subpopulations
g <- 3
## Estimate mixture model parameters
# Proportions
Pi <- c(0.2,0.1,0.7)
# Lambda
Lambda <- c(1,9,15)
# True matrix
True_matrix <- matrix(NA,g,2)
for (ii in 1:g) {
  True_matrix[ii,] <- c(Pi[ii],Lambda[ii])
}

## Setup parameters
# Number of observations to simulation
NN <- 10^7
# Set the number repetitions
Rep <- 100
# Number of components to fit
Groups <- 3
# Set a random seed
set.seed(20190708)
# Construct a matrix to store the results
Results <- matrix(NA,100,9)
Timing <- matrix(NA,100,5)
ARI_results <- matrix(NA,100,9)
SE_results <- matrix(NA,100,9)

# Conduct simulation study
for (rr in 1:Rep) {
  
  # Simulate data
  IDs <- sample(1:3,NN,replace=TRUE,prob=Pi)
  Data <- matrix(NA,nrow=NN,ncol=1)
  Data[IDs==1] <- rexp(sum(IDs==1),Lambda[1])
  Data[IDs==2] <- rexp(sum(IDs==2),Lambda[2])
  Data[IDs==3] <- rexp(sum(IDs==3),Lambda[3])
  
  # Randomly generate labels for initialization
  Samp <- sample(1:Groups,NN,replace = T)
  
  # Initialize parameters
  Pi_int <- table(Samp)/NN
  Lambda_int <- 1/c(mean(Data[Samp==1]),mean(Data[Samp==2]),mean(Data[Samp==3]))
  # Run batch EM algorithm 
  Tick <- proc.time()[3]
  MC <- EMExponential_pol(data=Data,pi_r=Pi_int,lambda_r = Lambda_int,maxit_r = 10,groups_r = g)
  Timing[rr,1] <- proc.time()[3]-Tick
  # Get likelihood value for batch EM algorithm
  Results[rr,1] <-MC$`reg_log-likelihood`
  # Get parameter estimates
  MC_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    MC_matrix[ii,] <- c(MC$reg_proportions[ii],MC$reg_lambda[ii])
  }
  SE_results[rr,1] <- sum(apply((as.matrix(dist(rbind(MC_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI
  ARI_results[rr,1] <- adjustedRandIndex(IDs,Exp_clust(Data,MC$reg_proportions,MC$reg_lambda))
  
  # Run minibatch algorithm with batch size 10000
  Tick <- proc.time()[3]
  Sto <- stoExponential_pol(Data, Pi_int, Lambda_int,
                            10*NN/100000,Groups,0.6,1-10^-10,100000)
  Results[rr,2] <- Sto$`reg_log-likelihood`
  Results[rr,3] <- Sto$`pol_log-likelihood`
  Timing[rr,2] <- proc.time()[3]-Tick
  # Get parameter estimates
  Reg_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Reg_matrix[ii,] <- c(Sto$reg_proportions[ii],Sto$reg_lambda[ii])
  }
  SE_results[rr,2] <- sum(apply((as.matrix(dist(rbind(Reg_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (reg)
  ARI_results[rr,2] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$reg_proportions,Sto$reg_lambda))
  
  # Get parameter estimates (polyak)
  Pol_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Pol_matrix[ii,] <- c(Sto$pol_proportions[ii],Sto$pol_lambda[ii])
  }
  SE_results[rr,3] <- sum(apply((as.matrix(dist(rbind(Pol_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (pol)
  ARI_results[rr,3] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$pol_proportions,Sto$pol_lambda))
  
  # Run minibatch algorithm with batch size 20000
  Tick <- proc.time()[3]
  Sto <- stoExponential_pol(Data, Pi_int, Lambda_int,
                            10*NN/200000,Groups,0.6,1-10^-10,200000)
  Results[rr,4] <- Sto$`reg_log-likelihood`
  Results[rr,5] <- Sto$`pol_log-likelihood`
  Timing[rr,3] <- proc.time()[3]-Tick
  # Get parameter estimates (regular)
  Reg_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Reg_matrix[ii,] <- c(Sto$reg_proportions[ii],Sto$reg_lambda[ii])
  }
  SE_results[rr,4] <- sum(apply((as.matrix(dist(rbind(Reg_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (reg)
  ARI_results[rr,4] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$reg_proportions,Sto$reg_lambda))
  
  # Get parameter estimates (polyak)
  Pol_matrix <- matrix(NA,g,2)
  for (ii in 1:g) {
    Pol_matrix[ii,] <- c(Sto$pol_proportions[ii],Sto$pol_lambda[ii])
  }
  SE_results[rr,5] <- sum(apply((as.matrix(dist(rbind(Pol_matrix,True_matrix),diag=T,upper=T))[1:g,(g+1):(2*g)])^2,1,min))
  # Get ARI (pol)
  ARI_results[rr,5] <- adjustedRandIndex(IDs,Exp_clust(Data,Sto$pol_proportions,Sto$pol_lambda))
  
  # Save and print outputs
  save(Results,file='./Exp2.rdata')
  print(c(rr,Results[rr,]))
  save(Timing,file='./Exp2timing.rdata')
  save(ARI_results,file='./Exp2ARI.rdata')
  save(SE_results,file='./Exp2SE.rdata')
  print(c(rr,Timing[rr,]))
  print(c(rr,ARI_results[rr,]))
  print(c(rr,SE_results[rr,]))
  
  # Also sink results to a text file
  sink('./Exp2.txt',append = TRUE)
  cat(rr,Results[rr,],'\n')
  sink()
  sink('./Exp2timing.txt',append = TRUE)
  cat(rr,Timing[rr,],'\n')
  sink()
  sink('./Exp2ARI.txt',append = TRUE)
  cat(rr,ARI_results[rr,],'\n')
  sink()
  sink('./Exp2SE.txt',append = TRUE)
  cat(rr,SE_results[rr,],'\n')
  sink()
}
hiendn/StoEMMIX documentation built on Sept. 9, 2019, 6:07 a.m.