ERPM:

######################################################################
## Simulation and estimation of Exponential Random Partition Models ## 
## Functions used to run the phase 3 of the estimation algorithm    ##
## Author: Marion Hoffman                                           ##
######################################################################


# Phase 3 for a single partition
run_phase3_single <- function(partition,
                       estimates.phase2, 
                       z.obs, 
                       nodes, 
                       effects, 
                       objects, 
                       burnin, 
                       thining,
                       a.scaling,
                       length.p3, 
                       neighborhood,
                       sizes.allowed,
                       sizes.simulated,
                       fixed.estimates,
                       parallel = F,
                       cpus = 1) {
  
  num.nodes <- nrow(nodes)
  num.effects <- length(effects$names)
  
  # find a good starting point
  #first.partition <- find_startingpoint_single(nodes,sizes.allowed)
  first.partition <- partition
  
  # simulate a large sample with the estimates found in phase 2 
  if(parallel){
    
    sfExport("startingestimates", "first.partition", "nodes", "effects", "objects", "burnin", "thining", "length.p3", "cpus", "neighborhood", "sizes.allowed", "sizes.simulated")
    res <- sfLapply(1:cpus, fun = function(k) {
      set.seed(k)
      subres <- draw_Metropolis_single(estimates.phase2, first.partition, nodes, effects, objects, burnin, thining, ceiling(length.p3/cpus), neighborhood, sizes.allowed, sizes.simulated)
      return(subres)
    }
    )
    all.z <- c()
    for(k in 1:cpus) all.z <- rbind(all.z,res[[k]]$draws)
    length.p3 <- cpus * ceiling(length.p3/cpus)
    results.phase3 <- list("draws" = all.z, "last.partition" = res[[cpus]]$last.partition, "all.partitions" = NULL) 
    
  }else{
    
    results.phase3 <- draw_Metropolis_single(estimates.phase2, first.partition, nodes, effects, objects, burnin, thining, length.p3, neighborhood, sizes.allowed, sizes.simulated)
  }
  z.phase3 <- results.phase3$draws
  
  # calculate autocorrelation to check afterhand
  autocors <- rep(0,num.effects)
  for(e in 1:num.effects){
    autocors[e] <- cor(results.phase3$draws[1:(length.p3-1),e],results.phase3$draws[2:length.p3,e])
  }
  print("Autocorrelations in phase 3:")
  print(autocors)

  
  # hack for size constraints
  if(!is.null(sizes.allowed)){
    length.p3 <- nrow(z.phase3)
    print("new length of phase 3")
    print(length.p3)
  }
  
  # calculations of phase 3: mean, sd, se, conv ratios
  res.phase3 <- phase3(estimates.phase2, z.phase3, z.obs, nodes, effects, length.p3, fixed.estimates)
  
  return(list("means" = res.phase3$finalmean, 
              "standard.deviations" = res.phase3$finalsd, 
              "standard.errors" = res.phase3$finalse, 
              "convergence.ratios" = res.phase3$finalconvratios,
              "inv.zcov" = res.phase3$inv.zcov,
              "inv.scaling" = res.phase3$inv.scaling,
              "autocorrelations" = autocors))
  
}


# Phase 3 for multiple partitions
run_phase3_multiple <- function(partitions,
                              estimates.phase2, 
                              z.obs, 
                              presence.tables, 
                              nodes, 
                              effects, 
                              objects, 
                              burnin, 
                              thining,
                              a.scaling,
                              length.p3, 
                              neighborhood,
                              sizes.allowed,
                              sizes.simulated,
                              fixed.estimates,
                              parallel = F,
                              cpus = 1) {
  
  num.nodes <- nrow(nodes)
  num.effects <- length(effects$names)
  
  # find a good starting point
  #first.partitions <- find_startingpoint_multiple(presence.tables,nodes,sizes.allowed)
  first.partitions <- partitions
  
  # simulate a large sample with the estimates found in phase 2 
  if(parallel){
    
    sfExport("startingestimates", "first.partitions", "presence.tables", "nodes", "effects", "objects", "burnin", "thining", "length.p3", "cpus", "neighborhood", "sizes.allowed", "sizes.simulated")
    res <- sfLapply(1:cpus, fun = function(k) {
      set.seed(k)
      subres <- draw_Metropolis_multiple(estimates.phase2, first.partitions, presence.tables, nodes, effects, objects, burnin, thining, ceiling(length.p3/cpus), neighborhood, sizes.allowed, sizes.simulated, return.all.partitions = T)
      return(subres)
    }
    )
    all.z <- c()
    all.partitions <- list()
    for(k in 1:cpus) {
      all.z <- rbind(all.z,res[[k]]$draws)
      all.partitions[[k]] <- res[[k]]$all.partitions
    }
    length.p3 <- cpus * ceiling(length.p3/cpus)
    results.phase3 <- list("draws" = all.z, "last.partitions" = res[[cpus]]$last.partitions, "all.partitions" = all.partitions) 
  
  }else{
    
    results.phase3 <- draw_Metropolis_multiple(estimates.phase2, first.partitions, presence.tables, nodes, effects, objects, burnin, thining, length.p3, neighborhood, sizes.allowed, sizes.simulated, return.all.partitions = T)
  
  }
  z.phase3 <- results.phase3$draws
  
  # calculate autocorrelation to check afterhand
  autocors <- rep(0,num.effects)
  for(e in 1:num.effects){
    autocors[e] <- cor(results.phase3$draws[1:(length.p3-1),e],results.phase3$draws[2:length.p3,e])
  }
  print("Autocorrelations in phase 3:")
  print(autocors)
  
  # hack for size constraints
  if(!is.null(sizes.allowed)){
    length.p3 <- nrow(z.phase3)
    print("new length of phase 3")
    print(length.p3)
  }
  
  # calculations of phase 3: mean, sd, se, conv ratios
  res.phase3 <- phase3(estimates.phase2, z.phase3, z.obs, nodes, effects, length.p3, fixed.estimates)
  
  return(list("draws" = z.phase3, 
              "means" = res.phase3$finalmean, 
              "standard.deviations" = res.phase3$finalsd, 
              "standard.errors" = res.phase3$finalse, 
              "convergence.ratios" = res.phase3$finalconvratios,
              "inv.zcov" = res.phase3$inv.zcov,
              "inv.scaling" = res.phase3$inv.scaling,
              "autocorrelations" = autocors,
              "all.partitions" = results.phase3$all.partitions))
  
}


# Phase 3 for multiple partitions
run_phase3_multiple_secondparallel <- function(partitions,
                                estimates.phase2, 
                                z.obs, 
                                presence.tables, 
                                nodes, 
                                effects, 
                                objects, 
                                burnin, 
                                thining,
                                a.scaling,
                                length.p3, 
                                neighborhood,
                                sizes.allowed,
                                sizes.simulated,
                                fixed.estimates,
                                parallel = F,
                                cpus = 1) {
  
  num.nodes <- nrow(nodes)
  num.effects <- length(effects$names)
  
  # find a good starting point
  #first.partitions <- find_startingpoint_multiple(presence.tables,nodes,sizes.allowed)
  first.partitions <- partitions
  
  # simulate a large sample with the estimates found in phase 2 
  results.phase3 <- draw_Metropolis_multiple_secondparallel(estimates.phase2, first.partitions, presence.tables, nodes, effects, objects, burnin, thining, length.p3, neighborhood, sizes.allowed, sizes.simulated, parallel, cpus)
  z.phase3 <- results.phase3$draws
  
  # calculate autocorrelation to check afterhand
  autocors <- rep(0,num.effects)
  for(e in 1:num.effects){
    autocors[e] <- cor(results.phase3$draws[1:(length.p3-1),e],results.phase3$draws[2:length.p3,e])
  }
  print("Autocorrelations in phase 3:")
  print(autocors)
  
  # hack for size constraints
  if(!is.null(sizes.allowed)){
    length.p3 <- nrow(z.phase3)
    print("new length of phase 3")
    print(length.p3)
  }
  
  # calculations of phase 3: mean, sd, se, conv ratios
  res.phase3 <- phase3(estimates.phase2, z.phase3, z.obs, nodes, effects, length.p3, fixed.estimates)
  
  return(list("means" = res.phase3$finalmean, 
              "standard.deviations" = res.phase3$finalsd, 
              "standard.errors" = res.phase3$finalse, 
              "convergence.ratios" = res.phase3$finalconvratios,
              "inv.zcov" = res.phase3$inv.zcov,
              "inv.scaling" = res.phase3$inv.scaling,
              "autocorrelations" = autocors))
  
}


# function for the core calculations of phase 3: estimate means, s.d., s.e., and convergence ratios
phase3 <- function(estimates.phase2,
                   z.phase3,
                   z.obs,
                   nodes,
                   effects,
                   length.p3,
                   fixed.estimates){
  
  num.nodes <- nrow(nodes)
  num.effects <- length(effects$names)
  
  # mean statistics
  finalmean <- rep(0,num.effects)
  for(e in 1:num.effects) {
    finalmean[e] <- Reduce('+',z.phase3[,e]) / length.p3
  }
  
  # standard deviations
  finalsd <- rep(0, num.effects)
  for(e in 1:num.effects) {
    statse <- rep(0,length.p3)
    for(i in 1:length.p3) {
      statse[i] <- z.phase3[i,e]
    }
    finalsd[e] <- sd(statse)
  }
  
  #covariance matrix
  inverted_matrices <- calculate_inverted_covariance_and_scaling(estimates.phase2, 
                                                                 z.obs, 
                                                                 nodes, 
                                                                 effects, 
                                                                 objects, 
                                                                 a.scaling = 0,
                                                                 length.phase = length.p3, 
                                                                 z.phase = z.phase3,
                                                                 fixed.estimates)
  inv.zcov <- inverted_matrices$inv.zcov
  inv.scaling <- inverted_matrices$inv.scaling
  
  #finalse <- finalsd / sqrt(length.p3)
  finalse <- sqrt(diag(inv.zcov))
  
  # convergence ratios
  finalconvratios <- (finalmean - z.obs) / finalsd
  
  print("Estimated statistics after phase 3")
  print(finalmean)
  print("Estimates after phase 3")
  print(estimates.phase2)
  
  return(list("finalmean" = finalmean,
              "finalsd" = finalsd,
              "finalse" = finalse,
              "finalconvratios" = finalconvratios,
              "inv.zcov" = inverted_matrices$inv.zcov,
              "inv.scaling" = inverted_matrices$inv.scaling))
}