R/setse_auto.R
In rsetse: Strain Elevation Tension Spring Embedding

Documented in setse_auto

#' SETSe embedding with automatic drag and timestep selection
#'
#' Embeds/smooths a feature network using the SETSe algorithm automatically finding convergence parameters using a grid search.
#' 
#' @param g An igraph object
#' @param force A character string. This is the node attribute that contains the force the nodes exert on the network.
#' @param distance A character string. The edge attribute that contains the original/horizontal distance between nodes.
#' @param edge_name A character string. This is the edge attribute that contains the edge_name of the edges.
#' @param k A character string. This is k for the moment don't change it.
#' @param tstep A numeric. The time interval used to iterate through the network dynamics.
#' @param mass A numeric. This is the mass constant of the nodes in normalised networks this is set to 1.
#' @param max_iter An integer. The maximum number of iterations before stopping. Larger networks usually need more iterations.
#' @param tol A numeric. The tolerance factor for early stopping.
#' @param sparse Logical. Whether or not the function should be run using sparse matrices. must match the actual matrix, this could prob be automated
#' @param hyper_iters integer. The hyper parameter that determines the number of iterations allowed to find an acceptable convergence value.
#' @param drag_min integer. A power of ten. The lowest drag value to be used in the search
#' @param drag_max integer. A power of ten. if the drag exceeds this value the tstep is reduced
#' @param tstep_change numeric. A value between 0 and 1 that determines how much the time step will be reduced by default value is 0.5
#' @param hyper_tol numeric. The convergence tolerance when trying to find the minimum value
#' @param hyper_max integer. The maximum number of iterations that SETSe will go through whilst searching for the minimum.
#' @param sample Integer. The dynamics will be stored only if the iteration number is a multiple of the sample. 
#'  This can greatly reduce the size of the results file for large numbers of iterations. Must be a multiple of the max_iter
#' @param static_limit Numeric. The maximum value the static force can reach before the algorithm terminates early. This
#' prevents calculation in a diverging system. The value should be set to some multiple greater than one of the force in the system.
#' If left blank the static limit is the system absolute mean force.
#' @param verbose Logical. This value sets whether messages generated during the process are suppressed or not.
#' @param include_edges logical. An optional variable on whether to calculate the edge tension and strain. Default is TRUE.
#'  included for ease of integration into the bicomponent functions.
#' @param noisy_termination Stop the process if the static force does not monotonically decrease.
#' 
#' @details This is one of the most commonly used SETSe functions. It automatically selects the convergence time-step and drag values
#' to ensure efficient convergence.
#' 
#' The noisy_termination parameter is used as in some cases the convergence process can get stuck in the noisy zone of SETSe space.
#' To prevent this the process is stopped early if the static force does not monotonically decrease.  On large networks this 
#' greatly speeds up the search for good parameter values. It increases the chance of successful convergence. 
#' More detail on auto-SETSe can be found in the paper "The spring bounces back" (Bourne 2020).
#' 
#' @return A list containing 5 dataframes.
#' \enumerate{
#'   \item The network dynamics describing several key figures of the network during the convergence process, this includes the static_force
#'   \item The node embeddings. Includes all data on the nodes the forces exerted on them position and dynamics at simulation termination
#'   \item time taken. the amount of time taken per component, includes the edge and nodes of each component
#'   \item The edge embeddings. Includes all data on the edges as well as the strain and tension values.
#'   \item memory_df A dataframe recording the iteration history of the convergence of each component.
#' }
#' 
#' @family setse
# @seealso \code{\link{setse_auto}} \code{\link{setse}}
#' @examples
#' 
#' set.seed(234) #set the random see for generating the network
#' g <- generate_peels_network(type = "E")
#' embeddings <- g %>%
#' prepare_edges(k = 500, distance = 1) %>%
#' #prepare the network for a binary embedding
#' prepare_categorical_force(., node_names = "name",
#'                      force_var = "class") %>%
#' #embed the network using auto_setse
#'   setse_auto(., force = "class_A")
#' 
#' @export

setse_auto <- function(g, 
                       force ="force", 
                       distance = "distance", 
                       edge_name = "edge_name",
                       k = "k",
                       tstep = 0.02, 
                       mass = 1, 
                       max_iter = 100000, 
                       tol = 2e-3,
                       sparse = FALSE,
                       hyper_iters = 100,
                       hyper_tol  = 0.01,
                       hyper_max = 30000,
                       drag_min = 0.01,
                       drag_max = 100,
                       tstep_change = 0.2,
                       sample = 100,
                       static_limit = NULL,
                       verbose = FALSE,
                       include_edges = TRUE,
                       noisy_termination = TRUE){
  
  #The more negative the log ratio becomes the larger the drag ratio becomes
  
  #mass <- ifelse(is.null(mass), mass_adjuster(g, force = force, resolution_limit = TRUE), mass)
  
  if(verbose){print("prepping dataset")}
  #Prep the data before going into the converger
  Prep <- setse_data_prep(g = g, 
                          force = force, 
                          distance = distance, 
                          mass = mass, 
                          edge_name = edge_name,
                          k = k,
                          sparse = sparse)
  
  if(is.null(static_limit)){
    static_limit <- sum(abs(igraph::vertex_attr(g, force)))
  }
  
  #This is a safety feature!
  #makes sure the static limit is not smaller than the machine precision. If it is the static limit is set to half
  #the tolerance making the simulation terminate at the first opportunity
  #print(paste("static limit greater than eps", static_limit> .Machine$double.eps^0.5))
  res_stat_limit <- ifelse(static_limit> .Machine$double.eps^0.5, static_limit, tol/2)
  
  #This print out can be deleted, it is only here for debugging reasons
  #print(paste("static_limit", res_stat_limit ))
  
  memory_df<-tibble::tibble(iteration = 1:(2+hyper_iters),
                    error = NA,
                    perc_change = NA,
                    log_ratio = NA,
                    common_drag_iter = NA,
                    tstep = NA,
                    direction = 1,
                    target_area = NA,
                    res_stat = NA,
                    upper = NA,
                    lower = NA,
                    best_log_ratio =NA,
                    stable = NA)
  #set initial round data
  memory_df$log_ratio[1] <- 0
  memory_df$common_drag_iter[1] <- 0
  memory_df$tstep[1] <- tstep
  memory_df$error[1] <- drag_max
  memory_df$best_log_ratio[1] <- 0
  memory_df$direction[1] = 1 #increasing
  memory_df$target_area[1] <- FALSE
  memory_df$res_stat[1] <- ifelse(res_stat_limit<tol, 2*tol, res_stat_limit) #this ensures setse is run at least once
  #The above conduition is only activated on components with very small amounts of force
  #These two are the limits of the log ratio. they are set to +/- infinity
  memory_df$upper[1] <- -log10(drag_max/tstep)
  memory_df$lower[1] <-  Inf
  memory_df$upper[2] <- -log10(drag_max/tstep)
  memory_df$lower[2] <- Inf
  #This calculates the percentage change it is set to 1 by default for all values of res_stat =2
  memory_df$perc_change[1] <- 1
  memory_df$perc_change[2] <- 1
  #This is whether the iteration is stable or not
  memory_df$stable[1] <- FALSE
  memory_df$stable[2] <- FALSE
  
  drag_iter<- 1
  
  #The vector of drag values to search through
  #If the target zone is not found the time step is reduced and the search repeated.
  #This process continues until the target zone is found or hyper_iters < drag_iter
  drag_values <- seq(round(log10(drag_min)), round(log10(drag_max)), by = 1)
  drag_value_index <- 1

  #the tstep value that is passed to setse_core.
  #This value is adapted if no suitable drag coeffeicient is found
  tstep_adapt <- tstep
  
  #The ratio of the drag coefficient over the timestep, the result is then log10'ened
  memory_df$log_ratio[drag_iter+1] <- log10(tstep) - drag_values[drag_value_index]  
  #The first drag value is simply the lowest drag value
  memory_df$common_drag_iter[drag_iter+1] <- 10^drag_values[drag_value_index]
  
  min_error <- which.min(memory_df$error)
  
  if(verbose){print("beginning embeddings search")}
  
  #For the loop to continue the following conditions must all hold
  #1 The number of iterations the loop has gone through has to be smaller than the hyper_iters variable 
  #2 The residual static force has to be bigger than the minimum system tolerance
  #3 The las two rounds cannot both be stable
  
  #The below commented code is useful in some debugging situations
  
  # print(paste("Iters smaller than max",(drag_iter <= hyper_iters),
  #         "res_stat > tol or NA", ifelse(is.na(memory_df$res_stat[drag_iter]>tol), TRUE, memory_df$res_stat[drag_iter]>tol),
  #         "sims unstable", !all(memory_df$stable[c(drag_iter, drag_iter-1)]))
  #       )
  
  # while(drag_iter<5){  
  while((drag_iter <= hyper_iters) &
        ifelse(is.na(memory_df$res_stat[drag_iter]>tol), TRUE, memory_df$res_stat[drag_iter]>tol) &
        !all(memory_df$stable[c(drag_iter, drag_iter-1)])
  ){
    drag_iter <- drag_iter+1    

    #If the drag exceeds the max reset to minimum and divide the time by value x
    #This should never be needed but just in case it is here.
    #can be removed after testing to make everything easier
    if(memory_df$common_drag_iter[drag_iter] > drag_max){
      
      drag_value_index <- 1
      tstep_adapt <- tstep_adapt*tstep_change
      
      #move on to the next drag value and log ratio
      memory_df$log_ratio[drag_iter+1] <- drag_values[drag_value_index] - log10(tstep_adapt)
      
      memory_df$common_drag_iter[drag_iter] <- 10^drag_values[drag_value_index]
      
    } 
    
    # print( memory_df$common_drag_iter[drag_iter])
    embeddings_data <- setse_core(
      node_embeddings = Prep$node_embeddings, 
      ten_mat = Prep$ten_mat, 
      non_empty_matrix = Prep$non_empty_matrix, 
      kvect = Prep$kvect, 
      dvect = Prep$dvect, 
      mass = mass,
      tstep = tstep_adapt, 
      max_iter = hyper_max, 
      coef_drag = memory_df$common_drag_iter[drag_iter],
      tol = tol,
      sparse = sparse,
      sample = sample,
      static_limit = static_limit,
      noisy_termination = noisy_termination) 
    
    node_embeds <- embeddings_data$node_embeddings
    
    memory_df$tstep[drag_iter] <- tstep_adapt
    
    memory_df$res_stat[drag_iter] <- sum(abs(node_embeds$static_force))
    #In certain circumstances setse core terminates straight away producing NA values.
    #The below line prevents NA values causing issues by ensureing that the params only produces a max res_stat
    memory_df$res_stat[drag_iter] <- ifelse(is.na(memory_df$res_stat[drag_iter]), res_stat_limit, memory_df$res_stat[drag_iter])
    
    #set the residual static force to res_stat_limit if it exceeds this value
    memory_df$res_stat[drag_iter] <- ifelse(memory_df$res_stat[drag_iter]>res_stat_limit, 
                                            res_stat_limit ,
                                            memory_df$res_stat[drag_iter])
    
    #when noisy_termination =TRUE. processes that terminate due to noisy behaviour are treated as if they had exceeded the
    #static limit
    #note noisy behaviour is a system that does not have a monotonic reduction in static force
    if(noisy_termination & nrow(embeddings_data$network_dynamics)>1){

      monotonic_decrease <- all(embeddings_data$network_dynamics$static_force[2:nrow(embeddings_data$network_dynamics)]<
      embeddings_data$network_dynamics$static_force[1:(nrow(embeddings_data$network_dynamics)-1)])
      
      memory_df$res_stat[drag_iter] <-  ifelse(monotonic_decrease, memory_df$res_stat[drag_iter] , res_stat_limit)
    }
    
    #Is the algo in the convex area?
    memory_df$target_area[drag_iter] <- memory_df$res_stat[drag_iter] < res_stat_limit
    
    #stores a temporary error value
    temp_error <- memory_df$res_stat[drag_iter] - tol
    
    #Log error is negative when small. Absolute difference is used to prevent NaNs if the true error is smaller than the tolerance
    memory_df$error[drag_iter] <- log10(abs(temp_error))
    
    #setting the limits
    #this is triggered when at least one hyperiteration is less than res_stat_limit
    if(sum(memory_df$target_area, na.rm = T)>=1){
      #print("A")
      #Identify row containing the minimum error
      min_error <- which.min(memory_df$error)
      #find log ratio that give the lowest error. there may be more than one
      upper_ratios <- memory_df$log_ratio[memory_df$log_ratio[min_error] >= memory_df$log_ratio] %>% unique()
      #find the log ratio that gives the maximum error aka the res_stat_limit, there are almost certainly more than one of those
      lower_ratios  <- memory_df$log_ratio[memory_df$log_ratio[min_error] <= memory_df$log_ratio] %>% unique()
      
      #This test returns a vector of length zero if the best value is the only unique value.
      #In that case another test needs to be performed.
      upper_check_1 <- !is.finite( upper_ratios[rank(-upper_ratios, ties.method =  "first")==2])
      #this checks the previous check is actually longer than 1
      upper_check_2 <- length(upper_check_1)>0
      upper_result_check <- ifelse(upper_check_2, upper_check_1, FALSE)
      
      #this is the same for the lower section
      lower_check_1 <- !is.finite( lower_ratios[rank(lower_ratios, ties.method =  "first" )==2])
      lower_check_2 <- length(lower_check_1)>0
      lower_result_check <- ifelse(lower_check_2, lower_check_1, FALSE)
      
      memory_df$upper[drag_iter+1] <-  ifelse(upper_result_check, 
                                              # ifelse(!is.finite(upper_result_check), 
                                              -log10(drag_max/tstep), 
                                              upper_ratios[rank(-upper_ratios, ties.method =  "first")==2])  
      
      memory_df$lower[drag_iter+1] <- ifelse(lower_result_check, 
                                             Inf, 
                                             lower_ratios[rank(lower_ratios, ties.method =  "first")==2])
      
      memory_df$best_log_ratio[drag_iter] <- memory_df$log_ratio[min_error]
      
      # percent change in res stat
      memory_df$perc_change[drag_iter] <- (memory_df$res_stat[drag_iter]-min(memory_df$res_stat[1:(drag_iter-1)]))/min(memory_df$res_stat[1:(drag_iter-1)])
      
    } else {
      #print("B")
      #finally if the search has never been in the target zone the upper and lower limits are the same Inf as before
      memory_df$upper[drag_iter+1] <- memory_df$upper[drag_iter] 
      memory_df$lower[drag_iter+1] <-  memory_df$lower[drag_iter]
      memory_df$best_log_ratio[drag_iter] <- memory_df$best_log_ratio[drag_iter-1]
      
      #The change is twice the hyper tolerance 
      memory_df$perc_change[drag_iter] <- 2*hyper_tol
      
    }
    #complete the stability part of the dataframe. This replaces any infinite values with 2*tol so that an error won't be thrown
    
    #If the change in res stat is smaller than the hyper tolerance the system is considered stable.
    #This is only ever used INSIDE the target zone
    memory_df$stable[drag_iter] <- ifelse(is.finite(memory_df$perc_change[drag_iter]), 
                                          abs(memory_df$perc_change[drag_iter]), #The absolute percent change otherwise large negative values count as stable
                                          hyper_tol*2 ) < hyper_tol
    
    ###
    #if you are in the target zone updates happen adaptively if not using a fixed step size
    ###
    if( sum(memory_df$target_area, na.rm = T)>0 ){
      #print("1")
      # print("upper limit found")
      #This makes the next drag ratio the mean of the upper and lower limit.
      #The first time the drag is in the target area the lower limit will be -Inf
      memory_df$log_ratio[drag_iter+1] <- (memory_df$upper[drag_iter+1] + memory_df$lower[drag_iter+1] )/2
      #print(memory_df$log_ratio[drag_iter+1] )
      #if the new log ratio has been used before, break the tie by using the mid point between
      #the best log ratio and the lower log ratio
      memory_df$log_ratio[drag_iter+1] <- ifelse(memory_df$log_ratio[drag_iter+1] %in% memory_df$log_ratio[memory_df$iteration <=drag_iter & 
                                                                                                             memory_df$tstep==tstep_adapt], 
                                                 (memory_df$lower[drag_iter] + memory_df$best_log_ratio[drag_iter])/2,
                                                 memory_df$log_ratio[drag_iter+1] )
      #print(memory_df$log_ratio[drag_iter+1] )
      #If the lower limit has not been found the log ratio will be infinite. 
      #In that case subtract the mean of the best value and the upper limit from the best value otherwise do nothing
      memory_df$log_ratio[drag_iter+1] <- ifelse(is.finite(memory_df$log_ratio[drag_iter+1]),
                                                 memory_df$log_ratio[drag_iter+1],
                                                 memory_df$best_log_ratio[drag_iter] - (memory_df$best_log_ratio[drag_iter] + memory_df$upper[drag_iter] )/2)
      
      #Calculate the common drag iteration for this round
      memory_df$common_drag_iter[drag_iter+1] <- 10^( -memory_df$log_ratio[drag_iter+1]) * tstep_adapt
      #print(memory_df$log_ratio[drag_iter+1] )
    } else {
      #print("2")
      #Before the non trivial upper and lower bound are found simple search is performed to find the target zone
      #fixed rate search across the plateau
      
        drag_value_index <-  drag_value_index + 1
        
        #If the index exceeds the total number of elements in the drag range
        #reset the to minimum drag value and reduce the timstep
 
        #if(10^drag_values[drag_value_index] > drag_max){
        if(drag_value_index > length(drag_values)) {
         
          drag_value_index <- 1
          tstep_adapt <- tstep_adapt*tstep_change
        }
        
      #move on to the next drag value and log ratio
      memory_df$log_ratio[drag_iter+1] <-  log10(tstep_adapt) - drag_values[drag_value_index] 
      
      memory_df$common_drag_iter[drag_iter+1] <- 10^(drag_values[drag_value_index])
    }
    

    if(verbose){
      # message_val <- ifelse(memory_df$res_stat[drag_iter] < memory_df$res_stat[drag_iter-1], 
      #                       "static force decreasing",  
      #                       "static force increasing or stable")
      # print(c((drag_iter <= hyper_iters), (memory_df$res_stat[drag_iter]>tol), !all(memory_df$stable[c(drag_iter, drag_iter-1)])))
      print(paste("Iteration",drag_iter, "drag value",
                  signif(memory_df$common_drag_iter[drag_iter],3),
                  "tstep", memory_df$tstep[drag_iter],
                  #message_val, 
                  "static force is",
                  signif(memory_df$res_stat[drag_iter],3), "target is", signif(tol,3)))
      
     # print(paste("log ratio",memory_df$log_ratio[drag_iter+1]  ,"next drag", memory_df$common_drag_iter[drag_iter+1]))
    }
  }
  
  #Keep only value that actually have a residual force
  memory_df <- memory_df %>%
    dplyr::filter(!is.na(res_stat))
  
  
  #If the smallest residual force is not less than the tolerance even after the minimum point hase been found
  #Then run the setse algo again using the best log ratio but for the maximum number of iterations
  #Ten percent is added to the drag score as the found value is likely to be noisy. This will speed up the convergence
  #If it slows it down by over damping it won't be too bad... I am open to changing this!
  if(min(memory_df$res_stat)>tol){
    
    #if any value has reduced the static_force use the best one.
    #otherwise use the last drag value as it doesn't matter anyway

    if(memory_df$best_log_ratio[nrow(memory_df)]==0){
      
      drag_val <- memory_df$common_drag_iter[nrow(memory_df)]
    } else{
      
      drag_val  <-10^( -memory_df$best_log_ratio[nrow(memory_df)]) * tstep_adapt
      
    }
    
    
    print(paste0("Minimum tolerance not exceeded, running setse on best parameters, drag value ", signif(drag_val, 3)))
    embeddings_data <- setse_core_time_shift(
      node_embeddings = Prep$node_embeddings, 
      ten_mat = Prep$ten_mat, 
      non_empty_matrix = Prep$non_empty_matrix, 
      kvect = Prep$kvect, 
      dvect = Prep$dvect, 
      mass = mass,
      tstep = tstep_adapt, 
      max_iter = max_iter, 
      coef_drag = drag_val, #uses the best/last coefficient of drag
      tol = tol,
      sparse = sparse,
      sample = sample,
      static_limit = static_limit,
      tstep_change = 0.5,
      dynamic_reset = TRUE) #This is false as something has to be produced by the algo. I can change later 
    
    
    
    
  }
  
  #This is TRUE in almost all cases, but for bicomp is set to false.
  if(include_edges){  
    #Put in edge embeddings
    embeddings_data$edge_embeddings <- calc_tension_strain(g = g,
                                                           embeddings_data$node_embeddings,
                                                           distance = distance, 
                                                           edge_name = edge_name, 
                                                           k = k)
  }
  
  #Add the search record
  embeddings_data$memory_df <- memory_df
  
  return(embeddings_data)
}