R/n_occupancy_function.R

Defines functions n_occupancy_function

n_occupancy_function <- function(hv_list, classification = NULL, method = "subsample", FUN = mean, num.points.max = NULL, verbose = TRUE,
                                 distance.factor = 1, check.hyperplane = FALSE, box_density = 5000, print_log = FALSE, append = TRUE, 
                                 iteration = NULL, thin = FALSE, quant.thin = 0.5, seed = NULL, path = "."){
  
  # check if hv_list is of class HypervolumeList
  if(! inherits(hv_list, "HypervolumeList")){
    stop("An object of class HypervolumeList is needed.")
  }
  
  if(! is.null(classification)){
    if(length(hv_list@HVList) != length(classification)){
      stop("The length of hv_list must be the same as the length of classification.")
    }
  }
  

  if(!is.null(seed)){
    # get random state from global environment
    old_state <- get0(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
    
    set.seed(seed)
    
    on.exit({
      # restore random state
      if (!is.null(old_state)) {
        assign(".Random.seed", old_state, envir = .GlobalEnv, inherits = FALSE)
      }
    })
    
  }
  

  
  # store some properties of the hypervolumes stored in hv_list:
  # dimensionality of random points, volume, density, names
  np_list <- unlist(lapply(hv_list@HVList, function(x) nrow(x@RandomPoints)))
  vol_list <- unlist(lapply(hv_list@HVList, function(x) x@Volume))
  hv_point_density_list <- np_list / vol_list
  dimhv_list <- unlist(lapply(hv_list@HVList, function(x) ncol(x@RandomPoints)))
  hv_names_list <- unlist(lapply(hv_list@HVList, function(x) x@Name))
  Data <- lapply(hv_list@HVList, function(x) x@Data)
  
  if(any(is.nan(hv_point_density_list))) {
    hv_point_density_list[is.nan(hv_point_density_list)] <- NA
  }
  
  if (length(unique(dimhv_list)) > 1){
    stop('Dimensionality of hypervolumes is not the same.')
  }
  
  

  
  if (check.hyperplane){
    for(i in 1:length(hv_list@HVList)){
      hv_df <- as.data.frame(hv_list[[i]]@Data)
      
      res <- findLinearCombos(hv_df)
      
      if (res[2] != "NULL") {
        warning("Some data is hyperplanar")
      }
    }
  }
  
  
  
  
  # This should check if dimhv_list contains only as single value
  # I think it will work in most cases. It will provide strange results when,
  # for example, dimhv_list is a vector of length 1 containing only a 0.
  
  if(length(unique(dimhv_list)) > 1) stop("Dimensionality of hypervolumes is not the same.")
  
  # Extract the method with lapply and check if they are the same
  # I would put a stop here, but feel free to modify if needed
  
  method_check <- unlist(lapply(hv_list@HVList, function(x) x@Method))
  if(length(unique(method_check)) > 1) stop("Hypervolumes building method is not the same.")
  
  
  #calculate max number of points according to dimensionality 
  if (is.null(num.points.max)) {
    num.points.max = ceiling(10^(3 + sqrt(unique(dimhv_list))))
    if (verbose & identical(method, "subsample")) {
      cat(sprintf("Choosing num.points.max=%.0f (use a larger value for more accuracy.)\n", 
                  num.points.max))
    }
  }
  
  
  #get the density of the HV with minimum points density
  density_list <- c()
  for (i in 1:length(hv_point_density_list)){
    density_list <- c(density_list, hv_point_density_list[i], num.points.max/hv_list[[i]]@Volume)
    mindensity = min(density_list, na.rm = TRUE)
  }
  
  if (verbose & identical(method, "subsample")) {
    cat(sprintf("Using minimum density of %f\n", mindensity))
  }
  
  
  #for each HV, keep the number of points that gives the same density to each HV according to their volume
  numpointstokeep_hv_list <- c()
  for (i in 1:length(hv_list@HVList)){
    numpointstokeep_hv <- floor(mindensity*vol_list[i])
    numpointstokeep_hv_list <- c(numpointstokeep_hv_list, numpointstokeep_hv)
  }
  
  
  ####################################################################################################################
  ###   check that no HV is  HV are empty
  ####################################################################################################################
  
  is_any_pointstokeep_null = FALSE
  nopointstokeep <- c()
  
  for (i in 1:length(numpointstokeep_hv_list)){
    if (numpointstokeep_hv_list[i] == 0 | is.null(numpointstokeep_hv_list[i])){
      is_any_pointstokeep_null = TRUE
      nopointstokeep <- c(nopointstokeep, i)
    }
  }
  
  if (is_any_pointstokeep_null == TRUE){
    stop(paste0("hv",nopointstokeep,"has no random points and is empty."))
    
  }
  
  #####################################################################################################################  
  #####################################################################################################################
  
  
  #subsample all hypervolumes to get same points density
  hv_points_ss_list <- list()
  
  for (i in 1:length(hv_list@HVList)){
    # randomly select points with as many points as the numpointstokeep is equal to
    hv_points_ss <- hv_list[[i]]@RandomPoints[sample(1:np_list[i], size = numpointstokeep_hv_list[i]), , drop = FALSE] 
    hv_points_ss_list[[i]] <- hv_points_ss
  }
  
  dim <- dimhv_list[1]
  
  point_density = nrow(hv_points_ss_list[[1]])/hv_list[[1]]@Volume
  
  # average point spacing
  cutoff_dist = point_density^(-1/dim) * distance.factor
  
  
  #####################################################################################################################
  #####################################################################################################################
  
  ### SUBSAMPLE METHOD ###
  
  if(identical(method, "subsample")){
    
    # collate individual subsampled Random Points in a single matrix
    total_hv_points_ss  <- do.call(rbind,  hv_points_ss_list)
    
    # creating the kd tree
    tree <- kdtree_build(total_hv_points_ss, verbose = verbose)
    
    # set variables for using kdtree_ball_query_multiple
    points_numeric <- t(total_hv_points_ss)
    nr = nrow(total_hv_points_ss)
    nc = ncol(total_hv_points_ss)
    radius <- cutoff_dist
    
    # perform kdtree_ball_query_multiple, to calculate how many points are present in a n-dimensional ball of radius 
    # equal to cutoff_dist
    result_id <- kdtree_ball_query_multiple(tree, points_numeric, nr, nc, radius, verbose)
    
    # remove the kd tree after having performed kdtree_ball_query_multiple
    rm(tree)
    
    # determine how many random points points we need to keep, basically it is the mean number of
    # random points 
    num_points_to_sample_in_intersection = nrow(total_hv_points_ss) / length(hv_list@HVList)
    
    # select points inversely proportional to th result of kdtree_ball_query_multiple. Points that overlap with many
    # other points have less probability to be subsampled
    
    to_keep <- sample(1:nrow(total_hv_points_ss), num_points_to_sample_in_intersection, prob = 1 / result_id)
    total_hv_points_ss <- total_hv_points_ss[to_keep, ]
  }
  
  
  #####################################################################################################################
  #####################################################################################################################
  
  ### BOX METHOD ###
  
  if(identical(method, "box")){
    
    # collate individual Random Points in a single matrix. Compared to the subsample method all the points are retained
    total_hv_points_ss  <- do.call(rbind, lapply(hv_list@HVList, function(x) x@RandomPoints))
    
    # Create a box that contains the all the points of all the hypervolumes
    # deermine a range augmented by 1.05 to avoid edge effects
    random_box <- apply(total_hv_points_ss, 2, range)*1.05
    
    # determine the volume of the box
    random_box_vol <- prod(apply(random_box, 1, function(x) sum(abs(x))))
    
    # determine how many random points we need to hae given the volume
    random_box_points <- floor(box_density*random_box_vol) 
    
    # build the random box
    total_hv_points_ss <- apply(random_box, 2, function(x) runif(random_box_points, x[1], x[2]))
    
    
  }
  
  
  
  
  #####################################################################################################################
  #####################################################################################################################
  
  
  if (verbose) {
    cat("Beginning ball queries... \n")
  }
  
  
  #####################################################################################################################
  #####################################################################################################################
  
  
  #compare the set to the resampled HVs, individually 
  
  if (verbose){
    pb <- progress_bar$new(total = length(hv_points_ss_list))
    pb$tick(0)
  }
  
  final_points_intersection_list <- vector(mode = "list", length(hv_points_ss_list))
  
  for (i in 1:length(hv_points_ss_list)){
    
    if (verbose){
      if (!pb$finished){
        pb$update(i/length(hv_points_ss_list))
      }
    }
    
    final_points_intersection_list[[i]] <- evalfspherical(data = hv_points_ss_list[[i]], radius = cutoff_dist, 
                                                          points =  total_hv_points_ss, verbose = verbose)
    
  } 
  
  if (verbose){
    pb$terminate()
  }
  
  res <- do.call("cbind", final_points_intersection_list)
  
  total_hv_points_ss <- total_hv_points_ss[rowSums(res) > 0, ]
  res <- res[rowSums(res) > 0, ]
  
  if(thin){
    
    if(verbose){
      cat("\nthin = TRUE, this can take a long time to run...\n")
    }
    
    points_numeric = t(as.matrix(total_hv_points_ss))
    nr = nrow(total_hv_points_ss)
    nc = ncol(total_hv_points_ss)
    
    tree <- kdtree_build(total_hv_points_ss, verbose = verbose)
    radius = cutoff_dist
    
    result_id <- kdtree_ball_query_id_multiple(tr = tree, ptlist = points_numeric, nr = nr, nc = nc, r = radius, verb = verbose)
    rm(tree)
    
    if(verbose){
      pb <- progress_bar$new(total = nr)
      cat("\nFinding optimal threshold distance...\n")
    }
    
    res_dist <- rep(NA, length(result_id))
    for(i in 1:length(result_id)){
      res_dist[i] <- mean(dist(total_hv_points_ss[result_id[[i]] + 1,, drop = FALSE]), na.rm = TRUE)
      if(verbose){
        pb$tick()
      }
      
    }
    
    if(verbose){
      pb$terminate()
    }
    
    
    est_thresh <- quantile(res_dist, quant.thin, na.rm = TRUE)
    
    tree <- kdtree_build(total_hv_points_ss, verbose = verbose)
    result_id <- kdtree_ball_query_id_multiple(tr = tree, ptlist = points_numeric, nr = nr, nc = nc, r = est_thresh, verb = verbose)
    rm(tree)
    
    rp_id <- c(1:nr)
    store <- c()
    
    
    
    if(verbose){
      pb <- progress_bar$new(total = nr)
      cat("\nSubsetting random points to a threshold distance of", est_thresh, "...\n")
    }
    
    for(i in 1:nr){

      if(verbose){
        pb$tick()
      }
      
      id_i <- c(result_id[[i]]+1)
      if(any(rp_id == i)){
        id_i <- id_i[id_i != i]
        rp_id <- rp_id[! rp_id %in% id_i]
        store <- c(store,i)} 
      else{
        next()
      }
    }
    
    if(verbose){
      pb$terminate()
    }
    
    
    total_hv_points_ss <- total_hv_points_ss[store, ] 
    res <- res[store, ]
  }
  
  
  
  
  
  rownames(total_hv_points_ss) <- NULL
  colnames(total_hv_points_ss) <- colnames(hv_list@HVList[[1]]@RandomPoints)
  res <- res[rowSums(res) > 0, ]
  res[res > 0]<- 1
  rownames(res) <- NULL
  
  total_hv_points_ss <- total_hv_points_ss
  final_points_intersection <- res
  
  intersection_weights <- sweep(final_points_intersection, 1, apply(final_points_intersection, 1, sum), "/")
  final_volume_intersection <- sum(apply(intersection_weights, 2, function(x) mean(x[x > 0])) * vol_list)
  final_density <- nrow(final_points_intersection) / final_volume_intersection 
  final_single_volumes <- apply(final_points_intersection, 2, sum) /  final_density
  
  if(print_log){
    if(! append){
      write.table(data.frame(input = vol_list, re_computed = final_single_volumes, ratio = vol_list/final_single_volumes),
                  "log_occupancy.txt", 
                  row.names = FALSE,
                  append = append)
    } else{
      suppressWarnings(write.table(data.frame(n = iteration, input = vol_list, re_computed = final_single_volumes, ratio = vol_list/final_single_volumes),
                  file = path, 
                  row.names = FALSE,
                  col.names = FALSE,
                  append = append)) }
  }
  
  if(verbose){
    message("===================")
    cat("comparing input and recomputed volumes:\n")
    cat("\n")
    cat("input -", "re-computed -", "ratio\n")
    
    for(i in 1:ncol(final_points_intersection)){
      a <- round(vol_list[i], 3)
      b <- round(final_single_volumes[i], 3)
      cat(a, "-", b, "-", round(a/b, 3), "\n")
      cat("------------------\n")
    }
    
    mae <- mean(abs(vol_list - final_single_volumes))
    rmse <- sqrt(mean((vol_list - final_single_volumes)^2))
    cat("\n")
    cat("MAE:", mae, "\n")
    cat("RMSE:", rmse, "\n")
    message("===================")
  }
  
  
  cn <- dimnames(hv_list@HVList[[1]]@RandomPoints)[[2]]
  dn <- list(NULL, cn)
  
  
  
  if(is.null(classification)){
    Data <- unique(do.call("rbind", Data))
    #generate a new HV corresponding to the overlap between all the input HVs
    result = new("Hypervolume")
    result@Method = "n_occupancy"
    result@Data = Data
    result@Dimensionality = dim
    result@Volume = final_volume_intersection
    result@PointDensity = final_density
    if(thin){
      qthin <- quant.thin
    } else {
      qthin <- NULL
    }
    result@Parameters = list(method = method, 
                             box_density = switch(identical(method, "subsample") + 1, box_density, NULL),
                             distance.factor = distance.factor,
                             check.hyperplane = check.hyperplane,
                             num.points.max = num.points.max,
                             classification = classification,
                             FUN = FUN,
                             thin = thin,
                             quant.thin = qthin,
                             seed = seed)
    
    result@RandomPoints = matrix(total_hv_points_ss, ncol = dim)
    result@ValueAtRandomPoints = apply(final_points_intersection, 1, FUN)
    result@Name = "occupancy"

    
    # set dimnames
    dimnames(result@RandomPoints) = dn
    dimnames(result@Data) = dn
    
  }
  
  if(!is.null(classification)){
    
    # split hypervolumes by classification and then apply the aggregation function
    unique_groups <- unique(classification)
    #hv_list_split <- split(hv_list@HVList, classification)
    
    # create an empty list to store the results 
    hv_list_res <- vector(length(unique_groups), mode = "list")
    
    # create an empty hypervolume as a model for storing results
    empty_hypervolume <- new("Hypervolume")
    
    
    for(i in 1:length(unique_groups)){
      
      # extract final points intersection of the i-th group for further calculation
      result <- final_points_intersection[, classification == unique_groups[i], drop = FALSE]
      
      # merge data points of the hypervolumes of the i-th group
      # unique because some data points can be shared among the hypervolumes
      data_merged <- unique(do.call(rbind, Data[classification == unique_groups[i]]))
      
      # calculate the hypervolume as the number of points or results that are greater than 0 times mindsensity
      #res_vol <-  nrow(result[rowSums(result) > 0, ]) / nrow(result) * final_volume_intersection
      
      # volume is the sum of hypervolumes minus the intersections
      res_vol <- nrow(result[rowSums(result) > 0, , drop = FALSE]) / nrow(final_points_intersection) * final_volume_intersection
      
      # apply FUN, mean as the default, to result. This calculates the occupancy aka how many hypervolumes include
      # a given random point
      empty_hypervolume@ValueAtRandomPoints <- apply(final_points_intersection[, classification == unique_groups[i], drop = FALSE], 1, FUN)
      
      # assign a method      
      empty_hypervolume@Method <- "n_occupancy"
      
      # assign the name of the i-th to the hypervolume
      empty_hypervolume@Name <- unique_groups[i]
      
      # assign data points as did at line 241
      empty_hypervolume@Data <-  data_merged
      empty_hypervolume@Dimensionality <- dim
      if(thin){
        qthin <- quant.thin
      } else {
        qthin <- NULL
      }
      empty_hypervolume@Parameters = list(method = method, 
                                          box_density = switch(identical(method, "subsample") + 1, box_density, NULL),
                                          distance.factor = distance.factor,
                                          check.hyperplane = check.hyperplane,
                                          num.points.max = num.points.max,
                                          classification = classification,
                                          FUN = FUN,
                                          thin = thin,
                                          quant.thin = qthin,
                                          seed = seed)

      
      # Random points are common to each group. Random points whose ValueAtRandomPoints is 0 are not
      # removed. This is for assuring the correct behaviour of furter calculations.
      
      empty_hypervolume@RandomPoints = matrix(as.matrix(as.data.frame(total_hv_points_ss)), 
                                              ncol = dim)
      empty_hypervolume@Volume <- res_vol
      empty_hypervolume@PointDensity = nrow(result[rowSums(result) > 0, , drop = FALSE]) / res_vol
      
      # set dimnames
      dimnames(empty_hypervolume@RandomPoints) = dn
      dimnames(empty_hypervolume@Data) = dn
      
      hv_list_res[[i]] <- empty_hypervolume
      
    }
    
    
    # if classification is NULL an object of class Hypervolume is created
    # otherwise an object of class HypervolumeList is returned
    
    if(length(unique_groups) == 1){
      result <- empty_hypervolume
    } else {
      hv_list_res <- new("HypervolumeList", HVList = hv_list_res)
      result <- hv_list_res
    }
    
  }
  
  result
}
bblonder/hypervolume documentation built on May 3, 2024, 8:45 a.m.