# inst/scripts/data.R

## Dataset01

# Generate points on a plane (a linear map of two uniform variables u and v)
# and append uniform noise dimensions.
#
# Arguments:
#   sample_size       Number of rows to generate.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   coefficient_x_1, coefficient_x_2, intercept_x
#                     x1 = coefficient_x_1 * u + coefficient_x_2 * v + intercept_x
#   coefficient_y_1, coefficient_y_2, intercept_y
#                     x2 = coefficient_y_1 * u + coefficient_y_2 * v + intercept_y
#   u_min, u_max      Range of the uniform variable u.
#   v_min, v_max      Range of the uniform variable v.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble with columns x1, x2 followed by the noise columns
# x3, x4, ...; the 1st, 3rd, ... noise columns are negated.
plane_2D <- function(sample_size = 100, with_seed = NULL, coefficient_x_1 = 1,
                     coefficient_x_2 = 1, coefficient_y_1 = -1, coefficient_y_2 = 1, intercept_x = -10,
                     intercept_y = 8, u_min = 10, u_max = 30, v_min = 10, v_max = 20, num_of_noise_dim = 2, min_noise = 0, max_noise = 1) {

  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  u <- stats::runif(sample_size, min = u_min, max = u_max)
  v <- stats::runif(sample_size, min = v_min, max = v_max)

  df <- tibble::tibble(
    x1 = coefficient_x_1 * u + coefficient_x_2 * v + intercept_x,
    x2 = coefficient_y_1 * u + coefficient_y_2 * v + intercept_y
  )

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset02

# Generate a 2-D curvilinear (cubic) pattern and append uniform noise
# dimensions.
#
# Arguments:
#   sample_size       Number of rows to generate.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble with x1 ~ U(0, 2), x2 = -(x1^3 + U(0, 3)) + U(0, 0.5) and
# the noise columns x3, x4, ...; the 1st, 3rd, ... noise columns are negated.
curvilinear_2D <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 2, min_noise = -1, max_noise = 1){
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  x <- stats::runif(sample_size, 0, 2)
  y <- -(x^3 + stats::runif(sample_size, 0, 3)) + stats::runif(sample_size, 0, 0.5)

  df <- tibble::tibble(x1 = x, x2 = y)

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset03

# Generate equally sized Gaussian clusters and optionally append uniform noise
# dimensions.
#
# Arguments:
#   sample_size     Total number of observations requested. When it is not a
#                   multiple of num_clusters a warning is raised and each
#                   cluster gets floor(sample_size / num_clusters) rows, so the
#                   result has fewer rows than requested.
#   with_seed       Optional value passed to set.seed(); NULL leaves the RNG
#                   state untouched.
#   num_clusters    Number of clusters.
#   mean_matrix     Matrix with one row per cluster and one column per
#                   dimension holding the cluster means.
#   var_vec         One spread value per cluster.
#                   NOTE(review): despite the name, the value is passed to
#                   rnorm() as `sd`, not as a variance - confirm intended.
#   num_dims        Number of (non-noise) dimensions; at least two.
#   num_noise_dims  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                   Range passed to runif() for every noise column.
#
# Returns a tibble with columns x1..x<num_dims> followed by the noise columns;
# the 1st, 3rd, ... noise columns are negated.
gaussian_clusters <- function(sample_size = 300, with_seed = NULL, num_clusters = 5, mean_matrix = rbind(c(1,0,0,0), c(0,1,0,0), c(0,0,1,0), c(0,0,0,1), c(0,0,0,0)),
                              var_vec = c(0.05, 0.05, 0.05, 0.05, 0.05), num_dims = 4, num_noise_dims = 0,
                              min_noise = -0.05, max_noise = 0.05) {

  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if (sample_size < num_clusters) {
    stop('Number of clusters exceed the number of observations.')
  }

  if ((num_dims == 0) || (num_dims == 1)) {
    stop('There should be at least two dimensions.')
  }

  if (dim(mean_matrix)[1] != length(var_vec)) {
    stop('The length of mean and variance vectors are different.')
  }

  if (dim(mean_matrix)[1] != num_clusters) {
    stop('There is not enough mean values for clusters.')
  }

  if (dim(mean_matrix)[2] != num_dims) {
    stop('There is not enough mean values for dimensions.')
  }

  if (length(var_vec) != num_clusters) {
    stop('There is not enough varaiance values for clusters.')
  }

  # Clusters are equally sized; an uneven split is floored with a warning
  if ((sample_size %% num_clusters) != 0) {
    warning("The sample size should be a product of number of clusters.")
    cluster_size <- floor(sample_size / num_clusters)
  } else {
    cluster_size <- sample_size / num_clusters
  }

  column_names <- paste0("x", seq_len(num_dims))

  # One tibble per cluster, drawn cluster by cluster and dimension by dimension
  cluster_list <- lapply(seq_len(num_clusters), function(i) {
    cluster_means <- unlist(mean_matrix[i, ], use.names = FALSE)

    dim_val_list <- lapply(seq_len(num_dims), function(j) {
      stats::rnorm(cluster_size, mean = cluster_means[j], sd = var_vec[i])
    })
    names(dim_val_list) <- column_names

    tibble::as_tibble(dim_val_list)
  })

  df <- dplyr::bind_rows(cluster_list)

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_noise_dims > 0) {
    # Use the realised row count: it is smaller than sample_size when the
    # cluster size had to be floored
    n_rows <- NROW(df)
    noise_index <- seq_len(num_noise_dims)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset04

# Generate points uniformly inside a hypercube of side length `l` centred at
# `center`, and append uniform noise dimensions.
#
# Arguments:
#   sample_size            Number of rows to generate.
#   with_seed              Optional value passed to set.seed(); NULL leaves the
#                          RNG state untouched.
#   num_of_effective_dims  Number of cube dimensions.
#   center                 Cube centre, one value per effective dimension
#                          (recycled to that length if shorter).
#   l                      Side length of the cube.
#   num_of_noise_dim       Number of noise columns to append (0 appends none).
#   min_noise, max_noise   Range passed to runif() for every noise column.
#
# Returns a data frame whose cube columns are named by make.names()
# ("X1", "X2", ...) and whose noise columns are named "x<k>", continuing the
# column numbering; the 1st, 3rd, ... noise columns are negated.
cube_3D_with_noise <- function(sample_size = 216, with_seed = NULL, num_of_effective_dims = 3, center=rep(0,num_of_effective_dims), l=1, num_of_noise_dim = 1,
                               min_noise = -0.1, max_noise = 0.1){

  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Uniform points in [-1, 1]^d, scaled to half the side length and shifted
  # column-wise to the requested centre
  unit_points <- matrix(
    stats::runif(sample_size * num_of_effective_dims, -1, 1),
    ncol = num_of_effective_dims
  )
  cube_points <- sweep(
    unit_points * (l / 2), 2,
    rep_len(center, num_of_effective_dims), "+"
  )

  df <- as.data.frame(cube_points)
  names(df) <- make.names(seq_len(num_of_effective_dims))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the cube columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset05

# Generate a noisy circular arc in two dimensions and append uniform noise
# dimensions.
#
# Arguments:
#   sample_size       Number of rows to generate.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble with x1 = cos(theta) + N(10, 0.03),
# x2 = sin(theta) + N(10, 0.03) for theta ~ U(0.2, 0.6 * pi), followed by the
# noise columns x3, x4, ...; the 1st, 3rd, ... noise columns are negated.
nonlinear_2D <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 5,
                         min_noise = -0.1, max_noise = 0.2) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  theta <- stats::runif(sample_size, 0.2, 0.6 * pi)
  x <- cos(theta) + stats::rnorm(sample_size, 10, 0.03)
  y <- sin(theta) + stats::rnorm(sample_size, 10, 0.03)

  df <- tibble::tibble(x1 = x, x2 = y)

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset06

# Generate a mix of Gaussian clusters and non-Gaussian (polar-coordinate based)
# clusters in `num_dims` dimensions.
#
# Arguments:
#   sample_size                Total number of rows; must be a multiple of the
#                              total number of clusters.
#   with_seed                  Optional value passed to set.seed(); NULL leaves
#                              the RNG state untouched.
#   num_gussian_clusters       Number of Gaussian clusters (0 is allowed).
#   num_non_gaussian_clusters  Number of non-Gaussian clusters (0 is allowed).
#   cluster_sd_gau             Standard deviation used for Gaussian clusters.
#   cluster_sd_non_gau         Standard deviation used for the N(0, sd)
#                              dimensions of non-Gaussian clusters.
#   num_dims                   Number of dimensions.
#   a, b                       Shape parameters: flagged dimensions of a
#                              non-Gaussian cluster are
#                              sqrt(a) * rho * cos(phi) + b.
#
# Cluster centres / dimension flags are rows sampled from the grid of all 0/1
# combinations over `num_dims` dimensions.
#
# Returns a tibble with columns x1..x<num_dims>; Gaussian clusters come first.
clusters_different_shapes <- function(sample_size = 300, with_seed = NULL, num_gussian_clusters = 4, num_non_gaussian_clusters = 2,
                                      cluster_sd_gau = 0.05, cluster_sd_non_gau = 0.1, num_dims = 7, a = 2, b = 4) {

  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  num_clusters <- num_gussian_clusters + num_non_gaussian_clusters

  # All clusters have the same size, so the sample must split evenly
  if ((sample_size %% num_clusters) != 0) {
    stop("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size / num_clusters

  # Grid of every 0/1 combination across the dimensions
  values <- c(0, 1)
  mean_val_grid <- tidyr::expand_grid(
    !!!stats::setNames(rep(list(values), num_dims),
                       paste0("mean_dim", seq_len(num_dims)))
  )

  # Sample one grid row per cluster (Gaussian first, then non-Gaussian)
  mean_val_grid_gau <- mean_val_grid |>
    dplyr::slice_sample(n = num_gussian_clusters)

  mean_val_grid_non_gau <- mean_val_grid |>
    dplyr::slice_sample(n = num_non_gaussian_clusters)

  column_names <- paste0("x", seq_len(num_dims))

  ## Gaussian clusters: every dimension is N(mean from the grid, cluster_sd_gau)
  gau_cluster_list <- lapply(seq_len(num_gussian_clusters), function(i) {
    mean_val_for_cluster <- mean_val_grid_gau |>
      dplyr::filter(dplyr::row_number() == i) |>
      unlist(use.names = FALSE)

    dim_val_list <- lapply(seq_len(num_dims), function(j) {
      stats::rnorm(cluster_size, mean = mean_val_for_cluster[j],
                   sd = cluster_sd_gau)
    })
    names(dim_val_list) <- column_names

    tibble::as_tibble(dim_val_list)
  })

  ## Non-Gaussian clusters
  # NOTE(review): phi and rho are drawn once, so every flagged dimension of
  # every non-Gaussian cluster receives the same vector - confirm intended.
  phi <- stats::runif(cluster_size, max = 2 * pi)
  rho <- sqrt(stats::runif(cluster_size))

  non_gau_cluster_list <- lapply(seq_len(num_non_gaussian_clusters), function(i) {
    # A 1 in the sampled grid row flags a dimension that takes the polar shape
    presence_of_elipse_cluster <- mean_val_grid_non_gau |>
      dplyr::filter(dplyr::row_number() == i) |>
      unlist(use.names = FALSE)

    dim_val_list_n <- lapply(seq_len(num_dims), function(j) {
      if (presence_of_elipse_cluster[j] == 1) {
        sqrt(a) * rho * cos(phi) + b
      } else {
        stats::rnorm(cluster_size, mean = 0, sd = cluster_sd_non_gau)
      }
    })
    names(dim_val_list_n) <- column_names

    tibble::as_tibble(dim_val_list_n)
  })

  # Stack all clusters in one step
  dplyr::bind_rows(c(gau_cluster_list, non_gau_cluster_list))
}

## Dataset07

# Generate a sine curve in two dimensions and append uniform noise dimensions.
#
# Arguments:
#   sample_size       Number of rows to generate.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble with x1 = theta ~ U(0, 1.8 * pi), x2 = sin(theta), followed
# by the noise columns x3, x4, ...; the 1st, 3rd, ... noise columns are
# negated.
sine_curve_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
                                  min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  theta <- stats::runif(sample_size, 0, 1.80 * pi)

  df <- tibble::tibble(x1 = theta, x2 = sin(theta))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset08

# Generate six equally sized clusters in four dimensions - one noisy arc and
# five Gaussian blobs - and append uniform noise dimensions.
#
# Arguments:
#   sample_size       Total number of rows; must be a multiple of six.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble with columns x1..x4 followed by the noise columns
# x5, x6, ...; the 1st, 3rd, ... noise columns are negated.
diff_shape_clusters <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 3,
                                min_noise = -0.05, max_noise = 0.05) {

  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The six clusters have the same size, so the sample must split evenly
  if ((sample_size %% 6) != 0) {
    stop("The sample size should be a product of six.")
  }
  cluster_size <- sample_size / 6

  # Gaussian blob with per-dimension means and a common standard deviation;
  # columns are drawn in the order x1, x2, x3, x4
  gaussian_blob <- function(centre, spread) {
    tibble::tibble(
      x1 = stats::rnorm(cluster_size, mean = centre[1], sd = spread),
      x2 = stats::rnorm(cluster_size, mean = centre[2], sd = spread),
      x3 = stats::rnorm(cluster_size, mean = centre[3], sd = spread),
      x4 = stats::rnorm(cluster_size, mean = centre[4], sd = spread)
    )
  }

  # Cluster 1: noisy arc in (x1, x2), tight blobs around 1 and -1 in x3, x4
  theta <- stats::runif(cluster_size, 0.20, 0.60 * pi)
  df1 <- tibble::tibble(
    x1 = cos(theta) + stats::rnorm(cluster_size, 1, 0.03),
    x2 = sin(theta) + stats::rnorm(cluster_size, 1, 0.03),
    x3 = stats::rnorm(cluster_size, 1, 0.03),
    x4 = -stats::rnorm(cluster_size, 1, 0.03)
  )

  # Clusters 2-6: Gaussian blobs (the fifth one is wider than the others)
  df2 <- gaussian_blob(c(0.6, 0, 0, 0), 0.05)
  df3 <- gaussian_blob(c(1, 0, 0, 0), 0.05)
  df4 <- gaussian_blob(c(0, 1, 0, 0), 0.05)
  df5 <- gaussian_blob(c(0, 0, 1, 0), 0.15)
  df6 <- gaussian_blob(c(0, 0, 0, 1), 0.05)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6)

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset09

# Generate the snedata "three clusters" dataset and append uniform noise
# dimensions.
#
# Arguments:
#   sample_size       Total number of rows requested; must be a multiple of
#                     three (one third is requested per cluster).
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_dims          Passed to snedata::three_clusters_data() as `dim`.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns the generated data with its `color` column dropped, columns renamed
# to x1, x2, ... and the noise columns appended; the 1st, 3rd, ... noise
# columns are negated.
three_clusters_data_with_noise <- function(sample_size = 100, with_seed = NULL, num_dims = 7, num_of_noise_dim = 3,
                                           min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The three clusters have the same size, so the sample must split evenly
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of three.")
  }
  cluster_size <- sample_size / 3

  # n is the number of points per Gaussian
  df <- snedata::three_clusters_data(n = cluster_size, dim = num_dims) |>
    dplyr::select(-color)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset10

# Generate a torus with geozoo::torus() and append uniform noise dimensions.
#
# Arguments:
#   sample_size        Passed to geozoo::torus() as `n`.
#   with_seed          Optional value passed to set.seed(); NULL leaves the RNG
#                      state untouched.
#   num_ffective_dims  Passed to geozoo::torus() as `p`.
#   num_of_noise_dim   Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                      Range passed to runif() for every noise column.
#
# Returns a tibble whose torus columns are named x1, x2, ... followed by the
# noise columns, which continue that numbering; the 1st, 3rd, ... noise
# columns are negated.
torus_with_noise <- function(sample_size = 100, with_seed = NULL, num_ffective_dims = 3, num_of_noise_dim = 4,
                             min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  torus <- geozoo::torus(p = num_ffective_dims, n = sample_size)
  df <- tibble::as_tibble(torus$points, .name_repair = "unique")
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Start after the last torus column so names never collide, whatever the
    # number of effective dimensions
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset11

# Generate a spiral simulation with mgc::mgc.sims.spiral() and optionally
# append uniform noise dimensions.
#
# Arguments:
#   sample_size       Passed to mgc::mgc.sims.spiral() as `n`.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_dims          Passed to mgc::mgc.sims.spiral() as `d`.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble built from the `X` element of the simulation, with columns
# named x1, x2, ... followed by the noise columns; the 1st, 3rd, ... noise
# columns are negated.
spiral_with_noise <- function(sample_size = 100, with_seed = NULL, num_dims = 10, num_of_noise_dim = 0,
                              min_noise = -0.5, max_noise = 0.5) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  result <- mgc::mgc.sims.spiral(n = sample_size, d = num_dims)

  df <- tibble::as_tibble(result$X, .name_repair = "unique")
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset12

# Generate a Roman surface with geozoo::roman.surface() and append uniform
# noise dimensions.
#
# Arguments:
#   sample_size       Passed to geozoo::roman.surface() as `n`.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble whose surface columns are named x1, x2, ... followed by the
# noise columns, which continue that numbering; the 1st, 3rd, ... noise
# columns are negated.
roman_surface_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 4,
                                     min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  roman_surface <- geozoo::roman.surface(n = sample_size, a = 1)
  df <- tibble::as_tibble(roman_surface$points, .name_repair = "unique")
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset13

# Generate Mobius data with geozoo::mobius.experiment(p = 5) and append uniform
# noise dimensions.
#
# Arguments:
#   sample_size       Passed to geozoo::mobius.experiment() as `n`.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble whose Mobius columns are named x1, x2, ... followed by the
# noise columns, which continue that numbering; the 1st, 3rd, ... noise
# columns are negated.
mobius_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 7,
                              min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  mobius <- geozoo::mobius.experiment(p = 5, n = sample_size)
  df <- tibble::as_tibble(mobius$points, .name_repair = "unique")
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset14

# Generate a Dini surface with geozoo::dini.surface(a = 1, b = 1) and append
# uniform noise dimensions.
#
# Arguments:
#   sample_size       Passed to geozoo::dini.surface() as `n`.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble whose surface columns are named x1, x2, ... followed by the
# noise columns, which continue that numbering; the 1st, 3rd, ... noise
# columns are negated.
dini_surface_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 1,
                                    min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  dini <- geozoo::dini.surface(n = sample_size, a = 1, b = 1)
  df <- tibble::as_tibble(dini$points, .name_repair = "unique")
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset15

# Generate a conic spiral with
# geozoo::conic.spiral(a = .2, b = 1, c = .1, w = 2) and append uniform noise
# dimensions.
#
# Arguments:
#   sample_size       Passed to geozoo::conic.spiral() as `n`.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble whose spiral columns are named x1, x2, ... followed by the
# noise columns, which continue that numbering; the 1st, 3rd, ... noise
# columns are negated.
conic_spiral_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 4,
                                    min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  conic_spiral <- geozoo::conic.spiral(n = sample_size, a = .2, b = 1, c = .1, w = 2)
  df <- tibble::as_tibble(conic_spiral$points, .name_repair = "unique")
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset16

# Generate the snedata S-curve-with-a-hole dataset (noise = 0) and append
# uniform noise dimensions.
#
# Arguments:
#   sample_size       Passed to snedata::s_curve_hole() as `n_samples`.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns the generated data with its `color` column dropped, columns renamed
# to x1, x2, ... and the noise columns appended; the 1st, 3rd, ... noise
# columns are negated.
s_curve_data_hole_with_noise <- function(sample_size = 280, with_seed = NULL, num_of_noise_dim = 4,
                                         min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # TODO: points are removed to create the hole, so more data should be
  # requested than the number of rows wanted in the result
  df <- snedata::s_curve_hole(n_samples = sample_size, noise = 0) |>
    dplyr::select(-color)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # The row count is taken from the data because it can differ from
    # sample_size once the hole has been cut out
    sample_size_n <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(sample_size_n, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset17

# Generate the snedata "long cluster" dataset, move its first cluster away
# from the second, and append uniform noise dimensions.
#
# Arguments:
#   sample_size       Total number of rows requested; must be even (half is
#                     requested per cluster).
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns the generated data with its `color` column dropped, columns renamed
# to x1, x2, ... and the noise columns appended; the 1st, 3rd, ... noise
# columns are negated.
long_cluster_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 5,
                                    min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The two clusters have the same size, so the sample must split evenly
  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of two.")
  }
  cluster_size <- sample_size / 2

  # Split the generated data into its clusters, identified by `color`
  df_2_split <- snedata::long_cluster_data(n = cluster_size) |>
    dplyr::group_by(color) |>
    dplyr::group_split()

  # Shift the first cluster by -20 along both axes
  df_2_split_1 <- df_2_split[[1]]
  df_2_split_1$x <- df_2_split_1$x - 20
  df_2_split_1$y <- df_2_split_1$y - 20

  df <- dplyr::bind_rows(df_2_split_1, df_2_split[[2]]) |>
    dplyr::select(-color)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset18

# Generate four connected noisy arcs in four dimensions and append uniform
# noise dimensions.
#
# All four arcs share one vector of angles theta ~ U(0, 0.8 * pi). Two arcs
# lie in the (x1, x2) plane and two in the (x1, x3) plane; the remaining
# coordinates are tight Gaussian blobs around 10 (x2 or x3) and -10 (x4).
#
# Arguments:
#   sample_size       Total number of rows; must be a multiple of four.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble with columns x1..x4 followed by the noise columns
# x5, x6, ...; the 1st, 3rd, ... noise columns are negated.
nonlinear_connect_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 3,
                                         min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The four arcs have the same size, so the sample must split evenly
  if ((sample_size %% 4) != 0) {
    stop("The sample size should be a product of 4.")
  }
  cluster_size <- sample_size / 4

  theta <- stats::runif(cluster_size, 0, 0.80 * pi)

  # Gaussian jitter centred at 10 that is added to every coordinate
  jitter_10 <- function() {
    stats::rnorm(cluster_size, 10, 0.03)
  }

  # Build one arc. `angle` drives the cos/sin pair, `sin_col` names the column
  # ("x2" or "x3") that receives the sine coordinate, and `shift` is a constant
  # offset added to both arc coordinates. Random draws happen in the order:
  # cosine coordinate, sine coordinate, flat coordinate, x4.
  arc_cluster <- function(angle, sin_col, shift) {
    arc_cos <- cos(angle) + jitter_10() + shift
    arc_sin <- sin(angle) + jitter_10() + shift
    flat <- jitter_10()
    last <- -jitter_10()

    if (sin_col == "x2") {
      tibble::tibble(x1 = arc_cos, x2 = arc_sin, x3 = flat, x4 = last)
    } else {
      tibble::tibble(x1 = arc_cos, x2 = flat, x3 = arc_sin, x4 = last)
    }
  }

  # A zero-sd normal draw is just its mean, so the 0.1 offset is a constant
  df1 <- arc_cluster(theta, "x2", 0)
  df2 <- arc_cluster(-theta, "x2", 0.1)
  df3 <- arc_cluster(-theta, "x3", 0.1)
  df4 <- arc_cluster(theta, "x3", 0.1)

  df <- dplyr::bind_rows(df1, df2, df3, df4)

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset19

# Generate two mirrored exponential curves in four dimensions and append
# uniform noise dimensions.
#
# Each half has x1 ~ U(-8, 1.5); the first half has
# x2 = -(exp(x1) + U(0, 1)) + U(0, 0.7) and the second
# x2 = (exp(x1) + U(0, 1)) + U(0, 0.7). x3 and x4 are tight Gaussian blobs
# around 10 and -10.
#
# Arguments:
#   sample_size       Total number of rows; must be even.
#   with_seed         Optional value passed to set.seed(); NULL leaves the RNG
#                     state untouched.
#   num_of_noise_dim  Number of noise columns to append (0 appends none).
#   min_noise, max_noise
#                     Range passed to runif() for every noise column.
#
# Returns a tibble with columns x1..x4 followed by the noise columns
# x5, x6, ...; the 1st, 3rd, ... noise columns are negated.
nonlinear_mirror_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 2,
                                        min_noise = -0.05, max_noise = 0.05) {
  # Only seed the RNG when the caller asked for it
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The two halves have the same size, so the sample must split evenly
  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  cluster_size <- sample_size / 2

  # Build one half; `direction` is -1 for the lower curve and +1 for the upper
  # one. Random draws happen in the order x1, x2, x3, x4.
  exp_curve <- function(direction) {
    x <- stats::runif(cluster_size, -8, 1.5)
    y <- direction * (exp(x) + stats::runif(cluster_size, 0, 1)) +
      stats::runif(cluster_size, 0, 0.7)
    z <- stats::rnorm(cluster_size, 10, 0.03)
    w <- -stats::rnorm(cluster_size, 10, 0.03)

    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  df1 <- exp_curve(-1)
  df2 <- exp_curve(1)

  df <- dplyr::bind_rows(df1, df2)

  # Append the noise dimensions; skipped entirely when none are requested
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_index <- seq_len(num_of_noise_dim)

    noise_dim_val_list <- lapply(noise_index, function(j) {
      draws <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draws else -draws
    })
    # Noise columns continue the numbering of the existing columns
    names(noise_dim_val_list) <- paste0("x", NCOL(df) + noise_index)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}

## Dataset20

# Three clusters in 4-D: a unit circle, a circle of radius 0.5 built from the
# same angles, and a Gaussian blob, all centred near (10, 10) in x1/x2.
# Optional uniform noise columns are appended afterwards.
#
# sample_size: total number of rows; must be divisible by 3.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
three_circulars_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 3,
                                       min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each of the three clusters receives a third of the rows
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  # x3 is drawn around 10 and x4 around -10. z is drawn before w so the
  # random stream is consumed in the same order as before.
  make_cluster <- function(x, y) {
    z <- rnorm(cluster_size, 10, 0.03)
    w <- -rnorm(cluster_size, 10, 0.03)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  theta <- runif(cluster_size, 0.0, 2 * pi)

  # Outer circle (radius 1)
  x <- cos(theta) + rnorm(cluster_size, 10, 0.03)
  y <- sin(theta) + rnorm(cluster_size, 10, 0.03)
  df1 <- make_cluster(x, y)

  # Inner circle (radius 0.5), reusing the same angles
  x <- 0.5 * cos(theta) + rnorm(cluster_size, 10, 0.03)
  y <- 0.5 * sin(theta) + rnorm(cluster_size, 10, 0.03)
  df2 <- make_cluster(x, y)

  # Central blob
  x <- rnorm(cluster_size, 10, 0.03)
  y <- rnorm(cluster_size, 10, 0.03)
  df3 <- make_cluster(x, y)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset21

# A circular arc, a tight Gaussian cluster and a wider background-noise
# cluster in 4-D, followed by optional uniform noise columns.
#
# 30% of the rows form the background cluster; the remaining 70% are split
# equally between the arc and the tight cluster.
#
# sample_size: total number of rows; must be a multiple of 20 so that both
#   0.3 * sample_size and 0.35 * sample_size are whole numbers.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
cluster_and_curvilinear__with_noise_and_bkg_noise <- function(sample_size = 260, with_seed = NULL, num_of_noise_dim = 8,
                                                              min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  # An even size is not enough: fractional cluster sizes are truncated by the
  # samplers, which leaves fewer rows than the sample_size noise rows.
  if ((sample_size %% 20) != 0) {
    stop("The sample size should be a multiple of 20 so that the cluster sizes are whole numbers.")
  }
  bkg_size <- sample_size * 0.3
  cluster_size <- (sample_size - bkg_size) / 2

  # x3 is drawn around 10 and x4 around -10. z is drawn before w so the
  # random stream is consumed in the same order as before.
  make_cluster <- function(x, y, n, sd) {
    z <- rnorm(n, 10, sd)
    w <- -rnorm(n, 10, sd)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  # Circular arc
  theta <- runif(cluster_size, 0.20, 0.60 * pi)
  x <- cos(theta) + rnorm(cluster_size, 10, 0.03)
  y <- sin(theta) + rnorm(cluster_size, 10, 0.03)
  df1 <- make_cluster(x, y, cluster_size, 0.03)

  # Tight cluster
  x <- rnorm(cluster_size, 10, 0.05)
  y <- rnorm(cluster_size, 10, 0.05)
  df2 <- make_cluster(x, y, cluster_size, 0.05)

  # Background noise cluster
  x <- rnorm(bkg_size, 11, 0.5)
  y <- rnorm(bkg_size, 11, 0.5)
  df3 <- make_cluster(x, y, bkg_size, 0.05)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset22

# Wraps snedata::link_data(), drops its `color` column, renames the remaining
# columns to x1, x2, ... and appends optional uniform noise columns.
#
# sample_size: total number of rows; must be even. Half of it is passed as `n`
#   to snedata::link_data() (presumably n points per ring, so the result has
#   sample_size rows -- TODO confirm against the snedata documentation).
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble/data frame with the renamed columns plus the noise columns.
link_data_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 4,
                                 min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  cluster_size <- sample_size / 2

  df <- snedata::link_data(n = cluster_size) |>
    dplyr::select(-color)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset23

# Wraps snedata::swiss_roll(), drops its `color` column, renames the remaining
# columns to x1, x2, ... and appends optional uniform noise columns.
#
# sample_size: passed as `n` to snedata::swiss_roll(); also the number of rows
#   drawn for every noise column.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble/data frame with the renamed columns plus the noise columns.
swiss_roll_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 7,
                                  min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  df <- snedata::swiss_roll(n = sample_size) |>
    dplyr::select(-color)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset24

# Three cubic branches in 4-D (x2 built from x1^3 with uniform jitter), with
# x3 and x4 drawn around 10, followed by optional uniform noise columns.
#
# sample_size: total number of rows; must be divisible by 3.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
curvy_tree_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 2,
                                  min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each of the three branches receives a third of the rows
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  # z is drawn before w so the random stream is consumed in the same order
  # as before.
  make_branch <- function(x, y) {
    z <- rnorm(cluster_size, 10, 0.1)
    w <- rnorm(cluster_size, 10, 0.1)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  x <- runif(cluster_size, -2, 2)
  y <- -(x^3 + runif(cluster_size, 0, 6)) + runif(cluster_size, 0, 0.2)
  df1 <- make_branch(x, y)

  x <- runif(cluster_size, 0, 2)
  y <- (x^3 + runif(cluster_size, 0, 6)) + runif(cluster_size, 0, 0.2)
  df2 <- make_branch(x, y)

  x <- runif(cluster_size, -2, 0)
  y <- -(x^3 + runif(cluster_size, 0, 6)) + runif(cluster_size, 0, 0.2) + 10
  df3 <- make_branch(x, y)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset25

# Five piecewise-linear branches in 4-D (x2 is a linear or absolute-value
# function of x1), with x3 and x4 drawn around 10, followed by optional
# uniform noise columns.
#
# sample_size: total number of rows; must be divisible by 5.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
tree_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 5,
                            min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each of the five branches receives a fifth of the rows
  if ((sample_size %% 5) != 0) {
    stop("The sample size should be a product of 5.")
  }
  cluster_size <- sample_size / 5

  # z is drawn before w so the random stream is consumed in the same order
  # as before.
  make_branch <- function(x, y) {
    z <- rnorm(cluster_size, 10, 0.03)
    w <- rnorm(cluster_size, 10, 0.03)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  x <- runif(cluster_size, -3, 3)
  df1 <- make_branch(x, abs(0.5 * x))

  x <- runif(cluster_size, -0.5, 0.5)
  df2 <- make_branch(x, abs(10 * x))

  x <- runif(cluster_size, -6, 3)
  df3 <- make_branch(x, (-1) * abs(0.5 * x + 5))

  x <- runif(cluster_size, -0.5, 0.5)
  df4 <- make_branch(x, (-1) * abs(10 * x) - 5)

  x <- runif(cluster_size, -5, 5)
  df5 <- make_branch(x, x)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset26

# Three ellipses in 3-D with semi-axes 2 and 1, one per coordinate plane, all
# built from the same random angles, followed by optional uniform noise
# columns.
#
# sample_size: total number of rows; must be divisible by 3.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x3 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x3 followed by the noise columns.
cell_cycle_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 7,
                                  min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each of the three ellipses receives a third of the rows
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  r1 <- 2
  r2 <- 1
  zeros <- rep(0, cluster_size)

  theta <- runif(cluster_size, 0, 2 * pi)

  # Ellipse with x1 fixed at 0
  df1 <- tibble::tibble(x1 = zeros, x2 = r1 * cos(theta), x3 = r2 * sin(theta))

  # Ellipse with x2 fixed at 0
  df2 <- tibble::tibble(x1 = r2 * cos(theta), x2 = zeros, x3 = r1 * sin(theta))

  # Ellipse with x3 fixed at 0
  df3 <- tibble::tibble(x1 = r1 * cos(theta), x2 = r2 * sin(theta), x3 = zeros)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset27

# Three unit circles in x1/x2 with shifted centres and an undulating third
# coordinate x3 = cos(3 * theta) / 3, all built from the same random angles,
# followed by optional uniform noise columns.
#
# sample_size: total number of rows; must be divisible by 3.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x3 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x3 followed by the noise columns.
curvy_cell_cycle_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 4,
                                        min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each of the three circles receives a third of the rows
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of three.")
  }
  cluster_size <- sample_size / 3

  r <- sqrt(3) / 3

  theta <- runif(cluster_size, 0, 2 * pi)
  wave <- cos(3 * theta) / 3

  # Circle centred at (0, r)
  df1 <- tibble::tibble(x1 = cos(theta), x2 = r + sin(theta), x3 = wave)

  # Circle centred at (0.5, -r / 2)
  df2 <- tibble::tibble(x1 = cos(theta) + 0.5, x2 = sin(theta) - r / 2, x3 = wave)

  # Circle centred at (-0.5, -r / 2)
  df3 <- tibble::tibble(x1 = cos(theta) - 0.5, x2 = sin(theta) - r / 2, x3 = wave)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset28
# Two copies of a sine curve (x1 = theta, x2 = sin(theta)) paired with points
# drawn inside a unit disc in x3/x4. The second copy is shifted by +1 in x1/x2
# and its disc is centred at (6, 6) instead of (4, 4). Uniform noise columns
# are appended only when num_of_noise_dim is non-zero.
#
# sample_size: total number of rows; must be even (half per copy).
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4.
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by any noise columns.
two_curvy_panckakes_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 0,
                                           min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  half_size <- sample_size / 2

  # Polar coordinates of points inside the unit disc
  angle <- runif(half_size, max = 2 * pi)
  radius <- sqrt(runif(half_size))
  disc_a <- radius * cos(angle)
  disc_b <- radius * sin(angle)

  # Sine curve shared by both copies
  curve_x <- runif(half_size, 0, 1.80 * pi)
  curve_y <- sin(curve_x)

  first_copy <- tibble::tibble(x1 = curve_x, x2 = curve_y,
                               x3 = disc_a + 4, x4 = disc_b + 4)
  second_copy <- tibble::tibble(x1 = curve_x + 1, x2 = curve_y + 1,
                                x3 = disc_a + 6, x4 = disc_b + 6)

  df <- dplyr::bind_rows(first_copy, second_copy)

  # Nothing to append when no noise dimensions were requested
  if (num_of_noise_dim == 0) {
    return(df)
  }

  # Names continue the x1, x2, ... numbering after the existing columns
  noise_idx <- 1:num_of_noise_dim
  noise_names <- paste0("x", NCOL(df) + noise_idx)

  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not
  noise_cols <- list()
  for (k in noise_idx) {
    flip <- if ((k %% 2) == 0) 1 else -1
    noise_cols[[noise_names[k]]] <- flip * runif(sample_size,
                                                 min = min_noise, max = max_noise)
  }

  dplyr::bind_cols(df, tibble::as_tibble(noise_cols))

}


## Dataset29

# Wraps snedata::taspheres() (4 spheres, d = 3, r = 3), drops its `labels`
# column and appends uniform noise columns when num_of_noise_dim is non-zero.
#
# sample_size: must be divisible by 13; sample_size / 13 is passed as
#   `n_samples` to snedata::taspheres(). NOTE(review): the noise columns are
#   drawn with sample_size rows, so this assumes taspheres() returns exactly
#   sample_size rows for these settings -- confirm against snedata.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended.
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns the taspheres() data without `labels`, plus any noise columns.
small_big_sphere_with_noise <- function(sample_size = 390, with_seed = NULL, num_of_noise_dim = 0,
                                        min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if ((sample_size %% 13) != 0) {
    stop("The sample size should be a product of 13.")
  }
  small_sphere_sample_size <- sample_size / 13

  # Samples from d-spheres of radius r enclosed within a larger d-sphere of
  # radius 5 * r
  spheres <- snedata::taspheres(n_samples = small_sphere_sample_size, d = 3, n_spheres = 4, r = 3)
  df <- dplyr::select(spheres, -labels)

  # Nothing to append when no noise dimensions were requested
  if (num_of_noise_dim == 0) {
    return(df)
  }

  # Names continue the numbering after the existing columns
  noise_idx <- 1:num_of_noise_dim
  noise_names <- paste0("x", NCOL(df) + noise_idx)

  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not
  noise_cols <- list()
  for (k in noise_idx) {
    flip <- if ((k %% 2) == 0) 1 else -1
    noise_cols[[noise_names[k]]] <- flip * runif(sample_size,
                                                 min = min_noise, max = max_noise)
  }

  dplyr::bind_cols(df, tibble::as_tibble(noise_cols))

}

## Dataset30

# Seven polynomial branches in 4-D (x2 is a cubic or quadratic function of x1
# with uniform jitter), with x3 drawn around 10 and x4 around -10, followed by
# optional uniform noise columns.
#
# sample_size: total number of rows; must be divisible by 7.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
seven_branching_data_with_noise <- function(sample_size = 210, with_seed = NULL, num_of_noise_dim = 2,
                                            min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each of the seven branches receives a seventh of the rows
  if ((sample_size %% 7) != 0) {
    stop("The sample size should be a product of 7.")
  }
  cluster_size <- sample_size / 7

  # z is drawn before w so the random stream is consumed in the same order
  # as before.
  make_branch <- function(x, y) {
    z <- rnorm(cluster_size, 10, 0.03)
    w <- -rnorm(cluster_size, 10, 0.03)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  x <- runif(cluster_size, -2, 2)
  y <- -(x^3 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.2)
  df1 <- make_branch(x, y)

  x <- runif(cluster_size, -2, 1.5)
  y <- (x^3 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.2)
  df2 <- make_branch(x, y)

  x <- runif(cluster_size, -2, 1.5)
  y <- (1 + (x-3)^2 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.1)
  df3 <- make_branch(x, y)

  x <- runif(cluster_size, -0.5, 3)
  y <- (1 + -(x-3)^2 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.1)
  df4 <- make_branch(x, y)

  x <- runif(cluster_size, -1, 1)
  y <- (20 + x^3 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.01)
  df5 <- make_branch(x, y)

  x <- runif(cluster_size, -2, 2)
  y <- (x^2 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.01) + 10
  df6 <- make_branch(x, y)

  x <- runif(cluster_size, -2, 2)
  y <- (x^2 + runif(cluster_size, 0, 0.2)) + runif(cluster_size, 0, 0.01) + 15
  df7 <- make_branch(x, y)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6, df7)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset31

# Four exponential/logarithmic branches plus a uniform background cluster in
# 4-D, with x3 drawn around 10 and x4 around -10, followed by optional uniform
# noise columns.
#
# 10% of the rows form the background cluster; the remaining 90% are split
# equally between the four branches.
#
# sample_size: total number of rows; 0.9 * sample_size must be divisible by 4.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
four_branching_data_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 5,
                                           min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  bkg_size <- sample_size * 0.1

  # The rows left after removing the background cluster are split four ways
  if (((sample_size - bkg_size) %% 4) != 0) {
    stop("The sample size should be a product of 4.")
  }
  cluster_size <- (sample_size - bkg_size) / 4

  # z is drawn before w so the random stream is consumed in the same order
  # as before.
  make_branch <- function(x, y, n) {
    z <- rnorm(n, 10, 0.03)
    w <- -rnorm(n, 10, 0.03)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  x <- runif(cluster_size, -5, 1)
  y <- (exp(x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df1 <- make_branch(x, y, cluster_size)

  x <- runif(cluster_size, -1, 5)
  y <- (exp(-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df2 <- make_branch(x, y, cluster_size)

  x <- runif(cluster_size, 0, 5)
  y <- (log(x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df3 <- make_branch(x, y, cluster_size)

  x <- runif(cluster_size, -5, 0)
  y <- (log(-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df4 <- make_branch(x, y, cluster_size)

  # Background cluster
  x <- runif(bkg_size, -5, 0)
  y <- runif(bkg_size, 0, 0.8) + runif(bkg_size, 0, 0.8)
  df5 <- make_branch(x, y, bkg_size)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset32

# Eight exponential branches in 4-D (x2 = exp(k * x1) with uniform jitter for
# k in +/-0.5, +/-1, +/-2, +/-3), with x3 drawn around 10 and x4 around -10,
# followed by optional uniform noise columns.
#
# sample_size: total number of rows; must be divisible by 8.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
eight_branching_data_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 8,
                                            min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each of the eight branches receives an eighth of the rows
  if ((sample_size %% 8) != 0) {
    stop("The sample size should be a product of 8.")
  }
  cluster_size <- sample_size / 8

  # z is drawn before w so the random stream is consumed in the same order
  # as before.
  make_branch <- function(x, y) {
    z <- rnorm(cluster_size, 10, 0.03)
    w <- -rnorm(cluster_size, 10, 0.03)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  # The first branch uses the default runif() range (0, 1) for its second
  # jitter term; the others use (0, 0.2).
  x <- runif(cluster_size, -1, 2)
  y <- (exp(x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size)
  df1 <- make_branch(x, y)

  x <- runif(cluster_size, -1, 1)
  y <- (exp(2*x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df2 <- make_branch(x, y)

  x <- runif(cluster_size, -1, 0.6)
  y <- (exp(3*x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df3 <- make_branch(x, y)

  x <- runif(cluster_size, -1, 3)
  y <- (exp(0.5*x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df4 <- make_branch(x, y)

  x <- runif(cluster_size, -2, 1)
  y <- (exp(-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df5 <- make_branch(x, y)

  x <- runif(cluster_size, -1, 1)
  y <- (exp(2*-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df6 <- make_branch(x, y)

  x <- runif(cluster_size, -0.6, 1)
  y <- (exp(3*-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df7 <- make_branch(x, y)

  x <- runif(cluster_size, -3, 1)
  y <- (exp(0.5*-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df8 <- make_branch(x, y)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6, df7, df8)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset33

# Two Gaussian clusters in 4-D plus a "doublet" cluster made of row-wise
# averages of the two, followed by optional uniform noise columns.
#
# The doublet cluster holds 20% as many rows as one cluster, so the total is
# 2.2 * cluster_size = sample_size.
#
# sample_size: total number of rows; sample_size * 10 must be divisible by 22.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
one_doublet_with_noise <- function(sample_size = 110, with_seed = NULL, num_of_noise_dim = 6,
                                   min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Integer arithmetic stands in for sample_size %% 2.2
  if (((sample_size * 10) %% 22) != 0) {
    stop("The sample size should be a product of 2.2.")
  }
  cluster_size <- (sample_size * 10) / 22

  # Cluster centred at (0, 1, 0, 0)
  df1 <- tibble::tibble(x = rnorm(cluster_size, mean = 0, sd = 0.05),
                        y = rnorm(cluster_size, mean = 1, sd = 0.05),
                        z = rnorm(cluster_size, mean = 0, sd = 0.05),
                        w = rnorm(cluster_size, mean = 0, sd = 0.05))

  # Cluster centred at (1, 0, 0, 0)
  df2 <- tibble::tibble(x = rnorm(cluster_size, mean = 1, sd = 0.05),
                        y = rnorm(cluster_size, mean = 0, sd = 0.05),
                        z = rnorm(cluster_size, mean = 0, sd = 0.05),
                        w = rnorm(cluster_size, mean = 0, sd = 0.05))

  # Doublets: row-wise midpoints of the two clusters, of which a random 20%
  # (relative to one cluster) are kept
  df3_new <- (df1 + df2) / 2
  samp <- sample(nrow(df3_new), cluster_size * 0.20)
  df3 <- df3_new[samp, ]

  df <- dplyr::bind_rows(df1, df2, df3)
  df <- df |>
    dplyr::rename(x1 = x, x2 = y, x3 = z, x4 = w)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset34

# Two quadratic arcs (x2 = x1^2 with uniform jitter, on negative and positive
# x1) plus a Gaussian background cluster between them in 4-D, with x3 drawn
# around 10 and x4 around -10, followed by optional uniform noise columns.
#
# 20% of the rows form the background cluster; the remaining 80% are split
# equally between the two arcs.
#
# sample_size: requested number of rows. When 0.8 * sample_size is not an even
#   whole number a warning is raised and the cluster sizes are rounded down,
#   so the result has fewer rows than requested.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x4 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble with columns x1..x4 followed by the noise columns.
two_curvilinear_data_with_noise <- function(sample_size = 250, with_seed = NULL, num_of_noise_dim = 2,
                                            min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Fractional sizes are rounded down explicitly (the samplers would truncate
  # them anyway)
  bkg_size <- floor(sample_size * 0.2)

  if (((sample_size - sample_size * 0.2) %% 2) != 0) {
    warning("The sample size should be a product of two.")
    cluster_size <- floor((sample_size - sample_size * 0.2) / 2)
  } else {
    cluster_size <- (sample_size - sample_size * 0.2) / 2
  }

  # z is drawn before w so the random stream is consumed in the same order
  # as before.
  make_cluster <- function(x, y, n) {
    z <- rnorm(n, 10, 0.03)
    w <- -rnorm(n, 10, 0.03)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  # Left arc
  x <- runif(cluster_size, -2, -0.5)
  y <- (x^2 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df1 <- make_cluster(x, y, cluster_size)

  # Right arc
  x <- runif(cluster_size, 0.5, 2)
  y <- (x^2 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  df2 <- make_cluster(x, y, cluster_size)

  # Background cluster
  x <- rnorm(bkg_size, mean = 0, sd = 0.4)
  y <- rnorm(bkg_size, mean = 1.5, sd = 0.5)
  df3 <- make_cluster(x, y, bkg_size)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # The noise is sized from the rows actually generated: after the rounding
  # fallback above that is fewer than sample_size, and drawing sample_size
  # rows would make bind_cols() fail.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise columns are sign-flipped
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}


## Dataset35

# Wraps snedata::sphere(), drops its `color` column, names the remaining
# columns x1..x3 and appends optional uniform noise columns.
#
# sample_size: passed to snedata::sphere(); also the number of rows drawn for
#   every noise column.
# with_seed: optional RNG seed; set.seed() is called only when it is not NULL.
# num_of_noise_dim: number of noise columns appended after x1..x3 (0 = none).
# min_noise, max_noise: bounds passed to runif() for the noise columns.
# Returns a tibble/data frame with x1..x3 followed by the noise columns.
sphere_data_with_noise <- function(sample_size = 250, with_seed = NULL, num_of_noise_dim = 1,
                                   min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when the caller supplied a seed
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  df <- snedata::sphere(sample_size) |>
    dplyr::select(-color)

  # Assumes exactly three coordinate columns remain after dropping `color`
  names(df) <- paste0("x", 1:3)

  # Append the noise dimensions (none when num_of_noise_dim is 0 or less).
  # Odd-numbered noise columns are sign-flipped, even-numbered ones are not.
  if (num_of_noise_dim > 0) {
    noise_idx <- seq_len(num_of_noise_dim)
    noise_cols <- lapply(noise_idx, function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset36

# Three Gaussian clusters in ten dimensions plus three "doublet" groups built
# from pairwise cluster midpoints, with optional uniform noise dimensions.
#
# Arguments:
#   sample_size       Total size; must satisfy (sample_size * 10) %% 42 == 0,
#                     i.e. be a multiple of 4.2 (3 clusters + 0.4 + 0.3 + 0.5
#                     cluster sizes of doublets).
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns a tibble with columns x1..x10 (clusters and doublets stacked in the
# order df1, df2, df3, df4, df5, df6) followed by any noise columns.
three_doublets_with_noise <- function(sample_size = 210, with_seed = NULL, num_of_noise_dim = 0,
                                      min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divided by 4.2
  if (((sample_size * 10) %% 42) != 0) {
    stop("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size / 4.2

  # One cluster: independent normal columns (sd 0.05) centred on `means`
  gaussian_cluster <- function(means) {
    cols <- lapply(means, function(m) rnorm(cluster_size, mean = m, sd = 0.05))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  # Doublets: row-wise midpoints of two clusters, of which a random subset of
  # cluster_size * fraction rows is kept
  doublets <- function(a, b, fraction) {
    midpoints <- (a + b) / 2
    midpoints[sample(nrow(midpoints), cluster_size * fraction), ]
  }

  base_means <- rep(1, 10)

  df1 <- gaussian_cluster(replace(base_means, 1, 3))
  df2 <- gaussian_cluster(base_means)
  df3 <- doublets(df1, df2, 0.40) # 40% of a cluster
  df4 <- gaussian_cluster(replace(base_means, 4, 3))
  df5 <- doublets(df2, df4, 0.30) # 30% of a cluster
  df6 <- doublets(df1, df4, 0.50) # 50% of a cluster

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually generated: fractional doublet sizes
    # (e.g. sample_size = 21) make NROW(df) smaller than sample_size, which
    # previously broke the column bind.
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset37

# Four Gaussian clusters in seven dimensions plus one doublet group (midpoints
# of the first two clusters), with optional uniform noise dimensions.
#
# Arguments:
#   sample_size       Total size; must satisfy (sample_size * 10) %% 44 == 0,
#                     i.e. be a multiple of 4.4 (4 clusters + 0.4 of a cluster
#                     of doublets). The default is 220 because the previous
#                     default of 210 failed this check on every default call.
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns a tibble with columns x1..x7 (rows stacked as df1, df2, df3, df4,
# df5) followed by any noise columns.
one_doublet_four_clusters_with_noise <- function(sample_size = 220, with_seed = NULL, num_of_noise_dim = 0,
                                                 min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divided by 4.4
  if (((sample_size * 10) %% 44) != 0) {
    stop("The sample size should be a product of 4.4.")
  }
  cluster_size <- (sample_size * 10) / 44

  # One cluster: independent normal columns (sd 0.05) centred on `means`
  gaussian_cluster <- function(means) {
    cols <- lapply(means, function(m) rnorm(cluster_size, mean = m, sd = 0.05))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  df1 <- gaussian_cluster(c(0, 0, 0, 1, 0, 0, 1))
  df2 <- gaussian_cluster(c(0, 0, 0, 0, 1, 0, 0))

  # Doublets: midpoints of df1 and df2, keeping a random 40% of a cluster
  df3_new <- (df1 + df2) / 2
  df3 <- df3_new[sample(nrow(df3_new), cluster_size * 0.40), ]

  df4 <- gaussian_cluster(c(0, 0, 1, 0, 0, 0, 0))
  df5 <- gaussian_cluster(rep(0, 7))

  # Namespace-qualified like the other generators, so dplyr need not be attached
  df <- dplyr::bind_rows(df1, df2, df3, df4, df5)

  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset38

# Two Gaussian clusters in ten dimensions with per-dimension standard
# deviations, plus one doublet group (midpoints of the two clusters), with
# optional uniform noise dimensions.
#
# Arguments:
#   sample_size       Total size; must satisfy (sample_size * 10) %% 26 == 0,
#                     i.e. be a multiple of 2.6 (2 clusters + 0.6 of a cluster
#                     of doublets).
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns a tibble with columns x1..x10 (rows stacked as df1, df2, df3)
# followed by any noise columns.
one_doublet_dfifferent_var_clusters_with_noise <- function(sample_size = 260, with_seed = NULL, num_of_noise_dim = 0,
                                                           min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divided by 2.6
  if (((sample_size * 10) %% 26) != 0) {
    stop("The sample size should be a product of 2.6.")
  }
  cluster_size <- sample_size / 2.6

  # One cluster: column i is normal with mean means[i] and sd sds[i]
  gaussian_cluster <- function(means, sds) {
    cols <- lapply(seq_along(means), function(i) {
      rnorm(cluster_size, mean = means[i], sd = sds[i])
    })
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  df1 <- gaussian_cluster(
    means = c(1, 0, 0, 1, 0, 0, 1, 1, 0, 0),
    sds = c(0.1, 0.08, 0.05, 0.05, 0.08, 0.08, 0.08, 0.02, 0.02, 0.02)
  )
  df2 <- gaussian_cluster(
    means = c(0, 0, 0, 1, 1, 0, 0, 1, 0, 0),
    sds = c(0.02, 0.02, 0.02, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02)
  )

  # Doublets: midpoints of df1 and df2, keeping a random 60% of a cluster
  df3_new <- (df1 + df2) / 2
  df3 <- df3_new[sample(nrow(df3_new), cluster_size * 0.60), ]

  # Namespace-qualified like the other generators, so dplyr need not be attached
  df <- dplyr::bind_rows(df1, df2, df3)

  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset39

# An arc-shaped cluster and a Gaussian cluster in ten dimensions plus one
# doublet group (midpoints of the two), with optional uniform noise
# dimensions.
#
# sample_size must satisfy (sample_size * 10) %% 28 == 0 (2 clusters + 0.8 of
# a cluster of doublets). with_seed, when not NULL, is passed to set.seed().
# When num_of_noise_dim is non-zero that many runif(min_noise, max_noise)
# columns are appended, odd-numbered ones multiplied by -1.
one_doublet_dfifferent_pattern_clusters_with_noise <- function(sample_size = 280, with_seed = NULL, num_of_noise_dim = 0,
                                                               min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Reject sizes that are not a multiple of 2.8
  if (((sample_size * 10) %% 28) != 0) {
    stop("The sample size should be a product of 2.8.")
  }
  cluster_size <- sample_size / 2.8

  dim_names <- paste0("x", 1:10)

  # Cluster 1: odd dimensions follow cos(angle), even ones sin(angle), each
  # shifted by normal noise centred on 1
  angle <- runif(cluster_size, 0.20, 0.60 * pi)
  arc_sd <- c(0.5, 0.03, 0.03, 0.03, 0.03, 0.03, 0.05, 0.03, 0.3, 0.03)
  arc_cols <- lapply(seq_along(arc_sd), function(i) {
    curve <- if ((i %% 2) == 1) cos(angle) else sin(angle)
    curve + rnorm(cluster_size, 1, arc_sd[i])
  })
  names(arc_cols) <- dim_names
  df1 <- tibble::as_tibble(arc_cols)

  # Cluster 2: independent normal columns
  blob_mean <- c(1, 0, 0, 1, 0, 0, 1, 1, 0, 0)
  blob_sd <- c(0.1, 0.08, 0.05, 0.05, 0.08, 0.08, 0.08, 0.02, 0.02, 0.02)
  blob_cols <- lapply(seq_along(blob_mean), function(i) {
    rnorm(cluster_size, mean = blob_mean[i], sd = blob_sd[i])
  })
  names(blob_cols) <- dim_names
  df2 <- tibble::as_tibble(blob_cols)

  # Doublets: midpoints of the two clusters, keeping a random 80% of a cluster
  midpoints <- (df1 + df2) / 2
  kept_rows <- sample(nrow(midpoints), cluster_size * 0.80)
  df3 <- midpoints[kept_rows, ]

  df <- dplyr::bind_rows(df1, df2, df3)

  if (num_of_noise_dim == 0) {
    return(df)
  }

  # Noise columns continue the x<k> numbering after the existing columns
  noise_idx <- seq_len(num_of_noise_dim)
  noise_cols <- lapply(noise_idx, function(j) {
    if ((j %% 2) == 0) {
      runif(sample_size, min = min_noise, max = max_noise)
    } else {
      (-1) * runif(sample_size, min = min_noise, max = max_noise)
    }
  })
  names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

  dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
}

## Dataset40

# Four Gaussian clusters in ten dimensions arranged as two pairs, each pair
# contributing one doublet group (midpoints of the pair), with optional
# uniform noise dimensions.
#
# sample_size must satisfy (sample_size * 10) %% 44 == 0 (4 clusters + two
# doublet groups of 0.2 cluster each). with_seed, when not NULL, is passed to
# set.seed(). When num_of_noise_dim is non-zero that many
# runif(min_noise, max_noise) columns are appended, odd-numbered ones
# multiplied by -1.
two_doublets_parallel_with_noise <- function(sample_size = 440, with_seed = NULL, num_of_noise_dim = 0,
                                             min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Reject sizes that are not a multiple of 4.4
  if (((sample_size * 10) %% 44) != 0) {
    stop("The sample size should be a product of 4.4.")
  }
  cluster_size <- (sample_size * 10) / 44

  # Ten independent normal columns (sd 0.05) centred on `centre`
  make_cluster <- function(centre) {
    cols <- lapply(centre, function(mu) rnorm(cluster_size, mean = mu, sd = 0.05))
    names(cols) <- paste0("x", seq_along(centre))
    tibble::as_tibble(cols)
  }

  # Midpoints of two clusters, keeping a random 20% of a cluster
  make_doublets <- function(first, second) {
    midpoints <- (first + second) / 2
    kept_rows <- sample(nrow(midpoints), cluster_size * 0.20)
    midpoints[kept_rows, ]
  }

  # First pair and its doublets
  df1 <- make_cluster(c(1, 0, 0, 1, 0, 0, 1, 1, 0, 0))
  df2 <- make_cluster(c(0, 0, 0, 1, 1, 0, 0, 1, 0, 0))
  df3 <- make_doublets(df1, df2)

  # Second pair and its doublets
  df4 <- make_cluster(c(-1, 0, 0, 1, 0, 0, -1, -1, 0, 0))
  df5 <- make_cluster(c(0, 0, 0, -1, -1, 0, 0, -1, 0, 0))
  df6 <- make_doublets(df4, df5)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6)

  if (num_of_noise_dim == 0) {
    return(df)
  }

  # Noise columns continue the x<k> numbering after the existing columns
  noise_idx <- seq_len(num_of_noise_dim)
  noise_cols <- lapply(noise_idx, function(j) {
    if ((j %% 2) == 0) {
      runif(sample_size, min = min_noise, max = max_noise)
    } else {
      (-1) * runif(sample_size, min = min_noise, max = max_noise)
    }
  })
  names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

  dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
}

## Dataset41

# Two Gaussian clusters in seven dimensions, one doublet group (midpoints of
# the two clusters) and a sample of wide "background" points, with optional
# uniform noise dimensions.
#
# Arguments:
#   sample_size       Total size; must satisfy (sample_size * 10) %% 25 == 0,
#                     i.e. be a multiple of 2.5 (2 clusters + 0.2 of a cluster
#                     of doublets + 0.3 of a cluster of background points).
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns a tibble with columns x1..x7 (rows stacked as df1, df2, df3, df4)
# followed by any noise columns.
one_doublets_with_bkg_noise <- function(sample_size = 250, with_seed = NULL, num_of_noise_dim = 0,
                                        min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divided by 2.5
  if (((sample_size * 10) %% 25) != 0) {
    stop("The sample size should be a product of 2.5.")
  }
  cluster_size <- sample_size / 2.5

  # Column i is normal with mean means[i] and sd sds[i]
  gaussian_cluster <- function(means, sds) {
    cols <- lapply(seq_along(means), function(i) {
      rnorm(cluster_size, mean = means[i], sd = sds[i])
    })
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  df1 <- gaussian_cluster(c(1, 0, 0, 1, 0, 0, 1), rep(0.05, 7))
  df2 <- gaussian_cluster(c(0, 0, 0, 1, 1, 0, 0), rep(0.05, 7))

  # Doublets: midpoints of df1 and df2, keeping a random 20% of a cluster
  df3_new <- (df1 + df2) / 2
  df3 <- df3_new[sample(nrow(df3_new), cluster_size * 0.20), ]

  # Background: a widely spread cluster of which a random 30% is kept
  df4_new <- gaussian_cluster(
    means = c(0, 0, 0.5, 0.2, 0.2, 0, 0),
    sds = c(0.2, 0.5, 0.5, 0.5, 0.3, 0.5, 0.3)
  )
  df4 <- df4_new[sample(nrow(df4_new), cluster_size * 0.30), ]

  df <- dplyr::bind_rows(df1, df2, df3, df4)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually generated: when cluster_size * 0.2
    # or cluster_size * 0.3 is fractional (e.g. sample_size = 10), NROW(df)
    # is smaller than sample_size and the column bind used to fail.
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset42

# Two arc-shaped branches in four dimensions (the second built from negated
# angles), with optional uniform noise dimensions.
#
# Arguments:
#   sample_size       Total size, split evenly over the two branches. An odd
#                     value triggers a warning and each branch gets
#                     floor(sample_size / 2) points.
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns a tibble with columns x1..x4 (branch 1 rows, then branch 2 rows)
# followed by any noise columns.
curvy_branching_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 0,
                                       min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divided by two
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
    cluster_size <- floor(sample_size / 2)
  } else {
    cluster_size <- sample_size / 2
  }

  # One branch: x1/x3 follow cos(angle), x2/x4 follow sin(angle), each shifted
  # by normal noise with mean 1 and sd 0.06
  branch <- function(angle) {
    tibble::tibble(
      x1 = cos(angle) + rnorm(cluster_size, 1, 0.06),
      x2 = sin(angle) + rnorm(cluster_size, 1, 0.06),
      x3 = cos(angle) + rnorm(cluster_size, 1, 0.06),
      x4 = sin(angle) + rnorm(cluster_size, 1, 0.06)
    )
  }

  theta <- runif(cluster_size, 0.20, 0.90 * pi)
  df1 <- branch(theta)

  theta1 <- runif(cluster_size, 0.20, 0.90 * pi)
  df2 <- branch(-theta1)

  # Namespace-qualified like the other generators, so dplyr need not be attached
  df <- dplyr::bind_rows(df1, df2)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually generated: for an odd sample_size
    # the branches hold sample_size - 1 rows in total, and drawing
    # sample_size noise values made the column bind fail.
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset43

# Three Gaussian clusters in four dimensions, two doublet groups (midpoints
# of cluster 1 with clusters 2 and 3) and a set of widely spread background
# points, with optional uniform noise dimensions.
#
# Arguments:
#   sample_size       Total size; must be divisible by 4. Each cluster gets
#                     sample_size / 4 points; each doublet group keeps 20% of
#                     a cluster and the background 60% of a cluster.
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns a tibble with columns x1..x4 (rows stacked as df1, df2, df3, df6,
# df7, df4) followed by any noise columns.
two_doublets_with_bkg_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 3,
                                        min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divided by four
  if ((sample_size %% 4) != 0) {
    stop("The sample size should be a product of 4.")
  }
  cluster_size <- sample_size / 4

  # `n` rows of independent normal columns centred on `means` with common sd
  gaussian_cluster <- function(n, means, sd) {
    cols <- lapply(means, function(m) rnorm(n, mean = m, sd = sd))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  # Midpoints of two clusters, keeping a random 20% of a cluster
  doublets <- function(a, b) {
    midpoints <- (a + b) / 2
    midpoints[sample(nrow(midpoints), cluster_size * 0.20), ]
  }

  df1 <- gaussian_cluster(cluster_size, c(0, 0, 0, 0), 0.05)
  df2 <- gaussian_cluster(cluster_size, c(1, 0, 0, 0), 0.05)
  df6 <- doublets(df1, df2)

  df3 <- gaussian_cluster(cluster_size, c(0, 1, 0, 0), 0.05)
  df7 <- doublets(df1, df3)

  # Background points: 60% of a cluster, spread with sd 0.5 around the origin
  df4 <- gaussian_cluster(cluster_size * 0.6, c(0, 0, 0, 0), 0.5)

  df <- dplyr::bind_rows(df1, df2, df3, df6, df7, df4)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually generated: when cluster_size * 0.2
    # or cluster_size * 0.6 is fractional (e.g. sample_size = 8), NROW(df) is
    # smaller than sample_size and the column bind used to fail.
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset44

# Two exponential-shaped clusters in four dimensions, the second shifted up
# by 3, with optional uniform noise dimensions.
#
# sample_size must be even; each cluster gets sample_size / 2 points.
# with_seed, when not NULL, is passed to set.seed(). When num_of_noise_dim is
# non-zero that many runif(min_noise, max_noise) columns are appended,
# odd-numbered ones multiplied by -1.
two_nonlinear_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 0,
                                     min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Reject odd sample sizes
  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  cluster_size <- sample_size / 2

  # Jittered -exp(x) curve used for x2..x4 of the first cluster
  lower_curve <- function(x) {
    -(exp(x) + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.7)
  }

  # Same curve shifted up by 3, used for the second cluster
  upper_curve <- function(x) {
    3 - (exp(x) + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.7)
  }

  x_lower <- runif(cluster_size, -8, 1.5)
  df1 <- tibble::tibble(
    x1 = x_lower,
    x2 = lower_curve(x_lower),
    x3 = lower_curve(x_lower),
    x4 = lower_curve(x_lower)
  )

  x_upper <- runif(cluster_size, -8, 1.5)
  df2 <- tibble::tibble(
    x1 = x_upper,
    x2 = upper_curve(x_upper),
    x3 = upper_curve(x_upper),
    x4 = upper_curve(x_upper)
  )

  df <- dplyr::bind_rows(df1, df2)

  if (num_of_noise_dim == 0) {
    return(df)
  }

  # Noise columns continue the x<k> numbering after the existing columns
  noise_idx <- seq_len(num_of_noise_dim)
  noise_cols <- lapply(noise_idx, function(j) {
    if ((j %% 2) == 0) {
      runif(sample_size, min = min_noise, max = max_noise)
    } else {
      (-1) * runif(sample_size, min = min_noise, max = max_noise)
    }
  })
  names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

  dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
}

## Dataset45

# Two copies of `snedata::s_curve_hole()` (the second shifted by +1 in every
# coordinate), with optional uniform noise dimensions.
#
# Arguments:
#   sample_size       Half of it is passed to `snedata::s_curve_hole()` as
#                     `n_samples`.
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns the stacked curves with columns x1..x3 followed by any noise
# columns x4, x5, ...
two_s_curve_hole_with_noise <- function(sample_size = 280, with_seed = NULL, num_of_noise_dim = 4,
                                        min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # TODO (carried over from the original author, "Need to change later"):
  # a lookup that requested extra points to compensate for the hole was
  # sketched here (280 -> 300, 660 -> 700, 954 -> 1000, 1400 -> 1500).

  ## S curve with a hole
  # Original note: more data should be requested because points are removed
  # to create the hole.
  df1 <- snedata::s_curve_hole(n_samples = sample_size / 2, noise = 0) |>
    dplyr::select(-color)
  names(df1) <- paste0("x", 1:3)

  df2 <- df1 + 1

  df <- dplyr::bind_rows(df1, df2)

  # Guarding on > 0 (and using seq_len()) keeps num_of_noise_dim = 0 from
  # looping over c(1, 0), which previously raised an error.
  if (num_of_noise_dim > 0) {
    # The noise is sized by the rows actually present rather than sample_size
    sample_size_n <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(sample_size_n, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset46

# Three square grids from `snedata::grid_data()` embedded in four dimensions:
# each grid varies in x1 and one of x2, x3, x4, while the remaining two
# dimensions get small uniform jitter. Optional uniform noise dimensions are
# appended afterwards.
#
# Arguments:
#   sample_size       Must be divisible by 3, and sample_size / 3 must be a
#                     perfect square; its square root is passed to
#                     `snedata::grid_data()` as `n`.
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns to append; 0 appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns the stacked grids with columns x1..x4 followed by any noise columns.
three_grid_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 2,
                                  min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of three.")
  }
  n_value <- sqrt(sample_size / 3)
  if ((n_value %% 1) != 0) {
    stop("The square root should exists.")
  }

  # One grid spanning x1 and x<varying>; the other two of x2..x4 are filled
  # with runif(-0.01, 0.01) jitter in increasing column order.
  grid_plane <- function(varying) {
    plane <- snedata::grid_data(n = n_value) |>
      dplyr::select(-color)
    names(plane) <- paste0("x", c(1, varying))

    for (flat in setdiff(2:4, varying)) {
      plane[[paste0("x", flat)]] <- runif(nrow(plane), -0.01, 0.01)
    }

    plane |>
      dplyr::select(x1, x2, x3, x4)
  }

  df1 <- grid_plane(2)
  df2 <- grid_plane(3)
  df3 <- grid_plane(4)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Guarding on > 0 (and using seq_len()) keeps num_of_noise_dim = 0 from
  # looping over c(1, 0), which previously raised an error.
  if (num_of_noise_dim > 0) {
    # The noise is sized by the rows actually present in the stacked grids
    n_rows <- NROW(df)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + noise_idx)

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset47

# Two square grids from `snedata::grid_data()` (the second shifted by +3),
# padded with uniform noise dimensions, plus normally distributed background
# points spread over every column.
#
# Arguments:
#   sample_size       6/26 of it is used for background points; the rest must
#                     be even and half of it must be a perfect square, whose
#                     root is passed to `snedata::grid_data()` as `n`.
#   with_seed         Seed for `set.seed()`; NULL leaves the RNG untouched.
#   num_of_noise_dim  Number of noise columns appended to the grids; 0
#                     appends none.
#   min_noise, max_noise
#                     Range handed to `runif()` for each noise column.
#
# Returns the grid rows (x1, x2 and the noise columns) followed by the
# background rows, which are drawn from rnorm(mean = 3, sd = 5) in every
# column.
one_grid_diff_with_bkg_noise <- function(sample_size = 260, with_seed = NULL, num_of_noise_dim = 5,
                                         min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Compute the split once so the validated quantities are exactly the ones
  # used below (the original validated with 6/26 but computed with 0.6/2.6).
  bkg_size <- sample_size * 6 / 26
  grid_points <- sample_size - bkg_size

  if ((grid_points %% 2) != 0) {
    stop("The sample size should be a product of two.")
  }
  n_value <- sqrt(grid_points / 2)
  if ((n_value %% 1) != 0) {
    stop("The square root should exists.")
  }

  grid <- snedata::grid_data(n = n_value) |>
    dplyr::select(-color)
  names(grid) <- paste0("x", 1:2)

  # Second grid is the first one shifted by 3 in both coordinates
  df1 <- dplyr::bind_rows(grid, grid + 3)

  # Guarding on > 0 (and using seq_len()) keeps num_of_noise_dim = 0 from
  # looping over c(1, 0), which previously raised an error.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df1)
    noise_idx <- seq_len(num_of_noise_dim)

    noise_cols <- lapply(noise_idx, function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else (-1) * draws
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + noise_idx)

    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  ## To add background noise
  bkg_idx <- seq_len(NCOL(df1))
  bkg_cols <- lapply(bkg_idx, function(j) rnorm(bkg_size, mean = 3, sd = 5))
  names(bkg_cols) <- paste0("x", bkg_idx)
  df2 <- tibble::as_tibble(bkg_cols)

  dplyr::bind_rows(df1, df2)
}

## Dataset48

# Two copies of a square grid (the second shifted by +5 in x1 and x2) with
# uniform noise dimensions appended, followed by Gaussian background rows.
#
# Arguments:
#   sample_size      Total number of rows. 6/26 of them are background noise,
#                    the remaining 20/26 are split evenly between two grids,
#                    so (sample_size * 20/26) / 2 must be a perfect square.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended to the grids
#                    (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns the grid rows followed by the background rows, with columns
# x1, x2, ..., x(2 + num_of_noise_dim).
two_grid_with_bkg_noise <- function(sample_size = 260, with_seed = NULL, num_of_noise_dim = 5,
                                    min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Derive both sizes from the same 6/26 expression used by the validation,
  # so the validated quantity is exactly the one that is used afterwards.
  bkg_size <- sample_size * 6/26
  grid_size <- sample_size - bkg_size

  if ((grid_size %% 2) != 0) {
    stop("The sample size should be a product of two.")
  }

  n_value <- sqrt(grid_size / 2)

  if ((n_value %% 1) != 0) {
    stop("The square root should exists.")
  }

  # NOTE(review): snedata::grid_data() is assumed to return columns
  # x, y, color for an n-by-n grid -- confirm against snedata.
  grid <- snedata::grid_data(n = n_value) |>
    dplyr::select(-color)
  names(grid) <- paste0("x", 1:2)

  df1 <- dplyr::bind_rows(grid, grid + 5)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    grid_rows <- NROW(df1)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(grid_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  # Background noise: one N(3, 5^2) draw per existing column
  bkg_names <- paste0("x", seq_len(NCOL(df1)))
  bkg_cols <- lapply(bkg_names, function(nm) {
    stats::rnorm(bkg_size, mean = 3, sd = 5)
  })
  names(bkg_cols) <- bkg_names
  df2 <- tibble::as_tibble(bkg_cols)

  dplyr::bind_rows(df1, df2)
}

## Dataset49

# Points generated by repeatedly moving half-way from the current point to a
# randomly chosen one of four fixed 3-D corner points, with uniform noise
# dimensions appended.
#
# Arguments:
#   sample_size      Number of rows to generate.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1, x2, x3 followed by the noise columns.
traingular_3D_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 7,
                                     min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Random starting point, drawn before any corner is sampled
  trace_point <- runif(3)

  # One corner per row
  corner_points <- cbind(x1 = c(0, 1, 0.5, 0.5),
                         x2 = c(0, 0, 1, 0.5),
                         x3 = c(0, 0, 0, 1))

  # A plain numeric matrix is filled row by row; this avoids modifying a
  # tibble one row at a time inside the loop.
  points <- matrix(0, nrow = sample_size, ncol = 3,
                   dimnames = list(NULL, colnames(corner_points)))

  for (i in seq_len(sample_size)) {
    # Move half-way towards a uniformly chosen corner
    trace_point <- (corner_points[sample(4, 1), ] + trace_point) / 2
    points[i, ] <- trace_point
  }

  df <- tibble::as_tibble(points)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset50

# A 2-D point cloud generated by repeatedly moving half-way from the current
# point to a randomly chosen one of three fixed corner points, with uniform
# noise dimensions appended; the result stacks this cloud, a block of
# Gaussian background rows, and the negated cloud.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 3 (each of
#                    the three stacked parts gets sample_size / 3 rows).
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a data frame with columns x1, x2 followed by the noise columns.
triangular_plane_with_bkg_noise <- function(sample_size = 675, with_seed = NULL, num_of_noise_dim = 2,
                                            min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The three stacked parts must have equal size
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  # Random starting point, drawn before any corner is sampled
  trace_point <- runif(2)

  # One corner per row
  corner_points <- cbind(x1 = c(0, 1, 0.5),
                         x2 = c(0, 0, 1))

  # A plain numeric matrix is filled row by row; this avoids modifying a
  # tibble one row at a time inside the loop.
  points <- matrix(0, nrow = cluster_size, ncol = 2,
                   dimnames = list(NULL, colnames(corner_points)))

  for (i in seq_len(cluster_size)) {
    # Move half-way towards a uniformly chosen corner
    trace_point <- (corner_points[sample(3, 1), ] + trace_point) / 2
    points[i, ] <- trace_point
  }

  df1 <- tibble::as_tibble(points)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(cluster_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  # Background noise: one N(0.025, 0.5^2) draw per existing column
  bkg_names <- paste0("x", seq_len(NCOL(df1)))
  bkg_cols <- lapply(bkg_names, function(nm) {
    stats::rnorm(cluster_size, mean = 0.025, sd = 0.5)
  })
  names(bkg_cols) <- bkg_names
  df2 <- tibble::as_tibble(bkg_cols)

  # Cloud, background, then the cloud with every coordinate negated
  dplyr::bind_rows(df1, df2, -df1)
}

## Dataset51

# Two noisy circular arcs in four dimensions, the second shifted by +1 in
# every coordinate, optionally followed by uniform noise dimensions.
#
# Arguments:
#   sample_size      Total number of rows; must be even (half per arc).
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1..x4 followed by the noise columns.
two_curvilinear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 0,
                                       min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each arc receives half of the rows, so an odd total is rejected
  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  half <- sample_size / 2

  # Gaussian jitter (mean 1, sd 0.06) added to every coordinate
  jitter <- function() {
    rnorm(half, 1, 0.06)
  }

  angle_a <- runif(half, 0.20, 0.90 * pi)
  arc_a <- tibble::tibble(
    x1 = cos(angle_a) + jitter(),
    x2 = sin(angle_a) + jitter(),
    x3 = cos(angle_a) + jitter(),
    x4 = sin(angle_a) + jitter()
  )

  angle_b <- stats::runif(half, 0.20, 0.90 * pi)
  arc_b <- tibble::tibble(
    x1 = 1 + cos(angle_b) + jitter(),
    x2 = 1 + sin(angle_b) + jitter(),
    x3 = 1 + cos(angle_b) + jitter(),
    x4 = 1 + sin(angle_b) + jitter()
  )

  arcs <- dplyr::bind_rows(arc_a, arc_b)

  # Nothing more to do when no noise dimensions were requested
  if (num_of_noise_dim == 0) {
    return(arcs)
  }

  # Noise columns continue the x<k> numbering; odd-numbered ones are
  # sign-flipped
  labels <- paste0("x", NCOL(arcs) + seq_len(num_of_noise_dim))
  extras <- list()
  for (k in seq_len(num_of_noise_dim)) {
    flip <- if ((k %% 2) == 0) 1 else -1
    extras[[labels[k]]] <- flip * stats::runif(sample_size,
                                               min = min_noise, max = max_noise)
  }

  dplyr::bind_cols(arcs, tibble::as_tibble(extras))
}

## Dataset52

# Two noisy circular arcs in four dimensions with different angular ranges;
# the second arc is shifted by +1 in x1 and x2 only. Optionally followed by
# uniform noise dimensions.
#
# Arguments:
#   sample_size      Total number of rows; must be even (half per arc).
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1..x4 followed by the noise columns.
two_curvilinear_diff_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 0,
                                            min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Each arc receives half of the rows, so an odd total is rejected.
  # (The assignment that used to follow stop() could never run.)
  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  cluster_size <- sample_size / 2

  theta <- runif(cluster_size, 0.40, 0.70 * pi)

  df1 <- tibble::tibble(
    x1 = cos(theta) + rnorm(cluster_size, 1, 0.06),
    x2 = sin(theta) + rnorm(cluster_size, 1, 0.06),

    x3 = cos(theta) + rnorm(cluster_size, 1, 0.06),
    x4 = sin(theta) + rnorm(cluster_size, 1, 0.06)
  )

  theta1 <- runif(cluster_size, 0.20, 0.90 * pi)

  df2 <- tibble::tibble(
    x1 = 1 + cos(theta1) + rnorm(cluster_size, 1, 0.06),
    x2 = 1 + sin(theta1) + rnorm(cluster_size, 1, 0.06),

    x3 = cos(theta1) + rnorm(cluster_size, 1, 0.06),
    x4 = sin(theta1) + rnorm(cluster_size, 1, 0.06)
  )

  # Namespaced so the function does not depend on dplyr being attached
  df <- dplyr::bind_rows(df1, df2)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped
  if (num_of_noise_dim != 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset53

# Three elongated clusters in 2-D built from snedata::long_cluster_data():
# the first colour group shifted by (-20, -20), the second colour group
# unchanged, and the first colour group shifted by (+10, +10); uniform noise
# dimensions are appended.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 3.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1, x2 followed by the noise columns.
two_linear_diff_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 5,
                                       min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Three clusters of equal size
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  # NOTE(review): long_cluster_data() is assumed to return columns
  # x, y, color with cluster_size points per colour group -- confirm.
  df_2_split <- snedata::long_cluster_data(n = cluster_size) |>
    dplyr::group_by(color) |>
    dplyr::group_split()

  df_2_split_1 <- df_2_split[[1]]
  df_2_split_1$x <- df_2_split_1$x - 20
  df_2_split_1$y <- df_2_split_1$y - 20

  # The third cluster is a second shifted copy of the first colour group
  df_2_split_3 <- df_2_split[[1]]
  df_2_split_3$x <- df_2_split_3$x + 10
  df_2_split_3$y <- df_2_split_3$y + 10

  df <- dplyr::bind_rows(df_2_split_1, df_2_split[[2]], df_2_split_3) |>
    dplyr::select(-color)

  names(df) <- paste0("x", 1:2)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset54

# Three elongated clusters in 2-D built from snedata::long_cluster_data():
# the first colour group shifted by (-20, -20), the second colour group
# unchanged, and the first colour group shifted by (-10, +10); uniform noise
# dimensions are appended.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 3.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1, x2 followed by the noise columns.
three_linear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 5,
                                    min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Three clusters of equal size
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  # NOTE(review): long_cluster_data() is assumed to return columns
  # x, y, color with cluster_size points per colour group -- confirm.
  df_2_split <- snedata::long_cluster_data(n = cluster_size) |>
    dplyr::group_by(color) |>
    dplyr::group_split()

  df_2_split_1 <- df_2_split[[1]]
  df_2_split_1$x <- df_2_split_1$x - 20
  df_2_split_1$y <- df_2_split_1$y - 20

  # The third cluster is a second shifted copy of the first colour group
  df_2_split_3 <- df_2_split[[1]]
  df_2_split_3$x <- df_2_split_3$x - 10
  df_2_split_3$y <- df_2_split_3$y + 10

  df <- dplyr::bind_rows(df_2_split_1, df_2_split[[2]], df_2_split_3) |>
    dplyr::select(-color)

  names(df) <- paste0("x", 1:2)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset55

# Three 4-D clusters sharing the same random draws: (x1, x2) follows a sine
# curve shifted by 0, +1 and -1, while (x3, x4) is a point in a unit disc
# centred at (4, 4), (6, 6) and (8, 8). Optionally followed by uniform noise
# dimensions.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 3.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1..x4 followed by the noise columns.
three_nonlinear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 0,
                                       min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Three clusters of equal size.
  # (The assignment that used to follow stop() could never run.)
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  # Polar coordinates for the disc; sqrt() of a uniform radius spreads the
  # points evenly over the disc area
  phi <- runif(cluster_size, max = 2 * pi)
  rho <- sqrt(runif(cluster_size))

  theta <- runif(cluster_size, 0, 1.80 * pi)
  x <- theta
  y <- sin(theta)

  disc_x <- rho * cos(phi)
  disc_y <- rho * sin(phi)

  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = disc_x + 4, x4 = disc_y + 4)
  df2 <- tibble::tibble(x1 = x + 1, x2 = y + 1, x3 = disc_x + 6, x4 = disc_y + 6)
  df3 <- tibble::tibble(x1 = x - 1, x2 = y - 1, x3 = disc_x + 8, x4 = disc_y + 8)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped
  if (num_of_noise_dim != 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset56

# Three tight Gaussian blobs in four dimensions plus a copy of all three
# shifted by +2 in every coordinate, optionally followed by uniform noise
# dimensions.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 6.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a data frame with columns x1..x4 followed by the noise columns.
three_cluster_mirror_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 0,
                                            min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Six blobs of equal size (three originals and three shifted copies)
  if ((sample_size %% 6) != 0) {
    stop("The sample size should be a product of 6.")
  }
  per_blob <- sample_size / 6

  axis_names <- paste0("x", 1:4)

  # One blob: independent N(centre[k], 0.05^2) draws, coordinate by coordinate
  make_blob <- function(centre) {
    coords <- lapply(centre, function(m) rnorm(per_blob, mean = m, sd = 0.05))
    names(coords) <- axis_names
    tibble::as_tibble(coords)
  }

  blob_origin <- make_blob(c(0, 0, 0, 0))
  blob_x1 <- make_blob(c(1, 0, 0, 0))
  blob_x2 <- make_blob(c(0, 1, 0, 0))

  originals <- dplyr::bind_rows(blob_origin, blob_x1, blob_x2)
  shifted <- originals + 2
  blobs <- dplyr::bind_rows(originals, shifted)

  # Nothing more to do when no noise dimensions were requested
  if (num_of_noise_dim == 0) {
    return(blobs)
  }

  # Noise columns continue the x<k> numbering; odd-numbered ones are
  # sign-flipped
  labels <- paste0("x", NCOL(blobs) + seq_len(num_of_noise_dim))
  extras <- list()
  for (k in seq_len(num_of_noise_dim)) {
    flip <- if ((k %% 2) == 0) 1 else -1
    extras[[labels[k]]] <- flip * runif(sample_size,
                                        min = min_noise, max = max_noise)
  }

  dplyr::bind_cols(blobs, tibble::as_tibble(extras))
}

## Dataset57

# An S-curve from snedata::s_curve() (generated with noise = 0.05) with
# uniform noise dimensions appended.
#
# Arguments:
#   sample_size      Number of rows requested from snedata::s_curve().
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a data frame with columns x1, x2, x3 followed by the noise columns.
s_curve_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 4,
                               min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The three renamed columns are whatever remains after dropping `color`
  df <- snedata::s_curve(n_samples = sample_size, noise = 0.05) |>
    dplyr::select(-color)
  names(df) <- paste0("x", 1:3)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset58

# Points from geozoo::mobius.experiment(p = 5) with uniform noise dimensions
# appended, followed by Gaussian background rows.
#
# Arguments:
#   sample_size      Nominal total number of rows: 80% is requested from
#                    mobius.experiment() and 20% is background noise.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended to the Mobius
#                    points (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns the Mobius rows followed by the background rows; columns are named
# x1, x2, ... in order.
mobius_cluster_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 7,
                                      min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  mobius <- geozoo::mobius.experiment(p = 5, n = sample_size * 0.80)

  df1 <- tibble::as_tibble(mobius$points)
  names(df1) <- paste0("x", seq_along(names(df1)))

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    # Use the actual number of Mobius rows rather than sample_size * 0.80,
    # which need not be a whole number.
    # NOTE(review): presumably equals the requested n -- confirm with geozoo.
    mobius_rows <- NROW(df1)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(mobius_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  # Background noise: one N(0, 0.3^2) draw per existing column
  bkg_names <- paste0("x", seq_len(NCOL(df1)))
  bkg_cols <- lapply(bkg_names, function(nm) {
    stats::rnorm(sample_size * 0.20, mean = 0, sd = 0.3)
  })
  names(bkg_cols) <- bkg_names
  df2 <- tibble::as_tibble(bkg_cols)

  dplyr::bind_rows(df1, df2)
}

## Dataset59

# Four elongated clusters in 2-D built from snedata::long_cluster_data()
# (three shifted copies of the first colour group plus the second colour
# group unchanged) with uniform noise dimensions appended, followed by
# Gaussian background rows.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 5 (four
#                    clusters and one background block of equal size).
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended to the
#                    clusters (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns the cluster rows followed by the background rows, with columns
# x1, x2, ..., x(2 + num_of_noise_dim).
four_long_clusters_with_bkg_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 5,
                                              min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Five blocks of equal size
  if ((sample_size %% 5) != 0) {
    stop("The sample size should be a product of 5.")
  }
  cluster_size <- sample_size / 5

  # NOTE(review): long_cluster_data() is assumed to return columns
  # x, y, color with cluster_size points per colour group -- confirm.
  df_2_split <- snedata::long_cluster_data(n = cluster_size) |>
    dplyr::group_by(color) |>
    dplyr::group_split()

  df_2_split_1 <- df_2_split[[1]]
  df_2_split_1$x <- df_2_split_1$x - 20
  df_2_split_1$y <- df_2_split_1$y - 20

  df_2_split_3 <- df_2_split[[1]]
  df_2_split_3$x <- df_2_split_3$x - 10
  df_2_split_3$y <- df_2_split_3$y + 10

  df_2_split_4 <- df_2_split[[1]]
  df_2_split_4$x <- df_2_split_4$x + 20
  df_2_split_4$y <- df_2_split_4$y + 30

  # Namespaced so the function does not depend on dplyr being attached
  df1 <- dplyr::bind_rows(df_2_split_1, df_2_split[[2]], df_2_split_3, df_2_split_4) |>
    dplyr::select(-color)

  names(df1) <- paste0("x", 1:2)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(cluster_size * 4, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  # Background noise: one N(0, 10^2) draw per existing column
  bkg_names <- paste0("x", seq_len(NCOL(df1)))
  bkg_cols <- lapply(bkg_names, function(nm) {
    stats::rnorm(cluster_size, mean = 0, sd = 10)
  })
  names(bkg_cols) <- bkg_names
  df2 <- tibble::as_tibble(bkg_cols)

  dplyr::bind_rows(df1, df2)
}

## Dataset60

# Four equally sized 4-D groups: a noisy arc, a second noisy arc traced with
# negated angles, a tight Gaussian blob (sd 0.08) and a wide Gaussian blob
# (sd 1), both centred at 1. Optionally followed by uniform noise dimensions.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 4.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1..x4 followed by the noise columns.
curvy_branching_cluster_with_bkg_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 0,
                                                   min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Four groups of equal size
  if ((sample_size %% 4) != 0) {
    stop("The sample size should be a product of 4.")
  }
  quarter <- sample_size / 4

  axis_names <- paste0("x", 1:4)

  # Gaussian jitter (mean 1, sd 0.06) added to every arc coordinate
  wobble <- function() {
    rnorm(quarter, 1, 0.06)
  }

  # Isotropic Gaussian blob centred at 1, drawn coordinate by coordinate
  gaussian_blob <- function(spread) {
    coords <- lapply(axis_names, function(nm) rnorm(quarter, mean = 1, sd = spread))
    names(coords) <- axis_names
    tibble::as_tibble(coords)
  }

  angle_up <- runif(quarter, 0.20, 0.90 * pi)
  arc_up <- tibble::tibble(
    x1 = cos(angle_up) + wobble(),
    x2 = sin(angle_up) + wobble(),
    x3 = cos(angle_up) + wobble(),
    x4 = sin(angle_up) + wobble()
  )

  angle_down <- runif(quarter, 0.20, 0.90 * pi)
  arc_down <- tibble::tibble(
    x1 = cos(-angle_down) + wobble(),
    x2 = sin(-angle_down) + wobble(),
    x3 = cos(-angle_down) + wobble(),
    x4 = sin(-angle_down) + wobble()
  )

  tight_blob <- gaussian_blob(0.08)
  wide_blob <- gaussian_blob(1)

  groups <- dplyr::bind_rows(arc_up, arc_down, tight_blob, wide_blob)

  # Nothing more to do when no noise dimensions were requested
  if (num_of_noise_dim == 0) {
    return(groups)
  }

  # Noise columns continue the x<k> numbering; odd-numbered ones are
  # sign-flipped
  labels <- paste0("x", NCOL(groups) + seq_len(num_of_noise_dim))
  extras <- list()
  for (k in seq_len(num_of_noise_dim)) {
    flip <- if ((k %% 2) == 0) 1 else -1
    extras[[labels[k]]] <- flip * runif(sample_size,
                                        min = min_noise, max = max_noise)
  }

  dplyr::bind_cols(groups, tibble::as_tibble(extras))
}

## Dataset61

# Three elongated clusters in 2-D built from snedata::long_cluster_data():
# the first colour group shifted by (-250, -20), the second colour group
# unchanged, and the first colour group mapped through (x, y) -> (-y, x);
# uniform noise dimensions are appended.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 3.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1, x2 followed by the noise columns.
three_diff_linear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
                                         min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Three clusters of equal size
  if ((sample_size %% 3) != 0) {
    stop("The sample size should be a product of 3.")
  }
  cluster_size <- sample_size / 3

  # NOTE(review): long_cluster_data() is assumed to return columns
  # x, y, color with cluster_size points per colour group -- confirm.
  df_2_split <- snedata::long_cluster_data(n = cluster_size) |>
    dplyr::group_by(color) |>
    dplyr::group_split()

  df_2_split_1 <- df_2_split[[1]]
  df_2_split_1$x <- df_2_split_1$x - 250
  df_2_split_1$y <- df_2_split_1$y - 20

  # Quarter-turn of the first colour group: (x, y) -> (-y, x)
  df_2_split_3 <- tibble::tibble(x = -df_2_split[[1]]$y, y = df_2_split[[1]]$x)

  df <- dplyr::bind_rows(df_2_split_1, df_2_split[[2]], df_2_split_3) |>
    dplyr::select(-color)

  names(df) <- paste0("x", 1:2)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset62

# Four elongated clusters in 2-D built from snedata::long_cluster_data():
# the first colour group shifted by (-150, -20), the second colour group
# unchanged, the first colour group mapped through (x, y) -> (y - 70, -x),
# and that mapped cluster shifted by +150 in y; uniform noise dimensions are
# appended.
#
# Arguments:
#   sample_size      Total number of rows; must be divisible by 4.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1, x2 followed by the noise columns.
four_diff_long_clutsers_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 2,
                                               min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Four clusters of equal size
  if ((sample_size %% 4) != 0) {
    stop("The sample size should be a product of 4.")
  }
  cluster_size <- sample_size / 4

  # NOTE(review): long_cluster_data() is assumed to return columns
  # x, y, color with cluster_size points per colour group -- confirm.
  df_2_split <- snedata::long_cluster_data(n = cluster_size) |>
    dplyr::group_by(color) |>
    dplyr::group_split()

  df_2_split_1 <- df_2_split[[1]]
  df_2_split_1$x <- df_2_split_1$x - 150
  df_2_split_1$y <- df_2_split_1$y - 20

  # First colour group with its axes swapped: (x, y) -> (y - 70, -x)
  df_2_split_3 <- tibble::tibble(x = df_2_split[[1]]$y - 70, y = -df_2_split[[1]]$x)

  # Same cluster again, moved up by 150
  df_2_split_4 <- tibble::tibble(x = df_2_split_3$x, y = df_2_split_3$y + 150)

  df <- dplyr::bind_rows(df_2_split_1, df_2_split[[2]], df_2_split_3, df_2_split_4) |>
    dplyr::select(-color)

  names(df) <- paste0("x", 1:2)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}


## Dataset63

# Two S-curves in 3-D: one from snedata::s_curve() and a copy with x1
# reflected and shifted (x1 -> -x1 + 5) and x2, x3 shifted by +1; uniform
# noise dimensions are appended.
#
# Arguments:
#   sample_size      Requested total number of rows. An odd value triggers a
#                    warning and each curve gets floor(sample_size / 2) rows.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a tibble with columns x1, x2, x3 followed by the noise columns.
two_s_curves_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 7,
                                    min_noise = -0.05, max_noise = 0.05) {
  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Two curves of equal size; an odd total is rounded down with a warning
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of 2.")
    cluster_size <- floor(sample_size / 2)
  } else {
    cluster_size <- sample_size / 2
  }

  # The three renamed columns are whatever remains after dropping `color`
  df1 <- snedata::s_curve(n_samples = cluster_size) |>
    dplyr::select(-color)
  names(df1) <- paste0("x", 1:3)

  df2 <- tibble::tibble(x1 = -df1$x1 + 5, x2 = df1$x2 + 1, x3 = df1$x3 + 1)

  df <- dplyr::bind_rows(df1, df2)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually generated: after the odd-size
    # fallback above this is one less than sample_size, and columns of
    # different lengths cannot be bound together.
    total_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(total_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}

## Dataset64

# Four transformed copies of a filtered 2-D plane, with uniform noise
# dimensions appended.
#
# A plane is sampled as x1 = u + v - 10, x2 = v - u + 8 with
# u ~ U(10, 30) and v ~ U(10, 20); only points with
# (x1 - 1) + (x2 - 1) > 30 are kept. The kept points are then stacked four
# times: shifted by -10, shifted by +10, and two copies with the axes
# swapped, one sign flipped and offsets added.
#
# Arguments:
#   sample_size      Number of points drawn BEFORE filtering. The returned
#                    row count is four times the number of kept points, so
#                    it generally differs from sample_size.
#   with_seed        Optional seed passed to set.seed(); NULL leaves the RNG
#                    state untouched.
#   num_of_noise_dim Number of uniform noise columns appended (0 appends none).
#   min_noise, max_noise  Range of the uniform noise.
#
# Returns a data frame with columns x1, x2 followed by the noise columns.
plane_2D_with_hole <- function(sample_size = 800, with_seed = NULL, num_of_noise_dim = 8, min_noise = 0, max_noise = 1) {

  # Seed only when one is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  u <- runif(sample_size, min = 10, max = 30)
  v <- runif(sample_size, min = 10, max = 20)
  x <- u + v - 10
  y <- v - u + 8

  df1 <- tibble::tibble(x1 = x, x2 = y)

  # Keep only the points whose anchor-relative coordinates sum to more
  # than 30
  anchor <- c(1, 1)
  indices <- rowSums((sweep(df1, 2, anchor, `-`))) > 30
  df1 <- df1[indices, ]
  rownames(df1) <- NULL

  # Copies with the axes swapped, one sign flipped and offsets added
  df2 <- tibble::tibble(x1 = -df1$x2 + 26, x2 = df1$x1 - 15)
  df3 <- tibble::tibble(x1 = df1$x2 + 30, x2 = -df1$x1 + 25)

  df <- dplyr::bind_rows(df1 - 10, df1 + 10, df2, df3)

  # Append the uniform noise dimensions; odd-numbered ones are sign-flipped.
  # Guarded because 1:num_of_noise_dim would count down (1, 0) for zero.
  if (num_of_noise_dim > 0) {
    # The noise is sized by the rows that survived the filter
    sample_size_n <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(sample_size_n, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}

## Dataset65

# Generate an S-curve together with a mirrored copy of it, plus optional
# uniform noise dimensions.
#
# Arguments:
#   sample_size      Total number of points; must be even because each of the
#                    two curves receives half of them.
#   with_seed        Optional seed passed to `set.seed()`; `NULL` leaves the
#                    RNG state untouched.
#   num_of_noise_dim Number of noise columns to append (0 appends none).
#   min_noise,
#   max_noise        Range of the uniform noise. Odd-numbered noise columns
#                    are sign-flipped.
#
# Returns a data frame with columns `x1`..`x3` followed by the noise columns.
# Stops when `sample_size` is odd.
mirror_s_curves_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 7,
                                       min_noise = -0.05, max_noise = 0.05) {
  # Fix the RNG state only when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The sample is split evenly over the two curves, so it must be even
  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  cluster_size <- sample_size / 2

  # NOTE(review): assumes snedata::s_curve() returns three coordinate columns
  # plus a `color` column and `n_samples` rows -- confirm against snedata.
  df1 <- snedata::s_curve(n_samples = cluster_size)
  df1 <- df1 |>
    dplyr::select(-color)
  names(df1) <- paste0("x", 1:3)

  # Mirror the curve along the first coordinate and shift it by 2
  df2 <- tibble::tibble(x1 = -df1$x1 + 2, x2 = df1$x2, x3 = df1$x3)

  df <- dplyr::bind_rows(df1, df2)

  # Append the noise dimensions. Skipped entirely when none are requested:
  # `1:0` would iterate over c(1, 0) and fail on the zero index.
  if (num_of_noise_dim > 0) {
    noise_names <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
      noise <- runif(sample_size, min = min_noise, max = max_noise)
      # Even-numbered noise columns keep their sign, odd-numbered are negated
      noise_dim_val_list[[noise_names[j]]] <- if ((j %% 2) == 0) noise else -noise
    }

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df

}

## Dataset66

# Generate Gaussian clusters of different sizes, plus optional uniform noise
# dimensions.
#
# Arguments:
#   sample_size      Total number of points.
#   with_seed        Optional seed passed to `set.seed()`; `NULL` leaves the
#                    RNG state untouched.
#   cluster_size_vec NOTE(review): kept for interface compatibility, but the
#                    value supplied by the caller is always replaced by sizes
#                    derived from `sample_size` (see below).
#   num_clusters     Number of clusters.
#   mean_matrix      One row of means per cluster, one column per dimension.
#   var_vec          One spread value per cluster.
#                    NOTE(review): the value is passed to `rnorm()` as `sd`,
#                    not as a variance -- confirm which is intended.
#   num_dims         Number of cluster dimensions (at least 2).
#   num_noise_dims   Number of noise columns to append (0 appends none).
#   min_noise,
#   max_noise        Range of the uniform noise. Odd-numbered noise columns
#                    are sign-flipped.
#
# Cluster sizes: with four clusters and a sample size of 400, 500, 1000 or
# 1500 a fixed unequal split is used. Otherwise the points are split as evenly
# as possible over `num_clusters`, the first clusters receiving one extra
# point when `sample_size` is not divisible.
#
# Returns a tibble with columns `x1`..`x<num_dims>` followed by the noise
# columns. Stops when the arguments are inconsistent with each other.
gaussian_clusters_diff_points <- function(sample_size = 400, with_seed = NULL, cluster_size_vec = c(50, 100, 200, 50), num_clusters = 4, mean_matrix = rbind(c(1,0,0,0,0,0), c(0,1,0,0,0,0), c(0,0,1,0,0,0), c(0,0,0,1,0,0)),
                                          var_vec = c(0.02, 0.05, 0.06, 0.1), num_dims = 6, num_noise_dims = 4,
                                          min_noise = -0.05, max_noise = 0.05) {

  # Fix the RNG state only when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Validate the arguments before any data is generated
  if (sample_size < num_clusters) {
    stop('Number of clusters exceed the number of observations.')
  }

  if ((num_dims == 0) || (num_dims == 1)) {
    stop('There should be at least two dimensions.')
  }

  if (dim(mean_matrix)[1] != length(var_vec)) {
    stop('The length of mean and variance vectors are different.')
  }

  if (dim(mean_matrix)[1] != num_clusters) {
    stop('There is not enough mean values for clusters.')
  }

  if (dim(mean_matrix)[2] != num_dims) {
    stop('There is not enough mean values for dimensions.')
  }

  if (length(var_vec) != num_clusters) {
    stop('There is not enough varaiance values for clusters.')
  }

  # The fixed splits have four entries, so they only apply to four clusters
  four_clusters <- num_clusters == 4

  if (four_clusters && sample_size == 400) {
    cluster_size_vec <- c(50, 100, 200, 50)
  } else if (four_clusters && sample_size == 500) {
    cluster_size_vec <- c(50, 150, 200, 100)
  } else if (four_clusters && sample_size == 1000) {
    cluster_size_vec <- c(300, 350, 200, 150)
  } else if (four_clusters && sample_size == 1500) {
    cluster_size_vec <- c(450, 350, 400, 300)
  } else {
    # Even split over the requested number of clusters; the remainder is
    # handed out one point at a time so the sizes still sum to `sample_size`
    base_size <- sample_size %/% num_clusters
    remainder <- sample_size %% num_clusters
    cluster_size_vec <- rep(base_size, num_clusters) +
      (seq_len(num_clusters) <= remainder)
  }

  column_names <- paste0("x", seq_len(num_dims))

  # Generate each cluster separately and bind them once at the end
  cluster_list <- vector("list", num_clusters)

  for (i in seq_len(num_clusters)) {

    # Means and spread of the current cluster
    mean_val_for_cluster <- unlist(mean_matrix[i, ], use.names = FALSE)
    variance_val_for_cluster <- var_vec[i]
    num_points_cluster <- cluster_size_vec[i]

    dim_val_list <- list()

    for (j in seq_len(num_dims)) {
      dim_val_list[[column_names[j]]] <- stats::rnorm(num_points_cluster,
                                                      mean = mean_val_for_cluster[j],
                                                      sd = variance_val_for_cluster)
    }

    cluster_list[[i]] <- tibble::as_tibble(dim_val_list)

  }

  df <- dplyr::bind_rows(cluster_list)

  # Append the noise dimensions. Skipped entirely when none are requested:
  # `1:0` would iterate over c(1, 0) and fail on the zero index.
  if (num_noise_dims > 0) {
    noise_names <- paste0("x", NCOL(df) + seq_len(num_noise_dims))
    n_rows <- NROW(df)
    noise_dim_val_list <- list()

    for (j in seq_len(num_noise_dims)) {
      noise <- runif(n_rows, min = min_noise, max = max_noise)
      # Even-numbered noise columns keep their sign, odd-numbered are negated
      noise_dim_val_list[[noise_names[j]]] <- if ((j %% 2) == 0) noise else -noise
    }

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df

}

## Dataset67

# Generate one curvilinear cluster and one Gaussian cluster in four
# dimensions, plus optional uniform noise dimensions.
#
# Arguments:
#   sample_size      Total number of points.
#   cluster_size_vec NOTE(review): kept for interface compatibility, but the
#                    value supplied by the caller is always replaced by sizes
#                    derived from `sample_size` (see below).
#   with_seed        Optional seed passed to `set.seed()`; `NULL` leaves the
#                    RNG state untouched.
#   num_of_noise_dim Number of noise columns to append (0 appends none).
#   min_noise,
#   max_noise        Range of the uniform noise. Odd-numbered noise columns
#                    are sign-flipped.
#
# Cluster sizes: a fixed unequal split is used for sample sizes 200, 500,
# 1000 and 1500. Any other sample size is split in half, the first cluster
# receiving the extra point when `sample_size` is odd.
#
# Returns a tibble with columns `x1`..`x4` followed by the noise columns.
cluster_and_curvilinear_with_noise <- function(sample_size = 200, cluster_size_vec = c(50, 150), with_seed = NULL, num_of_noise_dim = 3,
                                               min_noise = -0.05, max_noise = 0.05) {
  # Fix the RNG state only when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if (sample_size == 200) {
    cluster_size_vec <- c(50, 150)
  } else if (sample_size == 500) {
    cluster_size_vec <- c(200, 300)
  } else if (sample_size == 1000) {
    cluster_size_vec <- c(600, 400)
  } else if (sample_size == 1500) {
    cluster_size_vec <- c(500, 1000)
  } else {
    # Whole-number halves that still sum to `sample_size`; fractional sizes
    # would be truncated by the generators and break the noise binding
    cluster_size_vec <- c(ceiling(sample_size / 2), floor(sample_size / 2))
  }

  n_curve <- cluster_size_vec[1]
  n_blob <- cluster_size_vec[2]

  # Curvilinear cluster: an arc in (x1, x2) shifted by noise centred on 10;
  # x3 is centred on 10 and x4 on -10
  theta <- runif(n_curve, 0.20, 0.60 * pi)
  x <- cos(theta) + rnorm(n_curve, 10, 0.03)
  y <- sin(theta) + rnorm(n_curve, 10, 0.03)
  z <- rnorm(n_curve, 10, 0.03)
  w <- -rnorm(n_curve, 10, 0.03)

  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Gaussian cluster with the same centres and a slightly larger spread
  x <- rnorm(n_blob, 10, 0.05)
  y <- rnorm(n_blob, 10, 0.05)
  z <- rnorm(n_blob, 10, 0.05)
  w <- -rnorm(n_blob, 10, 0.05)

  df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  df <- dplyr::bind_rows(df1, df2)

  # Append the noise dimensions. Skipped entirely when none are requested:
  # `1:0` would iterate over c(1, 0) and fail on the zero index.
  if (num_of_noise_dim > 0) {
    noise_names <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    n_rows <- NROW(df)
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
      noise <- runif(n_rows, min = min_noise, max = max_noise)
      # Even-numbered noise columns keep their sign, odd-numbered are negated
      noise_dim_val_list[[noise_names[j]]] <- if ((j %% 2) == 0) noise else -noise
    }

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df

}

## Dataset68

# Generate a square grid together with a copy shifted by 3 in both
# coordinates, plus optional uniform noise dimensions.
#
# Arguments:
#   sample_size      Total number of points; must be even, and half of it
#                    must be a perfect square (the side length of each grid).
#   with_seed        Optional seed passed to `set.seed()`; `NULL` leaves the
#                    RNG state untouched.
#   num_of_noise_dim Number of noise columns to append (0 appends none).
#   min_noise,
#   max_noise        Range of the uniform noise. Odd-numbered noise columns
#                    are sign-flipped.
#
# Returns a data frame with columns `x1`, `x2` followed by the noise columns.
# Stops when `sample_size` is odd or `sample_size / 2` is not a perfect
# square.
one_grid_diff <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 2,
                          min_noise = -0.05, max_noise = 0.05) {
  # Fix the RNG state only when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of two.")
  }

  # Side length of each of the two grids
  n_value <- sqrt(sample_size / 2)

  if ((n_value %% 1) != 0) {
    stop("The square root should exists.")
  }

  # NOTE(review): assumes snedata::grid_data() returns two coordinate columns
  # plus a `color` column -- confirm against snedata.
  df1 <- snedata::grid_data(n = n_value)
  df1 <- df1 |>
    dplyr::select(-color)

  names(df1) <- paste0("x", 1:2)

  # Second grid: the first one shifted by 3 in both coordinates
  df3 <- df1 + 3

  df1 <- dplyr::bind_rows(df1, df3)

  df <- df1

  # Append the noise dimensions, sized from the rows actually generated.
  # Skipped entirely when none are requested: `1:0` would iterate over
  # c(1, 0) and fail on the zero index.
  if (num_of_noise_dim > 0) {
    noise_names <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    n_rows <- NROW(df1)
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
      noise <- runif(n_rows, min = min_noise, max = max_noise)
      # Even-numbered noise columns keep their sign, odd-numbered are negated
      noise_dim_val_list[[noise_names[j]]] <- if ((j %% 2) == 0) noise else -noise
    }

    df <- dplyr::bind_cols(df1, tibble::as_tibble(noise_dim_val_list))
  }

  df

}

## Dataset69

# Generate two curved branches and one Gaussian cluster in four dimensions,
# plus optional uniform noise dimensions.
#
# Arguments:
#   sample_size      Total number of points.
#   cluster_size_vec NOTE(review): kept for interface compatibility, but the
#                    value supplied by the caller is always replaced by sizes
#                    derived from `sample_size` (see below).
#   with_seed        Optional seed passed to `set.seed()`; `NULL` leaves the
#                    RNG state untouched.
#   num_of_noise_dim Number of noise columns to append (0 appends none).
#   min_noise,
#   max_noise        Range of the uniform noise. Odd-numbered noise columns
#                    are sign-flipped.
#
# Cluster sizes: a fixed split is used for sample sizes 200, 500, 1000 and
# 1500; any other sample size must be divisible by three and is split evenly.
# The first and third sizes go to the two branches, the second to the
# Gaussian cluster.
#
# Returns a tibble with columns `x1`..`x4` followed by the noise columns.
curvy_branching_cluster <- function(sample_size = 200, cluster_size_vec = c(50, 100, 50), with_seed = NULL, num_of_noise_dim = 6,
                                    min_noise = -0.05, max_noise = 0.05) {
  # Fix the RNG state only when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if (sample_size == 200) {
    cluster_size_vec <- c(50, 100, 50)
  } else if (sample_size == 500) {
    cluster_size_vec <- c(100, 300, 100)
  } else if (sample_size == 1000) {
    cluster_size_vec <- c(200, 600, 200)
  } else if (sample_size == 1500) {
    cluster_size_vec <- c(250, 1000, 250)
  } else {
    # An even split needs the sample size to be divisible by three
    if ((sample_size %% 3) != 0) {
      stop("The sample size should be a product of three.")
    }
    cluster_size_vec <- rep(sample_size / 3, 3)
  }

  n_branch_a <- cluster_size_vec[1]
  n_blob <- cluster_size_vec[2]
  n_branch_b <- cluster_size_vec[3]

  # First branch: an arc repeated in (x1, x2) and (x3, x4), each coordinate
  # with its own noise centred on 1
  theta <- runif(n_branch_a, 0.20, 0.90 * pi)

  df1 <- tibble::tibble(
    x1 = cos(theta) + rnorm(n_branch_a, 1, 0.06),
    x2 = sin(theta) + rnorm(n_branch_a, 1, 0.06),
    x3 = cos(theta) + rnorm(n_branch_a, 1, 0.06),
    x4 = sin(theta) + rnorm(n_branch_a, 1, 0.06)
  )

  # Second branch: the same construction with the angle negated
  theta1 <- runif(n_branch_b, 0.20, 0.90 * pi)

  df2 <- tibble::tibble(
    x1 = cos(-theta1) + rnorm(n_branch_b, 1, 0.06),
    x2 = sin(-theta1) + rnorm(n_branch_b, 1, 0.06),
    x3 = cos(-theta1) + rnorm(n_branch_b, 1, 0.06),
    x4 = sin(-theta1) + rnorm(n_branch_b, 1, 0.06)
  )

  # Gaussian cluster centred on 1 in every dimension
  df3 <- tibble::tibble(
    x1 = rnorm(n_blob, mean = 1, sd = 0.08),
    x2 = rnorm(n_blob, mean = 1, sd = 0.08),
    x3 = rnorm(n_blob, mean = 1, sd = 0.08),
    x4 = rnorm(n_blob, mean = 1, sd = 0.08)
  )

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions. Skipped entirely when none are requested:
  # `1:0` would iterate over c(1, 0) and fail on the zero index.
  if (num_of_noise_dim > 0) {
    noise_names <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
      noise <- runif(sample_size, min = min_noise, max = max_noise)
      # Even-numbered noise columns keep their sign, odd-numbered are negated
      noise_dim_val_list[[noise_names[j]]] <- if ((j %% 2) == 0) noise else -noise
    }

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df

}

## Dataset70

# Generate a mix of Gaussian and non-Gaussian clusters.
#
# Arguments:
#   sample_size               Total number of points.
#   with_seed                 Optional seed passed to `set.seed()`; `NULL`
#                             leaves the RNG state untouched.
#   cluster_size_vec          NOTE(review): kept for interface compatibility,
#                             but the value supplied by the caller is always
#                             replaced by sizes derived from `sample_size`.
#   num_gussian_clusters      Number of Gaussian clusters (at least 1).
#   num_non_gaussian_clusters Number of non-Gaussian clusters (at least 1).
#   cluster_sd_gau            Standard deviation of the Gaussian clusters.
#   cluster_sd_non_gau        Standard deviation used in the dimensions of a
#                             non-Gaussian cluster that carry no curve.
#   num_dims                  Number of dimensions.
#   a, b                      Scale (`sqrt(a)`) and offset of the curved
#                             dimensions of the non-Gaussian clusters.
#
# Cluster sizes: a fixed six-entry split is used for sample sizes 400, 500,
# 1000 and 1500; any other sample size must be divisible by the number of
# clusters and is split evenly. The Gaussian clusters take the leading sizes
# and the non-Gaussian clusters the sizes that follow them.
# NOTE(review): the fixed splits have six entries, so with a different total
# number of clusters they do not line up with `sample_size`.
#
# Returns a tibble with columns `x1`..`x<num_dims>`.
clusters_with_different_shapes <- function(sample_size = 400, with_seed = NULL, cluster_size_vec = NULL, num_gussian_clusters = 4, num_non_gaussian_clusters = 2,
                                           cluster_sd_gau = 0.05, cluster_sd_non_gau = 0.1, num_dims = 7, a = 2, b = 4) {

  # Fix the RNG state only when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Both kinds of cluster are generated below, so both counts must be
  # positive. Checked first so the later size arithmetic never sees zero.
  if ((num_gussian_clusters == 0) || (num_non_gaussian_clusters == 0)) {
    stop("There should be at least one Gaussian or non-Gaussian cluster.")
  }

  num_clusters <- num_gussian_clusters + num_non_gaussian_clusters

  if (sample_size == 400) {
    cluster_size_vec <- c(50, 50, 50, 50, 100, 100)
  } else if (sample_size == 500) {
    cluster_size_vec <- c(100, 50, 50, 50, 100, 150)
  } else if (sample_size == 1000) {
    cluster_size_vec <- c(100, 100, 100, 100, 300, 300)
  } else if (sample_size == 1500) {
    cluster_size_vec <- c(250, 150, 150, 150, 350, 450)
  } else {
    # An even split needs the sample size to be divisible by the cluster count
    if ((sample_size %% num_clusters) != 0) {
      stop("The sample size should be a product of number of clusters.")
    }
    cluster_size_vec <- rep(sample_size / num_clusters, num_clusters)
  }

  # All 0/1 combinations over the dimensions; each cluster draws one row
  values <- c(0, 1)
  mean_val_grid <- tidyr::expand_grid(!!!stats::setNames(rep(list(values), num_dims),
                                                         paste0("mean_dim", seq_len(num_dims))))

  # Rows for the Gaussian clusters are used as per-dimension means; rows for
  # the non-Gaussian clusters flag which dimensions carry the curve
  mean_val_grid_gau <- mean_val_grid |>
    dplyr::slice_sample(n = num_gussian_clusters)

  mean_val_grid_non_gau <- mean_val_grid |>
    dplyr::slice_sample(n = num_non_gaussian_clusters)

  column_names <- paste0("x", seq_len(num_dims))

  # Generate each cluster separately and bind them once at the end
  cluster_list <- vector("list", num_clusters)

  ## Gaussian clusters
  for (i in seq_len(num_gussian_clusters)) {

    mean_val_for_cluster <- unlist(mean_val_grid_gau[i, ], use.names = FALSE)

    dim_val_list <- list()

    for (j in seq_len(num_dims)) {
      dim_val_list[[column_names[j]]] <- rnorm(cluster_size_vec[i],
                                               mean = mean_val_for_cluster[j],
                                               sd = cluster_sd_gau)
    }

    cluster_list[[i]] <- tibble::as_tibble(dim_val_list)

  }

  ## Non-Gaussian clusters
  for (i in seq_len(num_non_gaussian_clusters)) {

    # Sizes of the non-Gaussian clusters follow those of the Gaussian ones
    cluster_index <- num_gussian_clusters + i
    num_points_cluster <- cluster_size_vec[cluster_index]

    phi <- runif(num_points_cluster, max = 2 * pi)
    rho <- sqrt(runif(num_points_cluster))

    presence_of_elipse_cluster <- unlist(mean_val_grid_non_gau[i, ], use.names = FALSE)

    dim_val_list_n <- list()

    for (j in seq_len(num_dims)) {
      if (presence_of_elipse_cluster[j] == 1) {
        # Flagged dimension: value derived from the shared (rho, phi) draw
        dim_val_list_n[[column_names[j]]] <- sqrt(a) * rho * cos(phi) + b
      } else {
        # Unflagged dimension: Gaussian noise around zero
        dim_val_list_n[[column_names[j]]] <- rnorm(num_points_cluster, mean = 0,
                                                   sd = cluster_sd_non_gau)
      }
    }

    cluster_list[[cluster_index]] <- tibble::as_tibble(dim_val_list_n)

  }

  dplyr::bind_rows(cluster_list)

}

Try the cardinalR package in your browser

Any scripts or data that you put into this service are public.

cardinalR documentation built on May 29, 2024, 4:37 a.m.