Clustering datasets

# Chunk options: collapse source and output, prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

library(cardinalR)
library(langevitour)
library(dplyr)

# Five 4-D Gaussian clusters: four centred on the unit axes, one at the origin.
# NOTE(review): this call passes mean_matrix / var_vec / num_noise_dims, which
# the gaussian_clusters() defined later in this file does not accept, so it
# presumably resolves to the cardinalR version -- confirm.
gau_data <- gaussian_clusters(
  sample_size = 300,
  num_clusters = 5,
  mean_matrix = rbind(
    c(1, 0, 0, 0),
    c(0, 1, 0, 0),
    c(0, 0, 1, 0),
    c(0, 0, 0, 1),
    c(0, 0, 0, 0)
  ),
  var_vec = rep(0.05, 5),
  num_dims = 4,
  num_noise_dims = 0,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(gau_data)
## Generate Gaussian clusters with the same number of points and the same
## standard deviation in every cluster, in an arbitrary number of dimensions.
##
## sample_size:  Total number of points requested. When it is not a multiple of
##               num_clusters a warning is raised and every cluster receives
##               floor(sample_size / num_clusters) points.
## with_seed:    Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_clusters: Number of clusters; at most 2^num_dims because the cluster
##               centres are distinct 0/1 corner combinations.
## cluster_sd:   Standard deviation of a cluster (used for every dimension).
## num_dims:     Number of dimensions; columns are named x1, ..., x<num_dims>.
##
## output: df: tibble with num_clusters * cluster_size rows.

gaussian_clusters <- function(sample_size = 300, with_seed = NULL, num_clusters = 3,
    cluster_sd = 0.05, num_dims = 3) {

    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    # Only 2^num_dims distinct 0/1 centres exist; without this check the
    # surplus clusters would silently be filled with NaN.
    if (num_clusters > 2^num_dims) {
        stop("num_clusters cannot exceed 2^num_dims.")
    }

    # Warn when the points cannot be shared equally between the clusters
    if ((sample_size %% num_clusters) != 0) {
        warning("The sample size should be a product of number of clusters.")
    }
    cluster_size <- floor(sample_size / num_clusters)

    # All 0/1 combinations across the dimensions, then one random combination
    # per cluster to serve as that cluster's mean vector
    mean_val_grid <- tidyr::expand_grid(!!!setNames(rep(list(c(0, 1)), num_dims),
        paste0("mean_dim", seq_len(num_dims))))
    mean_val_grid <- dplyr::slice_sample(mean_val_grid, n = num_clusters)

    column_names <- paste0("x", seq_len(num_dims))

    # One tibble per cluster; bound together once at the end instead of growing
    # a tibble inside the loop
    cluster_list <- lapply(seq_len(num_clusters), function(i) {
        mean_val_for_cluster <- unlist(mean_val_grid[i, ], use.names = FALSE)

        dim_val_list <- lapply(seq_len(num_dims), function(j) {
            rnorm(cluster_size, mean = mean_val_for_cluster[j], sd = cluster_sd)
        })

        tibble::as_tibble(setNames(dim_val_list, column_names))
    })

    dplyr::bind_rows(cluster_list)

}

Dataset 1 (2D plane)

## Generate points on a 2-D plane and pad them with uniform noise dimensions.
##
## With u ~ U(u_min, u_max) and v ~ U(v_min, v_max):
##   x1 = coefficient_x_1 * u + coefficient_x_2 * v + intercept_x
##   x2 = coefficient_y_1 * u + coefficient_y_2 * v + intercept_y
##
## sample_size:      Number of points.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended as x3, x4, ... (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with sample_size rows and 2 + num_of_noise_dim columns.
plane_2D <- function(sample_size = 100, with_seed = NULL, coefficient_x_1 = 1,
    coefficient_x_2 = 1, coefficient_y_1 = -1, coefficient_y_2 = 1, intercept_x = -10,
    intercept_y = 8, u_min = 10, u_max = 30, v_min = 10, v_max = 20,
    num_of_noise_dim = 2, min_noise = 0, max_noise = 1) {

    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    u <- runif(sample_size, min = u_min, max = u_max)
    v <- runif(sample_size, min = v_min, max = v_max)
    x <- coefficient_x_1 * u + coefficient_x_2 * v + intercept_x
    y <- coefficient_y_1 * u + coefficient_y_2 * v + intercept_y

    df <- tibble::tibble(x1 = x, x2 = y)

    # Noise columns continue the x<k> numbering after the existing columns.
    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0).
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(sample_size, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 1: 300 seeded points from plane_2D() with its default noise settings.
dataset1 <- plane_2D(
  sample_size = 300,
  with_seed = 2023062801
)

langevitour(dataset1)

Dataset 2 (2D curvilinear)

## Generate a noisy 2-D curvilinear (cubic) pattern padded with uniform noise
## dimensions.
##
##   x1 ~ U(0, 2)
##   x2 = -(x1^3 + U(0, 3)) + U(0, 0.5)
##
## sample_size:      Number of points.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended as x3, x4, ... (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with sample_size rows and 2 + num_of_noise_dim columns.
curvilinear_2D <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 2, min_noise = -1, max_noise = 1){
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    x <- runif(sample_size, 0, 2)
    y <- -(x^3 + runif(sample_size, 0, 3)) + runif(sample_size, 0, 0.5)

    df <- tibble::tibble(x1 = x, x2 = y)

    # Noise columns continue the x<k> numbering after the existing columns.
    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0).
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(sample_size, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 2: 300 seeded points from curvilinear_2D() with its default noise
# settings.
dataset2 <- curvilinear_2D(
  sample_size = 300,
  with_seed = 2023062801
)

langevitour(dataset2)

Dataset 3 (5 Gaussian clusters)

# Dataset 3: five seeded Gaussian clusters in four dimensions, 250 points in
# total.
dataset3 <- gaussian_clusters(
  sample_size = 250,
  with_seed = 2023062801,
  num_clusters = 5,
  cluster_sd = 0.05,
  num_dims = 4
)

langevitour(dataset3)

Dataset 4 (3D Cube) (remove)

## Solid cube grid with extra uniform noise dimensions.
##
## The cube points come from geozoo::cube.solid.grid(p = num_of_effective_dims,
## n = 11); the number of rows is therefore decided by geozoo, not by the caller.
##
## with_seed:             Optional seed handed to set.seed(); NULL leaves the
##                        RNG as is.
## num_of_effective_dims: Dimension of the cube (columns x1, ..., x<p>).
## num_of_noise_dim:      Number of noise columns appended after the cube
##                        columns (0 allowed).
## min_noise, max_noise:  Range of the uniform noise; odd-numbered noise
##                        dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: list(df = data with cube + noise columns,
##              sample_size = number of generated cube points).
cube_3D_with_noise <- function(with_seed = NULL, num_of_effective_dims = 3, num_of_noise_dim = 2,
    min_noise = -0.1, max_noise = 0.1) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    cube <- geozoo::cube.solid.grid(p = num_of_effective_dims, n = 11)
    df <- tibble::as_tibble(cube$points, .name_repair = "unique")
    names(df) <- paste0("x", seq_len(num_of_effective_dims))

    sample_size <- NROW(df)

    # Noise columns continue the numbering after the cube columns, so the names
    # stay unique when num_of_effective_dims is not 3.
    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0).
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(sample_size, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }

    return(list(df = df, sample_size = sample_size))

}
# Dataset 4: build the cube once and reuse the result for both the data and
# the reported number of points (avoids generating everything a second time).
cube_result <- cube_3D_with_noise()
dataset4 <- cube_result$df

cube_result$sample_size

langevitour(dataset4)

Dataset 5 (Non-linear) (nonlinear)

## Generate a noisy 2-D arc padded with uniform noise dimensions.
##
##   theta ~ U(0.2, 0.6 * pi)
##   x1 = cos(theta) + N(10, 0.03)
##   x2 = sin(theta) + N(10, 0.03)
##
## sample_size:      Number of points.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended as x3, x4, ... (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with sample_size rows and 2 + num_of_noise_dim columns.
nonlinear_2D <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 2,
    min_noise = -1, max_noise = 1) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    theta <- runif(sample_size, 0.2, 0.6 * pi)
    x <- cos(theta) + rnorm(sample_size, 10, 0.03)
    y <- sin(theta) + rnorm(sample_size, 10, 0.03)

    df <- tibble::tibble(x1 = x, x2 = y)

    # Noise columns continue the x<k> numbering after the existing columns.
    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0).
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(sample_size, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 5: 250 unseeded points from nonlinear_2D() with five noise
# dimensions drawn from U(-0.1, 0.2).
dataset5 <- nonlinear_2D(
  sample_size = 250,
  num_of_noise_dim = 5,
  min_noise = -0.1,
  max_noise = 0.2
)

langevitour(dataset5)

Dataset 6 (Different shape clusters) (clustering)

## Generate a mixture of Gaussian and non-Gaussian clusters.
##
## sample_size:               Total number of points requested. When it is not
##                            a multiple of the total number of clusters a
##                            warning is raised and every cluster receives
##                            floor(sample_size / num_clusters) points.
## with_seed:                 Optional seed handed to set.seed(); NULL leaves
##                            the RNG as is.
## num_gussian_clusters:      Number of Gaussian clusters (0 allowed).
## num_non_gaussian_clusters: Number of non-Gaussian clusters (0 allowed).
## cluster_sd_gau:            Standard deviation of the Gaussian clusters.
## cluster_sd_non_gau:        Standard deviation of the Gaussian filler used in
##                            the "inactive" dimensions of a non-Gaussian cluster.
## num_dims:                  Number of dimensions (columns x1, ..., x<num_dims>).
## a, b:                      Scale and shift of the non-Gaussian coordinate,
##                            sqrt(a) * rho * cos(phi) + b.
##
## output: df: tibble, Gaussian clusters first, then the non-Gaussian ones.

clusters_different_shapes <- function(sample_size = 300, with_seed = NULL, num_gussian_clusters = 4, num_non_gaussian_clusters = 2,
    cluster_sd_gau = 0.05, cluster_sd_non_gau = 0.1, num_dims = 7, a = 2, b = 4) {

    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    num_clusters <- num_gussian_clusters + num_non_gaussian_clusters

    # Warn when the points cannot be shared equally between the clusters
    if ((sample_size %% num_clusters) != 0) {
        warning("The sample size should be a product of number of clusters.")
    }
    cluster_size <- floor(sample_size / num_clusters)

    # All 0/1 combinations across the dimensions
    mean_val_grid <- tidyr::expand_grid(!!!setNames(rep(list(c(0, 1)), num_dims),
        paste0("mean_dim", seq_len(num_dims))))

    # Random combinations: cluster means for the Gaussian clusters, and
    # per-dimension "active" flags for the non-Gaussian clusters
    mean_val_grid_gau <- dplyr::slice_sample(mean_val_grid, n = num_gussian_clusters)
    mean_val_grid_non_gau <- dplyr::slice_sample(mean_val_grid, n = num_non_gaussian_clusters)

    column_names <- paste0("x", seq_len(num_dims))

    ## Gaussian clusters
    gau_clusters <- lapply(seq_len(num_gussian_clusters), function(i) {
        mean_val_for_cluster <- unlist(mean_val_grid_gau[i, ], use.names = FALSE)

        dim_val_list <- lapply(seq_len(num_dims), function(j) {
            rnorm(cluster_size, mean = mean_val_for_cluster[j], sd = cluster_sd_gau)
        })

        tibble::as_tibble(setNames(dim_val_list, column_names))
    })

    # Polar coordinates, drawn once and shared by every non-Gaussian cluster
    # and every active dimension
    phi <- runif(cluster_size, max = 2 * pi)
    rho <- sqrt(runif(cluster_size))

    ## Non-Gaussian clusters
    non_gau_clusters <- lapply(seq_len(num_non_gaussian_clusters), function(i) {
        presence_of_elipse_cluster <- unlist(mean_val_grid_non_gau[i, ], use.names = FALSE)

        dim_val_list_n <- lapply(seq_len(num_dims), function(j) {
            if (presence_of_elipse_cluster[j] == 1) {
                # Active dimension: polar-coordinate based value
                sqrt(a) * rho * cos(phi) + b
            } else {
                # Inactive dimension: Gaussian filler around zero
                rnorm(cluster_size, mean = 0, sd = cluster_sd_non_gau)
            }
        })

        tibble::as_tibble(setNames(dim_val_list_n, column_names))
    })

    dplyr::bind_rows(gau_clusters, non_gau_clusters)

}
# Dataset 6: clusters of different shapes, all arguments left at their defaults
# (unseeded).
dataset6 <- clusters_different_shapes()

langevitour(dataset6)

Dataset 7 (Sine curve with noise) (nonlinear)

## Generate a sine curve padded with uniform noise dimensions.
##
##   x1 = theta ~ U(0, 1.8 * pi)
##   x2 = sin(theta)
##
## sample_size:      Number of points.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended as x3, x4, ... (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with sample_size rows and 2 + num_of_noise_dim columns.
sine_curve_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    theta <- runif(sample_size, 0, 1.80 * pi)

    df <- tibble::tibble(x1 = theta, x2 = sin(theta))

    # Noise columns continue the x<k> numbering after the existing columns.
    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0).
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(sample_size, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 7: sine curve with noise, all arguments left at their defaults
# (unseeded).
dataset7 <- sine_curve_with_noise()

langevitour(dataset7)

Dataset 9 (three clusters data with extra noise dimensions) (add)

## Three-cluster data from snedata::three_clusters_data() padded with uniform
## noise dimensions.
##
## sample_size:      Total number of points requested. When it is not a
##                   multiple of three a warning is raised and
##                   floor(sample_size / 3) is passed to snedata as the number
##                   of points per cluster.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_dims:         Dimension passed to snedata::three_clusters_data().
## num_of_noise_dim: Number of noise columns appended after the data columns
##                   (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: data frame with columns x1, x2, ... (data first, then noise).
three_clusters_data_with_noise <- function(sample_size = 100, with_seed = NULL, num_dims = 7, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    # Warn when the points cannot be shared equally between the three clusters
    if ((sample_size %% 3) != 0) {
        warning("The sample size should be a product of three.")
    }
    cluster_size <- floor(sample_size / 3)

    df <- snedata::three_clusters_data(n = cluster_size, dim = num_dims) ## n = number of points per Gaussian
    df <- dplyr::select(df, -color)
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data rather than from sample_size, so
    # the columns still line up when sample_size was rounded down.
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 9: 300 unseeded points of three-cluster data plus three noise
# dimensions.
dataset9 <- three_clusters_data_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3
)

langevitour(dataset9)

Dataset 10 (Torus with extra noise dimensions) (remove)

# Torus with extra noise dimensions.
# The torus points come from geozoo::torus(p = 3, n = sample_size).
#
# sample_size:      number of points requested from geozoo::torus()
# with_seed:        optional seed handed to set.seed(); NULL leaves the RNG as is
# num_of_noise_dim: number of noise columns appended as x4, x5, ... (0 allowed)
# min_noise, max_noise: range of the uniform noise; odd-numbered noise
#                   dimensions (1st, 3rd, ...) are multiplied by -1
# Return
# tibble with the torus columns x1..x3 followed by the noise columns
torus_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    torus <- geozoo::torus(p = 3, n = sample_size)
    df <- tibble::as_tibble(torus$points, .name_repair = "unique")
    names(df) <- paste0("x", 1:3)

    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 10: torus requested with 300 points plus four noise dimensions
# (unseeded).
dataset10 <- torus_with_noise(
  sample_size = 300,
  num_of_noise_dim = 4
)

langevitour(dataset10)

Dataset 11 (Spiral with extra noise dimensions) (remove)

## Spiral simulation from mgc::mgc.sims.spiral() padded with uniform noise
## dimensions.
##
## sample_size:      Number of samples requested (passed as n).
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_dims:         Number of spiral dimensions (passed as d).
## num_of_noise_dim: Number of noise columns appended after the spiral columns
##                   (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with columns x1, x2, ... (spiral X first, then noise).
spiral_with_noise <- function(sample_size = 100, with_seed = NULL, num_dims = 10, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    # Simulate sample_size samples in num_dims dimensions; only X is kept
    result <- mgc::mgc.sims.spiral(n = sample_size, d = num_dims)

    df <- tibble::as_tibble(result$X, .name_repair = "unique")
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 11: spiral with 300 samples (default spiral dimensions) plus four
# noise dimensions (unseeded).
dataset11 <- spiral_with_noise(
  sample_size = 300,
  num_of_noise_dim = 4
)

langevitour(dataset11)

Dataset 12 (Roman surface with extra noise dimensions) (remove)

## Roman surface from geozoo::roman.surface(n = sample_size, a = 1) padded
## with uniform noise dimensions.
##
## sample_size:      Number of points requested from geozoo.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended after the surface
##                   columns (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with columns x1, x2, ... (surface first, then noise).
roman_surface_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    roman <- geozoo::roman.surface(n = sample_size, a = 1)
    df <- tibble::as_tibble(roman$points, .name_repair = "unique")
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 12: Roman surface requested with 300 points plus three noise
# dimensions in [-0.05, 0.05] (unseeded).
dataset12 <- roman_surface_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset12)

Dataset 13 (Mobius strip with extra noise dimensions) (add)

# Mobius experiment with extra noise dimensions.
# The points come from geozoo::mobius.experiment(p = 5, n = sample_size).
#
# sample_size:      number of points requested from geozoo
# with_seed:        optional seed handed to set.seed(); NULL leaves the RNG as is
# num_of_noise_dim: number of noise columns appended after the Mobius columns
#                   (0 allowed)
# min_noise, max_noise: range of the uniform noise; odd-numbered noise
#                   dimensions (1st, 3rd, ...) are multiplied by -1
#
# Returns a tibble with columns x1, x2, ... (Mobius points first, then noise).

mobius_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    mobius <- geozoo::mobius.experiment(p = 5, n = sample_size)
    df <- tibble::as_tibble(mobius$points, .name_repair = "unique")
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 13: Mobius data requested with 300 points plus three noise
# dimensions (unseeded).
dataset13 <- mobius_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3
)

langevitour(dataset13)

Dataset 14 (Dini surface with extra noise dimensions) (remove)

## Dini surface from geozoo::dini.surface(n = sample_size, a = 1, b = 1)
## padded with uniform noise dimensions.
##
## sample_size:      Number of points requested from geozoo.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended after the surface
##                   columns (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with columns x1, x2, ... (surface first, then noise).
dini_surface_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    dini <- geozoo::dini.surface(n = sample_size, a = 1, b = 1)
    df <- tibble::as_tibble(dini$points, .name_repair = "unique")
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 14: Dini surface requested with 300 points plus three noise
# dimensions (unseeded).
dataset14 <- dini_surface_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3
)

langevitour(dataset14)

Dataset 15 (conic spiral with extra noise dimensions) (remove)

## Conic spiral from geozoo::conic.spiral(n = sample_size, a = .2, b = 1,
## c = .1, w = 2) padded with uniform noise dimensions.
##
## sample_size:      Number of points requested from geozoo.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended after the spiral
##                   columns (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with columns x1, x2, ... (spiral first, then noise).
conic_spiral_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    conic_spiral <- geozoo::conic.spiral(n = sample_size, a = .2, b = 1, c = .1, w = 2)
    df <- tibble::as_tibble(conic_spiral$points, .name_repair = "unique")
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data so the columns always line up
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 15: conic spiral requested with 300 points plus three noise
# dimensions in [-0.05, 0.05] (unseeded).
dataset15 <- conic_spiral_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset15)

Dataset 16 (S curve with a hole and extra noise dimensions) (add)

## S-curve with a hole from snedata::s_curve_hole(n_samples = sample_size,
## noise = 0) padded with uniform noise dimensions.
##
## sample_size:      Number of samples requested from snedata. The number of
##                   rows actually returned is reported in the result
##                   (presumably smaller, because points are removed to create
##                   the hole -- confirm against snedata).
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended after the data columns
##                   (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: list(df = data with S-curve + noise columns,
##              sample_size = number of rows in df).
s_curve_data_hole_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    df <- snedata::s_curve_hole(n_samples = sample_size, noise = 0) ## Should add more data because remove to create the hole
    df <- dplyr::select(df, -color)
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # The noise must match the rows that were actually generated
    sample_size_n <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(sample_size_n, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }

    return(list(df = df, sample_size = sample_size_n))

}
# Dataset 16: generate the S-curve once and reuse the result, so the reported
# sample size is the size of dataset16 itself rather than of a second,
# independently generated (unseeded) data set.
s_curve_result <- s_curve_data_hole_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)
dataset16 <- s_curve_result$df

s_curve_result$sample_size

langevitour(dataset16)

Dataset 17 (add)

## Two long clusters from snedata::long_cluster_data() padded with uniform
## noise dimensions. The first colour group is shifted by -20 in both x and y
## before the groups are recombined.
##
## sample_size:      Total number of points requested. When it is odd a
##                   warning is raised and floor(sample_size / 2) is passed to
##                   snedata as n.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended after the data columns
##                   (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with columns x1, x2, ... (cluster data first, then noise).
long_cluster_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    # Warn when the points cannot be shared equally between the two clusters
    if ((sample_size %% 2) != 0) {
        warning("The sample size should be a product of two.")
    }
    cluster_size <- floor(sample_size / 2)

    # Split the generated data by its colour label
    df_2_split <- dplyr::group_split(
        dplyr::group_by(snedata::long_cluster_data(n = cluster_size), color)
    )

    # Move the first group away from the second one
    df_2_split_1 <- df_2_split[[1]]
    df_2_split_1$x <- df_2_split_1$x - 20
    df_2_split_1$y <- df_2_split_1$y - 20

    df <- dplyr::select(dplyr::bind_rows(df_2_split_1, df_2_split[[2]]), -color)
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data rather than from sample_size, so
    # the columns still line up when sample_size was rounded down.
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 17: long clusters with 300 points requested plus three noise
# dimensions in [-0.05, 0.05] (unseeded).
dataset17 <- long_cluster_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)


langevitour(dataset17)

Dataset 18 (nonlinear)

## Four connected noisy arcs in 4-D padded with uniform noise dimensions.
##
## A single theta ~ U(0, 0.8 * pi) sample is shared by all four arcs:
##   arc 1: (x1, x2) = (cos(theta),  sin(theta))  + N(10, 0.03)
##   arc 2: (x1, x2) = (cos(-theta), sin(-theta)) + N(10, 0.03) + 0.1
##   arc 3: (x1, x3) = (cos(-theta), sin(-theta)) + N(10, 0.03) + 0.1
##   arc 4: (x1, x3) = (cos(theta),  sin(theta))  + N(10, 0.03) + 0.1
## The remaining coordinate of x2/x3 is N(10, 0.03) and x4 is -N(10, 0.03).
##
## sample_size:      Total number of points requested. When it is not a
##                   multiple of four a warning is raised and every arc
##                   receives floor(sample_size / 4) points.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended as x5, x6, ... (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with 4 * cluster_size rows and 4 + num_of_noise_dim columns.
nonlinear_connect_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    # Warn when the points cannot be shared equally between the four arcs
    if ((sample_size %% 4) != 0) {
        warning("The sample size should be a product of four.")
    }
    cluster_size <- floor(sample_size / 4)

    theta <- runif(cluster_size, 0, 0.80 * pi)

    # Arc 1: in the x1-x2 plane
    x <- cos(theta) + rnorm(cluster_size, 10, 0.03)
    y <- sin(theta) + rnorm(cluster_size, 10, 0.03)
    z <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    # Arc 2: mirrored arc in the x1-x2 plane; rnorm(., 0.1, 0) has sd = 0 and
    # is therefore a constant shift of 0.1
    x <- cos(-theta) + rnorm(cluster_size, 10, 0.03) + rnorm(cluster_size, 0.1, 0)
    y <- sin(-theta) + rnorm(cluster_size, 10, 0.03) + rnorm(cluster_size, 0.1, 0)
    z <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    # Arc 3: mirrored arc in the x1-x3 plane
    x <- cos(-theta) + rnorm(cluster_size, 10, 0.03) + rnorm(cluster_size, 0.1, 0)
    z <- sin(-theta) + rnorm(cluster_size, 10, 0.03) + rnorm(cluster_size, 0.1, 0)
    y <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    # Arc 4: arc in the x1-x3 plane
    x <- cos(theta) + rnorm(cluster_size, 10, 0.03) + rnorm(cluster_size, 0.1, 0)
    z <- sin(theta) + rnorm(cluster_size, 10, 0.03) + rnorm(cluster_size, 0.1, 0)
    y <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df4 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    df <- dplyr::bind_rows(df1, df2, df3, df4)
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data rather than from sample_size, so
    # the columns still line up when sample_size was rounded down.
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 18: four connected arcs, 400 points in total, plus three noise
# dimensions in [-0.05, 0.05] (unseeded).
dataset18 <- nonlinear_connect_with_noise(
  sample_size = 400,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)


langevitour(dataset18)

Dataset 19 (nonlinear)

## Two mirrored noisy exponential curves in 4-D padded with uniform noise
## dimensions.
##
## For each curve x1 ~ U(-8, 1.5) and
##   curve 1: x2 = -(exp(x1) + U(0, 1)) + U(0, 0.7)
##   curve 2: x2 =  (exp(x1) + U(0, 1)) + U(0, 0.7)
## with x3 = N(10, 0.03) and x4 = -N(10, 0.03).
##
## sample_size:      Total number of points requested. When it is odd a
##                   warning is raised and every curve receives
##                   floor(sample_size / 2) points.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended as x5, x6, ... (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with 2 * cluster_size rows and 4 + num_of_noise_dim columns.
nonlinear_mirror_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    # Warn when the points cannot be shared equally between the two curves
    if ((sample_size %% 2) != 0) {
        warning("The sample size should be a product of two.")
    }
    cluster_size <- floor(sample_size / 2)

    # Curve 1: below the x1 axis
    x <- runif(cluster_size, -8, 1.5)
    y <- -(exp(x) + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.7)
    z <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    # Curve 2: the same construction without the sign flip
    x <- runif(cluster_size, -8, 1.5)
    y <- (exp(x) + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.7)
    z <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    df <- dplyr::bind_rows(df1, df2)
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data rather than from sample_size, so
    # the columns still line up when sample_size was rounded down.
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Dataset 19: two mirrored curves, 400 points in total, plus three noise
# dimensions in [-0.05, 0.05] (unseeded).
dataset19 <- nonlinear_mirror_with_noise(
  sample_size = 400,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)


langevitour(dataset19)

Dataset 20 (add)

## Three concentric structures in 4-D padded with uniform noise dimensions.
##
## A single theta ~ U(0, 2 * pi) sample is shared by the two circles:
##   circle 1: (x1, x2) = (cos(theta), sin(theta)) + N(10, 0.03)
##   circle 2: (x1, x2) = 0.5 * (cos(theta), sin(theta)) + N(10, 0.03)
##   blob:     (x1, x2) = N(10, 0.03)
## with x3 = N(10, 0.03) and x4 = -N(10, 0.03) in every group.
##
## sample_size:      Total number of points requested. When it is not a
##                   multiple of three a warning is raised and every group
##                   receives floor(sample_size / 3) points.
## with_seed:        Optional seed handed to set.seed(); NULL leaves the RNG as is.
## num_of_noise_dim: Number of noise columns appended as x5, x6, ... (0 allowed).
## min_noise, max_noise: Range of the uniform noise; odd-numbered noise
##                   dimensions (1st, 3rd, ...) are multiplied by -1.
##
## output: tibble with 3 * cluster_size rows and 4 + num_of_noise_dim columns.
three_circulars_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
    # Seed the RNG only when a seed was supplied
    if (!is.null(with_seed)) {
        set.seed(with_seed)
    }

    # Warn when the points cannot be shared equally between the three groups
    if ((sample_size %% 3) != 0) {
        warning("The sample size should be a product of three.")
    }
    cluster_size <- floor(sample_size / 3)

    theta <- runif(cluster_size, 0.0, 2 * pi)

    # Circle with radius 1
    x <- cos(theta) + rnorm(cluster_size, 10, 0.03)
    y <- sin(theta) + rnorm(cluster_size, 10, 0.03)
    z <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    # Circle with radius 0.5
    x <- 0.5 * cos(theta) + rnorm(cluster_size, 10, 0.03)
    y <- 0.5 * sin(theta) + rnorm(cluster_size, 10, 0.03)
    z <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    # Gaussian blob at the common centre
    x <- rnorm(cluster_size, 10, 0.03)
    y <- rnorm(cluster_size, 10, 0.03)
    z <- rep(0, cluster_size) + rnorm(cluster_size, 10, 0.03)
    w <- rep(0, cluster_size) - rnorm(cluster_size, 10, 0.03)
    df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

    df <- dplyr::bind_rows(df1, df2, df3)
    names(df) <- paste0("x", seq_len(NCOL(df)))

    # Size the noise from the generated data rather than from sample_size, so
    # the columns still line up when sample_size was rounded down.
    n_rows <- NROW(df)
    first_dims <- NCOL(df)
    noise_dim_val_list <- list()

    # seq_len() makes num_of_noise_dim = 0 a no-op instead of looping over c(1, 0)
    for (j in seq_len(num_of_noise_dim)) {
        noise_vals <- runif(n_rows, min = min_noise, max = max_noise)
        # Odd-numbered noise dimensions are sign-flipped
        if ((j %% 2) != 0) {
            noise_vals <- (-1) * noise_vals
        }
        noise_dim_val_list[[paste0("x", first_dims + j)]] <- noise_vals
    }

    if (length(noise_dim_val_list) > 0) {
        df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
    }
    df

}
# Build dataset 20 and pass it to langevitour().
dataset20 <- three_circulars_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset20)

Dataset 21 (clustering)

## To generate a curvilinear arc, a Gaussian cluster and a diffuse background cluster
## in 4-D, followed by uniform noise dimensions
## sample_size: Total number of points requested; must be even. 30% go to the background
##              cluster and the rest is split equally between the arc and the cluster.
##              When those shares are not whole numbers the result has fewer rows.
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
cluster_and_curvilinear__with_noise_and_bkg_noise <- function(sample_size = 260, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Odd sample sizes are rejected outright
  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of 2.")
  }
  bkg_size <- sample_size * 0.3
  cluster_size <- (sample_size - bkg_size) / 2

  # Arc: part of a unit circle centred near (10, 10) in x1-x2
  theta <- runif(cluster_size, 0.20, 0.60 * pi)
  x <- cos(theta) + rnorm(cluster_size, 10, 0.03)
  y <- sin(theta) + rnorm(cluster_size, 10, 0.03)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Tight Gaussian cluster
  x <- rnorm(cluster_size, 10, 0.05)
  y <- rnorm(cluster_size, 10, 0.05)
  z <- rnorm(cluster_size, 10, 0.05)
  w <- -rnorm(cluster_size, 10, 0.05)
  df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Background cluster: wider spread in x1-x2, centred at (11, 11)
  x <- rnorm(bkg_size, 11, 0.5)
  y <- rnorm(bkg_size, 11, 0.5)
  z <- rnorm(bkg_size, 10, 0.05)
  w <- -rnorm(bkg_size, 10, 0.05)
  df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  df <- dplyr::bind_rows(df1, df2, df3)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when the group sizes were truncated, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 21 and pass it to langevitour().
dataset21 <- cluster_and_curvilinear__with_noise_and_bkg_noise(
  sample_size = 260,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset21)

Dataset 22 (add)

## To generate the snedata::link_data() structure followed by uniform noise dimensions
## sample_size: Total number of points requested; half of it is passed to link_data() as n
##              (presumably n is the number of points per ring -- verify against snedata)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after the link_data() columns (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: the link_data() result without its color column, renamed x1, x2, ..., plus noise columns
link_data_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Half of the sample is requested from link_data(); a leftover point is dropped
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 2)

  df <- snedata::link_data(n = cluster_size)
  df <- dplyr::select(df, -color)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns line up with whatever link_data() returned, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 22 and pass it to langevitour().
dataset22 <- link_data_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset22)

Dataset 23 (add)

## To generate the snedata::swiss_roll() structure followed by uniform noise dimensions
## sample_size: Number of points passed to swiss_roll() as n
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after the swiss_roll() columns (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: the swiss_roll() result without its color column, renamed x1, x2, ..., plus noise columns
swiss_roll_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  df <- snedata::swiss_roll(n = sample_size)
  df <- dplyr::select(df, -color)
  names(df) <- paste0("x", seq_len(NCOL(df)))

  # Append the noise dimensions. Rows are counted from df so the columns
  # line up with whatever swiss_roll() returned, and seq_len() makes
  # num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 23 and pass it to langevitour().
dataset23 <- swiss_roll_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset23)

Dataset 24 (branching)

## To generate a tree-like structure made of three cubic branches in x1-x2,
## with x3 and x4 drawn around 10, followed by uniform noise dimensions
## sample_size: Total number of points in the data set (a multiple of three fills it exactly)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
curvy_tree_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The three branches share the sample equally; leftovers are dropped
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 3)

  # Branch 1: negated cubic over [-2, 2]
  x <- runif(cluster_size, -2, 2)
  y <- -(x^3 + runif(cluster_size, 0, 6)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.1)
  w <- rnorm(cluster_size, 10, 0.1)
  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 2: cubic over [0, 2]
  x <- runif(cluster_size, 0, 2)
  y <- (x^3 + runif(cluster_size, 0, 6)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.1)
  w <- rnorm(cluster_size, 10, 0.1)
  df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 3: negated cubic over [-2, 0], shifted up by 10
  x <- runif(cluster_size, -2, 0)
  y <- -(x^3 + runif(cluster_size, 0, 6)) + runif(cluster_size, 0, 0.2) + 10
  z <- rnorm(cluster_size, 10, 0.1)
  w <- rnorm(cluster_size, 10, 0.1)
  df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 24 and pass it to langevitour().
dataset24 <- curvy_tree_with_noise(
  sample_size = 300,
  num_of_noise_dim = 1,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset24)

Dataset 25 (branching)

## To generate a tree-like structure made of five piecewise-linear branches in x1-x2,
## with x3 and x4 drawn around 10, followed by uniform noise dimensions
## sample_size: Total number of points in the data set (a multiple of five fills it exactly)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
tree_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The five branches share the sample equally; leftovers are dropped
  if ((sample_size %% 5) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 5)

  # Branch 1: shallow "V" with its tip at the origin
  x <- runif(cluster_size, -3, 3)
  y <- abs(0.5 * x)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- rnorm(cluster_size, 10, 0.03)
  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 2: steep "V" with its tip at the origin
  x <- runif(cluster_size, -0.5, 0.5)
  y <- abs(10 * x)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- rnorm(cluster_size, 10, 0.03)
  df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 3: line -|0.5 * x + 5| over [-6, 3]
  x <- runif(cluster_size, -6, 3)
  y <- (-1) * abs(0.5 * x + 5)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- rnorm(cluster_size, 10, 0.03)
  df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 4: steep inverted "V" with its tip at (0, -5)
  x <- runif(cluster_size, -0.5, 0.5)
  y <- (-1) * abs(10 * x) - 5
  z <- rnorm(cluster_size, 10, 0.03)
  w <- rnorm(cluster_size, 10, 0.03)
  df4 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 5: the diagonal x2 = x1
  x <- runif(cluster_size, -5, 5)
  y <- x
  z <- rnorm(cluster_size, 10, 0.03)
  w <- rnorm(cluster_size, 10, 0.03)
  df5 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 25 and pass it to langevitour().
dataset25 <- tree_with_noise(
  sample_size = 500,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset25)

Dataset 26 (add)

## To generate three ellipses (semi-axes 2 and 1), one in each coordinate plane
## of (x1, x2, x3), followed by uniform noise dimensions
## sample_size: Total number of points in the data set (a multiple of three fills it exactly)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x3 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
cell_cycle_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The three ellipses share the sample equally; leftovers are dropped
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 3)

  r1 <- 2
  r2 <- 1

  # One set of angles is reused for all three ellipses
  theta <- runif(cluster_size, 0, 2 * pi)
  zeros <- rep(0, cluster_size)

  # Ellipse in the x2-x3 plane
  df1 <- tibble::tibble(x1 = zeros, x2 = r1 * cos(theta), x3 = r2 * sin(theta))

  # Ellipse in the x1-x3 plane
  df2 <- tibble::tibble(x1 = r2 * cos(theta), x2 = zeros, x3 = r1 * sin(theta))

  # Ellipse in the x1-x2 plane
  df3 <- tibble::tibble(x1 = r1 * cos(theta), x2 = r2 * sin(theta), x3 = zeros)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 26 and pass it to langevitour().
dataset26 <- cell_cycle_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset26)

Dataset 27 (add)

## To generate three overlapping unit circles in x1-x2 whose x3 coordinate
## oscillates as cos(3 * theta) / 3, followed by uniform noise dimensions
## sample_size: Total number of points in the data set (a multiple of three fills it exactly)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x3 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
curvy_cell_cycle_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The three circles share the sample equally; leftovers are dropped
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 3)

  # Offset used to place the three circle centres
  r <- sqrt(3) / 3

  # One set of angles (and hence one x3 profile) is reused for all circles
  theta <- runif(cluster_size, 0, 2 * pi)
  wave <- cos(3 * theta) / 3

  # Circle centred at (0, r)
  df1 <- tibble::tibble(x1 = cos(theta), x2 = r + sin(theta), x3 = wave)

  # Circle centred at (0.5, -r / 2)
  df2 <- tibble::tibble(x1 = cos(theta) + 0.5, x2 = sin(theta) - r / 2, x3 = wave)

  # Circle centred at (-0.5, -r / 2)
  df3 <- tibble::tibble(x1 = cos(theta) - 0.5, x2 = sin(theta) - r / 2, x3 = wave)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 27 and pass it to langevitour().
dataset27 <- curvy_cell_cycle_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset27)

Dataset 28 (nonlinear)

## To generate two copies of a sine curve (x1-x2) combined with points drawn
## inside a unit disk (x3-x4), followed by uniform noise dimensions.
## The second copy reuses the same draws, shifted by +1 in x1 and x2 and
## centred at (6, 6) instead of (4, 4) in x3-x4.
## sample_size: Total number of points in the data set (an even number fills it exactly)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
two_curvy_panckakes_with_noise <- function(sample_size = 300, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The two copies share the sample equally; a leftover point is dropped
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 2)

  # Polar coordinates of points inside the unit disk
  phi <- runif(cluster_size, max = 2 * pi)
  rho <- sqrt(runif(cluster_size))

  # Sine curve over [0, 1.8 * pi]
  theta <- runif(cluster_size, 0, 1.80 * pi)
  x <- theta
  y <- sin(theta)

  disk_x <- rho * cos(phi)
  disk_y <- rho * sin(phi)

  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = disk_x + 4, x4 = disk_y + 4)
  df2 <- tibble::tibble(x1 = x + 1, x2 = y + 1, x3 = disk_x + 6, x4 = disk_y + 6)

  df <- dplyr::bind_rows(df1, df2)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 28 and pass it to langevitour().
dataset28 <- two_curvy_panckakes_with_noise(
  sample_size = 300,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset28)

Dataset 29 (add)

## To generate the snedata::taspheres() structure (d = 3, n_spheres = 4, r = 3)
## followed by uniform noise dimensions
## sample_size: Total number of points requested; one thirteenth of it is passed to
##              taspheres() as n_samples (presumably the per-small-sphere size, with the
##              enclosing sphere making up the rest -- verify against snedata)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after the taspheres() columns (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: the taspheres() result without its labels column, plus noise columns
small_big_sphere_with_noise <- function(sample_size = 390, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The sample is divided into 13 equal shares; leftovers are dropped
  if ((sample_size %% 13) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  small_sphere_sample_size <- floor(sample_size / 13)

  # Creates a dataframe consisting of samples from the d-spheres of radius r
  # enclosed within a larger d-sphere of radius 5 * r
  df <- snedata::taspheres(n_samples = small_sphere_sample_size, d = 3, n_spheres = 4, r = 3)
  df <- dplyr::select(df, -labels)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns line up with whatever taspheres() returned, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 29 and pass it to langevitour().
dataset29 <- small_big_sphere_with_noise(
  sample_size = 390,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset29)

Dataset 30 (branching)

## To generate seven polynomial branches in x1-x2, with x3 drawn around 10 and
## x4 around -10, followed by uniform noise dimensions
## sample_size: Total number of points in the data set (a multiple of seven fills it exactly)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
seven_branching_data_with_noise <- function(sample_size = 210, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The seven branches share the sample equally; leftovers are dropped
  if ((sample_size %% 7) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 7)

  # Branch 1: negated cubic
  x <- runif(cluster_size, -2, 2)
  y <- -(x^3 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 2: cubic
  x <- runif(cluster_size, -2, 1.5)
  y <- (x^3 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 3: parabola with vertex at x1 = 3
  x <- runif(cluster_size, -2, 1.5)
  y <- (1 + (x - 3)^2 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.1)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 4: inverted parabola with vertex at x1 = 3
  x <- runif(cluster_size, -0.5, 3)
  y <- (1 + -(x - 3)^2 + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.1)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df4 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 5: cubic shifted up by 20
  x <- runif(cluster_size, -1, 1)
  y <- (20 + x^3 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.01)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df5 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 6: parabola shifted up by 10
  x <- runif(cluster_size, -2, 2)
  y <- (x^2 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.01) + 10
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df6 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 7: parabola shifted up by 15
  x <- runif(cluster_size, -2, 2)
  y <- (x^2 + runif(cluster_size, 0, 0.2)) + runif(cluster_size, 0, 0.01) + 15
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df7 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6, df7)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 30 and pass it to langevitour().
dataset30 <- seven_branching_data_with_noise(
  sample_size = 420,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset30)

Dataset 31 (branching)

## To generate four exponential/logarithmic branches in x1-x2 plus a small
## uniform patch of points, with x3 drawn around 10 and x4 around -10,
## followed by uniform noise dimensions
## sample_size: Total number of points requested; 10% go to the uniform patch and the
##              remaining 90% are split equally between the four branches
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
four_branching_data_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # 90% of the sample is shared equally by the four branches; leftovers are dropped
  patch_size <- sample_size * 0.1
  branch_total <- sample_size - patch_size
  if ((branch_total %% 4) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(branch_total / 4)

  # Branch 1: exp(x) over [-5, 1]
  x <- runif(cluster_size, -5, 1)
  y <- (exp(x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 2: exp(-x) over [-1, 5]
  x <- runif(cluster_size, -1, 5)
  y <- (exp(-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 3: log(x) over [0, 5]
  x <- runif(cluster_size, 0, 5)
  y <- (log(x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Branch 4: log(-x) over [-5, 0]
  x <- runif(cluster_size, -5, 0)
  y <- (log(-x) + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df4 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Uniform patch over x1 in [-5, 0]; x2 is the sum of two U(0, 0.8) draws
  x <- runif(patch_size, -5, 0)
  y <- runif(patch_size, 0, 0.8) + runif(patch_size, 0, 0.8)
  z <- rnorm(patch_size, 10, 0.03)
  w <- -rnorm(patch_size, 10, 0.03)
  df5 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when the group sizes were rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 31 and pass it to langevitour().
dataset31 <- four_branching_data_with_noise(
  sample_size = 400,
  num_of_noise_dim = 3,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset31)

Dataset 32 (branching)

## To generate eight exponential branches exp(rate * x) in x1-x2, with x3 drawn
## around 10 and x4 around -10, followed by uniform noise dimensions
## sample_size: Total number of points in the data set (a multiple of eight fills it exactly)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
eight_branching_data_with_noise <- function(sample_size = 400, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The eight branches share the sample equally; leftovers are dropped
  if ((sample_size %% 8) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 8)

  # One entry per branch: x1 range, exponent rate and the upper bound of the
  # second jitter term (only the first branch uses the wider U(0, 1) jitter)
  branch_specs <- list(
    list(lower = -1,   upper = 2,   rate = 1,    jitter = 1),
    list(lower = -1,   upper = 1,   rate = 2,    jitter = 0.2),
    list(lower = -1,   upper = 0.6, rate = 3,    jitter = 0.2),
    list(lower = -1,   upper = 3,   rate = 0.5,  jitter = 0.2),
    list(lower = -2,   upper = 1,   rate = -1,   jitter = 0.2),
    list(lower = -1,   upper = 1,   rate = -2,   jitter = 0.2),
    list(lower = -0.6, upper = 1,   rate = -3,   jitter = 0.2),
    list(lower = -3,   upper = 1,   rate = -0.5, jitter = 0.2)
  )

  # Branches are generated in order, and within a branch the draws happen in
  # the order x1, two jitter terms, x3, x4, so seeded output is reproducible
  branches <- lapply(branch_specs, function(spec) {
    x <- runif(cluster_size, spec$lower, spec$upper)
    y <- (exp(spec$rate * x) + runif(cluster_size, 0, 0.1)) +
      runif(cluster_size, 0, spec$jitter)
    z <- rnorm(cluster_size, 10, 0.03)
    w <- -rnorm(cluster_size, 10, 0.03)
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  })

  df <- dplyr::bind_rows(branches)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 32 and pass it to langevitour().
dataset32 <- eight_branching_data_with_noise(
  sample_size = 400,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset32)

Dataset 33 (clustering)

## To generate two Gaussian clusters in 4-D plus a "doublet" group made of
## row-wise averages of the two clusters, followed by uniform noise dimensions
## sample_size: Total number of points; must be a multiple of 2.2 because the doublet
##              group is 20% of one cluster (two clusters + 0.2 cluster)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
one_doublet_with_noise <- function(sample_size = 110, with_seed = NULL, num_of_noise_dim = 6,
                                   min_noise = -0.05, max_noise = 0.05) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The check is done on integers (x10 / 22) because %% with a non-integer
  # divisor such as 2.2 is subject to floating-point representation error
  if (((sample_size * 10) %% 22) != 0) {
    stop("The sample size should be a product of 2.2.")
  }
  cluster_size <- (sample_size * 10) / 22

  # Cluster centred at (0, 1, 0, 0)
  df1 <- tibble::tibble(
    x = rnorm(cluster_size, mean = 0, sd = 0.05),
    y = rnorm(cluster_size, mean = 1, sd = 0.05),
    z = rnorm(cluster_size, mean = 0, sd = 0.05),
    w = rnorm(cluster_size, mean = 0, sd = 0.05)
  )

  # Cluster centred at (1, 0, 0, 0)
  df2 <- tibble::tibble(
    x = rnorm(cluster_size, mean = 1, sd = 0.05),
    y = rnorm(cluster_size, mean = 0, sd = 0.05),
    z = rnorm(cluster_size, mean = 0, sd = 0.05),
    w = rnorm(cluster_size, mean = 0, sd = 0.05)
  )

  # Doublets: average the two clusters row by row, then keep a random
  # subset whose size is 20% of one cluster
  df3_new <- (df1 + df2) / 2
  samp <- sample(nrow(df3_new), cluster_size * 0.20)
  df3 <- df3_new[samp, ]

  df <- dplyr::bind_rows(df1, df2, df3)
  df <- dplyr::rename(df, x1 = x, x2 = y, x3 = z, x4 = w)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns always line up with the generated structure, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 33 and pass it to langevitour().
dataset33 <- one_doublet_with_noise(
  sample_size = 110,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset33)

Dataset 34 (nonlinear)

## To generate the two arms of a parabola in x1-x2 plus a Gaussian cluster between
## them, with x3 drawn around 10 and x4 around -10, followed by uniform noise dimensions
## sample_size: Total number of points requested; 20% go to the Gaussian cluster and the
##              remaining 80% are split equally between the two arms
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
two_curvilinear_data_with_noise <- function(sample_size = 250, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # 80% of the sample is shared equally by the two arms; a leftover is dropped
  middle_size <- sample_size * 0.2
  arm_total <- sample_size - middle_size
  if ((arm_total %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(arm_total / 2)

  # Left arm: x^2 over [-2, -0.5]
  x <- runif(cluster_size, -2, -0.5)
  y <- (x^2 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df1 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Right arm: x^2 over [0.5, 2]
  x <- runif(cluster_size, 0.5, 2)
  y <- (x^2 + runif(cluster_size, 0, 0.1)) + runif(cluster_size, 0, 0.2)
  z <- rnorm(cluster_size, 10, 0.03)
  w <- -rnorm(cluster_size, 10, 0.03)
  df2 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  # Gaussian cluster centred at (0, 1.5) in x1-x2
  x <- rnorm(middle_size, mean = 0, sd = 0.4)
  y <- rnorm(middle_size, mean = 1.5, sd = 0.5)
  z <- rnorm(middle_size, 10, 0.03)
  w <- -rnorm(middle_size, 10, 0.03)
  df3 <- tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)

  df <- dplyr::bind_rows(df1, df2, df3)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when the group sizes were rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 34 and pass it to langevitour().
dataset34 <- two_curvilinear_data_with_noise(
  sample_size = 500,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset34)

Dataset 35 (add)

## To generate the snedata::sphere() structure followed by uniform noise dimensions
## sample_size: Number of points passed to sphere()
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x3 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: the sphere() result without its color column, renamed x1, x2, x3, plus noise columns
sphere_data_with_noise <- function(sample_size = 250, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  df <- snedata::sphere(sample_size)
  df <- dplyr::select(df, -color)

  # The remaining columns are expected to be the three coordinates
  names(df) <- paste0("x", 1:3)

  # Append the noise dimensions. Rows are counted from df so the columns
  # line up with whatever sphere() returned, and seq_len() makes
  # num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Build dataset 35 and pass it to langevitour().
dataset35 <- sphere_data_with_noise(
  sample_size = 500,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset35)

Dataset 36 (clustering)

## To generate three Gaussian clusters in 10-D plus three "doublet" groups made of
## row-wise averages of each pair of clusters, followed by uniform noise dimensions
## sample_size: Total number of points; a multiple of 4.2 fills it exactly
##              (three clusters + doublet groups of 40%, 30% and 50% of a cluster)
## with_seed: Seed passed to set.seed(); NULL leaves the random number generator untouched
## num_of_noise_dim: Number of noise dimensions appended after x10 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise dimensions

## output: df: tibble
three_doublets_with_noise <- function(sample_size = 210, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The divisibility check is done on integers (x10 / 42): `sample_size %% 4.2`
  # suffers from floating-point representation error and is non-zero even
  # for exact multiples such as 210, which raised a spurious warning.
  if (((sample_size * 10) %% 42) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- floor(sample_size / 4.2)

  # Draw one cluster: ten N(mean, 0.05) coordinates, drawn in column order
  draw_cluster <- function(means) {
    cols <- lapply(means, function(m) rnorm(cluster_size, mean = m, sd = 0.05))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  # Cluster 1 is offset along x1, cluster 2 sits at (1, ..., 1)
  df1 <- draw_cluster(c(3, 1, 1, 1, 1, 1, 1, 1, 1, 1))
  df2 <- draw_cluster(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1))

  # Doublets between clusters 1 and 2: a random 40% of the row-wise averages
  df3_new <- (df1 + df2) / 2
  samp <- sample(nrow(df3_new), cluster_size * 0.40)
  df3 <- df3_new[samp, ]

  # Cluster 3 is offset along x4
  df4 <- draw_cluster(c(1, 1, 1, 3, 1, 1, 1, 1, 1, 1))

  # Doublets between clusters 2 and 3: a random 30% of the row-wise averages
  df5_new <- (df2 + df4) / 2
  samp1 <- sample(nrow(df5_new), cluster_size * 0.30)
  df5 <- df5_new[samp1, ]

  # Doublets between clusters 1 and 3: a random 50% of the row-wise averages
  df6_new <- (df1 + df4) / 2
  samp2 <- sample(nrow(df6_new), cluster_size * 0.50)
  df6 <- df6_new[samp2, ]

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6)

  # Append the noise dimensions. Rows are counted from df (not sample_size)
  # so the columns still line up when cluster_size was rounded down, and
  # seq_len() makes num_of_noise_dim = 0 a no-op instead of iterating 1:0.
  n_rows <- NROW(df)
  n_signal_dims <- NCOL(df)
  noise_dim_val_list <- list()
  for (j in seq_len(num_of_noise_dim)) {
    # Even-numbered noise dimensions keep the draw, odd-numbered negate it
    noise_sign <- if ((j %% 2) == 0) 1 else -1
    noise_dim_val_list[[paste0("x", n_signal_dims + j)]] <-
      noise_sign * runif(n_rows, min = min_noise, max = max_noise)
  }

  if (length(noise_dim_val_list) > 0) {
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_dim_val_list))
  }

  df
}
# Dataset 36: three_doublets_with_noise() output with six noise dimensions
# drawn within [-0.05, 0.05], handed to langevitour()
dataset36 <- three_doublets_with_noise(
  sample_size = 210,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset36)

Dataset 37 (clustering)

## To generate four 7-d Gaussian clusters plus one group of "doublets"
## (midpoints between the first two clusters), with optional noise dimensions
## sample_size: Total number of points; must be a multiple of 4.4 (four full
##   clusters plus doublets worth 0.4 of a cluster). The default is 220 because
##   the previous default of 210 always failed this check
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x7 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

one_doublet_four_clusters_with_noise <- function(sample_size = 220, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Divisibility by 4.4, tested on integers (x10) to avoid floating-point `%%`
  if (((sample_size * 10) %% 44) != 0) {
    stop("The sample size should be a product of 4.4.")
  }
  cluster_size <- (sample_size * 10) / 44

  # One Gaussian cluster; columns are drawn in order x1, x2, ...
  gaussian_cluster <- function(means) {
    cols <- lapply(means, function(m) rnorm(cluster_size, mean = m, sd = 0.05))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  df1 <- gaussian_cluster(c(0, 0, 0, 1, 0, 0, 1))
  df2 <- gaussian_cluster(c(0, 0, 0, 0, 1, 0, 0))

  # Doublets: a random 40% of a cluster's worth of row-wise midpoints
  midpoints <- (df1 + df2) / 2
  df3 <- midpoints[sample(nrow(midpoints), cluster_size * 0.40), ]

  df4 <- gaussian_cluster(c(0, 0, 1, 0, 0, 0, 0))
  df5 <- gaussian_cluster(rep(0, 7))

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5)

  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)

    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 37: one_doublet_four_clusters_with_noise() output with six noise
# dimensions drawn within [-0.05, 0.05], handed to langevitour()
dataset37 <- one_doublet_four_clusters_with_noise(
  sample_size = 440,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset37)

Dataset 38 (clustering)

## To generate two 10-d Gaussian clusters whose columns have different
## standard deviations, plus one group of "doublets" (midpoints between the
## two clusters), with optional uniform noise dimensions
## sample_size: Total number of points; must be a multiple of 2.6
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x10 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

one_doublet_dfifferent_var_clusters_with_noise <- function(sample_size = 260, with_seed = NULL, num_of_noise_dim = 0,
                                                           min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Reject sizes that are not a multiple of 2.6 (checked on integers, x10)
  if (((sample_size * 10) %% 26) != 0) {
    stop("The sample size should be a product of 2.6.")
  }
  cluster_size <- sample_size / 2.6

  # One Gaussian cluster with a per-column mean and sd, drawn in column order
  gaussian_cluster <- function(means, sds) {
    cols <- Map(function(m, s) rnorm(cluster_size, mean = m, sd = s), means, sds)
    names(cols) <- paste0("x", seq_along(cols))
    tibble::as_tibble(cols)
  }

  df1 <- gaussian_cluster(
    means = c(1, 0, 0, 1, 0, 0, 1, 1, 0, 0),
    sds = c(0.1, 0.08, 0.05, 0.05, 0.08, 0.08, 0.08, 0.02, 0.02, 0.02)
  )
  df2 <- gaussian_cluster(
    means = c(0, 0, 0, 1, 1, 0, 0, 1, 0, 0),
    sds = c(0.02, 0.02, 0.02, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02)
  )

  # Doublets: a random 60% of a cluster's worth of row-wise midpoints
  midpoints <- (df1 + df2) / 2
  keep_rows <- sample(nrow(midpoints), cluster_size * 0.60)
  df3 <- midpoints[keep_rows, ]

  df <- dplyr::bind_rows(df1, df2, df3)

  # Nothing more to do when no noise dimensions were requested
  if (num_of_noise_dim == 0) {
    return(df)
  }

  # Odd-numbered noise columns are negated draws, even-numbered are as drawn
  noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
    draws <- runif(sample_size, min = min_noise, max = max_noise)
    if ((j %% 2) == 0) draws else (-1) * draws
  })
  names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

  dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
}
# Dataset 38: one_doublet_dfifferent_var_clusters_with_noise() output with six
# noise dimensions drawn within [-0.05, 0.05], handed to langevitour()
dataset38 <- one_doublet_dfifferent_var_clusters_with_noise(
  sample_size = 260,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset38)

Dataset 39 (clustering)

## To generate one curved 10-d cluster (cos/sin of a shared angle plus Gaussian
## jitter), one Gaussian cluster, and "doublets" (midpoints between the two),
## with optional uniform noise dimensions
## sample_size: Total number of points; expected to be a multiple of 2.8 (two
##   full clusters plus doublets worth 0.8 of a cluster)
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x10 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

one_doublet_dfifferent_pattern_clusters_with_noise <- function(sample_size = 280, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Divisibility by 2.8 is tested on integers (x10): `%%` with a non-integer
  # divisor is not reliable in floating point
  if (((sample_size * 10) %% 28) != 0) {
    warning("The sample size should be a product of number of clusters.")
    cluster_size <- floor(sample_size / 2.8)
  } else {
    cluster_size <- (sample_size * 10) / 28
  }

  # Curved cluster: odd columns follow cos(theta), even columns sin(theta),
  # each shifted by Gaussian jitter with mean 1 and a column-specific sd
  theta <- runif(cluster_size, 0.20, 0.60 * pi)
  jitter_sd <- c(0.5, 0.03, 0.03, 0.03, 0.03, 0.03, 0.05, 0.03, 0.3, 0.03)
  curve_cols <- lapply(seq_along(jitter_sd), function(k) {
    base <- if ((k %% 2) == 1) cos(theta) else sin(theta)
    base + rnorm(cluster_size, 1, jitter_sd[k])
  })
  names(curve_cols) <- paste0("x", seq_along(jitter_sd))
  df1 <- tibble::as_tibble(curve_cols)

  # Gaussian cluster with per-column mean and sd
  gauss_cols <- Map(
    function(m, s) rnorm(cluster_size, mean = m, sd = s),
    c(1, 0, 0, 1, 0, 0, 1, 1, 0, 0),
    c(0.1, 0.08, 0.05, 0.05, 0.08, 0.08, 0.08, 0.02, 0.02, 0.02)
  )
  names(gauss_cols) <- paste0("x", seq_along(gauss_cols))
  df2 <- tibble::as_tibble(gauss_cols)

  # Doublets: a random 80% of a cluster's worth of row-wise midpoints
  midpoints <- (df1 + df2) / 2
  df3 <- midpoints[sample(nrow(midpoints), cluster_size * 0.80), ]

  df <- dplyr::bind_rows(df1, df2, df3)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually built; this can differ from
    # sample_size when sample_size is not a multiple of 2.8
    n_rows <- NROW(df)

    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 39: one_doublet_dfifferent_pattern_clusters_with_noise() output with
# six noise dimensions drawn within [-0.05, 0.05], handed to langevitour()
dataset39 <- one_doublet_dfifferent_pattern_clusters_with_noise(
  sample_size = 280,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset39)

Dataset 40 (clustering)

## To generate four 10-d Gaussian clusters arranged as two pairs, each pair
## with its own group of "doublets" (midpoints between the two clusters of the
## pair), with optional uniform noise dimensions
## sample_size: Total number of points; must be a multiple of 4.4
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x10 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

two_doublets_parallel_with_noise <- function(sample_size = 440, with_seed = NULL, num_of_noise_dim = 0,
                                             min_noise = -0.05, max_noise = 0.05) {
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Reject sizes that are not a multiple of 4.4 (checked on integers, x10)
  if (((sample_size * 10) %% 44) != 0) {
    stop("The sample size should be a product of 4.4.")
  }
  cluster_size <- (sample_size * 10) / 44

  # One Gaussian cluster (sd 0.05 in every column), drawn in column order
  gaussian_cluster <- function(means) {
    cols <- lapply(means, function(m) rnorm(cluster_size, mean = m, sd = 0.05))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  # A random 20% of a cluster's worth of row-wise midpoints of `a` and `b`
  doublets_between <- function(a, b) {
    midpoints <- (a + b) / 2
    midpoints[sample(nrow(midpoints), cluster_size * 0.20), ]
  }

  # First pair and its doublets
  df1 <- gaussian_cluster(c(1, 0, 0, 1, 0, 0, 1, 1, 0, 0))
  df2 <- gaussian_cluster(c(0, 0, 0, 1, 1, 0, 0, 1, 0, 0))
  df3 <- doublets_between(df1, df2)

  # Second pair and its doublets
  df4 <- gaussian_cluster(c(-1, 0, 0, 1, 0, 0, -1, -1, 0, 0))
  df5 <- gaussian_cluster(c(0, 0, 0, -1, -1, 0, 0, -1, 0, 0))
  df6 <- doublets_between(df4, df5)

  df <- dplyr::bind_rows(df1, df2, df3, df4, df5, df6)

  # Nothing more to do when no noise dimensions were requested
  if (num_of_noise_dim == 0) {
    return(df)
  }

  # Odd-numbered noise columns are negated draws, even-numbered are as drawn
  noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
    draws <- runif(sample_size, min = min_noise, max = max_noise)
    if ((j %% 2) == 0) draws else (-1) * draws
  })
  names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

  dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
}
# Dataset 40: two_doublets_parallel_with_noise() output with six noise
# dimensions drawn within [-0.05, 0.05], handed to langevitour()
dataset40 <- two_doublets_parallel_with_noise(
  sample_size = 440,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset40)

Dataset 41 (clustering)

## To generate two 7-d Gaussian clusters, "doublets" (midpoints between them)
## and a small wide-spread background group, with optional noise dimensions
## sample_size: Total number of points; expected to be a multiple of 2.5 (two
##   full clusters, doublets worth 0.2 and background worth 0.3 of a cluster)
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x7 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

one_doublets_with_bkg_noise <- function(sample_size = 250, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Warn and round the cluster size down when sample_size is not a multiple
  # of 2.5
  if ((sample_size %% 2.5) != 0) {
    warning("The sample size should be a product of number of clusters.")
    cluster_size <- floor(sample_size / 2.5)
  } else {
    cluster_size <- sample_size / 2.5
  }

  # One Gaussian cluster with a per-column mean and sd, drawn in column order
  gaussian_cluster <- function(means, sds) {
    cols <- Map(function(m, s) rnorm(cluster_size, mean = m, sd = s), means, sds)
    names(cols) <- paste0("x", seq_along(cols))
    tibble::as_tibble(cols)
  }

  df1 <- gaussian_cluster(c(1, 0, 0, 1, 0, 0, 1), 0.05)
  df2 <- gaussian_cluster(c(0, 0, 0, 1, 1, 0, 0), 0.05)

  # Doublets: a random 20% of a cluster's worth of row-wise midpoints
  midpoints <- (df1 + df2) / 2
  df3 <- midpoints[sample(nrow(midpoints), cluster_size * 0.20), ]

  # Background: a wide cluster of which a random 30% of rows is kept
  background <- gaussian_cluster(
    means = c(0, 0, 0.5, 0.2, 0.2, 0, 0),
    sds = c(0.2, 0.5, 0.5, 0.5, 0.3, 0.5, 0.3)
  )
  df4 <- background[sample(nrow(background), cluster_size * 0.30), ]

  df <- dplyr::bind_rows(df1, df2, df3, df4)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually built: the fractional groups are
    # truncated, so the total can fall short of sample_size
    n_rows <- NROW(df)

    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 41: one_doublets_with_bkg_noise() output with six noise dimensions
# drawn within [-0.05, 0.05], handed to langevitour()
dataset41 <- one_doublets_with_bkg_noise(
  sample_size = 250,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset41)

Dataset 42 (clustering)

## To generate two curved 4-d branches (cos/sin of an angle and of its
## negative, plus Gaussian jitter), with optional uniform noise dimensions
## sample_size: Total number of points; expected to be even (two branches)
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x4 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

curvy_branching_with_noise <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Warn and round the branch size down when sample_size is odd
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
    cluster_size <- floor(sample_size / 2)
  } else {
    cluster_size <- sample_size / 2
  }

  # One branch: columns alternate cos/sin of `direction * angle`, each with
  # Gaussian jitter (mean 1, sd 0.06); the angle is drawn before the jitter
  branch <- function(direction) {
    angle <- direction * runif(cluster_size, 0.20, 0.90 * pi)
    tibble::tibble(
      x1 = cos(angle) + rnorm(cluster_size, 1, 0.06),
      x2 = sin(angle) + rnorm(cluster_size, 1, 0.06),
      x3 = cos(angle) + rnorm(cluster_size, 1, 0.06),
      x4 = sin(angle) + rnorm(cluster_size, 1, 0.06)
    )
  }

  df1 <- branch(1)
  df2 <- branch(-1)

  df <- dplyr::bind_rows(df1, df2)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually built; an odd sample_size yields
    # one row fewer than requested
    n_rows <- NROW(df)

    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 42: curvy_branching_with_noise() output with six noise dimensions
# drawn within [-0.05, 0.05], handed to langevitour()
dataset42 <- curvy_branching_with_noise(
  sample_size = 250,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset42)

Dataset 43 (clustering)

## To generate three 4-d Gaussian clusters, two groups of "doublets"
## (midpoints between the first cluster and each of the other two) and a
## wide-spread background group, with optional uniform noise dimensions
## sample_size: Total number of points; expected to be a multiple of 4 (three
##   full clusters, two doublet groups of 0.2 and background of 0.6 of a cluster)
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x4 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

two_doublets_with_bkg_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Warn and round the cluster size down when sample_size is not a multiple
  # of 4
  if ((sample_size %% 4) != 0) {
    warning("The sample size should be a product of number of clusters.")
    cluster_size <- floor(sample_size / 4)
  } else {
    cluster_size <- sample_size / 4
  }

  # `n` Gaussian points with the given column means and a common sd
  gaussian_cluster <- function(n, means, sd) {
    cols <- lapply(means, function(m) rnorm(n, mean = m, sd = sd))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  # A random 20% of a cluster's worth of row-wise midpoints of `a` and `b`
  doublets_between <- function(a, b) {
    midpoints <- (a + b) / 2
    midpoints[sample(nrow(midpoints), cluster_size * 0.20), ]
  }

  df1 <- gaussian_cluster(cluster_size, c(0, 0, 0, 0), 0.05)
  df2 <- gaussian_cluster(cluster_size, c(1, 0, 0, 0), 0.05)
  df6 <- doublets_between(df1, df2)
  df3 <- gaussian_cluster(cluster_size, c(0, 1, 0, 0), 0.05)
  df7 <- doublets_between(df1, df3)

  # Background group: 60% of a cluster, spread with sd 0.5 around the origin
  df4 <- gaussian_cluster(cluster_size * 0.6, c(0, 0, 0, 0), 0.5)

  df <- dplyr::bind_rows(df1, df2, df3, df6, df7, df4)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually built: the fractional groups are
    # truncated, so the total can fall short of sample_size
    n_rows <- NROW(df)

    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 43: two_doublets_with_bkg_noise() output with six noise dimensions
# drawn within [-0.05, 0.05], handed to langevitour()
dataset43 <- two_doublets_with_bkg_noise(
  sample_size = 400,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset43)

Dataset 44 (clustering)

## To generate two 4-d nonlinear clusters in which x2, x3 and x4 follow
## -exp(x1) plus uniform jitter, the second cluster shifted up by 3, with
## optional uniform noise dimensions
## sample_size: Total number of points; expected to be even (two clusters)
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x4 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: df: tibble

two_nonlinear_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Warn and round the cluster size down when sample_size is odd
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
    cluster_size <- floor(sample_size / 2)
  } else {
    cluster_size <- sample_size / 2
  }

  # One cluster: x1 is uniform on [-8, 1.5]; x2..x4 are independent jittered
  # copies of `offset - exp(x1)`, drawn in column order
  curved_cluster <- function(offset) {
    x <- runif(cluster_size, -8, 1.5)
    jittered_curve <- function() {
      offset - (exp(x) + runif(cluster_size, 0, 1)) + runif(cluster_size, 0, 0.7)
    }
    y <- jittered_curve()
    z <- jittered_curve()
    w <- jittered_curve()
    tibble::tibble(x1 = x, x2 = y, x3 = z, x4 = w)
  }

  df1 <- curved_cluster(0)
  df2 <- curved_cluster(3)

  df <- dplyr::bind_rows(df1, df2)

  if (num_of_noise_dim > 0) {
    # Size the noise by the rows actually built; an odd sample_size yields
    # one row fewer than requested
    n_rows <- NROW(df)

    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 44: two_nonlinear_with_noise() output with six noise dimensions
# drawn within [-0.05, 0.05], handed to langevitour()
dataset44 <- two_nonlinear_with_noise(
  sample_size = 200,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

langevitour(dataset44)

Dataset 45 (add)

## To generate two copies of snedata's S-curve-with-a-hole data (the second
## shifted by +1 in every coordinate), with optional uniform noise dimensions
## sample_size: Requested number of points; half is passed to
##   snedata::s_curve_hole(). The returned size is recomputed from the data
##   (NOTE(review): presumably smaller because points are removed for the hole)
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x3 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: list(df = data, sample_size = number of rows in df)

two_s_curve_hole_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # S curve with a hole; the `color` column is dropped and the remaining
  # three columns are renamed x1..x3
  df1 <- snedata::s_curve_hole(n_samples = sample_size / 2, noise = 0)
  df1 <- dplyr::select(df1, -color)
  names(df1) <- paste0("x", 1:3)

  # Second copy, shifted by 1 in every coordinate
  df2 <- df1 + 1

  df <- dplyr::bind_rows(df1, df2)

  # From here on use the number of rows actually produced
  sample_size <- NROW(df)

  if (num_of_noise_dim > 0) {
    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  list(df = df, sample_size = sample_size)
}
# Dataset 45: two_s_curve_hole_with_noise() output with six noise dimensions
# drawn within [-0.05, 0.05]; the function returns the data together with its
# actual row count
dataset45 <- two_s_curve_hole_with_noise(
  sample_size = 400,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

# Number of rows actually generated
dataset45$sample_size

langevitour(dataset45$df)

Dataset 46 (add)

## To generate three 2-d grids embedded in 4-d, lying in the (x1, x2),
## (x1, x3) and (x1, x4) planes respectively, with optional noise dimensions
## n_value: Passed as `n` to snedata::grid_data() for each grid
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x4 (0 for none)
## min_noise, max_noise: Limits of the uniform noise

## output: list(df = data, sample_size = number of rows in df)

three_grid_with_noise <- function(n_value = 19, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # One grid placed in the (x1, x<second_axis>) plane. The two remaining
  # axes get small uniform jitter in [-0.01, 0.01], filled in increasing
  # axis order
  grid_plane <- function(second_axis) {
    plane <- dplyr::select(snedata::grid_data(n = n_value), -color)
    names(plane) <- paste0("x", c(1, second_axis))
    for (axis in setdiff(2:4, second_axis)) {
      plane[[paste0("x", axis)]] <- runif(nrow(plane), -0.01, 0.01)
    }
    dplyr::select(plane, x1, x2, x3, x4)
  }

  df1 <- grid_plane(2)
  df2 <- grid_plane(3)
  df3 <- grid_plane(4)

  df <- dplyr::bind_rows(df1, df2, df3)

  sample_size <- NROW(df)

  if (num_of_noise_dim > 0) {
    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(sample_size, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))

    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  list(df = df, sample_size = sample_size)
}
# Dataset 46: three_grid_with_noise() output with six noise dimensions drawn
# within [-0.05, 0.05]; the function returns the data together with its row
# count
dataset46 <- three_grid_with_noise(
  n_value = 19,
  num_of_noise_dim = 6,
  min_noise = -0.05,
  max_noise = 0.05
)

# Number of rows generated
dataset46$sample_size

langevitour(dataset46$df)

Dataset 47 (add)

## To generate two 2-d grids (the second shifted by +3 in both coordinates)
## padded with optional uniform noise dimensions, plus Gaussian background
## points (mean 3, sd 5) in every dimension
## sample_size: Total number of points. 6/26 of them are background; the rest
##   is split over the two grids, so (sample_size * 20/26) / 2 must be a
##   perfect square (e.g. 260 gives two 10 x 10 grids and 60 background points)
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x2 (0 for none)
## min_noise, max_noise: Limits of the uniform noise on the grid points

## output: df: data frame with the grid rows first and background rows last

one_grid_diff_with_bkg_noise <- function(sample_size = 260, with_seed = NULL, num_of_noise_dim = 5,
                                         min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Split the sample once and use the same numbers for validation and
  # generation. The integer ratio 6/26 avoids the rounding of 0.6/2.6
  n_bkg <- sample_size * 6 / 26
  n_grid_points <- sample_size - n_bkg

  if ((n_grid_points %% 2) != 0) {
    stop("The sample size should be a product of two.")
  }

  n_value <- sqrt(n_grid_points / 2)

  if ((n_value %% 1) != 0) {
    stop("The square root should exists.")
  }

  # First grid; the `color` column is dropped and the rest renamed x1, x2
  df1 <- snedata::grid_data(n = n_value)
  df1 <- df1 |>
    dplyr::select(-color)
  names(df1) <- paste0("x", 1:2)

  # Second grid: the first one shifted by 3 in both coordinates
  df3 <- df1 + 3

  df1 <- dplyr::bind_rows(df1, df3)

  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df1)

    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_rows, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))

    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  ## To add background noise: one Gaussian column per existing column
  bkg_cols <- lapply(seq_len(NCOL(df1)), function(j) {
    rnorm(n_bkg, mean = 3, sd = 5)
  })
  names(bkg_cols) <- paste0("x", seq_len(NCOL(df1)))
  df2 <- tibble::as_tibble(bkg_cols)

  dplyr::bind_rows(df1, df2)
}
# Dataset 47: one_grid_diff_with_bkg_noise() output with five noise dimensions
# drawn within [-0.05, 0.05], handed to langevitour()
dataset47 <- one_grid_diff_with_bkg_noise(
  sample_size = 260,
  num_of_noise_dim = 5,
  min_noise = -0.05,
  max_noise = 0.05
)


langevitour(dataset47)

Dataset 48 (add)

## To generate two 2-d grids (the second shifted by +5 in both coordinates)
## padded with optional uniform noise dimensions, plus Gaussian background
## points (mean 3, sd 5) in every dimension
## n_value: Passed as `n` to snedata::grid_data()
## with_seed: Seed passed to set.seed(); NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise columns appended after x2 (0 for none)
## min_noise, max_noise: Limits of the uniform noise on the grid points

## output: list(df = data, sample_size = number of rows in df)

two_grid_with_bkg_noise <- function(n_value = 10, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # First grid; the `color` column is dropped and the rest renamed x1, x2
  df1 <- snedata::grid_data(n = n_value)
  df1 <- dplyr::select(df1, -color)
  names(df1) <- paste0("x", 1:2)

  # Second grid: the first one shifted by 5 in both coordinates
  df3 <- df1 + 5

  df1 <- dplyr::bind_rows(df1, df3)

  n_grid_points <- NROW(df1)

  # Background points number 30% of the grid points, rounded down. Integer
  # arithmetic gives the count that rnorm() previously derived by truncating
  # (1.3 * n) * 0.6 / 2.6, without the floating-point round trip
  n_bkg <- (n_grid_points * 3) %/% 10

  if (num_of_noise_dim > 0) {
    # Odd-numbered noise columns are negated draws, even-numbered are as drawn
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draws <- runif(n_grid_points, min = min_noise, max = max_noise)
      if ((j %% 2) == 0) draws else -draws
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))

    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  ## To add background noise: one Gaussian column per existing column
  bkg_cols <- lapply(seq_len(NCOL(df1)), function(j) {
    rnorm(n_bkg, mean = 3, sd = 5)
  })
  names(bkg_cols) <- paste0("x", seq_len(NCOL(df1)))
  df2 <- tibble::as_tibble(bkg_cols)

  df <- dplyr::bind_rows(df1, df2)

  # Report the number of rows actually produced
  list(df = df, sample_size = NROW(df))
}
# Dataset 48: two grids plus background noise, with six noise dimensions
dataset48 <- two_grid_with_bkg_noise(n_value = 10, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# The function returns a list; show the sample size it reports
dataset48$sample_size

# Pass the data itself to langevitour() for interactive viewing
langevitour(dataset48$df)

Dataset 49 (add)

## To generate 3-D points with the chaos game on four corner points: each new
## point is the midpoint between the previous point and a randomly chosen
## corner. Uniform noise dimensions are appended afterwards.
## sample_size: Number of points to generate
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1, x2, x3 followed by the noise columns
traingular_3D_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Random starting point, then the four corners (one per row)
  trace_point <- runif(3)
  corner_points <- rbind(c(0, 0, 0), c(1, 0, 0), c(0.5, 1, 0), c(0.5, 0.5, 1))

  # Fill a preallocated matrix instead of assigning tibble rows one at a time;
  # seq_len() also keeps sample_size = 0 from looping over c(1, 0)
  points <- matrix(0, nrow = sample_size, ncol = 3,
                   dimnames = list(NULL, c("x1", "x2", "x3")))
  for (i in seq_len(sample_size)) {
    trace_point <- (corner_points[sample(4, 1), ] + trace_point) / 2
    points[i, ] <- trace_point
  }
  df <- tibble::as_tibble(points)

  # Noise dimensions continue the x1, x2, ... naming. The guard makes
  # num_of_noise_dim = 0 valid (1:0 would iterate over c(1, 0)).
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 49: 500 chaos-game points in 3-D with six noise dimensions
dataset49 <- traingular_3D_with_noise(sample_size = 500, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset49)

Dataset 50 (add)

## To generate three equally sized parts: a 2-D chaos-game triangle with
## uniform noise dimensions, Gaussian background noise, and the negated copy
## of the first part
## sample_size: Total number of points; each part gets sample_size / 3 points
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble
triangular_plane_with_bkg_noise <- function(sample_size = 675, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by three
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 3

  # Chaos game: move halfway towards a randomly chosen corner at every step
  trace_point <- runif(2)
  corner_points <- rbind(c(0, 0), c(1, 0), c(0.5, 1))

  # Preallocated matrix; seq_len() keeps cluster_size = 0 from looping
  points <- matrix(0, nrow = cluster_size, ncol = 2,
                   dimnames = list(NULL, c("x1", "x2")))
  for (i in seq_len(cluster_size)) {
    trace_point <- (corner_points[sample(3, 1), ] + trace_point) / 2
    points[i, ] <- trace_point
  }
  df1 <- tibble::as_tibble(points)

  # Noise dimensions continue the x1, x2, ... naming. The guard makes
  # num_of_noise_dim = 0 valid (1:0 would iterate over c(1, 0)).
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(cluster_size, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  ## To add background noise: N(0.025, 0.5) in every dimension
  bkg_cols <- lapply(seq_len(NCOL(df1)), function(j) {
    rnorm(cluster_size, mean = 0.025, sd = 0.5)
  })
  names(bkg_cols) <- paste0("x", seq_len(NCOL(df1)))
  df2 <- tibble::as_tibble(bkg_cols)

  # Third part is the first part with every value (noise included) negated
  dplyr::bind_rows(df1, df2, -df1)
}
# Dataset 50: triangular plane, background noise and mirrored copy
# (675 points in total, six noise dimensions)
dataset50 <- triangular_plane_with_bkg_noise(sample_size = 675, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset50)

Dataset 51 (clustering)

## To generate two curved (arc-shaped) clusters in 4-D, the second shifted by
## 1 in every dimension, followed by uniform noise dimensions
## sample_size: Total number of points; each cluster gets sample_size / 2
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1..x4 followed by the noise columns
two_curvilinear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by two
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 2

  # One arc: draw the angles first, then N(1, 0.06) jitter per coordinate
  make_arc <- function(offset) {
    theta <- runif(cluster_size, 0.20, 0.90 * pi)
    tibble::tibble(
      x1 = offset + cos(theta) + rnorm(cluster_size, 1, 0.06),
      x2 = offset + sin(theta) + rnorm(cluster_size, 1, 0.06),
      x3 = offset + cos(theta) + rnorm(cluster_size, 1, 0.06),
      x4 = offset + sin(theta) + rnorm(cluster_size, 1, 0.06)
    )
  }
  df1 <- make_arc(0)
  df2 <- make_arc(1)
  df <- dplyr::bind_rows(df1, df2)

  # Noise is sized from the rows actually generated, so an odd sample_size
  # (which only warns above) no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 51: two curvilinear clusters (100 points in total, six noise dimensions)
dataset51 <- two_curvilinear_with_noise(sample_size = 100, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset51)

Dataset 52 (clustering)

## To generate two curved clusters in 4-D that differ in angular range: the
## first uses angles in [0.40, 0.70 * pi], the second angles in
## [0.20, 0.90 * pi] and is shifted by 1 in x1 and x2 only. Uniform noise
## dimensions are appended afterwards.
## sample_size: Total number of points; each cluster gets sample_size / 2
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1..x4 followed by the noise columns
two_curvilinear_diff_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by two
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 2

  # Independent N(1, 0.06) jitter for one coordinate
  jitter_1 <- function() rnorm(cluster_size, 1, 0.06)

  # Shorter arc
  theta_short <- runif(cluster_size, 0.40, 0.70 * pi)
  df1 <- tibble::tibble(
    x1 = cos(theta_short) + jitter_1(),
    x2 = sin(theta_short) + jitter_1(),
    x3 = cos(theta_short) + jitter_1(),
    x4 = sin(theta_short) + jitter_1()
  )

  # Longer arc, shifted by 1 in the first two dimensions only
  theta_long <- runif(cluster_size, 0.20, 0.90 * pi)
  df2 <- tibble::tibble(
    x1 = 1 + cos(theta_long) + jitter_1(),
    x2 = 1 + sin(theta_long) + jitter_1(),
    x3 = cos(theta_long) + jitter_1(),
    x4 = sin(theta_long) + jitter_1()
  )

  df <- dplyr::bind_rows(df1, df2)

  # Noise is sized from the rows actually generated, so an odd sample_size
  # (which only warns above) no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 52: two curvilinear clusters with different angular ranges
# (100 points in total, six noise dimensions)
dataset52 <- two_curvilinear_diff_with_noise(sample_size = 100, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset52)

Dataset 53 (clustering)

## To generate three long clusters in 2-D from snedata::long_cluster_data():
## the first colour group shifted by -20, the second group as is, and the
## first group again shifted by +10. Uniform noise dimensions are appended.
## NOTE(review): assumes long_cluster_data(n) yields n points per colour
## group - confirm against snedata.
## sample_size: Total number of points; cluster size is sample_size / 3
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1, x2 followed by the noise columns
two_linear_diff_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by three
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 3

  # Split the generated data into one tibble per colour group
  groups <- dplyr::group_split(
    dplyr::group_by(snedata::long_cluster_data(n = cluster_size), color)
  )

  # Copy of the first group moved by the same amount along x and y
  shift_first <- function(by) {
    moved <- groups[[1]]
    moved$x <- moved$x + by
    moved$y <- moved$y + by
    moved
  }

  df <- dplyr::bind_rows(shift_first(-20), groups[[2]], shift_first(10))
  df <- dplyr::select(df, -color)
  names(df) <- c("x1", "x2")

  # Noise is sized from the rows actually generated, so a sample_size that is
  # not a multiple of three no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 53: shifted long clusters (sample_size = 150, six noise dimensions)
dataset53 <- two_linear_diff_with_noise(sample_size = 150, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset53)

Dataset 54 (clustering)

## To generate three long clusters in 2-D from snedata::long_cluster_data():
## the first colour group shifted by (-20, -20), the second group as is, and
## the first group again shifted by (-10, +10). Uniform noise dimensions are
## appended.
## NOTE(review): assumes long_cluster_data(n) yields n points per colour
## group - confirm against snedata.
## sample_size: Total number of points; cluster size is sample_size / 3
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1, x2 followed by the noise columns
three_linear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by three
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 3

  # Split the generated data into one tibble per colour group
  groups <- dplyr::group_split(
    dplyr::group_by(snedata::long_cluster_data(n = cluster_size), color)
  )

  # Copy of the first group translated by (dx, dy)
  shift_first <- function(dx, dy) {
    moved <- groups[[1]]
    moved$x <- moved$x + dx
    moved$y <- moved$y + dy
    moved
  }

  df <- dplyr::bind_rows(shift_first(-20, -20), groups[[2]], shift_first(-10, 10))
  df <- dplyr::select(df, -color)
  names(df) <- c("x1", "x2")

  # Noise is sized from the rows actually generated, so a sample_size that is
  # not a multiple of three no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 54: three long clusters (sample_size = 150, six noise dimensions)
dataset54 <- three_linear_with_noise(sample_size = 150, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset54)

Dataset 55 (clustering)

## To generate three nonlinear clusters in 4-D: (x1, x2) follow a sine curve
## and (x3, x4) fill a unit disc. All three clusters reuse the same random
## draws and differ only by constant shifts. Uniform noise dimensions are
## appended afterwards.
## sample_size: Total number of points; each cluster gets sample_size / 3
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1..x4 followed by the noise columns
three_nonlinear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by three
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 3

  # Polar coordinates inside the unit disc (sqrt of a uniform radius)
  phi <- runif(cluster_size, max = 2 * pi)
  rho <- sqrt(runif(cluster_size))
  disc_x <- rho * cos(phi)
  disc_y <- rho * sin(phi)

  # Sine curve over [0, 1.8 * pi]
  theta <- runif(cluster_size, 0, 1.80 * pi)
  curve_y <- sin(theta)

  # One cluster: the curve shifted by `curve_shift`, the disc centred at
  # (disc_centre, disc_centre)
  make_cluster <- function(curve_shift, disc_centre) {
    tibble::tibble(
      x1 = theta + curve_shift,
      x2 = curve_y + curve_shift,
      x3 = disc_x + disc_centre,
      x4 = disc_y + disc_centre
    )
  }

  df <- dplyr::bind_rows(make_cluster(0, 4), make_cluster(1, 6), make_cluster(-1, 8))

  # Noise is sized from the rows actually generated, so a sample_size that is
  # not a multiple of three no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 55: three nonlinear clusters (150 points in total, six noise dimensions)
dataset55 <- three_nonlinear_with_noise(sample_size = 150, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset55)

Dataset 56 (clustering)

## To generate three small Gaussian clusters in 4-D (sd = 0.05) together with
## a copy of all three shifted by +2 in every dimension, followed by uniform
## noise dimensions
## sample_size: Total number of points; each of the six clusters gets
##              sample_size / 6 points
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1..x4 followed by the noise columns
three_cluster_mirror_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by six
  if ((sample_size %% 6) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 6

  # One Gaussian cluster with the given per-dimension means and sd = 0.05;
  # columns are drawn in order x1, x2, x3, x4
  make_blob <- function(means) {
    cols <- lapply(means, function(m) rnorm(cluster_size, mean = m, sd = 0.05))
    names(cols) <- paste0("x", seq_along(means))
    tibble::as_tibble(cols)
  }

  df1 <- make_blob(c(0, 0, 0, 0))
  df2 <- make_blob(c(1, 0, 0, 0))
  df3 <- make_blob(c(0, 1, 0, 0))
  df_1 <- dplyr::bind_rows(df1, df2, df3)

  # Second set of clusters: the first set shifted by +2 in every dimension
  df <- dplyr::bind_rows(df_1, df_1 + 2)

  # Noise is sized from the rows actually generated, so a sample_size that is
  # not a multiple of six no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 56: three Gaussian clusters plus their shifted copies
# (150 points in total, six noise dimensions)
dataset56 <- three_cluster_mirror_with_noise(sample_size = 150, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset56)

Dataset 57 (add)

## To generate an S-curve with snedata::s_curve() (noise = 0.05) followed by
## uniform noise dimensions
## sample_size: Number of points, passed to s_curve() as n_samples
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: data with x1, x2, x3 followed by the noise columns
s_curve_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # Keep the three coordinate columns and drop `color`
  df <- dplyr::select(snedata::s_curve(n_samples = sample_size, noise = 0.05), -color)
  names(df) <- c("x1", "x2", "x3")

  # Noise dimensions continue the x1, x2, ... naming. The guard makes
  # num_of_noise_dim = 0 valid (1:0 would iterate over c(1, 0)).
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 57: S-curve with 200 points and six noise dimensions
dataset57 <- s_curve_with_noise(sample_size = 200, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset57)

Dataset 58 (add)

# Mobius experiment
# To generate a Mobius strip with geozoo::mobius.experiment(p = 5, ...) using
# 80% of sample_size, append uniform noise dimensions, and add Gaussian
# background noise sized as 20% of sample_size.
# sample_size: Total number of points requested
# with_seed: Optional seed; NULL leaves the RNG state untouched
# num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
# min_noise, max_noise: Range of the uniform noise
# output: df: tibble

mobius_cluster_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  mobius <- geozoo::mobius.experiment(p = 5, n = sample_size * 0.80)

  df1 <- tibble::as_tibble(mobius$points)
  names(df1) <- paste0("x", seq_along(df1))

  # Noise is sized from the rows mobius.experiment() actually returned rather
  # than from sample_size * 0.80, so the two always agree when bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df1)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  ## To add background noise: N(0, 0.3) in every dimension
  bkg_cols <- lapply(seq_len(NCOL(df1)), function(j) {
    rnorm(sample_size * 0.20, mean = 0, sd = 0.3)
  })
  names(bkg_cols) <- paste0("x", seq_len(NCOL(df1)))
  df2 <- tibble::as_tibble(bkg_cols)

  dplyr::bind_rows(df1, df2)
}
# Dataset 58: Mobius strip with background noise
# (sample_size = 250, six noise dimensions)
dataset58 <- mobius_cluster_with_noise(sample_size = 250, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset58)

Dataset 59 (clustering)

## To generate four long clusters in 2-D from snedata::long_cluster_data()
## (three shifted copies of the first colour group plus the second group),
## append uniform noise dimensions, and add Gaussian background noise
## NOTE(review): assumes long_cluster_data(n) yields n points per colour
## group - confirm against snedata.
## sample_size: Total number of points; split into five equal parts (four
##              clusters and the background noise)
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble
four_long_clusters_with_bkg_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by five
  if ((sample_size %% 5) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 5

  # Split the generated data into one tibble per colour group
  groups <- dplyr::group_split(
    dplyr::group_by(snedata::long_cluster_data(n = cluster_size), color)
  )

  # Copy of the first group translated by (dx, dy)
  shift_first <- function(dx, dy) {
    moved <- groups[[1]]
    moved$x <- moved$x + dx
    moved$y <- moved$y + dy
    moved
  }

  df1 <- dplyr::bind_rows(
    shift_first(-20, -20),
    groups[[2]],
    shift_first(-10, 10),
    shift_first(20, 30)
  )
  df1 <- dplyr::select(df1, -color)
  names(df1) <- c("x1", "x2")

  # Noise dimensions continue the x1, x2, ... naming and are sized from the
  # rows actually generated. The guard makes num_of_noise_dim = 0 valid
  # (1:0 would iterate over c(1, 0)).
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df1)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df1) + seq_len(num_of_noise_dim))
    df1 <- dplyr::bind_cols(df1, tibble::as_tibble(noise_cols))
  }

  ## To add background noise: N(0, 10) in every dimension
  bkg_cols <- lapply(seq_len(NCOL(df1)), function(j) {
    rnorm(cluster_size, mean = 0, sd = 10)
  })
  names(bkg_cols) <- paste0("x", seq_len(NCOL(df1)))
  df2 <- tibble::as_tibble(bkg_cols)

  dplyr::bind_rows(df1, df2)
}
# Dataset 59: four long clusters with background noise
# (sample_size = 150, six noise dimensions)
dataset59 <- four_long_clusters_with_bkg_noise(sample_size = 150, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset59)

Dataset 60 (clustering)

## To generate four equally sized parts in 4-D: an arc, the same kind of arc
## traced with negated angles, a tight Gaussian cluster around 1 (sd = 0.08),
## and a wide Gaussian cloud around 1 (sd = 1). Uniform noise dimensions are
## appended afterwards.
## sample_size: Total number of points; each part gets sample_size / 4
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1..x4 followed by the noise columns
curvy_branching_cluster_with_bkg_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by four
  if ((sample_size %% 4) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 4

  # One arc: angles are drawn first and multiplied by `direction`, then
  # N(1, 0.06) jitter is added per coordinate
  make_arc <- function(direction) {
    angle <- direction * runif(cluster_size, 0.20, 0.90 * pi)
    tibble::tibble(
      x1 = cos(angle) + rnorm(cluster_size, 1, 0.06),
      x2 = sin(angle) + rnorm(cluster_size, 1, 0.06),
      x3 = cos(angle) + rnorm(cluster_size, 1, 0.06),
      x4 = sin(angle) + rnorm(cluster_size, 1, 0.06)
    )
  }

  # Gaussian cloud centred at 1 in all four dimensions
  make_cloud <- function(spread) {
    tibble::tibble(
      x1 = rnorm(cluster_size, mean = 1, sd = spread),
      x2 = rnorm(cluster_size, mean = 1, sd = spread),
      x3 = rnorm(cluster_size, mean = 1, sd = spread),
      x4 = rnorm(cluster_size, mean = 1, sd = spread)
    )
  }

  df1 <- make_arc(1)
  df2 <- make_arc(-1)
  df3 <- make_cloud(0.08)
  df4 <- make_cloud(1)
  df <- dplyr::bind_rows(df1, df2, df3, df4)

  # Noise is sized from the rows actually generated, so a sample_size that is
  # not a multiple of four no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 60: curvy branching clusters with a wide background cloud
# (200 points in total, six noise dimensions)
dataset60 <- curvy_branching_cluster_with_bkg_noise(sample_size = 200, num_of_noise_dim = 6,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset60)

Dataset 61 (add)

## To generate three long clusters in 2-D from snedata::long_cluster_data():
## the first colour group shifted by (-250, -20), the second group as is, and
## the first group with coordinates swapped and one of them negated
## (x = -y, y = x). Uniform noise dimensions are appended afterwards.
## NOTE(review): assumes long_cluster_data(n) yields n points per colour
## group - confirm against snedata.
## sample_size: Total number of points; cluster size is sample_size / 3
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1, x2 followed by the noise columns
three_diff_linear_with_noise <- function(sample_size = 150, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by three
  if ((sample_size %% 3) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 3

  # Split the generated data into one tibble per colour group
  groups <- dplyr::group_split(
    dplyr::group_by(snedata::long_cluster_data(n = cluster_size), color)
  )
  first <- groups[[1]]

  # First group moved far to the left and slightly down
  shifted <- first
  shifted$x <- shifted$x - 250
  shifted$y <- shifted$y - 20

  # First group with x and y swapped and the new x negated
  turned <- tibble::tibble(x = -first$y, y = first$x)

  # `turned` has no color column; bind_rows() fills it before it is dropped
  df <- dplyr::bind_rows(shifted, groups[[2]], turned)
  df <- dplyr::select(df, -color)
  names(df) <- c("x1", "x2")

  # Noise is sized from the rows actually generated, so a sample_size that is
  # not a multiple of three no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 61: three differently oriented long clusters
# (sample_size = 300, eight noise dimensions)
dataset61 <- three_diff_linear_with_noise(sample_size = 300, num_of_noise_dim = 8,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset61)

Dataset 62 (add)

## To generate four long clusters in 2-D from snedata::long_cluster_data():
## the first colour group shifted by (-150, -20), the second group as is, the
## first group with coordinates swapped (x = y - 70, y = -x), and that swapped
## copy shifted by +150 in y. Uniform noise dimensions are appended.
## NOTE(review): assumes long_cluster_data(n) yields n points per colour
## group - confirm against snedata.
## sample_size: Total number of points; cluster size is sample_size / 4
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: tibble with x1, x2 followed by the noise columns
four_diff_long_clutsers_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by four. The modulus
  # must match the divisor: testing %% 3 warned for valid sizes such as 500
  # and let sizes such as 201 through with a fractional cluster size.
  if ((sample_size %% 4) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 4

  # Split the generated data into one tibble per colour group
  groups <- dplyr::group_split(
    dplyr::group_by(snedata::long_cluster_data(n = cluster_size), color)
  )
  first <- groups[[1]]

  # First group moved to the left and slightly down
  shifted <- first
  shifted$x <- shifted$x - 150
  shifted$y <- shifted$y - 20

  # First group with x and y swapped, and the same copy moved up by 150
  turned <- tibble::tibble(x = first$y - 70, y = -first$x)
  turned_up <- tibble::tibble(x = turned$x, y = turned$y + 150)

  # The swapped copies have no color column; bind_rows() fills it before it
  # is dropped
  df <- dplyr::bind_rows(shifted, groups[[2]], turned, turned_up)
  df <- dplyr::select(df, -color)
  names(df) <- c("x1", "x2")

  # Noise is sized from the rows actually generated, so a sample_size that is
  # not a multiple of four no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 62: four differently oriented long clusters
# (sample_size = 500, eight noise dimensions)
dataset62 <- four_diff_long_clutsers_with_noise(sample_size = 500, num_of_noise_dim = 8,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset62)

Dataset 63 (add)

## To generate two S-curves in 3-D: one from snedata::s_curve() and a copy
## with x1 negated and shifted (x1 = -x1 + 5, x2 + 1, x3 + 1). Uniform noise
## dimensions are appended afterwards.
## sample_size: Total number of points; each curve gets sample_size / 2
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: data with x1, x2, x3 followed by the noise columns
two_s_curves_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by two. The modulus
  # must match the divisor: testing %% 3 warned for valid even sizes and let
  # odd multiples of three through with a fractional cluster size.
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 2

  # Keep the three coordinate columns and drop `color`
  df1 <- dplyr::select(snedata::s_curve(n_samples = cluster_size), -color)
  names(df1) <- c("x1", "x2", "x3")

  # Second curve: x1 negated and moved to 5, the other two moved by 1
  df2 <- tibble::tibble(x1 = -df1$x1 + 5, x2 = df1$x2 + 1, x3 = df1$x3 + 1)

  df <- dplyr::bind_rows(df1, df2)

  # Noise is sized from the rows actually generated, so an odd sample_size
  # no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 63: two S-curves (500 points in total, five noise dimensions)
dataset63 <- two_s_curves_with_noise(sample_size = 500, num_of_noise_dim = 5,
    min_noise = -0.05, max_noise = 0.05)

# Pass the generated data to langevitour() for interactive viewing
langevitour(dataset63)

Dataset 64 (add)

## To generate four 2-D pieces built from one filtered random plane (two
## shifted copies and two copies with the coordinates swapped), followed by
## uniform noise dimensions
## sample_size: Requested number of points; must be a multiple of 4. Points
##              failing the filter are dropped, so fewer may be returned.
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: list(df = generated data, sample_size = number of rows in df)
plane_2D_with_hole <- function(sample_size = 100, with_seed = NULL, num_of_noise_dim = 2, min_noise = 0, max_noise = 1) {

  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divided by four
  if ((sample_size %% 4) != 0) {
    stop("The sample size should be a product of 4.")
  }
  cluster_size <- sample_size / 4

  # Linear mix of two uniform draws
  u <- runif(cluster_size, min = 10, max = 30)
  v <- runif(cluster_size, min = 10, max = 20)
  x <- u + v - 10
  y <- v - u + 8

  # Keep only points whose coordinates, measured from the anchor, sum to
  # more than 20
  anchor <- c(1, 1)
  keep <- rowSums(cbind(x - anchor[1], y - anchor[2])) > 20
  df1 <- tibble::tibble(x1 = x[keep], x2 = y[keep])

  # Two copies with the coordinates swapped and shifted
  df2 <- tibble::tibble(x1 = -df1$x2 + 26, x2 = df1$x1 - 15)
  df3 <- tibble::tibble(x1 = df1$x2 + 30, x2 = -df1$x1 + 25)

  df <- dplyr::bind_rows(df1 - 10, df1 + 10, df2, df3)

  # The filter above removes points, so report what was really generated
  sample_size <- NROW(df)

  # Noise dimensions continue the x1, x2, ... naming. The guard makes
  # num_of_noise_dim = 0 valid (1:0 would iterate over c(1, 0)).
  if (num_of_noise_dim > 0) {
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(sample_size, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  list(df = df, sample_size = sample_size)
}
# Dataset 64: 2-D plane with a hole, seeded for reproducibility; the noise
# settings are left at the function defaults
dataset64 <- plane_2D_with_hole(sample_size = 480, with_seed = 20230531)

# The function returns a list; show the sample size it reports
dataset64$sample_size

# Pass the data itself to langevitour() for interactive viewing
langevitour(dataset64$df)

Dataset 65 (add)

## To generate an S-curve from snedata::s_curve() together with its mirror
## image along x1 (x1 = -x1 + 2, x2 and x3 unchanged), followed by uniform
## noise dimensions
## sample_size: Total number of points; each curve gets sample_size / 2
## with_seed: Optional seed; NULL leaves the RNG state untouched
## num_of_noise_dim: Number of noise dimensions to append (0 is allowed)
## min_noise, max_noise: Range of the uniform noise
## output: df: data with x1, x2, x3 followed by the noise columns
mirror_s_curves_with_noise <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 8,
    min_noise = -0.5, max_noise = 0.5) {
  # Only touch the RNG state when a seed is supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # To check that the assigned sample_size is divisible by two. The modulus
  # must match the divisor: testing %% 3 warned for valid even sizes and let
  # odd multiples of three through with a fractional cluster size.
  if ((sample_size %% 2) != 0) {
    warning("The sample size should be a product of number of clusters.")
  }
  cluster_size <- sample_size %/% 2

  # Keep the three coordinate columns and drop `color`
  df1 <- dplyr::select(snedata::s_curve(n_samples = cluster_size), -color)
  names(df1) <- c("x1", "x2", "x3")

  # Mirror image: only x1 is negated and shifted
  df2 <- tibble::tibble(x1 = -df1$x1 + 2, x2 = df1$x2, x3 = df1$x3)

  df <- dplyr::bind_rows(df1, df2)

  # Noise is sized from the rows actually generated, so an odd sample_size
  # no longer fails when the columns are bound.
  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise dimensions are sign-flipped
      if (j %% 2 == 0) draw else -draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df
}
# Dataset 65: 600 points with five noise dimensions bounded by
# [-0.05, 0.05], displayed with langevitour().
dataset65 <- mirror_s_curves_with_noise(sample_size = 600, num_of_noise_dim = 5,
    min_noise = -0.05, max_noise = 0.05)

langevitour(dataset65)

Dataset 66 (clustering)

## To generate Gaussian clusters with a different number of points per cluster
## n: Total number of points; must equal the sum of the cluster sizes
## cluster_size_vec: Number of points in each cluster
## with_seed: Optional seed passed to set.seed(); NULL leaves the RNG untouched
## num_clusters: Number of clusters
## mean_matrix: num_clusters x num_dims matrix; row i holds the means of cluster i
## var_vec: One spread value per cluster; each value is passed to rnorm() as `sd`
## num_dims: Number of signal dimensions (at least two)
## num_noise_dims: Number of uniform noise columns appended (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise columns

## output: df: tibble with columns x1, ..., x(num_dims + num_noise_dims)

gaussian_clusters_diff_points <- function(n = 400,
                                          cluster_size_vec = c(50, 100, 200, 50),
                                          with_seed = NULL,
                                          num_clusters = 4,
                                          mean_matrix = rbind(c(1,0,0,0,0,0), c(0,1,0,0,0,0), c(0,0,1,0,0,0), c(0,0,0,1,0,0)),
                                          var_vec = c(0.02, 0.05, 0.06, 0.1),
                                          num_dims = 6,
                                          num_noise_dims = 4,
                                          min_noise = -0.05,
                                          max_noise = 0.05) {

  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if (n < num_clusters) {
    stop('Number of clusters exceed the number of observations.')
  }

  if (num_dims < 2) {
    stop('There should be at least two dimensions.')
  }

  if (nrow(mean_matrix) != length(var_vec)) {
    stop('The length of mean and variance vectors are different.')
  }

  if (nrow(mean_matrix) != num_clusters) {
    stop('There is not enough mean values for clusters.')
  }

  if (ncol(mean_matrix) != num_dims) {
    stop('There is not enough mean values for dimensions.')
  }

  if (length(var_vec) != num_clusters) {
    stop('There is not enough varaiance values for clusters.')
  }

  # The noise columns are sized by n, so the cluster sizes have to cover
  # every cluster and add up to n.
  if (length(cluster_size_vec) < num_clusters) {
    stop('There is not enough cluster sizes for clusters.')
  }

  if (sum(cluster_size_vec[seq_len(num_clusters)]) != n) {
    stop('The cluster sizes should add up to the number of observations.')
  }

  column_names <- paste0("x", seq_len(num_dims))

  # One tibble per cluster; draws happen cluster by cluster and, within a
  # cluster, dimension by dimension.
  cluster_list <- lapply(seq_len(num_clusters), function(i) {
    cluster_means <- mean_matrix[i, ]
    cluster_cols <- lapply(seq_len(num_dims), function(j) {
      stats::rnorm(cluster_size_vec[i], mean = cluster_means[j], sd = var_vec[i])
    })
    names(cluster_cols) <- column_names
    tibble::as_tibble(cluster_cols)
  })

  df <- dplyr::bind_rows(cluster_list)

  if (num_noise_dims > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_noise_dims), function(j) {
      draw <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise columns are sign-flipped
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_noise_dims))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}
# Dataset 66: four Gaussian clusters of sizes 450, 350, 400 and 300 (1500 in
# total) in six dimensions plus four noise dimensions, displayed with
# langevitour().
dataset66 <- gaussian_clusters_diff_points(n = 1500, cluster_size_vec = c(450, 350, 400, 300), with_seed = NULL, num_clusters = 4, mean_matrix = rbind(c(1,0,0,0,0,0), c(0,1,0,0,0,0), c(0,0,1,0,0,0), c(0,0,0,1,0,0)),
                              var_vec = c(0.02, 0.05, 0.06, 0.1), num_dims = 6, num_noise_dims = 4,
                              min_noise = -0.05, max_noise = 0.05)

langevitour(dataset66)

Dataset 67 (clustering)

## To generate one curvilinear cluster and one Gaussian cluster in 4-d with
## additional uniform noise dimensions
## sample_size: Total number of points; must equal
##              cluster_size_vec[1] + cluster_size_vec[2]
## cluster_size_vec: Sizes of the curvilinear (first) and Gaussian (second) cluster
## with_seed: Optional seed passed to set.seed(); NULL leaves the RNG untouched
## num_of_noise_dim: Number of noise columns appended after x1..x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise columns

## output: df: tibble with columns x1..x4 followed by the noise columns

cluster_and_curvilinear_with_noise <- function(sample_size = 200, cluster_size_vec = c(50, 150), with_seed = NULL, num_of_noise_dim = 3,
    min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The noise columns must have one value per generated row, so the two
  # cluster sizes have to add up to the requested sample size.
  if (length(cluster_size_vec) < 2 || sum(cluster_size_vec[1:2]) != sample_size) {
    stop("The cluster sizes should add up to the sample size.")
  }

  n_curve <- cluster_size_vec[1]
  n_blob <- cluster_size_vec[2]

  # Curvilinear cluster: an arc in (x1, x2) with small Gaussian jitter
  # centred at 10; x3 is centred at 10 and x4 at -10.
  theta <- stats::runif(n_curve, 0.20, 0.60 * pi)
  curve_x1 <- cos(theta) + stats::rnorm(n_curve, 10, 0.03)
  curve_x2 <- sin(theta) + stats::rnorm(n_curve, 10, 0.03)
  curve_x3 <- stats::rnorm(n_curve, 10, 0.03)
  curve_x4 <- -stats::rnorm(n_curve, 10, 0.03)

  df1 <- tibble::tibble(x1 = curve_x1, x2 = curve_x2, x3 = curve_x3, x4 = curve_x4)

  # Gaussian cluster with the same centres and a slightly larger spread
  blob_x1 <- stats::rnorm(n_blob, 10, 0.05)
  blob_x2 <- stats::rnorm(n_blob, 10, 0.05)
  blob_x3 <- stats::rnorm(n_blob, 10, 0.05)
  blob_x4 <- -stats::rnorm(n_blob, 10, 0.05)

  df2 <- tibble::tibble(x1 = blob_x1, x2 = blob_x2, x3 = blob_x3, x4 = blob_x4)

  df <- dplyr::bind_rows(df1, df2)

  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise columns are sign-flipped
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}
# Dataset 67: cluster sizes 500 and 1000 (1500 points in total) with three
# noise dimensions, displayed with langevitour().
dataset67 <- cluster_and_curvilinear_with_noise(sample_size = 1500, cluster_size_vec = c(500, 1000), with_seed = NULL, num_of_noise_dim = 3,
    min_noise = -0.05, max_noise = 0.05)


langevitour(dataset67)

Dataset 68 (add)

## To generate two copies of a 2-d grid, the second shifted by 3 in both
## coordinates, with additional uniform noise dimensions
## sample_size: Total number of points; must be even and sample_size / 2 must
##              be a perfect square (its square root is passed to
##              snedata::grid_data() as `n`)
## with_seed: Optional seed passed to set.seed(); NULL leaves the RNG untouched
## num_of_noise_dim: Number of noise columns appended after x1, x2 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise columns

## output: list(df = generated data, sample_size = number of rows of df)

one_grid_diff <- function(sample_size = 200, with_seed = NULL, num_of_noise_dim = 2,
                                         min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  if ((sample_size %% 2) != 0) {
    stop("The sample size should be a product of two.")
  }

  # Each grid holds half of the points, so that half needs an integer root
  n_value <- sqrt(sample_size / 2)
  if ((n_value %% 1) != 0) {
    stop("The square root should exists.")
  }

  # Base grid: keep only the coordinates and rename them to x1, x2
  grid_df <- snedata::grid_data(n = n_value)
  grid_df <- dplyr::select(grid_df, -color)
  names(grid_df) <- paste0("x", 1:2)

  # Stack the grid with a copy shifted by 3 in both coordinates
  df <- dplyr::bind_rows(grid_df, grid_df + 3)

  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise columns are sign-flipped
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  list(df = df, sample_size = NROW(df))

}
# Dataset 68: 512 points requested (512 / 2 = 256 = 16^2) with two noise
# dimensions. The generator returns a list, so the row count is printed from
# `sample_size` and the data in `df` are displayed with langevitour().
dataset68 <- one_grid_diff(sample_size = 512, with_seed = NULL, num_of_noise_dim = 2,
                                         min_noise = -0.05, max_noise = 0.05)

dataset68$sample_size
langevitour(dataset68$df)

Dataset 69 (clustering)

## To generate two curved branches and one Gaussian cluster in 4-d with
## additional uniform noise dimensions
## sample_size: Total number of points; must equal the sum of the first three
##              entries of cluster_size_vec
## cluster_size_vec: Sizes of the first branch ([1]), the Gaussian cluster ([2])
##                   and the second branch ([3])
## with_seed: Optional seed passed to set.seed(); NULL leaves the RNG untouched
## num_of_noise_dim: Number of noise columns appended after x1..x4 (0 is allowed)
## min_noise, max_noise: Bounds passed to runif() for the noise columns

## output: df: tibble with rows ordered first branch, second branch, Gaussian
##             cluster, and columns x1..x4 followed by the noise columns

curvy_branching_cluster <- function(sample_size = 200, cluster_size_vec = c(50, 100, 50), with_seed = NULL, num_of_noise_dim = 6,
                                                   min_noise = -0.05, max_noise = 0.05) {
  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  # The noise columns must have one value per generated row, so the three
  # cluster sizes have to add up to the requested sample size.
  if (length(cluster_size_vec) < 3 || sum(cluster_size_vec[1:3]) != sample_size) {
    stop("The cluster sizes should add up to the sample size.")
  }

  n_branch_a <- cluster_size_vec[1]
  n_blob <- cluster_size_vec[2]
  n_branch_b <- cluster_size_vec[3]

  # First branch: (x1, x2) and (x3, x4) both follow the same arc, each
  # coordinate with its own Gaussian jitter centred at 1.
  theta <- stats::runif(n_branch_a, 0.20, 0.90 * pi)

  df1 <- tibble::tibble(
    x1 = cos(theta) + stats::rnorm(n_branch_a, 1, 0.06),
    x2 = sin(theta) + stats::rnorm(n_branch_a, 1, 0.06),
    x3 = cos(theta) + stats::rnorm(n_branch_a, 1, 0.06),
    x4 = sin(theta) + stats::rnorm(n_branch_a, 1, 0.06)
  )

  # Second branch: same construction with the angle negated
  theta1 <- stats::runif(n_branch_b, 0.20, 0.90 * pi)

  df2 <- tibble::tibble(
    x1 = cos(-theta1) + stats::rnorm(n_branch_b, 1, 0.06),
    x2 = sin(-theta1) + stats::rnorm(n_branch_b, 1, 0.06),
    x3 = cos(-theta1) + stats::rnorm(n_branch_b, 1, 0.06),
    x4 = sin(-theta1) + stats::rnorm(n_branch_b, 1, 0.06)
  )

  # Gaussian cluster centred at 1 in every coordinate
  df3 <- tibble::tibble(
    x1 = stats::rnorm(n_blob, mean = 1, sd = 0.08),
    x2 = stats::rnorm(n_blob, mean = 1, sd = 0.08),
    x3 = stats::rnorm(n_blob, mean = 1, sd = 0.08),
    x4 = stats::rnorm(n_blob, mean = 1, sd = 0.08)
  )

  df <- dplyr::bind_rows(df1, df2, df3)

  if (num_of_noise_dim > 0) {
    n_rows <- NROW(df)
    noise_cols <- lapply(seq_len(num_of_noise_dim), function(j) {
      draw <- stats::runif(n_rows, min = min_noise, max = max_noise)
      # Odd-numbered noise columns are sign-flipped
      if ((j %% 2) == 0) draw else (-1) * draw
    })
    names(noise_cols) <- paste0("x", NCOL(df) + seq_len(num_of_noise_dim))
    df <- dplyr::bind_cols(df, tibble::as_tibble(noise_cols))
  }

  df

}
# Dataset 69: cluster sizes 50, 100 and 50 (200 points in total) with six
# noise dimensions, displayed with langevitour().
dataset69 <- curvy_branching_cluster(sample_size = 200, cluster_size_vec = c(50, 100, 50), with_seed = NULL, num_of_noise_dim = 6, min_noise = -0.05, max_noise = 0.05)


langevitour(dataset69)

Dataset 70 (clustering)

## To generate Gaussian and non-Gaussian clusters with a different number of
## points per cluster
## sample_size: Not used by the generator; the number of rows is determined
##              by cluster_size_vec
## with_seed: Optional seed passed to set.seed(); NULL leaves the RNG untouched
## cluster_size_vec: Cluster sizes; the first num_gussian_clusters entries
##                   belong to the Gaussian clusters, the following
##                   num_non_gaussian_clusters entries to the non-Gaussian ones
## num_gussian_clusters: Number of Gaussian clusters
## num_non_gaussian_clusters: Number of non-Gaussian clusters
## cluster_sd_gau: Standard deviation of the Gaussian clusters
## cluster_sd_non_gau: Standard deviation used in the dimensions of a
##                     non-Gaussian cluster that do not carry the curved shape
## num_dims: Number of dimensions
## a, b: Scale (as sqrt(a)) and offset of the curved coordinate
##       sqrt(a) * rho * cos(phi) + b

## output: df: tibble with columns x1, ..., x(num_dims); Gaussian clusters
##             first, then the non-Gaussian clusters

clusters_different_shapes_diff_num_points <- function(sample_size = 400, with_seed = NULL, cluster_size_vec = c(50, 50, 50, 50, 100, 100), num_gussian_clusters = 4, num_non_gaussian_clusters = 2,
                                      cluster_sd_gau = 0.05, cluster_sd_non_gau = 0.1, num_dims = 7, a = 2, b = 4) {

  # Seed the RNG only when a seed was supplied
  if (!is.null(with_seed)) {
    set.seed(with_seed)
  }

  num_clusters <- num_gussian_clusters + num_non_gaussian_clusters

  if (length(cluster_size_vec) < num_clusters) {
    stop("There is not enough cluster sizes for clusters.")
  }

  # All 0/1 combinations over the dimensions; one row is sampled per cluster
  values <- c(0, 1)
  mean_val_grid <- tidyr::expand_grid(!!!stats::setNames(rep(list(values), num_dims),
                                                         paste0("mean_dim", seq_len(num_dims))))

  # Rows for the Gaussian clusters are sampled first, then the rows for the
  # non-Gaussian clusters.
  mean_val_grid_gau <- dplyr::slice_sample(mean_val_grid, n = num_gussian_clusters)
  mean_val_grid_non_gau <- dplyr::slice_sample(mean_val_grid, n = num_non_gaussian_clusters)

  column_names <- paste0("x", seq_len(num_dims))

  ## Generate Gaussian clusters
  gau_list <- lapply(seq_len(num_gussian_clusters), function(i) {
    n_points <- cluster_size_vec[i]
    cluster_means <- unlist(mean_val_grid_gau[i, ], use.names = FALSE)
    cluster_cols <- lapply(seq_len(num_dims), function(j) {
      stats::rnorm(n_points, mean = cluster_means[j], sd = cluster_sd_gau)
    })
    names(cluster_cols) <- column_names
    tibble::as_tibble(cluster_cols)
  })

  ## Generate non-Gaussian clusters
  non_gau_list <- lapply(seq_len(num_non_gaussian_clusters), function(i) {
    # Sizes of the non-Gaussian clusters follow those of the Gaussian ones
    n_points <- cluster_size_vec[num_gussian_clusters + i]

    phi <- stats::runif(n_points, max = 2 * pi)
    rho <- sqrt(stats::runif(n_points))

    # A 1 marks a dimension that carries the curved coordinate
    presence_of_elipse_cluster <- unlist(mean_val_grid_non_gau[i, ], use.names = FALSE)

    cluster_cols <- lapply(seq_len(num_dims), function(j) {
      if (presence_of_elipse_cluster[j] == 1) {
        sqrt(a) * rho * cos(phi) + b
      } else {
        stats::rnorm(n_points, mean = 0, sd = cluster_sd_non_gau)
      }
    })
    names(cluster_cols) <- column_names
    tibble::as_tibble(cluster_cols)
  })

  dplyr::bind_rows(gau_list, non_gau_list)

}
# Dataset 70: four Gaussian and two non-Gaussian clusters in seven
# dimensions; the six requested cluster sizes add up to 1500. The result is
# displayed with langevitour().
dataset70 <- clusters_different_shapes_diff_num_points(sample_size = 1500, with_seed = NULL, cluster_size_vec = c(250, 150, 150, 150, 350, 450), num_gussian_clusters = 4, num_non_gaussian_clusters = 2,
                                      cluster_sd_gau = 0.05, cluster_sd_non_gau = 0.1, num_dims = 7, a = 2, b = 4)


langevitour(dataset70)

Dataset 71 (add)

# Dataset 71: output of sphere() (radius 1, resolution 20, three noise
# dimensions) converted to a data frame and displayed with langevitour().
sphere_df <- sphere(radius = 1, resolution = 20, num_noise_dims = 3, 
                    min_noise = -0.05, max_noise = 0.05) |> as.data.frame()

langevitour(sphere_df)


Try the cardinalR package in your browser

Any scripts or data that you put into this service are public.

cardinalR documentation built on May 29, 2024, 4:37 a.m.