Nothing
#' Generate Small Spheres Within a Big Sphere
#'
#' This function generates a dataset representing a structure with a small and big spheres.
#'
#' @param n A numeric vector (default: c(1000, 100)) representing the sample sizes of the big and small spheres respectively.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @param k A numeric value (default: 3) representing the number of small spheres.
#' @param r A numeric vector (default: c(15, 3)) representing the radius of the big and small spheres respectively.
#' @param loc A numeric value (default: 10 / sqrt(3) representing how far the small spheres are placed from each other.
#' @return A data containing small spheres within a big sphere.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' sphere_data <- gen_clusteredSpheres(n = 500, p = 4, r = 1)
#' head(sphere_data, 5)
gen_clusteredSpheres <- function(k = 3, p = 4, n = c(1000, 100), r = c(15, 3),
loc = 10 / sqrt(3)) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly two values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
if (any(r < 0)) {
cli::cli_abort("Values in r should be positive.")
}
n_big <- n[1]
n_small <- n[2]
r_big <- r[1]
r_small <- r[2]
d_dim_sphere <- gen_unifSphere(n_small, p, r_small)
small_spheres <- lapply(seq_len(k), function(i) {
center <- stats::rnorm(p, sd = loc)
sweep(d_dim_sphere, 2, center, "+")
})
big_sphere <- gen_unifSphere(n_big, p, r_big)
small_labeled <- lapply(seq_along(small_spheres), function(i) {
cbind(small_spheres[[i]], cluster = paste0("small_", i))
})
big_labeled <- cbind(big_sphere, cluster = "big")
df <- dplyr::bind_rows(c(small_labeled, list(big_labeled))) |>
tibble::as_tibble()
names(df) <- append(paste0("x", 1:p), "cluster")
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Gaussian cluster with the Mobius Cluster
#'
#' This function generates a dataset consisting of a mobius cluster and Gaussian cluster.
#'
#' @param n A numeric vector (default: c(200, 100)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing the mobius cluster and Gaussian cluster.
#' @export
#'
#' @examples
#' data <- gen_mobiusGau(n = c(200, 100), p = 4)
gen_mobiusGau <- function(n = c(200, 100), p = 4) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly two values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- gen_mobius(n = n[1], p = p) |>
dplyr::mutate(cluster = "cluster1")
## To add background noise
df2 <- gen_gaussian(n = n[2], p = p, s = diag(p) * 0.01) |>
dplyr::mutate(cluster = "cluster2")
df <- dplyr::bind_rows(df1, df2)
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate overlapped two conic spiral shapes
#'
#' This function generates overlapped tow conic spiral shapes.
#'
#' @param n A numeric vector (default: c(500, 500)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A data containing a two overlapped conic spiral shapes.
#'
#' @examples
#' set.seed(20240412)
#' data <- gen_overlapConicSpirals(n = c(500, 500), p = 4)
#'
#' @export
gen_overlapConicSpirals <- function(n = c(500, 500), p = 4) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- tibble::as_tibble(geozoo::conic.spiral(n = n[1])$points, .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster1")
df2 <- tibble::as_tibble(geozoo::conic.spiral(n = n[2])$points[,c(3, 1, 2)], .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster2")
df <- dplyr::bind_rows(df1, df2)
if (p > 3) {
noise_df <- gen_noisedims(n = NROW(df), p = (p-3), m = rep(0, p-3), s = rep(0.05, p-3))
colnames(noise_df) <- paste0("x", 4:p)
df <- dplyr::bind_cols(df, noise_df) |>
dplyr::select(dplyr::starts_with("x"), "cluster")
}
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate intersected two conic spiral shapes
#'
#' This function generates intersected tow conic spiral shapes.
#'
#' @param n A numeric vector (default: c(500, 500)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A data containing the two intersected conic spiral shapes.
#'
#' @examples
#' set.seed(20240412)
#' data <- gen_intersectConicSpirals(n = c(500, 500), p = 4)
#'
#' @export
gen_intersectConicSpirals <- function(n = c(500, 500), p = 4) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- tibble::as_tibble(geozoo::conic.spiral(n = n[1])$points, .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster1")
df2 <- tibble::as_tibble(geozoo::conic.spiral(n = n[2])$points[,c(2, 1, 3)], .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster2")
df <- dplyr::bind_rows(df1, df2)
if (p > 3) {
noise_df <- gen_noisedims(n = NROW(df), p = (p-3), m = rep(0, p-3), s = rep(0.05, p-3))
colnames(noise_df) <- paste0("x", 4:p)
df <- dplyr::bind_cols(df, noise_df) |>
dplyr::select(dplyr::starts_with("x"), "cluster")
}
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate intersected two dini surface shapes
#'
#' This function generates intersected tow dini surface shapes.
#'
#' @param n A numeric vector (default: c(500, 500)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A data containing the two intersected dini surface shapes.
#'
#' @examples
#' set.seed(20240412)
#' data <- gen_overlapDiniSurface(n = c(500, 500), p = 4)
#' @export
gen_overlapDiniSurface <- function(n = c(500, 500), p = 4) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- tibble::as_tibble(geozoo::dini.surface(n = n[1])$points, .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster1")
df2 <- tibble::as_tibble(geozoo::dini.surface(n = n[2])$points[,c(2, 1, 3)], .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster2")
df <- dplyr::bind_rows(df1, df2)
if (p > 3) {
noise_df <- gen_noisedims(n = NROW(df), p = (p-3), m = rep(0, p-3), s = rep(0.05, p-3))
colnames(noise_df) <- paste0("x", 4:p)
df <- dplyr::bind_cols(df, noise_df) |>
dplyr::select(dplyr::starts_with("x"), "cluster")
}
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate overlapped two dini surface shapes
#'
#' This function generates overlapped tow dini surface shapes.
#'
#' @param n A numeric vector (default: c(500, 500)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A data containing the two overlapped dini surface shapes.
#'
#' @examples
#' set.seed(20240412)
#' data <- gen_intersectDiniSurface(n = c(500, 500), p = 4)
#' @export
gen_intersectDiniSurface <- function(n = c(500, 500), p = 4) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- tibble::as_tibble(geozoo::dini.surface(n = n[1])$points, .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster1")
df2 <- tibble::as_tibble(geozoo::dini.surface(n = n[2])$points[,c(3, 2, 1)], .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster2")
df <- dplyr::bind_rows(df1, df2)
if (p > 3) {
noise_df <- gen_noisedims(n = NROW(df), p = (p-3), m = rep(0, p-3), s = rep(0.05, p-3))
colnames(noise_df) <- paste0("x", 4:p)
df <- dplyr::bind_cols(df, noise_df) |>
dplyr::select(dplyr::starts_with("x"), "cluster")
}
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate two roman surface shaped clusters
#'
#' This function generates tow roman surface shaped clusters
#'
#' @param n A numeric vector (default: c(500, 500)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A data containing the tow roman surface shaped clusters.
#'
#' @examples
#' set.seed(20240412)
#' data <- gen_clustRomanSurface(n = c(500, 500), p = 4)
#' @export
gen_clustRomanSurface <- function(n = c(500, 500), p = 4) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- tibble::as_tibble(geozoo::roman.surface(n = n[1])$points + 1, .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster1")
df2 <- tibble::as_tibble(geozoo::roman.surface(n = n[2])$points[,c(3, 1, 2)], .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster2")
df <- dplyr::bind_rows(df1, df2)
if (p > 3) {
noise_df <- gen_noisedims(n = NROW(df), p = (p-3), m = rep(0, p-3), s = rep(0.05, p-3))
colnames(noise_df) <- paste0("x", 4:p)
df <- dplyr::bind_cols(df, noise_df) |>
dplyr::select(dplyr::starts_with("x"), "cluster")
}
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate overlapped two torus shapes
#'
#' This function generates overlapped tow torus shapes.
#'
#' @param n A numeric vector (default: c(500, 500)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A data containing the two overlapped torus shapes.
#'
#' @examples
#' set.seed(20240412)
#' data <- gen_overlapTorus(n = c(500, 500), p = 4)
#'
#' @export
gen_overlapTorus <- function(n = c(500, 500), p = 4) {
if (p < 3) {
cli::cli_abort("p should be greater than 3.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- tibble::as_tibble(geozoo::torus(p = 3, n = n[1])$points, .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster1")
df2 <- tibble::as_tibble(geozoo::torus(p = 3, n = n[2])$points[,c(3, 1, 2)], .name_repair = "minimal") |>
rlang::set_names(paste0("x", 1:3)) |>
dplyr::mutate(cluster = "cluster2")
df <- dplyr::bind_rows(df1, df2)
if (p > 3) {
noise_df <- gen_noisedims(n = NROW(df), p = (p-3), m = rep(0, p-3), s = rep(0.05, p-3))
colnames(noise_df) <- paste0("x", 4:p)
df <- dplyr::bind_cols(df, noise_df) |>
dplyr::select(dplyr::starts_with("x"), "cluster")
}
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Curvy Branching Clusters with Noise
#'
#' This function generates data with curvy branching clusters along with added noise.
#'
#' @param n A numeric vector (default: c(300, 200)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A matrix containing the generated data, with each row representing a data point.
#' @export
#'
#' @examples
#'
#' # Generate curvy branching clusters with noise with custom parameters
#' set.seed(20240412)
#' data <- gen_twoCurvy(n = c(300, 200), p = 4)
gen_twoCurvy <- function(n = c(300, 200), p = 4) {
if (p < 4) {
cli::cli_abort("p should be 4 or greater.")
}
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
theta1 <- stats::runif(n[1], 0.20, 0.90 * pi)
df1 <- tibble::tibble(
x1 = cos(theta1) + stats::rnorm(n[1], 1, 0.06),
x2 = sin(theta1) + stats::rnorm(n[1], 1, 0.06),
x3 = cos(theta1) + stats::rnorm(n[1], 1, 0.06),
x4 = sin(theta1) + stats::rnorm(n[1], 1, 0.06),
cluster = "cluster1"
)
theta2 <- stats::runif(n[2], 0.20, 0.90 * pi)
df2 <- tibble::tibble(
x1 = cos(-theta2) + stats::rnorm(n[2], 1, 0.06),
x2 = sin(-theta2) + stats::rnorm(n[2], 1, 0.06),
x3 = cos(-theta2) + stats::rnorm(n[2], 1, 0.06),
x4 = sin(-theta2) + stats::rnorm(n[2], 1, 0.06),
cluster = "cluster2"
)
df <- dplyr::bind_rows(df1, df2)
if (p > 5) {
noise_df <- gen_noisedims(n = NROW(df), p = (p-4), m = rep(0, p-4), s = rep(0.05, p-4))
colnames(noise_df) <- paste0("x", 5:p)
df <- dplyr::bind_cols(df, noise_df) |>
dplyr::select(dplyr::starts_with("x"), "cluster")
}
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Curvy Branching Cluster Data
#'
#' This function generates two curvy clusters and one Gaussian cluster in the middle.
#'
#' @param n A numeric vector (default: c(200, 200, 100)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing two curvy clusters and one Gaussian cluster.
#' @export
#'
#' @examples
#'
#' # Generate curvy branching cluster data with custom parameters
#' set.seed(20240412)
#' data <- gen_twoCurvyGau(n = c(200, 200, 100), p = 4)
gen_twoCurvyGau <- function(n = c(200, 200, 100), p = 4) {
if (p < 4) {
cli::cli_abort("p should be 4 or greater.")
}
if (length(n) != 3) {
cli::cli_abort("n should contain exactly 3 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- gen_twoCurvy(n = n[1:2], p = p)
df2 <- gen_gaussian(n = n[3], p = p, s = diag(4) * 0.01) |>
dplyr::mutate(cluster = "cluster3")
df <- dplyr::bind_rows(df1, df2)
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Curvy Branching Cluster Data with Background Noise
#'
#' This function generates data with two curvy clusters and one Gaussian cluster with background noise.
#'
#' @param n A numeric vector (default: c(200, 200, 100, 50)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing two curvy clusters and one Gaussian cluster with background noise.
#' @export
#'
#' @examples
#' # Generate curvy branching cluster data with background noise with custom parameters
#' set.seed(20240412)
#' data <- gen_twoCurvyGauBkg(n = c(200, 200, 100, 50), p = 4)
gen_twoCurvyGauBkg <- function(n = c(200, 200, 100, 50), p = 4) {
if (p < 4) {
cli::cli_abort("p should be 4 or greater.")
}
if (length(n) != 4) {
cli::cli_abort("n should contain exactly 4 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- gen_twoCurvyGau(n = n[1:3], p = p)
df2 <- gen_bkgnoise(n = n[4], p = p, m = rep(1, p), s = rep(1, p)) |>
dplyr::mutate(cluster = "bkg_noise")
df <- dplyr::bind_rows(df1, df2)
## Swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate One Grid with Background Noise
#'
#' This function generates a grid data and background noise.
#'
#' @param n A numeric vector (default: c(10, 10)) representing the number of grid points along each axes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing one grid data with background noise.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' one_grid_bkg <- gen_oneGridBkg(n = c(10, 10), p = 4)
gen_oneGridBkg <- function(n = c(10, 10), p = 4) {
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- gen_gridcube(n = n, p = 2)
noise_df <- gen_noisedims(n = n[1] * n[2], p = (p-2), m = rep(0, p-2), s = rep(0.05, p-2))
names(noise_df) <- paste0("x", 3:p)
df1 <- dplyr::bind_cols(df1, noise_df)
df2 <- gen_bkgnoise(n = NROW(df1) * 0.3, p = p, m = c(0, 0, 0, 0), s = c(2, 2, 2, 2))
df <- dplyr::bind_rows(df1, df2)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Two Grids with Different Offset
#'
#' This function generates two grids with an offset.
#'
#' @param n A numeric vector (default: c(10, 10)) representing the number of grid points along each axes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing two grids data.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' two_grid_comb <- gen_twoGrids(n = c(10, 10), p = 4)
gen_twoGrids <- function(n = c(10, 10), p = 4) {
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- gen_gridcube(n = n, p = 2)
df2 <- df1 + 3
noise_df <- gen_noisedims(n = n[1] * n[2], p = (p-2), m = rep(0, p-2), s = rep(0.05, p-2))
names(noise_df) <- paste0("x", 3:p)
df1 <- dplyr::bind_cols(df1, noise_df)
noise_df <- gen_noisedims(n = n[1] * n[2], p = (p-2), m = rep(0, p-2), s = rep(0.05, p-2))
names(noise_df) <- paste0("x", 3:p)
df2 <- dplyr::bind_cols(df2, noise_df)
df <- dplyr::bind_rows(df1, df2)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Two Grids with Background Noise
#'
#' This function generates two grids with background noise.
#'
#' @param n A numeric vector (default: c(10, 10)) representing the number of grid points along each axes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing two grids data with background noise.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' two_grid_comb_bkg <- gen_twoGridsBkg(n = c(10, 10), p = 4)
gen_twoGridsBkg <- function(n = c(10, 10), p = 4) {
if (length(n) != 2) {
cli::cli_abort("n should contain exactly 2 values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df1 <- gen_gridcube(n = n, p = 2)
df3 <- df1 + 5
noise_df <- gen_noisedims(n = n[1] * n[2], p = (p-2), m = rep(0, p-2), s = rep(0.05, p-2))
names(noise_df) <- paste0("x", 3:p)
df1 <- dplyr::bind_cols(df1, noise_df)
noise_df <- gen_noisedims(n = n[1] * n[2], p = (p-2), m = rep(0, p-2), s = rep(0.05, p-2))
names(noise_df) <- paste0("x", 3:p)
df3 <- dplyr::bind_cols(df3, noise_df)
df1 <- dplyr::bind_rows(df1, df3)
df2 <- gen_bkgnoise(n = NROW(df1) * 0.1, p = p, m = c(0, 0, 0, 0), s = c(2, 2, 2, 2))
df <- dplyr::bind_rows(df1, df2)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Long Cluster Data
#'
#' This function generates a dataset consisting of any number of long clusters.
#'
#' @param n A numeric vector (default: c(200, 500, 300)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @param k A numeric value (default: 3) representing the number of clusters.
#' @return A data containing the long cluster data.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' long_cluster <- gen_longClust(n = c(200, 500, 300), p = 4, k = 3)
gen_longClust <- function(n = c(200, 500, 300), p = 4, k = 3) {
if (k < 2) {
cli::cli_abort("k should be greater than 2.")
}
if (p < 2) {
cli::cli_abort("p should be greater than 2.")
}
if (length(n) != k) {
cli::cli_abort("n should contain exactly {.val {k}} values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df <- tibble::tibble()
for (j in 1:k) {
df1 <- gen_longLinear(n = n[j], p = 4) |>
dplyr::mutate(cluster = paste0("cluster", j))
df <- dplyr::bind_rows(df, df1)
}
## To swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Overlapped Any number of Curvy Circle Clusters
#'
#' This function generates a dataset representing a structure with any number of overlapped curvy circles.
#'
#' @param n A numeric vector (default: c(200, 500, 300)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @param k A numeric value (default: 3) representing the number of clusters.
#' @return A data containing the any number of overlapped curvy circle clusters.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' curvy_cell_cycle_data <- gen_overlappedCurvyCycle(n = c(200, 500, 300), p = 4, k = 3)
gen_overlappedCurvyCycle <- function(n = c(200, 500, 300), p = 4, k = 3) {
if (k < 2) {
cli::cli_abort("k should be greater than 2.")
}
if (p < 2) {
cli::cli_abort("p should be greater than 2.")
}
if (length(n) != k) {
cli::cli_abort("n should contain exactly {.val {k}} values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df <- tibble::tibble()
for (i in 1:k) {
shift_vec <- append(sample(seq(-0.5, 0.5, 0.2), 3), rep(0, p-3))
scale_vec <- append(sample(seq(-0.5, 0.5, 0.2), 3), rep(1, p-3))
df3 <- gen_curvycycle(n[i], p = p) |>
purrr::map2(scale_vec, ~ .x * .y) |>
dplyr::bind_cols() |>
purrr::map2(shift_vec, ~ .x * .y) |>
dplyr::bind_cols()
df3 <- df3|>
stats::setNames(names(df3)) |>
tibble::as_tibble() |>
dplyr::mutate(cluster = paste0("cluster", i))
df <- dplyr::bind_rows(df, df3)
}
## To swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Shited Any number of 2-D Circle Clusters
#'
#' This function generates a dataset representing a structure with any number of shifted circles.
#'
#' @param n A numeric vector (default: c(200, 500, 300)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @param k A numeric value (default: 3) representing the number of clusters.
#' @return A data containing the any number of shifted circle clusters.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' circular_clusters_data <- gen_shiftedCircleClusts(n = c(200, 500, 300), p = 4, k = 3)
gen_shiftedCircleClusts <- function(n = c(200, 500, 300), p = 4, k = 3) {
if (k < 2) {
cli::cli_abort("k should be greater than 2.")
}
if (p < 2) {
cli::cli_abort("p should be greater than 2.")
}
if (length(n) != k) {
cli::cli_abort("n should contain exactly {.val {k}} values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
df <- tibble::tibble()
for (i in 1:k) {
## Generate scale factors for circles
scale_factors_vec <- stats::runif(1, 0, 2)
df3 <- gen_circle(n[i], p = p) |>
dplyr::mutate(across(where(is.numeric), ~ .x * scale_factors_vec)) |>
dplyr::mutate(cluster = paste0("cluster", i))
df <- dplyr::bind_rows(df, df3)
}
## To swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Overlapped Any number of Circle Clusters
#'
#' This function generates a dataset representing a structure with any number of overlapped circles.
#'
#' @param n A numeric vector (default: c(200, 500, 300)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @param k A numeric value (default: 3) representing the number of clusters.
#' @return A data containing the any number of overlapped circle clusters.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' cell_cycle_data <- gen_overlappedCircleClusts(n = c(200, 500, 300), p = 4, k = 3)
gen_overlappedCircleClusts <- function(n = c(200, 500, 300), p = 4, k = 3) {
if (k < 2) {
cli::cli_abort("k should be greater than 2.")
}
if (p < 2) {
cli::cli_abort("p should be greater than 2.")
}
if (length(n) != k) {
cli::cli_abort("n should contain exactly {.val {k}} values.")
}
if (any(n < 0)) {
cli::cli_abort("Values in n should be positive.")
}
r1 <- 2
r2 <- 1
perms <- gtools::permutations(n = p, r = p)
num_perms <- NROW(perms)
# Ensure we don't try to sample more permutations than available
selected_permute <- sample(1:num_perms, k, replace = FALSE)
# Generate k datasets with swapped columns and varying 'n'
swapped_datasets_varying_n <- lapply(1:k, function(i) {
scale_factors <- append(c(2, 1), rep(1, p-2))
generated_tibble <- gen_circle(n[i], p = p) |>
purrr::map2(scale_factors, ~ .x * .y) |>
dplyr::bind_cols()
generated_tibble <- generated_tibble|>
stats::setNames(names(generated_tibble)) |>
tibble::as_tibble()
# Permute the columns of the tibble
perm_indices <- perms[selected_permute[i], ]
df <- generated_tibble[, perm_indices]
df <- df |>
dplyr::mutate(cluster = paste0("cluster", i))
names(df) <- append(paste0("x", 1:p), "cluster") # Ensure consistent column names
df
})
df <- dplyr::bind_rows(swapped_datasets_varying_n)
## To swap rows
df <- randomize_rows(df)
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Long Cluster Data
#'
#' This function generates a dataset consisting of two long clusters.
#'
#' @param n A numeric vector (default: c(200, 300)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing the long cluster data.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' long_cluster <- gen_twoLongClusts(n = c(200, 300), p = 4)
gen_twoLongClusts <- function(n = c(200, 300), p = 4) {
if (p < 4) {
stop(cli::cli_alert_danger("p should be 4 or greater."))
}
if (length(n) != 2) {
stop(cli::cli_alert_danger("n should contain exactly 2 values."))
}
if (any(n < 0)) {
stop(cli::cli_alert_danger("Values in n should be positive."))
}
x1 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1])
x2 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1])
x3 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1])
x4 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1])
df1 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x2 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
x3 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x4 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
df2 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
df <- dplyr::bind_rows(df1, df2)
if (p > 4) {
cli::cli_alert_info("Adding noise dimensions to reach the desired dimensionality.")
noise_mat <- gen_noise_dims(
n = NROW(df), num_noise = p - 4,
min_n = -0.5, max_n = 0.5
)
colnames(noise_mat) <- paste0("x", 5:p)
df <- dplyr::bind_cols(df, noise_mat)
}
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Three Different Linear Data
#'
#' This function generates a dataset consisting of three different linear clusters.
#'
#' @param n A numeric vector (default: c(200, 300, 150)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing the three different linear data.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' three_diff_linear <- gen_threeAngledLongClusts(n = c(200, 300, 150), p = 4)
gen_threeAngledLongClusts <- function(n = c(200, 300, 150), p = 4) {
if (p < 4) {
stop(cli::cli_alert_danger("p should be 4 or greater."))
}
if (length(n) != 3) {
stop(cli::cli_alert_danger("n should contain exactly 3 values."))
}
if (any(n < 0)) {
stop(cli::cli_alert_danger("Values in n should be positive."))
}
x1 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 150
x2 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 20
x3 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1])
x4 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 65
df1 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x2 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
x3 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x4 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
df2 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- -(0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]))
x2 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3])
x3 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3])
x4 <- -(0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]))
df3 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
df <- dplyr::bind_rows(df1, df2, df3)
if (p > 4) {
cli::cli_alert_info("Adding noise dimensions to reach the desired dimensionality.")
noise_mat <- gen_noise_dims(
n = NROW(df), num_noise = p - 4,
min_n = -0.5, max_n = 0.5
)
colnames(noise_mat) <- paste0("x", 5:p)
df <- dplyr::bind_cols(df, noise_mat)
}
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Four Different Long Clusters
#'
#' This function generates a dataset consisting of four different long clusters.
#'
#' @param n A numeric vector (default: c(200, 150, 300, 150)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#' @return A data containing the four different long clusters.
#' @export
#'
#' @examples
#' set.seed(20240412)
#' four_diff_long_clusters <- gen_fourLongClusts(n = c(200, 150, 300, 150), p = 4)
gen_fourLongClusts <- function(n = c(200, 150, 300, 150), p = 4) {
if (p < 4) {
stop(cli::cli_alert_danger("p should be 4 or greater."))
}
if (length(n) != 4) {
stop(cli::cli_alert_danger("n should contain exactly 4 values."))
}
if (any(n < 0)) {
stop(cli::cli_alert_danger("Values in n should be positive."))
}
x1 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 150
x2 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1])
x3 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 20
x4 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 85
df1 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x2 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
x3 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x4 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
df2 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]) - 70
x2 <- -(0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]))
x3 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3])
x4 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]) - 85
df3 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- 0:(n[4] - 1) + 0.03 * n[4] * stats::rnorm(n[4]) - 70
x2 <- -(0:(n[4] - 1) + 0.03 * n[4] * stats::rnorm(n[4])) + 150
x3 <- 0:(n[4] - 1) + 0.03 * n[4] * stats::rnorm(n[4])
x4 <- 0:(n[4] - 1) + 0.03 * n[4] * stats::rnorm(n[4]) + 85
df4 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
df <- dplyr::bind_rows(df1, df2, df3, df4)
if (p > 4) {
cli::cli_alert_info("Adding noise dimensions to reach the desired dimensionality.")
noise_mat <- gen_noise_dims(
n = NROW(df), num_noise = p - 4,
min_n = -0.5, max_n = 0.5
)
colnames(noise_mat) <- paste0("x", 5:p)
df <- dplyr::bind_cols(df, noise_mat)
}
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
#' Generate Three Linear Clusters
#'
#' This function generates data with three linear clusters.
#'
#' @param n A numeric vector (default: c(200, 300, 150)) representing the sample sizes.
#' @param p A numeric value (default: 4) representing the number of dimensions.
#'
#' @return A data containing the three long clusters.
#' @export
#'
#' @examples
#'
#' # Generate three linear clusters with noise with custom parameters
#' set.seed(20240412)
#' data <- gen_threeLongClusts(n = c(200, 300, 150), p = 4)
gen_threeLongClusts <- function(n = c(200, 300, 150), p = 4) {
if (p < 4) {
stop(cli::cli_alert_danger("p should be 4 or greater."))
}
if (length(n) != 3) {
stop(cli::cli_alert_danger("n should contain exactly 3 values."))
}
if (any(n < 0)) {
stop(cli::cli_alert_danger("Values in n should be positive."))
}
x1 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) + 100
x2 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 100
x3 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) - 100
x4 <- 0:(n[1] - 1) + 0.03 * n[1] * stats::rnorm(n[1]) + 100
df1 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x2 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
x3 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) + n[2] / 5
x4 <- 0:(n[2] - 1) + 0.03 * n[2] * stats::rnorm(n[2]) - n[2] / 5
df2 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
x1 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]) - 10
x2 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]) + 10
x3 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]) - 10
x4 <- 0:(n[3] - 1) + 0.03 * n[3] * stats::rnorm(n[3]) + 10
df3 <- tibble::tibble(x1 = x1,
x2 = x2,
x3 = x3,
x4 = x4)
df <- dplyr::bind_rows(df1, df2, df3)
if (p > 4) {
cli::cli_alert_info("Adding noise dimensions to reach the desired dimensionality.")
noise_mat <- gen_noise_dims(
n = NROW(df), num_noise = p - 4,
min_n = -0.5, max_n = 0.5
)
colnames(noise_mat) <- paste0("x", 5:p)
df <- dplyr::bind_cols(df, noise_mat)
}
cli::cli_alert_success("Data generation completed successfully! 🎉")
return(df)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.