Nothing
## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.align='center',
dpi = 92,
fig.retina = 2
)
options(tibble.print_min = 4L, tibble.print_max = 4L)
## ----eval=FALSE---------------------------------------------------------------
# # Uncomment:
# # install.packages("groupdata2")
## ----eval=FALSE---------------------------------------------------------------
# # Uncomment:
# # install.packages("devtools")
# # devtools::install_github("LudvigOlsen/groupdata2")
## ----error=FALSE, message=FALSE, warning=FALSE--------------------------------
# Attaching groupdata2
library(groupdata2)
# Attaching other packages used in this vignette
library(dplyr)
library(tidyr)
require(ggplot2, quietly = TRUE) # Attach if installed
library(knitr)
# We will also be using plyr a few times, but we don't attach this
# because of possible conflicts with dplyr. Instead we use its functions
# like so: plyr::count()
## -----------------------------------------------------------------------------
df <- data.frame(
"x" = c(1:12),
"species" = factor(rep(c('cat', 'pig', 'human'), 4)),
"age" = sample(c(1:100), 12)
)
## -----------------------------------------------------------------------------
groups <- group_factor(df, 5, method = 'n_dist')
groups
df$groups <- groups
df %>% kable(align = 'c')
## -----------------------------------------------------------------------------
df %>%
group_by(groups) %>%
summarize(mean_age = mean(age)) %>%
kable(align = 'c')
## -----------------------------------------------------------------------------
df <- data.frame(
"x" = c(1:12),
"species" = factor(rep(c('cat', 'pig', 'human'), 4)),
"age" = sample(c(1:100), 12)
)
## -----------------------------------------------------------------------------
groups <- group_factor(df, 5, method = 'n_dist', force_equal = TRUE)
groups
plyr::count(groups) %>%
rename(group = x, size = freq) %>%
kable(align = 'c')
## -----------------------------------------------------------------------------
df <- head(df, length(groups)) %>%
mutate(group = groups)
df %>% kable(align = 'c')
## -----------------------------------------------------------------------------
df <- data.frame(
"x" = c(1:12),
"species" = factor(rep(c('cat', 'pig', 'human'), 4)),
"age" = sample(c(1:100), 12)
)
## -----------------------------------------------------------------------------
df_grouped <- group(df, 5, method = 'n_dist')
df_grouped %>% kable(align = 'c')
## -----------------------------------------------------------------------------
df_means <- df %>%
group(5, method = 'n_dist') %>%
summarise(mean_age = mean(age))
df_means %>% kable(align = 'c')
## -----------------------------------------------------------------------------
df <- data.frame(
"x" = c(1:12),
"species" = factor(rep(c('cat', 'pig', 'human'), 4)),
"age" = sample(c(1:100), 12)
)
## -----------------------------------------------------------------------------
df_grouped <- df %>%
group(5, method = 'n_dist', force_equal = TRUE)
df_grouped %>% kable(align = 'c')
## -----------------------------------------------------------------------------
df <- data.frame(
"x" = c(1:12),
"species" = factor(rep(c('cat', 'pig', 'human'), 4)),
"age" = sample(c(1:100), 12)
)
## -----------------------------------------------------------------------------
df_list <- splt(df, 5, method = 'n_dist')
df_list %>% kable(align = 'c')
## -----------------------------------------------------------------------------
v <- c(1:6)
splt(v, 3, method = 'n_dist')
## -----------------------------------------------------------------------------
df <- data.frame(
"x" = c(1:12),
"species" = factor(rep(c('cat', 'pig', 'human'), 4)),
"age" = sample(c(1:100), 12)
)
## -----------------------------------------------------------------------------
df_list <- splt(df, 5, method = 'n_dist', force_equal = TRUE)
df_list %>% kable(align = 'c')
## -----------------------------------------------------------------------------
df <- data.frame(
"participant" = factor(rep(c('1', '2', '3', '4', '5', '6'), 3)),
"age" = rep(sample(c(1:100), 6), 3),
"diagnosis" = factor(rep(c('a', 'b', 'a', 'a', 'b', 'b'), 3)),
"score" = sample(c(1:100), 3 * 6)
)
df <- df %>%
arrange(participant)
# Remove index
rownames(df) <- NULL
# Add session info
df$session <- rep(c('1','2', '3'), 6)
kable(df, align = 'c')
## -----------------------------------------------------------------------------
df_folded <- fold(df, 3, method = 'n_dist')
# Order by folds
df_folded <- df_folded %>%
arrange(.folds)
kable(df_folded, align = 'c')
## -----------------------------------------------------------------------------
df_folded <- fold(df, 3, cat_col = 'diagnosis', method = 'n_dist')
# Order by folds
df_folded <- df_folded %>%
arrange(.folds)
kable(df_folded, align = 'c')
## -----------------------------------------------------------------------------
df_folded %>%
count(.folds, diagnosis) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df_folded <- fold(df, 3, id_col = 'participant', method = 'n_dist')
# Order by folds
df_folded <- df_folded %>%
arrange(.folds)
# Remove index (Looks prettier in the table!)
rownames(df_folded) <- NULL
kable(df_folded, align = 'c')
## -----------------------------------------------------------------------------
df_folded %>%
count(.folds, participant) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df_folded <- fold(df, 3, cat_col = 'diagnosis', id_col = 'participant', method = 'n_dist')
# Order by folds
df_folded <- df_folded %>%
arrange(.folds)
kable(df_folded, align = 'c')
## -----------------------------------------------------------------------------
df_folded %>%
count(.folds, diagnosis, participant) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df <- data.frame(
"participant" = factor(rep(c('1', '2', '3', '4', '5', '6'), 3)),
"age" = rep(sample(c(1:100), 6), 3),
"diagnosis" = factor(rep(c('a', 'b', 'a', 'a', 'b', 'b'), 3)),
"score" = sample(c(1:100), 3 * 6)
)
df <- df %>% arrange(participant)
# Remove index
rownames(df) <- NULL
# Add session info
df$session <- rep(c('1','2', '3'), 6)
kable(df, align = 'c')
## -----------------------------------------------------------------------------
df_partitioned <- partition(df, 0.3, list_out = FALSE)
# Order by partitions
df_partitioned <- df_partitioned %>%
arrange(.partitions)
# Partition Sizes
df_partitioned %>%
count(.partitions) %>%
kable(align = 'c')
kable(df_partitioned, align = 'c')
## -----------------------------------------------------------------------------
df_partitioned <- partition(df, 0.3, cat_col = 'diagnosis', list_out = FALSE)
# Order by partitions
df_partitioned <- df_partitioned %>%
arrange(.partitions)
kable(df_partitioned, align = 'c')
## -----------------------------------------------------------------------------
df_partitioned %>%
count(.partitions, diagnosis) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df_partitioned <- partition(df, 0.5, id_col = 'participant', list_out = FALSE)
# Order by partitions
df_partitioned <- df_partitioned %>%
arrange(.partitions)
kable(df_partitioned, align = 'c')
## -----------------------------------------------------------------------------
df_partitioned %>%
count(.partitions, participant) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df_partitioned <- partition(
data = df,
p = 0.5,
cat_col = 'diagnosis',
id_col = 'participant',
list_out = FALSE
)
# Order by folds
df_partitioned <- df_partitioned %>%
arrange(.partitions)
kable(df_partitioned, align = 'c')
## -----------------------------------------------------------------------------
df_partitioned %>%
count(.partitions, diagnosis, participant) %>%
kable(align='c')
## ----echo=FALSE---------------------------------------------------------------
set.seed(2)
## -----------------------------------------------------------------------------
df <- data.frame(
"participant" = factor(rep(c('1', '2', '3', '4', '5', '6'), 3)),
"age" = rep(sample(c(1:100), 6), 3),
"diagnosis" = factor(rep(c('a', 'b', 'a', 'a', 'b', 'b'), 3)),
"score" = sample(c(1:100), 3 * 6)
)
df <- df %>%
arrange(participant)
# Add session info
df$session <- rep(c('1','2', '3'), 6)
# Sample dataset to get imbalances
df <- df %>%
sample_frac(0.7) %>%
arrange(participant)
# Remove index
rownames(df) <- NULL
# Counts
df %>%
count(diagnosis, participant) %>%
kable(align = 'c')
df %>%
count(diagnosis) %>%
kable(align = 'c')
kable(df, align = 'c')
## -----------------------------------------------------------------------------
df_balanced <- balance(df, "min", cat_col = "diagnosis") %>%
arrange(diagnosis, participant)
# Counts
df_balanced %>%
count(diagnosis) %>%
kable(align = 'c')
kable(df_balanced, align = 'c')
## -----------------------------------------------------------------------------
df_balanced <- balance(df, "min", cat_col = "diagnosis", id_col = "participant", id_method = "n_rows_c") %>%
arrange(diagnosis, participant)
# Partition Sizes
df_balanced %>%
count(diagnosis) %>%
kable(align = 'c')
kable(df_balanced, align = 'c')
## -----------------------------------------------------------------------------
df_balanced %>%
count(diagnosis, participant) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df_balanced <- balance(df, "max", cat_col = "diagnosis") %>%
arrange(diagnosis, participant)
# Counts
df_balanced %>%
count(diagnosis) %>%
kable(align = 'c')
kable(df_balanced, align = 'c')
## -----------------------------------------------------------------------------
df_balanced <- balance(df, "max", cat_col = "diagnosis",
id_col = "participant", id_method = "n_rows_c") %>%
arrange(diagnosis, participant)
# Partition Sizes
df_balanced %>%
count(diagnosis) %>%
kable(align = 'c')
kable(df_balanced, align = 'c')
## -----------------------------------------------------------------------------
df_balanced %>%
count(diagnosis, participant) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df_balanced <- balance(df, 3, cat_col = "diagnosis") %>%
arrange(diagnosis, participant)
# Counts
df_balanced %>%
count(diagnosis) %>%
kable(align = 'c')
kable(df_balanced, align = 'c')
## -----------------------------------------------------------------------------
df_balanced %>%
count(diagnosis, participant) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df_balanced <- balance(df, 3, cat_col = "diagnosis",
id_col = "participant", id_method = "n_rows_c") %>%
arrange(diagnosis, participant)
# Partition Sizes
df_balanced %>%
count(diagnosis) %>%
kable(align = 'c')
kable(df_balanced, align = 'c')
## -----------------------------------------------------------------------------
df_balanced %>%
count(diagnosis, participant) %>%
kable(align='c')
## -----------------------------------------------------------------------------
df <- data.frame(
"x" = c(1:12),
"species" = factor(rep(c('cat', 'pig', 'human'), 4)),
"age" = sample(c(1:100), 12)
)
## -----------------------------------------------------------------------------
groups <- group_factor(df, 5, method = 'n_dist', randomize = TRUE)
groups
## -----------------------------------------------------------------------------
df_list <- splt(df, 5, method = 'n_dist', randomize = TRUE)
df_list %>% kable(align = 'c')
## ----echo=FALSE, eval=requireNamespace("ggplot2")-----------------------------
#
# Examples to show difference between methods
# This could be made interactive! This way you could test what happens in different situations by
# by simply moving a slider!
#
vec <- c(1:57)
n <- 6
if (exists ('n_meth_v57n6')){
rm(n_meth_v57n6)
}
for (meth in c('n_dist', 'n_fill' ,'n_last','n_rand')){
data_temp <- data.frame(plyr::count(group_factor(vec, n, method = meth)))
names(data_temp)[names(data_temp)=="freq"] <- meth
if (exists ('n_meth_v57n6')) {
n_meth_v57n6 <- cbind(n_meth_v57n6, data_temp)
} else {
n_meth_v57n6 <- data_temp
}
}
forced_equal <- plyr::count(group_factor(vec, n, method = 'n_last', force_equal = TRUE))
n_meth_v57n6$forced_equal <- forced_equal$freq
n_meth_v57n6 <- n_meth_v57n6[ , !duplicated(colnames(n_meth_v57n6))]
# gather() data frame for plotting
data_plot <- n_meth_v57n6 %>%
gather(method, group_size,-1)
upper_limit <- max(data_plot$group_size)+1
lower_limit <- min(data_plot$group_size)-1
v57n6_plot <- ggplot(data_plot, aes(x, group_size))
## Output
# Data frame
n_meth_v57n6
# Plot
v57n6_plot +
geom_point() +
scale_y_continuous(limit = c(lower_limit, upper_limit),
breaks = round(seq(lower_limit, upper_limit, by = 2),1)) +
#scale_y_continuous(limit = c(lower_limit, upper_limit))+
facet_wrap('method', ncol=1) +
labs(x = 'group',
y = 'group Size',
title = 'Distribution of Elements in groups')+
theme_bw()+
theme(axis.text.y = element_text(size=9),
axis.text.x = element_text(size=9))
## ----echo=FALSE, eval=requireNamespace("ggplot2")-----------------------------
vec <- c(1:117)
n <- 11
if (exists ('n_meth_v117n11')){
rm(n_meth_v117n11)
}
for (meth in c('n_dist', 'n_fill' ,'n_last','n_rand')){
data_temp <- data.frame(plyr::count(group_factor(vec, n, method = meth)))
names(data_temp)[names(data_temp)=="freq"] <- meth
if (exists ('n_meth_v117n11')) {
n_meth_v117n11 <- cbind(n_meth_v117n11, data_temp)
} else {
n_meth_v117n11 <- data_temp
}
}
forced_equal <- plyr::count(group_factor(vec, n, method = 'n_last', force_equal = TRUE))
n_meth_v117n11$forced_equal <- forced_equal$freq
n_meth_v117n11 <- n_meth_v117n11[ , !duplicated(colnames(n_meth_v117n11))]
# gather() data frame for plotting
data_plot <- n_meth_v117n11 %>%
gather(method, group_size,-1)
v117n11_plot <- ggplot(data_plot, aes(x, group_size))
upper_limit <- max(data_plot$group_size)+1
lower_limit <- min(data_plot$group_size)-1
## Output
# Data frame
n_meth_v117n11
# Plot
v117n11_plot +
geom_point() +
scale_y_continuous(limit = c(lower_limit, upper_limit),
breaks = round(seq(lower_limit, upper_limit, by = 2),1)) +
facet_wrap('method', ncol=1) +
labs(x = 'group',
y = 'group Size',
title = 'Distribution of Elements in groups')+
theme_bw()+
theme(axis.text.y = element_text(size=9),
axis.text.x = element_text(size=9))
## ----echo=FALSE, eval=requireNamespace("ggplot2")-----------------------------
vec <- c(1:100)
if (exists ('greedy_data')){
rm(greedy_data)
}
for (n in c(8,15,20)){
group_sizes <- plyr::count(group_factor(vec, n, method='greedy'))
data_temp <- data.frame(group_sizes, 'Size' = factor(n))
if (exists ('greedy_data')) {
greedy_data <- rbind(greedy_data, data_temp)
} else {
greedy_data <- data_temp
}
}
greedy_plot <- ggplot(greedy_data, aes(x, freq, color=Size))
greedy_plot +
geom_point() +
labs(x = 'group',
y = 'group Size',
title = 'Greedy Distribution of Elements in groups',
color = 'Size') +
theme_bw()+
theme(plot.margin = unit(c(1,1,1,1), "cm"))+
theme(axis.text.y = element_text(size=9),
axis.text.x = element_text(size=9))
## ----echo=FALSE, eval=requireNamespace("ggplot2")-----------------------------
vec <- c(1:1000)
if (exists ('staircase_data')){
rm(staircase_data)
}
for (n in c(2, 5, 11)){
group_sizes <- plyr::count(group_factor(vec, n, method='staircase'))
data_temp <- data.frame(group_sizes, 'step_size' = factor(n))
if (exists ('staircase_data')) {
staircase_data <- rbind(staircase_data, data_temp)
} else {
staircase_data <- data_temp
}
}
staircase_plot <- ggplot(staircase_data, aes(x, freq, color=step_size))
staircase_plot +
geom_point() +
#scale_x_continuous(breaks = round(seq(1, max(data_temp$x), by = 2),1))+
labs(x = 'group',
y = 'group Size',
title = 'Staircasing Distribution of Elements in groups',
color = 'Step Size') +
theme_bw()+
theme(axis.text.y = element_text(size=9),
axis.text.x = element_text(size=7))
## ----echo=FALSE, eval=requireNamespace("ggplot2")-----------------------------
staircase_data <- staircase_data %>%
group_by(step_size) %>%
mutate(cumsum = cumsum(freq))
staircase_cumulative_plot <- ggplot(staircase_data, aes(x, cumsum, color=step_size))
staircase_cumulative_plot +
geom_point() +
labs(x = 'group',
y = 'Cumulative sum of group sizes',
title = 'Staircasing Cumulative Sum of group Sizes',
color = 'Step Size') +
theme_bw()+
theme(axis.text.y = element_text(size=9),
axis.text.x = element_text(size=7))
## ----echo=FALSE, eval=requireNamespace("ggplot2")-----------------------------
vec <- c(1:1000)
if (exists ('primes_data')){
rm(primes_data)
}
for (n in c(2, 5, 11)){
group_sizes <- plyr::count(group_factor(vec, n, method='primes'))
data_temp <- data.frame(group_sizes, 'start_at' = factor(n))
if (exists ('primes_data')) {
primes_data <- rbind(primes_data, data_temp)
} else {
primes_data <- data_temp
}
}
primes_plot <- ggplot(primes_data, aes(x, freq, color=start_at))
primes_plot +
geom_point() +
#scale_x_continuous(breaks = round(seq(1, max(data_temp$x), by = 2),1))+
labs(x = 'group',
y = 'group Size',
title = 'Prime numbers method - Elements per groups',
color = 'Start at') +
theme_bw()+
theme(axis.text.y = element_text(size=9),
axis.text.x = element_text(size=7))
## ----echo=FALSE, eval=requireNamespace("ggplot2")-----------------------------
primes_data <- primes_data %>%
group_by(start_at) %>%
mutate(cumsum = cumsum(freq))
primes_cumulative_plot <- ggplot(primes_data, aes(x, cumsum, color=start_at))
primes_cumulative_plot +
geom_point() +
labs(x = 'group',
y = 'Cumulative sum of group sizes',
title = 'Primes Cumulative Sum of group Sizes',
color = 'Start At') +
theme_bw()+
theme(axis.text.y = element_text(size=9),
axis.text.x = element_text(size=7))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.