knitr::opts_chunk$set(echo = TRUE) library(tidyverse) # load_pkg <- rlang::quos(tidyverse, , lubridate, readxl, janitor, tictoc) # # invisible(lapply(lapply(load_pkg, rlang::quo_name), # library, # character.only = TRUE # ))
load("C:/rprojects/ggradialbar/data/tidy_data.rda") dataset <- readRDS("C:/rprojects/ggradialbar/data-raw/team-project-data.rds")
dfs <- dataset$data is_tibble(dfs) # TRUE dfs_na_count <- colSums(is.na(dfs)) print(dfs_na_count) # Column Missing values # recognition 32 # dummy 2217 sapply(dfs, class) # .phase character # .cluster factor grs <- dataset$feature_groups grs_wide <- pivot_wider(grs, names_from = feature, values_from = .gr_name) is_tibble(grs) # TRUE grs_na_count <- colSums(is.na(grs)) # print(grs_na_count) # No missing values
# Keeping dummy column and converting data to long format and then removing observations with missing values data_with_dummy <- dfs %>% pivot_longer(names_to = "feature", values_to = "value", cols = -c(.id, .phase, .cluster)) %>% drop_na() combined_data <- full_join(data_with_dummy,grs, by = "feature") # Removing Dummy column before converting data to long format and then removing observations with missing values # data_without_dummy <- dfs %>% # select_if(~!all(is.na(.))) %>% # pivot_longer(names_to = "feature", values_to = "value", cols = -c(.id, .phase, .cluster)) %>% # drop_na() # combined_data_without_dummy <- full_join(data_without_dummy,grs, by = "feature") # checking if tibbles are equal in both scenarios mentioned above. # all_equal(data_with_dummy,data_without_dummy) # Result = TRUE # all_equal(combined_data,combined_data_without_dummy) # Result = TRUE
# Using Combined data from here on. sapply(combined_data, class) combined_na_count <- colSums(is.na(combined_data)) print(combined_na_count) # Data contains no missing values and now is in tidy format.
cluster_idx = 2L cluster_abbrev = "PT" group_names = grs$.gr_name post_treatment = TRUE scale_rng = c(-1,1)*1.5 # LONG DATA tidy_data <- tidy_data %>% rename(f = feature, v = value, group = .gr_name) %>% select(-c(".gr_id")) tidy_data_without_phase <- tidy_data %>% select(-c(".phase")) %>% mutate(.phase := NA_character_) tidy_data_without_id <- tidy_data %>% select(-c(".id")) %>% mutate(.id := NA_character_) tidy_data_without_both <- tidy_data %>% select(-c(".phase", ".id")) %>% mutate(.phase := NA_character_) %>% mutate(.id := NA_character_) # WIDE DATA dfs <-dfs %>% select(-c(dummy)) %>% drop_na() # 2185 obs of 80 variables dfs_without_phase <- dfs %>% select(-c(".phase")) %>% mutate(.phase := NA_character_) dfs_without_id <- dfs %>% select(-c(".id")) %>% mutate(.id := NA_character_) dfs_without_both <- dfs %>% select(-c(".phase", ".id")) %>% mutate(.phase := NA_character_) %>% mutate(.id := NA_character_) # LONG DATA tidy_d <- tidy_data %>% filter(.cluster == levels(.cluster)[cluster_idx]) # 96844 obs for cluster 1 # 23713 obs for cluster 2 # 26487 obs for cluster 3 # 23633 obs for cluster 4 tidy_d_without_phase <- tidy_data_without_phase %>% filter(.cluster == levels(.cluster)[cluster_idx]) tidy_d_without_id <- tidy_data_without_id %>% filter(.cluster == levels(.cluster)[cluster_idx]) tidy_d_without_both <- tidy_data_without_both %>% filter(.cluster == levels(.cluster)[cluster_idx]) # WIDE DATA df_wide <- dfs %>% mutate(.cluster = factor(.cluster)) %>% filter(.cluster == levels(.cluster)[cluster_idx]) # 1236 obs for cluster 1 # 305 obs for cluster 2 # 343 obs for cluster 3 # 301 obs for cluster 4 df_wide_w_p <- dfs_without_phase %>% mutate(.cluster = factor(.cluster)) %>% filter(.cluster == levels(.cluster)[cluster_idx]) df_wide_w_i <- dfs_without_id %>% mutate(.cluster = factor(.cluster)) %>% filter(.cluster == levels(.cluster)[cluster_idx]) df_wide_w_b <- dfs_without_both %>% mutate(.cluster = factor(.cluster)) %>% filter(.cluster == levels(.cluster)[cluster_idx]) # LONG DATA data_cluster_long <- tidy_d %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) data_cluster_long_w_p <- tidy_d_without_phase %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) data_cluster_long_w_i <- tidy_d_without_id %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) data_cluster_long_w_b <- tidy_d_without_both %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) # WIDE DATA data_cluster_wide <- df_wide %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) data_cluster_wide_w_p <- df_wide_w_p %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) data_cluster_wide_w_i <- df_wide_w_i %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) data_cluster_wide_w_b <- df_wide_w_b %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>% mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster)))) d <- unique(tidy_data$f)
# data_l <- data_cluster_long %>% # dplyr::group_by(f) %>% # dplyr::add_count(.cluster) %>% # dplyr::arrange(.cluster) %>% # dplyr::ungroup() %>% # dplyr::group_by(.phase, .cluster, f, group) %>% # dplyr::summarize(avg = mean(v), sd = sd(v), n = n[1], .groups = "keep" ) %>% # dplyr::ungroup() %>% # dplyr::mutate(error = qnorm(0.975) * sd / sqrt(n)) %>% # # winsorize cluster averages # dplyr::mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>% # dplyr::mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>% # dplyr::arrange(group,f,.cluster) # # # # dplyr::inner_join(tibble(f=unique(data_cluster_long$f), group = group_names), by = "f") %>% # dplyr::arrange(group, f, .cluster) all_equal(data_l_gg, data_l) data_l_gg <- data_cluster_long %>% dplyr::group_by(f) %>% dplyr::add_count(.cluster) %>% dplyr::arrange(.cluster) %>% dplyr::ungroup() %>% dplyr::group_by(.phase, .cluster, f) %>% dplyr::summarize(avg = mean(v), sd = sd(v), n = n[1]) %>% dplyr::ungroup() %>% dplyr::mutate(error = qnorm(0.975) * sd / sqrt(n)) %>% # winsorize cluster averages dplyr::mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>% dplyr::mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>% dplyr::inner_join(tibble(f=unique(data_cluster_long$f), group = group_names), by = "f") %>% dplyr::arrange(group, f, .cluster) all_equal(data_n,data_l_n) data_l_n <- data_l %>% select(-c("n")) data_n <- data %>% select(-c("n")) data <- data_cluster_wide %>% pivot_longer(names_to = "f", values_to = "v", cols = -c(.id, .phase, .cluster)) %>% # tabyl generates a frequency table on .cluster inner_join(janitor::tabyl(data_cluster_wide, .cluster) %>% select(.cluster, n), by = ".cluster") %>% # adds an additional varibale `n` that contains the value - 'Total number of observations in the cluster' for each observation group_by(.phase, .cluster, f) %>% summarize(avg = mean(v), sd = sd(v), n = n[1]) %>% ungroup() %>% mutate(error = qnorm(0.975)*sd/sqrt(n)) %>% # winsorize cluster averages mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>% mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>% # mutate(sd = if_else(avg < 0, sd, -sd)) %>% inner_join(tibble(f = names(df_wide %>% select(-starts_with("."))), group = group_names), by = "f") %>% arrange(group, f, .cluster) head(dfs$physics) fff <- unique() data <- df_wide %>% pivot_longer(names_to = "f", values_to = "v", cols = -c(.id, .phase, .cluster)) %>% # tabyl generates a frequency table on .cluster inner_join(janitor::tabyl(dfs, .cluster) %>% select(.cluster, n), by = ".cluster") %>%# adds an additional varibale `n` that contains the value - 'Total number of observations in the cluster' for each obs group_by(.phase, .cluster, f) %>% summarize(avg = mean(v), sd = sd(v), n = n[1]) %>% ungroup() %>% mutate(error = qnorm(0.975)*sd/sqrt(n)) %>% # winsorize cluster averages mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>% mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>% # mutate(sd = if_else(avg < 0, sd, -sd)) %>% inner_join(tibble(f = names(df %>% select(-starts_with("."))), group = group_names), by = "f") %>% arrange(group, f, .cluster) library(diffdf) diffdf(data_cluster_wide,data_cluster_long_created)
# for post_treatment- for long data data <- tidy_data %>% dplyr::group_by(.id) %>% dplyr::mutate(n = n_distinct(.phase)) %>% dplyr::filter(n == 2) %>% dplyr::select(-n) %>% dplyr::arrange(.id) %>% dplyr::ungroup() # %>% count(unique(n)) # n == 1 ; 18401 # n == 2 ; 152276 v <- tidy_data %>% group_by(feature) %>% add_count(.cluster) %>% arrange(.cluster) x <- as.factor(tidy_data$.phase) y <- levels(x) xx <- levels(x)[1] yy <- levels(x)[2]
cluster_idx = "sa" # only 1L, 2L etc; works if (!(is.integer(cluster_idx))) rlang::abort("cluster_idx must be a integer value") cluster_assignment = TRUE # only character or factor; works if (is.logical(cluster_assignment) || is.numeric(cluster_assignment)) rlang::abort("cluster_assignment: wrong datatype. Character or factor datatype expected") ## checks for phase present and cluster_phase cluster_phase = TRUE # check works if (is.logical(cluster_phase) || is.numeric(cluster_phase)) rlang::abort("cluster_phase : wrong datatype. Character or factor datatype expected") phase_present = 2 # check works if (!(is.logical(phase_present))) rlang::abort("phase_present must be logical either 'TRUE' or 'FALSE'") cluster_phase = tidy_data$.phase phase_present = FALSE if (is.null(cluster_phase) && !(phase_present)) rlang::abort("If 'phase_present = TRUE', you must provide 'cluster_phase'") if (!is.null(cluster_phase) && (phase_present)) rlang::abort("You provided 'cluster_phase'; please Set 'phase_present = TRUE'") ## doesnt work ## checks for cluster_name and cluster_abbrev cluster_name = 3455 if (!is.character(cluster_name) || (str_length(cluster_name) > 15)) rlang::warn("Recommendation : a in character and length less than or equal to 15; otherwise plot may look distorted.") cluster_abbrev = "PTTTTTT" if (length(cluster_abbrev) > 4) rlang::abort("Recommendation : name in character and length less than or equal to 4; otherwise plot may look distorted.") # does not work ## checks for group_names and show_group_names show_group_names <- 2 if (is.logical(group_names) || is.numeric(group_names)) rlang::abort("group_names : wrong datatype. Character or factor datatype expected") if (!(is.logical(show_group_names))) rlang::abort("show_group_names must be logical either 'TRUE' or 'FALSE'") if(nchar(cluster_name) > 10) rlang::warn("stopppp ") ad <- NA nchar(cluster_name) cluster_assignment = NULL str_length(ad) if(!phase_present && is.null(cluster_assignment)) rlang::abort("If 'phase_present = TRUE', you must provide 'cluster_assignment'.") unique_clusters <- sort(unique(tidy_data$.cluster)) unique_clusters_check <- as.integer(unique_clusters) y <- min(unique_clusters) if (!(cluster_idx %in% unique_clusters_check)) { print("bad") } else { print("toll") }
# Setting up cluster assignment argument cluster_assignment = NULL # or dfs$.cluster if (!(is.null(cluster_assignment))) { cluster_assignment <- cluster_assignment unique_clusters <- sort(unique(cluster_assignment)) } else { cluster_values <- c("C1") cluster_levels <- c("C1") dfs <- dfs %>% dplyr::mutate(.cluster_dummy = factor(cluster_values, levels = cluster_levels)) cluster_assignment <- dfs$.cluster_dummy unique_clusters <- sort(unique(cluster_assignment)) } cluster_idx = 1L # 5L throws error if (!(is.null(cluster_idx))) { if (cluster_idx > length(unique_clusters)) { rlang::abort("The Cluster value, you specified does not exist ") } }
# Setting up group_names parameter group_names = NULL # or grs$.gr_name if (!(is.null(group_names))) { if(!is.factor(group_names)) group_names <- as.factor(group_names) rlang::inform("group_names converted to factor, if it is not already") } else { group_values <- c("SingleGroup") group_levels <- c("SingleGroup") dfs <- dfs %>% dplyr::mutate(.gr_name = factor(group_values, levels = group_levels)) group_names <- dfs$.gr_name rlang::inform("A new factor variable was added to the data, that conatins group_names") rlang::inform("Since you did not speccify group_names, only one group exists") }
install.packages("styler") # The goal of styler is to provide non-invasive pretty-printing of R source code while adhering to the tidyverse formatting rules. library(styler) # style_file() # styles .R, .Rmd .Rnw and .Rprofile, files. # style_dir() # styles all .R and/or .Rmd files in a directory. # style_pkg() # styles the source files of an R package.
install.packages("goodpractice") # Advice on R Package Building library(goodpractice) gp("C:/rprojects/ggradialbar") # package path
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.