knitr::opts_chunk$set(echo = TRUE)


library(tidyverse)

# load_pkg <- rlang::quos(tidyverse, , lubridate, readxl, janitor, tictoc)
# 
# invisible(lapply(lapply(load_pkg, rlang::quo_name),
#   library,
#   character.only = TRUE
# ))
load("C:/rprojects/ggradialbar/data/tidy_data.rda")

dataset  <- readRDS("C:/rprojects/ggradialbar/data-raw/team-project-data.rds")
dfs <- dataset$data

is_tibble(dfs) # TRUE

dfs_na_count <- colSums(is.na(dfs))

print(dfs_na_count)

# Column        Missing values      
# recognition       32
# dummy            2217 

sapply(dfs, class)
# .phase character 
# .cluster factor

grs <- dataset$feature_groups


grs_wide <- pivot_wider(grs, names_from = feature, values_from = .gr_name)

is_tibble(grs) # TRUE

grs_na_count <- colSums(is.na(grs))

# print(grs_na_count)

# No missing values
# Keeping dummy column and converting data to long format and then removing observations with missing values

data_with_dummy <- dfs %>%
  pivot_longer(names_to = "feature", values_to = "value", cols = -c(.id, .phase, .cluster)) %>%
  drop_na()

combined_data <- full_join(data_with_dummy,grs, by = "feature")


# Removing Dummy column before converting data to long format and then removing observations with missing values 

# data_without_dummy <- dfs %>% 
# select_if(~!all(is.na(.))) %>%
# pivot_longer(names_to = "feature", values_to = "value", cols = -c(.id, .phase, .cluster)) %>%
# drop_na()


# combined_data_without_dummy <- full_join(data_without_dummy,grs, by = "feature")



# checking if tibbles are equal in both scenarios mentioned above. 

# all_equal(data_with_dummy,data_without_dummy) # Result = TRUE


# all_equal(combined_data,combined_data_without_dummy) # Result = TRUE
# Using Combined data from here on. 
sapply(combined_data, class)


combined_na_count <- colSums(is.na(combined_data))

print(combined_na_count)

# Data contains no missing values and now is in tidy format.
cluster_idx = 2L

cluster_abbrev = "PT"

group_names = grs$.gr_name

post_treatment = TRUE

scale_rng = c(-1,1)*1.5


# LONG DATA

tidy_data <- tidy_data %>% 
  rename(f = feature, v  = value, group = .gr_name) %>%
  select(-c(".gr_id"))

tidy_data_without_phase <- tidy_data %>%
  select(-c(".phase")) %>%
  mutate(.phase := NA_character_)

tidy_data_without_id <- tidy_data %>%
  select(-c(".id")) %>%
  mutate(.id := NA_character_)

tidy_data_without_both <- tidy_data %>%
  select(-c(".phase", ".id")) %>%
  mutate(.phase := NA_character_) %>%
  mutate(.id := NA_character_)

# WIDE DATA

dfs <-dfs %>% select(-c(dummy))  %>% drop_na() # 2185 obs of 80 variables


dfs_without_phase <- dfs %>% select(-c(".phase")) %>% 
  mutate(.phase := NA_character_) 

dfs_without_id <- dfs %>% select(-c(".id")) %>% 
  mutate(.id := NA_character_)

dfs_without_both <- dfs %>%
  select(-c(".phase", ".id")) %>%
  mutate(.phase := NA_character_) %>%
  mutate(.id := NA_character_)


# LONG DATA

tidy_d  <- tidy_data %>% filter(.cluster == levels(.cluster)[cluster_idx])  # 96844 obs for cluster 1
                                                         # 23713 obs for cluster 2 
                                                         # 26487 obs for cluster 3
                                                         # 23633 obs for cluster 4



tidy_d_without_phase  <- tidy_data_without_phase %>% filter(.cluster == levels(.cluster)[cluster_idx])



tidy_d_without_id  <- tidy_data_without_id %>% filter(.cluster == levels(.cluster)[cluster_idx])



tidy_d_without_both  <- tidy_data_without_both %>% filter(.cluster == levels(.cluster)[cluster_idx])




# WIDE DATA

df_wide <- dfs %>%
  mutate(.cluster = factor(.cluster)) %>%
  filter(.cluster == levels(.cluster)[cluster_idx]) # 1236 obs for cluster 1 
                                                    # 305 obs for cluster 2
                                                    # 343 obs for cluster 3
                                                    # 301 obs for cluster 4


df_wide_w_p <- dfs_without_phase %>%
  mutate(.cluster = factor(.cluster)) %>%
  filter(.cluster == levels(.cluster)[cluster_idx])

df_wide_w_i <- dfs_without_id %>%
  mutate(.cluster = factor(.cluster)) %>%
  filter(.cluster == levels(.cluster)[cluster_idx])

df_wide_w_b <- dfs_without_both %>%
  mutate(.cluster = factor(.cluster)) %>%
  filter(.cluster == levels(.cluster)[cluster_idx])

# LONG DATA

data_cluster_long <- tidy_d %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))



data_cluster_long_w_p <- tidy_d_without_phase %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))



data_cluster_long_w_i <- tidy_d_without_id %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))



data_cluster_long_w_b <- tidy_d_without_both %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))


# WIDE DATA

data_cluster_wide <- df_wide %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))


data_cluster_wide_w_p <- df_wide_w_p %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))


data_cluster_wide_w_i <- df_wide_w_i %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))


data_cluster_wide_w_b <- df_wide_w_b %>% mutate(.cluster = fct_drop(as.factor(.cluster))) %>%
mutate(.cluster = factor(.cluster, labels = paste(cluster_abbrev, levels(.cluster))))

d <- unique(tidy_data$f)
# data_l <- data_cluster_long %>%
#   dplyr::group_by(f) %>%
#   dplyr::add_count(.cluster) %>%
#   dplyr::arrange(.cluster) %>%
#   dplyr::ungroup() %>%
#   dplyr::group_by(.phase, .cluster, f, group) %>%
#   dplyr::summarize(avg = mean(v), sd = sd(v), n = n[1], .groups = "keep" ) %>%
#   dplyr::ungroup() %>%
#   dplyr::mutate(error = qnorm(0.975) * sd / sqrt(n)) %>%
#   # winsorize cluster averages
#   dplyr::mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>%
#   dplyr::mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>%
#   dplyr::arrange(group,f,.cluster)
#
#
#
#   dplyr::inner_join(tibble(f=unique(data_cluster_long$f), group = group_names), by = "f") %>%
#   dplyr::arrange(group, f, .cluster)


all_equal(data_l_gg, data_l)

data_l_gg <- data_cluster_long %>%
  dplyr::group_by(f) %>%
  dplyr::add_count(.cluster) %>%
  dplyr::arrange(.cluster) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(.phase, .cluster, f) %>%
  dplyr::summarize(avg = mean(v), sd = sd(v), n = n[1]) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(error = qnorm(0.975) * sd / sqrt(n)) %>%
  # winsorize cluster averages
  dplyr::mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>%
  dplyr::mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>%
  dplyr::inner_join(tibble(f=unique(data_cluster_long$f), group = group_names), by = "f") %>%
  dplyr::arrange(group, f, .cluster)


all_equal(data_n,data_l_n)

data_l_n <- data_l %>% select(-c("n"))

data_n <- data %>% select(-c("n"))

data <- data_cluster_wide %>%
    pivot_longer(names_to = "f", values_to = "v", cols = -c(.id, .phase, .cluster)) %>% # tabyl generates a frequency table on .cluster
    inner_join(janitor::tabyl(data_cluster_wide, .cluster) %>% select(.cluster, n), by = ".cluster") %>%   # adds an additional varibale `n` that contains the value - 'Total number of observations in the cluster' for each observation
    group_by(.phase, .cluster, f) %>%
    summarize(avg = mean(v), sd = sd(v), n = n[1]) %>%
    ungroup() %>%
    mutate(error = qnorm(0.975)*sd/sqrt(n)) %>%
    # winsorize cluster averages
    mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>%
    mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>%
    # mutate(sd = if_else(avg < 0, sd, -sd)) %>%
    inner_join(tibble(f = names(df_wide %>% select(-starts_with("."))), group = group_names), by = "f") %>%
    arrange(group, f, .cluster)



head(dfs$physics)

fff <- unique()

data <- df_wide %>%
    pivot_longer(names_to = "f", values_to = "v", cols = -c(.id, .phase, .cluster)) %>% # tabyl generates a frequency table on .cluster
    inner_join(janitor::tabyl(dfs, .cluster) %>% select(.cluster, n), by = ".cluster") %>%# adds an additional varibale `n` that contains the value - 'Total number of observations in the cluster' for each obs
    group_by(.phase, .cluster, f) %>%
    summarize(avg = mean(v), sd = sd(v), n = n[1]) %>%
    ungroup() %>%
    mutate(error = qnorm(0.975)*sd/sqrt(n)) %>%
    # winsorize cluster averages
    mutate(avg = ifelse(avg > scale_rng[2], scale_rng[2], avg)) %>%
    mutate(avg = ifelse(avg < scale_rng[1], scale_rng[1], avg)) %>%
    # mutate(sd = if_else(avg < 0, sd, -sd)) %>%
    inner_join(tibble(f = names(df %>% select(-starts_with("."))), group = group_names), by = "f") %>%
    arrange(group, f, .cluster)


library(diffdf)
diffdf(data_cluster_wide,data_cluster_long_created)

Breaking down and Solving problems:

# for post_treatment- for long data
data <- tidy_data %>% 
   dplyr::group_by(.id) %>%
   dplyr::mutate(n = n_distinct(.phase)) %>%
   dplyr::filter(n == 2) %>%
   dplyr::select(-n) %>%
   dplyr::arrange(.id) %>%
   dplyr::ungroup()


# %>% count(unique(n)) 
#  n == 1 ; 18401
#  n == 2 ; 152276

v <- tidy_data %>% 
  group_by(feature) %>% 
  add_count(.cluster) %>%
  arrange(.cluster)

x <- as.factor(tidy_data$.phase)

y <- levels(x)


xx <- levels(x)[1] 
yy <- levels(x)[2]
cluster_idx = "sa" # only 1L, 2L etc; works

if (!(is.integer(cluster_idx))) rlang::abort("cluster_idx must be a integer value")

cluster_assignment = TRUE # only character or factor; works 

if (is.logical(cluster_assignment) || is.numeric(cluster_assignment)) rlang::abort("cluster_assignment: wrong datatype. Character or factor datatype expected")

## checks for phase present and cluster_phase

cluster_phase =  TRUE # check works 

if (is.logical(cluster_phase) || is.numeric(cluster_phase)) rlang::abort("cluster_phase : wrong datatype. Character or factor datatype expected")

phase_present = 2 # check works
if (!(is.logical(phase_present))) rlang::abort("phase_present must be logical either 'TRUE' or 'FALSE'")

cluster_phase = tidy_data$.phase
phase_present = FALSE

if (is.null(cluster_phase) && !(phase_present)) rlang::abort("If 'phase_present = TRUE', you must provide 'cluster_phase'")

if (!is.null(cluster_phase) && (phase_present)) rlang::abort("You provided 'cluster_phase'; please Set 'phase_present = TRUE'") ## doesnt work


## checks for cluster_name and cluster_abbrev

cluster_name = 3455
if (!is.character(cluster_name) || (str_length(cluster_name) > 15)) rlang::warn("Recommendation : a in character and length less than or equal to 15; otherwise plot may look distorted.")

cluster_abbrev = "PTTTTTT"
if (length(cluster_abbrev) > 4) rlang::abort("Recommendation : name in character and length less than or equal to 4; otherwise plot may look distorted.")
# does not work

## checks for group_names and show_group_names

show_group_names <- 2

if (is.logical(group_names) || is.numeric(group_names)) rlang::abort("group_names : wrong datatype. Character or factor datatype expected")

if (!(is.logical(show_group_names))) rlang::abort("show_group_names must be logical either 'TRUE' or 'FALSE'")

if(nchar(cluster_name) > 10) rlang::warn("stopppp ")

ad <- NA

nchar(cluster_name)
cluster_assignment = NULL
str_length(ad)

if(!phase_present && is.null(cluster_assignment)) rlang::abort("If 'phase_present = TRUE', you must provide 'cluster_assignment'.")


unique_clusters <- sort(unique(tidy_data$.cluster))

unique_clusters_check <- as.integer(unique_clusters)

y <- min(unique_clusters)

if (!(cluster_idx %in% unique_clusters_check)) {

  print("bad")
} else {

  print("toll")

}
# Setting up cluster assignment argument

cluster_assignment = NULL   # or dfs$.cluster

if (!(is.null(cluster_assignment))) {

  cluster_assignment <- cluster_assignment
  unique_clusters <- sort(unique(cluster_assignment))


  } else {

    cluster_values <- c("C1")
    cluster_levels <- c("C1")
    dfs <- dfs %>%
      dplyr::mutate(.cluster_dummy = factor(cluster_values, levels = cluster_levels))

    cluster_assignment <- dfs$.cluster_dummy
    unique_clusters <- sort(unique(cluster_assignment))
    }

cluster_idx = 1L       # 5L throws error
if (!(is.null(cluster_idx))) {

  if (cluster_idx > length(unique_clusters)) {

    rlang::abort("The Cluster value, you specified does not exist ")
    }
}
# Setting up group_names parameter

group_names = NULL   # or grs$.gr_name

if (!(is.null(group_names))) {
  if(!is.factor(group_names)) group_names <- as.factor(group_names)

  rlang::inform("group_names converted to factor, if it is not already")
  } else {

    group_values <- c("SingleGroup")
    group_levels <- c("SingleGroup")

    dfs <- dfs %>%
      dplyr::mutate(.gr_name = factor(group_values, levels = group_levels))

      group_names <- dfs$.gr_name

      rlang::inform("A new factor variable was added to the data, that conatins group_names")
      rlang::inform("Since you did not speccify group_names, only one group exists")
  }
install.packages("styler") # The goal of styler is to provide non-invasive pretty-printing of R source code while adhering to the tidyverse formatting rules.
library(styler)

# style_file() # styles .R, .Rmd .Rnw and .Rprofile, files.

# style_dir() # styles all .R and/or .Rmd files in a directory.

# style_pkg() # styles the source files of an R package.
install.packages("goodpractice") # Advice on R Package Building
library(goodpractice)

gp("C:/rprojects/ggradialbar") # package path



Ashish-Soni08/ggradialbar documentation built on April 15, 2021, 4:11 a.m.