R/EDAnorm.R

Defines functions EDAnorm

Documented in EDAnorm

#' Fast Single Population & Normality EDA from a Sampling Plan
#'
#' @param data A data frame in the long format with three columns representing the "Trial" number for investigating
#' Reproducability, the "Sample" number for investigating Repeatability, and the "Result" column consisting of the measured
#' value.  Note, all three columns are numeric columns but only the "Result" column needs to be so and should be a continous
#' variable.  The "EDAnorm" function is not intended to be used on discrete data as it's a normality assessement.  It should also
#' be noted that the "EDAnorm" function was created to be compatable with the output from the "RCSample" function where the
#' simulated data from the "RCSample" function can be easily feed into the "EDAnorm" function for quick single population and
#' normality EDA.  In addition, there are eight data files already created, "EDF1" - "EDF8", that cover differnt types of
#' sampling examples from a manufacturing process with some of them including process shifts.  These files cover the situation
#' when the number of "Trials" is > 2 and when the number of "Sample" is > 2 for investigating Repeatability and Reproducability.
#' For the case when the number of "Sample" is <= 2 OR the number of "Trial" is < 3, the "Trial" column is simply ignored and
#' the sampling plan is assummed to consist of single samples pulled and only the underlying population is investigated.
#' There are five data files already created, "CI1" - "CI5", that cover this situation.  In this instance only a single
#' population analysis is performed.
#' @param Trial.coded "Trial" is defined as the outer stage of the sampling plan describing the processes reproducability.  This
#' can be thought of as the time intervals in which a defined number of consecutive "Sample" is drawn and tested from the
#' manufacturing process.
#' @param Sample.coded "Sample" is defined as the inner stage of the sampling plan describing the processes repeatability.
#' NOTE:  The total number of "Sample" must be > 7 for this analysis.  This is due to the normality assessemnt performed in
#' which "Sample" <= 7 are subject to false interpretations of the normality assessments.
#' @param Result.coded "Result" is defined as the output from the "Trial" and "Sample" stages measurments.
#' @param sigma_pop In the event the total number of "Trial" is < 2 OR the total number of "Sample" <= 2 for each "Trail" a
#' single population EDA is executed.  This results in two sided standard error analysis and allows for one to enter a
#' defined / "KNOWN" poplation standard deviation.  If this is "UNKNOWN" the NULL value simply uses the "t" statistic for
#' approximating the standard error of the mean.
#' @param p.CI In the event the total number of "Trial" is < 2 OR the total number of "Sample" <= 2 for each "Trail" a
#' single population EDA is executed.  This results in a two sided standard error analysis and allows for one to enter a defined
#' two sided confidence interval.  By default this is set to the two sided 95% confidence interval of 0.975.  If another
#' two sided confidence interval is needed simply change "p.CI" to the appropriate value.
#' @return The output is a single list containing many of the statistic data set created along with the individual plots created
#' from the inputs, i.e. \code{data}, \code{Trial.coded}, \code{Sample.coded}, \code{Result.coded}, \code{sigma_pop},
#' &/or \code{p.CI}.  As shown below it is recommended to save the output of the function to a defined list variable where the
#' items from the list can be called out specifically.
#' @export
#' @examples
#'
#' # When the data is already in the corret format:
#'
#' saved.output.1 <- EDAnorm(data = EDF1)
#' saved.output.2 <- EDAnorm(data = CI1, sigma_pop = 7.2)
#'
#' # If the "Trial", "Sample", and "Result" are defined deferently than what is expressed, simply call them out accordingly.
#' # In the examples above the "EDF1" & "CI1" data set columns are already coded accordingly and so you don't need to explicitly
#' # call them out in the function.  However, if you do you will still get the same result.
#'
#' saved.output.3 <- EDAnorm(data = EDF1, Trial.coded = EDF1$Trial, Sample.coded = EDF1$Sample, Result.coded = EDF1$Result)
#'
#' # To view the last two- three "grid" plots generated in the final listed output, e.g. "grid.newpage_draw_plot1",
#' # you will need to do the following:
#'
#' #grid.newpage()
#' #grid.draw(saved.output.1$grid.newpage_draw_plot1)
#'
#' #grid.newpage()
#' #grid.draw(saved.output.1$grid.newpage_draw_plot2)
#'
#' #grid.newpage()
#' #grid.draw(saved.output.1$grid.newpage_draw_plot3)
#'
#' @import ggthemes
#' @import tidyr
#' @import broom
#' @importFrom gridExtra arrangeGrob tableGrob
#' @importFrom grid grid.newpage grid.draw
#' @importFrom stats shapiro.test oneway.test lm median bartlett.test kruskal.test quantile qnorm qt
#' @importFrom timeDate skewness kurtosis
#' @importFrom nortest ad.test
#' @importFrom car leveneTest

EDAnorm <- function(data,
                    Trial.coded = NULL,
                    Sample.coded = NULL,
                    Result.coded = NULL,
                    sigma_pop = NULL,
                    p.CI = 0.975
                    ){

  options(warn=-1)

  # Transform the incoming data to fit the calls made below:

  ifelse(
         !is.null(Trial.coded) | !is.null(Sample.coded) | !is.null(Result.coded),

          # ---------IF TRUE

          data_trans <- as_tibble(data) %>%
                        dplyr::mutate(Trial = Trial.coded,
                                      Sample = Sample.coded,
                                      Result = Result.coded) %>%
                        dplyr::select(Trial, Sample, Result),

         # **********IF FALSE

         data_trans <- data)

  if(length(data_trans$Sample) < 7) {
    print(paste0("Total Sample Size = ", length(data_trans$Sample)))
    stop('The total sample size must be > 7!')
  }

  # Basic EDA statistics added to the incoming data:

  ifelse(
        length(data_trans$Sample) > 7 & max(unique(data_trans$Trial)) > 2 & max(unique(data_trans$Sample)) > 7,

      # ----------IF TRUE:

        sampled_data_w_stats <- data_trans %>%
          dplyr::group_by(Trial) %>%
          dplyr::mutate(x_bar = round(mean(Result), 3),
                        m_bar = round(median(Result), 3),
                        r_bar = round(diff(range(Result)), 3),
                        sk = round(skewness(Result), 3),
                        kt = round(kurtosis(Result), 3),
                        xm_diff = round(x_bar - m_bar, 3),
                        AD.pv = round(ad.test(Result)$p.value, 4),
                        SW.pv = round(shapiro.test(Result)$p.value, 4),
                        Sub_Norm = factor(ifelse(AD.pv > 0.05 & SW.pv > 0.05, "1. Normal (AD & SW)",
                                          ifelse(AD.pv > 0.05, "2. AD Normal",
                                                ifelse(SW.pv > 0.05, "3. SW Normal",
                                                       "4. Not Normal"))),
                                          levels = c("1. Normal (AD & SW)",
                                                    "2. AD Normal",
                                                    "3. SW Normal",
                                                    "4. Not Normal")),
                    l.rsq = round(summary(lm(Result ~ Sample))$r.squared, 3),
                    l.m = round(summary(lm(Result ~ Sample))$coefficients[2], 3),
                    l.m.pv = round(summary(lm(Result ~ Sample))$coefficients[8], 3),
                    l.int = round(summary(lm(Result ~ Sample))$coefficients[1], 3),
                    l.int.pv = round(summary(lm(Result ~ Sample))$coefficients[7], 3),
                    Sub_Lin = factor(ifelse(l.m.pv > 0.01, "No Linearity",
                                            "Linearity"),
                                    levels = c("No Linearity",
                                                "Linearity"))) %>%
          dplyr::ungroup(),

       # **********IF FALSE:

        ifelse(
              length(data_trans$Sample) > 7 & max(unique(data_trans$Trial)) > 2 & max(unique(data_trans$Sample)) > 2,

              # ----------IF TRUE

              sampled_data_w_stats <- data_trans %>%
                  dplyr::group_by(Trial) %>%
                  dplyr::mutate(x_bar = round(mean(Result), 3),
                                m_bar = round(median(Result), 3),
                                r_bar = round(diff(range(Result)), 3),
                                sk = round(skewness(Result), 3),
                                kt = round(kurtosis(Result), 3),
                                xm_diff = round(x_bar - m_bar, 3),
                                SW.pv = round(shapiro.test(Result)$p.value, 4),
                                Sub_Norm = factor(ifelse(SW.pv > 0.05, "1. SW Normal",
                                                  "2. Not Normal"),
                                                  levels = c("1. SW Normal",
                                                             "2. Not Normal")),
                                l.rsq = round(summary(lm(Result ~ Sample))$r.squared, 3),
                                l.m = round(summary(lm(Result ~ Sample))$coefficients[2], 3),
                                l.m.pv = round(summary(lm(Result ~ Sample))$coefficients[8], 3),
                                l.int = round(summary(lm(Result ~ Sample))$coefficients[1], 3),
                                l.int.pv = round(summary(lm(Result ~ Sample))$coefficients[7], 3),
                                Sub_Lin = factor(ifelse(l.m.pv > 0.01, "No Linearity",
                                                        "Linearity"),
                                                        levels = c("No Linearity",
                                                                   "Linearity"))) %>%
                  dplyr::ungroup(),

              # **********IF FALSE

              sampled_data <- data_trans

              )
        )

  if(length(data_trans$Sample) > 7 & max(unique(data_trans$Trial)) > 2 & max(unique(data_trans$Sample)) > 2){

  # Next we condense the data down to a single summarised data table

  summary_stats <- sampled_data_w_stats %>%
    dplyr::select(-Sample) %>%
    dplyr::distinct(Trial, .keep_all = T)

  #  Now lets start creating our initial plots

  stats_xm_diff <- ggplot(summary_stats, aes(x = Trial)) +
    geom_point(aes(y = xm_diff), col = "black", size = 1.5) +
    geom_smooth(aes(y = xm_diff), method = "loess",
                col = "Blue", fill = "skyblue", size = 1) +
    geom_hline(aes(yintercept = mean(summary_stats$xm_diff),
                   linetype = "Average Diff."), color = "Black") +
    labs(title = "EDA: Running Average of Mean-Median with 95% CI",
         y = "Statistic Value",
         x = "Each Sample Trial (t)") +
    geom_hline(aes(yintercept = 0, linetype = "Normality"), color = "Red") +
    scale_linetype_manual(name = "Lines", values = c(2, 2),
                          guide = guide_legend(override.aes = list(color = c("Black", "Red")))) +
    theme_tufte()

  stats_sk <- ggplot(summary_stats, aes(x = Trial)) +
    geom_point(aes(y = sk), col = "black", size = 1.5) +
    geom_smooth(aes(y = sk), method = "loess",
                col = "green4", fill = "seagreen1", size = 1) +
    labs(title = "EDA: Running Average of Skewness with 95% CI",
         y = "Statistic Value",
         x = "Each Sample Trial (t)") +
    geom_hline(aes(yintercept = mean(summary_stats$sk),
                   linetype = "Average Skew"), color = "Black") +
    geom_hline(aes(yintercept = 0, linetype = "Normality"), color = "Red") +
    scale_linetype_manual(name = "Lines", values = c(2, 2),
                          guide = guide_legend(override.aes = list(color = c("Black", "Red")))) +
    theme_tufte()

  stats_kt <- ggplot(summary_stats, aes(x = Trial)) +
    geom_point(aes(y = kt), col = "black", size = 1.5) +
    geom_smooth(aes(y = kt), method = "loess",
                col = "darkorchid4", fill = "magenta", size = 1) +
    labs(title = "EDA: Running Average of Kurtosis with 95% CI",
         y = "Statistic Value",
         x = "Each Sample Trial (t)") +
    geom_hline(aes(yintercept = mean(summary_stats$kt),
                   linetype = "Average Kurt"), color = "Black") +
    geom_hline(aes(yintercept = 0, linetype = "Normality"), color = "Red") +
    scale_linetype_manual(name = "Lines", values = c(2, 2),
                          guide = guide_legend(override.aes = list(color = c("Black", "Red")))) +
    theme_tufte()

  lm_testing <- ggplot(sampled_data_w_stats, aes(x = factor(Trial), y = Result)) +
    geom_boxplot(aes(fill = Sub_Lin)) +
    labs(title = "EDA: Testing Order Linearity for each Sample Set Drawn",
         y = "Measured Value") +
    theme_tufte() +
    scale_x_discrete(name = "Each Sample Trial (t)",
                     breaks = seq(1, max(unique(sampled_data_w_stats$Trial)),
                                  0.05*max(unique(sampled_data_w_stats$Trial))
                                  )) +
    scale_fill_manual("Linearity Testing",
                      values = c("Green",
                                 "Red"),
                      drop = FALSE)

  ifelse(length(levels(sampled_data_w_stats$Sub_Norm)) > 2,
         norm.values <- c("royalblue1", "springgreen1", "sienna1", "red"),
         norm.values <- c("royalblue", "red"))

  norm_testing <- ggplot(sampled_data_w_stats, aes(x = factor(Trial), y = Result)) +
    geom_boxplot(aes(fill = Sub_Norm)) +
    labs(title = "EDA: Testing Normality for each Sample Set Drawn",
         y = "Measured Value") +
    theme_tufte() +
    scale_x_discrete(name = "Each Sample Trial (t)",
                     breaks = seq(1, max(unique(sampled_data_w_stats$Trial)),
                                  0.05*max(unique(sampled_data_w_stats$Trial))
                     )) +
    scale_fill_manual(name = "Normality Testing",
                      values = norm.values,
                      drop = FALSE)

  sampled_data_diff_ttest <- summary_stats %>%
    dplyr::do(Model = t.test(summary_stats$xm_diff, mu = 0)) %>%
    tidy(Model) %>%
    dplyr::mutate(Test = method, Alternative = alternative) %>%
    dplyr::mutate(Test = ifelse(Test == "One Sample t-test",
                                "(H_0: Diff = 0)",
                                NA),
                  Alternative = ifelse(Alternative == "two.sided",
                                       "(H_1: Diff /= 0)",
                                       NA)) %>%
    dplyr::select(Test, Alternative, p.value, estimate, conf.low, conf.high) %>%
    dplyr::mutate(p.value = round(p.value , 3),
                  estimate = round(estimate, 3),
                  conf.low = round(conf.low, 3),
                  conf.high = round(conf.high, 3)) %>%
    gather(key = ID, value = One_Sample_t_test) %>%
    tableGrob()

  sampled_data_sk_ttest <- summary_stats %>%
    dplyr::do(Model = t.test(summary_stats$sk, mu = 0)) %>%
    tidy(Model) %>%
    dplyr::mutate(Null = method, Alt. = alternative) %>%
    dplyr::mutate(Null = ifelse(Null == "One Sample t-test",
                                "H_0: Skew = 0)",
                                NA),
                  Alt. = ifelse(Alt. == "two.sided",
                                "(H_1: Skew /= 0)",
                                NA)) %>%
    dplyr::select(Null, Alt., p.value, estimate, conf.low, conf.high) %>%
    dplyr::mutate(p.value = round(p.value , 3),
                  estimate = round(estimate, 3),
                  conf.low = round(conf.low, 3),
                  conf.high = round(conf.high, 3)) %>%
    gather(key = ID, value = One_Sample_t_test) %>%
    tableGrob()

  sampled_data_kt_ttest <- summary_stats %>%
    dplyr::do(Model = t.test(summary_stats$kt, mu = 0)) %>%
    tidy(Model) %>%
    dplyr::mutate(Null = method, Alt. = alternative) %>%
    dplyr::mutate(Null = ifelse(Null == "One Sample t-test",
                                "H_0: Kurt = 0)",
                                NA),
                  Alt. = ifelse(Alt. == "two.sided",
                                "(H_1: Kurt /= 0)",
                                NA)) %>%
    dplyr::select(Null, Alt., p.value, estimate, conf.low, conf.high) %>%
    dplyr::mutate(p.value = round(p.value , 3),
                  estimate = round(estimate, 3),
                  conf.low = round(conf.low, 3),
                  conf.high = round(conf.high, 3)) %>%
    gather(key = ID, value = One_Sample_t_test) %>%
    tableGrob()

  sampled_data_EQB_ANOVA <- data_trans %>%
    dplyr::do(Model = bartlett.test(Result ~ Trial, data = .)) %>%
    tidy(Model) %>%
    dplyr::mutate(Test = method) %>%
    dplyr::select(Test, p.value)

  sampled_data_EQL_ANOVA <- data_trans %>%
    dplyr::do(Model = leveneTest(Result ~ factor(Trial), data = ., center = mean)) %>%
    tidy(Model) %>%
    dplyr::mutate(Test = term) %>%
    dplyr::filter(Test == "group") %>%
    dplyr::select(Test, p.value) %>%
    dplyr::mutate(Test = ifelse(Test == "group",
                                "Levene's test of homogeneity of variances"))

  sampled_data_ANOVA_EV <- data_trans %>%
    dplyr::do(Model = oneway.test(Result ~ Trial, data = ., var.equal = T)) %>%
    tidy(Model) %>%
    dplyr::select(method, p.value) %>%
    dplyr::mutate(Test = method) %>%
    dplyr::select(Test, p.value) %>%
    dplyr::mutate(Test = ifelse(Test == "One-way analysis of means",
                                "One Way ANOVA (Assumming Equal Var.)",
                                NA))

  sampled_data_ANOVA_NEV <- data_trans %>%
    dplyr::do(Model = oneway.test(Result ~ Trial, data = .)) %>%
    tidy(Model) %>%
    dplyr::select(method, p.value) %>%
    dplyr::mutate(Test = method) %>%
    dplyr::select(Test, p.value) %>%
    dplyr::mutate(Test = ifelse(Test == "One-way analysis of means (not assuming equal variances)",
                                "One Way ANOVA (Not Assuming Equal Var.)",
                                NA))

  sampled_data_ANOVA_KW <- data_trans %>%
    dplyr::do(Model = kruskal.test(Result ~ Trial, data = .)) %>%
    tidy(Model) %>%
    dplyr::select(method, p.value) %>%
    dplyr::mutate(Test = method) %>%
    dplyr::select(Test, p.value)

  frac_non_normal <- summary_stats %>%
    dplyr::summarise(frac = mean(Sub_Norm == "4. Not Normal")) %>%
    dplyr::mutate(Test = paste("Franction Not Normal =", frac)) %>%
    dplyr::mutate(p.value = NA) %>%
    dplyr::select(Test, p.value)

  # Main ANOVA Table to be reported

  ANOVA_results.final <- rbind(sampled_data_EQB_ANOVA,
                         sampled_data_EQL_ANOVA,
                         sampled_data_ANOVA_EV,
                         sampled_data_ANOVA_NEV,
                         sampled_data_ANOVA_KW,
                         frac_non_normal) %>%
    dplyr::mutate(p.value = round(p.value, 4))

  # Main ANOVA Table to be reported in a grob

  ANOVA_results <-  ANOVA_results.final %>%
                        tableGrob()

  LM <- lm(x_bar ~ Trial, data = summary_stats)
  R2 <- c(summary(LM)$r.squared, NA, NA, NA)
  LM_coeff <- summary(LM)$coefficients
  reg_tbl <- as.data.frame(as.matrix(round(rbind(LM_coeff, R2), 4))) %>%
    dplyr::mutate(p.value = `Pr(>|t|)`) %>%
    dplyr::select(Estimate, p.value)
  Linear_Model_Overall <- c("Int", "Slope", "R_2")

  frac_linearity <- summary_stats %>%
    dplyr::summarise(frac = round(1.0 - mean(Sub_Lin == "No Linearity"), 2)) %>%
    dplyr::mutate(Linear_Model_Overall = "Franction w/ Linearity") %>%
    dplyr::mutate(Estimate = frac,
                  p.value = NA) %>%
    dplyr::select(-frac)

  # Main Linear Model Table to be reported

  LM_tbl.final <- cbind(reg_tbl, Linear_Model_Overall) %>%
    dplyr::select(Linear_Model_Overall, Estimate, p.value) %>%
    rbind(. , frac_linearity)

  # Main Linear Model Table to be reported in a grob

  LM_tbl <- LM_tbl.final %>%
              tableGrob()

  #  Put it all together

  grid.tbl.1 <- arrangeGrob(norm_testing, lm_testing,
                             ANOVA_results, LM_tbl,
                             layout_matrix = rbind(c(1, 2),
                                                   c(3, 4)),
                             widths = c(1, 1))

  grid.tbl.2 <- arrangeGrob(stats_xm_diff, stats_sk, stats_kt,
                             sampled_data_diff_ttest, sampled_data_sk_ttest,
                             sampled_data_kt_ttest,
                             layout_matrix = rbind(c(1, 4),
                                                   c(2, 5),
                                                   c(3, 6)),
                             widths = c(1.5, 1))

  #  If the samples are statistically shown to belong to one population,
  #  grouping is irrelavant.  Thus, we can look at how the samples
  #  can be used to build a single population.

  pop_dist <- data_trans %>%
    dplyr::summarise(Mean = round(mean(Result), 3),
                     Median = round(median(Result), 3),
                     St.Dev = round(sd(Result), 3),
                     Skew = round(skewness(Result), 3),
                     Kurt = round(kurtosis(Result), 3),
                     SW.pv = round(shapiro.test(Result)$p.value, 3),
                     AD.pv = round(ad.test(Result)$p.value, 3)) %>%
    gather(key = Statistic, value = Value) %>%
    tableGrob()

  CLT_dist <- summary_stats %>%
    dplyr::summarise(CLT_Mean = round(mean(x_bar), 3),
                     CLT_Median = round(median(x_bar), 3),
                     CLT_St.Dev = round(sd(x_bar), 3),
                     CLT_Skew = round(skewness(x_bar), 3),
                     CLT_Kurt = round(kurtosis(x_bar), 3),
                     CLT_SW.pv = round(shapiro.test(x_bar)$p.value, 3),
                     CLT_AD.pv = round(ad.test(x_bar)$p.value, 3)) %>%
    gather(key = Statistic, value = Value) %>%
    tableGrob()

  pop_dist_plot <- ggplot(data_trans, aes(x = Result)) +
    geom_histogram(fill = "Blue", alpha = 0.3) +
    labs(title = "EDA of Sampled Data",
         subtitle = "Density Plot of the Individual Values",
         y = "Percent",
         x = "Measured Value") +
    theme_tufte()

  CLT_dist_plot <- ggplot(summary_stats, aes(x = x_bar)) +
    geom_histogram(fill = "Green", alpha = 0.3) +
    labs(title = "EDA of Sampled Data",
         subtitle = "Density Plot of the Means of the Samples",
         y = "Percent",
         x = "Calculated Value") +
    theme_tufte()

  #  Again, lets put it all together

  grid.tbl.3 <- arrangeGrob(pop_dist_plot, pop_dist,
                            CLT_dist_plot, CLT_dist,
                            layout_matrix = rbind(c(1, 3),
                                                  c(2, 4)),
                            widths = c(1, 1))


  final.return <- list("Transformed_Date" = data_trans,
                       "Summary_Stats" = summary_stats,
                       "XM_Plot" = stats_xm_diff,
                       "Skew_Plot" = stats_sk,
                       "Kurt_Plot" = stats_kt,
                       "LM_Plot" = lm_testing,
                       "Norm_Plot" = norm_testing,
                       "ANOVA_Data" = ANOVA_results.final,
                       "LM_Data" = LM_tbl.final,
                       "grid.newpage_draw_plot1" = grid.tbl.1,
                       "grid.newpage_draw_plot2" = grid.tbl.2,
                       "grid.newpage_draw_plot3" = grid.tbl.3)

  return(final.return)

  } else {

    if(!is.null(sigma_pop)){

    ############# KNOWN Population Sigma ########################

    # When the sample size is <= 2 we simply drop the trial indicator and treat each sample as its own unique trial:

    n <- matrix(length(sampled_data$Result), nrow = 1, ncol = 1,
                dimnames = list("n", "Statistic"))

    Quantiles <- matrix(quantile(sampled_data$Result), nrow = 5, ncol = 1,
                        dimnames = list(c("Min", "1st Q", "Med", "3rd Q", "Max"),
                                        c("Statistic")))

    IQR <- matrix(Quantiles[4] - Quantiles[2], nrow = 1, ncol = 1,
                  dimnames = list("IQR", "Statistic"))

    xbar <- matrix(round(mean(sampled_data$Result), 2), nrow = 1, ncol = 1,
                   dimnames = list("xbar", "Statistic"))

    xbar_med <- matrix(round(xbar-Quantiles[3], 2), nrow = 1, ncol = 1,
                       dimnames = list("xbar-Med", "Statistic"))

    Skew <- matrix(round(skewness(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
                   dimnames = list("Skew", "Statistic"))

    Kurt <- matrix(round(kurtosis(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
                   dimnames = list("Kurt", "Statistic"))

    s <- matrix(round(sd(sampled_data$Result), 3), nrow = 1, ncol = 1,
                dimnames = list("s", "Statistic"))

    se <- matrix(round(sigma_pop/sqrt(length(sampled_data$Result)), 3), nrow = 1,
                 ncol = 1, dimnames = list("se", "Statistic"))

    # Given that sigma is known we want to find the two-sided margin of error using the
    # z-score method

    ME.Z.2S <- matrix(round(qnorm(p.CI)*se, 3), nrow = 1,
                      ncol = 1, dimnames = list("ME.Z.2S", "Statistic"))

    # We should also include checks for normality

    AD.p.value <- matrix(round(ad.test(sampled_data$Result)$p.value, 3), nrow = 1,
                         ncol = 1, dimnames = list("AD.p.value", "Statistic"))

    SW.p.value <- matrix(round(shapiro.test(sampled_data$Result)$p.value, 3), nrow = 1,
                         ncol = 1, dimnames = list("SW.p.value", "Statistic"))

    #  Lets put it all together in a table for reporting

    sample_stats <- as.data.frame(
      rbind(n, Quantiles, IQR, xbar, xbar_med, Skew, Kurt, s, se, ME.Z.2S, AD.p.value,
            SW.p.value))

    tableStats <- tableGrob(sample_stats)

    if(max(unique(sampled_data$Trial)) < 3){
      sampled_data_t <- sampled_data
      sampled_data_t$Trial <- sampled_data_t$Sample
    } else {
      sampled_data_t <- sampled_data
    }

    # Lets check linearity.

    l_model <- lm(Result ~ Trial, data = sampled_data_t)

    R_2 <- c(round(summary(l_model)$r.squared, 3), NA, NA, NA)

    l_sum <- summary(l_model)$coefficients

    # To report on our linear model (lm) lets put the info in a table ...

    reg_tbl <- as.data.frame(t(as.matrix(round(rbind(l_sum, R_2), 4))))

    names(reg_tbl) <- c("Int", "Slope", "R_2")

    tablelm <- tableGrob(reg_tbl)

    #  Lets also create a histogram of our "single" population ...

    hg <- ggplot(sampled_data, aes(x = Result)) +
      geom_histogram(col = "black", fill = "steelblue") +
      geom_rug() +
      ggtitle("Histogram of Sample") +
      xlab("Result") +
      theme_tufte()

    #  ... as well as a boxplot of our "single" population ...

    bp <- ggplot(sampled_data, aes(x = 1, y = Result)) +
      geom_boxplot() +
      coord_flip() +
      ggtitle("Boxplot of Sample") +
      ylab("Result") +
      theme_tufte() +
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks.y = element_blank())

    #  ... and a linear regression (lm) graph ...

    lmp <- ggplot(sampled_data, aes(x = Trial, y = Result)) +
      geom_point(col = "Black") +
      geom_smooth(method = "lm", se = F, col = "Red", size = 1) +
      ggtitle("Linear Regresion vs. Order") +
      ylab("Result") +
      theme_tufte()

    #  ... with the lm residual graphs for fitted & order along with a histogram ...

    lmp_r <- ggplot() +
      geom_point(aes(x = l_model$fitted.values, y = l_model$residuals)) +
      geom_hline(yintercept = 0, col = "red", size = 1) +
      ggtitle("Residuals Versus the Fitted Values") +
      xlab("Fitted Values") +
      ylab("Residual") +
      theme_tufte()

    lmp_rord <- ggplot() +
      geom_point(aes(x = sampled_data$Trial, y = l_model$residuals)) +
      geom_hline(yintercept = 0, col = "red", size = 1) +
      ggtitle("Residuals Versus the Order") +
      xlab("Observation Order") +
      ylab("Residual") +
      theme_tufte()

    lmp_rhist <- ggplot() +
      geom_histogram(aes(x = l_model$residuals),
                     col = "black",
                     fill = "steelblue") +
      ggtitle("Histogram of the Residuals") +
      xlab("Residual") +
      theme_tufte()

    #  ... and we will include the Normality qq plot

    norm_qq <- ggplot(sampled_data, aes(sample = Result)) +
      stat_qq() +
      stat_qq_line(color = "red", size = 1) +
      ggtitle("Normality QQ Plot") +
      xlab("Theoretical") +
      ylab("Sample") +
      theme_tufte()

    # Lets view all our exploratory data analysis

    grid.tbl.1 <- arrangeGrob(tableStats, hg, bp,
                 layout_matrix = rbind(c(1, 2, 2),
                                       c(1, 3, 3)),
                 widths = c(1, 1, 1))

    grid.tbl.2 <- arrangeGrob(tablelm, lmp, lmp_r, lmp_rhist, norm_qq, lmp_rord,
                 layout_matrix = rbind(c(1, 2, 3),
                                       c(4, 5, 6)),
                 widths = c(1, 1, 1))

    final.return <- list("Transformed_Date" = data_trans,
                         "Summary_Stats" = sample_stats,
                         "Histogram" = hg,
                         "Boxplot" = bp,
                         "LM_plot" = lmp,
                         "LM_Residual_Plot" = lmp_r,
                         "LM_Residual_Order_Plot" = lmp_rord,
                         "LM_Residual_Histogram" = lmp_rhist,
                         "Nomrmal_qq" = norm_qq,
                         "grid.newpage_draw_plot1" = grid.tbl.1,
                         "grid.newpage_draw_plot2" = grid.tbl.2)

    return(final.return)

    } else {

      ############# UNKNOWN Population Sigma ########################

      # Lets generate some basic exploratory statistics of our sample

      n <- matrix(length(sampled_data$Result), nrow = 1, ncol = 1,
                  dimnames = list("n", "Statistic"))

      Quantiles <- matrix(quantile(sampled_data$Result), nrow = 5, ncol = 1,
                          dimnames = list(c("Min", "1st Q", "Med", "3rd Q", "Max"),
                                          c("Statistic")))

      IQR <- matrix(Quantiles[4] - Quantiles[2], nrow = 1, ncol = 1,
                    dimnames = list("IQR", "Statistic"))

      xbar <- matrix(round(mean(sampled_data$Result), 2), nrow = 1, ncol = 1,
                     dimnames = list("xbar", "Statistic"))

      xbar_med <- matrix(round(xbar-Quantiles[3], 2), nrow = 1, ncol = 1,
                         dimnames = list("xbar-Med", "Statistic"))

      Skew <- matrix(round(skewness(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
                     dimnames = list("Skew", "Statistic"))

      Kurt <- matrix(round(kurtosis(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
                     dimnames = list("Kurt", "Statistic"))

      s <- matrix(round(sd(sampled_data$Result), 3), nrow = 1, ncol = 1,
                  dimnames = list("s", "Statistic"))

      se <- matrix(round(s/sqrt(length(sampled_data$Result)), 3), nrow = 1,
                   ncol = 1, dimnames = list("se", "Statistic"))

      # Given that sigma is UNKNOWN we want to find the two-sided margin of error using the
      # t-score method

      ME.t.2S <- matrix(round(qt(p = p.CI, df = n-1)*se, 3), nrow = 1,
                        ncol = 1, dimnames = list("ME.t.2S", "Statistic"))

      # We should also include checks for normality

      AD.p.value <- matrix(round(ad.test(sampled_data$Result)$p.value, 3), nrow = 1,
                           ncol = 1, dimnames = list("AD.p.value", "Statistic"))

      SW.p.value <- matrix(round(shapiro.test(sampled_data$Result)$p.value, 3), nrow = 1,
                           ncol = 1, dimnames = list("SW.p.value", "Statistic"))

      #  Lets put it all together in a table for reporting

      sample_stats <- as.data.frame(
        rbind(n, Quantiles, IQR, xbar, xbar_med, Skew, Kurt, s, se, ME.t.2S, AD.p.value,
              SW.p.value))

      tableStats <- tableGrob(sample_stats)

      if(max(unique(sampled_data$Trial)) < 3){
        sampled_data_t <- sampled_data
        sampled_data_t$Trial <- sampled_data_t$Sample
      } else {
        sampled_data_t <- sampled_data
      }

      # Lets check linearity.

      l_model <- lm(Result ~ Trial, data = sampled_data_t)

      R_2 <- c(round(summary(l_model)$r.squared, 3), NA, NA, NA)

      l_sum <- summary(l_model)$coefficients

      # To report on our linear model (lm) lets put the info in a table ...

      reg_tbl <- as.data.frame(t(as.matrix(round(rbind(l_sum, R_2), 4))))

      names(reg_tbl) <- c("Int", "Slope", "R_2")

      tablelm <- tableGrob(reg_tbl)

      #  Lets also create a histogram of our "single" population ...

      hg <- ggplot(sampled_data, aes(x = Result)) +
        geom_histogram(col = "black", fill = "steelblue") +
        geom_rug() +
        ggtitle("Histogram of Sample") +
        xlab("Result") +
        theme_tufte()

      #  ... as well as a boxplot of our "single" population ...

      bp <- ggplot(sampled_data, aes(x = 1, y = Result)) +
        geom_boxplot() +
        coord_flip() +
        ggtitle("Boxplot of Sample") +
        ylab("Result") +
        theme_tufte() +
        theme(axis.title.y = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks.y = element_blank())

      #  ... and a linear regression (lm) graph ...

      lmp <- ggplot(sampled_data, aes(x = Trial, y = Result)) +
        geom_point(col = "Black") +
        geom_smooth(method = "lm", se = F, col = "Red", size = 1) +
        ggtitle("Linear Regresion vs. Order") +
        ylab("Result") +
        theme_tufte()

      #  ... with the lm residual graphs for fitted & order along with a histogram ...

      lmp_r <- ggplot() +
        geom_point(aes(x = l_model$fitted.values, y = l_model$residuals)) +
        geom_hline(yintercept = 0, col = "red", size = 1) +
        ggtitle("Residuals Versus the Fitted Values") +
        xlab("Fitted Values") +
        ylab("Residual") +
        theme_tufte()

      lmp_rord <- ggplot() +
        geom_point(aes(x = sampled_data$Trial, y = l_model$residuals)) +
        geom_hline(yintercept = 0, col = "red", size = 1) +
        ggtitle("Residuals Versus the Order") +
        xlab("Observation Order") +
        ylab("Residual") +
        theme_tufte()

      lmp_rhist <- ggplot() +
        geom_histogram(aes(x = l_model$residuals),
                       col = "black",
                       fill = "steelblue") +
        ggtitle("Histogram of the Residuals") +
        xlab("Residual") +
        theme_tufte()

      #  ... and we will include the Normality qq plot

      norm_qq <- ggplot(sampled_data, aes(sample = Result)) +
        stat_qq() +
        stat_qq_line(color = "red", size = 1) +
        ggtitle("Normality QQ Plot") +
        xlab("Theoretical") +
        ylab("Sample") +
        theme_tufte()

      # Lets view all our exploratory data analysis

      grid.tbl.1 <- arrangeGrob(tableStats, hg, bp,
                   layout_matrix = rbind(c(1, 2, 2),
                                         c(1, 3, 3)),
                   widths = c(1, 1, 1))

      grid.tbl.2 <- arrangeGrob(tablelm, lmp, lmp_r, lmp_rhist, norm_qq, lmp_rord,
                   layout_matrix = rbind(c(1, 2, 3),
                                         c(4, 5, 6)),
                   widths = c(1, 1, 1))

      final.return <- list("Transformed_Date" = data_trans,
                           "Summary_Stats" = sample_stats,
                           "Histogram" = hg,
                           "Boxplot" = bp,
                           "LM_plot" = lmp,
                           "LM_Residual_Plot" = lmp_r,
                           "LM_Residual_Order_Plot" = lmp_rord,
                           "LM_Residual_Histogram" = lmp_rhist,
                           "Nomrmal_qq" = norm_qq,
                           "grid.newpage_draw_plot1" = grid.tbl.1,
                           "grid.newpage_draw_plot2" = grid.tbl.2)

      return(final.return)

    }

  }

  options(warn=0)

}
upi-qe/qualityr documentation built on Nov. 5, 2019, 11:05 a.m.