#' Fast Single Population & Normality EDA from a Sampling Plan
#'
#' @param data A data frame in the long format with three columns representing the "Trial" number for investigating
#' Reproducability, the "Sample" number for investigating Repeatability, and the "Result" column consisting of the measured
#' value. Note, all three columns are numeric columns but only the "Result" column needs to be so and should be a continous
#' variable. The "EDAnorm" function is not intended to be used on discrete data as it's a normality assessement. It should also
#' be noted that the "EDAnorm" function was created to be compatable with the output from the "RCSample" function where the
#' simulated data from the "RCSample" function can be easily feed into the "EDAnorm" function for quick single population and
#' normality EDA. In addition, there are eight data files already created, "EDF1" - "EDF8", that cover differnt types of
#' sampling examples from a manufacturing process with some of them including process shifts. These files cover the situation
#' when the number of "Trials" is > 2 and when the number of "Sample" is > 2 for investigating Repeatability and Reproducability.
#' For the case when the number of "Sample" is <= 2 OR the number of "Trial" is < 3, the "Trial" column is simply ignored and
#' the sampling plan is assummed to consist of single samples pulled and only the underlying population is investigated.
#' There are five data files already created, "CI1" - "CI5", that cover this situation. In this instance only a single
#' population analysis is performed.
#' @param Trial.coded "Trial" is defined as the outer stage of the sampling plan describing the processes reproducability. This
#' can be thought of as the time intervals in which a defined number of consecutive "Sample" is drawn and tested from the
#' manufacturing process.
#' @param Sample.coded "Sample" is defined as the inner stage of the sampling plan describing the processes repeatability.
#' NOTE: The total number of "Sample" must be > 7 for this analysis. This is due to the normality assessemnt performed in
#' which "Sample" <= 7 are subject to false interpretations of the normality assessments.
#' @param Result.coded "Result" is defined as the output from the "Trial" and "Sample" stages measurments.
#' @param sigma_pop In the event the total number of "Trial" is < 2 OR the total number of "Sample" <= 2 for each "Trail" a
#' single population EDA is executed. This results in two sided standard error analysis and allows for one to enter a
#' defined / "KNOWN" poplation standard deviation. If this is "UNKNOWN" the NULL value simply uses the "t" statistic for
#' approximating the standard error of the mean.
#' @param p.CI In the event the total number of "Trial" is < 2 OR the total number of "Sample" <= 2 for each "Trail" a
#' single population EDA is executed. This results in a two sided standard error analysis and allows for one to enter a defined
#' two sided confidence interval. By default this is set to the two sided 95% confidence interval of 0.975. If another
#' two sided confidence interval is needed simply change "p.CI" to the appropriate value.
#' @return The output is a single list containing many of the statistic data set created along with the individual plots created
#' from the inputs, i.e. \code{data}, \code{Trial.coded}, \code{Sample.coded}, \code{Result.coded}, \code{sigma_pop},
#' &/or \code{p.CI}. As shown below it is recommended to save the output of the function to a defined list variable where the
#' items from the list can be called out specifically.
#' @export
#' @examples
#'
#' # When the data is already in the corret format:
#'
#' saved.output.1 <- EDAnorm(data = EDF1)
#' saved.output.2 <- EDAnorm(data = CI1, sigma_pop = 7.2)
#'
#' # If the "Trial", "Sample", and "Result" are defined deferently than what is expressed, simply call them out accordingly.
#' # In the examples above the "EDF1" & "CI1" data set columns are already coded accordingly and so you don't need to explicitly
#' # call them out in the function. However, if you do you will still get the same result.
#'
#' saved.output.3 <- EDAnorm(data = EDF1, Trial.coded = EDF1$Trial, Sample.coded = EDF1$Sample, Result.coded = EDF1$Result)
#'
#' # To view the last two- three "grid" plots generated in the final listed output, e.g. "grid.newpage_draw_plot1",
#' # you will need to do the following:
#'
#' #grid.newpage()
#' #grid.draw(saved.output.1$grid.newpage_draw_plot1)
#'
#' #grid.newpage()
#' #grid.draw(saved.output.1$grid.newpage_draw_plot2)
#'
#' #grid.newpage()
#' #grid.draw(saved.output.1$grid.newpage_draw_plot3)
#'
#' @import ggthemes
#' @import tidyr
#' @import broom
#' @importFrom gridExtra arrangeGrob tableGrob
#' @importFrom grid grid.newpage grid.draw
#' @importFrom stats shapiro.test oneway.test lm median bartlett.test kruskal.test quantile qnorm qt
#' @importFrom timeDate skewness kurtosis
#' @importFrom nortest ad.test
#' @importFrom car leveneTest
EDAnorm <- function(data,
Trial.coded = NULL,
Sample.coded = NULL,
Result.coded = NULL,
sigma_pop = NULL,
p.CI = 0.975
){
options(warn=-1)
# Transform the incoming data to fit the calls made below:
ifelse(
!is.null(Trial.coded) | !is.null(Sample.coded) | !is.null(Result.coded),
# ---------IF TRUE
data_trans <- as_tibble(data) %>%
dplyr::mutate(Trial = Trial.coded,
Sample = Sample.coded,
Result = Result.coded) %>%
dplyr::select(Trial, Sample, Result),
# **********IF FALSE
data_trans <- data)
if(length(data_trans$Sample) < 7) {
print(paste0("Total Sample Size = ", length(data_trans$Sample)))
stop('The total sample size must be > 7!')
}
# Basic EDA statistics added to the incoming data:
ifelse(
length(data_trans$Sample) > 7 & max(unique(data_trans$Trial)) > 2 & max(unique(data_trans$Sample)) > 7,
# ----------IF TRUE:
sampled_data_w_stats <- data_trans %>%
dplyr::group_by(Trial) %>%
dplyr::mutate(x_bar = round(mean(Result), 3),
m_bar = round(median(Result), 3),
r_bar = round(diff(range(Result)), 3),
sk = round(skewness(Result), 3),
kt = round(kurtosis(Result), 3),
xm_diff = round(x_bar - m_bar, 3),
AD.pv = round(ad.test(Result)$p.value, 4),
SW.pv = round(shapiro.test(Result)$p.value, 4),
Sub_Norm = factor(ifelse(AD.pv > 0.05 & SW.pv > 0.05, "1. Normal (AD & SW)",
ifelse(AD.pv > 0.05, "2. AD Normal",
ifelse(SW.pv > 0.05, "3. SW Normal",
"4. Not Normal"))),
levels = c("1. Normal (AD & SW)",
"2. AD Normal",
"3. SW Normal",
"4. Not Normal")),
l.rsq = round(summary(lm(Result ~ Sample))$r.squared, 3),
l.m = round(summary(lm(Result ~ Sample))$coefficients[2], 3),
l.m.pv = round(summary(lm(Result ~ Sample))$coefficients[8], 3),
l.int = round(summary(lm(Result ~ Sample))$coefficients[1], 3),
l.int.pv = round(summary(lm(Result ~ Sample))$coefficients[7], 3),
Sub_Lin = factor(ifelse(l.m.pv > 0.01, "No Linearity",
"Linearity"),
levels = c("No Linearity",
"Linearity"))) %>%
dplyr::ungroup(),
# **********IF FALSE:
ifelse(
length(data_trans$Sample) > 7 & max(unique(data_trans$Trial)) > 2 & max(unique(data_trans$Sample)) > 2,
# ----------IF TRUE
sampled_data_w_stats <- data_trans %>%
dplyr::group_by(Trial) %>%
dplyr::mutate(x_bar = round(mean(Result), 3),
m_bar = round(median(Result), 3),
r_bar = round(diff(range(Result)), 3),
sk = round(skewness(Result), 3),
kt = round(kurtosis(Result), 3),
xm_diff = round(x_bar - m_bar, 3),
SW.pv = round(shapiro.test(Result)$p.value, 4),
Sub_Norm = factor(ifelse(SW.pv > 0.05, "1. SW Normal",
"2. Not Normal"),
levels = c("1. SW Normal",
"2. Not Normal")),
l.rsq = round(summary(lm(Result ~ Sample))$r.squared, 3),
l.m = round(summary(lm(Result ~ Sample))$coefficients[2], 3),
l.m.pv = round(summary(lm(Result ~ Sample))$coefficients[8], 3),
l.int = round(summary(lm(Result ~ Sample))$coefficients[1], 3),
l.int.pv = round(summary(lm(Result ~ Sample))$coefficients[7], 3),
Sub_Lin = factor(ifelse(l.m.pv > 0.01, "No Linearity",
"Linearity"),
levels = c("No Linearity",
"Linearity"))) %>%
dplyr::ungroup(),
# **********IF FALSE
sampled_data <- data_trans
)
)
if(length(data_trans$Sample) > 7 & max(unique(data_trans$Trial)) > 2 & max(unique(data_trans$Sample)) > 2){
# Next we condense the data down to a single summarised data table
summary_stats <- sampled_data_w_stats %>%
dplyr::select(-Sample) %>%
dplyr::distinct(Trial, .keep_all = T)
# Now lets start creating our initial plots
stats_xm_diff <- ggplot(summary_stats, aes(x = Trial)) +
geom_point(aes(y = xm_diff), col = "black", size = 1.5) +
geom_smooth(aes(y = xm_diff), method = "loess",
col = "Blue", fill = "skyblue", size = 1) +
geom_hline(aes(yintercept = mean(summary_stats$xm_diff),
linetype = "Average Diff."), color = "Black") +
labs(title = "EDA: Running Average of Mean-Median with 95% CI",
y = "Statistic Value",
x = "Each Sample Trial (t)") +
geom_hline(aes(yintercept = 0, linetype = "Normality"), color = "Red") +
scale_linetype_manual(name = "Lines", values = c(2, 2),
guide = guide_legend(override.aes = list(color = c("Black", "Red")))) +
theme_tufte()
stats_sk <- ggplot(summary_stats, aes(x = Trial)) +
geom_point(aes(y = sk), col = "black", size = 1.5) +
geom_smooth(aes(y = sk), method = "loess",
col = "green4", fill = "seagreen1", size = 1) +
labs(title = "EDA: Running Average of Skewness with 95% CI",
y = "Statistic Value",
x = "Each Sample Trial (t)") +
geom_hline(aes(yintercept = mean(summary_stats$sk),
linetype = "Average Skew"), color = "Black") +
geom_hline(aes(yintercept = 0, linetype = "Normality"), color = "Red") +
scale_linetype_manual(name = "Lines", values = c(2, 2),
guide = guide_legend(override.aes = list(color = c("Black", "Red")))) +
theme_tufte()
stats_kt <- ggplot(summary_stats, aes(x = Trial)) +
geom_point(aes(y = kt), col = "black", size = 1.5) +
geom_smooth(aes(y = kt), method = "loess",
col = "darkorchid4", fill = "magenta", size = 1) +
labs(title = "EDA: Running Average of Kurtosis with 95% CI",
y = "Statistic Value",
x = "Each Sample Trial (t)") +
geom_hline(aes(yintercept = mean(summary_stats$kt),
linetype = "Average Kurt"), color = "Black") +
geom_hline(aes(yintercept = 0, linetype = "Normality"), color = "Red") +
scale_linetype_manual(name = "Lines", values = c(2, 2),
guide = guide_legend(override.aes = list(color = c("Black", "Red")))) +
theme_tufte()
lm_testing <- ggplot(sampled_data_w_stats, aes(x = factor(Trial), y = Result)) +
geom_boxplot(aes(fill = Sub_Lin)) +
labs(title = "EDA: Testing Order Linearity for each Sample Set Drawn",
y = "Measured Value") +
theme_tufte() +
scale_x_discrete(name = "Each Sample Trial (t)",
breaks = seq(1, max(unique(sampled_data_w_stats$Trial)),
0.05*max(unique(sampled_data_w_stats$Trial))
)) +
scale_fill_manual("Linearity Testing",
values = c("Green",
"Red"),
drop = FALSE)
ifelse(length(levels(sampled_data_w_stats$Sub_Norm)) > 2,
norm.values <- c("royalblue1", "springgreen1", "sienna1", "red"),
norm.values <- c("royalblue", "red"))
norm_testing <- ggplot(sampled_data_w_stats, aes(x = factor(Trial), y = Result)) +
geom_boxplot(aes(fill = Sub_Norm)) +
labs(title = "EDA: Testing Normality for each Sample Set Drawn",
y = "Measured Value") +
theme_tufte() +
scale_x_discrete(name = "Each Sample Trial (t)",
breaks = seq(1, max(unique(sampled_data_w_stats$Trial)),
0.05*max(unique(sampled_data_w_stats$Trial))
)) +
scale_fill_manual(name = "Normality Testing",
values = norm.values,
drop = FALSE)
sampled_data_diff_ttest <- summary_stats %>%
dplyr::do(Model = t.test(summary_stats$xm_diff, mu = 0)) %>%
tidy(Model) %>%
dplyr::mutate(Test = method, Alternative = alternative) %>%
dplyr::mutate(Test = ifelse(Test == "One Sample t-test",
"(H_0: Diff = 0)",
NA),
Alternative = ifelse(Alternative == "two.sided",
"(H_1: Diff /= 0)",
NA)) %>%
dplyr::select(Test, Alternative, p.value, estimate, conf.low, conf.high) %>%
dplyr::mutate(p.value = round(p.value , 3),
estimate = round(estimate, 3),
conf.low = round(conf.low, 3),
conf.high = round(conf.high, 3)) %>%
gather(key = ID, value = One_Sample_t_test) %>%
tableGrob()
sampled_data_sk_ttest <- summary_stats %>%
dplyr::do(Model = t.test(summary_stats$sk, mu = 0)) %>%
tidy(Model) %>%
dplyr::mutate(Null = method, Alt. = alternative) %>%
dplyr::mutate(Null = ifelse(Null == "One Sample t-test",
"H_0: Skew = 0)",
NA),
Alt. = ifelse(Alt. == "two.sided",
"(H_1: Skew /= 0)",
NA)) %>%
dplyr::select(Null, Alt., p.value, estimate, conf.low, conf.high) %>%
dplyr::mutate(p.value = round(p.value , 3),
estimate = round(estimate, 3),
conf.low = round(conf.low, 3),
conf.high = round(conf.high, 3)) %>%
gather(key = ID, value = One_Sample_t_test) %>%
tableGrob()
sampled_data_kt_ttest <- summary_stats %>%
dplyr::do(Model = t.test(summary_stats$kt, mu = 0)) %>%
tidy(Model) %>%
dplyr::mutate(Null = method, Alt. = alternative) %>%
dplyr::mutate(Null = ifelse(Null == "One Sample t-test",
"H_0: Kurt = 0)",
NA),
Alt. = ifelse(Alt. == "two.sided",
"(H_1: Kurt /= 0)",
NA)) %>%
dplyr::select(Null, Alt., p.value, estimate, conf.low, conf.high) %>%
dplyr::mutate(p.value = round(p.value , 3),
estimate = round(estimate, 3),
conf.low = round(conf.low, 3),
conf.high = round(conf.high, 3)) %>%
gather(key = ID, value = One_Sample_t_test) %>%
tableGrob()
sampled_data_EQB_ANOVA <- data_trans %>%
dplyr::do(Model = bartlett.test(Result ~ Trial, data = .)) %>%
tidy(Model) %>%
dplyr::mutate(Test = method) %>%
dplyr::select(Test, p.value)
sampled_data_EQL_ANOVA <- data_trans %>%
dplyr::do(Model = leveneTest(Result ~ factor(Trial), data = ., center = mean)) %>%
tidy(Model) %>%
dplyr::mutate(Test = term) %>%
dplyr::filter(Test == "group") %>%
dplyr::select(Test, p.value) %>%
dplyr::mutate(Test = ifelse(Test == "group",
"Levene's test of homogeneity of variances"))
sampled_data_ANOVA_EV <- data_trans %>%
dplyr::do(Model = oneway.test(Result ~ Trial, data = ., var.equal = T)) %>%
tidy(Model) %>%
dplyr::select(method, p.value) %>%
dplyr::mutate(Test = method) %>%
dplyr::select(Test, p.value) %>%
dplyr::mutate(Test = ifelse(Test == "One-way analysis of means",
"One Way ANOVA (Assumming Equal Var.)",
NA))
sampled_data_ANOVA_NEV <- data_trans %>%
dplyr::do(Model = oneway.test(Result ~ Trial, data = .)) %>%
tidy(Model) %>%
dplyr::select(method, p.value) %>%
dplyr::mutate(Test = method) %>%
dplyr::select(Test, p.value) %>%
dplyr::mutate(Test = ifelse(Test == "One-way analysis of means (not assuming equal variances)",
"One Way ANOVA (Not Assuming Equal Var.)",
NA))
sampled_data_ANOVA_KW <- data_trans %>%
dplyr::do(Model = kruskal.test(Result ~ Trial, data = .)) %>%
tidy(Model) %>%
dplyr::select(method, p.value) %>%
dplyr::mutate(Test = method) %>%
dplyr::select(Test, p.value)
frac_non_normal <- summary_stats %>%
dplyr::summarise(frac = mean(Sub_Norm == "4. Not Normal")) %>%
dplyr::mutate(Test = paste("Franction Not Normal =", frac)) %>%
dplyr::mutate(p.value = NA) %>%
dplyr::select(Test, p.value)
# Main ANOVA Table to be reported
ANOVA_results.final <- rbind(sampled_data_EQB_ANOVA,
sampled_data_EQL_ANOVA,
sampled_data_ANOVA_EV,
sampled_data_ANOVA_NEV,
sampled_data_ANOVA_KW,
frac_non_normal) %>%
dplyr::mutate(p.value = round(p.value, 4))
# Main ANOVA Table to be reported in a grob
ANOVA_results <- ANOVA_results.final %>%
tableGrob()
LM <- lm(x_bar ~ Trial, data = summary_stats)
R2 <- c(summary(LM)$r.squared, NA, NA, NA)
LM_coeff <- summary(LM)$coefficients
reg_tbl <- as.data.frame(as.matrix(round(rbind(LM_coeff, R2), 4))) %>%
dplyr::mutate(p.value = `Pr(>|t|)`) %>%
dplyr::select(Estimate, p.value)
Linear_Model_Overall <- c("Int", "Slope", "R_2")
frac_linearity <- summary_stats %>%
dplyr::summarise(frac = round(1.0 - mean(Sub_Lin == "No Linearity"), 2)) %>%
dplyr::mutate(Linear_Model_Overall = "Franction w/ Linearity") %>%
dplyr::mutate(Estimate = frac,
p.value = NA) %>%
dplyr::select(-frac)
# Main Linear Model Table to be reported
LM_tbl.final <- cbind(reg_tbl, Linear_Model_Overall) %>%
dplyr::select(Linear_Model_Overall, Estimate, p.value) %>%
rbind(. , frac_linearity)
# Main Linear Model Table to be reported in a grob
LM_tbl <- LM_tbl.final %>%
tableGrob()
# Put it all together
grid.tbl.1 <- arrangeGrob(norm_testing, lm_testing,
ANOVA_results, LM_tbl,
layout_matrix = rbind(c(1, 2),
c(3, 4)),
widths = c(1, 1))
grid.tbl.2 <- arrangeGrob(stats_xm_diff, stats_sk, stats_kt,
sampled_data_diff_ttest, sampled_data_sk_ttest,
sampled_data_kt_ttest,
layout_matrix = rbind(c(1, 4),
c(2, 5),
c(3, 6)),
widths = c(1.5, 1))
# If the samples are statistically shown to belong to one population,
# grouping is irrelavant. Thus, we can look at how the samples
# can be used to build a single population.
pop_dist <- data_trans %>%
dplyr::summarise(Mean = round(mean(Result), 3),
Median = round(median(Result), 3),
St.Dev = round(sd(Result), 3),
Skew = round(skewness(Result), 3),
Kurt = round(kurtosis(Result), 3),
SW.pv = round(shapiro.test(Result)$p.value, 3),
AD.pv = round(ad.test(Result)$p.value, 3)) %>%
gather(key = Statistic, value = Value) %>%
tableGrob()
CLT_dist <- summary_stats %>%
dplyr::summarise(CLT_Mean = round(mean(x_bar), 3),
CLT_Median = round(median(x_bar), 3),
CLT_St.Dev = round(sd(x_bar), 3),
CLT_Skew = round(skewness(x_bar), 3),
CLT_Kurt = round(kurtosis(x_bar), 3),
CLT_SW.pv = round(shapiro.test(x_bar)$p.value, 3),
CLT_AD.pv = round(ad.test(x_bar)$p.value, 3)) %>%
gather(key = Statistic, value = Value) %>%
tableGrob()
pop_dist_plot <- ggplot(data_trans, aes(x = Result)) +
geom_histogram(fill = "Blue", alpha = 0.3) +
labs(title = "EDA of Sampled Data",
subtitle = "Density Plot of the Individual Values",
y = "Percent",
x = "Measured Value") +
theme_tufte()
CLT_dist_plot <- ggplot(summary_stats, aes(x = x_bar)) +
geom_histogram(fill = "Green", alpha = 0.3) +
labs(title = "EDA of Sampled Data",
subtitle = "Density Plot of the Means of the Samples",
y = "Percent",
x = "Calculated Value") +
theme_tufte()
# Again, lets put it all together
grid.tbl.3 <- arrangeGrob(pop_dist_plot, pop_dist,
CLT_dist_plot, CLT_dist,
layout_matrix = rbind(c(1, 3),
c(2, 4)),
widths = c(1, 1))
final.return <- list("Transformed_Date" = data_trans,
"Summary_Stats" = summary_stats,
"XM_Plot" = stats_xm_diff,
"Skew_Plot" = stats_sk,
"Kurt_Plot" = stats_kt,
"LM_Plot" = lm_testing,
"Norm_Plot" = norm_testing,
"ANOVA_Data" = ANOVA_results.final,
"LM_Data" = LM_tbl.final,
"grid.newpage_draw_plot1" = grid.tbl.1,
"grid.newpage_draw_plot2" = grid.tbl.2,
"grid.newpage_draw_plot3" = grid.tbl.3)
return(final.return)
} else {
if(!is.null(sigma_pop)){
############# KNOWN Population Sigma ########################
# When the sample size is <= 2 we simply drop the trial indicator and treat each sample as its own unique trial:
n <- matrix(length(sampled_data$Result), nrow = 1, ncol = 1,
dimnames = list("n", "Statistic"))
Quantiles <- matrix(quantile(sampled_data$Result), nrow = 5, ncol = 1,
dimnames = list(c("Min", "1st Q", "Med", "3rd Q", "Max"),
c("Statistic")))
IQR <- matrix(Quantiles[4] - Quantiles[2], nrow = 1, ncol = 1,
dimnames = list("IQR", "Statistic"))
xbar <- matrix(round(mean(sampled_data$Result), 2), nrow = 1, ncol = 1,
dimnames = list("xbar", "Statistic"))
xbar_med <- matrix(round(xbar-Quantiles[3], 2), nrow = 1, ncol = 1,
dimnames = list("xbar-Med", "Statistic"))
Skew <- matrix(round(skewness(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
dimnames = list("Skew", "Statistic"))
Kurt <- matrix(round(kurtosis(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
dimnames = list("Kurt", "Statistic"))
s <- matrix(round(sd(sampled_data$Result), 3), nrow = 1, ncol = 1,
dimnames = list("s", "Statistic"))
se <- matrix(round(sigma_pop/sqrt(length(sampled_data$Result)), 3), nrow = 1,
ncol = 1, dimnames = list("se", "Statistic"))
# Given that sigma is known we want to find the two-sided margin of error using the
# z-score method
ME.Z.2S <- matrix(round(qnorm(p.CI)*se, 3), nrow = 1,
ncol = 1, dimnames = list("ME.Z.2S", "Statistic"))
# We should also include checks for normality
AD.p.value <- matrix(round(ad.test(sampled_data$Result)$p.value, 3), nrow = 1,
ncol = 1, dimnames = list("AD.p.value", "Statistic"))
SW.p.value <- matrix(round(shapiro.test(sampled_data$Result)$p.value, 3), nrow = 1,
ncol = 1, dimnames = list("SW.p.value", "Statistic"))
# Lets put it all together in a table for reporting
sample_stats <- as.data.frame(
rbind(n, Quantiles, IQR, xbar, xbar_med, Skew, Kurt, s, se, ME.Z.2S, AD.p.value,
SW.p.value))
tableStats <- tableGrob(sample_stats)
if(max(unique(sampled_data$Trial)) < 3){
sampled_data_t <- sampled_data
sampled_data_t$Trial <- sampled_data_t$Sample
} else {
sampled_data_t <- sampled_data
}
# Lets check linearity.
l_model <- lm(Result ~ Trial, data = sampled_data_t)
R_2 <- c(round(summary(l_model)$r.squared, 3), NA, NA, NA)
l_sum <- summary(l_model)$coefficients
# To report on our linear model (lm) lets put the info in a table ...
reg_tbl <- as.data.frame(t(as.matrix(round(rbind(l_sum, R_2), 4))))
names(reg_tbl) <- c("Int", "Slope", "R_2")
tablelm <- tableGrob(reg_tbl)
# Lets also create a histogram of our "single" population ...
hg <- ggplot(sampled_data, aes(x = Result)) +
geom_histogram(col = "black", fill = "steelblue") +
geom_rug() +
ggtitle("Histogram of Sample") +
xlab("Result") +
theme_tufte()
# ... as well as a boxplot of our "single" population ...
bp <- ggplot(sampled_data, aes(x = 1, y = Result)) +
geom_boxplot() +
coord_flip() +
ggtitle("Boxplot of Sample") +
ylab("Result") +
theme_tufte() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
# ... and a linear regression (lm) graph ...
lmp <- ggplot(sampled_data, aes(x = Trial, y = Result)) +
geom_point(col = "Black") +
geom_smooth(method = "lm", se = F, col = "Red", size = 1) +
ggtitle("Linear Regresion vs. Order") +
ylab("Result") +
theme_tufte()
# ... with the lm residual graphs for fitted & order along with a histogram ...
lmp_r <- ggplot() +
geom_point(aes(x = l_model$fitted.values, y = l_model$residuals)) +
geom_hline(yintercept = 0, col = "red", size = 1) +
ggtitle("Residuals Versus the Fitted Values") +
xlab("Fitted Values") +
ylab("Residual") +
theme_tufte()
lmp_rord <- ggplot() +
geom_point(aes(x = sampled_data$Trial, y = l_model$residuals)) +
geom_hline(yintercept = 0, col = "red", size = 1) +
ggtitle("Residuals Versus the Order") +
xlab("Observation Order") +
ylab("Residual") +
theme_tufte()
lmp_rhist <- ggplot() +
geom_histogram(aes(x = l_model$residuals),
col = "black",
fill = "steelblue") +
ggtitle("Histogram of the Residuals") +
xlab("Residual") +
theme_tufte()
# ... and we will include the Normality qq plot
norm_qq <- ggplot(sampled_data, aes(sample = Result)) +
stat_qq() +
stat_qq_line(color = "red", size = 1) +
ggtitle("Normality QQ Plot") +
xlab("Theoretical") +
ylab("Sample") +
theme_tufte()
# Lets view all our exploratory data analysis
grid.tbl.1 <- arrangeGrob(tableStats, hg, bp,
layout_matrix = rbind(c(1, 2, 2),
c(1, 3, 3)),
widths = c(1, 1, 1))
grid.tbl.2 <- arrangeGrob(tablelm, lmp, lmp_r, lmp_rhist, norm_qq, lmp_rord,
layout_matrix = rbind(c(1, 2, 3),
c(4, 5, 6)),
widths = c(1, 1, 1))
final.return <- list("Transformed_Date" = data_trans,
"Summary_Stats" = sample_stats,
"Histogram" = hg,
"Boxplot" = bp,
"LM_plot" = lmp,
"LM_Residual_Plot" = lmp_r,
"LM_Residual_Order_Plot" = lmp_rord,
"LM_Residual_Histogram" = lmp_rhist,
"Nomrmal_qq" = norm_qq,
"grid.newpage_draw_plot1" = grid.tbl.1,
"grid.newpage_draw_plot2" = grid.tbl.2)
return(final.return)
} else {
############# UNKNOWN Population Sigma ########################
# Lets generate some basic exploratory statistics of our sample
n <- matrix(length(sampled_data$Result), nrow = 1, ncol = 1,
dimnames = list("n", "Statistic"))
Quantiles <- matrix(quantile(sampled_data$Result), nrow = 5, ncol = 1,
dimnames = list(c("Min", "1st Q", "Med", "3rd Q", "Max"),
c("Statistic")))
IQR <- matrix(Quantiles[4] - Quantiles[2], nrow = 1, ncol = 1,
dimnames = list("IQR", "Statistic"))
xbar <- matrix(round(mean(sampled_data$Result), 2), nrow = 1, ncol = 1,
dimnames = list("xbar", "Statistic"))
xbar_med <- matrix(round(xbar-Quantiles[3], 2), nrow = 1, ncol = 1,
dimnames = list("xbar-Med", "Statistic"))
Skew <- matrix(round(skewness(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
dimnames = list("Skew", "Statistic"))
Kurt <- matrix(round(kurtosis(sampled_data$Result)[1], 3), nrow = 1, ncol = 1,
dimnames = list("Kurt", "Statistic"))
s <- matrix(round(sd(sampled_data$Result), 3), nrow = 1, ncol = 1,
dimnames = list("s", "Statistic"))
se <- matrix(round(s/sqrt(length(sampled_data$Result)), 3), nrow = 1,
ncol = 1, dimnames = list("se", "Statistic"))
# Given that sigma is UNKNOWN we want to find the two-sided margin of error using the
# t-score method
ME.t.2S <- matrix(round(qt(p = p.CI, df = n-1)*se, 3), nrow = 1,
ncol = 1, dimnames = list("ME.t.2S", "Statistic"))
# We should also include checks for normality
AD.p.value <- matrix(round(ad.test(sampled_data$Result)$p.value, 3), nrow = 1,
ncol = 1, dimnames = list("AD.p.value", "Statistic"))
SW.p.value <- matrix(round(shapiro.test(sampled_data$Result)$p.value, 3), nrow = 1,
ncol = 1, dimnames = list("SW.p.value", "Statistic"))
# Lets put it all together in a table for reporting
sample_stats <- as.data.frame(
rbind(n, Quantiles, IQR, xbar, xbar_med, Skew, Kurt, s, se, ME.t.2S, AD.p.value,
SW.p.value))
tableStats <- tableGrob(sample_stats)
if(max(unique(sampled_data$Trial)) < 3){
sampled_data_t <- sampled_data
sampled_data_t$Trial <- sampled_data_t$Sample
} else {
sampled_data_t <- sampled_data
}
# Lets check linearity.
l_model <- lm(Result ~ Trial, data = sampled_data_t)
R_2 <- c(round(summary(l_model)$r.squared, 3), NA, NA, NA)
l_sum <- summary(l_model)$coefficients
# To report on our linear model (lm) lets put the info in a table ...
reg_tbl <- as.data.frame(t(as.matrix(round(rbind(l_sum, R_2), 4))))
names(reg_tbl) <- c("Int", "Slope", "R_2")
tablelm <- tableGrob(reg_tbl)
# Lets also create a histogram of our "single" population ...
hg <- ggplot(sampled_data, aes(x = Result)) +
geom_histogram(col = "black", fill = "steelblue") +
geom_rug() +
ggtitle("Histogram of Sample") +
xlab("Result") +
theme_tufte()
# ... as well as a boxplot of our "single" population ...
bp <- ggplot(sampled_data, aes(x = 1, y = Result)) +
geom_boxplot() +
coord_flip() +
ggtitle("Boxplot of Sample") +
ylab("Result") +
theme_tufte() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
# ... and a linear regression (lm) graph ...
lmp <- ggplot(sampled_data, aes(x = Trial, y = Result)) +
geom_point(col = "Black") +
geom_smooth(method = "lm", se = F, col = "Red", size = 1) +
ggtitle("Linear Regresion vs. Order") +
ylab("Result") +
theme_tufte()
# ... with the lm residual graphs for fitted & order along with a histogram ...
lmp_r <- ggplot() +
geom_point(aes(x = l_model$fitted.values, y = l_model$residuals)) +
geom_hline(yintercept = 0, col = "red", size = 1) +
ggtitle("Residuals Versus the Fitted Values") +
xlab("Fitted Values") +
ylab("Residual") +
theme_tufte()
lmp_rord <- ggplot() +
geom_point(aes(x = sampled_data$Trial, y = l_model$residuals)) +
geom_hline(yintercept = 0, col = "red", size = 1) +
ggtitle("Residuals Versus the Order") +
xlab("Observation Order") +
ylab("Residual") +
theme_tufte()
lmp_rhist <- ggplot() +
geom_histogram(aes(x = l_model$residuals),
col = "black",
fill = "steelblue") +
ggtitle("Histogram of the Residuals") +
xlab("Residual") +
theme_tufte()
# ... and we will include the Normality qq plot
norm_qq <- ggplot(sampled_data, aes(sample = Result)) +
stat_qq() +
stat_qq_line(color = "red", size = 1) +
ggtitle("Normality QQ Plot") +
xlab("Theoretical") +
ylab("Sample") +
theme_tufte()
# Lets view all our exploratory data analysis
grid.tbl.1 <- arrangeGrob(tableStats, hg, bp,
layout_matrix = rbind(c(1, 2, 2),
c(1, 3, 3)),
widths = c(1, 1, 1))
grid.tbl.2 <- arrangeGrob(tablelm, lmp, lmp_r, lmp_rhist, norm_qq, lmp_rord,
layout_matrix = rbind(c(1, 2, 3),
c(4, 5, 6)),
widths = c(1, 1, 1))
final.return <- list("Transformed_Date" = data_trans,
"Summary_Stats" = sample_stats,
"Histogram" = hg,
"Boxplot" = bp,
"LM_plot" = lmp,
"LM_Residual_Plot" = lmp_r,
"LM_Residual_Order_Plot" = lmp_rord,
"LM_Residual_Histogram" = lmp_rhist,
"Nomrmal_qq" = norm_qq,
"grid.newpage_draw_plot1" = grid.tbl.1,
"grid.newpage_draw_plot2" = grid.tbl.2)
return(final.return)
}
}
options(warn=0)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.