R/dataShape.R
In rosetta: Parallel Use of Statistical Packages in Teaching

#' #' @rdname normalityAssessment
#' #' @export
#' dataShape <- function(sampleVector, na.rm=TRUE, type=2, digits=2,
#'                       conf.level=.95, plots=TRUE, xLabs = NA,
#'                       yLabs = NA, qqCI=TRUE, labelOutliers = TRUE,
#'                       sampleSizeOverride = NULL) {
#'
#'   ### Note: this is adapted from the 'skewness' and 'kurtosis' functions
#'   ### in 'e1071' and the way they're computed in 'describe' in 'psych', and
#'   ### for the variances, also based on Stan Browns page at
#'   ### https://www.tc3.edu/instruct/sbrown/stat/shape.htm, as well as of course
#'   ### on Joanes & Gill (1998)
#'
#'   res <- list(input = as.list(environment()),
#'               intermediate = list(), output = list());
#'
#'   if (any(isNA <- is.na(sampleVector))) {
#'     if (na.rm) {
#'       res$intermediate$sampleVector <- sampleVector <- sampleVector[!isNA];
#'     }
#'     else {
#'       return(NA);
#'     }
#'   } else {
#'     res$intermediate$sampleVector <- sampleVector;
#'   }
#'
#'   if (!(type %in% (1:3))) {
#'     stop("Invalid 'type' argument. This must be 1, 2 or 3; see ?skewness for more info.");
#'   }
#'
#'   ### Number of observations
#'   res$intermediate$n <- n <- length(sampleVector);
#'
#'   if (n < 3) {
#'     stop("There are only ", n, " datapoints/observations - this is too few to ",
#'          "compute a sensible estimate of the 'shape' of this data.");
#'   }
#'
#'   ### First moment (mean)
#'   res$intermediate$m1 <-       sum(sampleVector^1) / n;
#'
#'   ### Center the data
#'   res$intermediate$centeredVector <- sampleVector <- sampleVector - res$intermediate$m1;
#'
#'   ### Second moment (variance)
#'   res$intermediate$m2 <- m2 <- sum(sampleVector^2) / n;
#'
#'   ### Third moment
#'   res$intermediate$m3 <- m3 <- sum(sampleVector^3) / n;
#'
#'   ### Fourth moment
#'   res$intermediate$m4 <- m4 <- sum(sampleVector^4) / n;
#'
#'   ### Sample skewness
#'   res$intermediate$g1 <- m3 / (m2 ^ (3/2));
#'
#'   ### Sample (excess) kurtosis
#'   res$intermediate$g2 <- (m4 / (m2 ^ 2)) - 3;
#'
#'   ###  Population skewness
#'   res$intermediate$G1 <- res$intermediate$g1 *
#'                          sqrt( n * (n-1) ) /
#'                          ( n - 2 );
#'
#'   ### Population kurtosis
#'   res$intermediate$G2 <- (n-1) *
#'                          ( (n+1) * res$intermediate$g2 + 6 ) /
#'                          ( (n-2) * (n-3) );
#'
#'   ### Type 3 correction for skewness
#'   res$intermediate$b1 <- res$intermediate$g1 *
#'                          (((n-1) / n) ^ (3/2));
#'
#'   ### Type 3 correction for kurtosis
#'   res$intermediate$b2 <- (res$intermediate$g2 + 3) *
#'                          (((n-1) / n) ^ 2) - 3;
#'
#'   ### Now for the variances. We compute the variance in a normal
#'   ### population, because when we want to test the skewness and
#'   ### kurtosis, we do so under the assumption that they come from a
#'   ### normal distribution. Plus, no idea how to compute the variance
#'   ### otherwise :-)
#'
#'   ### These formulas come from the Joanes & Gill (1998) article.
#'
#'   ### Note that we use the 'overridden sample size' if one was specified.
#'   ### We do this because when this function is used
#'   if (is.numeric(sampleSizeOverride)) n <- sampleSizeOverride;
#'
#'   res$intermediate$var.g1 <- ( 6 * (n-2) ) /
#'                              ( (n+1) * (n+3) );
#'   res$intermediate$var.g2 <- ( 24*n * (n-2) * (n-3) ) /
#'                              ( (n+1)^2 * (n+3) * (n + 5) );
#'   res$intermediate$var.G1 <- res$intermediate$var.g1 *
#'                              sqrt(n * (n-1)) /
#'                              (n-2)^2;
#'   res$intermediate$var.G2 <- res$intermediate$var.g2 *
#'                              (((n-1) * (n+1)) /
#'                               ((n-2) * (n-3))) ^ 2;
#'   res$intermediate$var.b1 <- res$intermediate$var.g1 *
#'                              ((n-1) / n) ^ 3;
#'   res$intermediate$var.b2 <- res$intermediate$var.g2 *
#'                              ((n-1) / n) ^ 4;
#'
#'   ### And the standard error for the skewness and the kurtosis,
#'   ### based on the formulas on https://www.tc3.edu/instruct/sbrown/stat/shape.htm
#'   res$intermediate$se.G1 <- sqrt( ( 6 * n * (n-1) ) /
#'                                   ( (n-2) * (n+1) * (n+3) ) );
#'   res$intermediate$se.G2 <- 2 * res$intermediate$se.G1 *
#'                             sqrt( (n^2 - 1) /
#'                                   ( (n-3) * (n+5) ) );
#'
#'   ### Confidence interval for skewness and kurtosis in the population
#'   res$intermediate$zMultiplier <- stats::qnorm(1 - (1-conf.level)/2);
#'   res$intermediate$ci.G1 <- c(res$intermediate$G1 -
#'                                 res$intermediate$zMultiplier *
#'                                 res$intermediate$se.G1,
#'                               res$intermediate$G1 +
#'                                 res$intermediate$zMultiplier *
#'                                 res$intermediate$se.G1);
#'   res$intermediate$ci.G2 <- c(res$intermediate$G2 -
#'                                 res$intermediate$zMultiplier *
#'                                 res$intermediate$se.G2,
#'                               res$intermediate$G2 +
#'                                 res$intermediate$zMultiplier *
#'                                 res$intermediate$se.G2);
#'
#'   ### Z-test to test against normal distribution
#'   res$intermediate$z.G1 <- res$intermediate$G1 / res$intermediate$se.G1;
#'   res$intermediate$z.G2 <- res$intermediate$G2 / res$intermediate$se.G2;
#'   res$intermediate$p.G1 <- 2*(1-stats::pnorm(abs(res$intermediate$z.G1)));
#'   res$intermediate$p.G2 <- 2*(1-stats::pnorm(abs(res$intermediate$z.G2)));
#'
#'   ### Hartigans' Dip Test
#'   res$output$D.dip.test <- diptest::dip.test(sampleVector)$statistic[[1]];
#'   res$output$p.dip.test <- diptest::dip.test(sampleVector)$p[[1]];
#'
#'   ### Store requested estimates in output object
#'   if (type == 1) {
#'     res$output$skewness <- res$intermediate$g1;
#'     res$output$kurtosis <- res$intermediate$g2;
#'     res$output$type <- "g";
#'   }
#'   else if (type == 2) {
#'     res$output$skewness <- res$intermediate$G1;
#'     res$output$kurtosis <- res$intermediate$G2;
#'     res$output$type <- "G";
#'   }
#'   else if (type == 3)  {
#'     res$output$skewness <- res$intermediate$b1;
#'     res$output$kurtosis <- res$intermediate$b2;
#'     res$output$type <- "b";
#'   }
#'
#'   if (plots) {
#'     res$intermediate$histogram <-
#'       normalHist(res$intermediate$sampleVector)$plot;
#'     res$intermediate$qq <-
#'       ggqq(res$intermediate$sampleVector, ci=qqCI);
#'     res$intermediate$boxplot <-
#'       ggBoxplot(res$intermediate$sampleVector, labelOutliers=labelOutliers);
#'     if (!is.null(xLabs)) {
#'       if (is.na(xLabs)) {
#'         res$intermediate$histogram <-
#'           res$intermediate$histogram +
#'           ggplot2::theme(axis.title.x = ggplot2::element_blank());
#'         res$intermediate$qq <-
#'           res$intermediate$qq +
#'           ggplot2::theme(axis.title.x = ggplot2::element_blank());
#'         res$intermediate$boxplot <-
#'           res$intermediate$boxplot +
#'           ggplot2::theme(axis.title.x = ggplot2::element_blank());
#'       } else {
#'         res$intermediate$histogram <-
#'           res$intermediate$histogram + ggplot2::xlab(xLabs$hist);
#'         res$intermediate$qq <-
#'           res$intermediate$qq + ggplot2::xlab(xLabs$qq);
#'         res$intermediate$boxplot <-
#'           res$intermediate$boxplot + ggplot2::xlab(xLabs$box);
#'       }
#'     }
#'
#'     if (!is.null(yLabs)) {
#'       if (is.na(yLabs)) {
#'         res$intermediate$histogram <-
#'           res$intermediate$histogram +
#'           ggplot2::theme(axis.title.y = ggplot2::element_blank());
#'         res$intermediate$qq <-
#'           res$intermediate$qq +
#'           ggplot2::theme(axis.title.y = ggplot2::element_blank());
#'         res$intermediate$boxplot <-
#'           res$intermediate$boxplot +
#'           ggplot2::theme(axis.title.y = ggplot2::element_blank());
#'       } else {
#'         res$intermediate$histogram <-
#'           res$intermediate$histogram + ggplot2::ylab(yLabs$hist);
#'         res$intermediate$qq <-
#'           res$intermediate$qq + ggplot2::ylab(yLabs$qq);
#'         res$intermediate$boxplot <-
#'           res$intermediate$boxplot + ggplot2::ylab(yLabs$box);
#'       }
#'     }
#'
#'     res$output$plot <-
#'       gridExtra::arrangeGrob(res$intermediate$histogram,
#'                              res$intermediate$qq,
#'                              res$intermediate$boxplot, ncol=3);
#'   }
#'
#'   ### Return results object
#'   class(res) <- "dataShape";
#'   return(res);
#'
#' }
#'
#' #' @method print dataShape
#' #' @rdname normalityAssessment
#' #' @export
#' print.dataShape <- function(x, digits=x$input$digits, extraNotification=TRUE, ...) {
#'   if (x$output$type == "G") {
#'     skewness.inference <- paste0("  (se = ", format(x$intermediate$se.G1, digits=digits),
#'                                  ", confidence interval = [",
#'                                  paste0(format(x$intermediate$ci.G1, digits=digits), collapse=", "),
#'                                  "], z = ", format(x$intermediate$z.G1, digits=digits),
#'                                  ", ", formatPvalue(x$intermediate$p.G1, digits=digits), ")");
#'     kurtosis.inference <- paste0("  (se = ", format(x$intermediate$se.G2, digits=digits),
#'                                  ", confidence interval = [",
#'                                  paste0(format(x$intermediate$ci.G2, digits=digits), collapse=", "),
#'                                  "], z = ", format(x$intermediate$z.G2, digits=digits),
#'                                  ", ", formatPvalue(x$intermediate$p.G2, digits=digits), ")");
#'   }
#'   else {
#'     skewness.inference <- kurtosis.inference <- "";
#'   }
#'   cat(paste0("Skewness (", x$output$type, "1): ",
#'              round(x$output$skewness, digits=digits),
#'              skewness.inference, "\n"));
#'   cat(paste0("Kurtosis (", x$output$type, "2): ",
#'              round(x$output$kurtosis, digits=digits),
#'              kurtosis.inference, "\n"));
#'   cat(paste0("Hartigans' Dip Test: ",
#'              round(x$output$D.dip.test, digits=digits), ", ",
#'              formatPvalue(x$output$p.dip.test), "\n"));
#'
#'   if (x$input$plots) {
#'     grid::grid.draw(x$output$plot);
#'   }
#'
#'   if (extraNotification && x$output$type == "g") {
#'     cat("\nNote: g1 and g2 are biased estimates of the population skewness and kurtosis.",
#'         "For unbiased estimates, use 'type=2' or 'type=3'.");
#'   }
#'   else if (extraNotification && x$output$type == "G") {
#'     cat("\nNote: G1 and G2 are the estimates for skewness and kurtosis used by SPSS and SAS,",
#'         "and corrected for the bias present in g1 and g2 ('type=1'). Note that b1 and b2 ('type=3')",
#'         "may perform better in small samples from a normal distribution.");
#'   }
#'   else if (extraNotification && x$output$type == "b") {
#'     cat("\nNote: b1 and b2 are estimates for skewness and kurtosis that have smaller mean-squared error in small",
#'         "samples from a normal distribution than G1 and G2, which are used by SPSS and SAS ('type=2').");
#'   }
#' }
#'
#' #' @method pander dataShape
#' #' @rdname normalityAssessment
#' #' @export
#' pander.dataShape <- function(x, digits=x$input$digits, extraNotification=TRUE, ...) {
#'   if (x$output$type == "G") {
#'     skewness.inference <- paste0("  (se = ", format(x$intermediate$se.G1, digits=digits),
#'                                  ", confidence interval = [",
#'                                  paste0(format(x$intermediate$ci.G1, digits=digits), collapse=", "),
#'                                  "], z = ", format(x$intermediate$z.G1, digits=digits),
#'                                  ", ", formatPvalue(x$intermediate$p.G1, digits=digits), ")");
#'     kurtosis.inference <- paste0("  (se = ", format(x$intermediate$se.G2, digits=digits),
#'                                  ", confidence interval = [",
#'                                  paste0(format(x$intermediate$ci.G2, digits=digits), collapse=", "),
#'                                  "], z = ", format(x$intermediate$z.G2, digits=digits),
#'                                  ", ", formatPvalue(x$intermediate$p.G2, digits=digits), ")");
#'   }
#'   else {
#'     skewness.inference <- kurtosis.inference <- "";
#'   }
#'   cat(paste0("Skewness (", x$output$type, "1): ",
#'              round(x$output$skewness, digits=digits),
#'              skewness.inference, "  \n"));
#'   cat(paste0("Kurtosis (", x$output$type, "2): ",
#'              round(x$output$kurtosis, digits=digits),
#'              kurtosis.inference, "  \n"));
#'   cat(paste0("Hartigans' Dip Test: ",
#'              round(x$output$D.dip.test, digits=digits), ", ",
#'              formatPvalue(x$output$p.dip.test), "\n"));
#'
#'   if (extraNotification && x$output$type == "g") {
#'     cat("\nNote: g1 and g2 are biased estimates of the population skewness and kurtosis.",
#'         "For unbiased estimates, use 'type=2' or 'type=3'.\n");
#'   }
#'   else if (extraNotification && x$output$type == "G") {
#'     cat("\nNote: G1 and G2 are the estimates for skewness and kurtosis used by SPSS and SAS,",
#'         "and corrected for the bias present in g1 and g2 ('type=1'). Note that b1 and b2 ('type=3')",
#'         "may perform better in small samples from a normal distribution.\n");
#'   }
#'   else if (extraNotification && x$output$type == "b") {
#'     cat("\nNote: b1 and b2 are estimates for skewness and kurtosis that have smaller mean-squared error in small",
#'         "samples from a normal distribution than G1 and G2, which are used by SPSS and SAS ('type=2').\n");
#'   }
#'
#'   cat("\n");
#'
#'   if (x$input$plots) {
#'     grid::grid.draw(x$output$plot);
#'   }
#'
#' }