```{r}
knitr::opts_chunk$set(echo = FALSE)
library(pipeline)
library(reshape2)
library(tidyverse)
library(PoweR)
library(car)
library(MBESS) #for conf.limits.nct, used to compute CIs around effect sizes
options(scipen = 20) #disable scientific notation (e.g., 4.312e+22) for all but very long numbers
```
```{r}
# run analyses from the json file on the data
study <- pipeline(json_file_name, data_file_name)
first_paired_analysis <- grep("effect_size_d_paired", study$analysis)[1]
analysis <- study$analysis[[first_paired_analysis]]
xlabel <- analysis$params$larger
ylabel <- analysis$params$smaller
ylabelstring <- ylabel
alpha <- analysis$params$alpha.level
conf.level <- analysis$params$conf.level
alternative <- analysis$params$alternative

#create two variables (x and y) that contain values of two datasets to be compared
x <- data[[xlabel]]
y <- data[[ylabel]]
diff <- x - y #difference scores
#Add difference scores to the data dataframe for plotting
data[["diff"]] <- diff

# Convert data to long format
data.long <- data %>%
  select_at(vars(xlabel, ylabel)) %>%
  mutate(sub_id = 1:nrow(.)) %>%
  gather_("iv", "dv", c(xlabel, ylabel))

#change matrix output from functions to dataframe, add CI from between, add labels and means
#order of matrix is flipped around for bs.ci, which returns alphabetically ordered rows
data_for_ci <- select_at(data, vars(ylabel, xlabel))
ci.sum <- as.data.frame(cm.ci(data_for_ci))
ordernames <- rownames(ci.sum) #store order of dataframe rows
if (rownames(ci.sum)[1] != xlabel) {ci.sum <- ci.sum[rev(rownames(ci.sum)), ]} #flip order around
ci.sum$iv <- c(xlabel, ylabel)
ci.sum$dv <- c(mean(x), mean(y))
ci.sum <- ci.sum[order(ci.sum[["iv"]]), ]
ci.sum[["lower.between"]] <- as.data.frame(bs.ci(data_for_ci))$lower
ci.sum[["upper.between"]] <- as.data.frame(bs.ci(data_for_ci))$upper
ci.sum <- ci.sum[order(ordernames), ] #restore original row order

##################################################################
######## PLOT DATA AND CHECK FOR OUTLIERS AND NORMALITY ##########
##################################################################

#Test normality: sum the rejections (at alpha = .05) of four normality tests
#(PoweR indices: 21 = Shapiro-Wilk, 6 = D'Agostino-Pearson, 2 = Anderson-Darling, 7 = Jarque-Bera)
normalityrejections <- (statcompute(21, diff, levels = c(0.05))$decision +
  statcompute(6, diff, levels = c(0.05))$decision +
  statcompute(2, diff, levels = c(0.05))$decision +
  statcompute(7, diff, levels = c(0.05))$decision)

#Test equality of variances
p_valueLevene <- leveneTest(data.long[["dv"]] ~ as.factor(data.long[["iv"]]))$"Pr(>F)"[1]
if (p_valueLevene < 0.05) {equalvar <- "the assumption that variances are equal is rejected (consider reporting robust statistics)."}
if (p_valueLevene >= 0.05) {equalvar <- "the assumption that variances are equal is not rejected."}
cat("Levene's test for equality of variances (p = ", round(p_valueLevene, digits = 2), ") indicates that ", equalvar, sep = "")

#######################################################
### Calculate CI around Cohen's d for within designs ##
#######################################################

sd1 <- sd(x) #standard deviation of measurement 1
sd2 <- sd(y) #standard deviation of measurement 2
s_diff <- sd(x - y) #standard deviation of the difference scores
N <- length(x) #number of pairs
s_av <- sqrt((sd1^2 + sd2^2) / 2) #averaged standard deviation of both measurements

#Cohen's d_av, using s_av as standardizer
m_diff <- mean(y - x)
d_av <- m_diff / s_av
d_av
d_av_unb <- (1 - (3 / (4 * (N - 1) - 1))) * d_av #Hedges' correction for small-sample bias
d_av_unb

#get the t-value for the CI
t_value <- m_diff / (s_diff / sqrt(N))
nct_limits <- conf.limits.nct(t.value = t_value, df = N - 1, conf.level = conf.level)
ci_l_d_av <- nct_limits$Lower.Limit * s_diff / (s_av * sqrt(N))
ci_u_d_av <- nct_limits$Upper.Limit * s_diff / (s_av * sqrt(N))
ci_l_d_av
ci_u_d_av
```
```{r}
#Cohen's d_z, using s_diff as standardizer
d_z <- t_value / sqrt(N)
d_z
d_z_unb <- (1 - (3 / (4 * (N - 1) - 1))) * d_z
ci_l_d_z <- nct_limits$Lower.Limit / sqrt(N - 1)
ci_u_d_z <- nct_limits$Upper.Limit / sqrt(N - 1)
ci_l_d_z
ci_u_d_z

r <- cor(x, y) #correlation between dependent measures

ttestresult <- t.test(y, x, alternative = alternative, paired = TRUE, var.equal = TRUE, conf.level = conf.level)
p_value <- ttestresult$p.value #store p-value from dependent t-test

#Specify direction of difference
if (mean(x) > mean(y)) {direction <- "greater than"}
if (mean(x) < mean(y)) {direction <- "smaller than"}
if (p_value < alpha) {surprising <- "surprising"}
if (p_value >= alpha) {surprising <- "not surprising"}

#Interpret size of effect (last resort - use only if effect size cannot be compared to other relevant effects in the literature)
if (abs(d_av) < 0.2) {effectsize <- "tiny"}
if (0.2 <= abs(d_av) && abs(d_av) < 0.5) {effectsize <- "small"}
if (0.5 <= abs(d_av) && abs(d_av) < 0.8) {effectsize <- "medium"}
if (abs(d_av) >= 0.8) {effectsize <- "large"}

#Common Language Effect Size (McGraw & Wong, 1992)
CL <- pnorm(abs(m_diff / s_diff))
```
This document summarizes a comparison between two dependent (paired) measurements, comparing `r ylabelstring` between the `r xlabel` and `r ylabel` conditions. This script can help to facilitate the analysis of data, and the Word output might prevent copy-paste errors when transferring results to a manuscript.
Researchers can base their statistical inferences on Frequentist or robust statistics, as well as on Bayesian statistics. Effect sizes and their confidence intervals are provided, thus inviting researchers to interpret their data from multiple perspectives.
Boxplots can be used to identify outliers. Boxplots show the median (thick line) and the interquartile range: the box contains the middle 50% of the data (25% above and 25% below the median). The ends of the whiskers mark the maximum and minimum values after excluding outliers (which are indicated by dots).
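A minimal boxplot sketch using the `data.long` data frame created in the setup chunk (the original script may have drawn this plot differently):

```{r}
#boxplots of both conditions; outliers appear as individual dots
ggplot(data.long, aes(x = iv, y = dv)) +
  geom_boxplot() +
  theme_bw(base_size = 14) +
  theme(panel.grid.major.x = element_blank())
```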
```{r}
#scatterplot of the paired observations
car::scatterplot(x ~ y, grid = TRUE, col = 1, asp = TRUE, smooth = FALSE)
```
The dependent t-test assumes that difference scores are normally distributed and that the variances of the two groups are equal. It does not assume the data within each measurement (so within the `r xlabel` and `r ylabel` conditions) are normally distributed. If the normality assumption is violated, the Type 1 error rate of the test is no longer controlled, and can substantially increase beyond the chosen significance level. Formally, testing the normality assumption on the same data used in the main analysis is incorrect; the normality assumption should be tested on additional (e.g., pilot) data. Nevertheless, a two-step procedure (testing the data for normality, and using alternatives for the traditional t-test if normality is violated) works well (see Rochon, Gondan, & Kieser, 2012), as illustrated in the sketch below.
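A sketch of this two-step procedure (illustration only; it uses base R's `shapiro.test` rather than the PoweR-based tests reported below, and the .05 cut-off is an assumption):

```{r}
#Two-step procedure (illustration): test the difference scores for normality,
#and fall back to a non-parametric alternative if normality is rejected
if (shapiro.test(diff)$p.value < 0.05) {
  #normality rejected: Wilcoxon signed-rank test on the paired observations
  wilcox.test(x, y, paired = TRUE)
} else {
  #normality not rejected: traditional dependent t-test
  t.test(x, y, paired = TRUE, conf.level = conf.level)
}
```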
Yap and Sim (2011, p. 2153) recommend: "If the distribution is symmetric with low kurtosis values (i.e. symmetric short-tailed distribution), then the D'Agostino-Pearson and Shapiro-Wilk tests have good power. For symmetric distribution with high sample kurtosis (symmetric long-tailed), the researcher can use the JB, Shapiro-Wilk, or Anderson-Darling test." The Kolmogorov-Smirnov (K-S) test is often used, but is no longer recommended, and is not included here.
If a normality test rejects the assumption that the data are normally distributed (with p < .05), non-parametric or robust statistics have to be used (robust analyses are provided below).
The normality assumption was rejected in `r normalityrejections` out of 4 normality tests (Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling, and Jarque-Bera).
Test Name | p-value
------------- | -------------
Shapiro-Wilk | p `r ifelse(statcompute(21, diff, levels = c(0.05))$pvalue>0.001," = ", " < ")` `r ifelse(statcompute(21, diff, levels = c(0.05))$pvalue>0.001, round(statcompute(21, diff, levels = c(0.05))$pvalue, digits=3), "0.001")`
D'Agostino-Pearson | p `r ifelse(statcompute(6, diff, levels = c(0.05))$pvalue>0.001," = ", " < ")` `r ifelse(statcompute(6, diff, levels = c(0.05))$pvalue>0.001, round(statcompute(6, diff, levels = c(0.05))$pvalue, digits=3), "0.001")`
Anderson-Darling | p `r ifelse(statcompute(2, diff, levels = c(0.05))$pvalue>0.001," = ", " < ")` `r ifelse(statcompute(2, diff, levels = c(0.05))$pvalue>0.001, round(statcompute(2, diff, levels = c(0.05))$pvalue, digits=3), "0.001")`
Jarque-Bera | p `r ifelse(statcompute(7, diff, levels = c(0.05))$pvalue>0.001," = ", " < ")` `r ifelse(statcompute(7, diff, levels = c(0.05))$pvalue>0.001, round(statcompute(7, diff, levels = c(0.05))$pvalue, digits=3), "0.001")`
In very large samples (when the test for normality has close to 100% power), tests for normality can return significant results based on minor deviations from normality, even when the data are approximately normally distributed. In very small samples (e.g., n = 10), deviations from normality might not be detected, but this does not mean the data are normally distributed. Always look at a plot of the data in addition to the test results; the simulation sketch below illustrates this point.
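A quick simulation (illustration only, not part of the reported analyses) makes this concrete: the Shapiro-Wilk test rarely flags mildly non-normal data in small samples, but does so much more often in very large ones.

```{r}
#Illustration: rejection rate of the Shapiro-Wilk test for mildly non-normal
#data (t-distribution with 10 df) at increasing sample sizes
set.seed(42)
sapply(c(10, 100, 5000), function(n) {
  mean(replicate(500, shapiro.test(rt(n, df = 10))$p.value < 0.05))
})
```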
The density (or proportion of the observations) is plotted on the y-axis. The grey bars are a histogram of the difference scores. Judging whether data is normally distributed on the basis of a histogram depends too much on the number of bins (or bars) in the graph. A kernel density plot (a non-parametric technique for density estimation) provides an easier way to check the normality of the data by comparing the shape of the density plot (the black line) with a normal distribution (the red dotted line, based on the observed mean and standard deviation). For dependent t-tests, the main DV is the difference score, and therefore the difference score should be normally distributed.
```{r}
require(gridExtra)

#density plot of the difference scores with normal distribution (red) and kernel density plot
ggplot(data, aes(x = diff)) +
  geom_histogram(colour = "black", fill = "grey", aes(y = ..density..)) +
  stat_function(fun = dnorm, args = c(mean = mean(data$diff), sd = sd(data$diff)), size = 1, color = "red", lty = 2) +
  geom_density(fill = NA, colour = "black", size = 1) +
  ggtitle("Difference scores") +
  theme_bw(base_size = 14) +
  theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank())

#density plots for each measurement, with normal distribution (red) and kernel density plot
p1 <- ggplot(data, aes_string(x = xlabel)) +
  geom_histogram(colour = "black", fill = "grey", aes(y = ..density..)) +
  stat_function(fun = dnorm, args = c(mean = mean(x), sd = sd(x)), size = 1, color = "red", lty = 2) +
  geom_density(fill = NA, colour = "black", size = 1) +
  ggtitle(xlabel) +
  theme_bw(base_size = 14) +
  theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank())

p2 <- ggplot(data, aes_string(x = ylabel)) +
  geom_histogram(colour = "black", fill = "grey", aes(y = ..density..)) +
  stat_function(fun = dnorm, args = c(mean = mean(y), sd = sd(y)), size = 1, color = "red", lty = 2) +
  geom_density(fill = NA, colour = "black", size = 1) +
  ggtitle(ylabel) +
  theme_bw(base_size = 14) +
  theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank())

grid.arrange(p1, p2, nrow = 2)
```
In the Q-Q plot for the difference scores the points should fall on the line. Deviations from the line in the upper and lower quartiles indicate that the tails of the distribution are thicker or thinner than in the normal distribution. An S-shaped curve with a dip in the middle indicates the data are left-skewed (more values to the right of the distribution), while a bump in the middle indicates the data are right-skewed (more values to the left of the distribution).
```{r}
require(HLMdiag)

#Q-Q plot of the difference scores
ggplot_qqnorm(diff, line = "quantile") +
  ggtitle("Difference scores") +
  theme_bw(base_size = 14) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

#Q-Q plots for each measurement
qq1 <- ggplot_qqnorm(x, line = "quantile") +
  ggtitle(xlabel) +
  theme_bw(base_size = 14) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

qq2 <- ggplot_qqnorm(y, line = "quantile") +
  ggtitle(ylabel) +
  theme_bw(base_size = 14) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

grid.arrange(qq1, qq2, ncol = 2)
```
In addition to the normality assumption, a second assumption of the t-test is that the variances in both groups are equal. The variance is the standard deviation squared, and the assumption is thus that the variance in the `r xlabel` condition (`r round(sd1^2, digits = 2)`) equals that in the `r ylabel` condition (`r round(sd2^2, digits = 2)`). Markowski & Markowski (1990) show that if sample sizes are equal, violations of the equal variance assumption do not lead to unsatisfactory performance (defined as actual significance levels falling outside a 0.03-0.07 boundary for a nominal alpha level of 0.05).
This equality of variances assumption is typically examined with Levene's test, although in small samples Levene's test can have low power, and thus fail to reject the null-hypothesis that variances are equal even when they are unequal. Levene's test for equality of variances (p `r ifelse(p_valueLevene>0.001," = ", " < ")` `r ifelse(p_valueLevene>0.001, round(p_valueLevene, digits=3), "0.001")`) indicates that `r equalvar`
Before looking at the results of the Frequentist statistics and the Robust statistics, decide which of these answer the question you are interested in. Choosing between these two options depending on the outcome of the statistical test inflates the Type 1 error rate. You can always report Bayesian statistics.
A p-value is the probability of obtaining the observed result, or a more extreme result, assuming the null-hypothesis is true. It is not the probability that the null-hypothesis or the alternative hypothesis is true (for such inferences, see Bayesian statistics below). In repeated sampling, `r 100*conf.level`% of future `r 100*conf.level`% confidence intervals can be expected to contain the true population parameters (e.g., the mean difference or the effect size). Confidence intervals are not a statement about the probability that a single confidence interval contains the true population parameter, but a statement about the probability that future confidence intervals will contain the true population parameter. Hedges' g (also referred to as d~unbiased~, see Borenstein, Hedges, Higgins, & Rothstein, 2009) is provided as the best estimate of Cohen's d, but the best estimate of the confidence interval is based on d~av~ (as recommended by Cumming, 2012). Hedges' g and the `r 100*conf.level`% CI around the effect size are calculated using the MBESS package by Kelley (2007). The common language effect size expresses the probability that in any random pairing of two observations from both groups, the observation from one group is higher than the observation from the other group (see McGraw & Wong, 1992). In a dependent t-test, the effect size Cohen's d can be calculated using a standardizer that takes the correlation between the two measurements into account (d~z~, which uses the standard deviation of the difference scores) or one that does not (d~av~). Both are provided, but d~av~ (or rather its unbiased estimate, g~av~) is recommended; for a discussion, see Lakens, 2013. Default interpretations of the size of an effect as provided here should only be used as a last resort, and it is preferable to interpret the size of the effect in relation to other effects in the literature, or in terms of its practical significance. The formulas below summarize these effect size calculations.
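For reference, the formulas implemented in the code chunks above (with $M_{diff}$ and $SD_{diff}$ the mean and standard deviation of the difference scores, $SD_1$ and $SD_2$ the standard deviations of the two measurements, and $N$ the number of pairs) are:

$$d_z = \frac{M_{diff}}{SD_{diff}}, \qquad d_{av} = \frac{M_{diff}}{s_{av}} \quad \textrm{with} \quad s_{av} = \sqrt{\frac{SD_1^2 + SD_2^2}{2}},$$

$$g = \left(1 - \frac{3}{4(N-1)-1}\right) \times d, \qquad CL = \Phi\!\left(\left|\frac{M_{diff}}{SD_{diff}}\right|\right),$$

where $g$ applies Hedges' small-sample bias correction to either $d$, and $\Phi$ is the standard normal cumulative distribution function.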
The mean `r ylabelstring` of participants in the `r xlabel` condition (M = `r round(mean(x), digits = 2)`, SD = `r round(sd1, digits = 2)`) was `r direction` the mean of participants in the `r ylabel` condition (M = `r round(mean(y), digits = 2)`, SD = `r round(sd2, digits = 2)`, r = `r round(r, digits = 2)`). The difference between measurements (M = `r round(m_diff, digits = 2)`, SD = `r round(s_diff, digits = 2)`, `r 100*conf.level`% CI = [`r round(ttestresult$conf.int[1], digits = 2)`; `r round(ttestresult$conf.int[2], digits = 2)`]) was analyzed with a dependent t-test, t(`r round(ttestresult$parameter, digits = 2)`) = `r round(t_value, digits = 2)`, p `r ifelse(p_value>0.001," = ", " < ")` `r ifelse(p_value>0.001, formatC(round(p_value, digits=3), digits=3, format="f"), "0.001")`, Hedges' g = `r round(d_av_unb, digits = 2)`, `r 100*conf.level`% CI [`r round(ci_l_d_av, digits = 2)`; `r round(ci_u_d_av, digits = 2)`] (or d~z~ = `r round(d_z, digits = 2)`, `r 100*conf.level`% CI [`r round(ci_l_d_z, digits = 2)`; `r round(ci_u_d_z, digits = 2)`]). This can be considered a `r effectsize` effect. The observed data is `r surprising` under the assumption that the null-hypothesis is true. The common language effect size (McGraw & Wong, 1992) indicates that, after controlling for individual differences, the likelihood that a person's `r ylabelstring` in the `r xlabel` condition is `r direction` the `r ylabelstring` in the `r ylabel` condition is `r round(100*CL, digits = 0)`%.
`r 100*conf.level`% within (crossbars) and between (endpoints of lines) confidence intervals following Morey (2008) and Baguley (2012).

```{r}
#Example 1: means and two-tiered 95% CI (within and between) suggested by Baguley
ggplot(ci.sum, aes(x = iv, y = dv, group = 1)) +
  # geom_bar(position = position_dodge(.9), colour = "black", stat = "identity", fill = "white") +
  geom_errorbar(width = .25, size = 0.5, aes(ymin = lower, ymax = upper)) +
  geom_errorbar(width = 0, size = 1, aes(ymin = lower.between, ymax = upper.between)) +
  geom_point(size = 2) +
  # geom_point(data = data.long) +
  geom_violin(data = data.long, aes(group = iv), alpha = 0) +
  theme_bw(base_size = 14) +
  theme(panel.grid.major.x = element_blank())
```
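The `cm.ci` and `bs.ci` functions come with the analysis pipeline (following Morey, 2008, and Baguley, 2012). As a hedged sketch of what the within-subject (Cousineau-Morey) intervals involve, assuming a wide-format data frame `dat` with one column per condition (the function name is illustrative, not the pipeline's actual implementation):

```{r}
#Sketch of Cousineau-Morey within-subject CIs (illustration only; the plots
#above use cm.ci): remove between-subject variability by centering each
#participant on the grand mean, then apply Morey's (2008) correction
cm_ci_sketch <- function(dat, conf.level = 0.95) {
  m <- ncol(dat) #number of conditions
  norm_dat <- dat - rowMeans(dat) + mean(as.matrix(dat)) #Cousineau normalization
  morey <- sqrt(m / (m - 1)) #Morey (2008) correction factor
  se <- apply(norm_dat, 2, sd) / sqrt(nrow(dat)) * morey
  t_crit <- qt(1 - (1 - conf.level) / 2, df = nrow(dat) - 1)
  data.frame(mean = colMeans(dat),
             lower = colMeans(dat) - t_crit * se,
             upper = colMeans(dat) + t_crit * se)
}
#e.g., cm_ci_sketch(data_for_ci)
```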
`r 100*conf.level`% CI (between & within)

```{r}
#Example 2: bar chart with individual data points and 95% CI (between and within)
ggplot(ci.sum, aes(x = iv, y = dv, group = 1)) +
  geom_bar(position = position_dodge(.9), colour = "black", stat = "identity", fill = "white") +
  geom_errorbar(width = .5, size = 0.5, aes(ymin = lower, ymax = upper)) +
  geom_errorbar(width = .3, size = 0.5, aes(ymin = lower.between, ymax = upper.between)) +
  geom_point(data = data.long, alpha = 0.25) +
  theme_bw(base_size = 14) +
  theme(panel.grid.major.x = element_blank())
```
`r 100*conf.level`% CI (between and within)

```{r}
#Example 3: bar chart without individual data points, with 95% CI (between and within)
ggplot(ci.sum, aes(x = iv, y = dv, group = 1)) +
  geom_bar(position = position_dodge(.9), colour = "black", stat = "identity", fill = "white") +
  geom_errorbar(width = .25, size = 0.5, aes(ymin = lower, ymax = upper)) +
  geom_errorbar(width = 0, size = 1.1, aes(ymin = lower.between, ymax = upper.between)) +
  theme_bw(base_size = 14) +
  theme(panel.grid.major.x = element_blank())
```
This script uses the reshape2 package to convert data from wide to long format, the PoweR package to perform the normality tests, HLMdiag to create the Q-Q plots, ggplot2 for all plots, gtable and gridExtra to combine multiple plots into one, car to perform Levene's test, MBESS to calculate effect sizes and their confidence intervals, WRS for the robust statistics, BayesFactor for the Bayes factor, and BEST to calculate the Bayesian highest density interval.
Auguie, B. (2012). gridExtra: functions in Grid graphics. R package version 0.9.1, URL: http://CRAN.R-project.org/package=gridExtra.
Baguley, T. (2012). Calculating and graphing within-subject confidence intervals for ANOVA. Behavior research methods, 44, 158-175.
Borenstein, M., Hedges, L. V., Higgins, J. P., & Rothstein, H. R. (2009). Introduction to meta-analysis. Hoboken, NJ: Wiley.
Box, G. E. P. (1953). Non-normality and tests on variance. Biometrika, 40, 318-335.
Cumming, G. (2012). Understanding the new statistics: Effect sizes, confidence intervals, and meta-analysis. New York: Routledge.
Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale, NJ: Erlbaum.
Fox, J. & Weisberg, S. (2011). An R Companion to Applied Regression, Second edition. Sage, Thousand Oaks CA. URL: http://socserv.socsci.mcmaster.ca/jfox/Books/Companion.
Kelley, K. (2005). The effects of nonnormal distributions on confidence intervals around the standardized mean difference: Bootstrap and parametric confidence intervals. Educational and Psychological Measurement, 65, 51-69.
Kelley, K. (2007). Confidence intervals for standardized effect sizes: Theory, application, and implementation. Journal of Statistical Software, 20, 1-24.
Kelley, K. & Lai, K. (2012). MBESS. R package version 3.3.3, URL: http://CRAN.R-project.org/package=MBESS.
Kruschke, J. (2010). Doing Bayesian data analysis: A tutorial introduction with R. Academic Press.
Kruschke, J. K. (2013). Bayesian estimation supersedes the t-test. Journal of Experimental Psychology: General, 142, 573-603.
Kruschke, J. K., & Meredith, M. (2014). BEST: Bayesian Estimation Supersedes the t-test. R package version 0.2.2, URL: http://CRAN.R-project.org/package=BEST.
Lakens, D. (2013). Calculating and reporting effect sizes to facilitate cumulative science: a practical primer for t-tests and ANOVAs. Frontiers in psychology, 4.
Loy, A., & Hofmann, H. (2014). HLMdiag: A Suite of Diagnostics for Hierarchical Linear Models. R. Journal of Statistical Software, 56, pp. 1-28. URL: http://www.jstatsoft.org/v56/i05/.
Markowski, C. A., & Markowski, E. P. (1990). Conditions for the effectiveness of a preliminary test of variance. The American Statistician, 44, 322-326.
McGraw, K. O., & Wong, S. P. (1992). A common language effect size statistic. Psychological Bulletin, 111, 361-365.
Micheaux, P. L. d., & Tran, V. (2012). PoweR. URL: http://www.biostatisticien.eu/PoweR/.
Morey, R. D. (2008). Confidence intervals from normalized data: A correction to Cousineau (2005). Tutorials in Quantitative Methods for Psychology, 4, 61-64.
Morey, R. D., & Rouder, J. N. (2011). Bayes Factor Approaches for Testing Interval Null Hypotheses. Psychological Methods, 16, 406-419.
Morey, R. D., & Rouder, J. N. (2015). BayesFactor: Computation of Bayes Factors for Common Designs. R package version 0.9.11-1, URL: http://CRAN.R-project.org/package=BayesFactor.
Rochon, J., Gondan, M., & Kieser, M. (2012). To test or not to test: Preliminary assessment of normality when comparing two independent samples. BMC Medical Research Methodology, 12:81.
Rouder, J. N., Speckman, P. L., Sun, D., Morey, R. D., & Iverson, G. (2009). Bayesian t-tests for accepting and rejecting the null hypothesis. Psychonomic Bulletin & Review, 16, 752-760.
Ruxton, G. D. (2006). The unequal variance t-test is an underused alternative to Student's t-test and the Mann-Whitney U test. Behavioral Ecology, 17, 688-690.
Wickham, H. (2007). Reshaping Data with the reshape Package. Journal of Statistical Software, 21, pp. 1-20. URL: http://www.jstatsoft.org/v21/i12/.
Wickham, H. (2009). ggplot2: elegant graphics for data analysis. Springer New York. ISBN 978-0-387-98140-6, URL: http://had.co.nz/ggplot2/book.
Wickham, H. (2012). gtable: Arrange grobs in tables. R package version 0.1.2, URL: http://CRAN.R-project.org/package=gtable.
Wilcox, R. R. (2012). Introduction to robust estimation and hypothesis testing. Academic Press.
Wilcox, R. R., & Schönbrodt, F. D. (2015). The WRS package for robust statistics in R (version 0.27.5). URL: https://github.com/nicebread/WRS.
Wilcox, R. R., & Tian, T. S. (2011). Measuring effect size: a robust heteroscedastic approach for two or more groups. Journal of Applied Statistics, 38, 1359-1368.
Yap, B. W., & Sim, C. H. (2011). Comparisons of various types of normality tests. Journal of Statistical Computation and Simulation, 81, 2141-2155.
```{r}
#print the full dataset
data
```

```{r}
#document the R version and package versions used
sessionInfo()
```
Copyright © 2015 Daniel Lakens
Lakens, D. (2015). The perfect t-test. Retrieved from https://github.com/Lakens/perfect-t-test. doi:10.5281/zenodo.17603
This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For more information, see the GNU Affero General Public License.