#' Basic descriptive statistics derived from psych package
#'
#' There are many summary statistics available in R; this function
#' provides the ones most useful for scale construction and item analysis in classic psychometrics.
#' Range is most useful for the first pass in a data set, to check for coding errors.
#'
#' @param x A data frame or matrix
#' @param na.rm The default is to delete missing data. na.rm=FALSE will delete the case.
#' @param interp Should the median be standard or interpolated
#' @param skew Should the skew and kurtosis be calculated?
#' @param ranges Should the range be calculated?
#' @param trim trim=.1 -- trim means by dropping the top and bottom trim fraction
#' @param type Which estimate of skew and kurtosis should be used? (See details.)
#' @param check Should we check for non-numeric variables? Slower but helpful.
#' @param fast if TRUE, will do n, means, sds, ranges for an improvement in speed. If NULL, will switch to fast mode for large (ncol * nrow > 10^7) problems, otherwise defaults to fast = FALSE}
#' @param quant if not NULL, will find the specified quantiles. e.g quant=c(.25,.75) will find the 25th and 75th percentiles.
#' @param IQR If TRUE, show the interquartile range
#' @param head show the first 1:head cases for each variable in describeData
#' @param tail Show the last nobs-tail cases for each variable in describeData
#' @details Please see \link[psych]{describe}
#'
#' @return A data frame of the relevant statistics
#'
#' item name
#' item number
#' number of valid cases
#' mean
#' standard deviation
#' trimmed mean (with trim defaulting to .1)
#' median (standard or interpolated
#' mad: median absolute deviation (from the median)
#' minimum
#' maximum
#' skew
#' kurtosis
#' standard error
#'
#' @note For very large data sets that are data.frames, describe can be rather slow. Converting the
#' data to a matrix first is recommended. However, if the data are of different types, (factors or
#' logical), this is not possible. If the data set includes columns of character data, it is also
#' not possible. Thus, a quick pass with \code{\link{describeData}} is recommended.For the greatest
#' speed, at the cost of losing information, do not ask for ranges or for skew and turn off check.
#' This is done automatically if the fast option is TRUE or for large data sets.Note that by default,
#' fast=NULL. But if the number of cases x number of variables exceeds (ncol * nrow > 10^7), fast will
#' be set to TRUE. This will provide just n, mean, sd, min, max, range, and standard errors. To get
#' all of the statistics (but at a cost of greater time) set fast=FALSE.The problem seems to be a memory
#' limitation in that the time taken is an accelerating function of nvars * nobs. Thus, for a largish
#' problem (72,000 cases with 1680 variables) which might take 330 seconds, doing it as two sets of 840
#' variable cuts the time down to 80 seconds. }
#' @author {http://personality-project.org/revelle.html} \cr
#' Maintainer: William Revelle \email{revelle@northwestern.edu} \cr
#'
#' @note The object returned is a data frame with the normal precision of R. However, to control the number
#' of digits displayed, you can set digits in a print command, rather than losing precision at the descriptive
#' stats level. See the last two examples. One just sets the number of digits, one gives uses signif to make
#' 'prettier' output where all numbers are displayed to the same number of digits.
#' @references Joanes, D.N. and Gill, C.A (1998). Comparing measures of sample skewness and kurtosis. The Statistician, 47, 183-189.
#'
#' Revelle, W. (2017) psych: Procedures for Personality and Psychological Research, Northwestern University, Evanston, Illinois, USA
#' @examples
#' data(mtcars)
#' stat_describe(mtcars)
#' stat_describe(mtcars,skew=FALSE)
#' stat_describe(mtcars,IQR=TRUE) #show the interquartile Range
#' stat_describe(mtcars,quant=c(.1,.25,.5,.75,.90) ) #find the 10th, 25th, 50th,
#' #75th and 90th percentiles
#' stat_describeData(mtcars) #the fast version
#' #now show how to adjust the displayed number of digits
#' des <- stat_describe(mtcars) #find the descriptive statistics. Keep the original accuracy
#' des #show the normal output, which is rounded to 2 decimals
#' print(des,digits=3) #show the output, but round to 3 (trailing) digits
#' print(des, signif=3) #round all numbers to the 3 significant digits
#'
#' @name stat_describe
#' @rdname stat_describe
#' @export
#'
#changed October 12, 2011 to use apply because mean and sd are deprecated for data.frames
#modified January 10, 2014 to add the check option to improve speed. A few other improvements
#modified December 2014 to add the fast option for large data sets
#modified May 21, 2015 to allow non-numeric data to be described (but with a warning)
#further modified June 21, 2016 to allow for character input as well as well reporting quantiles
#tried to improve the speed by using multicores, but this requires using s or lapply which don't do what I need.
stat_describe <- function (x,
na.rm=TRUE,
interp=FALSE,
skew=TRUE,
ranges=TRUE,
trim=.1,
type=3,
check=TRUE,
fast=NULL,
quant=NULL,
IQR=FALSE) #basic stats after dropping non-numeric data #slightly faster if we don't do skews
{
cl <- match.call()
#first, define a local function
valid <- function(x) {sum(!is.na(x))}
if(!na.rm) x <- na.omit(x) #just complete cases
if(is.null(fast)) {
if (prod(dim(x)) > 10^7) {fast <- TRUE } else {fast <- FALSE}} #the default is to use fast for large data sets
if(fast) {
skew <- FALSE
}
numstats <- 10 + length(quant) + IQR
if ( is.null(dim(x)[2]))
{ #do it for vectors or
len <- 1
nvar <- 1
stats = matrix(rep(NA,numstats),ncol=numstats) #create a temporary array
stats[1, 1] <- valid(x )
stats[1, 2] <- mean(x, na.rm=na.rm )
stats[1,10] <- sd(x,na.rm=na.rm)
if(interp) {stats[1, 3] <- interp.median(x,na.rm=na.rm ) } else {stats[1,3] <- median(x,na.rm=na.rm) }
stats[1,9] <- mean(x,na.rm=na.rm, trim=trim)
stats[1, 4] <- min(x, na.rm=na.rm )
stats[1, 5] <- max(x, na.rm=na.rm )
stats[1, 6] <- skew(x,na.rm=na.rm,type=type )
stats[1,7] <- mad(x,na.rm=na.rm)
stats[1,8] <- kurtosi(x,na.rm=na.rm,type=type)
vars <- 1
if(!is.null(quant)) { Qnt <- quantile(x,prob=quant,na.rm=TRUE)
stats[1,(IQR+11):numstats] <- t(Qnt)}
if(IQR)
{
Quart <- t(quantile(x,prob=c(.25,.75),na.rm=TRUE))
Iqr <- Quart[,2] -Quart[,1]
stats[1,11] <- Iqr
}
rownames(stats) <- "X1"
} else
{
nvar <- ncol(x)
stats = matrix(rep(NA,nvar*numstats),ncol=numstats) #create a temporary array
if(is.null(colnames(x))) colnames(x) <- paste0("X",1:ncol(x))
rownames(stats) <- colnames(x)
stats[,1] <- apply(x,2,valid)
vars <- c(1:nvar)
##adapted from the pairs function to convert logical or categorical to numeric
if(!is.matrix(x) && check)
{ #does not work for matrices
for(i in 1:nvar)
{
if(!is.numeric(x[[i]] ))
{
if(fast) {x[[i]] <- NA} else
{
if(is.factor(unlist(x[[i]])) | is.character(unlist(x[[i]])))
{
x[[i]] <- as.numeric(x[[i]])
# if(is.factor(unlist(x[[i]]))) { #fixed 5/21/15
# x[[i]] <- as.numeric(x[[i]])}
} else
{
x[[i]] <- NA
}
}
rownames(stats)[i] <- paste(rownames(stats)[i],"*",sep="")
}
}
}
x <- as.matrix(x)
if(!is.numeric(x))
{
message("Converted non-numeric matrix input to numeric. Are you sure you wanted to do this. Please check your data")
x <- matrix(as.numeric(x),ncol=nvar)
rownames(stats) <- paste0(rownames(stats),"*")
}
stats[,2] <- apply(x, 2,mean,na.rm=na.rm )
stats[,10] <- apply(x,2,sd,na.rm=na.rm)
if (skew)
{
stats[, 6] <- skew(x,na.rm=na.rm,type=type )
stats[,8] <- kurtosi(x,na.rm=na.rm,type=type)
}
if(ranges)
{
if(fast)
{
stats[,4] <- apply(x,2,min,na.rm=na.rm)
stats[,5] <- apply(x,2,max,na.rm = na.rm)
} else
{
stats[, 4] <- apply(x,2,min, na.rm=na.rm )
stats[, 5] <- apply(x,2,max, na.rm=na.rm )
stats[,7] <- apply(x,2,mad, na.rm=na.rm)
stats[,9] <- apply(x,2, mean,na.rm=na.rm,trim=trim)
if(interp)
{
stats[, 3] <- apply(x,2,interp.median,na.rm=na.rm )
} else
{
stats[,3] <- apply(x,2,median,na.rm=na.rm) }
}
}
if(!is.null(quant)) { Qnt <- apply(x,2,quantile,prob=quant,na.rm=TRUE)
stats[,(IQR+11):numstats] <- t(Qnt)}
if(IQR)
{
Quart <- t(apply(x,2,quantile,prob=c(.25,.75),na.rm=TRUE))
Iqr <- Quart[,2] - Quart[,1]
stats[,11] <- Iqr
}
} #end of maxtrix input
#now summarize the results
if (numstats > (10 + IQR))
{
colnames(stats)[(11+IQR):numstats] <- paste0("Q",quant[1:length(quant)])
}
#the following output was cleaned up on June 22, 2016 added the quantile information.
#the various options are ranges, skew, fast, numstats > 10
if(fast)
{
answer <- data.frame(vars=vars,
n = stats[,1],
mean=stats[,2],
sd = stats[,10],
se=stats[,10]/sqrt(stats[,1]))
} #minimal case
#if((!skew) && ranges) {answer <- data.frame(vars=vars,n = stats[,1],mean=stats[,2], sd = stats[,10],min= stats[,4],max=stats[,5], range=stats[,5]-stats[,4],se=stats[,10]/sqrt(stats[,1])) }
if(skew)
{
if(ranges)
{
answer <- data.frame(vars=vars,
n = stats[,1],
mean=stats[,2],
sd = stats[,10],
median = stats[, 3],
trimmed =stats[,9],
mad = stats[,7],
min= stats[,4],
max=stats[,5],
range=stats[,5]-stats[,4],
skew = stats[, 6],
kurtosis = stats[,8],
se=stats[,10]/sqrt(stats[,1]))
} else
{
answer <- data.frame(vars=vars,
n = stats[,1],
mean=stats[,2],
sd = stats[,10],
skew = stats[, 6],
kurtosis = stats[,8],
se=stats[,10]/sqrt(stats[,1]))
}
} else {
if(ranges) {
answer <- data.frame(vars=vars,
n = stats[,1],
mean=stats[,2],
sd = stats[,10],
min= stats[,4],
max=stats[,5],
range=stats[,5]-stats[,4],
se=stats[,10]/sqrt(stats[,1]))
} else {
answer <- data.frame(vars=vars,
n = stats[,1],
mean=stats[,2],
sd = stats[,10],
se=stats[,10]/sqrt(stats[,1]))
}
}
if(IQR) answer <- data.frame(answer,IQR=stats[,11])
if (numstats > (10+ IQR)) {
if(nvar > 1 )
{
answer <- data.frame(answer, stats[,(IQR+11):numstats]) #add the quantile information
} else {
answer <- data.frame(answer, t(stats[,(IQR+11):numstats]))
}
}
return(answer)
}
stat_describeFast <- function(x) {
nvar <- NCOL(x)
nobs <- NROW(x)
}
#added 1/11/14
#modified 12/29/14 to handle cases with non-numeric data
#for fast descriptions
stat_describeData <- function (x, head = 4, tail = 4)
{
valid <- function(x) {
sum(!is.na(x))
}
nvar <- ncol(x)
all.numeric <- nvar
ans <- matrix(NA,nrow=nvar,ncol=2)
nobs <- nrow(x)
cc <- 0
cc <- try(complete.cases(x),silent=TRUE)
if(class(cc) == "try-error") cc <- NA
cc <- sum(cc,na.rm=TRUE)
for (i in 1:nvar) {
if (is.numeric(x[,i])) {ans[i,2] <- 1 } else {
if ((is.factor(x[,i])) || (is.logical(x[,i]))) {
ans[i,2] <- 2
} else {
if (is.character(x[,i])) {
ans[i,2] <- 3
} else {ans[i,2] <- 4}
}
}
ans[i,1] <- valid(x[,i])
}
if (is.numeric(unlist(x))) {
all.numeric <- TRUE
}
else {
all.numeric <- FALSE
}
H1 <- t(x[1:head,1:nvar])
T1 <- t(x[(nobs-tail+1):nobs,1:nvar])
temp <- data.frame(V=1:nvar,ans,H1,T1)
colnames(temp) <- c("variable #", "n.obs", "type", paste("H",
1:head, sep = ""), paste("T", 1:tail, sep = ""))
rownames(temp)[temp[,"type"]!=1] <- paste(rownames(temp)[temp[,"type"]!=1],"*",sep="")
result <- (list(n.obs = nobs, nvar = nvar, all.numeric = all.numeric,
complete.cases = cc, variables = temp))
class(result) <- c("psych", "describeData")
return(result)
}
#corrected May 7, 2007
#modified October ,2011 to use apply for mean and sd
#modified April, 2012 to return 3 estimates, depending upon type
#partly based upon e1071 skewness and kurtosis
skew <-
function (x, na.rm = TRUE,type=3)
{
if (length(dim(x)) == 0) {
if (na.rm) {
x <- x[!is.na(x)]
}
sdx <- sd(x,na.rm=na.rm)
mx <- mean(x)
n <- length(x[!is.na(x)])
switch(type,
{skewer <- sqrt(n) *( sum((x - mx)^3, na.rm = na.rm)/( sum((x - mx)^2,na.rm = na.rm)^(3/2)))}, #case 1
{skewer <- n *sqrt(n-1) *( sum((x - mx)^3, na.rm = na.rm)/((n-2) * sum((x - mx)^2,na.rm = na.rm)^(3/2)))}, #case 2
{skewer <- sum((x - mx)^3)/(n * sd(x)^3) }) #case 3
} else {
skewer <- rep(NA,dim(x)[2])
if (is.matrix(x)) {mx <- colMeans(x,na.rm=na.rm)} else {mx <- apply(x,2,mean,na.rm=na.rm)}
sdx <- apply(x,2,sd,na.rm=na.rm)
for (i in 1:dim(x)[2]) {
n <- length(x[!is.na(x[,i]),i])
switch(type,
{skewer[i] <-sqrt(n) *( sum((x[,i] - mx[i])^3, na.rm = na.rm)/( sum((x[,i] - mx[i])^2,na.rm = na.rm)^(3/2)))}, #type 1
{skewer[i] <- n *sqrt(n-1) *( sum((x[,i] - mx[i])^3, na.rm = na.rm)/((n-2) * sum((x[,i] - mx[i])^2,na.rm = na.rm)^(3/2)))},#type 2
{skewer[i] <- sum((x[,i] - mx[i])^3, na.rm = na.rm)/(n * sdx[i]^3)} #type 3
) #end switch
} #end loop
}
return(skewer)
}
#modified November 24, 2010 to use an unbiased estimator of kurtosis as the default
#and again April 22, 2012 to include all three types
kurtosi <- function (x,
na.rm = TRUE,
type=3)
{
if (length(dim(x)) == 0)
{
if (na.rm)
{
x <- x[!is.na(x)]
}
if (is.matrix(x) ) { mx <- colMeans(x,na.rm=na.rm)} else {mx <- mean(x,na.rm=na.rm)}
sdx <- sd(x,na.rm=na.rm)
n <- length(x[!is.na(x)])
switch(type,
{kurt <- sum((x - mx)^4,
na.rm = na.rm)*n /(sum((x - mx)^2,
na.rm = na.rm)^2) -3}, #type 1
{kurt <- n*(n + 1)*sum((x - mx)^4,
na.rm = na.rm)/( (n - 1)*(n - 2)*(n - 3)*(sum((x - mx)^2,
na.rm = na.rm)/(n - 1))^2) -3 *(n- 1)^2 /((n - 2)*(n - 3)) }, # type 2
{kurt <- sum((x - mx)^4)/(n *sdx^4) -3}) # type 3
}else {
kurt <- rep(NA,dim(x)[2])
# mx <- mean(x,na.rm=na.rm)
mx <-apply(x,2 ,mean,na.rm=na.rm)
if(type==3) sdx <- apply(x,2,sd,na.rm=na.rm)
for (i in 1:dim(x)[2])
{
n <- length(x[!is.na(x[,i]),i])
switch(type,
{kurt[i] <- sum((x[,i] - mx[i])^4, na.rm = na.rm)*length(x[,i]) /(sum((x[,i] - mx[i])^2,na.rm = na.rm)^2) -3}, #type 1
{
xi <- x[,i]-mx[i]
kurt[i] <- n*(n + 1)*sum((x[,i] - mx[i])^4, na.rm = na.rm)/( (n - 1)*(n - 2)*(n - 3)*(sum((x[,i] - mx[i])^2,na.rm = na.rm)/(n - 1))^2) -3 *(n- 1)^2 /((n - 2)*(n - 3)) }, #type 2
{kurt[i] <- sum((x[,i] - mx[i])^4, na.rm = na.rm)/((length(x[,i]) - sum(is.na(x[,i]))) * sdx[i]^4) -3}, #type 3
{NULL})
names(kurt) <- colnames(x)
}
}
return(kurt)
}
#added November 15, 2010
#adapted from the mult.norm function of the QuantPsyc package
mardia <- function(x,na.rm=TRUE,plot=TRUE)
{
cl <- match.call()
x <- as.matrix(x) #in case it was a dataframe
if(na.rm) x <- na.omit(x)
if(nrow(x) > 0)
{
n <- dim(x)[1]
p <- dim(x)[2]
x <- scale(x,scale=FALSE) #zero center
S <- cov(x)
S.inv <- solve(S)
D <- x %*% S.inv %*% t(x)
b1p <- sum(D^3)/n^2
b2p <- tr(D^2)/n
chi.df <- p*(p+1)*(p+2)/6
k <- (p+1)*(n+1)*(n+3)/(n*((n+1)*(p+1) -6))
small.skew <- n*k*b1p/6
M.skew <- n*b1p/6
M.kurt <- (b2p - p * (p+2))*sqrt(n/(8*p*(p+2)))
p.skew <- 1-pchisq(M.skew,chi.df)
p.small <- 1 - pchisq(small.skew,chi.df)
p.kurt <- 2*(1- pnorm(abs(M.kurt)))
d =sqrt(diag(D))
if(plot) {qqnorm(d);qqline(d)}
results <- list(n.obs=n,n.var=p, b1p = b1p,b2p = b2p,skew=M.skew,small.skew=small.skew,p.skew=p.skew,p.small=p.small,kurtosis=M.kurt,p.kurt=p.kurt,d = d,Call=cl)
return(results)
} else
{
warning("no cases with complete data, mardia quit.")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.