R/stat-describe.R

Defines functions stat_describeFast mardia

#'  Basic descriptive statistics derived from psych package
#'
#'  There are many summary statistics available in R; this function
#'  provides the ones most useful for scale construction and item analysis in classic psychometrics.
#'  Range is most useful for the first pass in a data set, to check for coding errors.
#'
#' @param x  A data frame or matrix
#' @param na.rm  The default is to delete missing data.  na.rm=FALSE will delete the case.
#' @param interp Should the median be standard or interpolated
#' @param skew  Should the skew and kurtosis be calculated?
#' @param ranges  Should the range be calculated?
#' @param trim  trim=.1 -- trim means by dropping the top and bottom trim fraction
#' @param type Which estimate of skew and kurtosis should be used?  (See details.)
#' @param check Should we check for non-numeric variables?  Slower but helpful.
#' @param fast if TRUE, will do n, means, sds, ranges for an improvement in speed.  If NULL, will switch to fast mode for large (ncol * nrow > 10^7) problems, otherwise defaults to fast = FALSE}
#' @param quant if not NULL, will find the specified quantiles. e.g quant=c(.25,.75) will find the 25th and 75th percentiles.
#' @param IQR If TRUE, show the interquartile range
#' @param head show the first 1:head cases for each variable in describeData
#' @param tail Show the last nobs-tail cases for each variable in describeData
#' @details  Please see \link[psych]{describe}
#'
#' @return A data frame of the relevant statistics
#'
#'    item name
#'    item number
#'    number of valid cases
#'    mean
#'    standard deviation
#'    trimmed mean (with trim defaulting to .1)
#'    median (standard or interpolated
#'            mad: median absolute deviation (from the median)
#'            minimum
#'            maximum
#'            skew
#'            kurtosis
#'            standard error
#'
#'  @note  For very large data sets that are data.frames, describe can be rather slow. Converting the
#'   data to a matrix first is recommended.  However, if the data are of different types, (factors or
#'   logical), this is  not possible.  If the data set includes columns of character data, it is also
#'   not possible.  Thus, a quick pass with \code{\link{describeData}} is recommended.For the greatest
#'   speed, at the cost of losing information, do not ask for ranges or for skew and turn off check.
#'   This is done automatically if the fast option is TRUE or for large data sets.Note that by default,
#'   fast=NULL.  But if the number of cases x number of variables exceeds (ncol * nrow > 10^7), fast will
#'   be set to TRUE.  This will provide just n, mean, sd, min, max, range, and standard errors. To get
#'   all of the statistics (but at a cost of greater time) set fast=FALSE.The problem seems to be a memory
#'   limitation in that the time taken is an accelerating function of nvars * nobs.  Thus, for a largish
#'   problem (72,000 cases with 1680 variables) which might take 330 seconds, doing it as two sets of 840
#'   variable cuts the time down to 80 seconds.   }
#'  @author {http://personality-project.org/revelle.html} \cr
#'    Maintainer: William Revelle \email{revelle@northwestern.edu} \cr
#'
#' @note The object returned is a data frame with the normal precision of R.  However, to control the number
#'  of digits displayed, you can set digits in a print command, rather than losing precision at the descriptive
#'  stats level.  See the last two examples.  One just sets the number of digits, one gives uses signif  to make
#'  'prettier'  output where all numbers are displayed to the same number of digits.
#' @references  Joanes, D.N. and Gill, C.A (1998).  Comparing measures of sample skewness and kurtosis.  The Statistician, 47, 183-189.
#'
#' Revelle, W. (2017) psych: Procedures for Personality and Psychological Research, Northwestern University, Evanston, Illinois, USA
#' @examples
#'    data(mtcars)
#'    stat_describe(mtcars)
#'    stat_describe(mtcars,skew=FALSE)
#'    stat_describe(mtcars,IQR=TRUE) #show the interquartile Range
#'    stat_describe(mtcars,quant=c(.1,.25,.5,.75,.90) ) #find the 10th, 25th, 50th,
#'    #75th and 90th percentiles
#'    stat_describeData(mtcars) #the fast version
#'    #now show how to adjust the displayed number of digits
#'    des <- stat_describe(mtcars)  #find the descriptive statistics.  Keep the original accuracy
#'    des  #show the normal output, which is rounded to 2 decimals
#'    print(des,digits=3)  #show the output, but round to 3 (trailing) digits
#'    print(des, signif=3) #round all numbers to the 3 significant digits
#'
#' @name stat_describe
#' @rdname stat_describe
#' @export
#'

#changed October 12, 2011 to use apply because mean and sd are deprecated for data.frames
#modified January 10, 2014 to add the check option to improve speed.  A few other improvements
#modified December 2014 to add the fast option for large data sets
#modified May 21, 2015 to allow non-numeric data to be described (but with a warning)
#further modified June 21, 2016 to allow for character input as well as well reporting quantiles
#tried to improve the speed by using multicores, but this requires using s or lapply which don't do what I need.
stat_describe <- function (x,
                      na.rm=TRUE,
                      interp=FALSE,
                      skew=TRUE,
                      ranges=TRUE,
                      trim=.1,
                      type=3,
                      check=TRUE,
                      fast=NULL,
                      quant=NULL,
                      IQR=FALSE)   #basic stats after dropping non-numeric data #slightly faster if we don't do skews
{
    cl <- match.call()
    #first, define a local function
    valid <- function(x) {sum(!is.na(x))}
    if(!na.rm) x <- na.omit(x)   #just complete cases

    if(is.null(fast)) {
        if (prod(dim(x)) > 10^7) {fast <- TRUE } else {fast <- FALSE}}  #the default is to use fast for large data sets
    if(fast) {
        skew <- FALSE
    }
    numstats <- 10 + length(quant)	+ IQR
    if ( is.null(dim(x)[2]))
    {        #do it for vectors or
        len  <- 1
        nvar <- 1
        stats = matrix(rep(NA,numstats),ncol=numstats)    #create a temporary array
        stats[1, 1] <-  valid(x )
        stats[1, 2] <-  mean(x, na.rm=na.rm )
        stats[1,10] <- sd(x,na.rm=na.rm)
        if(interp) {stats[1, 3] <- interp.median(x,na.rm=na.rm  ) }  else {stats[1,3] <- median(x,na.rm=na.rm) }
        stats[1,9] <- mean(x,na.rm=na.rm, trim=trim)
        stats[1, 4] <-  min(x, na.rm=na.rm )
        stats[1, 5] <-  max(x, na.rm=na.rm )
        stats[1, 6] <-  skew(x,na.rm=na.rm,type=type  )
        stats[1,7] <-  mad(x,na.rm=na.rm)
        stats[1,8] <-  kurtosi(x,na.rm=na.rm,type=type)
        vars <- 1
        if(!is.null(quant)) { Qnt <- quantile(x,prob=quant,na.rm=TRUE)
        stats[1,(IQR+11):numstats] <- t(Qnt)}
        if(IQR)
        {
            Quart <- t(quantile(x,prob=c(.25,.75),na.rm=TRUE))
            Iqr <- Quart[,2] -Quart[,1]
            stats[1,11] <- Iqr
        }
        rownames(stats) <- "X1"
    } else
    {
        nvar <- ncol(x)
        stats = matrix(rep(NA,nvar*numstats),ncol=numstats)    #create a temporary array

        if(is.null(colnames(x))) colnames(x) <- paste0("X",1:ncol(x))
        rownames(stats) <- colnames(x)
        stats[,1] <- apply(x,2,valid)
        vars <- c(1:nvar)
        ##adapted from the pairs function to convert logical or categorical to numeric

        if(!is.matrix(x) && check)
        {  #does not work for matrices
            for(i in 1:nvar)
            {
                if(!is.numeric(x[[i]] ))
                {
                    if(fast)  {x[[i]] <- NA} else
                    {
                        if(is.factor(unlist(x[[i]])) | is.character(unlist(x[[i]])))
                        {
                            x[[i]] <- as.numeric(x[[i]])

                            # if(is.factor(unlist(x[[i]]))) { #fixed 5/21/15
                            #             x[[i]] <- as.numeric(x[[i]])}

                        } else
                        {
                            x[[i]] <- NA
                        }
                    }
                    rownames(stats)[i] <- paste(rownames(stats)[i],"*",sep="")
                }
            }
        }

        x <- as.matrix(x)
        if(!is.numeric(x))
        {
            message("Converted non-numeric matrix input to numeric.  Are you sure you wanted to do this. Please check your data")
            x <- matrix(as.numeric(x),ncol=nvar)
            rownames(stats) <- paste0(rownames(stats),"*")
        }

        stats[,2] <- apply(x, 2,mean,na.rm=na.rm )
        stats[,10] <- apply(x,2,sd,na.rm=na.rm)
        if (skew)
        {
            stats[, 6] <-  skew(x,na.rm=na.rm,type=type  )
            stats[,8] <- kurtosi(x,na.rm=na.rm,type=type)
        }
        if(ranges)
        {
            if(fast)
            {
                stats[,4] <- apply(x,2,min,na.rm=na.rm)
                stats[,5] <- apply(x,2,max,na.rm = na.rm)
            } else
            {
                stats[, 4] <-  apply(x,2,min, na.rm=na.rm )
                stats[, 5] <-  apply(x,2,max, na.rm=na.rm )
                stats[,7] <-   apply(x,2,mad, na.rm=na.rm)
                stats[,9]  <- apply(x,2, mean,na.rm=na.rm,trim=trim)
                if(interp)
                {
                    stats[, 3] <- apply(x,2,interp.median,na.rm=na.rm  )
                } else
                {
                    stats[,3] <- apply(x,2,median,na.rm=na.rm) }
            }
        }
        if(!is.null(quant)) { Qnt <- apply(x,2,quantile,prob=quant,na.rm=TRUE)
        stats[,(IQR+11):numstats] <- t(Qnt)}

        if(IQR)
        {
            Quart <- t(apply(x,2,quantile,prob=c(.25,.75),na.rm=TRUE))
            Iqr <- Quart[,2] - Quart[,1]
            stats[,11] <- Iqr
        }
    }  #end of maxtrix input
    #now summarize the results
    if (numstats > (10 + IQR))
    {
        colnames(stats)[(11+IQR):numstats] <- paste0("Q",quant[1:length(quant)])
    }

    #the following output was cleaned up on June 22, 2016 added the quantile information.

    #the various options are ranges, skew, fast, numstats > 10
    if(fast)
    {
        answer <-  data.frame(vars=vars,
                              n = stats[,1],
                              mean=stats[,2],
                              sd = stats[,10],
                              se=stats[,10]/sqrt(stats[,1]))
    }  #minimal case

    #if((!skew) && ranges) {answer <-  data.frame(vars=vars,n = stats[,1],mean=stats[,2], sd = stats[,10],min= stats[,4],max=stats[,5], range=stats[,5]-stats[,4],se=stats[,10]/sqrt(stats[,1])) }
    if(skew)
    {
        if(ranges)
        {
            answer  <-  data.frame(vars=vars,
                                   n = stats[,1],
                                   mean=stats[,2],
                                   sd = stats[,10],
                                   median = stats[, 3],
                                   trimmed =stats[,9],
                                   mad = stats[,7],
                                   min= stats[,4],
                                   max=stats[,5],
                                   range=stats[,5]-stats[,4],
                                   skew = stats[, 6],
                                   kurtosis = stats[,8],
                                   se=stats[,10]/sqrt(stats[,1]))
        } else
        {
            answer  <-  data.frame(vars=vars,
                                   n = stats[,1],
                                   mean=stats[,2],
                                   sd = stats[,10],
                                   skew = stats[, 6],
                                   kurtosis = stats[,8],
                                   se=stats[,10]/sqrt(stats[,1]))
        }
    } else {
        if(ranges) {
            answer <-  data.frame(vars=vars,
                                  n = stats[,1],
                                  mean=stats[,2],
                                  sd = stats[,10],
                                  min= stats[,4],
                                  max=stats[,5],
                                  range=stats[,5]-stats[,4],
                                  se=stats[,10]/sqrt(stats[,1]))
        } else {
            answer  <-  data.frame(vars=vars,
                                   n = stats[,1],
                                   mean=stats[,2],
                                   sd = stats[,10],
                                   se=stats[,10]/sqrt(stats[,1]))
        }
    }
    if(IQR) answer <- data.frame(answer,IQR=stats[,11])
    if (numstats > (10+ IQR)) {
        if(nvar > 1 )
        {
            answer <- data.frame(answer, stats[,(IQR+11):numstats])   #add the quantile information
        } else {
            answer <- data.frame(answer, t(stats[,(IQR+11):numstats]))
        }
    }
    return(answer)
}




stat_describeFast <- function(x) {
    nvar <- NCOL(x)
    nobs <- NROW(x)

}

#added 1/11/14
#modified 12/29/14 to handle cases with non-numeric data
#for fast descriptions
stat_describeData <- function (x, head = 4, tail = 4)
{
        valid <- function(x) {
            sum(!is.na(x))
        }
        nvar <- ncol(x)
        all.numeric <- nvar
        ans <- matrix(NA,nrow=nvar,ncol=2)
        nobs <- nrow(x)
        cc <- 0
        cc <- try(complete.cases(x),silent=TRUE)
        if(class(cc) == "try-error") cc <- NA
        cc <- sum(cc,na.rm=TRUE)
        for (i in 1:nvar) {

            if (is.numeric(x[,i])) {ans[i,2] <- 1 } else {
                if ((is.factor(x[,i])) || (is.logical(x[,i]))) {
                    ans[i,2]  <- 2
                } else {
                    if (is.character(x[,i])) {
                        ans[i,2] <- 3
                    } else {ans[i,2] <- 4}
                }
            }
            ans[i,1]  <- valid(x[,i])
        }


        if (is.numeric(unlist(x))) {
            all.numeric <- TRUE
        }
        else {
            all.numeric <- FALSE
        }
        H1 <- t(x[1:head,1:nvar])
        T1 <- t(x[(nobs-tail+1):nobs,1:nvar])
        temp <- data.frame(V=1:nvar,ans,H1,T1)

        colnames(temp) <- c("variable #", "n.obs", "type", paste("H",
                                                                 1:head, sep = ""), paste("T", 1:tail, sep = ""))
        rownames(temp)[temp[,"type"]!=1] <- paste(rownames(temp)[temp[,"type"]!=1],"*",sep="")
        result <- (list(n.obs = nobs, nvar = nvar, all.numeric = all.numeric,
                        complete.cases = cc, variables = temp))
        class(result) <- c("psych", "describeData")
        return(result)
}


#corrected May 7, 2007
#modified October ,2011 to use apply for mean and sd
#modified April, 2012 to return 3 estimates, depending upon type
#partly based upon e1071  skewness and kurtosis
skew <-
    function (x, na.rm = TRUE,type=3)
    {
        if (length(dim(x)) == 0) {
            if (na.rm) {
                x <- x[!is.na(x)]
            }
            sdx <- sd(x,na.rm=na.rm)
            mx <- mean(x)
            n <- length(x[!is.na(x)])
            switch(type,
                   {skewer <- sqrt(n) *( sum((x - mx)^3,  na.rm = na.rm)/( sum((x - mx)^2,na.rm = na.rm)^(3/2)))}, #case 1
                   {skewer <- n *sqrt(n-1) *( sum((x - mx)^3,  na.rm = na.rm)/((n-2) * sum((x - mx)^2,na.rm = na.rm)^(3/2)))}, #case 2
                   {skewer <- sum((x - mx)^3)/(n * sd(x)^3) })  #case 3
        } else {

            skewer <- rep(NA,dim(x)[2])
            if (is.matrix(x)) {mx <- colMeans(x,na.rm=na.rm)} else {mx <- apply(x,2,mean,na.rm=na.rm)}
            sdx <- apply(x,2,sd,na.rm=na.rm)
            for (i in 1:dim(x)[2]) {
                n <- length(x[!is.na(x[,i]),i])
                switch(type,
                       {skewer[i] <-sqrt(n) *( sum((x[,i] - mx[i])^3,  na.rm = na.rm)/( sum((x[,i] - mx[i])^2,na.rm = na.rm)^(3/2)))}, #type 1
                       {skewer[i] <- n *sqrt(n-1) *( sum((x[,i] - mx[i])^3,  na.rm = na.rm)/((n-2) * sum((x[,i] - mx[i])^2,na.rm = na.rm)^(3/2)))},#type 2
                       {skewer[i] <- sum((x[,i] - mx[i])^3,  na.rm = na.rm)/(n * sdx[i]^3)} #type 3
                ) #end switch
            } #end loop
        }
        return(skewer)
    }


#modified November 24, 2010 to use an unbiased estimator of kurtosis as the default
#and again April 22, 2012 to include all three types
kurtosi <- function (x,
                     na.rm = TRUE,
                     type=3)
{
    if (length(dim(x)) == 0)
    {
        if (na.rm)
        {
            x <- x[!is.na(x)]
        }
        if (is.matrix(x) ) { mx <- colMeans(x,na.rm=na.rm)} else {mx <- mean(x,na.rm=na.rm)}
        sdx <- sd(x,na.rm=na.rm)
        n <- length(x[!is.na(x)])
        switch(type,
        {kurt <- sum((x - mx)^4,
                    na.rm = na.rm)*n /(sum((x - mx)^2,
                    na.rm = na.rm)^2)  -3},  #type 1
        {kurt <- n*(n + 1)*sum((x - mx)^4,
                    na.rm = na.rm)/( (n - 1)*(n - 2)*(n - 3)*(sum((x - mx)^2,
                    na.rm = na.rm)/(n - 1))^2)  -3 *(n- 1)^2 /((n - 2)*(n - 3)) }, # type 2
        {kurt <- sum((x - mx)^4)/(n *sdx^4)  -3})  #	type 3
    }else {
        kurt <- rep(NA,dim(x)[2])
        #  mx <- mean(x,na.rm=na.rm)
        mx <-apply(x,2 ,mean,na.rm=na.rm)
        if(type==3)  sdx <- apply(x,2,sd,na.rm=na.rm)
        for (i in 1:dim(x)[2])
        {
            n <- length(x[!is.na(x[,i]),i])
            switch(type,
                {kurt[i] <- sum((x[,i] - mx[i])^4,  na.rm = na.rm)*length(x[,i]) /(sum((x[,i] - mx[i])^2,na.rm = na.rm)^2)  -3},  #type 1
                {
                    xi <- x[,i]-mx[i]
                    kurt[i] <- n*(n + 1)*sum((x[,i] - mx[i])^4,  na.rm = na.rm)/( (n - 1)*(n - 2)*(n - 3)*(sum((x[,i] - mx[i])^2,na.rm = na.rm)/(n - 1))^2)  -3 *(n- 1)^2 /((n - 2)*(n - 3)) },  #type 2
                {kurt[i] <- sum((x[,i] - mx[i])^4,  na.rm = na.rm)/((length(x[,i]) - sum(is.na(x[,i]))) * sdx[i]^4)  -3},  #type 3
                {NULL})
            names(kurt) <- colnames(x)
        }
    }
    return(kurt)
}


#added November 15, 2010
#adapted from the mult.norm function of the QuantPsyc package

mardia <- function(x,na.rm=TRUE,plot=TRUE)
{
    cl <- match.call()
    x <- as.matrix(x)     #in case it was a dataframe
    if(na.rm) x <- na.omit(x)
    if(nrow(x) >  0)
    {
        n <- dim(x)[1]
        p <- dim(x)[2]
        x <- scale(x,scale=FALSE)  #zero center
        S <- cov(x)
        S.inv <- solve(S)
        D <- x %*% S.inv %*% t(x)
        b1p <- sum(D^3)/n^2
        b2p <- tr(D^2)/n
        chi.df <- p*(p+1)*(p+2)/6
        k <- (p+1)*(n+1)*(n+3)/(n*((n+1)*(p+1) -6))
        small.skew <- n*k*b1p/6
        M.skew <- n*b1p/6
        M.kurt <- (b2p - p * (p+2))*sqrt(n/(8*p*(p+2)))
        p.skew <- 1-pchisq(M.skew,chi.df)
        p.small <- 1 - pchisq(small.skew,chi.df)
        p.kurt <- 2*(1- pnorm(abs(M.kurt)))
        d =sqrt(diag(D))
        if(plot) {qqnorm(d);qqline(d)}
        results <- list(n.obs=n,n.var=p, b1p = b1p,b2p = b2p,skew=M.skew,small.skew=small.skew,p.skew=p.skew,p.small=p.small,kurtosis=M.kurt,p.kurt=p.kurt,d = d,Call=cl)
        return(results)
    } else
    {
        warning("no cases with complete data, mardia  quit.")
    }
}
ShouyeLiu/metaboliteUtility documentation built on May 6, 2019, 9:07 a.m.