R/Summarize.R
In FSA: Simple Fisheries Stock Assessment Methods

Documented in Summarize Summarize.default Summarize.formula

#' @title Summary statistics for a numeric variable.
#'
#' @description Summary statistics for a single numeric variable, possibly separated by the levels of a factor variable or variables. This function is very similar to \code{\link[base]{summary}} for a numeric variable.
#'
#' @details This function is primarily used with formulas of the following types (where \code{quant} and \code{factor} generically represent quantitative/numeric and factor variables, respectively):
#' \tabular{ll}{
#'   Formula \tab Description of Summary \cr
#'   \code{~quant} \tab Numerical summaries (see below) of \code{quant}.\cr
#'   \code{quant~factor} \tab Summaries of \code{quant} separated by levels in \code{factor}.\cr
#'   \code{quant~factor1*factor2} \tab Summaries of \code{quant} separated by the combined levels in \code{factor1} and \code{factor2}.\cr
#' }
#' 
#' Numerical summaries include all results from \code{\link[base]{summary}} (min, Q1, mean, median, Q3, and max) and the sample size, valid sample size (sample size minus number of \code{NA}s), and standard deviation (i.e., \code{sd}). \code{NA} values are removed from the calculations with \code{na.rm=TRUE} (the DEFAULT). The number of digits in the returned results are controlled with \code{digits=}.
#' 
#' @note Students often need to examine basic statistics of a quantitative variable separated for different levels of a categorical variable. These results may be obtained with \code{\link[base]{tapply}}, \code{\link[base]{by}}, or \code{\link[stats]{aggregate}} (or with functions in other packages), but the use of these functions is not obvious to newbie students or return results in a format that is not obvious to newbie students. Thus, the formula method to \code{Summarize} allows newbie students to use a common notation (i.e., formula) to easily compute summary statistics for a quantitative variable separated by the levels of a factor.
#'
#' @aliases Summarize Summarize.default Summarize.formula
#'
#' @param object A vector of numeric data.
#' @param digits A single numeric that indicates the number of decimals to round the numeric summaries.
#' @param na.rm A logical that indicates whether numeric missing values (\code{NA}) should be removed (\code{=TRUE}, default) or not.
#' @param exclude A string that contains the level that should be excluded from a factor variable.
#' @param data A data.frame that contains the variables in \code{formula}.
#' @param nvalid A string that indicates how the \dQuote{validn} result will be handled. If \code{"always"} then \dQuote{validn} will always be shown and if \code{"never"} then \dQuote{validn} will never be shown. However, if \code{"different"} (DEFAULT), then \dQuote{validn} will only be shown if it differs from \dQuote{n} (or if at least one group differs from \dQuote{n} when summarized by multiple groups).
#' @param percZero A string that indicates how the \dQuote{percZero} result will be handled. If \code{"always"} then \dQuote{percZero} will always be shown and if \code{"never"} then \dQuote{percZero} will never be shown. However, if \code{"different"} (DEFAULT), then \dQuote{percZero} will only be shown if it is greater than zero (or if at least one group is greater than zero when summarized by multiple groups).
#' @param \dots Not implemented.
#'
#' @return A named vector or data frame (when a quantitative variable is separated by one or two factor variables) of summary statistics for numeric data.
#'
#' @author Derek H. Ogle, \email{DerekOgle51@gmail.com}
#'
#' @seealso See \code{\link[base]{summary}} for related one dimensional functionality. See \code{\link[base]{tapply}}, \code{summaryBy} in \pkg{doBy}, \code{describe} in \pkg{psych}, \code{describe} in \pkg{prettyR}, and \code{basicStats} in \pkg{fBasics} for similar \dQuote{by} functionality.
#'
#' @keywords misc
#'
#' @examples
#' ## Create a data.frame of "data"
#' n <- 102
#' d <- data.frame(y=c(0,0,NA,NA,NA,runif(n-5)),
#'                 w=sample(7:9,n,replace=TRUE),
#'                 v=sample(0:2,n,replace=TRUE),
#'                 g1=factor(sample(c("A","B","C",NA),n,replace=TRUE)),
#'                 g2=factor(sample(c("male","female","UNKNOWN"),n,replace=TRUE)),
#'                 g3=sample(c("a","b","c","d"),n,replace=TRUE),
#'                 stringsAsFactors=FALSE)
#' 
#' # typical output of summary() for a numeric variable
#' summary(d$y)   
#'
#' # this function           
#' Summarize(d$y,digits=3)
#' Summarize(~y,data=d,digits=3)
#' Summarize(y~1,data=d,digits=3)
#'
#' # note that nvalid is not shown if there are no NAs and
#' #   percZero is not shown if there are no zeros
#' Summarize(~w,data=d,digits=3)
#' Summarize(~v,data=d,digits=3)
#' 
#' # note that the nvalid and percZero results can be forced to be shown
#' Summarize(~w,data=d,digits=3,nvalid="always",percZero="always")
#' 
#' ## Numeric vector by levels of a factor variable
#' Summarize(y~g1,data=d,digits=3)
#' Summarize(y~g2,data=d,digits=3)
#' Summarize(y~g2,data=d,digits=3,exclude="UNKNOWN")
#'
#' ## Numeric vector by levels of two factor variables
#' Summarize(y~g1+g2,data=d,digits=3)
#' Summarize(y~g1+g2,data=d,digits=3,exclude="UNKNOWN")
#' 
#' ## What happens if RHS of formula is not a factor
#' Summarize(y~w,data=d,digits=3)
#'
#' ## Summarizing multiple variables in a data.frame (must reduce to numerics)
#' lapply(as.list(d[,1:3]),Summarize,digits=4)
#'
#' @rdname Summarize
#' @export
Summarize <- function(object, ...) {
  UseMethod("Summarize")   
}

#' @rdname Summarize
#' @export
Summarize.default <- function(object,digits=getOption("digits"),
                              na.rm=TRUE,exclude=NULL,
                              nvalid=c("different","always","never"),
                              percZero=c("different","always","never"),...) {
  nvalid <- match.arg(nvalid)
  percZero <- match.arg(percZero)
  ## Do some checking on object type
  if (is.data.frame(object)) STOP("'Summarize' does not work with a data.frame.")
  if (is.matrix(object)) {
    if (ncol(object)>1) STOP("'Summarize' does not work with matrices.")
    else {
      # convert 1-d matrices to vectors
      if (is.numeric(object)) object <- as.numeric(object[,1])
      else object <- as.factor(object[,1])
    }
  }
  ## Start processing
  if (!is.numeric(object)) STOP("'Summarize' only works with a numeric variable")
  else iSummarizeQ1(object,digits,na.rm,nvalid,percZero)
}

#' @rdname Summarize
#' @export
Summarize.formula <- function(object,data=NULL,digits=getOption("digits"),
                              na.rm=TRUE,exclude=NULL,
                              nvalid=c("different","always","never"),
                              percZero=c("different","always","never"),...) {
  nvalid <- match.arg(nvalid)
  percZero <- match.arg(percZero)
  ## Handle the formula
  tmp <- iHndlFormula(object,data,expNumR=1)
  ## Start Processing
  if (tmp$vnum==1) {
    ## Only one variable ... send first column of model.frame
    ## to summarize.default
    Summarize(tmp$mf[,1],digits=digits,na.rm=na.rm,
              exclude=exclude,nvalid,percZero,...)
  } else {
    ## More than one variable
    if (!tmp$metExpNumR) STOP("Must have one variable on LHS of formula with more than one variable")
    if (tmp$Rclass %in% c("numeric","integer")) {
      iSummarizeQf(tmp,digits,na.rm,exclude,nvalid,percZero)
    } else STOP("'Summarize' only works with a numeric variable on LHS.")
  }
}


##############################################################
## Internal function for vector of quantitative data
##############################################################
iSummarizeQ1 <- function(object,digits,na.rm,nvalid,percZero) {
  ## get overall sample size
  n <- length(object)
  ## get nvalid and number of NAs
  n.valid <- validn(object)
  ## count zeros (must get rid of NAs first ... 
  ##    otherwise counted as zeros)
  tmp <- object[!is.na(object)]
  zrs <- length(tmp[tmp==0])
  ## compute mean and sd
  mean <- mean(object,na.rm=na.rm)
  sd <- stats::sd(object,na.rm=na.rm)
  ## get other summary values
  tmp <- summary(object,na.rm=na.rm)[c("Min.","1st Qu.","Median","3rd Qu.","Max.")]
  res <- c(mean,sd,tmp)
  names(res) <- c("mean","sd","min","Q1","median","Q3","max")
  ## Add on percent that are zero???
  if (percZero=="always" | (percZero=="different" & zrs>0)) {
    res <- c(res,zrs/n.valid*100)
    names(res)[length(res)] <- "percZero"
  }
  ## Round all results to this point
  res <- round(res,digits)  
  ## Add on valid n???
  if (nvalid=="always" | (nvalid=="different" & n.valid!=n)) {
    res <- c(n.valid,res)
    names(res)[1] <- "nvalid"
  }
  ## Add on n
  res <- c(n,res)
  names(res)[1] <- "n"
  ## Return the result
  res
}

##############################################################
## Internal function for formula with quantitative response
##############################################################
iSummarizeQf <- function(tmp,digits,na.rm,exclude,nvalid,percZero) {
  ## Set defaults for whether variables should be unfactored or not
  unfactor1 <- unfactor2 <- FALSE
  ## Exclude values from factor variables if asked to
  if (!is.null(exclude)) {
    for (i in seq_along(exclude)) {
      tmp$mf <- droplevels(tmp$mf[apply(data.frame(tmp$mf[,-tmp$Rpos])!=exclude[i],1,all),])
    }
  }
  ## Get the response variable
  nv <- tmp$mf[,tmp$Rpos]
  ## Make sure LHS is simple enough
  if (tmp$Enum>2) STOP("With a quantitative response (LHS), the RHS may contain only one or two factors.")
  ## Get results for quant variable by each level of interaction variable.
  if (tmp$Enum==1) { ## Only one explanatory variable
    rv <- tmp$mf[,tmp$Enames]
    intres <- tapply(nv,rv,Summarize,na.rm=na.rm,nvalid="always",percZero="always")
  } else { ## Two explanatory variables
    rv <- tmp$mf[,tmp$Enames[1]]
    cv <- tmp$mf[,tmp$Enames[2]]
    iv <- interaction(rv,cv,sep=":")
    intres <- tapply(nv,iv,Summarize,na.rm=na.rm,nvalid="always",percZero="always")
  }
  # Put together as a matrix
  res <- round(do.call(rbind,intres),digits)
  # get colnames of matrix version of tapply object
  res.names <- colnames(res)
  # split rownames of tapply object into component parts
  lvl.lbls <- t(data.frame(strsplit(rownames(res),"\\:")))
  # remove attribute names on the level labels
  attr(lvl.lbls,"dimnames") <- NULL
  # put together as a data.frame
  if (tmp$Enum==1) { ## Only one explanatory variable
    if (tmp$Eclass!="factor") mode(lvl.lbls) <- tmp$Eclass
      else lvl.lbls <- as.factor(lvl.lbls)
    res <- data.frame(lvl.lbls,res,stringsAsFactors=FALSE)
  } else {
    lvl.lbls1 <- lvl.lbls[,1]
    lvl.lbls2 <- lvl.lbls[,2]
    if (tmp$Eclass[1]!="factor") mode(lvl.lbls1) <- tmp$Eclass[1]
      else lvl.lbls1 <- as.factor(lvl.lbls1)
    if (tmp$Eclass[2]!="factor") mode(lvl.lbls2) <- tmp$Eclass[2]
      else lvl.lbls2 <- as.factor(lvl.lbls2)
    res <- data.frame(lvl.lbls1,lvl.lbls2,res,stringsAsFactors=FALSE)
  }
  # make sure colnames of new data.frame make sense
  # should be names of explanatory variables and the colnames
  # from the tapply result
  names(res) <- c(tmp$Enames,res.names)
  # eliminate row names
  rownames(res) <- NULL
  ## Remove validn column if asked to ("never") or not interesting (all = n)
  if (nvalid=="never" | (nvalid=="different" & all(res$n==res$nvalid))) {
    res <- res[,names(res)!="nvalid"]
  }
  ## Remove percZero column if asked to ("never") or not interesting (none >0)
  if (percZero=="never" | (percZero=="different" & !any(res$percZero>0,na.rm=TRUE))) {
    res <- res[,names(res)!="percZero"]
  }
  ## Return the result
  res
}