R/summary2.R

#' @title Summarize edsurvey.data.frame Variables
#'
#' @description Summarizes edsurvey.data.frame variables.
#'
#' @param data an \code{edsurvey.data.frame} or \code{light.edsurvey.data.frame}
#' @param variable character vector of variable names
#' @param weightVar character weight variable name. Default to be the default weight of \code{data} if it exists.
#'                  If the given survey data do not have a default weight,
#'                  the function will produce unweighted statistics instead. 
#'                  Can be set to \code{NULL} to return unweighted statistics.
#' @param omittedLevels a logical value. When set to \code{TRUE}, drops those levels of the specified \code{variable}.
#'                     Use print on an \code{edsurvey.data.frame} to see the omitted levels. Defaults to \code{FALSE}.
#'
#' @return 
#' summary of weighted or unweighted statistics of a given variable in an \code{edsurvey.data.frame}  
#' 
#' For categorical variables, summary results are a crosstab of all variables and include:
#' \describe{
#'   \item{[variable name]}{level of the variable in the column name that the row regards. There is one column per element of \code{variable}.}
#'   \item{N}{number of cases for each category. Weighted N is also produced if users choose to produce weighted statistics.}
#'   \item{Percent}{percentage of each category. Weighted percent is also produced if users choose to produce weighted statistics.}
#'   \item{SE}{standard error of the percentage statistics}
#'  }
#'  
#' For continuous variables, summary results are by variable and include:
#' \describe{
#'   \item{Variable}{name of the variable the row regards}
#'   \item{N}{total number of cases (both valid and invalid cases)}
#'   \item{Min.}{smallest value of the variable}
#'   \item{1st Qu.}{first quantile of the variable}
#'   \item{Median}{median value of the variable}
#'   \item{Mean}{mean of the variable}
#'   \item{3rd Qu.}{third quantile of the variable}
#'   \item{Max.}{largest value of the variable}
#'   \item{SD}{standard deviation or weighted standard deviation}
#'   \item{NA's}{number of \code{NA} in variable and in weight variables}
#'   \item{Zero-weights}{number of zero-weight cases if users choose to produce weighted statistics}
#' }
#' 
#' If the weight option is chosen, the function produces weighted percentile and standard deviation. Refer to the vignette titled 
#' \href{https://www.air.org/sites/default/files/EdSurvey-Statistics.pdf}{Statistics} and
#' the vignette titled
#' \href{https://www.air.org/sites/default/files/EdSurvey-Percentiles.pdf}{Percentile}
#' for how the function calculates these statistics (with and without plausible values). 
#' 
#' @importFrom stats quantile
#' @export
#' @example man\examples\summary2.R
#' @seealso \code{\link{percentile}} 
#' @author Paul Bailey and Trang Nguyen
summary2 <- function(data, variable,
                     weightVar=attr(getAttributes(data,"weights"),"default"),
                     omittedLevels=FALSE) {
  checkDataClass(data,c("light.edsurvey.data.frame","edsurvey.data.frame"))
  callc <- match.call()
  if (is.null(weightVar) || !weightVar %in% colnames(data)) {
    callc$weightVar <- NULL
    edf <- getData(data,variable, omittedLevels = omittedLevels, includeNaLabel = !omittedLevels)
    N <- nrow(edf)
    if(length(unique(typeOfVariable(variable,data))) > 1) {
      stop("Summarize only discrete or only continious variables together.")
    }
    if (unique(typeOfVariable(variable,data)) == "discrete") {
      ret <- as.data.frame(ftable(edf[,variable], exclude = NULL))
      colnames(ret) <- c(variable, "N")
      ret$Percent <- ret$N/N * 100
    } else {
      ret <- lapply(1:ncol(edf), function(i) {
        descriptiveContinuous(edf[[i]])
      })
      ret <- cbind("Variable" = names(edf), as.data.frame(do.call('rbind',ret)))
    }
    ret <- list(summary=ret)
    ret$call <- callc
    class(ret) <- "summary2"
    return(ret)
  } # end if (is.null(weightVar) || !weightVar %in% colnames(data))
  
  callc$weightVar <- weightVar
  data <- getData(data,c(variable, weightVar), omittedLevels = omittedLevels,
                  addAttributes = TRUE, includeNaLabel = !omittedLevels, drop = FALSE)
  if(length(unique(typeOfVariable(variable, data))) > 1) {
    stop("The summary2 function requires that all variables are discrete or all variables are continuous.")
  }
  if(unique(typeOfVariable(variable, data)) == "discrete") {
    ret <- edsurveyTable(as.formula(paste0("~ ",paste(variable, collapse=" + "))),
                         data=data,returnMeans=FALSE,
                         omittedLevels = omittedLevels,
                         weightVar = weightVar)
    colnames(ret$data)[3] <- "Weighted N"
    colnames(ret$data)[4] <- "Weighted Percent"
    colnames(ret$data)[5] <- "Weighted Percent SE"
    ret <- ret$data[,1:5]
  } else { # end # end if(typeOfVariable(variable,data) == "discrete")
    variableR <- variable
    # build a data.frame, robust to vector "variable"
    ret <- do.call(rbind, lapply(variable, function(v) {
      as.data.frame(suppressWarnings(percentile(v, data=data,
                                                percentiles=c(0,25,50,75,100),
                                                weightVar = weightVar)))$estimate
    }))
    # turn plausible value variables into their member parts
    ret0 <- lapply(1:length(variable), function(vi) {
      v <- variable[vi]
      if (hasPlausibleValue(v,data)) {
        v <- getPlausibleValue(v,data)
      }
      lm0 <- fast.sd(data[,v], data[,weightVar])
      meanVar <- lm0$mean
      sdVar <- lm0$std
      n <- nrow(data)
      wN <- sum(data[,weightVar],na.rm = TRUE)
      nNA <- sum(rowSums(is.na(data[,v,drop=FALSE])) > 0 | is.na(data[,weightVar]))
      return(c(n, wN, ret[vi, 1:3], meanVar, ret[vi, 4:5], sdVar, nNA,
               sum(data[,weightVar] == 0, na.rm = TRUE)))
    })
    ret <- do.call(rbind, ret0)
    colnames(ret) <- c("N","Weighted N","Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", 
                    "Max.", "SD","NA's", "Zero-weights")
    ret <- cbind("Variable" = variableR, as.data.frame(ret))
  } # end else for if(typeOfVariable(variable,data) == "discrete")
  ret <- list(summary=ret)
  ret$call <- callc
  class(ret) <- "summary2"
  return(ret)
}



#' @method print summary2
#' @export
print.summary2 <- function(x, ...) {
  call <- x$call
  if (!"weightVar" %in% names(call)) {
    cat("Estimates are not weighted.\n")
  } else {
    cat(paste0("Estimates are weighted using weight variable ", sQuote(call$weightVar),"\n"))
  } # end if (!"weightVar" %in% names(call))
  print(x$summary, ...)
}



# calculates a mean and standard deviation (std) estimate based on variables
# that are already read in.
# variables: the variables in this variable (this is not vectorized, simply allows for PV vars)
# weight: the full sample weight
fast.sd <- function(variables, weight) { 
  y <- as.matrix(variables) # need to abstract PVs
  variance <- mu <- rep(NA, ncol(y))
  for(i in 1:ncol(y)) {
    y0 <- y[!is.na(y[,i]) & !is.na(weight) & weight != 0,i]
    w0 <- weight[!is.na(y[,i]) & !is.na(weight) & weight !=0]
    mu[i] <- sum(w0 * y0)/sum(w0)
    variance[i] <- sum(w0 * (y0 - mu[i])^2)/sum(w0)
  }
  return(list(mean=mean(mu), std=sqrt(mean(variance))))
}

# returns the N, min, quartiles, max, mean, and sd of x, and number of NA's
# does no checking of x
descriptiveContinuous <- function(x) {
  # type=8 is the unbiased quantile, also what is used in EdSurvey::percentile
  q <- quantile(x, probs = c(0,0.25,0.5,0.75,1), na.rm = TRUE, type = 8)
  ret <- c(length(x),q[1:3],mean(x,na.rm=TRUE),q[4:5],sd(x,na.rm = TRUE), sum(is.na(x)))
  names(ret) <- c("N","Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", 
                  "Max.", "SD", "NA's")
  return(ret)
}

Try the EdSurvey package in your browser

Any scripts or data that you put into this service are public.

EdSurvey documentation built on May 2, 2019, 7:30 a.m.