R/sum_stat.R

#' Gives summary statistics (corresponds to Stata command summarize)
#'
#' @param x a data.frame
#' @param ... Variables to include. Defaults to all non-grouping variables. See the \link[dplyr]{select} documentation.
#' @param w Weights. Default to NULL.
#' @param i Condition
#' @param d Should detailed summary statistics be printed?
#' @param digits Number of significant decimal digits. Default to 3
#' @param .dots Used to work around non-standard evaluation.
#' @return a data.frame
#' @export
sum_up <- function(x, ...,  d = FALSE, w = NULL,  i = NULL, digits = 3) {
  UseMethod("sum_up")
}

#' @export
#' @method sum_up default
sum_up.default <- function(x, ...,  d = FALSE, w = NULL, digits = 3) {
  if (is.null(w)){
    x <- setNames(data_frame(x), "x")
  } else{
    x <- setNames(data_frame(x, w),  c("x", "weight"))
  }
  sum_up_(x, .dots = "x", d = d, w = w, digits = digits)
}



#' @export
#' @method sum_up data.frame
sum_up.data.frame <- function(x, ...,  d = FALSE, w = NULL,  i = NULL, digits = 3) {
  sum_up_(x, .dots = lazy_dots(...) , d = d, w = substitute(w), i = substitute(i), digits = digits)
}


#' @export
#' @rdname sum_up
sum_up_<- function(x, ..., .dots, d = FALSE,  w= NULL,  i = NULL, digits = 3) {
  w <- names(select_vars_(names(x), w))
  byvars <- as.character(groups(x))
  dots <- all_dots(.dots, ..., all_named = TRUE)
  vars <- select_vars_(names(x), dots, exclude = c(w, byvars))
  if (length(vars) == 0) {
    vars <- setdiff(names(x), c(byvars, w))
  }
  nums <- sapply(x, is.numeric)
  nums_name <- names(nums[nums == TRUE])
  vars <- intersect(vars, nums_name)
  if (!length(vars)) stop("Please select at least one non-numeric variable", call. = FALSE)
  newname = NULL
  if (!is.null(i)){
    newname <- tempname(x, 1)
    x <- mutate_(x, .dots = setNames(list(interp(~ as.integer(i), i = i)), newname))
    if (length(w)){
      x <- mutate_(x, .dots = setNames(list(interp(~ w*newname, w = as.name(w), newname = as.name(newname))), newname))
    }
    w <- newname
  }
  x <- select_(x, .dots = c(vars, byvars, w))
  # bug for do in data.table

  out <- do_(x, ~describe(., d = d, wname = w, byvars = byvars))
  out <- arrange_(out, .dots = c(byvars, "variable"))
  out <- select_(out, .dots = c(byvars, "variable", setdiff(names(out), c("variable", byvars))))
  print_pretty_summary(out, digits = digits)
  invisible(out)
}



describe <- function(M, d = FALSE, wname = character(0),  byvars = character(0)){
  if (length(byvars)){
    M <- select_(M, ~-one_of(byvars))
  }
  if (length(wname)){
    w <- M[[wname]]
    M <- select_(M, ~-one_of(wname))
  }
  else{
    w <- NULL
  }
  names <- names(M)
  # Now starts the code
  if (d==FALSE) {
    if (!is.null(w)){
      sum <- lapply(M ,function(x){
        take <- !is.na(x) & !is.na(w) & w > 0
        x_omit <- x[take]
        w_omit <- w[take]
        m <- matrixStats::weightedMean(x_omit, w = w_omit)
        c(length(x_omit), length(x)-length(x_omit), m, sqrt(matrixStats::weightedMean((x_omit-m)^2, w = w_omit)), matrixStats::colRanges(x_omit, dim = c(length(x_omit), 1)))
      })
    }else{
      sum <- lapply(M ,function(x){
        x_omit <- na.omit(x)
        c(length(x_omit), length(x) - length(x_omit), mean(x_omit), sd(x_omit), matrixStats::colRanges(x_omit, dim = c(length(x_omit), 1)))
      })
    }
    sum <- do.call(cbind, sum)
    sum <- as.data.frame(t(sum))
    sum <- bind_cols(data_frame(names), sum)
    sum <- setNames(sum, c("variable", "N","N_NA","mean","sd","min", "max"))
  } else {
    N <- nrow(M)
    f=function(x){
      if (!is.null(w)){
        take <- !is.na(x) & !is.na(w) & w > 0
        x_omit <- x[take]
        w_omit <- w[take]
        m <- matrixStats::weightedMean(x_omit, w = w_omit)
        sum_higher <- matrixStats::colWeightedMeans(cbind((x_omit-m)^2,(x_omit-m)^3,(x_omit-m)^4), w = w_omit)
        sum_higher[1] <- sqrt(sum_higher[1])
        sum_higher[2] <- sum_higher[2]/sum_higher[1]^3
        sum_higher[3] <- sum_higher[3]/sum_higher[1]^4
        sum_quantile <- pctile(x_omit, c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 0.99, 1), w = w_omit)
      } else{
        x_omit <- na.omit(x)
        m <- mean(x_omit)
        sum_higher <- colMeans(cbind((x_omit-m)^2,(x_omit-m)^3,(x_omit-m)^4))
        sum_higher[1] <- sqrt(sum_higher[1])
        sum_higher[2] <- sum_higher[2]/sum_higher[1]^3
        sum_higher[3] <- sum_higher[3]/sum_higher[1]^4
        sum_quantile= pctile(x_omit, c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.9, 0.95, 0.99, 1))
      }
      n_NA <- length(x) - length(x_omit)
      sum <- c(N-n_NA, n_NA, m, sum_higher, sum_quantile)
    }
    sum <- mclapply(M, f)
    sum <- do.call(cbind, sum)
    sum <- as.data.frame(t(sum))
    sum <- bind_cols(data_frame(names), sum)
    sum <- setNames(sum,  c("variable", "N","N_NA","mean","sd","skewness","kurtosis","min","p1","p5","p10","p25","p50","p75","p90","p95","p99","max"))
  }
  sum
}



print_pretty_summary <- function(x, digits = 3){
  if ("skewness" %in% names(x)){
    x1 <- select_(x, ~-one_of(c("p1","p5","p10","p25","p50","p75","p90","p95","p99")))
    x2 <-  select_(x, ~-one_of(c("N","N_NA","mean","sd","skewness","kurtosis", "min", "max")))
    print(format(x1, digits = digits, scientific = FALSE))
    print(format(x2, digits = digits, scientific = FALSE))
  } else{
    print(format(x, digits = digits, scientific = FALSE))
  }
}
eloualiche/miscr documentation built on May 16, 2019, 3:57 a.m.