# R/summaryStats.R
#
# Defines functions: summaryStats
#
# Documented in: summaryStats

#' @title Summary Statistics
#' @description Easy summary statistics, printed to the console one column at
#'   a time. Hmisc didn't do it for me.
#'
#'   * Character columns: a frequency table of the values seen at least
#'     `minFreq` times, ordered by descending count, optionally joined with
#'     grouped sums (`sumInChar`) and grouped averages (`avgInChar`).
#'   * Integer / numeric columns: sum, mean, standard deviation and number of
#'     missing values.
#'   * Date columns: the span between the earliest and latest non-missing date.
#'   * Columns of any other class are silently skipped.
#' @param df data.table (a plain data.frame is also accepted) from which summary statistics will come
#' @param minFreq either a proportion (values below 1) or an integer count. Minimum samples/percentage of df needed to show unique values
#' @param vars A vector of variables to show summary stats for. Defaults to every column of `df`
#' @param sumInChar Tack on a column that you would like to see the sum grouped by the character variables
#' @param avgInChar Tack on a column that you would like to see the average grouped by the character variables
#' @param decimals How many decimals to show
#' @return Called for its printed output; returns `NULL` invisibly.
#' @importFrom utils stack
#' @importFrom data.table data.table setnames :=
#' @export
summaryStats <- function(df, minFreq = 0.01, vars = NULL, sumInChar = NULL, avgInChar = NULL, decimals = 2) {

  if (is.null(vars)) vars <- names(df)

  missingVars <- setdiff(vars, names(df))
  if (length(missingVars) > 0) {
    stop("Some vars not in df names: ", paste(missingVars, collapse = ", "), call. = FALSE)
  }

  # A value below 1 is a proportion of the rows; convert it to a row count.
  if (minFreq < 1) minFreq <- round(nrow(df) * minFreq)

  # Shared banner printed before every column's statistics.
  printHeader <- function(colName, colClass) {
    cat("\n\n------------------------------------------------------------\n")
    cat(colName, " - ", colClass, "\n\n")
  }

  # Apply `fun` (sum / mean) to `cols`, grouped by `groupCol`, rounded.
  groupedStat <- function(dt, cols, groupCol, fun) {
    dt[, lapply(.SD, function(x) round(fun(x, na.rm = TRUE), decimals)),
       .SDcols = cols, by = groupCol]
  }

  # data.table view of df, built lazily: only the grouped sum/average path
  # needs data.table semantics, and df itself is never modified.
  dt <- NULL

  # The loop variable is deliberately not called `var`: that name shadows
  # stats::var and collides with any column that happens to be named "var".
  for (colName in vars) {

    dc <- df[[colName]]
    cl <- class(dc)

    # inherits() copes with multi-class columns (e.g. c("IDate", "Date")),
    # where `if (class(x) %in% ...)` has a length > 1 condition and errors.
    if (inherits(dc, "character")) {

      # Frequency table built from the table's names and counts directly, so
      # an empty column yields an empty table instead of an error.
      tab <- table(dc, useNA = "ifany")
      ret <- data.table::data.table(as.character(names(tab)), as.vector(tab))
      data.table::setnames(ret, c(colName, "Count"))

      if (!is.null(sumInChar) || !is.null(avgInChar)) {
        if (is.null(dt)) dt <- data.table::as.data.table(df)
        if (!is.null(sumInChar)) ret <- merge(ret, groupedStat(dt, sumInChar, colName, sum), by = colName)
        if (!is.null(avgInChar)) ret <- merge(ret, groupedStat(dt, avgInChar, colName, mean), by = colName)
      }

      ret <- ret[Count >= minFreq][order(-Count)]
      # Number of distinct values hidden by the minFreq filter.
      ns <- length(unique(dc)) - nrow(ret)

      printHeader(colName, cl)
      if (nrow(ret) == 0) {
        cat("Nothing to show here")
      } else {
        print(ret)
        if (ns > 0) cat("\nValues Not Shown: ", ns)
      }

    } else if (inherits(dc, c("integer", "numeric"))) { # Numeric Variables

      printHeader(colName, cl)
      cat("Sum of Col:         ", round(sum(dc, na.rm = TRUE), decimals), "\n")
      cat("Average Value:      ", round(mean(dc, na.rm = TRUE), decimals), "\n")
      # sd(), not var(): the label promises a standard deviation.
      cat("Standard Deviation: ", round(stats::sd(dc, na.rm = TRUE), decimals), "\n")
      cat("Missing Values:     ", sum(is.na(dc)), "\n")

    } else if (inherits(dc, "Date")) {

      printHeader(colName, cl)
      if (all(is.na(dc))) {
        # No usable dates: avoid max()/min() warnings on an empty set.
        cat("Date Range:", "NA")
      } else {
        cat("Date Range:", format(max(dc, na.rm = TRUE) - min(dc, na.rm = TRUE)))
      }

    }

  }

  invisible(NULL)
}
# Register `Count` as a known global name: summaryStats refers to the `Count`
# column unquoted inside data.table expressions, which R CMD check would
# otherwise report as an undefined global variable.
utils::globalVariables("Count")
# AnotherSamWilson/helperFuncs documentation built on Oct. 1, 2019, 8:51 p.m.