# R/dm_summary.R

#' Compute summary statistics and frequency tables
#' 
#' @param dat data frame containing the variables to analyze.
#' @param varnum string or array with the continuous variables for which summary statistics are computed.
#' @param varclass string or array with the categorical variables for which frequency tables are computed.
#' @param target string or array with the categorical target variable to analyze in conjunction with the categorical variables.
#' @param event target event of interest to analyze.
#' @param percentiles percentiles to be computed on each analysis variable (given as values between 0 and 100 --e.g. c(0, 10, 90, 100)).
#' @param stats string containing the statistics to compute on the analysis variables (e.g. "mean sd").
#' @param top for categorical variables number of top most frequent categories to store in the output frequency table as long as their frequency
#' is at least \code{freqmin}. Use \code{top = NULL} to display all cases in alphabetical order. Use \code{top = 0} to display
#' all cases sorted by decreasing frequency.
#' @param freqmin for categorical variables minimum number of cases for a category to be stored in the output frequency table.
#' @param propmin for categorical variables minimum proportion of cases computed on nonmissing values to be stored in the output frequency table.
#' This is useful to get frequency resolution when a variable has too many missing values and almost no category satisfy the \code{freqmin} condition.
#' @param miss array containing the values that should be considered missing (for both numeric and character variables).
#' @return If both \code{varnum} and \code{varclass} are given, a list with names \code{summary} and \code{table} containing
#' respectively the summary statistics of continuous variables and the frequency table of categorical variables.
#'
#' If only \code{varnum} or \code{varclass} is given, a data frame containing the summary statistics or the frequency table,
#' respectively.
#'
#' The summary statistics table contains one row per variable and one column per percentile/statistic requested.
#' The percentiles are named using prefix 'p' as in 'p25' to indicate percentile 25.
#'
#' The frequency table contains one row per valid variable value, where "valid" means that their occurrence
#' frequency is at least \code{freqmin} (in absolute terms) or \code{propmin} (in relative terms).
#' All the values listed in parameter \code{miss} are considered as missing values and counted separately.
#'
#' For each variable, the following frequency information is shown:
#' - when \code{top} is not \code{NULL} and is \code{>0}, first the \code{top} most frequent categories
#' are shown, otherwise, all nonmissing values are shown.
#' - when \code{top} is not \code{NULL} and is \code{>0}, a row with the information on the "_other_" category
#' (that groups all remaining nonmissing values) is shown
#' - a row containing information on the missing cases (if any exist)
#' - a row containing the total count of valid top values, their proportion, and their target penetration
#' (when a target variable has been requested)
#' Note that summing the frequency of valid top values, the frequency of the "_other_" group, and the frequency
#' of missing values gives the total number of cases in the data, and their proportion sum to 1.
#'
#' The columns in the output frequency table are:
#' \itemize{
#' \item var analysis variable name
#' \item index column that indexes each non-NA and non-"other" variable value shown in the table
#' \item value value taken by the variable
#' \item freq frequency of the value
#' \item prop relative frequency of the value w.r.t. all the records
#' \item <target> (when a target variable is specified) target event penetration when the variable takes the corresponding value.
#' The name of the column is the name of the target variable analyzed followed by the target event value (e.g. y_1, where "y" is
#' the target variable and "1" is the event of interest)
#' }
#' The rightmost column contains the proportion of valid values w.r.t. the total number of nonmissing values, except
#' for the missing category, for which the proportion is computed as the number of missing values w.r.t. the total number of cases.
#' 
#' @details
#' Missing values are allowed. For continuous variables they are removed from the summary statistics calculation;
#' for categorical variables they are considered as possible value, but treated specially as explained above.
#'
#' Quantiles 0 and 1, if given, are displayed on columns "min" and "max" in the data frame containing the summary statistics.
dm_summary = function(
  dat, 
  varnum, 
  varclass=NULL,
  target=NULL,
  event=1,
  percentiles=c(0, 1, 5, 10, 25, 50, 75, 90, 95, 99, 100),
  stats="mean sd",
  top=NULL,
  freqmin=1,
  propmin=0.01,
  miss=c(NA, NaN, "")) {

  #---------------------------------- Parse input parameters ----------------------------------
  varnum = parseVariables(varnum)
  varclass = parseVariables(varclass)
  stats = parseVariables(stats)
  target = parseVariables(target)
  
  # Check existence of variables
  # (FIX: removed a trailing empty argument previously passed to checkVariables())
  varsNotFound = checkVariables(dat, c(varnum, varclass, target))
  stopifnot(is.null(target) || length(target) == 1)
  stopifnot(is.null(varsNotFound))
  
  # Continue parsing...
  targetFlag = FALSE
  if (!is.null(target)) targetFlag = TRUE
  if (!is.null(top) && top < 0) top = NULL
  # (parentheses added for clarity only: && binds tighter than ||, so behavior is unchanged)
  if ((!is.null(freqmin) && freqmin < 0) || is.null(freqmin)) freqmin = 0
  if (is.null(propmin) && freqmin > 0) propmin = 1		# This implies that the prop >= propmin condition does NOT ADD NEW valid values to the set of valid values yielded by the freq >= freqmin condition
  if ((!is.null(propmin) && propmin < 0) || is.null(propmin)) propmin = 0
  #-- Parse input parameters
  
  # Check if the user passed any variables at all
  if ( (is.null(varnum) || length(varnum) == 0) && (is.null(varclass) || length(varclass) == 0)) {
    # No variables to analyze...
    return(NULL)
  }
  #---------------------------------- Parse input parameters ----------------------------------
  
  
  #---------------------- Summary statistics for continuous variables -------------------------
  # Remove any factor attribute of continuous variables... (note that the simpler way of using dat[,varnum] = as.numeric(as.character(dat[,varnum])) does not work!)
  # NOTE that I don't update the 'dat' data frame because I don't want to remove the factor property for variables that are
  # analyzed both as continuous AND categorical variables.
  if (!is.null(varnum) && length(varnum) > 0) {	# varnum can be empty either by being NULL or being equal to ""
    cat("Analyzing continuous variables (", length(varnum), " variables)...\n", sep="")
    dat4summary = sapply(dat[,varnum, drop=FALSE], function(x) as.numeric(as.character(x)))
    # FIX: sapply() simplifies to a named vector when dat has a single row, which would make
    # the apply(dat4summary, 2, ...) calls below fail; restore the 1 x p matrix shape.
    if (!is.matrix(dat4summary)) {
      dat4summary = matrix(dat4summary, nrow=1, dimnames=list(NULL, varnum))
    }
    if (!is.null(percentiles) && length(percentiles) > 0) {
      dat.percentiles = apply(dat4summary, 2, FUN=quantile, probs=percentiles/100, na.rm=TRUE)
      # FIX: apply() simplifies to a vector when a single percentile is requested, which would
      # make the rownames() assignment below fail; restore the 1 x p matrix shape.
      if (!is.matrix(dat.percentiles)) {
        dat.percentiles = matrix(dat.percentiles, nrow=length(percentiles), dimnames=list(NULL, colnames(dat4summary)))
      }
      percentiles.names = paste("p", percentiles, sep="")
      rownames(dat.percentiles) = percentiles.names
    } else {
      dat.percentiles = NULL
      percentiles.names = NULL
    }
    dat.stats = matrix(nrow=0, ncol=length(varnum))
    # TODO: (2017/04/10) Try to improve performance here by computing all statistics at once!
    for (stat in stats) {
      dat.stats = rbind( dat.stats, apply(dat4summary, 2, stat, na.rm=TRUE) )
    }
    # Add the number of observations and number of non missing values
    dat.stats = rbind( rep(nrow(dat4summary), ncol(dat4summary)), apply(dat4summary, 2, function(x) sum(!is.na(x)) ), dat.stats )
    rownames(dat.stats) = c("n", "neff", stats)
    
    # Transpose the stats data frame so that the summary statistics are along the columns
    summary.out = as.data.frame( t( rbind(dat.stats, dat.percentiles) ) )
    summary.out$pmiss = (summary.out$n - summary.out$neff) / summary.out$n
    summary.out = summary.out[, c("n", "neff", "pmiss", stats, percentiles.names)]
    
    # Replace "p0" with "min" and "p100" with "max"
    if ("p0" %in% colnames(summary.out)) summary.out = rename.vars(summary.out, from="p0", to="min", info=FALSE)
    if ("p100" %in% colnames(summary.out)) summary.out = rename.vars(summary.out, from="p100", to="max", info=FALSE)
  }
  #---------------------- Summary statistics for continuous variables -------------------------
  
  
  #----------------------- Frequency table for categorical variables --------------------------
  if (!is.null(varclass) && length(varclass) > 0) {	# varclass can be empty either by being NULL or being equal to ""
    # Compute the frequency table of all variables and return the result as a list (using lapply())
    if (targetFlag) {
      dat.tablelist = lapply(dat[, varclass, drop=FALSE], table, dat[, target], useNA="ifany")
    } else {
      dat.tablelist = lapply(dat[, varclass, drop=FALSE], table, useNA="ifany")
    }
    
    # Store the list with the results in a data frame
    tab.out = list()
    for (v in varclass) {
      cat("Analyzing categorical variable ", v, "...\n", sep="")
      # Frequency table for the currently analyzed variable
      # NOTE the use of as.data.frame.matrix() instead of as.data.frame() in order to preserve the structure of the mxn contingency table!
      # (as.data.frame() would stack the horizontal dimension along the vertical dimension! --and we don't want that)
      # Ref: https://www.r-bloggers.com/how-to-convert-contingency-tables-to-data-frames-with-r/
      # NOTE about the output table:
      # - the variable's values frequencies are stored in column Freq
      # - the variable's values are stored in:
      # 	- column Var1 when no target variable is given
      # 	- as row names when a target variable is given
      if (targetFlag) {
        tabv = as.data.frame.matrix( dat.tablelist[[v]] )
      } else {
        tabv = as.data.frame( dat.tablelist[[v]] )
      }
      
      # Compute the total number of cases by variable value
      if (targetFlag) {
        # Total number of cases per variable's value
        tabv$Freq = apply(tabv, 1, sum)
        # Variable's values are stored as a column
        tabv$Var1 = rownames(tabv)
        rownames(tabv) = 1:nrow(tabv)
        # Compute the target penetration for the event of interest
        tabv[, "target"] = tabv[, as.character(event)] / tabv$Freq
        # Keep just the variable's values, number of cases, and target penetration
        tabv = tabv[, c("Var1", "Freq", "target")]
        # Name of the target column in the output frequency table
        targetcol = paste(target, as.character(event), sep="_")
      } else {
        # Name of the target column in the output frequency table (no column)
        targetcol = NULL
      }			
      
      # Total frequency and proportion on nonmissing cases (the proportion may be used for filtering of categories (based on parameter propmin)
      indna = tabv$Var1 %in% miss			# Note that when miss contains 'NaN', the corresponding name in tabv is "NaN" and a comparison of this name with NaN or "NaN" yields TRUE (great!)
      ntotal_notmiss = sum(tabv[!indna, "Freq"])
      freq = tabv$Freq
      prop = freq / ntotal_notmiss
      ntotal = sum(freq)
      
      # Top N, freq min and prop min
      if (!is.null(top) || freqmin > 0 || propmin > 0) {
        #------------------------------- FREQUENCY OF TOP VALID VALUES -----------------------------
        # Filter variable's values based on freqmin, propmin and top
        # (first the freqmin and propmin filters are applied and then the top 'top' cases are selected among those satisfying the freqmin OR propmin filter)
        # The result of using | prop >= propmin is that variables having too few nonmissing values can also be analyzed, since the propmin
        # condition is applied on the proportions calculated over nonmissing values.
        indvalid = !indna & (freq >= freqmin | prop >= propmin)
        nvalid = sum(indvalid)
        if (nvalid == 0) {
          indtop = NULL
          tabv.valid.sorted = NULL
          tabv.valid.sorted.top = NULL
          ntotal_validtop = 0
        } else {
          if (!is.null(top) && top == 0) {	# top = 0 means that we want ALL occurrences of the categorical variable (but sorted)
            indtop = 1:nvalid
          } else {
            indtop = 1:min(top, nvalid)  		# Note that top is guaranteed to be >= 1 or NULL (NULL is ok with the min() function)
          }
          
          # Sort nonmissing values by decreasing frequency on the "valid" variable values
          tabv.valid = tabv[indvalid,]
          if (targetFlag) {
            # Sort by decreasing target penetration
            ord = order( tabv.valid$target, decreasing=TRUE )
          } else {
            ord = order( tabv.valid$Freq, decreasing=TRUE )
          }
          tabv.valid.sorted = tabv.valid[ord,]

          # Add an index column (1, 2, 3, ...) for the valid values and update the information on proportions (because now the data is sorted by decreasing frequency)
          # NOTE that this proportion is computed BEFORE adding the NA information because the NA row stores
          # a proportion that is computed differently (on the total number of cases)
          tabv.valid.sorted$index = 1:nrow(tabv.valid.sorted)
          tabv.valid.sorted$prop = tabv.valid.sorted$Freq / ntotal_notmiss

          # TOP valid values
          tabv.valid.sorted.top = tabv.valid.sorted[indtop, , drop=FALSE]

          #-- Values that are added to the "Total" row in the final table below
          # Number of valid and top variable's values (in this case this is the number of nonmissing values)
          ntotal_validtop = sum(tabv.valid.sorted[indtop, "Freq"])
          if (targetFlag) {
            # Target penetration on the valid and top variable's values (i.e. nonmissing values)
            target_validtop = weighted.mean(tabv.valid.sorted[indtop, "target"], tabv.valid.sorted[indtop, "Freq"])
          }
        }
        #------------------------------- FREQUENCY OF TOP VALID VALUES -----------------------------


        #-------------------------------- FREQUENCY OF "OTHER" VALUES ------------------------------
        # Compute values for the "other" group (i.e. all the rest that has not been selected and is not missing)
        indother = !indna & !indvalid
        # Construct the "other" table as the indother cases + the non-top valid cases
        # Note: need to check that indtop is not empty or NULL because -indtop gives error in such cases!
        if (length(indtop) > 0) {
          tabv.other = rbind( tabv[indother, , drop=FALSE], tabv.valid.sorted[-indtop, 1:ncol(tabv), drop=FALSE])
        } else {
          tabv.other = rbind( tabv[indother, , drop=FALSE], tabv.valid.sorted[, 1:ncol(tabv), drop=FALSE])
        }
        nother = nrow(tabv.other)
        if (nother > 0) {
          tabv.other.agg = data.frame(Var1=paste("_other_(n=", nother, ")", sep=""), Freq=sum(tabv.other$Freq))
          if (targetFlag) {
            tabv.other.agg$target = weighted.mean(tabv.other$target, tabv.other$Freq)
          }
        } else {
          tabv.other.agg = NULL
        }
        
        if (nother > 0) {
          tabv.other.agg$index = NA
          tabv.other.agg$prop = tabv.other.agg$Freq / ntotal
          ## Note that for the "other" group we divide by the total number of cases (NOT by the total number of nonmissing cases)
          ## This is because freq("other") + freq("validtop") + freq("NA") = ntotal
          ## therefore we want prop("other") + prop("validtop") + prop("NA") = 100%
        }
        #-------------------------------- FREQUENCY OF "OTHER" VALUES ------------------------------


        #----------------------------------- FINAL FREQUENCY TABLE ---------------------------------
        # Put all the information together
        # TODO: (2017/04/10) The output of cbind() may be inappropriate when indna contains more than one index... we should sum on them, but
        # it's not easy because of the NA values... aggregations using aggregate() omit NAs by default and I haven't found a way of including them
        # as valid values (as there is no na.action= option that includes NAs!! (all the options EXCLUDE NAs (see help(na.action))
        # In any case, right now in principle we get one row per type of missing value.
        tab.freq = rbind( tabv.valid.sorted.top, tabv.other.agg, cbind( tabv[indna, , drop=FALSE], index=rep(NA, sum(indna)), prop=as.numeric(tabv[indna, "Freq"])/ntotal ))
        ## drop=FALSE and as.numeric() are important to avoid the error "names contain missing values",
        ## arising from the fact that the NA category generates a "missing value" name.
        #----------------------------------- FINAL FREQUENCY TABLE ---------------------------------
      } else {
        # All variable values should be reported
        tab.freq = tabv
        # Add an index variable (1, 2, ...) that numbers each variable value
        tab.freq$index = 1:nrow(tab.freq)
        # Compute the proportion of cases w.r.t. the number of nonmissing cases
        tab.freq$prop = tab.freq$Freq / ntotal_notmiss
        # For the NA cases, compute its percentage w.r.t. the total number of cases
        tab.freq$prop[indna] = tab.freq$Freq[indna] / ntotal
        
        #-- Values that are added to the "Total" row below
        # Number of valid and top variable's values (in this case this is the number of nonmissing values)
        ntotal_validtop = ntotal_notmiss
        if (targetFlag) {
          # Target penetration on the valid and top variable's values (i.e. nonmissing values)
          target_validtop = weighted.mean(tab.freq[!indna, "target"], tab.freq[!indna, "Freq"])
        }
      }
      
      # Add the current frequency table to the output table
      # (note that we add a new row with the total information, where the total nonmissing cases are shown)
      if (targetFlag) {
        tab.out = rbind(tab.out,
                        cbind(var=rep(v, nrow(tab.freq)), tab.freq[, c("index", "Var1", "Freq", "prop", "target")]),	# Resort the columns in tab.freq
                        cbind(var=v, index=NA, Var1="--TOTAL(valid&top)--", Freq=ntotal_validtop, prop=ntotal_validtop/ntotal, target=target_validtop))
        ## IMPORTANT: We need to use cbind() here and NOT c() because in the latter case we get an error that
        ## "--TOTAL--" is not a valid factor level! (because 'var' in the output data frame is considered a factor!)
      } else {
        tab.out = rbind(tab.out,
                        cbind(var=rep(v, nrow(tab.freq)), tab.freq[, c("index", "Var1", "Freq", "prop")]),
                        cbind(var=v, index=NA, Var1="--TOTAL(valid&top)--", Freq=ntotal_validtop, prop=ntotal_validtop/ntotal))		# IMPORTANT: We need to use cbind() here and NOT c() because in the latter case we get an error that "--TOTAL--" is not a valid factor level! (because 'var' in the output data frame is considered a factor!)
      }			
    }
    colnames(tab.out) = c("var", "index", "value", "freq", "prop", targetcol)
  }
  #----------------------- Frequency table for categorical variables --------------------------
  
  
  #---------------------------------------- Return info ---------------------------------------
  if (is.null(varclass) || length(varclass) == 0) {
    return(summary.out)
  } else if (is.null(varnum) || length(varnum) == 0) {
    return(tab.out)
  } else {
    return(list(summary=summary.out, table=tab.out))
  }
  #---------------------------------------- Return info ---------------------------------------
}
# Source: mastropi/dmtools (documentation built May 21, 2019)