# R/dict.R

#' Data dictionary.
#' 
#' Get information about the columns in a dataset.
#'
#' @param x Data frame to summarize.
#' @param nastrings Strings to treat as NA. Defaults to this package's nastrings.
#' @param run.on.sample Optionally, run on a sample of the data to make this faster. Enter the proportion of rows to sample (between 0 and 1).
#' @param do.droplevels Unused factor levels can cause problems here. Set to TRUE to drop unused levels from all factors. This is slower but may resolve errors.
#' @param return.lists Outliers and NAs can optionally be returned as lists so they can easily be pulled out of the data later. This slows things down, so it is not the default.
#'
#' @return Data frame with information for each column. 
#' Outliers use the 1.5 IQR (Tukey) method.
#' pareto80 is a measure of categorical concentration: the minimum proportion of categories that account for 80% of the rows. If it is 0.2, the 80/20 rule applies; if it is 0.1, then 10% of categories make up 80% of the rows. If a column is evenly distributed it will be 0.8.
#' 
#' @export
#'
#' @examples
#' 
#' dict(cars)
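#' 
#' # Illustrative example: the most common of three categories covers 70% of rows, so
#' # two of the three categories (2/3 = 0.67) are needed to reach 80% of rows.
#' dict( data.frame( group = rep( c('a', 'b', 'c'), times = c(70, 20, 10) ) ) )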
#' 
dict = function( x, nastrings = bc::nastrings, run.on.sample = NULL, do.droplevels = FALSE, return.lists = FALSE ){

    dict = data.frame( 
      name = character(),
      stringsAsFactors = FALSE
    )

    # add a row id so original row positions can be reported.
    x$bc.row = 1:nrow(x)

    # optionally run on a sample of rows to speed things up.
    if( !is.null(run.on.sample) ) x = x[ sample( 1:nrow(x), size = ceiling( nrow(x) * run.on.sample ), replace = FALSE ), ]

    # choose up to 5 sample rows once, after any subsampling, so the sample column shows the same rows for every column.
    splrows = sample( 1:nrow(x), size = min( 5, nrow(x) ), replace = FALSE )

    for( col in setdiff( colnames(x), 'bc.row' ) ){

        col.dict = data.frame( 
            name = col, 
            class = gsub( 'POSIXct', 'datetime(POSIXct)', class( x[[col]] )[1] ),
            rows = nrow(x),
            stringsAsFactors = FALSE
        )

        # factors are faster to analyze.
        if( is.character( x[[col]] ) ) x[[col]] = as.factor( x[[col]] )

        # replace na strings and relevel.
        if( is.factor( x[[col]] ) && !is.null(nastrings) ){
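            # levels(f)[f] expands the factor to its character labels so they can be matched against nastrings.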
          
            newna = which( levels(x[[col]])[ x[[col]] ] %in% nastrings )
            
            if( length(newna) > 0 ){
                x[[col]][ newna ] <- NA
                x[[col]] = droplevels( x[[col]] )
            }
            
            rm(newna)
            
        }
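        # record NA positions once for reuse below; flag the column as 'unique' when no value is duplicated.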
        
        navals = which( is.na( x[[col]] ) )
        col.dict$unique = ifelse( !any( duplicated( x[[col]] ) ), 'unique', NA )

        if( is.factor( x[[col]] ) ){
          
          if( do.droplevels ) x[[col]] = droplevels( x[[col]] )

          if( length(navals) > 0 ) {
            vals = x[ -navals, c( 'bc.row', col ) ]
          } else {
            vals = x[ , c( 'bc.row', col ) ]
          }
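          # tabulate level counts in descending order: the most common level is the mode, and
          # pareto80 is the share of levels whose cumulative count reaches 80% of the non-NA rows.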
          
          valcnt = rev( sort( table( vals[[col]] ) ) )
          col.dict$mode = as.character( names( valcnt )[1] )
          col.dict$pareto80 = round( ( sum( cumsum(valcnt) < (nrow(vals) * 0.8) ) + 1 ) / length(valcnt), 2 )

          col.dict$n.vals = length( levels( x[[col]] ) )
            
          rm( valcnt )

        } else if( is.numeric( x[[col]] ) || lubridate::is.Date(x[[col]]) || lubridate::is.POSIXct(x[[col]]) ){

            if( length(navals) > 0 ) {
              vals = x[ -navals, c( 'bc.row', col ) ]
            } else {
              vals = x[ , c( 'bc.row', col ) ]
            }
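            # sort by the column so min, max, and percentiles can be read by position; this
            # handles numerics, dates, and datetimes alike.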
          
            vals = dplyr::arrange_at( vals, col )

            col.dict$n.vals = length( unique( vals[[col]] ) )
            col.dict$min = vals[[col]][1]
            col.dict$pct25 = vals[[col]][ ceiling( nrow(vals) * .25 ) ]
            col.dict$pct50 = vals[[col]][ ceiling( nrow(vals) * .5 ) ]
            col.dict$pct75 = vals[[col]][ ceiling( nrow(vals) * .75 ) ]
            col.dict$max = vals[[col]][ nrow(vals) ]
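            # outlier detection, mean, and sd only apply to numeric columns (not dates/datetimes).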
            
            if( is.numeric( x[[col]] ) ){
              
              iqr = col.dict$pct75 - col.dict$pct25
      
              # Tukey fences: flag values more than 1.5 * IQR outside the 25th-75th percentile range.
              outlier.rows = vals$bc.row[ which( vals[[col]] < col.dict$pct25 - 1.5 * iqr | vals[[col]] > col.dict$pct75 + 1.5 * iqr ) ]
              col.dict$outlier.ct = length(outlier.rows)
              col.dict$outlier.pct = col.dict$outlier.ct / nrow(vals)
              if( length(outlier.rows) > 0 ) col.dict$outlier.rows = if( return.lists ){ list(outlier.rows) } else { cc( head( outlier.rows, 5), sep = ', ' ) }

              rm(outlier.rows)

              col.dict$mean = mean(vals[[col]])
              col.dict$sd = sd(vals[[col]])
            }

        } else if( is.logical( x[[col]] ) ) {

          if( length(navals) > 0 ) {
            vals = x[ -navals, c( 'bc.row', col ) ]
          } else {
            vals = x[ , c( 'bc.row', col ) ]
          }
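          # for logicals, mean is the proportion of TRUE values.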
          
          col.dict$mean = mean(vals[[col]])
          col.dict$sd = sd(vals[[col]])
          
        }
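        # NA count, percentage, and (optionally) the positions of NA rows.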

        col.dict$na.ct = length(navals)
        col.dict$na.pct = length(navals) / nrow(x)
        if( length(navals) > 0 ) col.dict$na.rows = if( return.lists){ list( navals ) } else { cc( head( navals, 5 ), sep = ', ' ) }
        
        col.dict$sample = list( as.character(x[[col]])[splrows] )
        if( ! return.lists ) col.dict$sample = cc( unlist( col.dict$sample ), sep = ', ' )
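        # round numeric summaries, then convert every column to character so summaries with
        # different column types can be stacked by bind_rows.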
              
        for( i in colnames(col.dict) ) if( is.numeric( col.dict[[i]] ) ) col.dict[[i]] = signif( col.dict[[i]], 3 )
        rm(i)

        for( i in colnames(col.dict) ) col.dict[[i]] = as.character(col.dict[[i]])
        rm(i)

        dict = dplyr::bind_rows( dict, col.dict )

        rm( col.dict, col, navals  )

    }
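    # columns were stored as character for binding; convert back to numeric where applicable.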

    for( i in colnames(dict) ) dict[[i]] = tonum( dict[[i]], do.na = 'return-unchanged', verbose = FALSE )
    
    return(dict)

}