R/Distributions.R

Defines functions deparse.names alignTables as.table.distribution rbind.humdrumR.table cbind.humdrumR.table humdrumR.table mutualInfo.default mutualInfo.probability mutualInfo entropies entropy.default entropy.density entropy.numeric entropy.probability entropy pMI like.data.frame like.default ic like pexprs unconditional conditional pdist.table pdist.humdrumR pdist.data.frame pdist.default pdist.probability pdist.count pdist count.table count.humdrumR count.default distmat cbind.distribution mergeLevels alignDistributions as.data.table.distribution as.matrix.distribution filter.distribution prettyBins prettyP prettyN Pequation print.distribution varnames getLevels getValues dist_type distribution

Documented in count.default count.humdrumR count.table entropy entropy.default entropy.density entropy.numeric entropy.probability mutualInfo mutualInfo.default mutualInfo.probability pdist pdist.count pdist.data.frame pdist.default pdist.humdrumR pdist.probability pdist.table print.distribution

# S4 Distributions ----

## Definitions ----





#' Distributions
#' 
#' HumdrumR has ways to...
#' 
#' @name distributions
NULL


# S4 class hierarchy.  A 'distribution' wraps a data.frame whose columns are
# factor ("level") columns plus a single value column: 'n' for counts, 'p'
# for probabilities.  The Sort slot records display order (0L = unsorted,
# -1L = decreasing, 1L = increasing; see the sort() method and print()).
setClass('distribution', contains = 'data.frame', slots = c(Sort = 'integer'))
setClass('count', contains = 'distribution')
# 'probability' additionally records the total observation count N and, for
# conditional distributions, the conditioning dimension names (Condition).
setClass('probability', contains = 'distribution', slots = c(N = 'integer', Condition = 'maybecharacter'))



### Constructors ----

# Internal constructor for distribution objects.
#
# @param x Anything coercible to a data.frame (levels + 'n'/'p' column).
# @param type Either a character code ('p' for probability, anything else
#   for count) or an existing distribution object whose class and slots
#   ('probability' N/Condition) are copied onto the result.
# @param Sort Display-order flag stored in the Sort slot.
# @param N,Condition Only used when constructing a 'probability' directly.
# @return A 'count' or 'probability' S4 object.
distribution <- function(x, type, Sort = 0L, N = 0L, Condition = NULL) {
  df <- as.data.frame(x)
  args <- list(new('distribution', df, Sort = Sort))
  
  if (is.character(type)) {
    # is.character() instead of class(type) == 'character': robust and
    # idiomatic (class() comparison fails for multi-class objects)
    args$Class <- if (type == 'p') 'probability' else 'count'
    if (args$Class == 'probability') {
      args$N <- N
      args$Condition <- Condition
    }
    
  } else { # type must be a distribution object; copy its class and slots
    args$Class <- class(type)[1L]
    if (args$Class == 'probability') {
      args$N <- type@N
      args$Condition <- type@Condition
    }

  }
  do.call('new', args)
}
  



### Accessors ----


# dist_type(): which value column ('n' or 'p') does this distribution carry?
dist_type <- function(dist) intersect(colnames(dist), c('n', 'p'))
# getValues(): extract the value column as a plain vector.
getValues <- function(dist) {
  df <- as.data.frame(dist)
  df[ , dist_type(dist)]
}
# getLevels(): extract the factor/level columns as a data.frame.
getLevels <- function(dist) {
  df <- as.data.frame(dist)
  df[ , varnames(dist), drop = FALSE]
}


#' @export
varnames <- function(x) setdiff(colnames(x), c('n', 'p'))

### print() ----

#' @export
#' @rdname distributions
setMethod('show', 'distribution', \(object) print.distribution(object))

#' @export
#' @rdname distributions
# Print a distribution as an aligned text table.
# A header/footer line names the distribution type (with the P(...) equation
# for probabilities) plus any global scaling/rounding annotations; values are
# formatted by prettyP()/prettyN().  When the distribution is unsorted and
# has 2+ dimensions, it is cross-tabulated "wide" on the second dimension,
# provided the result fits the console width.
print.distribution <- function(dist, digits = if (inherits(dist, 'probability')) 3 else 1,
                               syntaxHighlight = humdrumRoption('syntaxHighlight'),
                               zeros = '.') {
  
  type <- dist_type(dist)
  message <- paste0('humdrumR ', 
                    if (type == 'p') paste0('probability distribution ', Pequation(dist)) else 'count distribution')
  
  varnames <- varnames(dist)
  
  # empty distribution: print only the (optionally highlighted) header
  if (nrow(dist) == 0) {
    message <- paste0(message, ' (empty)')
    if (syntaxHighlight) message <- syntaxHighlight(message, 'G')
    cat(message, '\n')
    return(invisible(dist))
  }
  sort <- dist@Sort
  
  X <- getValues(dist)
  
  # format the value column as strings (carries attributes: scale/approx/...)
  X <- if (type == 'p') prettyP(X, digits = digits, zeros = zeros) else prettyN(X, digits = digits, zeros = zeros)
  
  # do we scale or round?
  scale <- attr(X, 'scale')
  
  scale <- if (length(scale) == 1L) {
    if (scale > 0) paste0(' (', c('thousands', 'millions', 'billions', 'trillions')[scale], ')')
  }
  
  if (attr(X, 'negative') %||% FALSE) scale <- paste0(scale, ' (∃N < 0)')
  if (attr(X, 'approx')) scale <- paste0(scale, ', ~rounded')
  message <- paste0(message, scale)
  
  dist <- getLevels(dist)
  dist[[type]] <- X
  
  # check if we can widen
  iswide <- FALSE
  printmat <- (if(sort == -0L && length(varnames) >= 2L) {
    
    # cross-tabulate on the second dimension; empty cells get the zero fill
    wide <- as.matrix(dcast(as.data.table(dist), rlang::new_formula(quote(...), rlang::sym(varnames[2])), fill = attr(X, 'zerofill'), value.var = type))
    
    factorcols <- colnames(wide) %in% varnames
    toprow    <- ifelse( factorcols, colnames(wide), '')
    toprow[length(varnames)] <- varnames[2]
    secondrow <- ifelse(!factorcols, colnames(wide), '')
    
    # column labels are mirrored above and below the data rows
    wide <- rbind(toprow, secondrow, wide, secondrow, toprow)
    wide <- apply(wide, 2, format, justify = 'right')
    wide <- wide[ , order(match(toprow, colnames(dist), nomatch = 2))]
    
    width <- sum(nchar(wide[1, ])) + ncol(wide) * 2
    
    # only use the wide layout if it fits within the console width
    iswide <- width < (options('width')$width - 10L)
    if (iswide) wide
  })  %||%  apply(rbind(colnames(dist), 
                        as.matrix(dist), 
                        colnames(dist)), 
                  2, format, justify = 'right')

  # sorted distributions get a Rank column (reversed for increasing sorts)
  if (sort != 0L) {
    rank <- format(c('Rank', seq_len(nrow(printmat) - 2L), 'Rank'), 
                   justify = 'left')
    if (sort == 1) rank <- rev(rank)
    printmat <- cbind(Rank = rank, printmat)
    
  }
  

  
  
  if (syntaxHighlight) {
    
    # build a parallel matrix of per-cell style codes for syntaxHighlight()
    syntax <- array('D', dim = dim(printmat))
    
    syntax[c(1, nrow(printmat)), ] <- 'N'
    
    dataCols <- !colnames(printmat) %in% c(varnames, 'Rank')
    rankCol <- colnames(printmat) == 'Rank'
    
    if (iswide) {
      syntax[c(2, nrow(printmat) - 1L), ] <- 'I' 
      dataRows <- 3:(nrow(printmat) - 2L)
    } else {
      dataRows <- 2:(nrow(printmat) - 1L)
    }
    
    syntax[dataRows, !dataCols & !rankCol] <- 'I'
    syntax[dataRows, rankCol] <- 'E'
    
    printmat[] <- syntaxHighlight(printmat, syntax)
    
    
    message <- syntaxHighlight(message, 'G')
  }


  # header, body rows, footer
  cat(message, '\n')
  cat(apply(printmat, 1, paste, collapse = '  '), sep = '\n')
  cat(message, '\n')
  invisible(dist)
}


# Build a probability-equation label like "P(X)", "P(X,Y)", or "P(X | Y)"
# from a distribution's dimension names and its Condition slot.
Pequation <- function(dist, f = 'P', collapse = ',') {
  condition <- dist@Condition
  
  lhs <- paste(setdiff(varnames(dist), condition), collapse = collapse)
  
  eq <- if (is.null(condition)) {
    lhs
  } else {
    paste0(lhs, ' | ', paste(condition, collapse = collapse))
  }
  
  paste0(f, '(', eq, ')')
}

# Format a vector of counts for printing.
# Values are rescaled by powers of 1000 and suffixed (k, M, G, P; '/k' etc.
# for magnitudes below 1).  If every nonzero value shares one scale, the
# per-value suffixes are dropped and the common scale is reported via the
# 'scale' attribute instead.  Other attributes on the result:
# 'zerofill' (string printed for zero cells), 'approx' (any value was
# changed by rounding), 'negative' (any value below zero).
prettyN <- function(N, digits = 1L, zeros = '.') {
  tens <- ifelse(N == 0, 0, log10(abs(N)) |> floor())
  
  thousands <- pmin(tens %/% 3, 4L)
  thousands[thousands < 0L] <- thousands[thousands < 0L] + 1L
  
  # scaling (powers of 10^3)
  scale <- c('', 'k', 'M', 'G', 'P')[1L + abs(thousands)]
  scale[thousands < 0] <- paste0('/', scale[thousands < 0]) # e.g. '/k' for sub-unit magnitudes
  N <- N / 1000^thousands

  # if all nonzero values share one suffix, factor it out globally
  globalscale <- if (length(unique(scale[N != 0])) == 1L) {
    globalscale <- unique(thousands[N != 0])
    scale <- NULL
    globalscale
  }
  
  # round and format
  Nround <- round(N, digits = digits)
  approx <- N != Nround # values changed by rounding are marked with '~'
  
  Nprint <- format(Nround, scientific = FALSE)
  Nprint[N != 0] <- gsub('^( *)0\\.', '\\1 .', Nprint[N != 0]) # '0.5' -> ' .5'
  Nprint <- gsub('\\.0+$', '', Nprint) # strip trailing '.0'
  Nprint <- gsub('^( *)0( *)$', paste0('\\1', zeros, '\\2'), Nprint) # zeros print as `zeros`

  
  output <- paste0(Nprint, scale, ifelse(approx, '~', ''))
  output <- paste0(output, strrep(' ', max(nchar(output)) - nchar(output))) 
  
  attr(output, 'zerofill') <- zeros
  attr(output, 'scale') <- globalscale
  attr(output, 'approx') <- any(approx, na.rm = TRUE)
  attr(output, 'negative') <- any(N < 0, na.rm = TRUE)
  
  output
  
}


# Format a vector of probabilities for printing.
# All values are rescaled by one common power of 1000 (reported via the
# 'scale' attribute as '10^(k)') so that small probabilities stay readable.
# Attributes: 'zerofill' (string for zero cells, padded to column width),
# 'scale' (common scale string, or NULL), 'approx' (any value rounded).
prettyP <- function(P, digits = 3, zeros = '.') {
  tens <- ifelse(P == 0, 0, log10(P) |> round())
  
  thousands <- (tens - 1) %/% -3
  scale <- min(thousands)
  
  P <- P * 1000^-scale
  
  Pround <- round(P, digits = digits)
  approx <- P != Pround # values changed by rounding are marked with '~'
  
  Pprint <- format(Pround, scientific = FALSE)
  Pprint <- gsub('^( *)0\\.', '\\1 .', Pprint) # '0.5' -> ' .5'
  # NOTE(review): '\\0*$' is a suspect pattern --- it was probably intended
  # to be '0*$' (strip trailing zeros, parallel to prettyN's '\\.0+$').
  # As written the backslash-zero escape does not strip trailing zeros;
  # confirm intent before changing.
  Pprint <- gsub('\\0*$', '', Pprint)
  Pprint[P == 0] <- paste0(' ', zeros)

  
  output <- paste0(Pprint, ifelse(approx, '~', ''))
  maxnchar <- max(nchar(output))
  output <- paste0(output, strrep(' ', maxnchar - nchar(output))) 
  # output <- gsub(paste0('\\.', strrep('0', digits), '~'), paste0('.~', strrep(' ', digits)), output)
  
  attr(output, 'zerofill') <- paste0(' ', zeros, strrep(' ', maxnchar - 2L))
  attr(output, 'scale') <- if (scale != 0L) paste0('10^(', -scale * 3, ')')
  attr(output, 'approx') <- any(approx)
  
  output
  
}


# Bin a numeric vector into a factor suitable for tabulation.
# Whole-number data with few enough distinct values keeps its exact levels;
# otherwise values are cut into bins, either at quantile boundaries
# (`quantiles` > 0 or TRUE, which means deciles) or at hist()'s "pretty"
# break points.
prettyBins <- function(x, maxN = 20, quantiles = 0, right = TRUE, ...) {
  levels <- sort(unique(x))
  
  if (is.integer(x) || all(is.whole(levels))) {
    # discrete data: show every value in range, or every observed value
    if (diff(range(levels)) <= maxN) return(factor(x, levels = min(levels):max(levels)))
    if (length(levels) <= maxN) return(factor(x, levels = levels))
  } else if (length(levels) <= (maxN / 2)) {
    # continuous data with few distinct values: keep them as-is
    return(factor(x, levels = levels))
  }
  
  breaks <- if (quantiles) {
    if (is.logical(quantiles)) quantiles <- 10L
    quantile(x, probs = seq(0, 1, length.out = quantiles + 1L))
  } else {
    hist(x, plot = FALSE, ...)$breaks
  }
  
  cut(x, breaks, include.lowest = TRUE, right = right, dig.lab = 3)
}

### indexing ----
# 
# Row indexing: x[i, ].  drop = TRUE returns a plain data.frame; otherwise
# the subset is rewrapped as a distribution of the same class as x.
setMethod('[', c('distribution', 'ANY', 'missing'),
          function(x, i, drop = FALSE) {

            df <- as.data.frame(x)[i , ,  drop = FALSE]
            
            if (drop) df else distribution(df, x)
          })
# 
# Column (dimension) indexing: x[, j] marginalizes the distribution down to
# the dimensions j by summing the value column within each level group.
setMethod('[', c('distribution', 'missing', 'atomic'),
          function(x, i, j, drop = FALSE) {
            

                        # conditional probabilities must be un-conditioned first
                        x <- unconditional(x)
                        varnames <- varnames(x)

                        # numeric j picks dimensions by position; character by name
                        j <- if (is.numeric(j)) varnames[j] else intersect(j, varnames)
                        
                        if (any(is.na(j)) || length(j) == 0L) .stop("Can't index this distribution with non-existent columns.")

                        type <- dist_type(x)
                        # sum n/p within each level combination of the kept dimensions
                        dt <- as.data.table(x)[ , setNames(list(sum(get(type))), type), by = j]
                        
                        if (drop) as.data.frame(dt) else distribution(dt, x)
                        

          })

# Combined indexing: x[i, j] subsets rows first, then marginalizes columns.
setMethod('[', c('distribution', 'ANY', 'ANY'),
          function(x, i, j, drop = FALSE) {
            
            x <- x[i , ]
            x[ , j, drop = drop]
          
            
          })

# Matrix indexing: each column of i supplies level values for one dimension;
# delegates to the `[[` method (cartesian matching by default).
setMethod('[', c('distribution', 'matrix'),
          function(x, i, j, cartesian = TRUE, drop = FALSE) {
            
            i <- as.data.frame(i)
            names(i) <- NULL
            
            do.call('[[', c(list(x, cartesian = cartesian, drop = drop), i))
            
          })

#' @export
# Select rows by level values, one argument per dimension.
# Numeric arguments pick the nth unique level of that dimension; other
# values are matched against the levels themselves.  With cartesian = TRUE
# the arguments are matched row-wise as coordinate tuples (via matches());
# otherwise each dimension is filtered independently.
setMethod('[[', 'distribution',
          function(x, i, j, ..., cartesian = FALSE, drop = FALSE) {
            
            args <- as.list(rlang::enexprs(...))
            
            # normalize i, j, and ... into one positional argument list,
            # preserving missing slots
            args <- c(list(if (missing(i)) rlang::missing_arg() else i ),
                      list(if (missing(j)) rlang::missing_arg() else j ),
                      args)
            
            missing <- sapply(args, rlang::is_missing)
            args[!missing] <- lapply(args[!missing], rlang::eval_tidy)
            
            # a lone i argument: drop the placeholder slot for j
            if (missing[2] && length(args) == 2L) {
              args <- args[1L]
              missing <- missing[1L]
            }
            
        
            levels <- getLevels(x)
            if (length(args) > length(levels)) .stop("This distribution only has {num2print(length(levels))} dimensions to index.",
                                    "You have provided {num2print(length(args))}.")
            levels <- levels[1:length(args)]
            
            # translate numeric selections into actual level values
            args[!missing] <- Map(\(arg, lev) {
              if (is.numeric(arg)) unique(lev)[arg] else arg
            } , args[!missing], levels[!missing])
            
            # NOTE(review): the non-cartesian branch combines dimensions with
            # '|' (union of matches); an intersection ('&') might be expected
            # here --- confirm the intended semantics.
            i <- if (cartesian) {
              matches(args[!missing], levels[!missing])
            } else {
              Reduce('|', Map(`%in%`, levels[!missing], args[!missing]))
            }
           
            
            x[i, , drop = drop]
            



          })

# Marginalize a probability distribution to the dimensions j by summing p,
# preserving the total observation count N.
setMethod('[', c('probability', 'missing', 'character'),
          function(x, i, j, ...) {

            N <- x@N
            # conditional probabilities must be un-conditioned before summing
            x <- unconditional(x)
            x <- as.data.table(x)

            x <- as.data.frame(x[ , list(p = sum(p)), by = j])

            distribution(x, 'p', Condition = NULL, N = N)

          })


#' @export
filter.distribution <- function(.data, ..., drop = FALSE) {
  # dplyr-style filter for distributions: keep rows for which ALL of the
  # `...` expressions (evaluated in the context of the distribution's
  # columns) are TRUE.  The previous version silently ignored every
  # expression after the first; per the filter() contract, multiple
  # conditions are combined with `&`.  With no conditions, all rows are kept.
  exprs <- rlang::enquos(...)
  df <- as.data.frame(.data)

  keep <- Reduce(`&`, lapply(exprs, rlang::eval_tidy, data = df), init = TRUE)

  .data[keep, , drop = drop]
}

### coercion ----

#' @export
# Coerce a distribution to a numeric matrix.
# With 2+ dimensions (and wide = TRUE, unsorted), the second dimension is
# spread across columns via dcast(); otherwise a one-column matrix is
# returned.  Rows are labeled by the pasted level combinations; dimnames
# names record which dimensions label rows and columns.
as.matrix.distribution <- function(x, wide = TRUE, ...) {
  varnames <- varnames(x)
  type <- dist_type(x)
  
  if (length(varnames) >= 2L && wide && x@Sort == -0L) {
    mat <- as.matrix(dcast(as.data.table(x), 
                    rlang::new_formula(quote(...), rlang::sym(varnames[2])), 
                    fill = 0, value.var = type))
    
    talldim <- colnames(mat) %in% varnames 
    
    # rows are labeled by the remaining ("tall") dimensions, pasted with '.'
    rownames(mat) <- do.call('paste', c(as.data.frame(mat[ , talldim, drop = FALSE]), list(sep = '.')))
    mat <- mat[ , !talldim, drop = FALSE]
    names(dimnames(mat)) <- c(paste(varnames[talldim], collapse = '.'), varnames[2])
    # rebuild as a numeric array (as.matrix() above may yield character)
    mat <- array(as.numeric(mat), dim = dim(mat), dimnames = dimnames(mat))
    
    
  } else {
    mat <- matrix(getValues(x), ncol = 1)
    rownames(mat) <- do.call('paste', c(getLevels(x), list(sep = '.')))
    names(dimnames(mat)) <- c(paste(varnames, collapse = '.'), '')
    colnames(mat) <- type
  }
  
  # (the original returned `mat` twice; the duplicated dead statement
  # has been removed)
  mat
}

#' @export
# Unwrap the S4 object's underlying data.frame, restoring column names.
setMethod('as.data.frame', 'distribution',
          function(x) {
  setNames(as.data.frame(x@.Data), colnames(x))
})

#' @export
# Coerce a distribution to a data.table (via its underlying data.frame).
# `...` is accepted and forwarded so the method signature is compatible
# with the as.data.table() generic.
as.data.table.distribution <- function(x, ...) {
  as.data.table(as.data.frame(x), ...)
}


### sorting ----

#' @rdname distributions
#' @export
# Sort rows by the value column, recording the direction in the Sort slot
# (-1L = decreasing, 1L = increasing) so print() can add a Rank column.
# Note: the default decreasing = TRUE differs from base sort()'s FALSE.
setMethod('sort', 'distribution',
          function(x, decreasing = TRUE) {
  X <- getValues(x)
  
  x <- x[order(X, decreasing = decreasing), ]
  
  x@Sort <- if (decreasing) -1L else 1L
  x
})


### aligning ----

# Align several distributions onto a common set of levels.
# All inputs must share the same dimension names.  Returns a list with
# $Levels (expand.grid over the merged level sets, in merged order) and
# $X (one numeric value vector per input, 0-filled for level combinations
# an input lacks), named by each input's type ('n'/'p').
alignDistributions <- function(..., funcname = '') {
  
  dists <- list(...)
  varnames <- lapply(dists, varnames)
  # Reduce() only survives (non-empty) if every pair of name sets is setequal
  if (length(Reduce(\(x, y) if (setequal(x, y)) x, varnames)) == 0L) .stop("If using {funcname} on distributions, they must all have the same dimension names.")
  varnames <- varnames[[1]]
  
  
  # merge each dimension's levels across all the distributions
  levels <- setNames(lapply(varnames, 
                            \(dn) do.call('mergeLevels', lapply(dists, \(dist) as.data.frame(dist)[[dn]]))), 
                     varnames)
  
  levels <- do.call('expand.grid', levels)

  
  aligned <- lapply(dists, 
                  \(dist) {
                    df <- merge(levels, as.data.frame(dist), all = TRUE)
                    
                    df <- df[matches(levels, df[,varnames, drop = FALSE]), , drop = FALSE] # put in right order
                    
                    type <- dist_type(dist)
                    # absent level combinations count as zero
                    ifelse(is.na(df[[type]]), 0, df[[type]])
                    
                  })
  
  names(aligned) <- sapply(dists, dist_type)
  
  list(Levels = levels, X = aligned)
}
  
# Merge several level vectors into one, ordered primarily by the earliest
# position each level occupies in any input, with ties broken by value.
mergeLevels <- function(...) {
  sets <- list(...)
  combined <- Reduce(union, sets)
  
  # earliest position of each merged level across all the inputs
  positions <- lapply(sets, function(set) match(combined, set))
  earliest <- do.call('pmin', c(positions, list(na.rm = TRUE)))
  
  combined[order(earliest, combined)]
}


#' @export
cbind.distribution <- function(...) {
  # cbind for distributions: all distribution arguments are aligned over the
  # union of their levels, then bound column-wise with any other arguments
  # into a plain matrix (rows labeled by the pasted level combinations).
  args <- list(...)
  dists <- sapply(args, inherits, what = 'distribution')
  
  aligned <- do.call('alignDistributions', args[dists])
  args[dists] <- aligned$X
  
  mat <- do.call('cbind', args)
  # unnamed distribution columns inherit their type ('n'/'p') as column name
  colnames(mat)[dists][colnames(mat)[dists] == ''] <- names(aligned$X)[colnames(mat)[dists] == '']

  rownames(mat) <- do.call('paste', c(aligned$Levels, list(sep = '.')))
  names(dimnames(mat)) <- c(paste(colnames(aligned$Levels), collapse = '.'), "")
  mat
}




### arithmetic ----

# Wrap a result vector as a one-column matrix labeled like a distribution:
# rows named by the pasted factor levels, the single column by `type`.
distmat <- function(factors, result, type = 'n') {
  # a distribution contributes its level columns
  if (inherits(factors, 'distribution')) factors <- getLevels(factors)
  
  out <- matrix(result, ncol = 1)
  dimnames(out) <- list(do.call('paste', c(factors, list(sep = '.'))),
                        type)
  names(dimnames(out)) <- c(paste(colnames(factors), collapse = '.'), '')
  out
}

#' @export
# Adding two count distributions: align their levels, sum the aligned
# count vectors, and rewrap as a new count distribution.
setMethod('+', c('count', 'count'),
          function(e1, e2) {
            aligned <- alignDistributions(e1, e2, funcname = '+')
            
            df <- aligned$Levels
            df$n <- callGeneric(aligned$X[[1]], aligned$X[[2]])
            
            distribution(df, 'n')
          })

#' @export
# Adding an integer to a count distribution adds it to every count
# (e.g. Laplace-style smoothing) and rewraps the result.
setMethod('+', c('count', 'integer'),
          function(e1, e2) {
            df <- getLevels(e1)
            df$n <- callGeneric(getValues(e1), e2)
            
            distribution(df, 'n')
          })

#' @export
# Generic operators between two distributions: align their levels, then
# apply the operator to the aligned value vectors.  Returns a plain vector
# (not a distribution).
setMethod('Ops', c('distribution', 'distribution'),
          function(e1, e2) {
            aligned <- alignDistributions(e1, e2, funcname = .Generic)

            
            callGeneric(aligned$X[[1]], aligned$X[[2]])
            # distmat(aligned$Levels, result, dist_type(e1))
            
          })


#' @export
# Generic operators between a distribution and a number: apply the operator
# to the value column.  Returns a plain vector (not a distribution).
setMethod('Ops', c('distribution', 'numeric'),
          function(e1, e2) {
            
            callGeneric(getValues(e1), e2)
            # distmat(e1, result)

            
          })

#' @export
# Math group generics (log, sqrt, abs, ...) apply to the value column;
# the result is a one-column matrix labeled by the factor levels (distmat).
setMethod('Math', 'distribution',
          \(x) {
            distmat(x, callGeneric(getValues(x)))
          })

#' @export
# Summary group generics (sum, max, min, range, any, all) operate on the
# distribution's value column; the result is named by the joined varnames.
setMethod('Summary', 'distribution',
          \(x, ..., na.rm = FALSE) {
            # na.rm must be passed BY NAME: passed positionally (as before)
            # it lands in `...` and is treated as data --- e.g. min() would
            # coerce FALSE to 0 and could silently change the result.
            setNames(callGeneric(getValues(x), ..., na.rm = na.rm), paste(varnames(x), collapse = '.'))
          })


#' @export
# Mean of the value column, named by the joined dimension names.
setMethod('mean', 'distribution',
          \(x, ..., na.rm = FALSE) {
            setNames(mean(getValues(x), ..., na.rm = na.rm), paste(varnames(x), collapse = '.'))
          })

#' @export
# Product of two probability distributions.
# If they share any dimensions, fall through to the generic Ops machinery
# (elementwise multiplication over aligned levels).  Otherwise compute the
# outer product P(X)P(Y): the joint distribution under independence.
# (Also fixes the "dimenion" typo in the error message.)
setMethod('*', c('probability', 'probability'),
          \(e1, e2) {
            
            varnames1 <- varnames(e1)
            varnames2 <- varnames(e2)
            
            if (length(intersect(varnames1, varnames2))) return(callNextMethod(e1, e2))
            
            # only single-dimension distributions are supported so far
            if (length(varnames1) > 1L || length(varnames2) > 1L) .stop("Can't cross product probability distributions with more than one dimension yet.")
            
            p1 <- setNames(e1$p, getLevels(e1)[[1]])
            p2 <- setNames(e2$p, getLevels(e2)[[1]])
            
            jointp <- outer(p1, p2, '*')
            
            df <- as.data.frame(as.table(jointp))
            colnames(df) <- c(varnames1, varnames2, 'p')
            
            distribution(df, 'p', Condition = NULL, N = sum(e1@N, e2@N))
            
            
          })












## User Functions ----


### count() ----


#' Tabulate and/or cross-tabulate data
#' 
#' The `count()` function is exactly like R's fundamental [table()][base::table] function,
#' except that 1) will give special treatment to humdrumR [token()] data 2)
#' has more intuitive/simple argument names 3) makes it easier to combine/manipulate
#' disparate output tables.
#' 
#' @details
#' 
#' The `count()` function is essentially a wrapper
#' around [base::table()][base::table] function.
#' However, any [token()] class arguments are treated like [factors()],
#' generating their own levels.
#' This assures that, for example, pitch data is tabulated in order of pitch height,
#' and "missing" pitches are counted as zero.
#' 
#' `count()` will, by default, count `NA` values if they are present---if you don't want
#' to count `NA`s, specify `na.rm = TRUE`.
#' You can also tell `count()` to exclude (not count) any other arbitrary values you
#' provide as a vector to the `exclude` argument.
#' 
#' 
#' `count()` will always give names to the dimensions of the table it creates.
#' You can specify these names directly as argument names, like `count(Kern = kern(Token))`;
#' if you don't specify a name, `count()` will make up a name(s) based on expression(s) it is tallying.
#' (Note that `count()` does not copy [base::table()]'s obtusely-named `dnn` or `deparse.level` arguments.)

#' @section Manipulating humdrum tables:
#' 
#' The output of `count()` is a special form of R `table`, a `humdrumR.table`.
#' Given two or more `humdrumR.table`s, if you apply basic R operators 
#' (e.g., arithmetic, comparisons) or row/column binding (`cbind`/`rbind`) 
#' `humdrumR` will align the tables by their dimension-names before
#' doing the operation.
#' This means that if you have two tables of pitch data, but one table includes a specific pitch that the other doesn't,
#' you can still add them together or bind them into a matrix.
#'  See the examples!
#'
#' @examples 
#' 
#' generic <- c('c', 'c', 'e', 'g', 'a', 'b', 'b', 'b')
#' complex <- c('c', 'c#', 'e', 'f', 'g','g#', 'g#', 'a')
#' 
#' genericTable   <- count(generic)
#' complexTable <- count(complex)
#' 
#' genericTable
#' complexTable
#' 
#' genericTable + complexTable
#' 
#' cbind(genericTable, complexTable)
#' 
#' @name distributions
#' @export 
# Tabulate (cross-tabulate) one or more vectors into a count distribution.
# Arguments are named from their expressions when unnamed; token arguments
# are factorized and numeric arguments binned (prettyBins) before counting.
# `sort` may be TRUE/FALSE or a signed whole number giving sort direction.
count.default <- function(..., sort = FALSE, na.rm = FALSE,
                          .drop = FALSE, binArgs = list()) {
  checks(sort, xTF | (xwholenum & xlen1))
  checks(na.rm, xTF)
  checks(.drop, xTF)
  
  args <- list(...)
  
  # get the appropriate (dim)names for each argument
  exprs <- as.list(substitute(list(...)))[-1L]
  
  varnames <- .names(args)
  # unnamed arguments are named by deparsing their expressions
  if (any(varnames == '')) varnames[varnames == ''] <- vapply(exprs[varnames == ''], deparse, nlines = 1L, '')
  
  if (length(unique(lengths(args))) > 1L) .stop("Can't cross-tabulate these vectors ({harvard(varnames, 'and')}), because they are different lengths.")
  
  # factorize arguments as needed
  args <- lapply(args,
                 \(arg) {
                   if (inherits(arg, 'token')) arg <-  factorize(arg) 
                   # numeric vectors are binned before tabulation
                   if (is.numeric(arg)) do.call('prettyBins', c(list(arg), binArgs)) else arg
                 })
  
  
  argdf <- as.data.frame(args)
  colnames(argdf) <- varnames
  
  # delegate the actual tabulation to the data.frame count() method
  result <- rlang::eval_tidy(rlang::expr(count(argdf, !!!(rlang::syms(varnames)), name = 'n', .drop = !!.drop)))
  
  # drop rows where any tabulated variable is NA
  if (na.rm) result <- result[Reduce('&', lapply(result[ , varnames, drop = FALSE], \(col) !is.na(col))), , drop = FALSE]
  
  dist <- distribution(result, 'n')
  
  if (sort) sort(dist, decreasing = sort > 0L) else dist
  
  
  
}

#' @rdname distributions
#' @export
# Tabulate fields of a humdrumR object.
# With `...` expressions, each is evaluated within the humdrum data (via
# with()); with none, the selected fields (plus any grouping fields) are
# tabulated directly.
count.humdrumR <- function(x, ..., sort = FALSE, na.rm = FALSE, .drop = FALSE, binArgs = list()) {
  quos <- rlang::enquos(...)
  counts <- if (length(quos)) {
    # name each expression by its deparsed label
    names(quos) <- sapply(quos, rlang::as_label)
    quo <- rlang::quo(with(x, count.default(!!!quos, sort = !!sort, na.rm = !!na.rm, .drop = !!.drop, binArgs = binArgs)))
    rlang::eval_tidy(quo)
    
  } else {
    fields <- pullFields(x, union(selectedFields(x), getGroupingFields(x)), null = 'asis')
    do.call('count.default', c(as.list(fields), list(sort = sort, na.rm = na.rm, .drop = .drop, binArgs = binArgs)))
  }
  
  counts
}



#' @rdname distributions
#' @export
# Convert a base-R table to a distribution.
# Tables containing values strictly between 0 and 1 are assumed to hold
# proportions and become a 'probability' distribution; otherwise a count.
count.table <- function(..., sort = FALSE,
                        na.rm = FALSE,
                        .drop = FALSE) {
  tab <- list(...)[[1]]
  type <- if (any(tab < 1 & tab > 0)) 'p' else 'n'
  df <- as.data.frame(tab, responseName = type)
  
  dist <- if (type == 'p') {
    distribution(df, type, N = sum(tab), Condition = NULL)
  } else {
    distribution(df, type)
  }
  
  # use the S4 sort() method --- the previously called sort.distribution()
  # does not exist as an S3 method and would fail at runtime
  if (sort) dist <- sort(dist, decreasing = sort > 0L)
  
  dist
  
}




### pdist() -----



#' Tabulate and cross proportions
#' 
#' 
#' @export
# Generic: estimate or convert to a probability distribution.
# Shared arguments are validated once here, before method dispatch.
pdist <- function(x, ..., condition = NULL, na.rm = FALSE, sort = FALSE, .drop = FALSE, binArgs = list()) {
  checks(sort, xTF | (xwholenum & xlen1))
  checks(na.rm, xTF)
  checks(.drop, xTF)
  
  UseMethod('pdist')
}

#' @rdname distributions
#' @export
# Convert a count distribution to probabilities: p = n / sum(n).
# Any conditioning term supplied via `...` (e.g. X | Y) or `condition`
# is applied with conditional().
pdist.count <-  function(x, ..., condition = NULL, na.rm = FALSE, sort = FALSE, .drop = FALSE, binArgs = list()) {
  
  # drop rows with NA in any level column
  if (na.rm) x <- x[!Reduce('|', lapply(getLevels(x), is.na)), ]
  if (sort) x <- sort(x, sort > 0L)
  
  x <- as.data.frame(x)
  
  # `...` expressions may carry a conditioning term (parsed by pexprs)
  exprs <- rlang::enexprs(...)
  if (length(exprs)) condition <- pexprs(exprs, colnames(x), condition)$Condition %||% condition
  
  n <- sum(x$n, na.rm = TRUE)
  
  x$p <- x$n / n
  x$n <- NULL
  
  dist <- distribution(x, 'p', N = n)
  
  if (!is.null(condition)) dist <- conditional(dist, condition = condition)
  
  if (sort) dist <- sort(dist, decreasing = sort > 0L)
  
  dist
  
}

#' @rdname distributions
#' @export
# Already a probability distribution: only (re)conditioning applies.
pdist.probability <-  function(x, ..., condition = NULL, na.rm = FALSE, sort = FALSE, .drop = FALSE, binArgs = list()) {
  # `...` expressions may carry a conditioning term (parsed by pexprs)
  exprs <- rlang::enexprs(...)
  if (length(exprs)) condition <- pexprs(exprs, colnames(x), condition)$Condition %||% condition
  
  if (!is.null(condition)) conditional(x, condition) else x
  
}


#' @rdname distributions
#' @export
# Estimate probabilities from raw vectors: tabulate with count.default(),
# then convert the counts to proportions via pdist.count().
pdist.default <-  function(..., condition = NULL, na.rm = FALSE, sort = FALSE, .drop = FALSE, binArgs = list()) {
  exprs <- as.list(substitute(list(...)))[-1]
  args <- list(...)
  
  # get the appropriate (dim)names for each argument
  
  varnames <- .names(args)
  # unnamed arguments are named by deparsing their expressions
  if (any(varnames == '')) varnames[varnames == ''] <- vapply(exprs[varnames == ''], deparse, nlines = 1L, '')
  names(args) <- varnames
  
  # if condition is given as separate vector
  conditionName <- deparse(substitute(condition), nlines = 1L)
  if (!is.null(condition) && is.atomic(condition) && length(condition) == length(args[[1]])) {
    # tabulate the condition vector alongside the data, then condition on it
    args[[conditionName]] <- condition
    condition <- conditionName
  }
  
  
  count <- do.call('count.default', c(args, list(na.rm = na.rm, sort = sort, .drop = .drop, binArgs = binArgs)))
  
  pdist(count, condition = condition)
  
}

#' @rdname distributions
#' @export
# Estimate probabilities from data.frame columns.
# With no `...` expressions, every column is tabulated; otherwise the
# expressions (parsed by pexprs, which resolves positions/names and splits
# off any `| condition` term) are evaluated within the data.frame.
pdist.data.frame <-  function(x, ..., condition = NULL, na.rm = FALSE, sort = FALSE, .drop = FALSE, binArgs = list()) {
            exprs <- rlang::enexprs(...)
            
            if (length(exprs) == 0L) {
              args <- as.list(x)
            } else {
              parsed <- pexprs(exprs, colnames(x), condition)
              condition <- parsed$Condition 
              
              args <- rlang::eval_tidy(rlang::quo(list(!!!(parsed$Exprs))), data = x)
              names(args) <- names(parsed$Exprs)
            }
            
            
            do.call('pdist', c(args, list(condition = condition, na.rm = na.rm, sort = sort, .drop = .drop, binArgs = binArgs)))
            
          }






#' @rdname distributions
#' @export 
# Estimate probabilities from a humdrumR object's data records.
# With `...` expressions they are evaluated against the humdrum table;
# otherwise the currently selected fields are used.
pdist.humdrumR <- function(x, ..., condition = NULL, na.rm = FALSE, sort = FALSE, .drop = FALSE, binArgs = list()) {
            
            exprs <- rlang::enexprs(...)
            
            # 'D' = data records only
            humtab <- getHumtab(x, 'D')
            if (length(exprs)) {
              
              rlang::eval_tidy(rlang::expr(pdist(humtab, !!!exprs, condition = !!condition, na.rm = !!na.rm, sort = !!sort, .drop = !!.drop, binArgs = !!binArgs)))
            } else {
              selectedFields <- selectedFields(x)
              names(selectedFields) <- selectedFields
              
              do.call('pdist', 
                      c(list(humtab), 
                        as.list(selectedFields),
                        list(condition = condition, na.rm = na.rm, sort = sort, .drop = .drop, binArgs = binArgs)))
              
            }
}




#' @rdname distributions
#' @export
# Convert a base-R (contingency) table to a probability distribution,
# optionally conditioned on one or more margins.
# NOTE(review): the final line returns distribution(N, 'p'), where N holds
# the margin *sums* --- not the proportions computed in ptab.  The
# commented-out line below suggests ptab (with N and the condition margin)
# was the intended result; confirm and fix upstream.
pdist.table <- function(x, ..., condition = NULL, na.rm = FALSE, sort = FALSE, binArgs = list()) {
            
            # a 1-dimensional table cannot be conditioned
            if (length(dim(x)) == 1L) condition <- NULL
            
            # drop NA levels from every dimension
            if (na.rm) {
              notna <- unname(lapply(varnames(x), \(dim) !is.na(dim)))
              x <- do.call('[', c(list(x), notna))
            }
            
            # resolve condition names to dimension indices (partial matching)
            if (is.character(condition)) {
              condition <- pmatch(condition, varnames(x), duplicates.ok = FALSE)
              condition <- condition[!is.na(condition)]
              if (length(condition) == 0L) condition <- NULL
            }
            
            ptab <- proportions(x, margin = condition) 
            
            
            N <- marginSums(x, margin = condition)
            
            distribution(N, 'p')
            # new('probability.frame', ptab, N = as.integer(n), margin =  as.integer(condition))
          }



# Convert a joint probability distribution to a conditional one:
# P(... | condition).  Each probability is divided by the marginal
# probability of its condition group; the per-group Ns are stored (named)
# in @N and the conditioning dimensions in @Condition.
# Already-conditional distributions are returned as-is (same condition) or
# first converted back to the joint distribution (different condition).
conditional <- function(pdist, condition) {
  varnames <- varnames(pdist)
  # fix: report the *conditions* missing from varnames (the original
  # reported setdiff(varnames, condition), listing the wrong names)
  if (any(!condition %in% varnames)) .stop("We can only calculate a conditional probability across an existing dimension/factor.",
                                           "The <conditions|condition> {harvard(setdiff(condition, varnames), 'and')} are not dimensions of the given",
                                           "distribution ({harvard(varnames, 'and')}).")
  
  if (!is.null(pdist@Condition)) {
    if (setequal(condition, pdist@Condition)) return(pdist)
    pdist <- unconditional(pdist)
  }
  
  
  # paste the condition columns into a single grouping vector
  conditionvec <- do.call('paste', c(as.list(pdist)[condition], list(sep = '.')))
  
  margin <- c(tapply(pdist$p, conditionvec, sum))
  margin <- margin[unique(conditionvec)] # to match original order
  
  # divide by the marginal probability (0-probability groups stay 0)
  pdist$p <- ifelse(margin[conditionvec] == 0, 0, pdist$p / margin[conditionvec])
  
  pdist@Condition <- condition
  
  # per-group Ns, named by group, so unconditional() can invert this
  pdist@N <- setNames(as.integer(pdist@N * margin), names(margin))
  pdist
  
}
            
# Invert conditional(): convert P(X | C) back to the joint P(X, C).
# Each conditional probability is multiplied by its condition's marginal
# probability (recovered from the named per-group Ns in @N); @N collapses
# back to the grand total and @Condition is cleared.
# Non-probability or already-unconditional inputs pass through unchanged.
unconditional <- function(dist) {
  if (!inherits(dist, 'probability') || is.null(dist@Condition)) return(dist)
  
  
  N <- sum(dist@N)
  margins <- dist@N / N
  
  dist$p <- dist$p * margins[paste(as.data.frame(dist)[[dist@Condition]])] # paste to deal with NA levels
  
  
  dist@Condition <- NULL
  dist@N <- N
  dist
 
  
}


# Normalize the `...` expressions passed to pdist()/count() methods.
# Numeric expressions become column symbols (by position); strings become
# symbols (by name); `x | y` and `x ~ y` calls are split --- the left side
# replaces the expression, the right side is appended to the list and (if
# it is a symbol) recorded as the conditioning variable.
# NOTE(review): expressions appended inside the loop are never revisited
# (seq_along() is evaluated once), so an appended condition expression is
# not itself position/name-resolved --- confirm this is intended.
pexprs <- function(exprs, colnames, condition) {
  
  for (i in seq_along(exprs)) {
    expr <- exprs[[i]]
    
    exprs[[i]] <-  switch(class(expr),
                          integer = ,
                          numeric = rlang::sym(colnames[expr]),
                          character = rlang::sym(expr),
                          formula = ,
                          call = {
                            if (as.character(expr[[1]]) %in% c('|', '~')) {
                              
                              if (rlang::is_symbol(expr[[3]])) condition <- as.character(expr[[3]]) 
                              
                              exprs[[length(exprs) + 1L]] <- expr[[3]]
                              expr[[2]] 
                              
                            } else {
                              expr
                            }
                          },
                          expr)
    
  }
  
  # name each expression by its deparsed label; drop duplicates
  names(exprs) <- sapply(exprs, rlang::as_label)
  exprs <- exprs[!duplicated(names(exprs))]
  
  list(Exprs = exprs, Condition = condition)
}






# Information Theory ----

## Likelihoods ----

### like() ----

#' @export
like <- function(..., distribution) UseMethod('like')

#' @export
ic <- function(..., distribution, base = 2) -log(like(..., distribution = distribution), base = base)

#' @export
like.default <- function(..., distribution) {
  # bundle the observed vectors into a data.frame and delegate
  like.data.frame(data.frame(...), distribution = distribution)
}

#' @export
like.data.frame <- function(df, ..., distribution) {
  # Likelihood of each row of df under `distribution`.
  # If no distribution is supplied, one is estimated from df itself.
  
  if (missing(distribution)) distribution <- do.call('pdist', list(df, ...))
  
  colnames <- colnames(df)
  varnames <- varnames(distribution)
  
  if (!setequal(colnames, varnames)) .stop("To calculate likelihoods, the expected distribution must have the same variables as the observed variables.")

  # matrix-index the distribution: one p per observed row
  distribution[as.matrix(df), , drop = TRUE]$p
  
}


#' @export
pMI <- function(..., distribution, base = 2) {
  # Pointwise mutual information of each joint observation:
  # pMI = ic(independent) - ic(observed), i.e. how much more probable the
  # joint observation is than it would be were the variables independent.
  df <- data.frame(...)
  
  if (missing(distribution)) distribution <- do.call('pdist', df)
  
  # product of the marginal distributions (via the '*' cross-product method)
  independent <- Reduce('*', lapply(varnames(distribution), \(j) distribution[ , j]))
  
  ic_observed <- ic(df, distribution = distribution, base = base)
  ic_independent <- ic(df, distribution = independent, base = base)
  
  ic_independent - ic_observed
  
}


## Distributional ----


### entropy() ----

#' Calculate Entropy or Information Content of variables 
#'
#' Information content and entropy are fundamental concepts in [information theory](https://en.wikipedia.org/wiki/Information_theory),
#' which quantify the amount of information (or "surprise") in a random variable.
#' Both concepts are closely related the probability density/mass of events: improbable events have higher information content.
#' The probability of *each* observation maps to the [information content](https://en.wikipedia.org/wiki/Information_content);
#' The average information content of a variable is the [entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)).
#' Information content/entropy can be calculated for discrete probabilities or continuous probabilities,
#' and humdrumR defines methods for calculating both.
#' 
#' @details 
#' 
#' To calculate information content or entropy, we must assume (or estimate) a probability distribution.
#' HumdrumR uses R's standard [table()] and [density()] functions to estimate discrte and continuous probability
#' distributions respectively.
#' 
#' Entropy is the average information content of a variable.
#' The `entropy()` function can accept either a [table()] object (for discrete variables), 
#' or a [density()] object (for continuous variables).
#' If `entropy()` is passed an [atomic][base::vector()] vector,
#' the values of the vector are treated as observations or a random variable:
#' for `numeric` vectors, the [stats::density()] function is used to estimate the probability distribution
#' of the random (continuous) variable, then entropy is computed for the density.
#' For other atomic vectors, [table()] is called to tabulate the discrete probability mass for each
#' observed level, and entropy is then computed for the table.
#'
#' The `ic()` function only accepts atomic vectors as its main (`x`) argument, but must also
#' be provided a `distribution` argument.
#' By default, the `distribution` argument is estimated using [density()] (`numeric` input) or [table()] (other input).
#' 
#' 
#' @family {Information theory functions} 
#' @export
entropy <- function(..., base = 2) {
  # Validate `base` before dispatch (`checks` presumably requires a single
  # positive number — project helper; confirm).
  checks(base, xnumber & xpositive)
  
  UseMethod('entropy')
}



#' @rdname entropy
#' @export
H <- entropy # H() is the conventional information-theory shorthand for entropy()
  
#' @rdname entropy
#' @export
entropy.probability <-  function(q, p, condition = NULL, base = 2) {
            # Entropy of a 'probability' distribution q.
            # If a second probability distribution p is given, computes the
            # cross entropy of q and p instead.
            # If `condition` names variables, q is first conditioned on them,
            # yielding a conditional entropy.
            if (!is.null(condition)) q <- conditional(q, condition)
  
            if (missing(p) || !inherits(p, 'probability')) {
              # Weight each log-probability by the unconditional (joint)
              # probability; for an unconditioned q these are the same values.
              expected <- unconditional(q)$p
              observed <- q$p
             } else {
               # cross entropy of q and p!
                aligned <- alignDistributions(q, p, funcname = 'entropy')
                observed <- aligned$X[[1]]
                expected <- aligned$X[[2]]
               
             }
  
            # Take logs; zero probabilities contribute nothing (0 * log 0 := 0).
            observed <- ifelse(observed > 0L, log(observed, base = base), 0) 
            
            
            # Label the result like "H(x)" using the distribution's variables.
            equation <- Pequation(q, 'H')
            setNames(-sum(expected * observed), equation)
          }


#' @rdname entropy
#' @export
entropy.numeric <- function(x, base = 2, na.rm = TRUE) {
  # Estimate the continuous distribution of x using stats::density(), then
  # compute the (differential) entropy of that density estimate.
  # Fix: the original body referred to `...`, which is not an argument of
  # this method and would error at runtime ("'...' used in an incorrect
  # context"); na.rm is now forwarded to density() so NA observations are
  # handled as requested.
  entropy(density(x, na.rm = na.rm), base = base, na.rm = na.rm)
}


#' @rdname entropy
#' @export
entropy.density <- function(x, base = 2, na.rm = TRUE) {
            # Differential entropy of a stats::density() estimate:
            # -integral p(x) log p(x) dx, approximated on the density grid.
            label <- rlang::expr_name(rlang::enexpr(x))
            # NOTE(review): x is a density object (a list); is.na() over a list
            # tests whole elements, so this guard likely never detects NA grid
            # values — confirm intent.
            if (any(is.na(x))) return(NA_real_)
            
            # Grid spacing: density() evaluates on an equally spaced grid.
            dx <- diff(x$x[1:2])
            
            equation <- paste0('H(', label, ')')
            
            # Riemann-sum approximation of the entropy integral.
            setNames(-sum(log(x$y, base = base) * x$y * dx), equation)
            
            
          }


#' @rdname entropy
#' @export
entropy.default <- function(..., base = 2) {
  # Fall-back method: estimate a probability distribution from the inputs
  # with pdist(), then compute the entropy of that distribution.
  distribution <- pdist(...)
  entropy(distribution, base = base)
}


# Compute a matrix of entropies for the given variables: diagonal cells hold
# each variable's own entropy; off-diagonal cells hold pairwise joint
# entropies (or conditional entropies when conditional = TRUE).
entropies <- function(..., base = 2, conditional = FALSE) {
  args <- list(...)
  
  # Label each argument by its given name, falling back on its deparsed expression.
  names <- .names(args)
  names <- ifelse(names == '', sapply(substitute(list(...))[-1], deparse, nlines = 1L), names)
  names(args) <- names
  
  mat <- matrix(NA_real_, nrow = length(args), ncol = length(args))
  
  # All (i, j) cell coordinates; without conditioning the matrix is symmetric,
  # so only the upper triangle (j >= i) needs computing.
  indices <- expand.grid(i = seq_along(args), j = seq_along(args))
  if (!conditional) indices <- subset(indices, j >= i)
  
  # seq_len() (not 1:nrow()) is safe if indices has zero rows (no arguments).
  for (k in seq_len(nrow(indices))) {
    i <- indices$i[k]
    j <- indices$j[k]
    
    mat[i, j] <- if (i == j) {
      entropy(args[[i]], base = base)
    } else {
      # Pairwise entropy; condition on the j-th variable when requested.
      do.call('entropy', c(args[c(i, j)], list(condition = if (conditional) names[j])))
    }
  }
  
  colnames(mat) <- rownames(mat) <- names
  mat
}


### mutualInfo() ----

#' Calculate Entropy or Information Content of variables 
#'
#' 
#' @family {Information theory functions} 
#' @rdname entropy
#' @export
mutualInfo <- function(..., base = 2) {
  # Validate `base` before dispatch (`checks` presumably requires a single
  # positive number — project helper; confirm).
  checks(base, xnumber & xpositive)
  
  UseMethod('mutualInfo')
}



#' @rdname entropy
#' @export
mutualInfo.probability <-  function(x, base = 2) {
  # Mutual information I(X;Y;...) = sum p(x,y) * log( p(x,y) / (p(x)p(y)) ),
  # computed over the joint distribution x.
  varnames <- varnames(x)
  if (length(varnames) < 2L) .stop("Can't calculate mutual information of a single variable.")
  
  # Work with the joint (unconditional) distribution.
  x <- unconditional(x)
  
  # Joint probabilities, named by the combination of levels (e.g. "a.b").
  observed <- setNames(x$p, do.call('paste', c(getLevels(x), list(sep = '.'))))
  
  # Product of the marginals: the joint distribution under independence.
  independent <- Reduce('*', lapply(varnames, \(j) x[ , j]))
  # expected <- (x[ , 1] * x[ , 2])
  independent <- setNames(independent$p, do.call('paste', c(getLevels(independent), list(sep = '.'))))
  
  # Align the independence probabilities to the observed level combinations.
  independent <- independent[names(observed)]
  
  ratio <- observed / independent
  # Degenerate ratios (0 or Inf) contribute nothing to the sum.
  logratio <- ifelse(ratio == 0 | ratio == Inf, 0, log(ratio, base = base))
  
  # Label the result like "I(x;y)".
  equation <- Pequation(x, 'I', ';')
  
  setNames(sum(observed * logratio, na.rm = TRUE), equation)
  
}




#' @rdname entropy
#' @export
mutualInfo.default <- function(..., base = 2) {
  # Fall-back method: estimate a joint probability distribution from the
  # inputs with pdist(), then compute its mutual information.
  distribution <- pdist(...)
  mutualInfo(distribution, base = base)
}






##################################################-
# table() extensions for humdrumR ---- ###########
##################################################-



# Tag a table with the 'humdrumR.table' class so the package's cbind/rbind
# methods dispatch on it; 'table' is kept (or added) and existing classes
# are preserved, without duplication.
humdrumR.table <- function(tab) {
  newClasses <- c('humdrumR.table', 'table', class(tab))
  class(tab) <- unique(newClasses)
  tab
}


#' @export
cbind.humdrumR.table <- function(...) {
  # cbind() for humdrumR tables: aligns the tables' rows by name (via
  # alignTables) so tables with different, or differently ordered, levels
  # line up; non-table arguments are bound as-is.
  args <- list(...)
  classes <- unique(lapply(args, class))
  
  table.i <- which(sapply(args, is.table))
  tables <- args[table.i]
  # Promote 1-d tables to single-column matrices.
  tables <- lapply(tables, \(tab) if (length(dim(tab)) == 1) t(t(tab)) else tab)
  # Remember column names; they are stripped before binding and restored below.
  colnames <- unlist(lapply(tables, \(tab) if (is.null(colnames(tab))) rep('', ncol(tab)) else colnames(tab)))
  tables <- lapply(tables, `colnames<-`, value = NULL)
  # Align rows (margin = 1) across tables; a single table just needs unclassing.
  tables <- if (length(tables) > 1L) alignTables(tables, 'cbind', margin = 1) else lapply(tables, unclass)
  
  args[table.i] <- tables
  newtab <- do.call('cbind', args)
  
  # Argument names (cbind(Name = tab)) take precedence over original column names.
  argnames <- rep(.names(args), sapply(args, ncol))
  colnames(newtab) <- ifelse(argnames == '', colnames, argnames)
  
  # Keep the common class if all inputs agreed; otherwise fall back.
  class(newtab) <- if (length(classes) == 1L) classes[[1]] else c('humdrumR.table', 'table')
  newtab
}

#' @export
rbind.humdrumR.table <- function(...) {
  # rbind() for humdrumR tables: aligns the tables' columns by name (via
  # alignTables) so tables with different, or differently ordered, levels
  # line up; non-table arguments are bound as-is.
  args <- list(...)
  table.i <- which(sapply(args, is.table))
  tables <- args[table.i]
  
  # Promote 1-d tables to single-row matrices.
  tables <- lapply(tables, \(tab) if (length(dim(tab)) == 1) t(tab) else tab)
  # Remember row names; they are stripped before binding and restored below.
  rownames <- unlist(lapply(tables, \(tab) if (is.null(rownames(tab))) rep('', nrow(tab)) else rownames(tab)))
  tables <- lapply(tables, `rownames<-`, value = NULL)
  # Align columns (margin = 2) across tables, padding missing levels with 0.
  tables <- alignTables(tables, 'rbind', margin = 2) 
  
  args[table.i] <- tables
  newtab <- do.call('rbind', args)
  
  # Argument names (rbind(Name = tab)) take precedence over original row names.
  argnames <- rep(.names(args), sapply(args, \(arg) if (length(dim(arg)) == 1L) 1 else nrow(arg)))
  rownames(newtab) <- ifelse(argnames == '', rownames, argnames)
  
  class(newtab) <- 'table'
  newtab
  
  
}



#' @export
as.table.distribution <- function(x) {
  # Convert a distribution data.frame to an n-dimensional table: one
  # dimension per variable, cells holding the distribution's value column.
  levels <- lapply(getLevels(x), unique)
  varnames <- varnames(x)
  type <- dist_type(x) # name of the value column — presumably 'n' or 'p'; confirm
  
  x <- as.data.frame(x)
  x[varnames] <- lapply(x[varnames], paste) # forces NA to be strings
  
  # Build an empty array and fill it by character-matrix indexing, so each
  # row lands in the cell matching its combination of levels.
  tab <- array(dim = lengths(levels), dimnames = lapply(levels, paste))
  tab[do.call('cbind', x[varnames])] <- x[[type]]
  dimnames(tab) <- levels
  
  humdrumR.table(tab)
  
}

# Align a list of same-rank tables so that, along `margin` (all margins when
# NULL), they share the same union of dimnames; cells a table lacks are
# filled with 0L. Synthetic placeholder names ('1___', '2___', ...) stand in
# for NULL dimnames and '_<NA>_' stands in for NA names so they can be
# matched, then both are undone before returning.
alignTables <- function(tables, funcname = '', margin = NULL) {
  tables <- lapply(tables, 
                   \(tab) {
                     dn <- dimnames(tab)
                     null <- sapply(dn, is.null)
                     dn[null] <- lapply(dim(tab)[null], \(n) paste0(seq_len(n), '___'))
                     dimnames(tab) <- lapply(dn, \(names) ifelse(is.na(names), '_<NA>_', names) ) 
                     tab})
  olddimnames <- lapply(tables, dimnames)
  dimensions <- lapply(olddimnames, .names)
  # Combine the dimension labels of all tables (joined when they differ).
  dimensions <- Reduce(\(a, b) ifelse(a == b, a, paste(a, b, sep = if (nchar(funcname) == 1) funcname else '/')), dimensions)
  if (length(unique(lengths(olddimnames))) > 1L) .stop("If using {funcname} on tables, they must all have the same number of dimensions.")
  
  dimnames <- as.list(as.data.frame(do.call('rbind', olddimnames)))
  if (is.null(margin)) margin <- seq_along(dimnames)
  
  # dimnames[ margin] <- lapply(dimnames[ margin], \(dim) rep(list(stringr::str_sort(Reduce('union', dim), numeric = TRUE)), length(dim)))
  dimnames[ margin] <- lapply(dimnames[margin], 
                              \(dim) {
                                uniondim <- Reduce('union', dim)
                                
                                # get order
                                # Order names by their earliest position across
                                # the input tables, breaking ties alphabetically.
                                uniondim <- uniondim[order(do.call('pmin', c(list(na.rm = TRUE), lapply(dim, \(d) match(uniondim, d)))),
                                                           uniondim)]
                                
                                rep(list(uniondim), length(dim))
                                
                              })
  
  
  dimnames <- as.data.frame(do.call('rbind', dimnames))
  rownames(dimnames) <- dimensions
  
  # One zero-filled template array per table, with the aligned dimnames.
  empties <- lapply(dimnames, \(dn) array(data = 0L, dim = lengths(dn), dimnames = dn))
  
  Map(tables, empties,
      f = \(tab, empty) {
        
        # indices <- as.matrix(do.call('expand.grid', c(dimnames(tab), list(stringsAsFactors = FALSE))))
        # Copy each table's cells into its template by name along `margin`
        # (missing args select everything on the other margins).
        indices <- rep(list(rlang::missing_arg()), length(dim(tab)))
        indices[margin] <- dimnames(tab)[margin]
        empty[] <- do.call('[<-', c(list(empty), indices, list(value = tab)))
        # empty[indices] <- tab[indices]
        
        # Undo the placeholders: dimensions that were synthetic ('1___', ...)
        # revert to NULL dimnames; '_<NA>_' becomes NA again.
        dimnames(empty) <- lapply(dimnames(empty), 
                                  \(names) {
                                    if (!any(grepl('[0-9]___', names))) ifelse(names == '_<NA>_', NA_character_, names)
                                  })
        empty
      }) 
  
}


### table() -----

# Derive display names for a list of unevaluated argument expressions,
# mimicking base::table()'s deparse.level convention:
#   0 -> always "";  1 -> the name for bare symbols, "" otherwise;
#   2 -> the deparsed (single-line) expression.
deparse.names <- function(exprs, deparse.level = 1L) {
  nameOne <- function(expr) {
    switch(deparse.level + 1,
           '',
           if (is.symbol(expr)) as.character(expr) else '',
           deparse(expr, nlines = 1)[1L])
  }
  vapply(exprs, nameOne, character(1))
}

#' @rdname distributions
#' @export
setGeneric('table', signature = 'x',
           def = function(x, ..., exclude = if (useNA == 'no') c(NA, NaN), useNA = 'no', dnn = NULL, deparse.level = 1) {
             # this is an approach to making base::table generic, but dispatched on a single x argument.
             # this is necessary to make it so we can dispatch on x=humdrumR with non-standard evaluation of ... 
             # the hope is that this function behaves exactly like base::table (for non humdrumR classes), but its tricky to achieve.
             # table(x = ..., x=...) will cause an error that doesn't appear in base::table.
             args <- list(...)
             
             if (missing(x)) {
               # No x given: promote the first ... argument to x and recurse
               # so dispatch can happen on it.
               if (is.null(dnn)) dnn <- character(length(args))
               dnn <- ifelse(dnn == '', names(args), dnn)
               names(args)[1] <- 'x'
               return(do.call('table', 
                              c(args, 
                                list(exclude = exclude, useNA = useNA, dnn = dnn, deparse.level = deparse.level))))
             }
             # Recover the unevaluated call arguments (minus table()'s own
             # options) to build default dimnames labels below.
             exprs <- sys.call()[-1]
             exprs <- exprs[!.names(exprs) %in% c('exclude', 'useNA', 'dnn', 'deparse.level', '.drop', 'na.rm')]
             names <- .names(exprs)
             if (!any(names == 'x')) names[which(names == '')[1]] <- 'x'
             
             
             args <- append(args, list(x = x), which(names == 'x')[1] - 1) # put x into right position (argument order)
             
             
             # Dimension names: explicit dnn > argument names > deparsed expressions.
             if (is.null(dnn)) dnn <- character(length(args))
             dnn <- ifelse(dnn == '', names(args), dnn)
             dnn <- ifelse(dnn == '' | (dnn == 'x' & .names(exprs) != 'x'), deparse.names(exprs, deparse.level), dnn)
             
             # NOTE(review): `token` is computed but never used below — confirm
             # whether it was meant to influence the result.
             token <- any(sapply(args, inherits, what = 'token'))
             args <- lapply(args, factorize)
             
             tab <- do.call(base::table, c(args, list(exclude = exclude, useNA = useNA, dnn = dnn, deparse.level = deparse.level)))
             
             tab
           }) 




#' @export
#' @rdname distributions
setMethod('table', 'humdrumR', 
          function(x, ..., exclude = if (useNA == 'no') c(NA, NaN),
                   useNA = 'no', dnn = NULL,
                   deparse.level = 1) {
            # table() method for humdrumR objects: expressions in ... are
            # evaluated non-standardly within the object's fields (via with()).
            quos <- rlang::enquos(...)
            tab <- if (length(quos)) {
              # Label dimensions by argument name, falling back on the
              # deparsed expressions.
              dnn <- .names(quos)
              dnn[dnn == ''] <- deparse.names(quos[dnn == ''], deparse.level = deparse.level)
              quo <- rlang::quo(with(x, table(!!!quos, exclude = exclude, useNA = !!useNA, dnn = dnn)))
              rlang::eval_tidy(quo)
              
              
            } else {
              # No expressions given: tabulate the selected (and grouping) fields.
              fields <- pullFields(x, union(selectedFields(x), getGroupingFields(x)))
              
              do.call('table', c(as.list(fields), list(exclude = exclude, useNA = useNA, dnn = dnn, deparse.level = deparse.level)))
            }
            
            humdrumR.table(tab)
            
          })

#' @export
#' @rdname distributions
setMethod('table', 'distribution',
          function(x) {
            # A distribution already tabulates its levels; just convert it
            # (dispatches to as.table.distribution).
            as.table(x)
          })
# Computational-Cognitive-Musicology-Lab/humdrumR documentation built on Oct. 22, 2024, 9:28 a.m.