R/ThreadNet_Metrics.R

Defines functions estimate_network_complexity print_network_nodes_edges estimate_task_complexity_index routineness_metric compression_index compute_entropy plot_entropy

Documented in compression_index compute_entropy estimate_network_complexity estimate_task_complexity_index plot_entropy routineness_metric

 ##########################################################################################################
# THREADNET:  Metrics

# This software may be used according to the terms provided in the
# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
# Absolutely no warranty!
##########################################################################################################
# Functions for metrics:  entropy, complexity, routine-ness, etc.


#' @title Estimate the number of paths in a directed graph
#' @description Takes a network description (nodes and edges, as generated by
#' the function threads_to_network) and estimates the number of paths, as
#' described in Haerem, Pentland and Miller (2015). The estimate correlates
#' with McCabe's (1975) cyclomatic complexity.
#' @name estimate_network_complexity
#' @param net Object holding a data frame of nodes (`nodeDF`) and a data frame
#'   of edges (`edgeDF`)
#' @return number
#' @export
estimate_network_complexity <- function(net) {
  # The index only needs the sizes of the node and edge tables.
  node_count <- nrow(net$nodeDF)
  edge_count <- nrow(net$edgeDF)

  estimate_task_complexity_index(node_count, edge_count)
}

#' @title Describe the size of a network as a string
#' @description Builds a one-line summary giving the number of rows in the
#' node data frame and in the edge data frame of the network.
#' @name print_network_nodes_edges
#' @param net Object holding a data frame of nodes (`nodeDF`) and a data frame
#'   of edges (`edgeDF`)
#' @return string
#' @export
print_network_nodes_edges <- function(net) {
  n_nodes <- nrow(net$nodeDF)
  n_edges <- nrow(net$edgeDF)

  paste0("Number of nodes = ", n_nodes, "  Number of edges = ", n_edges)
}


#' @title Estimate the number of paths in a directed graph
#' @description Same as estimate_network_complexity, but this version takes
#' the number of vertices and the number of edges as parameters.
#' @name estimate_task_complexity_index
#' @param v number of vertices (or nodes)
#' @param e number of edges
#' @return number
#' @export
estimate_task_complexity_index <- function(v, e) {
  # Inputs (the MatLab version used the same argument names):
  #   v = number of vertices, tested for the range 10 < v < 100
  #   e = number of edges
  #
  # Output:
  #   per the original notes, the index correlates with Log10(simple paths)
  #   with r >= 0.8

  # The ORM paper analysis used a constant of 0.12. The value 0.08 is used
  # here; with it the exponent is exactly zero for the boundary condition of
  # 2 nodes and 1 edge.
  k <- 0.08
  exponent <- k + k * e - k * v

  10^exponent
}


#################################################################
#' @title Computes a metric of routineness based on frequency of ngrams
#' @description Computes the fraction of observed behavior that conforms to an observed pattern.
#' Current version uses ngrams, but it would be good to use spmf pattern mining to avoid including duplicate patterns  (e.g., a-b-c and b-c-d)
#' @name routineness_metric
#' @param o  data frame with occurrences or events
#' @param TN  name of column with threadNumbers
#' @param CF name of column with contextual factor
#' @param n size of ngram
#' @param m how many of the most frequent ngrams to include. When m > 1, there is a risk of duplication.
#'   If m exceeds the number of ngrams found, all available ngrams are used; m = 0 yields 0.
#' @return number, index of routineness.
#' @export
routineness_metric <- function(o, TN, CF, n, m) {

  # get the ngrams; the first m rows are taken as the most frequent ones, so
  # this relies on count_ngrams returning them in that order
  ng <- count_ngrams(o, TN, CF, n)

  # Never index past the end of the frequency column: ng$freq[1:m] with
  # m larger than the number of ngrams would pull in NA and turn the whole
  # metric into NA. seq_len() also makes m = 0 select nothing, whereas 1:0
  # would select the first ngram.
  top <- seq_len(min(m, length(ng$freq)))

  # ratio of occurrences covered by the top m ngrams (each covering n
  # occurrences) to the total number of occurrences
  sum(ng$freq[top]) * n / nrow(o)
}


#############################################################################
#' @title Computes the compressibility of the data in one column of a data frame
#' @description Compressibility is an index of complexity -- more compressible means less complex.  This function computes the ratio of the size of
#' the compressed data to the size of the original data, both measured in bytes.  Uses built-in functions for in-memory compression.
#' The column is converted to character and its values are joined with newlines before compression.
#' @name compression_index
#' @param df  a data frame containing occurrences or events
#' @param CF  a column or contextual factor in that data frame
#' @return number containing compressibility index. Normally 0 < i < 1, but it may exceed 1 for very short
#'   inputs because of fixed compression overhead. NA if the column holds no data.
#' @export
compression_index <- function(df, CF) {
  # Build the byte sequence explicitly, in the same way memCompress() converts
  # a character vector (strings separated by "\n"), so that numerator and
  # denominator are both byte counts of the same data. Dividing by the number
  # of vector elements instead would not give a size ratio.
  original <- charToRaw(paste(as.character(df[[CF]]), collapse = "\n"))

  # Nothing to compress: the ratio is undefined rather than Inf.
  if (length(original) == 0) {
    return(NA_real_)
  }

  compressed <- memCompress(original, type = "gzip")
  length(compressed) / length(original)
}


#######################################################################
#compute entropy for a set of observations in a column from a data frame
#' @title Compute the entropy of a contextual factor
#' @description Each column in the raw data represents a contextual factor.  This function computes the entropy of each factor that is selected for use in the
#' analysis. The entropy is computed with the natural logarithm. Levels with a frequency of zero contribute nothing to the result.
#' @name compute_entropy
#' @param freq is the frequency distribution of the levels in the factor
#' @return number
#' @export
compute_entropy <- function(freq) {
  # A zero count would give 0 * log(0) = NaN and poison the sum (this happens
  # e.g. when table() is applied to a factor with unused levels). By the usual
  # convention 0 * log(0) = 0, so zero counts are simply dropped. NA counts
  # are kept so that missing input still propagates to the result.
  observed <- freq[is.na(freq) | freq != 0]

  p <- observed / sum(observed)
  -sum(p * log(p))
}

# code to plot entropy as a function of zoom_level
#' @title  plot entropy as a function of zoom_level
#' @description  Finds every column of `e` whose name contains 'ZM_' (presumably the zoom-level columns -- confirm against the
#' code that creates them), computes the entropy of the frequency table of each such column with compute_entropy, and plots
#' the resulting values in column order.
#' @name plot_entropy
#' @param e  data frame of events with zoom levels
#' @return Called for its side effect of drawing a regular R plot
#' @export
plot_entropy <- function(e){
 # grep() returns the positions of the matching column names; each position i
 # is used to pull one column, tabulate it and compute its entropy.
 plot(unlist(lapply(grep('ZM_',colnames(e)),function(i){compute_entropy(table(e[[i]]))})))
}
ThreadNet/ThreadNet documentation built on July 26, 2019, 8:16 p.m.