Nothing
#' Obtain Zipf's Distribution Parameters from Data
#'
#' Zipf's distribution is commonly found for text data. Closely related to the
#' Pareto and power-law distributions, the Zipf's distribution produces
#' highly skewed data. This function obtains the best fitting parameters
#' to Zipf's distribution
#'
#'
#' @details The best parameters are optimized by minimizing the aboslute
#' difference between the original frequencies and the frequencies obtained
#' by the \emph{beta} and \emph{alpha} parameters in the following
#' formula (Piantadosi, 2014):
#'
#' \emph{f(r) proportional to 1 / (r + beta)^alpha}
#'
#' where \emph{f(r)} is the \emph{r}th most frequency,
#' \emph{r} is the rank-order of the data, \emph{beta}
#' is a shift in the rank (following Mandelbrot, 1953, 1962),
#' and \emph{alpha} is the power of the rank with greater
#' values suggesting greater differences between the largest
#' frequency to the next, and so forth.
#'
#' @param data Numeric vector, matrix, or data frame.
#' Numeric data to determine Zipf's distribution parameters
#'
#' @return Returns a vector containing the estimated \code{beta} and
#' \code{alpha} parameters. Also contains \code{zipfs_sse} which corresponds
#' to the sum of square error between frequencies based
#' on the parameter values estimated and the original data frequencies
#'
#' @examples
#' # Generate factor data
#' two_factor <- simulate_factors(
#' factors = 2, # factors = 2
#' variables = 6, # variables per factor = 6
#' loadings = 0.55, # loadings between = 0.45 to 0.65
#' cross_loadings = 0.05, # cross-loadings N(0, 0.05)
#' correlations = 0.30, # correlation between factors = 0.30
#' sample_size = 1000 # number of cases = 1000
#' )
#'
#' # Transform data to Mandelbrot's Zipf's
#' two_factor_zipfs <- data_to_zipfs(
#' lf_object = two_factor,
#' beta = 2.7,
#' alpha = 1
#' )
#'
#' # Obtain Zipf's distribution parameters
#' obtain_zipfs_parameters(two_factor_zipfs$data)
#'
#' @references
#' Mandelbrot, B. (1953).
#' An informational theory of the statistical structure of language.
#' \emph{Communication Theory}, \emph{84}, 486–502.
#'
#' Mandelbrot, B. (1962).
#' On the theory of word frequencies and on related Markovian models of discourse.
#' \emph{Structure of Language and its Mathematical Aspects}, 190–219.
#'
#' Piantadosi, S. T. (2014).
#' Zipf’s word frequency law in natural language: A critical review and future directions.
#' \emph{Psychonomic Bulletin & Review}, \emph{21}(5), 1112-1130.
#'
#' @author
#' Alexander P. Christensen <alexpaulchristensen@gmail.com>,
#' Hudson Golino <hfg9s@virginia.edu>,
#' Luis Eduardo Garrido <luisgarrido@pucmm.edu>
#'
#' @export
#'
# Obtain Zipf's distribution parameters
# Updated 28.09.2022
obtain_zipfs_parameters <- function(data)
{
# Check if data is vector
if(is.vector(data)){
# Obtain frequencies
frequencies <- data
# Obtain Zipf's values
zipfs <- frequencies / length(data)
}else{
# Ensure data is matrix
data <- as.matrix(data)
# Obtain frequencies
frequencies <- as.vector(data)
# Obtain Zipf's values
zipfs <- frequencies / nrow(data)
}
# Assume rank order based on frequencies
rank_order <- rank(-frequencies)
# For large datasets, remove frequencies
rm(frequencies); gc(FALSE)
# Equation to solve
# zipfs = 1 / (rank_order + beta)^alpha
# Set alpha and beta sequence
alpha_sequence <- seq(1, 4, 0.50)
beta_sequence <- seq(0, 200, 10)
# Try to solve for both parameters
parameters <- estimate_parameters(
alpha_sequence = alpha_sequence,
beta_sequence = beta_sequence,
zipfs = zipfs,
rank_order = rank_order
)
## alpha
alpha_sequence <- seq(
parameters$alpha - 0.5,
parameters$alpha + 0.5,
0.10
)
## Ensure minimum equal zero
if(min(alpha_sequence) < 1){
alpha_sequence <- seq(1, max(alpha_sequence), 0.10)
}
## beta
beta_sequence <- seq(
parameters$beta - 50,
parameters$beta + 50,
5
)
## Ensure minimum equal zero
if(min(beta_sequence) < 0){
beta_sequence <- seq(0, max(beta_sequence), 5)
}
# Try to solve for both parameters
parameters <- estimate_parameters(
alpha_sequence = alpha_sequence,
beta_sequence = beta_sequence,
zipfs = zipfs,
rank_order = rank_order
)
## alpha
alpha_sequence <- seq(
parameters$alpha - 0.10,
parameters$alpha + 0.10,
0.01
)
## Ensure minimum equal zero
if(min(alpha_sequence) < 1){
alpha_sequence <- seq(1, max(alpha_sequence), 0.01)
}
# Try to solve for both parameters
parameters <- estimate_parameters(
alpha_sequence = alpha_sequence,
beta_sequence = beta_sequence,
zipfs = zipfs,
rank_order = rank_order
)
## beta
beta_sequence <- seq(
parameters$beta - 10,
parameters$beta + 10,
1
)
## Ensure minimum equal zero
if(min(beta_sequence) < 0){
beta_sequence <- seq(0, max(beta_sequence), 1)
}
# Try to solve for both parameters
parameters <- estimate_parameters(
alpha_sequence = alpha_sequence,
beta_sequence = beta_sequence,
zipfs = zipfs,
rank_order = rank_order
)
## alpha
alpha_sequence <- seq(
parameters$alpha - 0.05,
parameters$alpha + 0.05,
0.01
)
## Ensure minimum equal zero
if(min(alpha_sequence) < 1){
alpha_sequence <- seq(1, max(alpha_sequence), 0.01)
}
## beta
beta_sequence <- seq(
parameters$beta - 1,
parameters$beta + 1,
0.10
)
## Ensure minimum equal zero
if(min(beta_sequence) < 0){
beta_sequence <- seq(0, max(beta_sequence), 0.01)
}
# Try to solve for both parameters
parameters <- estimate_parameters(
alpha_sequence = alpha_sequence,
beta_sequence = beta_sequence,
zipfs = zipfs,
rank_order = rank_order
)
## beta
beta_sequence <- seq(
parameters$beta - 0.10,
parameters$beta + 0.10,
0.01
)
## Ensure minimum equal zero
if(min(beta_sequence) < 0){
beta_sequence <- seq(0, max(beta_sequence), 0.01)
}
# Try to solve for both parameters
parameters <- estimate_parameters(
alpha_sequence = parameters$alpha,
beta_sequence = beta_sequence,
zipfs = zipfs,
rank_order = rank_order
)
# Set final parameters
alpha <- parameters$alpha
beta <- parameters$beta
# Final values
final_values <- 1 / (rank_order + beta)^alpha
# Compute root mean square error
sse <- zipf_sse(
values = final_values,
zipfs = zipfs
)
# Add space
cat("\n")
# Return parameters
return(
zapsmall(c(alpha = alpha, beta = beta, zipfs_sse = sse))
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.