R/powerexp.R
In Dpit: Distribution Pitting

########## Functions for power law with exponential cut-off
# Let's call this "powerexp" for short.
# The parameterization is 
### f(x) \propto (x/threshold)^exponent exp(-rate*x)
# Really, functions should check that rate is positive, and that
# exponent is negative (otherwise it's just gamma distribution w/ threshold)
# This is not yet implemented

# The normalization constant must be obtained numerically; it is an upper
# incomplete gamma function of negative index.  Unfortuantely, R only includes
# code for upper incomplete gamma functions of _positive_ index.  Sometimes
# the normalizing constant can be expressed in terms of the latter; if so this
# code does so.  In all other cases, the normalizing constant can be expressed
# in terms of the exponential integral function.
# A separate C program invokes the Gnu Scientific Library (GSL) to calculate
# the exponential integral.  This program should be found in an accompanying
# file called exponential-integral.tgz.
# To install it on a Unix system, first make sure GSL is installed.
### > tar xzf exponential-integral.tgz
### > cd exponential-integral
# At this point, you need to edit the file "Makefile" to give the location of
# the GSL, which on my system is /sw/lib (for the library) and /sw/include
# (for the included files).  Then
### > make
### > mv exp_int yourexecutablepath
# where "yourexecutablepath" is a directory where you can put executable
# files.  Then edit the variable "exp_int_function_filename" in this file,
# below, to give the full path to the executable program exp_int.
# This is not the world's slickest installation mechanism, no.




### R-standard functions for a distribution:
# dpowerexp		Probability density
# ppowerexp		Cumulative distribution function
# qpowerexp		Quantile function
# rpowerexp		Random variate generation
### Estimation functions:
# powerexp.fit		Find scaling exponent and exponential rate by maximizing
#			likelihood
# test.powerexp.fit	Test the quality of the powerexp.fit routine
#			by generating data from a pure Pareto and comparing
#			the result log-likelihood ratio to the chi^2
#			distribution (to which it should asymptotically
#			converge) --- mostly included for debugging of the
#			normalizing constant and numerical optimization
### Backstage functions, not for users:
# powerexp.loglike	Calculcate log-likelihood, not intended for users
# test.powerexp.fit.1	Inner loop of test.powerexp.fit (one call)
# qpowerexponce		Find one upper quantile via numerical inversion
# uigf			Upper incomplete gamma function
# exp_int		Exponential integral function (vectorized); invokes
#                       either of the two following functions
# exp_int_once.gsl	Exponential integral of one value, calling Gnu
#			Scientific Library (crudely)
# exp_int_once.nr	Exponential integral of one value, re-implementing
#			a numerical recipe in R (not included in this public
#			release for copyright reasons)


# Location of external program invoking the GSL for calculating the exponential
# integral function
#exp_int1 <- "/usr/local/bin1/exp_int1"
#
#### EDIT THIS LOCATION!!!! ###




# Density of power laws with exponential cut-off
# Returns NA on values < threshold
# Input: Data vector, lower threshold, scaling exponent, exponential rate,
#        log flag
# Output: vector of (log) densities
dpowerexp <- function(x, threshold=1, exponent, rate=0, log=FALSE) {
   # If the rate is zero, we've got a pure Pareto
   if (rate==0) {
     return(suppressWarnings(dpareto(x,threshold,exponent,log)))
   }
   if (rate==0) {
     cat("I should never be here!\n")
   }
   nondim_thresh <- threshold*rate
   #if (is.nan(log(nondim_thresh))) { cat("Non-dimensionalized threshold, ",nondim_thresh,", is too small for log\n")} #HJ: Taken out by me
   #if (is.nan(log(rate))) { cat("Rate, ",rate,", is too small for log\n")} #HJ: Taken out by me.
   #if (is.nan(log(uigf(-exponent+1,nondim_thresh)))) {cat("UIGF, ", uigf, ", is too small for log\n")} #HJ: p value will be high or even 1 if Ntail is small enough to trigger this.
   prefactor <- rate/(nondim_thresh^exponent)
   C <- prefactor/uigf(-exponent+1,nondim_thresh)
   prefactor.log <- log(rate) - exponent*log(nondim_thresh)
   C.log <- prefactor.log - log(uigf(-exponent+1,nondim_thresh))
   #if (is.nan(C.log)) {cat("Normalizing constant", C, ", is too small for log\n")} #HJ: Produces warning messages, but not stoppages.
   # If I want the log density, I may as well avoid some finite-precision
   # arithmetic first
   if(log) {
   	f <- function(y) {C.log - exponent*(log(y)-log(threshold)) - rate*y}
   }
   else {
	f <- function(y) {C*(y/threshold)^(-exponent)*exp(-rate*y)}
   }
   d <- ifelse(x<threshold,NA,f(x))
   return(d)
}

# Cumulative distribution function for power law with exponential cut-off
# Returns NA on values less than lower threshold
# Input: Data vector, lower threshold, scaling exponent, exponential rate,
#        usual flags
# Output: Vector of (log) probabilities
ppowerexp <- function(x, threshold=1,exponent,rate=0,
                      lower.tail=TRUE,log.p=FALSE) {
   # The complementary distribution, Pr(X > x), is very simple,
   # it's just uigf(-exponent+1,rate*x)/uigf(-exponent+1,rate*threshold)
   # So, we do that
   if (rate==0) {
     return(suppressWarnings(ppareto(x,threshold,exponent,log,lower.tail)))
   }
   C <- 1/uigf(-exponent+1,rate*threshold)
   this_uigf <- function(x){uigf(-exponent+1,rate*x)}
   if ((!lower.tail)&&(!log.p)) {
     f <- function(x) {C*this_uigf(x)}
   }
   if ((lower.tail)&&(!log.p)) {
     f <- function(x) {1 - C*this_uigf(x)}
   }
   if ((!lower.tail)&&(log.p)) {
     f <- function(x) {log(C) + log(this_uigf(x))}
   }
   if ((lower.tail)&&(log.p)) {
     f <- function(x) {log(1 - c*this_uigf(x))}
   }
   p <- ifelse(x<threshold,NA,f(x))
   return(p)
}

# Quantiles of power-law with exponential cut-off distributions
# Input: Vector of (log) probabilities, lower threshold, scaling exponent,
#        exponential rate, usual flags
# Output: Vector of quantiles
qpowerexp <- function(p, threshold=1, exponent, rate=0, lower.tail=TRUE, log.p = FALSE) {
  # This isn't going to be simple, is it?  However, since ppowerexp is known,
  # numerical inversion should be possible
  if (rate==0) {
    return(suppressWarnings(qpareto(p,threshold,exponent,log.p,lower.tail)))
  }
  if (log.p) {
    z <- exp(p)
  } else {
    z <- p
  }
  if (lower.tail) {
    q <- 1-z
  } else {
    q <- z
  }
  qs <- sapply(q,qpowerexponce,threshold,exponent,rate)
  return(qs)
}

# Generate random variates from a power law with exponential cut-off
# Input: Integer size, lower threshold, scaling exponent, exponential rate
# Output: Real vector of random variates
rpowerexp <- function(n,threshold=1,exponent,rate=0) {
  if (rate==0) {
    return(suppressWarnings(rpareto(n,threshold,exponent)))
  }
  ru <- runif(n)
  r <- suppressWarnings(qpowerexp(ru,threshold,exponent,rate))
  return(r)
}

# Estimate scaling exponent and exponential rate of power law with
# exponential cut-off by maximizing likelihood
# Input: Data vector, lower threshold
# Output: List, giving type ("powerexp"), scaling exponent, exponential
#         rate, lower threshold, log-likelihood
powerexp.fit <- function(data,threshold=1,method="constrOptim",initial_rate=-1) {
  x <- data[data>=threshold]
  negloglike <- function(theta) {
    -powerexp.loglike(x,threshold,exponent=theta[1],rate=theta[2])
  }
  # Fit a pure power-law distribution
  pure_powerlaw <- pareto.fit(data,threshold)
  # Use this as a first guess at the exponent
  initial_exponent <- pure_powerlaw$exponent
  if (initial_rate < 0) { initial_rate <- exp.fit(data,threshold)$rate }
  minute_rate <- 1e-6
  theta_0 <- as.vector(c(initial_exponent,initial_rate))
  theta_1 <- as.vector(c(initial_exponent,minute_rate))
  switch(method,
    constrOptim = {
      # Impose the constraint that rate >= 0
      # and that exponent >= -1
      ui <- rbind(c(1,0),c(0,1))
      ci <- c(-1,0)
      # Can't start with values on the boundary of the feasible set so add
      # tiny amounts just in case
      if (theta_0[1] == -1) {theta_0[1] <- theta_0[1] + minute_rate}
      if (theta_0[2] == 0) {theta_0[2] <- theta_0[2] + minute_rate}
      est <- constrOptim(theta=theta_0,f=negloglike,grad=NULL,ui=ui,ci=ci)
      alpha <- est$par[1]
      lambda <- est$par[2]
      loglike <- -est$value},
    optim = {
      est <- optim(par=theta_0,fn=negloglike,gr=NULL,method="SANN")
      alpha <- est$par[1]
      lambda <- est$par[2]
      loglike <- -est$value},
    nlm = {
      est.0 <- suppressWarnings(nlm(f=negloglike,p=theta_0))
      est.1 <- suppressWarnings(nlm(f=negloglike,p=theta_1))
      est <- est.0
      if (-est.1$minimum > -est.0$minimum) { est <- est.1;cat("NLM had to switch\n") }
      alpha <- est$estimate[1]
      lambda <- est$estimate[2]
      loglike <- -est$minimum},
    {cat("Unknown method",method,"\n"); alpha<-NA; lambda<-NA; loglike<-NA}
  )
  fit <- list(type="powerexp", exponent=alpha, rate=lambda, xmin=threshold,
              loglike=loglike, samples.over.threshold=length(x))
  return(fit)
}

# Calculate log-likelihood under power law with exponential cut-off
# Input: Data vector, lower threshold, scaling exponent, exponential rate
# Output: Real-valued log-likelihood
powerexp.loglike <- function(x,threshold=1,exponent,rate) {
  x <- x[x>=threshold]
  L <- sum(suppressWarnings(dpowerexp(x,threshold,exponent,rate,log=TRUE)))
  return(L)
}

# Test the quality of the fitting by generating Pareto variates and seeing
# whether the log likelihood ratio is \chi^2
# Input: number of replicates, sample size, distributional parameters
# Output: List of 2*LLR values
# Side-effect: Plot of CDF of 2*LLR vs. pchisq(,1)
#test.powerexp.fit <- function(reps=200,n=200,xmin=1,alpha=2.5) {
#  l <- replicate(reps,2*test.powerexp.fit.1(n,xmin,alpha))
#  plot(ecdf(l))
#  curve(pchisq(x,1),add=TRUE,col="red")
#  return(l)
#}

#test.powerexp.fit.1 <- function(n=200,xmin=1,alpha=2.5) {
#  x <- suppressWarnings(rpareto(n,xmin,alpha))
#  pareto.ll <- pareto.fit(x,xmin)$loglike
#  powerexp.ll <- powerexp.fit(x,xmin,initial_rate=0)$loglike
#  return(powerexp.ll - pareto.ll)
#}


# Upper quantile of a single value from a power-law with exponential cut-off
# Should only ever be called by qpowerexp
# Input: Probability, lower threshold, scaling exponent, exponential rate
# Output: real-valued quantile
qpowerexponce <- function(q,threshold,exponent,rate) {
  # Finds a single powerexp (upper) quantile; gets called via sapplied in the
  # real qpowerexp
  # Handle easy cases first
  if (q==0) {
    return(Inf)
  }
  if (q==1) {
    return(threshold)
  }
  # The basic idea is to solve the equation
  # q = ppowerexp(x)
  # for x, by invoking the "uniroot" routine for finding zeroes of one-argument
  # functions
  w <- function(x) {q - suppressWarnings(ppowerexp(x,threshold,exponent,rate,lower.tail=FALSE))}
  # powerexp decays more quickly than a power-law distribution of the same
  # exponent and threshold
  # Use the quantile of the Pareto as an upper bound
  # Use the lower threshold as a lower bound
  pareto_x_of_q <- suppressWarnings(qpareto(q,threshold,exponent,lower.tail=FALSE))
  upperquant <- uniroot(f=w,interval=c(threshold,pareto_x_of_q))
  x <- upperquant$root
  return(x)
}




################## Numerical utility functions
# Mostly in connection with the "powerexp" distribution family


# The upper incomplete gamma function, allowing for negative indices
# Part of the normalizing constant in the powerexp family
# Uses the identity (holding for all a != 0)
### Gamma(a,x) = (1/a)(Gamma(a+1,x) - e^{-x} x^a)
# and Gamma(0,x) = E_1(x), first-order exponential integral (special) function

# Input: exponent (a), lower cut-off (x),  log flag
# Output: Real value
uigf <- function(a,x,log=FALSE) {
  if (a > 0) {
    # Use gamma-distribution tricks
    Gamma <- gamma(a)*pgamma(x,a,lower.tail=FALSE)
  }
  if (a == 0) {
    # Invoke the exponential integral function
    Gamma <- exp_int(x)
  }
  if (a < 0) {
    # Recurse
    Gamma <- (1/a) * (uigf(a+1,x) - exp(-x)*(x^a))
  }
  if (log) {
    Gamma <- log(Gamma)
  }
  return(Gamma)
}

# The exponential integral function
# Used in evaluating the upper incomplete gamma function, when the latter
# has a negative integer argument
# Two choices: invoke the GNU Scientific Library (GSL), via Cosma's
# very clumsy R/C interface
# OR use the series expansion given in Numerical Recipes, translated into
# R without any optimization or permissions
# Both methods don't work well on vector data, so exp_int acts as
# wrapper, with a choice of method, defaulting to the GSL (where my programming
# had less chance to screw it up)

# Compute the first-order exponential integrals of vectors of arguments
# The method "gsl" makes an external call to the GSL
# The method "nr" calls code lifted from Numerical Recipies
# For copyright reasons that method is NOT part of this public release
# Input: Real-valued vector, method flag
# Output: Real-valued vector
exp_int <- function(x,method="gsl") {
  switch(method,
    gsl = {ei <- sapply(x,exp_int_once.gsl)},
    nr = {ei <- sapply(x,exp_int_once.nr)},
    {cat("Unknown method", method); ei <- NA}
  )
  return(ei)
}

# Compute the exponential integral via invoking the GNU scientific library ONCE
# Input: real-valued argument (x)
# Output: Real value
exp_int_once.gsl <- function(x) {
  # ei.command <- paste(system.file(package="Dpit", "expint"), x) #ei.command <- paste(exp_int,x)
  # ei <- as.numeric(system(ei.command,intern=TRUE))
  gsl::expint_E1(x)
}