vignettes/plfit.R

###############################################################################
#
# library 
# 
library(VGAM)   # zeta function
library(R.matlab) # read matlab file just for test purpose
#
###############################################################################
#
# test zone 
#
###############################################################################
runtest <- function(){
 #read a matlab file
 #x generated by x = randht(10000,'xmin',15,'powerlaw',2.5) with matlab (continuous law)
 x<-readMat(file("x_2.5_15_10000.mat","rb"))$x[,1]
 #plfit matlab:alpha=2.4975,xmin=18.5752,D=0.0062
 plfit(x)
 #plfit R:alpha=2.497485,xmin=18.57524,D=0.006174977
 #x0 generated by x0 =floor(x) with matlab (big discrete approximation)
 x0<-readMat(file("x0_2.5_15_10000.mat","rb"))$x[,1]
 #plfit matlab:alpha=2.4700,xmin=15,D=0.0056
 plfit(x0)
 #plfit r:alpha=2.47,xmin=15,D=0.00564463

 
}
###############################################################################
#
# PLFIT fits a power-law distributional model to data.
#
#    PLFIT(x) estimates x_min and alpha according to the goodness-of-fit
#    based method described in Clauset, Shalizi, Newman (2007). x is a
#    vector of observations of some quantity to which we wish to fit the
#    power-law distribution p(x) ~ x^-alpha for x >= xmin.
#    PLFIT automatically detects whether x is composed of real or integer
#    values, and applies the appropriate method. For discrete data, if
#    min(x) > 1000, PLFIT uses the continuous approximation, which is
#    a reliable in this regime.
#
#    The fitting procedure works as follows:
#    1) For each possible choice of x_min, we estimate alpha via the
#       method of maximum likelihood, and calculate the Kolmogorov-Smirnov
#       goodness-of-fit statistic D.
#    2) We then select as our estimate of x_min, the value that gives the
#       minimum value D over all values of x_min.
#
#    Note that this procedure gives no estimate of the uncertainty of the
#    fitted parameters, nor of the validity of the fit.
#
#    Example:
#       x <- (1-runif(10000))^(-1/(2.5-1))
#       plfit(x)
#
#
# Version 1.0   (2008 February)
# Version 1.1   (2008 February)
#    - correction : division by zero if limit >= max(x) because the unique R function do no sort
#                   and the matlab function do...
# Version 1.1   (minor correction 2009 August)
#    - correction : lines 230 zdiff calcul was wrong when xmin=0 (thanks to Naoki Masuda)
#    - gpl version updated to v3.0 (asked by Felipe Ortega)
# Version 1.2 (2011 August)
#    - correction for method "limit" thanks to David R. Pugh 
#      xmins <- xmins[xmins<=limit] is now xmins <- xmins[xmins>=limit]
#    - "fixed" method added for xmins from David R. Pugh
#    - modifications by Alan Di Vittorio:
#       - correction : zdiff calculation was wrong when xmin==1
#       - the previous zdiff correction was incorrect
#       - correction : x has to have at least two unique values
#       - additional discrete x input test : discrete x cannot contain the value 0
#       - added option to truncate continuous xmin search when # of obs gets small
#
# Copyright (C) 2008,2011 Laurent Dubroca laurent.dubroca_at_gmail.com 
# (Stazione Zoologica Anton Dohrn, Napoli, Italy)
# Distributed under GPL 3.0 
# http://www.gnu.org/copyleft/gpl.html
# PLFIT comes with ABSOLUTELY NO WARRANTY
# Matlab to R translation based on the original code of Aaron Clauset (Santa Fe Institute)
# Source: http://www.santafe.edu/~aaronc/powerlaws/
#
# Notes:
#
# 1. In order to implement the integer-based methods in Matlab, the numeric
#    maximization of the log-likelihood function was used. This requires
#    that we specify the range of scaling parameters considered. We set
#    this range to be seq(1.5,3.5,0.01) by default. This vector can be
#    set by the user like so,
#
#       a <- plfit(x,"range",seq(1.001,5,0.001))
#
# 2. PLFIT can be told to limit the range of values considered as estimates
#    for xmin in two ways. First, it can be instructed to sample these
#    possible values like so,
#
#       a <- plfit(x,"sample",100)
#
#    which uses 100 uniformly distributed values on the sorted list of
#    unique values in the data set. Alternatively, it can simply omit all
#    candidates below a hard limit, like so
#
#       a <- plfit(x,"limit",3.4)
#
#    In the case of discrete data, it rounds the limit to the nearest
#    integer.
#
#     Finally, if you wish to force the threshold parameter to take a specific value 
#     (useful for bootstrapping), simply call plfit() like so
#       
#       a <- plfit(x,"fixed",3.5)
#
# 3. When the input sample size is small (e.g., < 100), the estimator is
#    known to be slightly biased (toward larger values of alpha). To
#    explicitly use an experimental finite-size correction, call PLFIT like
#    so
#
#       a <- plfit(x,finite=TRUE)
#
# 4. For continuous data, PLFIT can return erroneously large estimates of 
#    alpha when xmin is so large that the number of obs x >= xmin is very 
#    small. To prevent this, we can truncate the search over xmin values 
#    before the finite-size bias becomes significant by calling PLFIT as
#    
#       a = plfit(x,nosmall=TRUE);
#    
#    which skips values xmin with finite size bias > 0.1.
#
###############################################################################
plfit<-function(x=rpareto(1000,10,2.5),method="limit",value=c(),finite=FALSE,nowarn=FALSE,nosmall=FALSE){
   #init method value to NULL	
   vec <- c() ; sampl <- c() ; limit <- c(); fixed <- c()
###########################################################################################
#
#  test and trap for bad input
#
   switch(method, 
     range = vec <- value,
     sample = sampl <- value,
     limit = limit <- value,
     fixed = fixed <- value,
     argok <- 0)
   
   if(exists("argok")){stop("(plfit) Unrecognized method")}

   if( !is.null(vec) && (!is.vector(vec) || min(vec)<=1 || length(vec)<=1) ){
     print(paste("(plfit) Error: ''range'' argument must contain a vector > 1; using default."))
     vec <- c()
   }
   if( !is.null(sampl) && ( !(sampl==floor(sampl)) ||  length(sampl)>1 || sampl<2 ) ){
     print(paste("(plfit) Error: ''sample'' argument must be a positive integer > 2; using default."))
     sample <- c()
   }
   if( !is.null(limit) && (length(limit)>1 || limit<1) ){
     print(paste("(plfit) Error: ''limit'' argument must be a positive >=1; using default."))
     limit <- c()
   }
   if( !is.null(fixed) && (length(fixed)>1 || fixed<=0) ){
     print(paste("(plfit) Error: ''fixed'' argument must be a positive >0; using default."))
     fixed <- c() 
   }   

#  select method (discrete or continuous) for fitting and test if x is a vector
   fdattype<-"unknow"
   if( is.vector(x,"numeric") ){ fdattype<-"real" }
   if( all(x==floor(x)) && is.vector(x) ){ fdattype<-"integer" }
   if( all(x==floor(x)) && min(x) > 1000 && length(x) > 100 ){ fdattype <- "real" }
   if( fdattype=="unknow" ){ stop("(plfit) Error: x must contain only reals or only integers.") }

#
#  end test and trap for bad input
#
###########################################################################################

###########################################################################################
#
#  estimate xmin and alpha in the continuous case
#
   if( fdattype=="real" ){

    xmins <- sort(unique(x))
    xmins <- xmins[-length(xmins)]

    if( !is.null(limit) ){
      xmins <- xmins[xmins>=limit]
    } 
    if( !is.null(fixed) ){
      xmins <- fixed
    }
    if( !is.null(sampl) ){ 
      xmins <- xmins[unique(round(seq(1,length(xmins),length.out=sampl)))]
    }
 
    dat <- rep(0,length(xmins))
    z   <- sort(x)
    
    for( xm in 1:length(xmins) ){
      xmin <- xmins[xm]
      z    <- z[z>=xmin]
      n    <- length(z)
      # estimate alpha using direct MLE
      a    <- n/sum(log(z/xmin))
      # truncate search if nosmall is selected
      if( nosmall ){
       if((a-1)/sqrt(n) > 0.1){
           dat <- dat[1:(xm-1)]
           print(paste("(plfit) Warning : xmin search truncated beyond",xmins[xm-1]))
           break
       }
      }
      # compute KS statistic
      cx   <- c(0:(n-1))/n
      cf   <- 1-(xmin/z)^a
      dat[xm] <- max(abs(cf-cx))
    }
 
    D     <- min(dat)
    xmin  <- xmins[min(which(dat<=D))]
    z     <- x[x>=xmin]
    n     <- length(z)
    alpha <- 1 + n/sum(log(z/xmin))

    if( finite ){
      alpha <- alpha*(n-1)/n+1/n # finite-size correction
    }
    if( n<50 && !finite && !nowarn){
      print("(plfit) Warning : finite-size bias may be present")
    }

   }
#
#  end continuous case
#
###########################################################################################

###########################################################################################
#
#  estimate xmin and alpha in the discrete case
#
   if( fdattype=="integer" ){

    if( is.null(vec) ){ vec<-seq(1.5,3.5,.01) } # covers range of most practical scaling parameters
    zvec <- zeta(vec)

    xmins <- sort(unique(x))
    xmins <- xmins[-length(xmins)]

    if( !is.null(limit) ){
      limit <- round(limit)
      xmins <- xmins[xmins>=limit]
    } 

    if( !is.null(fixed) ){
      xmins <- fixed
    }

    if( !is.null(sampl) ){ 
      xmins <- xmins[unique(round(seq(1,length(xmins),length.out=sampl)))]
    }

    if( is.null(xmins) || length(xmins) < 2){
      stop("(plfit) error: x must contain at least two unique values.")
    }

    if(length(which(xmins==0) > 0)){
      stop("(plfit) error: x must not contain the value 0.")
    }

    xmax <- max(x)
     dat <- matrix(0,nrow=length(xmins),ncol=2)
       z <- x
    for( xm in 1:length(xmins) ){
      xmin <- xmins[xm]
      z    <- z[z>=xmin]
      n    <- length(z)
      # estimate alpha via direct maximization of likelihood function
      #  vectorized version of numerical calculation
      # matlab: zdiff = sum( repmat((1:xmin-1)',1,length(vec)).^-repmat(vec,xmin-1,1) ,1);
      if(xmin==1){
        zdiff <- rep(0,length(vec))
      }else{
       zdiff <- apply(rep(t(1:(xmin-1)),length(vec))^-t(kronecker(t(array(1,xmin-1)),vec)),2,sum)
      }
      # matlab: L = -vec.*sum(log(z)) - n.*log(zvec - zdiff);
      L <- -vec*sum(log(z)) - n*log(zvec - zdiff);
      I <- which.max(L)
      # compute KS statistic
      fit <- cumsum((((xmin:xmax)^-vec[I])) / (zvec[I] - sum((1:(xmin-1))^-vec[I])))
      cdi <- cumsum(hist(z,c(min(z)-1,(xmin+.5):xmax,max(z)+1),plot=FALSE)$counts/n)
      dat[xm,] <- c(max(abs( fit - cdi )),vec[I])
    }
    D     <- min(dat[,1])
    I     <- which.min(dat[,1])
    xmin  <- xmins[I]
    n     <- sum(x>=xmin)
    alpha <- dat[I,2]

    if( finite ){
      alpha <- alpha*(n-1)/n+1/n # finite-size correction
    }
    if( n<50 && !finite && !nowarn){
      print("(plfit) Warning : finite-size bias may be present")
    }

   }
#
#  end discrete case
#
###########################################################################################

#  return xmin, alpha and D in a list
   return(list(xmin=xmin,alpha=alpha,D=D))
}
csgillespie/poweRlaw documentation built on July 26, 2018, 9:54 p.m.