IndexNumR: Index Number Calculation

Documented in priceIndex quantityIndex

# IndexNumR: a package for index number computation
# Copyright (C) 2018 Graham J. White (g.white@unswalumni.com)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.


#' Dutot
#'
#' compute a bilateral Dutot index for a single period
#'
#' @keywords internal
#' @noRd
dutot_t <- function(p0,p1){
  M0 <- length(p0)
  M1 <- length(p1)
  return(((1/M1)*sum(p1))/((1/M0)*sum(p0)))
}

#' Carli
#'
#' compute a bilateral Carli index for a single period
#'
#' @keywords internal
#' @noRd
carli_t <- function(p0,p1){
  return((1/length(p0))*sum(p1/p0))
}

#' Jevons
#'
#' compute a bilateral Jevons index for a single period
#'
#' @keywords internal
#' @noRd
jevons_t <- function(p0,p1){
  return(prod((p1/p0)^(1/length(p0))))
}

#' Harmonic mean
#'
#' compute a bilateral Harmonic mean index for a single period
#'
#' @keywords internal
#' @noRd
harmonic_t <- function(p0,p1){
  return((1/length(p0)*sum((p1/p0)^-1))^-1)
}

#' CSWD
#'
#'compute a bilateral CSWD index for a single period
#'
#' @keywords internal
#' @noRd
cswd_t <- function(p0,p1){
  return(sqrt((carli_t(p0,p1)*harmonic_t(p0,p1))))
}

#' Fixed-basket indices - Laspeyres (q=q0), Paasche (q=q1), Lowe (q=qb)
#'
#' compute a bilateral fixed base index for a single period
#'
#' @keywords internal
#' @noRd
fixed_t <- function(p0,p1,q){
  return(sum(p1*q)/sum(p0*q))
}

#' fisher_t
#'
#' compute a bilateral Fisher index for a single period
#'
#' @keywords internal
#' @noRd
fisher_t <- function(p0,p1,q0,q1){
  las <- fixed_t(p0,p1,q0)
  pas <- fixed_t(p0,p1,q1)
  return(sqrt((las*pas)))
}

#' tornqvist_t
#'
#' compute a bilateral Tornqvist index for a single period
#'
#' @keywords internal
#' @noRd
tornqvist_t <- function(p0,p1,q0,q1){
  exp0 <- sum(p0*q0)
  exp1 <- sum(p1*q1)
  s0 <- (p0*q0)/exp0
  s1 <- (p1*q1)/exp1
  return(prod((p1/p0)^(0.5*(s0+s1))))
}

#' Sato-Vartia index
#'
#' compute a bilateral Sato-Vartia index
#' @keywords internal
#' @noRd
satoVartia_t <- function(p0,p1,q0,q1){
  exp0 <- sum(p0*q0)
  exp1 <- sum(p1*q1)
  s0 <- (p0*q0)/exp0
  s1 <- (p1*q1)/exp1
  wstar <- (s1-s0)/(log(s1)-log(s0))
  w <- wstar/sum(wstar)
  return(exp(sum(w*log(p1/p0))))
}

#' Walsh index
#'
#' compute a bilateral Walsh index
#'
#' @keywords internal
#' @noRd
walsh_t <- function(p0,p1,q0,q1){
  return(sum(p1*sqrt(q0*q1))/sum(p0*sqrt(q0*q1)))
}

#' Lloyd-Moulton index, period 0 share
#'
#' @keywords internal
#' @noRd
lloydMoulton_t0 <- function(p0,p1,q,sigma){
  e <- sum(p0*q)
  s0 <- (p0*q)/e
  return(sum((s0*((p1/p0)^(1-sigma))))^(1/(1-sigma)))
}

#' Lloyd-Moulton index, current period share
#'
#' @keywords internal
#' @noRd
lloydMoulton_tc <- function(p0,p1,q,sigma){
  e <- sum(p1*q)
  s1 <- (p1*q)/e
  return(sum((s1*((p1/p0)^-(1-sigma))))^(-1/(1-sigma)))
}

#' Geometric laspeyres index
#'
#' @keywords internal
#' @noRd
geomLaspeyres_t <- function(p0, p1, q0, q1){
  exp0 <- sum(p0*q0)
  s0 <- (p0*q0)/exp0
  return(prod((p1/p0)^s0))
}

#' Geometric paasche index
#'
#' @keywords internal
#' @noRd
geomPaasche_t <- function(p0, p1, q0, q1){
  exp1 <- sum(p1*q1)
  s1 <- (p1*q1)/exp1
  return(prod((p1/p0)^s1))
}

#' time product dummy
#'
#' @keywords internal
#' @noRd
tpd_t <- function(p0, p1, q0, q1, prodID0, prodID1, biasAdjust, weights){

  exp0 <- sum(p0*q0)
  exp1 <- sum(p1*q1)

  switch(weights,
         "shares" = {s0 <- (p0*q0)/exp0
                     s1 <- (p1*q1)/exp1},
         "average" = {s0Initial <- (p0*q0)/exp0
                     s1Initial <- (p1*q1)/exp1

                     df <- merge(data.frame(prodID = prodID0, s0 = s0Initial),
                                 data.frame(prodID = prodID1, s1 = s1Initial),
                                 all = TRUE)
                     df$average <- 0.5*(ifelse(is.na(df$s0), 0, df$s0) + ifelse(is.na(df$s1), 0, df$s1))

                     s0 <- df$average[!is.na(df$s0)]
                     s1 <- df$average[!is.na(df$s1)]

                     },
         "unweighted" = {s0 <- rep(NA, length(p0))
                         s1 <- rep(NA, length(p1))}
  )

  df1 <- data.frame(lnP = log(p1),
                    D = factor(rep("1", length(p1)), levels = c("0", "1")),
                    s = s1,
                    product = as.factor(prodID1))

  df0 <- data.frame(lnP = log(p0),
                    D = factor(rep("0", length(p0)), levels = c("0", "1")),
                    s = s0,
                    product = as.factor(prodID0))

  regData <- rbind(df0, df1)

  if(weights == "unweighted"){
    reg <- stats::lm(lnP ~ D + product, data = regData)
  } else {
    reg <- stats::lm(lnP ~ D + product, weights = regData$s, data = regData)
  }

  if(biasAdjust){
    coeffs <- kennedyBeta(reg)
  }
  else {
    coeffs <- stats::coef(reg)
  }

  b <- coeffs[which(names(coeffs) == "D1")]

  return(exp(b))
}

#' Geary-Khamis bilateral
#'
#' @keywords internal
#' @noRd
gk_t <- function(p0, p1, q0, q1){

  h <- 2/(1/q0+1/q1)
  return(sum(p1*h)/sum(p0*h))

}

#' Drobish bilateral index
#'
#' @keywords internal
#' @noRd
drobish_t <- function(p0, p1, q0, q1){

  return((fixed_t(p0,p1,q0) + fixed_t(p0,p1,q1))/2)

}

#' Stuvel bilateral index
#'
#' @keywords internal
#' @noRd
stuvel_t <- function(p0, p1, q0, q1){

  exp0 <- sum(p0*q0)
  exp1 <- sum(p1*q1)

  PL <- fixed_t(p0, p1, q0) # laspeyres price index
  QL <- fixed_t(q0, q1, p0) # laspeyres quantity index

  A <- 0.5*(PL-QL)

  return(A + (A^2 + exp1/exp0)^0.5)

}

#' Marshall-Edgeworth bilateral index
#'
#' @keywords internal
#' @noRd
marshallEdgeworth_t <- function(p0, p1, q0, q1){

  return(sum(p1*(q0+q1))/sum(p0*(q0+q1)))

}

#' Palgrave bilateral index
#'
#' @keywords internal
#' @noRd
palgrave_t <- function(p0, p1, q1){

  exp1 <- sum(p1*q1)
  s1 <- (p1*q1)/exp1

  return(sum(s1*(p1/p0)))

}

#' Young bilateral index
#'
#' @keywords internal
#' @noRd
young_t <- function(p0, p1, pb, qb){

  expb <- sum(pb*qb)
  sb <- (pb*qb)/expb

  return(sum(sb*p1/p0))

}

#' Computes a bilateral price index
#'
#' A function to compute a price index given data on products over time
#'
#' @param x A dataframe containing price, quantity, a time period identifier
#' and a product identifier. It must have column names.
#' @param pvar A character string for the name of the price variable
#' @param qvar A character string for the name of the quantity variable. For
#' elementary indexes a quantity variable is not required for the calculations
#' and you must specify qvar = "".
#' @param prodID A character string for the name of the product identifier
#' @param pervar A character string for the name of the time variable. This variable
#' must contain integers starting at period 1 and increasing in increments of 1 period.
#' There may be observations on multiple products for each time period.
#' @param indexMethod A character string to select the index number method. Valid index
#' number methods are dutot, carli, jevons, laspeyres, paasche, fisher, cswd,
#' harmonic, tornqvist, satovartia, walsh, CES, geomLaspeyres, geomPaasche, tpd,
#' Geary-Khamis (gk), drobish, palgrave, stuvel, marshalledgeworth.
#' @param sample A character string specifying whether a matched sample
#' should be used.
#' @param output A character string specifying whether a chained (output="chained")
#' , fixed base (output="fixedbase") or period-on-period (output="pop")
#' price index numbers should be returned. Default is period-on-period.
#' @param chainMethod A character string specifying the method of chain linking
#' to use if the output option is set to "chained".
#' Valid options are "pop" for period-on-period, and similarity chain linked
#' options "plspread" for the Paasche-Laspeyres spread, "asymplinear" for
#' weighted asymptotically linear, "logquadratic" for the weighted log-quadratic,
#' and "mixScale" for the mix, scale or absolute dissimilarity measures,
#' or "predictedshare" for the predicted share relative price dissimilarity.
#' The default is period-on-period. Additional parameters can be passed to the
#' mixScaleDissimilarity function using \code{...}
#' @param sigma The elasticity of substitution for the CES index method.
#' @param basePeriod The period to be used as the base when 'fixedbase' output is
#' chosen. Default is 1 (the first period).
#' @param biasAdjust whether to adjust for bias in the coefficients in the bilateral
#' TPD index. The default is TRUE.
#' @param weights the type of weighting for the bilateral TPD index. Options are
#' "unweighted" to use ordinary least squares, "shares" to use weighted least squares
#' with expenditure share weights, and "average" to use weighted least squares
#' with the average of the expenditure shares over the two periods.
#' @param loweYoungBase the period used as the base for the lowe or
#' young type indexes. The default is period 1. This can be a vector of values to
#' use multiple periods. For example, if the data are monthly and start in January, specifying
#' 1:12 will use the first twelve months as the base.
#' @param imputePrices the type of price imputation to use for missing prices.
#' Currently only "carry" is supported to used carry-forward/carry-backward prices.
#' Default is NULL to not impute missing prices.
#' @param ... this is used to pass additional parameters to the mixScaleDissimilarity
#' function.
#' @examples
#' # period-on-period Laspeyres index for the CES_sigma_2 dataset
#' priceIndex(CES_sigma_2, pvar="prices", qvar="quantities", pervar="time",
#' prodID = "prodID", indexMethod = "laspeyres")
#'
#' # chained Fisher index
#' priceIndex(CES_sigma_2, pvar="prices", qvar="quantities", pervar="time",
#' prodID = "prodID", indexMethod = "fisher", output="chained")
#'
#' # chained Tornqvist index, with linking periods chosen by the
#' # weighted log-quadratic dissimilarity measure
#' priceIndex(CES_sigma_2, pvar="prices", qvar="quantities", pervar="time",
#' prodID = "prodID", indexMethod = "tornqvist", output="chained",
#' chainMethod = "logquadratic")
#' @export
priceIndex <- function(x, pvar, qvar, pervar, indexMethod = "laspeyres", prodID,
                       sample = "matched", output = "pop", chainMethod = "pop",
                       sigma = 1.0001, basePeriod = 1, biasAdjust = TRUE,
                       weights = "average", loweYoungBase = 1,
                       imputePrices = NULL, ...){

  # check that a valid method is chosen
  validMethods <- c("dutot","carli","jevons","harmonic","cswd","laspeyres",
                    "paasche","fisher","tornqvist","satovartia","walsh","ces",
                    "geomlaspeyres", "geompaasche", "tpd", "gk", "drobish",
                    "stuvel", "marshalledgeworth", "palgrave", "lowe", "young")

  if(!(tolower(indexMethod) %in% validMethods)){
    stop("Not a valid index number method.")
  }

  # check that a valid output type is chosen
  validOutput <- c("chained","pop","fixedbase")
  if(!(tolower(output) %in% validOutput)){
    stop("Not a valid output type. Please choose from chained, fixedbase or pop.")
  }

  # check that valid weights are given
  validWeights <- c("unweighted", "average", "shares")
  if(!(tolower(weights) %in% validWeights)){
    stop("Not a valid weight type. Please choose from unweighted, shares or average.")
  }

  # check valid column names are given
  colNameCheck <- checkNames(x, c(pvar, qvar, pervar, prodID))
  if(colNameCheck$result == FALSE){
    stop(colNameCheck$message)
  }

  # check valid chainMethod given
  validChainMethods <- c("pop", "plspread", "asymplinear", "logquadratic", "mixscale",
                         "predictedshare")
  if(!chainMethod %in% validChainMethods){
    stop("Not a valid chainMethod. Please choose from", paste(validChainMethods, collapse = ", "))
  }

  # check column types
  x <- checkTypes(x, pvar, qvar, pervar)

  # check that the time period variable is continuous
  timeCheck <- isContinuous(x[[pervar]])
  if(timeCheck$result == FALSE){
    stop(paste("The time period variable is not continuous.",
                "Missing periods:", timeCheck$missing))
  }

  # check that data are unique by time and product ID
  tpCheck <- checkTimeProdUnique(x, pervar, prodID)
  if(tpCheck$result == FALSE){
    stop("Products must only have one observation for each time period. If you have multiple observations on products for one or more time periods, combine this information using unitValues() or another method before calculating the price index.")
  }

  # apply price imputation
  if(!is.null(imputePrices)){
    switch(imputePrices,
           "carry" = {x <- imputeCarryPrices(x, pvar, qvar, pervar, prodID)},
           stop("Invalid imputePrices argument"))
  }

  # sort the dataset by time period and product ID
  x <- x[order(x[[pervar]], x[[prodID]]),]

  # initialise some things
  n <- max(x[[pervar]],na.rm = TRUE)
  plist <- matrix(1, nrow = n, ncol = 1)
  naElements <- character()

  # if similarity chaining was requested, get the similarity measure
  if(tolower(output)=="chained" & !(tolower(chainMethod)=="pop")){
    switch(tolower(chainMethod),
           plspread = {similarityMatrix <- relativeDissimilarity(x,pvar=pvar,qvar=qvar,
                                                                 pervar=pervar,prodID=prodID,
                                                                 similarityMethod = "plspread")},
           logquadratic = {similarityMatrix <- relativeDissimilarity(x,pvar=pvar,qvar=qvar,
                                                                     pervar=pervar,prodID=prodID,
                                                                     similarityMethod = "logquadratic")},
           asymplinear = {similarityMatrix <- relativeDissimilarity(x,pvar=pvar,qvar=qvar,
                                                                    pervar=pervar,prodID=prodID,
                                                                    similarityMethod = "asymplinear")},
           mixscale = {similarityMatrix <- mixScaleDissimilarity(x,pvar=pvar,qvar=qvar,
                                                                 pervar=pervar,prodID=prodID,
                                                                 ...)},
           predictedshare = {similarityMatrix <- relativeDissimilarity(x, pvar, qvar,
                                                                       pervar, prodID,
                                                                       similarityMethod = "predictedshare")})
    # use the similarity matrix to compute links
    links <- maximumSimilarityLinks(similarityMatrix)
  }

  for(i in 1:n){

    if(i == basePeriod){
      plist[i,1] <- 1
      next
    }

    # if fixed base requested, set xt0 to the first period data
    if(tolower(output) == "fixedbase"){
      xt0 <- x[x[[pervar]] == basePeriod,]
    }

    # if lowe index method chosen then set xtb to the loweYoungBase period
    if(tolower(indexMethod) %in% c("lowe", "young")){
      xtb <- x[x[[pervar]] %in% loweYoungBase,]
    }

    # if chained or period-on-period requested then set xt0
    # to the previous period
    if(tolower(output) == "chained" & tolower(chainMethod)=="pop" |
       tolower(output) == "pop"){
      xt0 <- x[x[[pervar]]==i-1,]
    }
    # if similarity linking requested set xt0 to link period
    else if(tolower(output) == "chained" & !(tolower(chainMethod) == "pop")){
      xt0 <- x[x[[pervar]]==links[links$xt==i,2],]
    }
    # otherwise set xt1 to current period data
    xt1 <- x[x[[pervar]]==i,]

    # if matching requested then remove unmatched items
    if(sample=="matched"){

      # remove unmatched products
      xt1 <- xt1[xt1[[prodID]] %in% unique(xt0[[prodID]]),]
      xt0 <- xt0[xt0[[prodID]] %in% unique(xt1[[prodID]]),]

      # because the base period can differ from period 1 and 0 for lowe index
      if(tolower(indexMethod) %in% c("lowe", "young")){
        xt1 <- xt1[xt1[[prodID]] %in% unique(xtb[[prodID]]),]
        xt0 <- xt0[xt0[[prodID]] %in% unique(xtb[[prodID]]),]

        # for xtb we only need to intersect with one of xt0 or xt1 because they are the same set
        xtb <- xtb[xtb[[prodID]] %in% unique(xt1[[prodID]]),]
      }

    }

    # set the price index element to NA if there are no matches
    if(nrow(xt1)==0){
      plist[i,1] <- NA
      naElements <- paste0(naElements, i, sep = ",")
    }
    else{
      # set p and q
      p0 <- xt0[[pvar]]
      p1 <- xt1[[pvar]]
      q0 <- xt0[[qvar]]
      q1 <- xt1[[qvar]]

      if(tolower(indexMethod %in% c("lowe", "young"))){
        qb <- xtb[[qvar]]
        if(tolower(indexMethod == "young")){
          pb <- xtb[[pvar]]
        }
      }

      # compute the index
      switch(tolower(indexMethod),
             dutot = {plist[i,1] <- dutot_t(p0,p1)},
             carli = {plist[i,1] <- carli_t(p0,p1)},
             jevons = {plist[i,1] <- jevons_t(p0,p1)},
             harmonic = {plist[i,1] <- harmonic_t(p0,p1)},
             cswd = {plist[i,1] <- cswd_t(p0,p1)},
             laspeyres = {plist[i,1] <- fixed_t(p0,p1,q0)},
             paasche = {plist[i,1] <- fixed_t(p0,p1,q1)},
             lowe = {plist[i,1] <- fixed_t(p0,p1,qb)},
             young = {plist[i,1] <- young_t(p0,p1,pb,qb)},
             fisher = {plist[i,1] <- fisher_t(p0,p1,q0,q1)},
             tornqvist = {plist[i,1] <- tornqvist_t(p0,p1,q0,q1)},
             satovartia = {plist[i,1] <- satoVartia_t(p0,p1,q0,q1)},
             walsh = {plist[i,1] <- walsh_t(p0,p1,q0,q1)},
             ces = {plist[i,1] <- lloydMoulton_t0(p0,p1,q0,sigma = sigma)},
             geomlaspeyres = {plist[i,1] <- geomLaspeyres_t(p0, p1, q0, q1)},
             geompaasche = {plist[i,1] <- geomPaasche_t(p0, p1, q0, q1)},
             tpd = {plist[i,1] <- tpd_t(p0, p1, q0, q1, xt0[[prodID]], xt1[[prodID]], biasAdjust, weights)},
             gk = {plist[i,1] <- gk_t(p0, p1, q0, q1)},
             drobish = {plist[i,1] <- drobish_t(p0, p1, q0, q1)},
             stuvel = {plist[i,1] <- stuvel_t(p0, p1, q0, q1)},
             marshalledgeworth = {plist[i,1] <- marshallEdgeworth_t(p0, p1, q0, q1)},
             palgrave = {plist[i,1] <- palgrave_t(p0, p1, q1)}
      )

      # if similarity chain linking then multiply the index by the link period index
      if(tolower(output) == "chained" & !(tolower(chainMethod) == "pop")){
        plist[i,1] = plist[i,1]*plist[links[links$xt==i,2],1]
      }
    }
  }

  if(tolower(output) == "chained" & tolower(chainMethod)=="pop"){
    result <- apply(plist,2,cumprod)
  }
  else{
    result <- plist
  }

  if(length(naElements)>0){
    warning(paste0("The following elements of the index were set to NA because there were no matched products in the two comparison periods: ", naElements))
  }

  return(result)
}

#' Computes a bilateral quantity index
#'
#' A function to compute a quantity index given data on products over time
#'
#' @inheritParams priceIndex
#' @examples
#' # chained Fisher quantity index for the CES_sigma_2 dataset
#' quantityIndex(CES_sigma_2, pvar="prices", qvar="quantities", pervar="time",
#' prodID = "prodID", indexMethod = "fisher", output="chained")
#' @export
quantityIndex <- function(x, pvar, qvar, pervar, indexMethod = "laspeyres", prodID,
                          sample = "matched", output = "pop", chainMethod = "pop",
                          sigma = 1.0001, basePeriod = 1, biasAdjust = TRUE,
                          weights = "average", loweYoungBase = 1,
                          imputePrices = NULL, ...){
  return(priceIndex(x,
                    pvar = qvar,
                    qvar = pvar,
                    pervar = pervar,
                    indexMethod = indexMethod,
                    prodID = prodID,
                    sample = sample,
                    output = output,
                    chainMethod = chainMethod,
                    sigma = sigma,
                    basePeriod = basePeriod,
                    biasAdjust = biasAdjust,
                    weights = weights,
                    ... = ...))
}