R/distance_measures.R

Defines functions VonMisesDist TestVonMisesDist KSdist TestKSdist SignPow Example BoxCox FitGoodnessFcn EstimBoxCox QskewFcn QskewAbs SkewnessAbs TestEstimBoxCox TestEstimBoxCox2 TotalVar_sampleCategData TestTotalVar_sampleCategData TotalVar_sampleCategData_sigTest TestTotalVar_sampleCategData_sigTest

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# authors: Alireza Hosseini, Reza Hosseini

### vonMises distance measures
# F, G distributions
# x, y empirical data
# this function is NOT GREAT due to asymetry with wrt to F
# also because we need a very good grid
# with good coverage to get a good density

VonMisesDist = function(
    Func1=NULL,
    Func2=NULL,
    x=NULL, 
    y=NULL,
    DensF1=NULL,
    DensF2=NULL,
    intGrid=seq(-100, 100, 0.1),
    ordLP=2) {

  if (is.null(Func1)) {
    Func1 = ecdf(x)
  }
  if (is.null(Func2)) {
    Func2 = ecdf(y)
  }

  if (!is.null(x) & !is.null(y)) {
    z = c(x, y)
    z = unique(z)
    intGrid = sort(z)
  }

  n = length(intGrid)
  intGrid2 = c(intGrid[1], intGrid[-n])
  deltaGrid = intGrid - intGrid2
  deltaGrid[1] = median(deltaGrid)

  dVec = abs(Func1(intGrid) - Func2(intGrid))^ordLP

  if (!is.null(DensF)) {
    dens = DensF1(intGrid)
  } else if (!is.null(DensG)) {
    dens = DensF2(intGrid)
  } else {
     dens = (Func1(intGrid) - Func2(intGrid2)) / deltaGrid
  }

  Mark(sum(dens * deltaGrid), 'dens sum')
  out = sum(dVec * dens * deltaGrid)/sum(dens * deltaGrid)
  return(out)
}


TestVonMisesDist = function() {
  x = 1:100
  Func1 = ecdf(x)
  y = 2:101
  Func2 = ecdf(y)
  # InstallIf('distrEx')
  # library('distrEx')
  # CvMDist(e1=x, e2=y)
  out = VonMisesDist(Func1=Func1, Func2=Func2)
  VonMisesDist(x=x, y=y)

  ss= 500
  x = rnorm(ss)
  y = rnorm(ss)
  VonMisesDist(x=x, y=y)

  x = rnorm(ss)
  y = rnorm(ss, mean=2)
  VonMisesDist(x=x, y=y)

  x = rnorm(ss)
  y = rnorm(ss, mean=5)
  VonMisesDist(x=x, y=y)

  x = rnorm(100)
  y = rnorm(100, mean=0)
  VonMisesDist(x=x, y=y)

  x = rnorm(100)
  y = rnorm(100, sd=2)
  VonMisesDist(x=x, y=y)

  x = rnorm(100)
  y = rnorm(100, sd=1/2)
  VonMisesDist(x=x, y=y)

  x = rnorm(200)
  y = runif(200, min=-1, max=1)
  VonMisesDist(x=x, y=y)

  x = rnorm(200)
  y = runif(200, min=0, max=1)
  VonMisesDist(x=x, y=y)

  x = rnorm(200)
  y = runif(200, min=-1, max=0)
  VonMisesDist(x=x, y=y)

  G = function(m1, s1, m2, s2, n1=1000, n2=500) {
    x = rnorm(n1, mean=m1, sd=s1)
    y = rnorm(n2, mean=m2, sd=s2)
    print(VonMisesDist(x=x, y=y, ordLP=1))


    Func1 = function(x) {
      pnorm(x, mean=m1, sd=s1)
    }

    Func2 = function(x) {
      pnorm(x, mean=m2, sd=s2)
    }
    print(VonMisesDist(Func1=Func1, Func2=Func2, ordLP=1))
  }

  G(m1=0, s1=1, m2=1, s2=1, n1=1000, n2=500)
}

## Kolmogorov Smirnoff distance
KSdist = function(
    Func1=NULL,
    Func2=NULL,
    x=NULL,
    y=NULL,
    intGrid=seq(-100, 100, 0.01)) {

  if (is.null(Func1)) {
    Func1 = ecdf(x)
  }
  if (is.null(Func2)) {
    Func2 = ecdf(y)
  }

  if (!is.null(x) & !is.null(y)) {
    z = c(x, y)
    z = unique(z)
    intGrid = sort(z)
  }


  dVec = abs(Func1(intGrid) - Func2(intGrid))
  return(max(dVec))
}

TestKSdist = function() {
  x = rnorm(1000, mean=0, sd=1)
  y = rnorm(50000, mean=2, sd=2)
  KSdist(x=x, y=y)
}

## define a power function
# R cannot handle powers such as 1/3
# for negative numbers
SignPow = function(x, pow) {
  sign(x)*(abs(x)^pow)
}

Example = function() {
  x = -10:10
  y = SignPow(x, pow=1/3)
  plot(x, y)

  y = SignPow(x, pow=1/2)
  plot(x, y)

  y = SignPow(x, pow=1/4)
  plot(x, y)
}

## box-cox transform
# we allow for a shift as well as the power
BoxCox = function(x, pow, shift) {
  SignPow((x + shift), pow)
}

## distance to a given fitted dist
# the default is to a fitted normal
FitGoodnessFcn = function(DistFcn) {

  function(x, FitCdf=NULL) {
    if (is.null(FitCdf)) {
      FitCdf = function(z) {
        pnorm(z, mean=mean(x), sd=sd(x))
      }
    }

    G = ecdf(x)
    F = FitCdf
    return(DistFcn(F=F, G=G))
  }
}

## this function fimds the optimal parameters of boxcox
# which give the closest distribution to normal
# which is the default of FitGoodness
# x is data
# prefCoef is used when we try to
# calculate preference between box parameters
# prefCoef[1]: this favors small deviations for power from 1
# prefCoef[2]: also favors small deviations for shift from 0
# prefCoef[3]: favors small values fo rthe OptCrit
EstimBoxCox = function(
    x,
    OptCrit,
    powRange=seq(1/4, 4, 1/4),
    shiftRange=NULL,
    shiftIt=TRUE,
    shiftGridNum=20,
    prefCoef=c(0, 0, 1)) {

  param = c(1, 0)
  x = na.omit(x)
  d = OptCrit(x)
  dPref = d
  mu = mean(x, na.rm=TRUE)
  sig = sd(x, na.rm=TRUE)

  lower = quantile(x, 0.1)
  upper = quantile(x, 0.9)
  delta = (upper - lower)/shiftGridNum
  if (is.null(shiftRange)) {
    shiftRange=seq(lower, upper, delta)
  }
  # print(shiftRange)
  if (!shiftIt) {
    shiftRange=0
  }

  n = length(powRange)
  m = length(shiftRange)
  for (i in 1:n) {
    for (j in 1:m) {
      pow = powRange[i]
      shift = shiftRange[j]
      y = BoxCox(x, pow, shift)
      newD = OptCrit(y)
      newD_pref = (abs(pow - 1)*prefCoef[1] +
        abs(shift)*prefCoef[2] + newD*prefCoef[3])
      if (newD_pref < d) {
        param = c(pow, shift)
        d = newD
        dPref = newD_pref
        transX = y
      }
    }
  }
  out = list(value=OptCrit(x), newValue=d, param=param, transX=transX)
  return(out)
}

QskewFcn = function(pDelta) {
  function(x) {
    numer = quantile(x, pDelta) + quantile(x, 1 - pDelta) -2 * median(x)
    denom = quantile(x, pDelta) - quantile(x, 1 - pDelta)
    return(numer / denom)
  }
}

Qskew = QskewFcn(pDelta=1/4)
QskewAbs = function(x) {
  abs(Qskew(x))
}

SkewnessAbs = function(x) {
  abs(skewness(x))
}

TestEstimBoxCox = function(x, OptCrit) {
  x = 1:500
  x = rgamma(500, shape=1)
  res = EstimBoxCox(
    x=x,
    OptCrit=FitGoodnessFcn(KSdist),
    powRange=seq(1/50, 5, 1/50))

  param = res[['param']]

  print('final')
  print(param)

  y = res[['transX']]
  pow = param[1]
  shift = param[2]
  y = BoxCox(x, pow, shift)

  value = res[['value']]
  newValue = res[['newValue']]
  value = signif(value, 2)
  newValue = signif(newValue, 2)
  param = signif(param, 2)

  main1 = paste('value:',value)
  main2 = paste('value:',newValue,' param:',param[1],',',param[2],sep='')

  par(mfrow=c(1, 2))
  hist(x, main=main1, col=ColAlpha('red', 0.5))
  hist(y, main=main2, add=FALSE, col=ColAlpha('blue', 0.5))
  print(summary(y))
  return(list(x=x, y=y, param=param))
}



TestEstimBoxCox2 = function() {
  G = function(x) {
    par(mfrow=c(3, 2))
    Func = EstimBoxCox
    out = Func(x, OptCrit=FitGoodnessFcn(KSdist))
    out = Func(x, OptCrit=SkewnessAbs)
    out = Func(x, OptCrit=QskewAbs)
  }
  G = Example_BoxCoxVarious
  x = rnorm(100)^2
  out = G(x)

  x = rgamma(100, rate=1, shape=1)
  out = G(x)

  x = rcauchy(100, scale=0.5)
  out = G(x)

  u = rnorm(100)
  x = (u + 10)^3
  out = G(x)

  x = rchisq(100, df=10)
  out = G(x)

  x = rchisq(100, df=3)
  out = G(x)

  x = rchisq(100, df=3)
  out = G(x)
}

# Calculate total variation metric for categorical variables
# this operates on sampled data (rather than distbns)
TotalVar_sampleCategData = function(x, y) {

  xDf = data.frame(table(x))
  yDf = data.frame(table(y))

  xDf[ , "prop"] = xDf[ , "Freq"] / sum(xDf[ , "Freq"])
  yDf[ , "prop"] = yDf[ , "Freq"] / sum(yDf[ , "Freq"])
  xDf = xDf[ , c(1, 3)]
  yDf = yDf[ , c(1, 3)]

  colnames(xDf) = c("label", "xprop")
  colnames(yDf) = c("label", "yprop")

  mDf = merge(xDf, yDf, all=TRUE)
  mDf[is.na(mDf)] = 0

  dist = sum(abs(mDf[ , "xprop"] - mDf[ , "yprop"]))

  return(list("dist"=dist, "df"=mDf))
}


TestTotalVar_sampleCategData = function() {

  x = c("cat", "cat", "dog", "dog", "horse")
  y = c("cat", "dog", "horse", "zebra")
  TotalVar_sampleCategData(x, y)

}


TotalVar_sampleCategData_sigTest = function(x, y, bsNum=200) {

  Func = function(i) {
    x0 = sample(x, length(x), replace=TRUE)
    y0 = sample(y, length(y), replace=TRUE)
    TotalVar_sampleCategData(x0, y0)[["dist"]]
  }

  vec = sapply(FUN=Func, X=1:bsNum)
  value = TotalVar_sampleCategData(x, y)[["dist"]]
  ci = quantile(x=vec, probs=c(0.025, 0.0975))

  # ci under h0 for comparison
  Func_h0 = function(i) {
    x0 = sample(c(x, y), length(x), replace=TRUE)
    y0 = sample(c(x, y), length(y), replace=TRUE)
    TotalVar_sampleCategData(x0, y0)[["dist"]]
  }

  vec_h0 = sapply(FUN=Func_h0, X=1:bsNum)
  ci_h0 = quantile(x=vec_h0, probs=c(0.025, 0.0975))
  pValue = 1 - mean(value > vec_h0)

  r = sort(c(vec_h0, vec))
  breaks = seq(min(r), max(r), by=(max(r) - min(r))/10)

  hist(vec_h0, col=ColAlpha("blue", 0.2), breaks=breaks)
  hist(vec, col=ColAlpha("red", 0.2), add=TRUE)
  legend(
      "topright",
      col=c("blue", "red"),
      legend=c("random dist", "obs dist"),
      lwd=c(2, 2))

  return(list(
      "ci"=ci,
      "pValue"=pValue,
      "ci_h0"=ci_h0,
      "pValue"=pValue))
}

TestTotalVar_sampleCategData_sigTest = function() {

  x = c("cat", "cat", "dog", "dog", "horse")
  y = c("cat", "dog", "horse", "zebra")

  TotalVar_sampleCategData_sigTest(x, y)

  # simulation
  grid = seq(from=10, to=1000, by=10)
  ciDf = setNames(
    data.frame(
        matrix(ncol=6, nrow=0)),
        c(
            "sample_size",
            "ci_rand_lower",
            "ci_rand_upper",
            "ci_lower",
            "ci_upper",
            "p_value"))

  for (n in grid) {
    x = sample(c("dog", "cat", "horse", "cat"), n, replace=TRUE)
    y = sample(c("dog", "cat", "horse", "zebra"), n, replace=TRUE)
    res = TotalVar_sampleCategData_sigTest(x, y)
    ci = res[["ci"]]
    pValue = res[["pValue"]]
    ci_h0 = res[["ci_h0"]]


    ciDf[nrow(ciDf) + 1, ] = c(n, ci_h0, ci, pValue)
  }

  plot(
      ciDf[ , "sample_size"],
      ciDf[ , "p_value"],
      type="l",
      xlab="sample_size",
      ylab="p_value")

}
Reza1317/funcly documentation built on Feb. 5, 2020, 4:06 a.m.