#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# authors: Alireza Hosseini, Reza Hosseini
### vonMises distance measures
# F, G distributions
# x, y empirical data
# this function is NOT GREAT due to asymetry with wrt to F
# also because we need a very good grid
# with good coverage to get a good density
VonMisesDist = function(
Func1=NULL,
Func2=NULL,
x=NULL,
y=NULL,
DensF1=NULL,
DensF2=NULL,
intGrid=seq(-100, 100, 0.1),
ordLP=2) {
if (is.null(Func1)) {
Func1 = ecdf(x)
}
if (is.null(Func2)) {
Func2 = ecdf(y)
}
if (!is.null(x) & !is.null(y)) {
z = c(x, y)
z = unique(z)
intGrid = sort(z)
}
n = length(intGrid)
intGrid2 = c(intGrid[1], intGrid[-n])
deltaGrid = intGrid - intGrid2
deltaGrid[1] = median(deltaGrid)
dVec = abs(Func1(intGrid) - Func2(intGrid))^ordLP
if (!is.null(DensF)) {
dens = DensF1(intGrid)
} else if (!is.null(DensG)) {
dens = DensF2(intGrid)
} else {
dens = (Func1(intGrid) - Func2(intGrid2)) / deltaGrid
}
Mark(sum(dens * deltaGrid), 'dens sum')
out = sum(dVec * dens * deltaGrid)/sum(dens * deltaGrid)
return(out)
}
TestVonMisesDist = function() {
x = 1:100
Func1 = ecdf(x)
y = 2:101
Func2 = ecdf(y)
# InstallIf('distrEx')
# library('distrEx')
# CvMDist(e1=x, e2=y)
out = VonMisesDist(Func1=Func1, Func2=Func2)
VonMisesDist(x=x, y=y)
ss= 500
x = rnorm(ss)
y = rnorm(ss)
VonMisesDist(x=x, y=y)
x = rnorm(ss)
y = rnorm(ss, mean=2)
VonMisesDist(x=x, y=y)
x = rnorm(ss)
y = rnorm(ss, mean=5)
VonMisesDist(x=x, y=y)
x = rnorm(100)
y = rnorm(100, mean=0)
VonMisesDist(x=x, y=y)
x = rnorm(100)
y = rnorm(100, sd=2)
VonMisesDist(x=x, y=y)
x = rnorm(100)
y = rnorm(100, sd=1/2)
VonMisesDist(x=x, y=y)
x = rnorm(200)
y = runif(200, min=-1, max=1)
VonMisesDist(x=x, y=y)
x = rnorm(200)
y = runif(200, min=0, max=1)
VonMisesDist(x=x, y=y)
x = rnorm(200)
y = runif(200, min=-1, max=0)
VonMisesDist(x=x, y=y)
G = function(m1, s1, m2, s2, n1=1000, n2=500) {
x = rnorm(n1, mean=m1, sd=s1)
y = rnorm(n2, mean=m2, sd=s2)
print(VonMisesDist(x=x, y=y, ordLP=1))
Func1 = function(x) {
pnorm(x, mean=m1, sd=s1)
}
Func2 = function(x) {
pnorm(x, mean=m2, sd=s2)
}
print(VonMisesDist(Func1=Func1, Func2=Func2, ordLP=1))
}
G(m1=0, s1=1, m2=1, s2=1, n1=1000, n2=500)
}
## Kolmogorov Smirnoff distance
KSdist = function(
Func1=NULL,
Func2=NULL,
x=NULL,
y=NULL,
intGrid=seq(-100, 100, 0.01)) {
if (is.null(Func1)) {
Func1 = ecdf(x)
}
if (is.null(Func2)) {
Func2 = ecdf(y)
}
if (!is.null(x) & !is.null(y)) {
z = c(x, y)
z = unique(z)
intGrid = sort(z)
}
dVec = abs(Func1(intGrid) - Func2(intGrid))
return(max(dVec))
}
TestKSdist = function() {
x = rnorm(1000, mean=0, sd=1)
y = rnorm(50000, mean=2, sd=2)
KSdist(x=x, y=y)
}
## define a power function
# R cannot handle powers such as 1/3
# for negative numbers
SignPow = function(x, pow) {
sign(x)*(abs(x)^pow)
}
Example = function() {
x = -10:10
y = SignPow(x, pow=1/3)
plot(x, y)
y = SignPow(x, pow=1/2)
plot(x, y)
y = SignPow(x, pow=1/4)
plot(x, y)
}
## box-cox transform
# we allow for a shift as well as the power
BoxCox = function(x, pow, shift) {
SignPow((x + shift), pow)
}
## distance to a given fitted dist
# the default is to a fitted normal
FitGoodnessFcn = function(DistFcn) {
function(x, FitCdf=NULL) {
if (is.null(FitCdf)) {
FitCdf = function(z) {
pnorm(z, mean=mean(x), sd=sd(x))
}
}
G = ecdf(x)
F = FitCdf
return(DistFcn(F=F, G=G))
}
}
## this function fimds the optimal parameters of boxcox
# which give the closest distribution to normal
# which is the default of FitGoodness
# x is data
# prefCoef is used when we try to
# calculate preference between box parameters
# prefCoef[1]: this favors small deviations for power from 1
# prefCoef[2]: also favors small deviations for shift from 0
# prefCoef[3]: favors small values fo rthe OptCrit
EstimBoxCox = function(
x,
OptCrit,
powRange=seq(1/4, 4, 1/4),
shiftRange=NULL,
shiftIt=TRUE,
shiftGridNum=20,
prefCoef=c(0, 0, 1)) {
param = c(1, 0)
x = na.omit(x)
d = OptCrit(x)
dPref = d
mu = mean(x, na.rm=TRUE)
sig = sd(x, na.rm=TRUE)
lower = quantile(x, 0.1)
upper = quantile(x, 0.9)
delta = (upper - lower)/shiftGridNum
if (is.null(shiftRange)) {
shiftRange=seq(lower, upper, delta)
}
# print(shiftRange)
if (!shiftIt) {
shiftRange=0
}
n = length(powRange)
m = length(shiftRange)
for (i in 1:n) {
for (j in 1:m) {
pow = powRange[i]
shift = shiftRange[j]
y = BoxCox(x, pow, shift)
newD = OptCrit(y)
newD_pref = (abs(pow - 1)*prefCoef[1] +
abs(shift)*prefCoef[2] + newD*prefCoef[3])
if (newD_pref < d) {
param = c(pow, shift)
d = newD
dPref = newD_pref
transX = y
}
}
}
out = list(value=OptCrit(x), newValue=d, param=param, transX=transX)
return(out)
}
QskewFcn = function(pDelta) {
function(x) {
numer = quantile(x, pDelta) + quantile(x, 1 - pDelta) -2 * median(x)
denom = quantile(x, pDelta) - quantile(x, 1 - pDelta)
return(numer / denom)
}
}
Qskew = QskewFcn(pDelta=1/4)
QskewAbs = function(x) {
abs(Qskew(x))
}
SkewnessAbs = function(x) {
abs(skewness(x))
}
TestEstimBoxCox = function(x, OptCrit) {
x = 1:500
x = rgamma(500, shape=1)
res = EstimBoxCox(
x=x,
OptCrit=FitGoodnessFcn(KSdist),
powRange=seq(1/50, 5, 1/50))
param = res[['param']]
print('final')
print(param)
y = res[['transX']]
pow = param[1]
shift = param[2]
y = BoxCox(x, pow, shift)
value = res[['value']]
newValue = res[['newValue']]
value = signif(value, 2)
newValue = signif(newValue, 2)
param = signif(param, 2)
main1 = paste('value:',value)
main2 = paste('value:',newValue,' param:',param[1],',',param[2],sep='')
par(mfrow=c(1, 2))
hist(x, main=main1, col=ColAlpha('red', 0.5))
hist(y, main=main2, add=FALSE, col=ColAlpha('blue', 0.5))
print(summary(y))
return(list(x=x, y=y, param=param))
}
TestEstimBoxCox2 = function() {
G = function(x) {
par(mfrow=c(3, 2))
Func = EstimBoxCox
out = Func(x, OptCrit=FitGoodnessFcn(KSdist))
out = Func(x, OptCrit=SkewnessAbs)
out = Func(x, OptCrit=QskewAbs)
}
G = Example_BoxCoxVarious
x = rnorm(100)^2
out = G(x)
x = rgamma(100, rate=1, shape=1)
out = G(x)
x = rcauchy(100, scale=0.5)
out = G(x)
u = rnorm(100)
x = (u + 10)^3
out = G(x)
x = rchisq(100, df=10)
out = G(x)
x = rchisq(100, df=3)
out = G(x)
x = rchisq(100, df=3)
out = G(x)
}
# Calculate total variation metric for categorical variables
# this operates on sampled data (rather than distbns)
TotalVar_sampleCategData = function(x, y) {
xDf = data.frame(table(x))
yDf = data.frame(table(y))
xDf[ , "prop"] = xDf[ , "Freq"] / sum(xDf[ , "Freq"])
yDf[ , "prop"] = yDf[ , "Freq"] / sum(yDf[ , "Freq"])
xDf = xDf[ , c(1, 3)]
yDf = yDf[ , c(1, 3)]
colnames(xDf) = c("label", "xprop")
colnames(yDf) = c("label", "yprop")
mDf = merge(xDf, yDf, all=TRUE)
mDf[is.na(mDf)] = 0
dist = sum(abs(mDf[ , "xprop"] - mDf[ , "yprop"]))
return(list("dist"=dist, "df"=mDf))
}
TestTotalVar_sampleCategData = function() {
x = c("cat", "cat", "dog", "dog", "horse")
y = c("cat", "dog", "horse", "zebra")
TotalVar_sampleCategData(x, y)
}
TotalVar_sampleCategData_sigTest = function(x, y, bsNum=200) {
Func = function(i) {
x0 = sample(x, length(x), replace=TRUE)
y0 = sample(y, length(y), replace=TRUE)
TotalVar_sampleCategData(x0, y0)[["dist"]]
}
vec = sapply(FUN=Func, X=1:bsNum)
value = TotalVar_sampleCategData(x, y)[["dist"]]
ci = quantile(x=vec, probs=c(0.025, 0.0975))
# ci under h0 for comparison
Func_h0 = function(i) {
x0 = sample(c(x, y), length(x), replace=TRUE)
y0 = sample(c(x, y), length(y), replace=TRUE)
TotalVar_sampleCategData(x0, y0)[["dist"]]
}
vec_h0 = sapply(FUN=Func_h0, X=1:bsNum)
ci_h0 = quantile(x=vec_h0, probs=c(0.025, 0.0975))
pValue = 1 - mean(value > vec_h0)
r = sort(c(vec_h0, vec))
breaks = seq(min(r), max(r), by=(max(r) - min(r))/10)
hist(vec_h0, col=ColAlpha("blue", 0.2), breaks=breaks)
hist(vec, col=ColAlpha("red", 0.2), add=TRUE)
legend(
"topright",
col=c("blue", "red"),
legend=c("random dist", "obs dist"),
lwd=c(2, 2))
return(list(
"ci"=ci,
"pValue"=pValue,
"ci_h0"=ci_h0,
"pValue"=pValue))
}
TestTotalVar_sampleCategData_sigTest = function() {
x = c("cat", "cat", "dog", "dog", "horse")
y = c("cat", "dog", "horse", "zebra")
TotalVar_sampleCategData_sigTest(x, y)
# simulation
grid = seq(from=10, to=1000, by=10)
ciDf = setNames(
data.frame(
matrix(ncol=6, nrow=0)),
c(
"sample_size",
"ci_rand_lower",
"ci_rand_upper",
"ci_lower",
"ci_upper",
"p_value"))
for (n in grid) {
x = sample(c("dog", "cat", "horse", "cat"), n, replace=TRUE)
y = sample(c("dog", "cat", "horse", "zebra"), n, replace=TRUE)
res = TotalVar_sampleCategData_sigTest(x, y)
ci = res[["ci"]]
pValue = res[["pValue"]]
ci_h0 = res[["ci_h0"]]
ciDf[nrow(ciDf) + 1, ] = c(n, ci_h0, ci, pValue)
}
plot(
ciDf[ , "sample_size"],
ciDf[ , "p_value"],
type="l",
xlab="sample_size",
ylab="p_value")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.