# R/mvn.tests.r In mvnTest: Goodness of Fit Tests for Multivariate Normality

```#########################################
## the Anderson-Darling test
#########################################

{  ## This function calculates the value of the Anderson-Darling test
## and approximate p-value
## data - as n times p matrix or data frame, where p is dimension
if (!is.data.frame(data) && !is.matrix(data)) stop('data supplied must be either of class \"data frame\" or \"matrix\"')
if (dim(data)[2] < 2 || is.null(dim(data))) {stop("data dimesion has to be more than 1")}
if (dim(data)[1] < 3) {stop("not enough data for assessing mvn")}
data.name <- deparse(substitute(data))
xp <- as.matrix(data)
p <- dim(xp)[2]
n <- dim(xp)[1]
## getting MLEs...
s.mean <- colMeans(xp)
s.cov <- (n-1)/n*cov(xp)
s.cov.inv <- solve(s.cov) # inverse matrix of S (matrix of sample covariances)
D <- rep(NA,n) # vector of (Xi-mu)'S^-1(Xi-mu)...
for (j in 1:n)
D[j] <- t(xp[j,]-s.mean)%*%(s.cov.inv%*%(xp[j,]-s.mean))
D.or <- sort(D) ## get ordered statistics
Gp <- pchisq(D.or,df=p)
## getting the value of A-D test...
ind <- c(1:n)
an <- (2*ind-1)*(log(Gp[ind])+log(1 - Gp[n+1-ind]))
AD <- -n - sum(an) / n

if (qqplot) {   ## not checked yet....
xp <- scale(xp, scale = FALSE)
D <- xp %*%s.cov.inv%*% t(xp)
d <- diag(D)
r <- rank(d)
chi2q <- qchisq((r-0.5)/n,p)
plot(d, chi2q, pch = 19, main = "Chi-Square Q-Q Plot", xlab = "Squared Mahalanobis Distance", ylab = "Chi-Square Quantile")
abline(0, 1,lwd = 2, col = "black")
}

## getting the p-value...
N <- 1e4
U <- rep(0,N) ## initializing values of the AD test
for (i in 1:N) { ## loop through N reps
dat <- rmvnorm(n=n,mean=s.mean,sigma=s.cov) # generating n p-variate normal rv
mean1 <- colMeans(dat)
cov1 <- (n-1)/n*cov(dat)
cov.inv <- solve(cov1) # inverse matrix of S (matrix of sample covariances)
D <- rep(NA,n) # vector of (Xi-mu)'S^-1(Xi-mu)...
for (j in 1:n)
D[j] <- t(dat[j,]-mean1)%*%(cov.inv%*%(dat[j,]-mean1))
Gp <- pchisq(sort(D),df=p)
## getting the value of A-D test...
an <- (2*ind-1)*(log(Gp[ind])+log(1 - Gp[n+1-ind]))
U[i] <- -n - sum(an) / n
}

result

#################################################
##      Cramer-von Mises test                  ##
#################################################

CM.test<-function(data, qqplot=FALSE)
{  ##  This function calculates the value of the Cramer-von Mises test and
## approximate  p-value
## data - as n times p matrix or data frame, where p is dimension
if (!is.data.frame(data) && !is.matrix(data)) stop('data supplied must be either of class \"data frame\" or \"matrix\"')
if (dim(data)[2] < 2 || is.null(dim(data))) {stop("data dimesion has to be more than 1")}
if (dim(data)[1] < 3) {stop("not enough data for assessing mvn")}
data.name <- deparse(substitute(data))
xp <- as.matrix(data)
p <- dim(xp)[2]
n <- dim(xp)[1]
## getting MLEs...
s.mean <- colMeans(xp)
s.cov <- (n-1)/n*cov(xp)
s.cov.inv <- solve(s.cov) # inverse matrix of S (matrix of sample covariances)
D <- rep(NA,n) # vector of (Xi-mu)'S^-1(Xi-mu)...
for (j in 1:n)
D[j] <- t(xp[j,]-s.mean)%*%(s.cov.inv%*%(xp[j,]-s.mean))
D.or <- sort(D) ## get ordered statistics
Gp <- pchisq(D.or,df=p)
## getting the value of CM test...
ind <- c(1:n)
w <- (Gp - (2 * ind - 1) / (2 * n)) ^ 2
CM <- sum(w) + 1 / (12 * n)

if (qqplot) {   ## not checked yet....
xp <- scale(xp, scale = FALSE)
D <- xp %*%s.cov.inv%*% t(xp)
d <- diag(D)
r <- rank(d)
chi2q <- qchisq((r-0.5)/n,p)
plot(d, chi2q, pch = 19, main = "Chi-Square Q-Q Plot", xlab = "Squared Mahalanobis Distance", ylab = "Chi-Square Quantile")
abline(0, 1,lwd = 2, col = "black")
}

## getting the p-value...
N <- 1e4
U <- rep(0,N) ## initializing values of the AD test
for (i in 1:N) { ## loop through N reps
dat <- rmvnorm(n=n,mean=s.mean,sigma=s.cov) # generating n p-variate normal rv
mean1 <- colMeans(dat)
cov1 <- (n-1)/n*cov(dat)
cov.inv <- solve(cov1)
D <- rep(NA,n) # vector of (Xi-mu)'S^-1(Xi-mu)...
for (j in 1:n)
D[j] <- t(dat[j,]-mean1)%*%(cov.inv%*%(dat[j,]-mean1))
Gp <- pchisq(sort(D),df=p)
w <- (Gp - (2 * ind - 1) / (2 * n)) ^ 2
U[i] <-  sum(w) + 1 / (12 * n)
}
p.value <- (sum(U >= CM)+1)/(N+1)

result <- new("cm",CM=CM, p.value=p.value, data.name=data.name)
result
} ## CM.test

####################################################
## function to calculate the Doornik-Hansen (1994) test (Doornik and Hansen 1994)
####################################################

DH.test<-function(data,qqplot=FALSE){
##  This function calculates the Doornik-Hansen (1994) test statistic
## returns the test statistic and the approximate p-value
## data - as n times p matrix or data frame, where p is dimension
if (!is.data.frame(data) && !is.matrix(data)) stop('data supplied must be either of class \"data frame\" or \"matrix\"')
if (dim(data)[2] < 2 || is.null(dim(data))) {stop("data dimesion has to be more than 1")}
if (dim(data)[1] < 3) {stop("not enough data for assessing mvn")}
data.name <- deparse(substitute(data))
data <- as.matrix(data)

transform<-function(data)
{
x<-scale(data, scale=F)
x<-t(x)
s<-var(data)
v<-diag(s)^(-1/2)
v <- diag(v)
c.data<-v %*% s %*% v
values<- (tmp <- eigen(c.data, symmetric=T))\$values
vectors<-tmp\$vectors
lambda<-diag(values^(-.5))
trans<- vectors %*% lambda %*% t(vectors) %*% v %*% x
return(t(trans))
}

find.z1<-function(n,trans)

{
get.skew<-function(data)
{
get.m2<-function(data)
{
d<-dim(data)
n<-d[1]
p<-d[2]
mean.data<-apply(data,2,mean)
m2<-rep(0,p)
for(i in 1:p)
m2[i]<-(sum((data[,i]-mean.data[i])^2))/n
return(m2)
}

get.m3<-function(data)
{
d<-dim(data)
n<-d[1]
p<-d[2]
mean.data<-apply(data,2,mean)
m3<-rep(0,p)
for(i in 1:p)
m3[i]<- sum((data[,i]-mean.data[i])^3)/n
return(m3)
}

d<- dim(data)
n<-d[1]
p<-d[2]
m2<-get.m2(data)
m3<-get.m3(data)
b1<-rep(0,p)
for(i in 1:p)
b1[i]<-m3[i]/m2[i]^1.5
return(b1)
}
skew <-get.skew(trans)
B<- 3*(n^2+27*n-70)*(n+1)*(n+3)/((n-2)*(n+5)*(n+7)*(n+9))
w.2<- -1+ (2*(B-1))^.5
delta <- 1/log(sqrt(w.2))^.5
y <- skew * ((w.2-1)*(n+1)*(n+3)/(12*(n-2)))^.5
z1<-delta*log(y+(y^2+1)^.5)
return(z1)
}

find.z2<-function(n,trans)
{
get.kurt<-function(data)
{
get.m2<-function(data)
{
d<-dim(data)
n<-d[1]
p<-d[2]
mean.data<-apply(data,2,mean)
m2<-rep(0,p)
for(i in 1:p)
m2[i]<-sum((data[,i]-mean.data[i])^2)/n
return(m2)
}
get.m4<-function(data)
{
d<-dim(data)
n<-d[1]
p<-d[2]
mean.data<-apply(data,2,mean)
m4<-rep(0,p)
for(i in 1:p)
m4[i]<-sum((data[,i]-mean.data[i])^4)/n
return(m4)
}
d<- dim(data)
n<-d[1]
p<-d[2]
m2<-get.m2(data)
m4<-get.m4(data)
b2<-rep(0,p)
for(i in 1:p)
b2[i]<-m4[i]/m2[i]^2
return(b2)
}
get.skew<-function(data)
{
get.m2<-function(data)
{
d<-dim(data)
n<-d[1]
p<-d[2]
mean.data<-apply(data,2,mean)
m2<-rep(0,p)
for(i in 1:p)
m2[i]<-sum((data[,i]-mean.data[i])^2)/n
return(m2)
}

get.m3<-function(data)
{
d<-dim(data)
n<-d[1]
p<-d[2]
mean.data<-apply(data,2,mean)
m3<-rep(0,p)
for(i in 1:p)
m3[i]<-sum((data[,i]-mean.data[i])^3)/n
return(m3)
}

d<- dim(data)
n<-d[1]
p<-d[2]
m2<-get.m2(data)
m3<-get.m3(data)

b1<-rep(0,p)
for(i in 1:p)
b1[i]<-m3[i]/m2[i]^1.5
return(b1)
}

skew<-get.skew(trans)
kurt<-get.kurt(trans)
delta<-(n-3)*(n+1)*((n^2)+(15*n)-4)
a<- (n-2)*(n+5)*(n+7)*(n^2+27*n-70)/(6*delta)
c<- (n-7)*(n+5)*(n+7)*(n^2+2*n-5)/(6*delta)
k<-(n+5)*(n+7)*(n^3+37*n^2+11*n-313)/(12*delta)
alpha<-a + skew^2*c
chi<-(kurt-1-skew^2)*2*k
z2<-((chi/(2*alpha))^(1/3)-1+ 1/(9*alpha))*3*alpha^0.5
return(z2)
}

trans<-transform(data)
d<-dim(trans)
n<-d[1]
p<-d[2]
z1<-find.z1(n,trans)
z2<-find.z2(n,trans)
DH<-sum(t(z1)*z1)+sum(t(z2)*z2)
p.value<- 1-pchisq(DH,df=2*p)

if (qqplot) {   ## NOT checked yet....
xp <- scale(data, scale = FALSE)
Sa <- cov(xp)
D <- xp %*%solve(Sa)%*% t(xp)
d <- diag(D)
r <- rank(d)
chi2q <- qchisq((r-0.5)/n,p)
plot(d, chi2q, pch = 19, main = "Chi-Square Q-Q Plot", xlab = "Squared Mahalanobis Distance", ylab = "Chi-Square Quantile")
abline(0, 1,lwd = 2, col = "black")
}
result <- new("dh",DH = DH, p.value=p.value, data.name=data.name)
result
} ## DH.test

####################################################
## function to calculate the Royston test (Royston 1992)
####################################################

R.test<-function(data, qqplot=FALSE)
{   ## This function finds the equivalent degrees of freedom and calculates
## the Royston (1992) test and approximate p-value
## data - as n times p matrix or data frame, where p is dimension
if (!is.data.frame(data) && !is.matrix(data)) stop('data supplied must be either of class \"data frame\" or \"matrix\"')
if (dim(data)[2] < 2 || is.null(dim(data))) {stop("data dimesion has to be more than 1")}
if (dim(data)[1] < 3) {stop("not enough data for assessing mvn")}
data.name <- deparse(substitute(data))
data <- as.matrix(data)

find.coeff.new<-function(n)
{
## This function finds the coefficients used to obtain the values after applying Royston's normalizing transformation.
## n is the number of observations
## p is the number of variables
if(n <= 3) {
stop(" n is too small!")
}
else if(n >= 4 && n <=11) {
x = n
}
else if (n >= 12 && n <=2000){
x = log(n)
}
else{
stop("n is too large!")
}

if(n >= 4 && n <=11) {
gamma = -2.273 + 0.459 * x
}
else if(n >= 12 && n <= 2000) {
gamma = 0

}
else {
stop("n is not in the proper size range")
}
ret <- list(gamma = gamma)
if(n >= 4 && n <= 11) {
mu = 0.5440 - 0.39978 * x + 0.025054 * x^2 - 0.0006714 * x^3
}
else if(n >= 12 && n <= 2000) {
mu = -1.5861 - 0.31082 * x - 0.083751 * x^2 + 0.0038915 * x^3
}
else {
stop(" n is not in the proper size range")
}
ret\$mu <- mu
if(n >= 4 && n <= 11) {
sigma = exp(1.3822 - 0.77857 *x + 0.062767 * x^2 - 0.0020322 *x^3)
}
else if(n >= 12 && n <= 2000) {
sigma = exp(-0.4803 -0.082676 *x + 0.0030302 * x^2)
}
else {
stop("n is not in the proper size range")
}
ret\$sigma <- sigma
return(ret)
}

our.test.shapiro<-function(data, n, p)
{
## This function finds the values of Zj after the Royston transformation is applied
## data is the data file being analyzed, it is in matrix form
## n is the number of observations
## is the number of variables

W <- rep(0, p)

for(i in 1:p)   W[i] <- shapiro.test(data[, i])\$statistic
return(W)
}

r.trans.new<-function( n, p, a, b)
{
## This function normalized the Shapiro-Wilks statistic under Royston's Transformation
## data is the data set used, in matrix form
## n is the number of observations
## p is the number of variables

Z <- rep(0, p)
if (n >= 4 && n <= 11)
{
for(i in 1:p) Z[i] <- (-log(a\$gamma-log(1-b[i]))-a\$mu)/a\$sigma
}
else if (n >= 12 && n <= 2000)
{
for (i in 1:p) Z[i] <- (log(1-b[i]) + a\$gamma - a\$mu)/ a\$sigma
}
else {
stop ("WARNING : n is not in the proper range")
}
return(Z)
}

Royston<-function( n, p, z)
{
## This function finds the statistics to be compared for multivariate normality
## data is the data set to be analysed
## n is the number of observations
## p is the number of variables

G <- rep(0, p)
for(i in 1:p) G[i] <- qnorm((pnorm( - z[i]))/2)^2
return(G)
}

d <- dim(data)
n <- d[1]
p <- d[2]
a <- find.coeff.new(n)
b <- our.test.shapiro(data, n, p)
z <- r.trans.new( n, p, a, b)
r <- Royston( n, p, z)
u <- 0.715
v <- 0.21364+0.015124*log(n)^2-0.0018034*log(n)^3
la <- 5
corr.data <- cor(data)
new.corr <- corr.data^la * (1 - u  * (1 - corr.data)^u/v)
total <- sum(new.corr) - p
avg.corr <- total/(p^2 - p)
est.e <- p/(1 + (p - 1) * avg.corr)
## equivalent degrees of freedom
res <- list()
R <- est.e *sum(r)/p
p.value <- 1 - (pchisq(R, df = est.e))

if (qqplot) {   ## not checked yet....
xp <- scale(data, scale = FALSE)
Sa <- cov(xp)
D <- xp %*%solve(Sa)%*% t(xp)
d <- diag(D)
r <- rank(d)
chi2q <- qchisq((r-0.5)/n,p)
plot(d, chi2q, pch = 19, main = "Chi-Square Q-Q Plot", xlab = "Squared Mahalanobis Distance", ylab = "Chi-Square Quantile")
abline(0, 1,lwd = 2, col = "black")
}
result <- new("r",R=R,p.value=p.value, data.name=data.name)
result
} ## R.test

####################################################
## function to calculate the Henze-Zirkler test (Henze and Zirkler 1990)
####################################################

HZ.test<-function(data, qqplot=FALSE)
{   ## This function calculates the Henze-Zirkler test
## statistic for assessing  multivariate normality of a data set to be analyzed
## returns the value of the test and approximate p-value
## r is the desired alpha level of significance
## data - as n times p matrix or data frame, where p is dimension
if (!is.data.frame(data) && !is.matrix(data)) stop('data supplied must be either of class \"data frame\" or \"matrix\"')
if (dim(data)[2] < 2 || is.null(dim(data))) {stop("data dimesion has to be more than 1")}
if (dim(data)[1] < 3) {stop("not enough data for assessing mvn")}
data.name <- deparse(substitute(data))
data <- as.matrix(data)

loop4<- function(data)
{
n <- dim(data)[1]
s.inv <- solve(var(data))
X <- X.2 <- data %*% s.inv %*% t(data)
diag(X) <- 0
for(i in 1:(n-1)) {
for(j in (i+1):n)
X[j,i] <- X[i, j] <- X.2[i,i] - 2 * X.2[i,j] + X.2[j,j]
}
return(X)
}

Square<-function(data)
{
S <- var(data)
S.inv <- solve(S)
x.diff <- scale(data, scale = F)
square <- diag(x.diff %*% S.inv %*% t(x.diff))
return(square)
}

d <- dim(data)
n <- d[1]
p <- d[2]

Beta <- 1/sqrt(2) * ((2 * p + 1)/4)^(1/(p + 4)) * (n^(1/(p + 4)))

a <- var(data)
m <- dim(a)[1]
ranka <- qr(a)\$rank

if( ranka == m){

s <- loop4(data)
square <- Square(data)

T.beta <- n * (1/n^2 * sum(exp( -Beta^2/2 * s)) - 2 * ((1 + (
Beta^2))^( - p/2)) * (1/n) * (sum(exp( - (Beta^2/(2 * (1 +
Beta^2))) * square))) + ((1 + (2 * Beta^2))^( - p/2)))
}
else
{
T.beta <- n * 4
}
ans <- list(T = T.beta)
mu <- 1 - ((1 + 2 * Beta^2)^( - p/2)) * (1 + (p * Beta^2)/(1 + 2 *
Beta^2) + (p * (p + 2) *Beta^4)/(2 * (1 + 2 * Beta^2)^2))

w.beta <- (1 + Beta^2) * (1 + 3 * Beta^2)
sigma.sq <- 2 * (1 + 4 * Beta^2)^( - p/2) + 2 * (1 + 2 * Beta^2)^(- p) *
(1 + (2 * p * Beta^4)/(1 + (2 * Beta^2))^2 + (
3 * p * (p + 2) * Beta^8)/(4 * (1 + 2 * Beta^2)^4)) - 4 *
(w.beta^( - p/2)) * (1 + (3 * p * Beta^4)/(2 * w.beta) + (
p * (p + 2) * Beta^8)/(2 * w.beta^2))

p.mu<-log(sqrt(mu^4/(sigma.sq + mu^2)))

p.sigma<-sqrt(log((sigma.sq + mu^2)/mu^2))

p.value<- 1 - (plnorm(T.beta,p.mu,p.sigma))

if (qqplot) {   ## not checked yet....
xp <- scale(data, scale = FALSE)
Sa <- cov(xp)
D <- xp %*%solve(Sa)%*% t(xp)
d <- diag(D)
r <- rank(d)
chi2q <- qchisq((r-0.5)/n,p)
plot(d, chi2q, pch = 19, main = "Chi-Square Q-Q Plot", xlab = "Squared Mahalanobis Distance", ylab = "Chi-Square Quantile")
abline(0, 1,lwd = 2, col = "black")
}
result <- new("hz", HZ=T.beta, p.value=p.value, data.name=data.name)
result
} ## HZ.test
```

## Try the mvnTest package in your browser

Any scripts or data that you put into this service are public.

mvnTest documentation built on May 2, 2019, 2:44 p.m.