## R/np.deneqtest.R
## In np: Nonparametric Kernel Smoothing Methods for Mixed Data Types
##
## Documented in npdeneqtest

## Function that implements the multivariate density equality test
## described in Li, Q., E. Maasoumi, and J.S. Racine (2009), "A
## Nonparametric Test for Equality of Distributions with Mixed
## Categorical and Continuous Data," Journal of Econometrics, Volume
## 148, pp 186-200.

## Bootstrap test for equality of two multivariate densities.
##
## Arguments:
##   x, y         data frames holding the two samples; they must have
##                identical variable names.
##   bw.x, bw.y   bandwidths for x and y. Whichever one is NULL is
##                computed with npudensbw(dat=, ...); a bandwidth that
##                was supplied is always used as given.
##   boot.num     number of bootstrap replications (must be >= 9).
##   random.seed  value passed to set.seed() before resampling.
##   ...          forwarded to npudensbw() when bandwidths are computed.
##
## Value: the object returned by deneqtest(), built from the
## standardized (Tn) and unstandardized (In) statistics, their
## bootstrap replicates, the empirical P-values and boot.num.
##
## Side effects: prints bootstrap progress through the console helpers
## and calls set.seed(random.seed). If .Random.seed existed in the
## global environment beforehand, it is put back when the function
## exits, including when it exits through an error.

npdeneqtest <- function(x = NULL,
                        y = NULL,
                        bw.x = NULL,
                        bw.y = NULL,
                        boot.num = 399,
                        random.seed = 42,
                        ...) {

  ## Some testing of input values

  if(is.null(x) || is.null(y)) stop(" you must provide x and y data")
  if(!is.data.frame(x) || !is.data.frame(y)) stop(" x and y must be data frames")
  if(!identical(names(data.frame(x)),names(data.frame(y)))) stop(" data frames x and y must have identical variable names")
  if(boot.num < 9) stop(" number of bootstrap replications must be >= 9")

  ## Only compute the bandwidth(s) that were not supplied, so that a
  ## single user-provided bandwidth is never silently overwritten.

  if(is.null(bw.x)) bw.x <- npudensbw(dat=x,...)
  if(is.null(bw.y)) bw.y <- npudensbw(dat=y,...)

  ## Save seed prior to setting, and register its restoration with
  ## on.exit() so the caller's RNG stream is put back even if a later
  ## step fails. inherits=FALSE makes sure only the global-environment
  ## seed is looked at.

  if(exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
    save.seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
    on.exit(assign(".Random.seed", save.seed, envir = .GlobalEnv), add = TRUE)
  }

  set.seed(random.seed)

  ## First, define test statistic function. This will return the
  ## standardized (Tn) and unstandardized (In) test statistic.

  teststat <- function(x,y,bw.x,bw.y) {

    ## Get n1 and n2, number of rows in x and y

    n1 <- nrow(x)
    n2 <- nrow(y)

    ## Helper: total of the kernel sums for the x sample, the y sample
    ## and the cross term (x as training data, y as evaluation data),
    ## with the kernel raised to the power `pow`.

    kernel.totals <- function(pow) {
      within.x <- sum(npksum(txdat=x,
                             bws=bw.x,
                             kernel.pow=pow,
                             leave.one.out=TRUE,
                             bandwidth.divide=TRUE)$ksum)

      within.y <- sum(npksum(txdat=y,
                             bws=bw.y,
                             kernel.pow=pow,
                             leave.one.out=TRUE,
                             bandwidth.divide=TRUE)$ksum)

      ## The (y,x) cross sum is identical to the (x,y) one, so it is
      ## computed once and counted twice below.

      cross <- sum(npksum(txdat=x,
                          exdat=y,
                          bws=bw.x,
                          kernel.pow=pow,
                          leave.one.out=FALSE,
                          bandwidth.divide=TRUE)$ksum)

      list(x=within.x, y=within.y, xy=cross)
    }

    ## First, compute the In statistic

    s <- kernel.totals(1)
    In <- s$x/(n1*(n1-1))+s$y/(n2*(n2-1))-2*s$xy/(n1*n2)

    ## Next, compute sigma^2_n (same sums with the squared kernel)

    s <- kernel.totals(2)
    sigma2.n <- 2*(s$x/(n1^2*(n1-1)^2)+s$y/(n2^2*(n2-1)^2)+2*s$xy/(n1^2*n2^2))

    ## Finally, compute Tn, the standardized statistic

    list(Tn=In/sqrt(sigma2.n), In=In)

  } ## End of test statistic

  ## Now write a bootstrap function for the test statistic: both
  ## resamples are drawn with replacement from the pooled data.

  teststat.boot <- function(x,y,bw.x,bw.y) {
    n1 <- nrow(x)
    n2 <- nrow(y)
    z <- data.frame(rbind(x,y))
    ## drop=FALSE keeps a one-variable sample as a data frame with its
    ## original column name instead of collapsing it to a vector.
    x.bootstrap <- z[sample(nrow(z),size=n1,replace=TRUE), , drop=FALSE]
    y.bootstrap <- z[sample(nrow(z),size=n2,replace=TRUE), , drop=FALSE]
    teststat(x.bootstrap,y.bootstrap,bw.x,bw.y)
  }

  Tn.vector <- numeric(boot.num)
  In.vector <- numeric(boot.num)

  console <- newLineConsole()

  for(i in seq_len(boot.num)) {
    console <- printClear(console)
    console <- printPush(paste0("Bootstrap replication ",
                                i, "/", boot.num, "..."), console)
    output.boot <- teststat.boot(x,y,bw.x,bw.y)
    Tn.vector[i] <- output.boot$Tn
    In.vector[i] <- output.boot$In
  }

  console <- printClear(console)
  console <- printPop(console)

  ## Compute the test statistic on the original samples

  output <- teststat(x,y,bw.x,bw.y)

  ## Compute empirical P-values - the proportion of resampled
  ## statistics strictly larger than the original statistic

  Tn.P <- mean(Tn.vector > output$Tn)
  In.P <- mean(In.vector > output$In)

  deneqtest(Tn=output$Tn,
            In=output$In,
            Tn.bootstrap=Tn.vector,
            In.bootstrap=In.vector,
            Tn.P=Tn.P,
            In.P=In.P,
            boot.num=boot.num)

}