R/testHSIC.R

Defines functions plot.testHSIC print.testHSIC mat.vec double.centering meannondiag compute.mom.TrAW compute.mat.TrAW gammatest seqpermutation.test permutation.test HSIC.perm asymptotic.test check.sensiHSIC testHSIC

Documented in plot.testHSIC print.testHSIC testHSIC

### testHSIC ###################################################################

testHSIC <- function(sensi, 
                     test.method="Asymptotic",
                     B=3000, seq.options=list(criterion="screening", alpha=0.05, 
                                              Bstart=200, Bfinal=5000, Bbatch=100, Bconv=200, 
                                              graph=TRUE) 
                     ){
  
  ### inputs ###

  # sensi:            1 x ...     list of class sensiHSIC (with all data)
  # test.method:      1 x 1       character (method used to test independence)
  # B:                1 x 1       numeric   (number of permutations)
  # seq.options:      1 x ...     list      (options for the sequential test)

  ### output ###

  # x:                1 x ...     list of class testHSIC (with all results)

  ### useful lists #############################################################
  
  L <- available.material()

  ### For sensi ################################################################

  if(missing(sensi)){
    stop("You must specificy what sensi is.", call.=FALSE)
  }

  ### For test.method ##########################################################

  # test.method must belong to L$tests

  if(!is.character(test.method) | length(test.method)>1){
    stop("test.method must be a string.", call.=FALSE)
  }else{
    if(!(test.method %in% L$tests)){
      stop("Invalid test method. \n",
           "Available test methods include: ",
           paste0(L$tests, collapse=', '), ".", call.=FALSE)
    }
  }
  
  # two statistics can be used to test independence:
  # --> the numerator of the first-order HSIC-ANOVA index
  # --> the numerator of the total-order HSIC-ANOVA index
  
  if(stringr::str_detect(test.method, "^Tot_")){
    measure.def <- "TO.num" 
  }else{
    measure.def <- "HSICXY"
  }
  
  ### For sensi and test.method ################################################

  check.sensiHSIC(sensi, test.method)

  ### For B ####################################################################

  if(!is.numeric(B) | length(B)>1){
    stop("B must be a positive integer.", call.=FALSE)
  }else{
    if(B<=0){
      stop("B must be a positive integer.", call.=FALSE)
    }else{
      if(!(B%%1==0)){
        B <- ceiling(B)
        warning("B was not an integer. \n",
                "ceiling(B) has been taken instead.", call.=FALSE)
      }
    }
  }

  ### For seq.options ##########################################################

  ### minimum requirement ###

  # 1) seq.options must be a list of options
  # 2) seq.options must contain at least one element named "criterion"

  if(!is.list(seq.options)){
    stop("seq.options must be a list of options.", call.=FALSE)
  }else{
    if(is.null(seq.options$criterion)){
      stop("Criterion not defined. You must specify seq.options$criterion.", call.=FALSE)
    }
  }

  ### default values for other options ###

  if(is.null(seq.options$alpha)) seq.options$alpha <- 0.05
  if(is.null(seq.options$Bstart)) seq.options$Bstart <- 200
  if(is.null(seq.options$Bfinal)) seq.options$Bfinal <- 5000
  if(is.null(seq.options$Bbatch)) seq.options$Bbatch <- 100
  if(is.null(seq.options$Bconv)) seq.options$Bconv <- 200
  if(is.null(seq.options$graph)) seq.options$graph <- TRUE

  ### removal of undesired options in seq.options ###

  expected.seq <- c("criterion", "alpha",
                    "Bstart", "Bfinal", "Bbatch", "Bconv", "graph")

  found.seq <- names(seq.options)
  if(!(all(found.seq %in% expected.seq))){
    undesired.seq <- setdiff(found.seq, expected.seq)
    seq.options[undesired.seq] <- NULL
    warning("The list seq.options can only contain the following options: \n",
            paste0(expected.seq, collapse=', '), ". \n",
            "Other options have been removed.", call.=FALSE)
  }

  ### check of remaining options in seq.options ###

  # seq.options$criterion must be a string

  if(!is.character(seq.options$criterion) | length(seq.options$criterion)>1){
    stop("seq.options$criterion must be a string.", call.=FALSE)
  }else{
    if(!(seq.options$criterion %in% c("screening", "ranking", "both"))){
      seq.options$criterion <- "screening"
      warning("seq.options$criterion must be: screening, ranking or both. \n",
              "By default, seq.options$criterion=screening has been selected.", call.=FALSE)
    }
  }

  # seq.options$alpha must be a real number in [0,1]

  if(!is.numeric(seq.options$alpha) | length(seq.options$alpha)>1){
    stop("seq.options$alpha must be a real number between 0 and 1.", call.=FALSE)
  }else{
    if(seq.options$alpha<0 | seq.options$alpha>1){
      seq.options$alpha <- 0.05
      warning("seq.options$alpha was not between 0 and 1. \n",
              "By default, seq.options$alpha=0.05 has been selected.", call.=FALSE)
    }
  }

  # seq.options$graph must be boolean

  if(length(seq.options$graph)>1){
    stop("seq.options$graph must be a boolean.", call.=FALSE)
  }else{
    if(!(isTRUE(seq.options$graph) | isFALSE(seq.options$graph))){
      stop("seq.options$graph must be a boolean.", call.=FALSE)
    }
  }

  ### special focus on Bstart, Bfinal, Bbatch and Bconv ###

  # the hyperparameters are unpacked
  
  Bstart <- seq.options$Bstart
  Bfinal <- seq.options$Bfinal
  Bbatch <- seq.options$Bbatch
  Bconv <- seq.options$Bconv

  # Bstart must be a positive integer

  if(!is.numeric(Bstart) | length(Bstart)>1){
    stop("seq.options$Bstart must be a positive integer.", call.=FALSE)
  }else{
    if(Bstart<=0){
      stop("seq.options$Bstart must be a positive integer.", call.=FALSE)
    }else{
      if(!(Bstart%%1==0)){
        Bstart <- ceiling(Bstart)
        warning("seq.options$Bstart was not an integer. \n",
                "ceiling(seq.options$Bstart) has been taken instead.", call.=FALSE)
      }
    }
  }

  # Bfinal must be a positive integer

  if(!is.numeric(Bfinal) | length(Bfinal)>1){
    stop("seq.options$Bfinal must be a positive integer.", call.=FALSE)
  }else{
    if(Bfinal<=0){
      stop("seq.options$Bfinal must be a positive integer.", call.=FALSE)
    }else{
      if(!(Bfinal%%1==0)){
        Bfinal <- ceiling(Bfinal)
        warning("seq.options$Bfinal was not an integer. \n",
                "ceiling(seq.options$Bfinal) has been taken instead.", call.=FALSE)
      }
    }
  }

  # Bbatch must be a positive integer

  if(!is.numeric(Bbatch) | length(Bbatch)>1){
    stop("seq.options$Bbatch must be a positive integer.", call.=FALSE)
  }else{
    if(Bbatch<=0){
      stop("seq.options$Bbatch must be a positive integer.", call.=FALSE)
    }else{
      if(!(Bbatch%%1==0)){
        Bbatch <- ceiling(Bbatch)
        warning("seq.options$Bbatch was not an integer. \n",
                "ceiling(seq.options$Bbatch) has been taken instead.", call.=FALSE)
      }
    }
  }

  # Bconv must be a positive integer

  if(!is.numeric(Bconv) | length(Bconv)>1){
    stop("seq.options$Bconv must be a positive integer.", call.=FALSE)
  }else{
    if(Bconv<=0){
      stop("seq.options$Bconv must be a positive integer.", call.=FALSE)
    }else{
      if(!(Bconv%%1==0)){
        Bconv <- ceiling(Bconv)
        warning("seq.options$Bconv was not an integer. \n",
                "ceiling(seq.options$Bconv) has been taken instead.", call.=FALSE)
      }
    }
  }

  ### additional precautions ###
  
  need.replacement <- FALSE
  
  # precaution 1: Bstart must be greater than Bfinal
  
  if(Bstart>=Bfinal){
    stop("Bfinal must be larger than Bstart. \n",
         "Here, you chose Bstart=", Bstart, 
         " and Binal=", Bfinal, ".", call.=FALSE)
  }

  # precaution 2: Bconv must be greater than or equal to Bfinal
  
  if(Bconv>Bfinal){
    stop("Bfinal must be larger than or equal to Bconv. \n",
         "Here, you chose Bconv=", Bconv, 
         " and Binal=", Bfinal, ".", call.=FALSE)
  }

  # precaution 3: at least one batch may be added
  
  if(Bbatch>Bfinal-Bstart){
    Bbatch <- Bfinal-Bstart
    need.replacement <- TRUE
  }

  # precaution 4: Bfinal must be equal to Bstart+k*Bbatch

  if(!((Bfinal-Bstart)%%Bbatch==0)){
    cat("boolean=",!((Bfinal-Bstart)%%Bbatch==0))
    Bfinal <- Bstart+ceiling((Bfinal-Bstart)/Bbatch)*Bbatch
    need.replacement <- TRUE
  }

  # to send a warning if some values have been corrected
  
  if(need.replacement){
    
    warning("Some hyperparameters in seq.options are not consistent. \n",
            "The following values have been selected: \n",
            "Bstart=", Bstart, ", ", "Bfinal=", Bfinal, ", ",
            "Bbatch=", Bbatch, ", ", "Bconv=", Bconv, ".", call.=FALSE)
    
    # the hyperparameters are repacked
    
    seq.options$Bstart <- Bstart
    seq.options$Bfinal <- Bfinal
    seq.options$Bbatch <- Bbatch
    seq.options$Bconv <- Bconv

  }
  
  #########################
  ### Main instructions ###
  #########################
  
  ### extraction from sensi ###
  
  HSIC.obs <- switch(measure.def, 
                     "HSICXY"={ unname(sensi$HSICXY[,"original"]) },
                     "TO.num"={ unname(sensi$TO.num[,"original"]) },
                     NA)
  
  ### specific case of C-HSIC indices ###
  
  if(!is.null(sensi$cond)){
    mask.cond <- sensi$weights>0
    KX <- sensi$KX[mask.cond, mask.cond, ]
    KY <- sensi$KY[mask.cond, mask.cond]
  }else{
    KX <- sensi$KX
    KY <- sensi$KY
  }
  
  ### independence test ###

  if(test.method=="Asymptotic"){
    
    res <- asymptotic.test(HSIC.obs, KX, KY, sensi$estimator.type)
    
  }else if(test.method %in% c("Gamma", "Tot_Gamma")){
    
    res <- gammatest(HSIC.obs, KX, KY, measure.def)
    
  }else if(test.method %in% c("Permutation", "Tot_Permutation")){
    
    res <- permutation.test(HSIC.obs, KX, KY, 
                            measure.def, sensi$estimator.type, B) 
    
  }else if(test.method %in% c("Seq_Permutation", "Tot_Seq_Permutation")){
    
    res <- seqpermutation.test(HSIC.obs, KX, KY,
                                measure.def, sensi$estimator.type, seq.options)
    
  }
  
  ### object of class testHSIC ###
  
  x <- list()
  
  p <- dim(KX)[3]
  
  # p-values
  x$pval <- data.frame(res$pval)
  rownames(x$pval) <- paste("X", 1:p, sep="")
  colnames(x$pval) <- "original"
  
  # properties of the chosen test
  
  x$prop <- switch(test.method,
                   "Asymptotic"={ c("asymptotic", "parametric") }, 
                   "Permutation"={ c("non-asymptotic", "non-parametric") },
                   "Seq_Permutation"={ c("non-asymptotic", "non-parametric") },
                   "Gamma"={ c("non-asymptotic", "parametric") },
                   "Tot_Permutation"={ c("non-asymptotic", "non-parametric") },
                   "Tot_Seq_Permutation"={ c("non-asymptotic", "non-parametric") },
                   "Tot_Gamma"={ c("non-asymptotic", "parametric") },
                   NA)
  
  # parametric family
  
  if(x$prop[2]=="parametric"){
    
    if(test.method=="Asymptotic"){
      x$family <- switch(sensi$estimator.type, 
                         "U-stat"={ "Pearson3" }, 
                         "V-stat"={ "Gamma" }, NA)
    }else{
      x$family <- "Gamma"
    }
    
  }
  
  # about parameters (for parametric methods)
  
  if(x$prop[2]=="parametric"){
  
    x$param <- data.frame(t(res$param))
    rownames(x$param) <- paste("X", 1:p, sep="")
    
    if(test.method=="Asymptotic"){
      colnames(x$param) <- switch(sensi$estimator.type,
                                  "U-stat"={ c("shape", "scale", "position") }, 
                                  "V-stat"={ c("shape", "scale") }, NA)
    }else{
      colnames(x$param) <- c("shape", "scale")
    }
    
  }

  # about permutations (for non-parametric methods)
  
  if(test.method %in% c("Permutation", "Tot_Permutation")){
    
    x$B <- B
    
    x$Hperm <- data.frame(t(res$Hperm))
    rownames(x$Hperm) <- paste("X", 1:p, sep="")
    colnames(x$Hperm) <- NULL
    
  }
  
  if(test.method %in% c("Seq_Permutation", "Tot_Seq_Permutation")){
    
    x$seq.options <- seq.options

    x$paths <- data.frame(t(res$paths))
    rownames(x$paths) <- paste("X", 1:p, sep="")
    colnames(x$paths) <- NULL
    
  }
  
  # initial call to the function
  x$call <- match.call()
  
  class(x) <- "testHSIC"

  return(x)
  
}

### check.sensiHSIC ############################################################

check.sensiHSIC <- function(sensi, test.method){
  
  ### inputs ###
  
  # sensi:            1 x ...     list of class sensiHSIC (with all data)
  # test.method:      1 x 1       character (method used to test independence)
  
  ### output ###
  
  # --> If sensi and test.method are properly defined, the function returns NULL.
  # --> Otherwise, an error is returned indicating what happens.

  ### useful lists #############################################################
  
  L <- available.material()
  
  ### For sensi ################################################################
  
  msg <- "sensi is not an appropriate object of class sensiHSIC. \n"
  
#  if(!(class(sensi)=="sensiHSIC")){ # bug
   if(!(inherits(sensi,"sensiHSIC"))){
    stop("sensi must be a list of class sensiHSIC.", call.=FALSE)
  }
  
  ### For sensi$KX #############################################################
  
  # sensi must contain an element named KX
  if(is.null(sensi$KX)){
    stop(msg, "KX is missing in sensi. \n",
         "Please add an object named KX in sensi.", call.=FALSE)
  }else{
  
    msg.KX <- "sensi$KX must be a n-by-n-by-p array containing all p input Gram matrices."
    
    # sensi$KX must be an array
    if(!(is.array(sensi$KX))){
      stop(msg, msg.KX, call.=FALSE)
    }else{
      # sensi$KX must be a 3-dimensional array
      if(length(dim(sensi$KX))!=3){
        stop(msg, msg.KX, call.=FALSE)
      }else{
        n <- dim(sensi$KX)[1]
        p <- dim(sensi$KX)[3]
        # each slice of sensi$KX must be a square matrix
        if(!(ncol(sensi$KX)==n)) stop(msg, msg.KX, call.=FALSE)
      }
    }
    
  }
  
  ### For sensi$KY #############################################################
  
  # sensi must contain an element named KY
  if(is.null(sensi$KY)){
    stop(msg, "KY is missing in sensi. \n",
         "Please add an object named KY in sensi.", call.=FALSE)
  }else{
  
    msg.KY <- "sensi$KY must be a n-by-n array containing the output Gram matrix."
    
    # sensi$KY must be a matrix
    if(!(is.matrix(sensi$KY))){
      stop(msg, msg.KY, call.=FALSE)
    }else{
      # sensi$KX and sensi$KY must have equal dimensions
      if(nrow(sensi$KY)!=n | ncol(sensi$KY)!=n){
        stop(msg, "Dimensions of sensi$KX and sensi$KY do not match. \n",
             msg.KX, "\n", msg.KY, call.=FALSE)
      }
    }
    
  }

  ### For estimator.type #######################################################
  
  # sensi must contain an element named estimator.type
  if(is.null(sensi$estimator.type)){
    stop(msg, "estimator.type is missing in sensi. \n",
         "Please add an object named estimator.type in sensi.", call.=FALSE)
  }else{
  
    # sensi$estimator.type must be a string
    if(!is.character(sensi$estimator.type) | length(sensi$estimator.type)>1){
      stop(msg, "sensi$estimator.type must be a string.", call.=FALSE)
    }else{
      # sensi$estimator.type must be either "V-stat" or "U-stat"
      if(!(sensi$estimator.type %in% c("V-stat", "U-stat"))){
        stop(msg, "estimator.type must be: V-stat or U-stat.", call.=FALSE)
      }
    }
    
  }
  
  ### For cond #################################################################
  
  # sensi must contain an element named cond
  
  if(!is.null(sensi$cond)){
    
    # sensi$weights must then contain an element named weights
    if(is.null(sensi$weights)){
      stop(msg, "weights is missing in sensi. \n",
           "Please add an object named weights in sensi.", call.=FALSE)
    }else{

      msg.weights <- "sensi$weights must be a vector of length n."

      # weights must a vector
      if(!is.numeric(sensi$weights) && !is.null(dim(sensi$weights))){
        stop(msg, msg.weights, call.=FALSE)
      }else{
        # weights must be of length n
        if(length(sensi$weights)!=n){
          stop(msg, msg.weights, call.=FALSE)
        }
      }

    }
    
  }
  
  ### For test.method ##########################################################
  
  # in presence of U-statistics, test.method must belong to L$U.tests
  
  if(sensi$estimator.type=="U-stat"){
    if(!(test.method %in% L$U.tests)){
      stop("Invalid test method for U-statistics. \n",
           "In presence of U-statistics, possible test methods only include: ", 
           paste(L$U.tests, collapse=", "), ".", call.=FALSE)
    }
  }
  
  # in presence of C-HSIC indices, there are two constraints:
  # 1) cond$type must be "indicTh" (binary weights)
  # 2) test.method must belong to L$cond.tests
  
  if(!is.null(sensi$cond)){
    
    # cond$type must be "indicTh"
    if(!(sensi$cond$type=="indicTh")){
      stop("No available test method for C-HSIC indices ",
           "when sensi$cond$type is ", sensi$cond$type, ". \n",
           "sensi$cond$type=indicTh must be selected instead.", call.=FALSE)
    }
    
    # test.method must belong to L$cond.tests
    if(!(test.method %in% L$cond.tests)){
      stop("Invalid test method for C-HSIC indices. \n",
           "In presence of C-HSIC indices, possible test methods only include: ", 
           paste(L$cond.tests, collapse=", "), ".", call.=FALSE)
    }
    
  }
  
  ### For sensi$HSICXY and sensi$TO.num ########################################
  
  # two situations can be distinguished:
  # 1) the test is based on the numerators of the first-order HSIC-ANOVA indices
  # 2) __________________________________________ total-order __________________
  
  if(stringr::str_detect(test.method, "^Tot_")){
    
    # sensi must contain an element named TO.num
    if(is.null(sensi$TO.num)){
      stop(msg, "TO.num is missing in sensi. \n",
           "Please add an object named TO.num in sensi.", call.=FALSE)
    }else{
      
      # sensi$TO.num must be a data frame
      if(!is.data.frame(sensi$TO.num)){
        stop(msg, "sensi$TO.num must be a data frame.", call.=FALSE)
      }else{
        # sensi$TO.num must contain a column named "original"
        if(!("original" %in% colnames(sensi$TO.num))){
          stop(msg, "sensi$TO.num must have a column named original.", call.=FALSE)
        }else{
          # sensi$TO.num must have p rows
          if(nrow(sensi$TO.num)!=p){
            stop(msg, "Dimensions of sensi$KX and sensi$TO.num do not match. \n",
                 msg.KX, "\n", "sensi$TO.num must be a data frame with p rows.", call.=FALSE)
          }
        }
      }
      
    }
  
  }else{
    
    # sensi must contain an element named HSICXY
    if(is.null(sensi$HSICXY)){
      stop(msg, "HSICXY is missing in sensi. \n",
           "Please add an object named HSICXY in sensi.", call.=FALSE)
    }else{
      
      # sensi$HSICXY must be a data frame
      if(!is.data.frame(sensi$HSICXY)){
        stop(msg, "sensi$HSICXY must be a data frame.", call.=FALSE)
      }else{
        # sensi$HSICXY must contain a column named "original"
        if(!("original" %in% colnames(sensi$HSICXY))){
          stop(msg, "sensi$HSICXY must have a column named original.", call.=FALSE)
        }else{
          # sensi$HSICXY must have p rows
          if(nrow(sensi$HSICXY)!=p){
            stop(msg, "Dimensions of sensi$KX and sensi$HSICXY do not match. \n",
                 msg.KX, "\n", "sensi$HSICXY must be a data frame with p rows.", call.=FALSE)
          }
        }
      }
      
    }
 
  }
  
  return(NULL)
  
}

### asymptotic.test ############################################################

asymptotic.test <- function(HSIC.obs, KX, KY, estimator.type){
  
  ### inputs ###
  
  # HSIC.obs:         1 x p         numeric   (observed values of HSIC indices)
  # KX:               n x n x p     array     (input Gram matrices)
  # KY:               n x n         matrix    (output Gram matrix)
  # estimator.type:   1 x 1         character ("U-stat" or "V-stat")
  
  ### output ###
  
  # res:        1 x 2   list        (test results)
  
  # In particular, res contains the following objects:
  
  # pval:       1 x p   numeric     (estimated p-values)             
  # param:      p x 2   matrix      (parameters of the Gamma distributions)
  
  n <- dim(KX)[1]
  p <- dim(KX)[3]
  
  # pre-allocation
  pval <- rep(NA, p)
  param <- switch(estimator.type, 
                  "U-stat"={ matrix(NA, 3, p) },
                  "V-stat"={ matrix(NA, 2, p) }, NA)

  # output Gram matrix
  EY <- mean(diag(KY))            # mean of diagonal coefficients
  EYY <- meannondiag(KY)# mean of non-diagonal coefficients
  BY <- double.centering(KY)      # the rows and columns are now centered
  
  for(i in 1:p){
    
    # input Gram matrix
    KXi <- KX[,,i]                  # i-th input Gram matrix
    EXi <- mean(diag(KXi))          # mean of diagonal coefficients
    EXiXi <- meannondiag(KXi)# mean of non-diagonal coefficients
    BXi <- double.centering(KXi)    # double-centering 
    
    Bi <- (BXi*BY)^2
    
    # estimation of the mean and variance of the V-statistic
    HSIC.mean <- (EXi-EXiXi)*(EY-EYY)/n
    HSIC.var <- 2*(n-4)*(n-5)/(n*(n-1)*(n-2)*(n-3))*meannondiag(Bi)
    
    # method of moments
    alpha <- param[1,i] <- (HSIC.mean^2)/HSIC.var
    beta  <- param[2,i] <- HSIC.var/HSIC.mean
    if(estimator.type=="U-stat") delta <- param[3,i] <- -HSIC.mean
    
    # parametric estimation of the p-value
    pval[i] <- switch(estimator.type,
                      "U-stat"={ 1-pgamma(q=HSIC.obs[i]-delta, shape=alpha, scale=beta) },
                      "V-stat"={ 1-pgamma(q=HSIC.obs[i], shape=alpha, scale=beta) }, NA)
    
  }
  
  res <- list(pval=pval, param=param)
  
  return(res)
  
}

### HSIC.perm ##################################################################

HSIC.perm <- function(KX, KY, measure.def, estimator.type, B, A){
  
  ### inputs ###
  
  # KX:               n x n x p     array     (input Gram matrices)
  # KY:               n x n         matrix    (output Gram matrix)
  # measure.def:      1 x 1         character ("HSICXY" or "TO.num")
  # estimator.type:   1 x 1         character ("U-stat" or "V-stat")
  # B:                1 x 1         numeric   (number of permutations)
  # A:                1 x r         numeric   (subset of 1,...,p)
  
  ### output ###
  
  # Hperm is either a vector (if r=1) or a matrix (if r>1)
  
  # Hperm:            1 x B         numeric   (HSIC index after permutations)
  #                   B x r         matrix    (HSIC indices after permutations)
  
  n <- dim(KX)[1] # nb of samples
  p <- dim(KX)[3] # nb of input variables
  
  r <- length(A)
  Hperm <- matrix(NA, B, r)
  
  for(k in 1:r){
    
    i <- A[k]       # k-th element in the vector A
    KXi <- KX[,,i]  # i-th Gram matrix
    
    ### definition of the permutation scheme ###
      
    if(measure.def=="HSICXY"){

      perm.scheme <- function(j){ 
        shuf <- sample(seq(1,n), n)
        HSIC.fun(KXi, KY[shuf, shuf], estimator.type)
      }
      
    }else{
      
      kXi <- KXi-1      # the Gram matrix that needs to be permuted
      
      # product of all other Gram matrices
      LXi <- 1          
      for(s in setdiff(1:p, i)) LXi <- LXi*KX[,,s]
      # --> this matrix is invariant under permutations
      
      perm.scheme <- function(j){ 
        shuf <- sample(seq(1,n), n)
        HSIC.fun(kXi[shuf, shuf]*LXi, KY, estimator.type)
      }
      
    }

    ### repetition of B permutations ###
    
    Hperm[,k] <- sapply(1:B, perm.scheme)
    
  }
  
  if(r==1) Hperm <- as.numeric(Hperm)
  
  return(Hperm)
  
}

### permutation.test ###########################################################

permutation.test <- function(HSIC.obs, KX, KY,
                             measure.def, estimator.type, B){
  
  ### inputs ###
  
  # HSIC.obs:         1 x p         numeric   (observed values of HSIC indices)
  # KX:               n x n x p     array     (input Gram matrices)
  # KY:               n x n         matrix    (output Gram matrix)
  # measure.def:      1 x 1         character ("HSICXY" or "TO.num")
  # estimator.type:   1 x 1         character ("U-stat" or "V-stat")
  # B:                1 x 1         numeric   (number of permutations)
  
  ### output ###
  
  # res:        1 x 2   list        (test results)
  
  # In particular, res contains the following object:
  
  # pval:           1 x p   numeric     (estimated p-values)  
  # Hperm:          B x p   numeric     (HSIC indices after permutations)
  
  p <- dim(KX)[3]
  
  pval <- rep(NA, p)
  
  ### repetitions of B permutations ###
  
  Hperm <- HSIC.perm(KX, KY, measure.def, estimator.type, B, 1:p)
  
  ### non-parametric estimation of the p-values ###
  
  pval <- apply(mat.vec(Hperm, HSIC.obs, ">"), 2, mean)
  
  res <- list(pval=pval, Hperm=Hperm)
  
  return(res)
  
}

### seqpermutation.test ###########################################

seqpermutation.test <- function(HSIC.obs, KX, KY,
                                 measure.def, estimator.type, seq.options){
  
  ### inputs ###
  
  # HSIC.obs:         1 x p         numeric   (observed values of HSIC indices)
  # KX:               n x n x p     array     (input Gram matrices)
  # KY:               n x n         matrix    (output Gram matrix)
  # measure.def:      1 x 1         character ("HSICXY" or "TO.num")
  # estimator.type:   1 x 1         character ("U-stat" or "V-stat")
  # seq.options:      1 x ...       list      (options for the sequential test)
  
  ### output ###
  
  # res:        1 x 2     list      (test results)
  
  # In particular, res contains the following objects:
  
  # pval:       1 x p     numeric   (estimated p-values)
  # paths:    ... x p     matrix    (all estimated p-values)
  
  p <- dim(KX)[3]
  
  # hyperparameters to guide the sequential procedure
  criterion <- seq.options$criterion
  alpha <- seq.options$alpha
  Bstart <- seq.options$Bstart
  Bfinal <- seq.options$Bfinal
  Bbatch <- seq.options$Bbatch
  Bconv <- seq.options$Bconv
  graph <- seq.options$graph
  
  if(Bconv>Bstart){
    Bstart <- Bconv
  }

  ### goal-oriented test of independence ###
  
  if(criterion=="screening"){
    
    #############################
    ### FIRST CASE: SCREENING ###
    #############################
      
    pval <- rep(NA, p)              # p-values
    paths <- matrix(NA, Bfinal, p)  # estimation paths
    nb.perm <- rep(NA, p)           # nb of permutations
    
    for(i in 1:p){
      
      ### FIRST ITERATION ###
      
      B <- Bstart # counter for permutations
      
      # observed value of the HSIC index
      Hi <- HSIC.obs[i]
      # new values of the HSIC index after Bstart permutations 
      seq.HSIC <- HSIC.perm(KX, KY, measure.def, estimator.type, B, i)
      # estimation of the p-value with increasing sample sizes
      seq.pval <- cumsum(seq.HSIC>Hi)/seq(2, B+1)
      # screening (0 if pval>alpha, 1 otherwise)
      seq.bin <- 1*(seq.pval<=alpha)
      # more permutations?
      more.perm <- !identical( seq.bin[(B-Bconv+1):B], 
                               rep(seq.bin[B], Bconv) )
      
      ### OTHER ITERATIONS ###
      
      while(more.perm && B<Bfinal){
        
        B <- B+Bbatch
        
        # new values of the HSIC index (coming from Bbatch additional permutations)
        seq.HSIC <- c(seq.HSIC,
                      HSIC.perm(KX, KY, measure.def, estimator.type, Bbatch, i))
        # all p-values are recomputed
        seq.pval <- cumsum(seq.HSIC>Hi)/seq(2, B+1)
        # all binary decisions are recomputed
        seq.bin <- 1*(seq.pval<=alpha)       
        # more permutations?
        more.perm <- !identical( seq.bin[(B-Bconv+1):B], 
                                 rep(seq.bin[B], Bconv) )
        
      }
      
      # storage
      pval[i] <- seq.pval[B]
      paths[1:B,i] <- seq.pval
      nb.perm[i] <- B
      
    }
    
    Bm <- max(nb.perm)    # largest nb of permutations
    paths <- paths[1:Bm,] # paths are shortened
    
  }else{
    
    ####################################
    ### SECOND CASE: RANKING or BOTH ###
    ####################################
    
    ### FIRST ITERATION ###
    
    B <- Bstart # counter for permutations
    
    # new values of the HSIC indices after Bstart permutations 
    seq.HSIC <- HSIC.perm(KX, KY, measure.def, estimator.type, B, 1:p)
    # estimation of the p-value with increasing sample sizes
    seq.pval <- ( apply(mat.vec(seq.HSIC, HSIC.obs, ">"), 2, cumsum) /
                    matrix(seq(2, B+1), B, p) )
    # rankings of p-values with increasing sample sizes
    seq.rk <- t(apply(seq.pval, 1, rank, ties="first"))
    # more permutations?
    more.perm <- !identical( seq.rk[(B-Bconv+1):B,], 
                             matrix(seq.rk[B,], Bconv, p, byrow=TRUE) )
    
    # additional check if screening="both"
    if(!more.perm && criterion=="both"){
      # screening (0 if pval>alpha, 1 otherwise)
      seq.bin <- 1*(seq.pval<=alpha)
      # more permutations?
      more.perm <- !identical( seq.bin[(B-Bconv+1):B,], 
                               matrix(seq.bin[B,], Bconv, p, byrow=TRUE) )
    }
    
    ### OTHER ITERATIONS ###
    
    while(more.perm && B<Bfinal){
      
      B <- B+Bbatch
      
      # new values of the HSIC indices (coming from Bbatch additional permutations)
      seq.HSIC <- rbind(seq.HSIC,
                        HSIC.perm(KX, KY, measure.def, estimator.type, Bbatch, 1:p))
      # all p-values are recomputed
      seq.pval <- ( apply(mat.vec(seq.HSIC, HSIC.obs, ">"), 2, cumsum) /
                      matrix(seq(2, B+1), B, p) )
      # all rankings are recomputed
      seq.rk <- t(apply(seq.pval, 1, rank, ties="first"))
      # more permutations?
      more.perm <- !identical( seq.rk[(B-Bconv+1):B,], 
                               matrix(seq.rk[B,], Bconv, p, byrow=TRUE) )
      
      # additional check if screening="both"
      if(!more.perm && criterion=="both"){
        seq.bin <- 1*(seq.pval<=alpha)       
        # more permutations?
        more.perm <- !identical( seq.bin[(B-Bconv+1):B,], 
                                 matrix(seq.bin[B,], Bconv, p, byrow=TRUE) )
      }
      
    }
    
    # storage
    pval <- seq.pval[B,]
    paths <- seq.pval
    
  }
  
  ### warning if paths are not stabilized ###
  
  if(criterion=="screening"){
    
    if(any(nb.perm>Bfinal)){
      ind <- which(nb.perm>Bfinal)
      warning("Even after ", Bfinal, " iterations, ",
              "the algorithm has not converged yet for the following variables: ",
              paste(ind, collapse=", "), ".", call.=FALSE)
    }
    
  }else{
    
    if(B>Bfinal){
      warning("Even after ", Bfinal, " iterations, ",
              "the algorithm has not converged yet.", call.=FALSE)
    }
    
  }
  
  ### plot of estimation paths ###
  
  if(graph){
    # estimation paths for all p-values
    matplot(paths, type='l', ylim=c(0,1),
            xlab="number of permutations",
            ylab="estimated p-values",
            main="Sequential estimation of the p-values until convergence")
    # red line for screening
    if(!(criterion=="ranking")) abline(h=alpha, lwd=2, col="red")
  }
  
  res <- list(pval=pval, paths=paths)
  
  return(res)
  
}

### gammatest #################################################################

gammatest <- function(HSIC.obs, KX, KY, measure.def){

  ### inputs ###

  # HSIC.obs:       1 x p         numeric   (observed values of HSIC indices)
  # KX:             n x n x p     array     (input Gram matrices)
  # KY:             n x n         matrix    (output Gram matrix)
  # measure.def:    1 x 1         character ("HSICXY" or "TO.num")

  ### output ###

  # res:        1 x 2   list        (test results)

  # In particular, res contains the following objects:

  # pval:       1 x p   numeric     (estimated p-values)
  # param:      2 x p   matrix      (parameters of the Gamma distributions)

  # pre-allocation
  
  n <- dim(KX)[1]
  p <- dim(KX)[3]
  pval <- rep(NA, p)
  param <- matrix(NA, 2, p)
  
  # computations of the matrices Ai and Wi for all i in {1,...,p}
  
  mat <- compute.mat.TrAW(KX, KY, measure.def)
  
  # computations of the first two moments of Tr(Ai Wi) for all i in {1,...,p}
  
  mom <- compute.mom.TrAW(mat$A, mat$W, measure.def)
  
  # parametric estimation of all p-values
  
  for(i in 1:p){
    
    esp <- mom[1,i]
    var <- mom[2,i]

    # method of moments for Tr(Ai W)
    shape.TrAW <- (esp^2)/var
    scale.TrAW <- var/esp
    
    # parameters for HSIC(Xi,Y) = Tr(Ai W)/(n^2)
    alpha <- param[1,i] <- shape.TrAW
    beta  <- param[2,i] <- scale.TrAW/(n^2)
    
    # parametric estimation of the p-value
    pval[i] <- 1-pgamma(q=HSIC.obs[i], shape=alpha, scale=beta)
    
  }
  
  res <- list(pval=pval, param=param)

  return(res)

}

### compute.mat.TrAW ###########################################################

compute.mat.TrAW <- function(KX, KY, measure.def){
  
  ### inputs ###
  
  # KX:             n x n x p     array     (input Gram matrices)
  # KY:             n x n         matrix    (output Gram matrix)
  # measure.def:    1 x 1         character ("HSICXY" or "TO.num")
  
  ### output ###
  
  # mat:      1 x 2         list      (all matrices)
  
  # In particular, mat contains the following objects:
  
  # A:        n x n x p     array     (all matrices Ai)
  
  # W:        n x n         matrix    (matrix W        if measure.def="HSICXY")
  #           n x n x p     array     (all matrices Wi if measure.def="TO.num")
  
  n <- dim(KX)[1]
  p <- dim(KX)[3]
  
  if(measure.def=="HSICXY"){
    
    A <- array(NA, c(n, n, p))
    for(i in 1:p) A[,,i] <- double.centering(KX[,,i])
    
    W <- double.centering(KY)

  }else{
    
    A <- array(NA, c(n, n, p))
    LY <- double.centering(KY)
    for(i in 1:p){
      Ai <- LY
      for(j in setdiff(1:p, i)) Ai <- Ai*KX[,,j]
      A[,,i] <- Ai
    }
    
    W <- KX-1
    
  }
  
  mat <- list(A=A, W=W)
  return(mat)
  
}

### compute.mom.TrAW ###########################################################

compute.mom.TrAW <- function(A, W, measure.def){
  
  ### inputs ###
  
  # A:              n x n x p     array     (all matrices Ai)
  
  # W:              n x n         matrix    (matrix W        if measure.def="HSICXY")
  #                 n x n x p     array     (all matrices Wi if measure.def="TO.num")
  
  # measure.def:    1 x 1         character ("HSICXY" or "TO.num")

  ### output ###
  
  # mom:    2 x p   numeric   (expectations and variances of Tr(Ai Wi) for all i in {1,...,p})  
  
  n <- dim(A)[1]
  p <- dim(A)[3]
  
  mom <- matrix(NA, 2, p)
  
  if(measure.def=="HSICXY"){
    
    # denominators
    denom1 <- ((n-1)^2)*(n+1)*(n-2)
    denom2 <- (n+1)*n*(n-1)*(n-2)*(n-3)
    
    # matrix operations
    tr.W <- sum(diag(W))      # complexity:          1 x n 
    tr.W2 <- sum(diag(W)^2)   # complexity:          2 x n
    sum.W2 <- sum(W^2)        # complexity: 2 x n2
                              # __________________________
                              # complexity: 2 x n2 + 3 x n
    
    # terms to be used in the final formulas
    O1.W <- (n-1)*sum.W2-tr.W^2
    O2.W <- n*(n+1)*tr.W2-(n-1)*(tr.W^2+2*sum.W2)
    
    for(i in 1:p){
      
      Ai <- A[,,i]
      
      # matrix operations
      tr.Ai <- sum(diag(Ai))
      tr.Ai2 <- sum(diag(Ai)^2)
      sum.Ai2 <- sum(Ai^2)
      
      # terms to be used in the final formulas
      O1.Ai <- (n-1)*sum.Ai2-tr.Ai^2
      O2.Ai <- n*(n+1)*tr.Ai2-(n-1)*(tr.Ai^2+2*sum.Ai2)
      
      # --> final formulas
      
      mom[1,i] <- tr.Ai*tr.W/(n-1)
      mom[2,i] <- 2*O1.Ai*O1.W/denom1 + O2.Ai*O2.W/denom2

    }

  }else{
    
    # denominators
    denom1 <- n
    denom2 <- n*(n-1)
    denom3 <- n*(n-1)*(n-2)
    denom4 <- n*(n-1)*(n-2)*(n-3)
    
    for(i in 1:p){

      # --> for the matrix Ai
      
      Ai <- A[,,i]
      tAi <- Ai
      diag(tAi) <- 0
      
      # matrix operations
      tr.Ai <- sum(diag(Ai))                      # complexity:          1 x n
      sum.tAi <- sum(tAi)                         # complexity: 1 x n2
      tr.Ai2 <- sum(diag(Ai)^2)                   # complexity:          2 x n
      sum.Ai2 <- sum(Ai*Ai)                       # complexity: 2 x n2
      sdc.Ai <- sum(diag(Ai)*colSums(Ai))         # complexity: 1 x n2 + 2 x n
      sum.tAi2 <- sum.Ai2-tr.Ai2          
      sct.Ai <- sum(colSums(tAi)%*%tAi)-sum.tAi2  # complexity: 3 x n2 + 1 x n
                                                  # __________________________
                                                  # complexity: 7 x n2 + 6 x n 
      
      # terms to be used in the final formulas
      O1.Ai <- tr.Ai2
      O21.Ai <- (tr.Ai)^2-tr.Ai2
      O22.Ai <- sum.tAi2
      O23.Ai <- sdc.Ai-tr.Ai2
      O31.Ai <- sct.Ai
      O32.Ai <- tr.Ai*sum.tAi-2*(sdc.Ai-tr.Ai2)
      O4.Ai <- (sum.tAi)^2-2*sum.tAi2-4*sct.Ai
      
      # --> for the matrix Wi
      
      Wi <- W[,,i]
      tWi <- Wi
      diag(tWi) <- 0
      
      # matrix operations
      tr.Wi <- sum(diag(Wi))
      sum.tWi <- sum(tWi)
      tr.Wi2 <- sum(diag(Wi)^2)
      sum.Wi2 <- sum(Wi*Wi)
      sdc.Wi <- sum(diag(Wi)*colSums(Wi))
      sum.tWi2 <- sum.Wi2-tr.Wi2
      sct.Wi <- sum(colSums(tWi)%*%tWi)-sum.tWi2
      
      # terms to be used in the final formulas
      O1.Wi <- tr.Wi2
      O21.Wi <- (tr.Wi)^2-tr.Wi2
      O22.Wi <- sum.tWi2
      O23.Wi <- sdc.Wi-tr.Wi2
      O31.Wi <- sct.Wi
      O32.Wi <- tr.Wi*sum.tWi-2*(sdc.Wi-tr.Wi2)
      O4.Wi <- (sum.tWi)^2-2*sum.tWi2-4*sct.Wi
      
      # crossed expressions
      T1 <- ( O1.Ai*O1.Wi )/denom1
      T2 <- ( O21.Ai*O21.Wi + 2*O22.Ai*O22.Wi + 4*O23.Ai*O23.Wi )/denom2
      T3 <- ( 4*O31.Ai*O31.Wi + 2*O32.Ai*O32.Wi )/denom3
      T4 <- ( O4.Ai*O4.Wi )/denom4
      
      # --> final formulas
      
      E1 <- tr.Ai*tr.Wi/n+sum.tAi*sum.tWi/(n*(n-1))
      E2 <- T1+T2+T3+T4
      mom[, i] <- c(E1, E2-E1^2)
 
    }

  }
  
  return(mom)
  
}

### meannondiag ################################################

meannondiag <- function(A){
  
  ### input ###
  
  # A:    n x n     numeric (matrix)
  
  ### output ###
  
  # s:    1 x 1     numeric (mean of non-diagonal coefficients)
  
  n <- nrow(A)
  s <- sum(A-diag(diag(A)))/(n*(n-1))
  
  return(s)
  
}

### double.centering ###########################################################

double.centering <- function(A){
  
  ### input ###
  
  # A:        n x m         matrix    (initial given matrix)
  
  ### output ###
  
  # B:        n x m         matrix    (double-centered matrix)
  
  n <- nrow(A)
  m <- ncol(A)
  
  S.mat <- sum(A)/(n*m)
  S.rows <- rowSums(A)/m
  S.cols <- colSums(A)/n
  
  B <- ( A - matrix(rep(S.rows, m), n, m)
         - matrix(rep(S.cols, n), n, m, byrow=TRUE) + S.mat )
  
  return(B)
  
}

### mat.vec ####################################################################

mat.vec <- function(M1, vec, sgn){
  
  ### input ###
  
  # M1:     n x p     matrix      (samples from a random variable in R^p)  
  # vec:    1 x p     numeric     (reference vector in R^p)
  # sgn:    1 x 1     character   (boolean operator provided as a string)
  
  ### output ###
  
  # M2:     n x p     matrix      (each row in M1 is compared to vec with sgn)
  
  n <- nrow(M1)
  
  vec <- t(as.matrix(vec))
  vec.cloned <- matrix(1, n, 1)%*%vec
  M2 <- switch(sgn, 
               "==" ={ 1*(M1-vec.cloned==0) },
               ">"  ={ 1*(M1-vec.cloned>0) },
               "<"  ={ 1*(M1-vec.cloned<0) },
               ">=" ={ 1*(M1-vec.cloned>=0) },
               "<=" ={ 1*(M1-vec.cloned<=0) },
               NA)
  
  return(M2)
  
}

### print.testHSIC #############################################################

print.testHSIC <- function(x, ...){
  
  ### input ###
  
  # x:      1 x 1     list of class testHSIC  
  
  ##########
  
  # initial call
  cat("\n", "Call:", "\n", deparse(x$call), "\n\n", sep="")
  
  if(!is.null(x$pval)){
    
    # p-values
    cat("P-values:", "\n", sep="")
    print(x$pval)
    cat("\n")
    
    # parametric distributions
    if(x$prop[2]=="parametric"){
      cat("The test statistics were assumed to follow", x$family, "distributions:", "\n")
      print(x$param)
      cat("\n")
    }
    
  }else{
    cat("(empty)", "\n\n")
  }

}

### plot.testHSIC ##############################################################

plot.testHSIC <- function(x, ylim=c(0, 1), err=NA, ...){
  
  ### inputs ###
  
  # x:      1 x 1     list of class sensiHSIC 
  # ylim:   1 x 2     (numeric)                 bounds on the y-axis
  # err:    1 x 1     (numeric)                 level of type I error
  
  ##########
  
  if(is.null(x$pval)){
    
    warning("No result is stored in x. \n", call.=TRUE)
    
  }else{
    
    ###################
    ### Preparation ###
    ###################
    
    yshift <- 0.2
    
    # minimum and maximum values on the y-axis
    ym <- ylim[1]-yshift
    yM <- ylim[2]+2*yshift
    
    # nb of input variables
    p <- nrow(x$pval)
    
    # type I error
    if(is.na(err)){
      if(!is.null(x$seq.options)){
        err <- x$seq.options$alpha
      }else{
        err <- 0.05
        warning("By default, err=0.05 has been selected.", call.=FALSE)
      }
    }
    
    # preliminary work for the legend
    
    cap.H0 <- "Rejected variables"
    cap.H1 <- "Selected variables"
    
    col.H0 <- "red"
    col.H1 <- "green"
    
    all.captions <- c(cap.H0, cap.H1)
    all.colors <- c(col.H0, col.H1)
    
    #########################################
    ### Rejected variables (decision: H0) ###
    #########################################
    
    if(sum(x$pval>=err)>=1){
      var.H0 <- which(x$pval>=err)
      nodeplot(x$pval[var.H0, "original", drop=FALSE], at=var.H0, 
               xlim=c(1,p), ylim=c(ym, yM), pch=21, bg=col.H0)
    }
    
    #########################################
    ### Selected variables (decision: H1) ###
    #########################################
    
    if(sum(x$pval<err)>=1){
      var.H1 <- which(x$pval<err)
      nodeplot(x$pval[var.H1, "original", drop=FALSE], at=var.H1, 
               xlim=c(1,p), ylim=c(ym, yM), pch=21, bg=col.H1,
               add=TRUE)
    }
    
    ##############
    ### Legend ###
    ##############
    
    abline(h=err, col="red", lwd=2)
    abline(h=0, col="black", lwd=2, lty=2)
    abline(h=1, col="black", lwd=2, lty=2)
    
    legend("top", legend=all.captions, col="black", pch=21, pt.bg=all.colors)
    
  }
  
}

Try the sensitivity package in your browser

Any scripts or data that you put into this service are public.

sensitivity documentation built on Aug. 31, 2023, 5:10 p.m.