R/zip.fsreg.R

Defines functions zip.fsreg

Documented in zip.fsreg

zip.fsreg <- function(target, dataset, threshold = 0.05, wei = NULL, tol = 2, ncores = 1) {
  
  threshold <- log(threshold)  ## log of the significance level
  p <- dim(dataset)[2]  ## number of variables
  devi <- dof <- numeric( p )  
  moda <- list()
  k <- 1   ## counter
  n <- length(target)  ## sample size
  tool <- numeric( min(n, p) )
  con <- log(n)
  sela <- NULL
  #check for NA values in the dataset and replace them with the variable median or the mode
  if ( any(is.na(dataset) ) ) {
    #dataset = as.matrix(dataset);
    warning("The dataset contains missing values (NA) and they were replaced automatically by the variable (column) median (for numeric) or by the most frequent level (mode) if the variable is factor")
    if ( is.matrix(dataset) )  {
      dataset <- apply( dataset, 2, function(x){ x[which(is.na(x))] = median(x, na.rm = TRUE) ; return(x) } ) 
    } else {
      poia <- which( is.na(dataset), arr.ind = TRUE )[2]
      for ( i in poia )  {
        xi <- dataset[, i]
        if ( is.numeric(xi) )  {                    
          xi[ which( is.na(xi) ) ] <- median(xi, na.rm = TRUE) 
        } else if ( is.factor( xi ) ) {
          xi[ which( is.na(xi) ) ] <- levels(xi)[ which.max( as.vector( table(xi) ) )]
        }
        dataset[, i] <- xi
      }
    }
  }
  
  runtime <- proc.time()
  
  if ( is.null(wei) ) {
    ini <-  - 2 * Rfast::zip.mle(target)$loglik + 2 * con 
    lgy <- sum( lgamma(target + 1) )  
  } else  { 
    ini <-  - 2 * zipmle.wei(target, wei)$loglik + 2 * con 
    lgy <- sum( wei * lgamma(target + 1) )  
  }  
  mod <- zip.regs(target, dataset, wei, logged = TRUE, ncores = ncores)[, 1:2]
  mat <- cbind(1:p, mod[, 2], mod[, 1])
  rownames(mat) <- 1:p
  colnames(mat) <- c( "variables", "log.p-value", "stat" )
  sel <- which.min(mat[, 2])
  info <- matrix( numeric(3), ncol = 3 )
  
  if ( mat[sel, 2] < threshold ) {
    info[1, ] <- mat[sel, ]
    mat <- mat[-sel, , drop = FALSE] 
    sela <- sel
    mi <- zip.reg( target, dataset[, sel], wei = wei, lgy = lgy )
    tool[1] <-  - 2 * mi$loglik + ( length(mi$be) + 1 ) * con
    moda[[ 1 ]] <- mi
  }  else  {
    info <- info  
    sela <- NULL
  }
  ############
  ###       k equals 2
  ############ 
  if ( info[k, 2] < threshold  &  dim(mat)[1] > 0 ) {
    
    k <- 2
    pn <- p - k + 1   
    ini <-  2 * mi$loglik
    do <- length(mi$be)
    devi <- dof <- numeric( pn )  
    
    if ( ncores <= 1 ) {
      devi <- dof <- numeric(pn)
      for ( i in 1:pn ) {
        ww <- zip.reg( target, dataset[, c(sela, mat[i, 1]) ], wei = wei, lgy = lgy )
        devi[i] <-  2 * ww$loglik
        dof[i] <- length( ww$be )          
      }
      stat <- abs( devi - ini )
      pval <- pchisq( stat, dof - do, lower.tail = FALSE, log.p = TRUE )
      
    } else {
      cl <- makePSOCKcluster(ncores)
      registerDoParallel(cl)
      mata <- matrix(0, pn, 2)  
      mod <- foreach( i = 1:pn, .combine = rbind, .export = "zip.reg" ) %dopar% {
        ww <- zip.reg( target, dataset[, c(sela, mat[i, 1]) ], wei = wei, lgy = lgy )
        mata[i, ] <- c( 2 * ww$loglik, length( ww$be )  )
      }
      stopCluster(cl)
      stat <- abs( mod[, 1] - ini )
      pval <- pchisq( stat, mod[, 2] - do, lower.tail = FALSE, log.p = TRUE ) 
    }
    mat[, 2:3] <- cbind(pval, stat)
    ina <- which.min(mat[, 2])
    sel <- mat[ina, 1]    
  
    if ( mat[ina, 2] < threshold ) {
      ma <- zip.reg( target, dataset[, c(sela, sel)], wei = wei, lgy = lgy )
      tool[2] <-  - 2 * ma$loglik + ( length(ma$be) + 1 ) * con
    
      if ( tool[ 1 ] - tool[ 2 ] <= tol ) {
        info <- info 
      } else { 
        info <- rbind(info, c( mat[ina, ] ) )
        sela <- info[, 1]
        mat <- mat[-ina, , drop = FALSE] 
        moda[[ 2 ]] <- ma
      } 
    } 
  }  else  info <- info
  ############
  ###       k greater than 2
  ############ 
  if ( nrow(info) > 1  &  nrow(mat) > 0 ) {
    while ( info[k, 2] < threshold &  k < n - 15 & tool[ k - 1 ] - tool[ k ] > tol  &  nrow(mat) > 0 )  {
      
      ini <-  2 * moda[[ k ]]$loglik 
      do <- length( moda[[ k ]]$be ) 
      k <- k + 1   
      pn <- p - k  + 1
      devi <- dof <- numeric( pn )  
      
      if (ncores <= 1) {  
        devi = dof = numeric(pn) 
        for ( i in 1:pn ) {
          ma <- zip.reg( target, dataset[, c(sela, mat[i, 1] ) ], wei = wei, lgy = lgy )
          devi[i] <-  2 * ma$loglik
          dof[i] = length( ma$be ) 
        }
        stat <- abs( devi - ini )
        pval <- pchisq( stat, dof - do, lower.tail = FALSE, log.p = TRUE )
        
      } else {
        cl <- makePSOCKcluster(ncores)
        registerDoParallel(cl)
        mod <- foreach( i = 1:pn, .combine = rbind, .export = "zip.reg" ) %dopar% {
          ww <- zip.reg( target, dataset[, c(sela, mat[i, 1]) ], wei = wei, lgy = lgy )
          return( c( 2 * ww$loglik, length( ww$be ) ) )
        }
        stopCluster(cl)
        stat = abs( mod[, 1] - ini )
        pval = pchisq( stat, mod[, 2] - do, lower.tail = FALSE, log.p = TRUE ) 
      }
      mat[, 2:3] <- cbind(pval, stat)
      ina <- which.min(mat[, 2])
      sel <- mat[ina, 1]    
      
      if ( mat[ina, 2] < threshold ) {
        ma <- zip.reg( target, dataset[, c(sela, sel) ], wei = wei, lgy = lgy )
        tool[k] <-  - 2 * ma$loglik + ( length(ma$be) + 1 ) * con
        if ( tool[ k - 1 ] - tool[ k  ] <= tol ) {
          info <- rbind(info, c( 1e300, 0, 0 ) )   
        } else { 
          info <- rbind( info, mat[ina, ] )
          sela <- info[, 1]
          mat <- mat[-ina, , drop = FALSE]
          moda[[ k ]] <- ma
        } 
      } else  info <- rbind(info, c( 1e300, 0, 0 ) ) 
    } 
  }  

  runtime <- proc.time() - runtime
  
  d <- length(sela)
  final <- NULL
  if ( d >= 1 ) {
    final <- zip.reg( target, dataset[, sela], wei = wei, lgy = lgy ) 
    info <- info[1:d, , drop = FALSE]
    info <- cbind( info, tool[ 1:d ] ) 
    colnames(info) <- c( "variables", "p-value", "stat", "BIC" )
    rownames(info) <- info[, 1]
  }
  list( runtime = runtime, mat = t(mat), info = info, ci_test = "testIndZIP", final = final ) 
}         

Try the MXM package in your browser

Any scripts or data that you put into this service are public.

MXM documentation built on Aug. 25, 2022, 9:05 a.m.