R/bic.fsreg.R

Defines functions bic.fsreg

Documented in bic.fsreg

## Forward selection of variables for a regression model, driven by BIC.
##
## Arguments
##   target  : the response. When 'test' is NULL its type is used to pick a test
##             ("censIndCR" for Surv objects, "testIndLogistic" for two distinct
##             values, "testIndPois" for whole numbers, otherwise "testIndReg").
##   dataset : matrix or data.frame of candidate variables (one per column).
##             Missing values are imputed (median for numeric columns, most
##             frequent level for factors) and a warning is raised.
##   test    : name of the regression model / test to use, or NULL to guess it.
##   wei     : optional observation weights, forwarded to every model fit.
##   tol     : a variable enters only if it lowers the BIC by more than 'tol'.
##   ncores  : if greater than 1 the candidate models of each step are fitted
##             in parallel on a PSOCK cluster (foreach / doParallel).
##
## Value
##   For tests handled by a dedicated routine (bic.glm.fsreg, bic.betafsreg, ...,
##   defined elsewhere in the package) the result of that routine is returned
##   unchanged. For "censIndCR", "testIndNB" and "testIndRQ" a list with
##     runtime : elapsed time (proc.time difference),
##     mat     : transposed matrix of the non-selected variables and their
##               most recently computed BIC,
##     info    : matrix of selected variables and the BIC at which they entered
##               (a 1 x 2 matrix of zeros when nothing was selected),
##     final   : the model fitted on the selected variables (NULL if none),
##     ci_test : the name of the test that was used.
bic.fsreg <- function( target, dataset, test = NULL, wei = NULL, tol = 2, ncores = 1 ) {

  p <- ncol(dataset)   ## number of candidate variables
  n <- length(target)  ## sample size (as reported by length(); used for the penalty)
  con <- log(n)        ## BIC penalty per estimated parameter

  ## check for NA values in the dataset and replace them with the variable median or the mode
  if ( any( is.na(dataset) ) ) {
    warning("The dataset contains missing values (NA) and they were replaced automatically by the variable (column) median (for numeric) or by the most frequent level (mode) if the variable is factor")
    if ( is.matrix(dataset) ) {
      dataset <- apply( dataset, 2, function(x) { x[ is.na(x) ] <- median(x, na.rm = TRUE) ; x } )
    } else {
      poia <- unique( which( is.na(dataset), arr.ind = TRUE )[, 2] )
      for ( i in poia ) {
        xi <- dataset[, i]
        if ( is.numeric(xi) ) {
          xi[ is.na(xi) ] <- median(xi, na.rm = TRUE)
        } else if ( is.factor(xi) ) {
          xi[ is.na(xi) ] <- levels(xi)[ which.max( as.vector( table(xi) ) ) ]
        }
        dataset[, i] <- xi
      }
    }
  }

  ##################################
  # target checking and initialize #
  ##################################
  if ( is.null(test) ) {
    if ( inherits(target, "Surv") ) {
      ## survival data; checked first so that it can never be mistaken for a numeric target
      test <- "censIndCR"
    } else if ( is.numeric(target)  ||  is.vector(target) ) {
      la <- length( unique(target) )
      if ( la == 2 ) {
        test <- "testIndLogistic"
      } else if ( la > 2  &&  is.numeric(target)  &&  isTRUE( all( target == round(target) ) ) ) {
        ## element-wise check: summing the rounding errors could cancel out (e.g. 0.5 and 1.5)
        test <- "testIndPois"
      } else {
        ## continuous target: linear regression (previously left 'test' as NULL and crashed)
        test <- "testIndReg"
      }
    }
  }
  if ( !is.character(test)  ||  length(test) != 1 ) {
    stop( "'test' could not be determined from 'target'; please supply it as a single character string.",
          call. = FALSE )
  }

  ## available conditional independence tests
  av_models <- c("testIndReg", "testIndRQ", "testIndBeta", "censIndCR", "censIndWR", "censIndLLR",
                 "testIndLogistic", "testIndPois", "testIndNB", "testIndZIP", "testIndGamma",
                 "testIndNormLog", "testIndTobit", "testIndMMReg")
  ## "testIndZip" is kept as an alias because it is the spelling the dispatcher used to test for
  test <- match.arg( test, c(av_models, "testIndClogit", "testIndZip") )

  dataset <- as.data.frame(dataset)

  ## Tests with a dedicated forward selection routine. switch() evaluates only the
  ## matching alternative, so only the routine that is needed gets looked up.
  delegate <- switch( test,
    testIndLogistic = ,
    testIndPois     = ,
    testIndReg      = bic.glm.fsreg,
    testIndBeta     = bic.betafsreg,
    testIndMMReg    = bic.mm.fsreg,
    testIndZIP      = ,
    testIndZip      = bic.zipfsreg,
    testIndGamma    = bic.gammafsreg,
    testIndNormLog  = bic.normlog.fsreg,
    testIndTobit    = bic.tobit.fsreg,
    censIndWR       = bic.wr.fsreg,
    censIndLLR      = bic.llr.fsreg,
    testIndClogit   = bic.clogit.fsreg,
    NULL )
  if ( !is.null(delegate) ) {
    return( delegate( target, dataset, wei = wei, tol = tol, ncores = ncores ) )
  }

  ## Generic forward selection for the remaining tests: convert the name to a fitting function
  ci_test <- test
  fit_fun <- switch( ci_test,
    censIndCR = survival::coxph,
    testIndNB = MASS::glm.nb,
    testIndRQ = quantreg::rq,
    stop( "No model fitting routine is available for test '", ci_test, "'.", call. = FALSE ) )

  ## BIC of a fitted model, from its log-likelihood and degrees of freedom
  bic_of <- function(mod) {
    ll <- logLik(mod)
    - 2 * as.numeric(ll) + attr(ll, "df") * con
  }

  runtime <- proc.time()

  ## One cluster for the whole run; on.exit() releases it even if a model fit fails
  use_par <- ncores > 1
  if ( use_par ) {
    cl <- makePSOCKcluster(ncores)
    on.exit( stopCluster(cl), add = TRUE )
    registerDoParallel(cl)
  }

  ## BIC of the model without variables, fitted with the same weights as the candidates
  null_fit <- fit_fun( target ~ 1, weights = wei )
  if ( ci_test == "censIndCR" ) {
    ini <-  - 2 * null_fit$loglik
  } else {
    ini <- bic_of(null_fit)
  }

  ######
  ###     step 1: every variable on its own
  ######
  if ( !use_par ) {
    bico <- numeric(p)
    for ( i in seq_len(p) ) {
      bico[i] <- bic_of( fit_fun( target ~ dataset[, i], weights = wei ) )
    }
  } else {
    ## the BIC is computed inline so that the workers need no locally defined helper
    bico <- foreach( i = seq_len(p), .combine = c ) %dopar% {
      ww <- fit_fun( target ~ dataset[, i], weights = wei )
      la <- logLik(ww)
      - 2 * as.numeric( la ) + attr(la, "df") * con
    }
  }
  bico[ is.na(bico) ] <- ini   ## a model without a BIC is treated as no better than the null model
  mat <- cbind( seq_len(p), bico )
  colnames(mat) <- c("variable", "BIC")
  rownames(mat) <- seq_len(p)

  sela <- NULL      ## indices of the selected variables, in order of entry
  info <- NULL      ## selected variables and the BIC at which each one entered
  current <- ini    ## BIC of the currently selected model
  sel <- which.min( mat[, 2] )

  if ( length(sel) > 0  &&  ini - mat[sel, 2] > tol ) {
    info <- mat[sel, , drop = FALSE]
    current <- mat[sel, 2]
    sela <- info[, 1]
    mat <- mat[-sel, , drop = FALSE]
  }

  ######
  ###     steps 2, 3, ...: add one variable at a time while the BIC keeps dropping
  ######
  ## The second variable is always tried; beyond that the model size is capped at n - 15.
  while ( length(sela) > 0  &&  nrow(mat) > 0  &&
          ( length(sela) == 1  ||  length(sela) < n - 15 ) ) {

    pn <- nrow(mat)
    if ( !use_par ) {
      for ( i in seq_len(pn) ) {
        mat[i, 2] <- bic_of( fit_fun( target ~., data = dataset[, c(sela, mat[i, 1]) ], weights = wei ) )
      }
    } else {
      mat[, 2] <- foreach( i = seq_len(pn), .combine = c ) %dopar% {
        ww <- fit_fun( target ~., data = dataset[, c(sela, mat[i, 1]) ], weights = wei )
        la <- logLik(ww)
        - 2 * as.numeric( la ) + attr(la, "df") * con
      }
    }

    ina <- which.min( mat[, 2] )
    ## stop when no candidate has a usable BIC or the best one does not improve enough
    if ( length(ina) == 0  ||  current - mat[ina, 2] <= tol )  break

    ## the candidate's BIC is the BIC of the enlarged model, so no refit is needed
    info <- rbind( info, mat[ina, ] )
    current <- mat[ina, 2]
    sela <- info[, 1]
    mat <- mat[-ina, , drop = FALSE]
  }

  runtime <- proc.time() - runtime

  final <- NULL
  if ( length(sela) >= 1 ) {
    final <- fit_fun( target ~., data = dataset[, sela, drop = FALSE], weights = wei )
    colnames(info) <- c( "variables", "BIC" )
    rownames(info) <- info[, 1]
  } else {
    info <- matrix( 0, ncol = 2 )
  }

  list( runtime = runtime, mat = t(mat), info = info, final = final, ci_test = ci_test )
}

Try the MXM package in your browser

Any scripts or data that you put into this service are public.

MXM documentation built on Aug. 25, 2022, 9:05 a.m.