benchmarks/scripts/mediumData.R

rm(list = ls())
require(data.table)
library(miceRanger)
require(doParallel)
require(mice)
require(missForest)
set.seed(1991)

# Loads functions we will use to keep the code clean.
source("benchmarks/scripts/helperFuncs.R")

cat("This machine has:",parallel::detectCores(),"cores.\n")
cat("Original benchmarking machine had 12 cores and 16GB RAM.\n")

# Helper function for procuring factors that are correlated with
# different columns.
getFactorVar <- function(ind) {
  apply(
      dat
    , MARGIN = 1
    , function(x) {
      x[1:10] <- as.numeric(x[1:10])
      sample(
          LETTERS[ind]
        , size=1
        , prob = abs(as.numeric(x[ind]))/sum(abs(as.numeric(x[ind])))
      )
    }
  )
}

# Create artificial data
nrows <- 10000
dat <- data.table(a = runif(nrows))
dat$b <- dat$a + rnorm(nrows)
dat$c <- dat$b + rnorm(nrows)
dat$d <- dat$c + rnorm(nrows)
dat$e <- dat$d + rnorm(nrows)
dat$f <- dat$e + rnorm(nrows)
dat$g <- dat$f + rnorm(nrows)
dat$h <- dat$g + rnorm(nrows)
dat$i <- dat$h + rnorm(nrows)
dat$j <- dat$i + rnorm(nrows)
dat$k <- factor(getFactorVar(1:2))
dat$l <- factor(getFactorVar(3:4))
dat$n <- factor(getFactorVar(5:10))
dat$o <- factor(getFactorVar(1:10))
dat$p <- factor(getFactorVar(8:10))

# Ampute the data
ampDat <- amputeData(dat,perc=0.2)


# We run these benchmarks on different forest sizes.
treeVec <- c(4,8,16,32,64,128,256)

# Begin growing forests for each package. This is done
# in the global environment because parlmice cannot be
# passed arguments from a function, or it runs into
# variable scoping issues.

# There is no way to keep missForest from printing output.
# Sink is used.
sinkFile <- file()

# pre-define 
timeDT <- data.table()
mrList <- list()
mrvList <- list()
miceList <- list()
mfList <- list()


for (i in 1:length(treeVec)) {
  
  tre <- treeVec[[i]]
  
  cat("Running miceRanger (PMM) with",tre,"trees\n")
  mrTime <- system.time(
    mrList[[i]] <- miceRanger(
      ampDat
      , maxiter=5
      , num.trees=tre
      , verbose=FALSE
      , num.threads = 5
    )
  )[[3]]
  
  # Run miceRanger again without mean matching.
  cat("Running miceRanger (value) with",tre,"trees\n")
  mrvList[[i]] <- miceRanger(
    ampDat
    , maxiter=5
    , valueSelector = "value"
    , num.trees=tre
    , verbose=FALSE
    , num.threads = 5
  )
  
  cat("Running mice with",tre,"trees\n")
  if (tre <= 32) {
    miceTime <- system.time(
      miceList[[i]] <- parlmice(
        ampDat
        , method = "rf"
        , n.core=5
        , n.imp.core = 1
        , ntree=tre
      )
    )[[3]]
  } else {
    miceTime <- NA
  }
  
  # Set up parallel back end for missforest and miceRanger
  # Set up in lapply to simulate getting multiple datasets from missForest.
  cl <- makeCluster(5)
  registerDoParallel(cl)
  
  cat("Running missForest with",tre,"trees\n")
  sink(file=sinkFile)
  mfTime <- system.time(
    mfList[[i]] <- lapply(
      1:length(treeVec)
      , function(x) {
        missForest(
          ampDat
          , maxiter=5
          , ntree=tre
          , verbose=FALSE
          , parallelize = "forests"
        )
      }
    )
  )[[3]]
  while (sink.number() > 0) sink()
  
  cat("Running miceRanger (parallel) with",tre,"trees\n\n")
  mrPTime <- system.time(
    mpP <- miceRanger(
      ampDat
      , maxiter=5
      , num.trees=tre
      , verbose=FALSE
      , num.threads = 1
      , parallel = TRUE
    )
  )[[3]]
  
  stopCluster(cl)
  registerDoSEQ()
  
  timeDT <- rbind(
    timeDT
    , data.table(trees = tre, miceRanger = mrTime,miceRangerPar = mrPTime,mice = miceTime,missForest = mfTime)
  )
  
}

saveRDS(
  list(mrList=mrList,mrvList=mrvList,miceList=miceList,mfList=mfList,timeDT=timeDT)
  , file="benchmarks/RDS/mediumImpRunList.RDS"
)
#list2env(readRDS("benchmarks/RDS/mediumImpRunList.RDS"))

require(ggplot2)
require(ggpubr)
plotDT <- melt(timeDT,id.vars = "trees")
p <- ggplot(plotDT,aes(x=trees,y=value,color=variable)) +
  geom_point() +
  geom_line() +
  xlab("Trees in Forest") +
  ylab("Time (Seconds)") +
  scale_x_continuous(trans='log2',labels = c(treeVec),breaks=c(treeVec)) +
  ggtitle("10,000 Rows, 15 columns")
p$labels$colour <- "Package"
FarrellDay/miceRanger documentation built on May 28, 2022, 10:51 p.m.