inst/doc/vividVignette.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "vig/"
)
options(rmarkdown.html_vignette.check_title = FALSE)

## ----setup, echo = FALSE, warning=FALSE, message=FALSE------------------------
library(vivid)

## ---- warning=FALSE, message=FALSE--------------------------------------------
library(vivid) 

## ---- messages = FALSE--------------------------------------------------------
set.seed(101)
genFriedman <- function(noFeatures = 10, noSamples = 100, sigma = 1) {
  
  # Create dataframe with named columns
  df <- setNames(as.data.frame(matrix(runif(noSamples * noFeatures, 0, 1), nrow = noSamples), 
                               stringsAsFactors = FALSE), 
                 paste0("x", 1:noFeatures))
  
  # Equation: y = 10sin(πx1x2) + 20(x3−0.5)^2 + 10x4 + 5x5 + ε
  
  df$y <- 10 * sin(pi * df$x1 * df$x2) + 
    20 * (df$x3 - 0.5)^2 + 
    10 * df$x4 + 
    5 * df$x5 + 
    rnorm(noSamples, sd = sigma) # error
  
  return(df)
}

myData <- genFriedman(noFeatures = 9, noSamples = 350, sigma = 1) 

## ---- eval = FALSE------------------------------------------------------------
#  library(randomForest) # for model fit
#  set.seed(1701)
#  rf <- randomForest(y ~ ., data = myData)

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  viviRf  <- vivi(fit = rf,
#                  data = myData,
#                  response = "y",
#                  gridSize = 50,
#                  importanceType = "agnostic",
#                  nmax = 500,
#                  reorder = TRUE,
#                  predictFun = NULL,
#                  numPerm = 4,
#                  showVimpError = FALSE)

## ---- eval = FALSE------------------------------------------------------------
#  viviHeatmap(mat = viviRf)

## ---- echo = F,  out.width = '100%'-------------------------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfheatmap.png")

## ---- eval = FALSE------------------------------------------------------------
#  viviNetwork(mat = viviRf)

## ---- echo = F,  out.width = '100%'-------------------------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfnetwork.png")

## ---- eval = FALSE------------------------------------------------------------
#  viviNetwork(mat = viviRf, intThreshold = 0.12, removeNode = FALSE)
#  viviNetwork(mat = viviRf, intThreshold = 0.12, removeNode = TRUE)

## ---- echo = F,  out.width = '100%'-------------------------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfnet_filter_comb1.png")

## ---- eval = FALSE------------------------------------------------------------
#  viviNetwork(mat = viviRf,
#              layout = cbind(c(1,1,1,1,2,2,2,2,2), c(1,2,4,5,1,2,3,4,5)))

## ---- echo = F,  out.width = '70%', fig.align='center'------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfnet_custom_layout.png")

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  # clustered and filtered network for rf
#  intVals <- viviRf
#  diag(intVals) <- NA
#  
#  
#  # select VIVI values in top 20%
#  impTresh <- quantile(diag(viviRf),.8)
#  intThresh <- quantile(intVals,.8,na.rm=TRUE)
#  sv <- which(diag(viviRf) > impTresh |
#                apply(intVals, 1, max, na.rm=TRUE) > intThresh)
#  
#  h <- hclust(-as.dist(viviRf[sv,sv]), method="single")
#  
#  viviNetwork(viviRf[sv,sv],
#              cluster = cutree(h, k = 3), # specify number of groups
#              layout = igraph::layout_as_star)

## ---- echo = F,  out.width = '80%', fig.align='center'------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfnet_cluster.png")

## ---- eval = FALSE------------------------------------------------------------
#  top5 <- colnames(viviRf)[1:5]
#  pdpVars(data = myData,
#          fit = rf,
#          response = 'y',
#          vars = top5,
#          nIce = 100)

## ---- echo = F,  out.width = '100%', fig.align='center'-----------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfpdp.png")

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  pdpPairs(data = myData,
#           fit =  rf,
#           response = "y",
#           nmax = 500,
#           gridSize = 10,
#           vars = c("x1", "x2", "x3", "x4", "x5"),
#           nIce = 100)

## ---- echo = F,  out.width = '80%', fig.align='center'------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfGPGP_subset.png")

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  pdpZen(data = myData, fit = rf, response = "y", nmax = 500, gridSize = 10)

## ---- echo = F,  out.width = '60%', fig.align='center'------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfzen_all.png")

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  pdpZen(data = myData,
#         fit = rf,
#         response = "y",
#         nmax = 500,
#         gridSize = 10,
#         zpath = c("x1", "x2", "x3", "x4", "x5"))

## ---- echo = F,  out.width = '70%', fig.align='center'------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfzen_subset.png")

## ---- eval = FALSE------------------------------------------------------------
#  # set zpaths with different parameters
#  intVals <- viviRf
#  diag(intVals) <- NA
#  intThresh <- quantile(intVals, .90, na.rm=TRUE)
#  zpSw <- zPath(viv = viviRf, cutoff = intThresh, connect = FALSE, method = 'strictly.weighted')
#  
#  
#  
#  set.seed(1701)
#  pdpZen(data = myData,
#         fit = rf,
#         response = "y",
#         nmax = 500,
#         gridSize = 10,
#         zpath = zpSw)

## ---- echo = F,  out.width = '70%', fig.align='center'------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/rfzen_SW.png")

## ---- eval = FALSE------------------------------------------------------------
#  library("xgboost")
#  gbst <- xgboost(data = as.matrix(myData[,1:9]),
#                  label =  as.matrix(myData[,10]),
#                  nrounds = 100,
#                  verbose = 0)

## ---- eval = FALSE------------------------------------------------------------
#  # predict function for GBM
#  pFun <- function(fit, data, ...) predict(fit, as.matrix(data[,1:9]))
#  
#  set.seed(1701)
#  viviGBst <- vivi(fit = gbst,
#                   data = myData,
#                   response = "y",
#                   reorder = FALSE,
#                   normalized = FALSE,
#                   predictFun = pFun,
#                   gridSize = 50,
#                   nmax = 500)

## ---- eval = FALSE------------------------------------------------------------
#  viviHeatmap(mat = viviGBst)

## ---- echo = F,  out.width = '100%', fig.align='center'-----------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/gbmheat.png")

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  rfEmbedded <- randomForest(y ~ ., data = myData, importance = TRUE)

## ---- eval = FALSE------------------------------------------------------------
#  viviRfEmbedded <- vivi(fit = rfEmbedded,
#                         data = myData,
#                         response = "y",
#                         importanceType = "%IncMSE")

## ---- eval = FALSE------------------------------------------------------------
#  library("ranger")
#  rang <- ranger(y~., data = myData, importance = 'impurity')

## ---- eval = FALSE------------------------------------------------------------
#  viviRangEmbedded <- vivi(fit = rang,
#                           data = myData,
#                           response = "y",
#                           importanceType = "impurity")

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  rfClassif <- ranger(Species~ ., data = iris, probability = T,
#                      importance = "impurity")
#  
#  set.seed(101)
#  viviClassif  <- vivi(fit = rfClassif,
#                       data = iris,
#                       response = "Species",
#                       gridSize = 10,
#                       nmax = 50,
#                       reorder = TRUE,
#                       class = "setosa")

## ---- eval = FALSE------------------------------------------------------------
#  viviHeatmap(mat = viviClassif)
#  viviNetwork(mat = viviClassif)

## ---- echo = F,  out.width = '100%', fig.align='center'-----------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/classifVIVI.png")

## ---- eval = FALSE------------------------------------------------------------
#  set.seed(1701)
#  pdpPairs(data = iris,
#           fit = rfClassif,
#           response = "Species",
#           class = "setosa",
#           convexHull = T,
#           gridSize = 10,
#           nmax = 50)
#  

## ---- echo = F,  out.width = '80%', fig.align='center'------------------------
knitr::include_graphics("https://raw.githubusercontent.com/AlanInglis/vivid/master/vividVigplots/classifGPDP.png")

## ---- eval = FALSE------------------------------------------------------------
#  
#  # Load libraries
#  library("vivid")
#  library("randomForest")
#  
#  # Create data based on the Friedman equation
#  set.seed(101)
#  genFriedman <- function(noFeatures = 10, noSamples = 100, sigma = 1) {
#  
#    # Create dataframe with named columns
#    df <- setNames(as.data.frame(matrix(runif(noSamples * noFeatures, 0, 1), nrow = noSamples),
#                                 stringsAsFactors = FALSE),
#                   paste0("x", 1:noFeatures))
#  
#    # Equation: y = 10sin(πx1x2) + 20(x3−0.5)^2 + 10x4 + 5x5 + ε
#  
#    df$y <- 10 * sin(pi * df$x1 * df$x2) +
#      20 * (df$x3 - 0.5)^2 +
#      10 * df$x4 +
#      5 * df$x5 +
#      rnorm(noSamples, sd = sigma) # error
#  
#    return(df)
#  }
#  
#  myData <- genFriedman(noFeatures = 9, noSamples = 350, sigma = 1)
#  
#  -------------------------------------------------------------------
#  
#  # Fit random forest using randomForest package
#  set.seed(1701)
#  rf <- randomForest(y ~ ., data = myData)
#  
#  -------------------------------------------------------------------
#  
#  # Run vivid
#  set.seed(1701)
#  viviRf  <- vivi(fit = rf,
#                  data = myData,
#                  response = "y",
#                  gridSize = 50,
#                  importanceType = "agnostic",
#                  nmax = 500,
#                  reorder = TRUE,
#                  predictFun = NULL,
#                  numPerm = 4,
#                  showVimpError = FALSE)
#  
#  -------------------------------------------------------------------
#  
#  # Visualisations:
#  
#  # 1.0: Heatmap
#  viviHeatmap(mat = viviRf)
#  
#  # 1.1: Heatmap with custom colour palettes and a small border around the diagonal importance values
#  viviHeatmap(mat = viviRf,
#              intPal = colorspace::sequential_hcl(palette = "Oslo", n = 100),
#              impPal = rev(colorspace::sequential_hcl(palette = "Reds 3", n = 100)),
#              border = T)
#  
#  # 2.0: Network
#  viviNetwork(mat = viviRf)
#  
#  # 2.1: Network with interactions below 0.12 are filtered.
#  # By default, unconnected nodes are displayed, however, they can be removed by setting `removeNode = T`
#  viviNetwork(mat = viviRf, intThreshold = 0.12, removeNode = FALSE)
#  viviNetwork(mat = viviRf, intThreshold = 0.12, removeNode = TRUE)
#  
#  # 2.3: Network clustered and with interactions thresholded
#  set.seed(1701)
#  # clustered and filtered network for rf
#  intVals <- viviRf
#  diag(intVals) <- NA
#  
#  
#  # select VIVI values in top 20%
#  impTresh <- quantile(diag(viviRf),.8)
#  intThresh <- quantile(intVals,.8,na.rm=TRUE)
#  sv <- which(diag(viviRf) > impTresh |
#                apply(intVals, 1, max, na.rm=TRUE) > intThresh)
#  
#  h <- hclust(-as.dist(viviRf[sv,sv]), method="single")
#  
#  # plot
#  viviNetwork(viviRf[sv,sv],
#              cluster = cutree(h, k = 3), # specify number of groups
#              layout = igraph::layout_as_star)
#  
#  # 3.0: PDP of the top 5 variables extracted from the vivi matrix and number of ICe curves set to 100
#  top5 <- colnames(viviRf)[1:5]
#  pdpVars(data = myData,
#          fit = rf,
#          response = 'y',
#          vars = top5,
#          nIce = 100)
#  
#  # 4.0: GPDP of the variables x1 to x5, with 100 ICE curves shown.
#  set.seed(1701)
#  pdpPairs(data = myData,
#           fit =  rf,
#           response = "y",
#           nmax = 500,
#           gridSize = 10,
#           vars = c("x1", "x2", "x3", "x4", "x5"),
#           nIce = 100)
#  
#  # 5.0: ZPDP of all variables
#  set.seed(1701)
#  pdpZen(data = myData, fit = rf, response = "y", nmax = 500, gridSize = 10)
#  
#  # 5.1: ZPDP where the `zpath` argument specifies the variables to be plotted. In this case, x1 to x5.
#  set.seed(1701)
#  pdpZen(data = myData,
#         fit = rf,
#         response = "y",
#         nmax = 500,
#         gridSize = 10,
#         zpath = c("x1", "x2", "x3", "x4", "x5"))
#  
#  # 5.2: ZPDP with the zpaths set with different parameters using the `zPath` function.
#  intVals <- viviRf
#  diag(intVals) <- NA
#  intThresh <- quantile(intVals, .90, na.rm=TRUE)
#  zpSw <- zPath(viv = viviRf, cutoff = intThresh, connect = FALSE, method = 'strictly.weighted')
#  
#  set.seed(1701)
#  pdpZen(data = myData,
#         fit = rf,
#         response = "y",
#         nmax = 500,
#         gridSize = 10,
#         zpath = zpSw)

## ---- eval = FALSE------------------------------------------------------------
#  library("vivid")
#  library("xgboost")
#  gbst <- xgboost(data = as.matrix(myData[,1:9]),
#                  label =  as.matrix(myData[,10]),
#                  nrounds = 100,
#                  verbose = 0)
#  
#  # predict function for GBM
#  pFun <- function(fit, data, ...) predict(fit, as.matrix(data[,1:9]))
#  
#  # run vivid
#  set.seed(1701)
#  viviGBst <- vivi(fit = gbst,
#                   data = myData,
#                   response = "y",
#                   reorder = FALSE,
#                   normalized = FALSE,
#                   predictFun = pFun,
#                   gridSize = 50,
#                   nmax = 500)
#  
#  # plot heatmap
#  viviHeatmap(mat = viviGBst)
#  

## ---- eval = FALSE------------------------------------------------------------
#  library("vivid")
#  library("randomForest")
#  library("ranger")
#  
#  # randomForest
#  # Note importance must be set to TRUE to use embedded importance scores.
#  set.seed(1701)
#  rfEmbedded <- randomForest(y ~ ., data = myData, importance = TRUE)
#  
#  # Using % increase in MSE as the importance metric in vivid
#  viviRfEmbedded <- vivi(fit = rfEmbedded,
#                         data = myData,
#                         response = "y",
#                         importanceType = "%IncMSE")
#  
#  # Plot Heatmap
#  viviHeatmap(mat = viviRfEmbedded)
#  
#  # ranger
#  # Note the importance metric is selected directly in ranger
#  rang <- ranger(y~., data = myData, importance = 'impurity')
#  
#  # Run vivid
#  viviRangEmbedded <- vivi(fit = rang,
#                           data = myData,
#                           response = "y",
#                           importanceType = "impurity")
#  
#  # Plot Heatmap
#  viviHeatmap(mat = viviRangEmbedded)

## ---- eval = FALSE------------------------------------------------------------
#  library("vivid")
#  library("randomForest")
#  
#  set.seed(1701)
#  rfClassif <- ranger(Species~ ., data = iris, probability = T,
#                      importance = "impurity")
#  
#  set.seed(101)
#  viviClassif  <- vivi(fit = rfClassif,
#                       data = iris,
#                       response = "Species",
#                       gridSize = 10,
#                       nmax = 50,
#                       reorder = TRUE,
#                       class = "setosa")
#  
#  viviHeatmap(mat = viviClassif)
#  
#  set.seed(1701)
#  pdpPairs(data = iris,
#           fit = rfClassif,
#           response = "Species",
#           class = "setosa",
#           convexHull = T,
#           gridSize = 10,
#           nmax = 50)

Try the vivid package in your browser

Any scripts or data that you put into this service are public.

vivid documentation built on Sept. 11, 2024, 6:12 p.m.