inst/doc/applications.R

## ----echo=F-------------------------------------------------------------------
### get knitr just the way we like it

knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  error = FALSE,
  tidy = FALSE,
  cache = FALSE
)


## -----------------------------------------------------------------------------
library(treemap)
data(GNI2014)
treemap(GNI2014,
       index=c("continent", "iso3"),
       vSize="population",
       vColor="GNI",
       type="value")

## -----------------------------------------------------------------------------
library(data.tree)
GNI2014$continent <- as.character(GNI2014$continent)
GNI2014$pathString <- paste("world", GNI2014$continent, GNI2014$country, sep = "/")
tree <- as.Node(GNI2014[,])
print(tree, pruneMethod = "dist", limit = 20)

## -----------------------------------------------------------------------------

tree$Europe$Switzerland$population


## -----------------------------------------------------------------------------
northAm <- tree$`North America`
Sort(northAm, "GNI", decreasing = TRUE)
print(northAm, "iso3", "population", "GNI", limit = 12)

## -----------------------------------------------------------------------------
maxGNI <- Aggregate(tree, "GNI", max)
#same thing, in a more traditional way:
maxGNI <- max(sapply(tree$leaves, function(x) x$GNI))

tree$Get("name", filterFun = function(x) x$isLeaf && x$GNI == maxGNI)

## -----------------------------------------------------------------------------
tree$Do(function(x) {
        x$population <- Aggregate(node = x,
        attribute = "population",
        aggFun = sum)
        }, 
     traversal = "post-order")

## -----------------------------------------------------------------------------
Sort(tree, attribute = "population", decreasing = TRUE, recursive = TRUE)


## -----------------------------------------------------------------------------
tree$Do(function(x) x$cumPop <- Cumulate(x, "population", sum))


## -----------------------------------------------------------------------------
print(tree, "population", "cumPop", pruneMethod = "dist", limit = 20)

## -----------------------------------------------------------------------------

tree$Do(function(x) x$origCount <- x$count)


## -----------------------------------------------------------------------------

myPruneFun <- function(x, cutoff = 0.9, maxCountries = 7) {
  if (isNotLeaf(x)) return (TRUE)
  if (x$position > maxCountries) return (FALSE)
  return (x$cumPop < (x$parent$population * cutoff))
}


## -----------------------------------------------------------------------------
treeClone <- Clone(tree, pruneFun = myPruneFun)
print(treeClone$Oceania, "population", pruneMethod = "simple", limit = 20)

## -----------------------------------------------------------------------------

treeClone$Do(function(x) {
  missing <- x$population - sum(sapply(x$children, function(x) x$population))
  other <- x$AddChild("Other")
  other$iso3 <- paste0("OTH(", x$origCount, ")")
  other$country <- "Other"
  other$continent <- x$name
  other$GNI <- 0
  other$population <- missing
},
filterFun = function(x) x$level == 2
)


print(treeClone$Oceania, "population", pruneMethod = "simple", limit = 20)


## -----------------------------------------------------------------------------
df <- ToDataFrameTable(treeClone, "iso3", "country", "continent", "population", "GNI")

treemap(df,
        index=c("continent", "iso3"),
        vSize="population",
        vColor="GNI",
        type="value")


## -----------------------------------------------------------------------------
plot(as.dendrogram(treeClone, heightAttribute = "population"))


## -----------------------------------------------------------------------------
fileName <- system.file("extdata", "portfolio.csv", package="data.tree")
pfodf <- read.csv(fileName, stringsAsFactors = FALSE)
head(pfodf)

## -----------------------------------------------------------------------------
pfodf$pathString <- paste("portfolio", 
                          pfodf$AssetCategory, 
                          pfodf$AssetClass, 
                          pfodf$SubAssetClass, 
                          pfodf$ISIN, 
                          sep = "/")
pfo <- as.Node(pfodf)


## -----------------------------------------------------------------------------
t <- Traverse(pfo, traversal = "post-order")
Do(t, function(x) x$Weight <- Aggregate(node = x, attribute = "Weight", aggFun = sum))

## -----------------------------------------------------------------------------

Do(t, function(x) x$WeightOfParent <- x$Weight / x$parent$Weight)


## -----------------------------------------------------------------------------

pfo$Do(function(x) x$Duration <- ifelse(is.null(x$Duration), 0, x$Duration), filterFun = isLeaf)
Do(t, function(x) x$Duration <- Aggregate(x, function(x) x$WeightOfParent * x$Duration, sum))


## -----------------------------------------------------------------------------
SetFormat(pfo, "WeightOfParent", function(x) FormatPercent(x, digits = 1))
SetFormat(pfo, "Weight", FormatPercent)

FormatDuration <- function(x) {
  if (x != 0) res <- FormatFixedDecimal(x, digits = 1)
  else res <- ""
  return (res)
}

SetFormat(pfo, "Duration", FormatDuration)


## -----------------------------------------------------------------------------

#Print
print(pfo, 
      "Weight", 
      "WeightOfParent",
      "Duration",
      filterFun = function(x) !x$isLeaf)


## -----------------------------------------------------------------------------
IsPure <- function(data) {
  length(unique(data[,ncol(data)])) == 1
}

## -----------------------------------------------------------------------------
Entropy <- function( vls ) {
  res <- vls/sum(vls) * log2(vls/sum(vls))
  res[vls == 0] <- 0
  -sum(res)
}

## -----------------------------------------------------------------------------

InformationGain <- function( tble ) {
  entropyBefore <- Entropy(colSums(tble))
  s <- rowSums(tble)
  entropyAfter <- sum (s / sum(s) * apply(tble, MARGIN = 1, FUN = Entropy ))
  informationGain <- entropyBefore - entropyAfter
  return (informationGain)
}

## -----------------------------------------------------------------------------
TrainID3 <- function(node, data) {
    
  node$obsCount <- nrow(data)
  
  #if the data-set is pure (e.g. all toxic), then
  if (IsPure(data)) {
    #construct a leaf having the name of the pure feature (e.g. 'toxic')
    child <- node$AddChild(unique(data[,ncol(data)]))
    node$feature <- tail(names(data), 1)
    child$obsCount <- nrow(data)
    child$feature <- ''
  } else {
    #calculate the information gain
    ig <- sapply(colnames(data)[-ncol(data)], 
            function(x) InformationGain(
              table(data[,x], data[,ncol(data)])
              )
            )
    #chose the feature with the highest information gain (e.g. 'color')
    #if more than one feature have the same information gain, then take
    #the first one
    feature <- names(which.max(ig))
    node$feature <- feature
    
    #take the subset of the data-set having that feature value
    
    childObs <- split(data[ ,names(data) != feature, drop = FALSE], 
                      data[ ,feature], 
                      drop = TRUE)
  
    for(i in 1:length(childObs)) {
      #construct a child having the name of that feature value (e.g. 'red')
      child <- node$AddChild(names(childObs)[i])
      
      #call the algorithm recursively on the child and the subset      
      TrainID3(child, childObs[[i]])
    }
    
  }
  
  

}

## -----------------------------------------------------------------------------
library(data.tree)
data(mushroom)
mushroom

## -----------------------------------------------------------------------------

tree <- Node$new("mushroom")
TrainID3(tree, mushroom)
print(tree, "feature", "obsCount")


## -----------------------------------------------------------------------------

Predict <- function(tree, features) {
  if (tree$children[[1]]$isLeaf) return (tree$children[[1]]$name)
  child <- tree$children[[features[[tree$feature]]]]
  return ( Predict(child, features))
}


## -----------------------------------------------------------------------------
Predict(tree, c(color = 'red', 
                size = 'large', 
                points = 'yes')
        )

## -----------------------------------------------------------------------------
fileName <- system.file("extdata", "jennylind.yaml", package="data.tree")
cat(readChar(fileName, file.info(fileName)$size))


## -----------------------------------------------------------------------------

library(data.tree)
library(yaml)
lol <- yaml.load_file(fileName)
jl <- as.Node(lol)
print(jl, "type", "payoff", "p")

## -----------------------------------------------------------------------------

payoff <- function(node) {
  if (node$type == 'chance') node$payoff <- sum(sapply(node$children, function(child) child$payoff * child$p))
  else if (node$type == 'decision') node$payoff <- max(sapply(node$children, function(child) child$payoff))
}

jl$Do(payoff, traversal = "post-order", filterFun = isNotLeaf)


## -----------------------------------------------------------------------------
decision <- function(x) {
  po <- sapply(x$children, function(child) child$payoff)
  x$decision <- names(po[po == x$payoff])
}

jl$Do(decision, filterFun = function(x) x$type == 'decision')



## -----------------------------------------------------------------------------

GetNodeLabel <- function(node) switch(node$type, 
                                      terminal = paste0( '$ ', format(node$payoff, scientific = FALSE, big.mark = ",")),
                                      paste0('ER\n', '$ ', format(node$payoff, scientific = FALSE, big.mark = ",")))

GetEdgeLabel <- function(node) {
  if (!node$isRoot && node$parent$type == 'chance') {
    label = paste0(node$name, " (", node$p, ")")
  } else {
    label = node$name
  }
  return (label)
}

GetNodeShape <- function(node) switch(node$type, decision = "box", chance = "circle", terminal = "none")


SetEdgeStyle(jl, fontname = 'helvetica', label = GetEdgeLabel)
SetNodeStyle(jl, fontname = 'helvetica', label = GetNodeLabel, shape = GetNodeShape)

## -----------------------------------------------------------------------------
jl$Do(function(x) SetEdgeStyle(x, color = "red", inherit = FALSE), 
      filterFun = function(x) !x$isRoot && x$parent$type == "decision" && x$parent$decision == x$name)



## ----eval = FALSE-------------------------------------------------------------
#  SetGraphStyle(jl, rankdir = "LR")
#  plot(jl)
#  

## -----------------------------------------------------------------------------
fileName <- system.file("extdata", "flare.json", package="data.tree")
flareJSON <- readChar(fileName, file.info(fileName)$size)
cat(substr(flareJSON, 1, 300))


## -----------------------------------------------------------------------------
library(jsonlite)
flareLoL <- fromJSON(file(fileName),
                     simplifyDataFrame = FALSE
                     )

flareTree <- as.Node(flareLoL, mode = "explicit", check = "no-warn")
flareTree$attributesAll
print(flareTree, "size", limit = 30)


## -----------------------------------------------------------------------------

flare_df <- ToDataFrameTable(flareTree, 
                             className = function(x) x$parent$name, 
                             packageName = "name", 
                             "size")
head(flare_df)


## ----eval = FALSE-------------------------------------------------------------
#  
#  devtools::install_github("jcheng5/bubbles@6724e43f5e")
#  library(scales)
#  library(bubbles)
#  library(RColorBrewer)
#  bubbles(
#    flare_df$size,
#    substr(flare_df$packageName, 1, 2),
#    tooltip = flare_df$packageName,
#    color = col_factor(
#      brewer.pal(9,"Set1"),
#      factor(flare_df$className)
#    )(flare_df$className),
#    height = 800,
#    width = 800
#  )

## -----------------------------------------------------------------------------
path <- ".."
files <- list.files(path = path, 
                    recursive = TRUE,
                    include.dirs = FALSE) 

df <- data.frame(
      filename = sapply(files, 
                        function(fl) paste0("data.tree","/",fl)
      ), 
      file.info(paste(path, files, sep = "/")),
      stringsAsFactors = FALSE
    )
 
print(head(df)[c(1,2,3,4)], row.names = FALSE)


## -----------------------------------------------------------------------------

fileStructure <- as.Node(df, pathName = "filename")
fileStructure$leafCount / (fileStructure$totalCount - fileStructure$leafCount)
print(fileStructure, "mode", "size", limit = 25)


## ----eval = FALSE-------------------------------------------------------------
#  
#  #This requires listviewer, which is available only on github
#  devtools::install_github("timelyportfolio/listviewer")
#  
#  library(listviewer)
#  
#  l <- ToListSimple(fileStructure)
#  jsonedit(l)
#  

## -----------------------------------------------------------------------------

#' @param children the number of children each population member has
#' @param probSex the probability of the sex of a descendant
#' @param probInherit the probability the feature is inherited, depending on the sex of the descendant
#' @param probDevelop the probability the feature is developed (e.g. a gene defect), depending on the sex
#' of the descendant
#' @param generations the number of generations our simulated population should have
#' @param parent for recursion
GenerateChildrenTree <- function(children = 2, 
                                 probSex = c(male = 0.52, female = 0.48), 
                                 probInherit = c(male = 0.8, female = 0.5),
                                 probDevelop = c(male = 0.05, female = 0.01),
                                 generations = 3, 
                                 parent = NULL) {
  
  if (is.null(parent)) {
    parent <- Node$new("1")
    parent$sex <- 1
    parent$feature <- TRUE
    parent$develop <- FALSE
  }
  
  #sex of descendants
  #1 = male
  #2 = female
  sex <- sample.int(n = 2, size = children, replace = TRUE, prob = probSex)
  for (i in 1:children) child <- parent$AddChild(i)
  Set(parent$children, sex = sex)
  
  #inherit
  if (parent$feature == TRUE) {
    for (i in 1:2) {
      subPop <- Traverse(parent, filterFun = function(x) x$sex == i)
      inherit <- sample.int(n = 2, 
                            size = length(subPop), 
                            replace = TRUE, 
                            prob = c(1 - probInherit[i], probInherit[i]))
      
      Set(subPop, feature = as.logical(inherit - 1))
    }
  } else {
    Set(parent$children, feature = FALSE)
  }
  
  #develop
  Set(parent$children, develop = FALSE)
  for (i in 1:2) {
    subPop <- Traverse(parent, filterFun = function(x) x$sex == i && !x$feature)
    develop <- sample.int(n = 2, 
                          size = length(subPop), 
                          replace = TRUE, 
                          prob = c(1 - probDevelop[i], probDevelop[i]))
    Set(subPop, feature = as.logical((develop - 1)), develop = as.logical((develop - 1)))
  }
  
  #recursion to next generation
  if (generations > 0) for (i in 1:children) GenerateChildrenTree(children, 
                                                                  probSex, 
                                                                  probInherit, 
                                                                  probDevelop, 
                                                                  generations - 1, 
                                                                  parent$children[[i]])
  
  return (parent)
}


## -----------------------------------------------------------------------------

tree <- GenerateChildrenTree()
print(tree, "sex", "feature", "develop", limit = 20)


## -----------------------------------------------------------------------------
tree$totalCount

## -----------------------------------------------------------------------------
length(Traverse(tree, filterFun = function(x) x$feature))

## -----------------------------------------------------------------------------
length(Traverse(tree, filterFun = function(x) x$sex == 1 && x$develop))

## -----------------------------------------------------------------------------
FreqLastGen <- function(tree) {
  l <- tree$leaves
  sum(sapply(l, function(x) x$feature))/length(l)
}

FreqLastGen(tree)


## -----------------------------------------------------------------------------
system.time(x <- sapply(1:100, function(x) FreqLastGen(GenerateChildrenTree())))

## -----------------------------------------------------------------------------
hist(x, probability = TRUE, main = "Frequency of feature in last generation")

## ----eval = FALSE-------------------------------------------------------------
#  library(foreach)
#  library(doParallel)
#  registerDoParallel(makeCluster(3))
#  #On Linux, there are other alternatives, e.g.: library(doMC);  registerDoMC(3)
#  
#  system.time(x <- foreach (i = 1:100, .packages = "data.tree") %dopar% FreqLastGen(GenerateChildrenTree()))
#  stopImplicitCluster()

## ----echo = FALSE-------------------------------------------------------------

print(c(user = 0.07, system = 0.02, elapsed = 1.40))

## -----------------------------------------------------------------------------

attributes <- expand.grid(letters[1:3], 1:3)
attributes


## -----------------------------------------------------------------------------
ttt <- Node$new("ttt")

#consider rotation, so first move is explicit
ttt$AddChild("a3")
ttt$a3$f <- 7
ttt$AddChild("b3")
ttt$b3$f <- 8
ttt$AddChild("b2")
ttt$b2$f <- 5


ttt$Set(player = 1, filterFun = isLeaf)



## -----------------------------------------------------------------------------
AddPossibleMoves <- function(node) {
  t <- Traverse(node, traversal = "ancestor", filterFun = isNotRoot)
  
  available <- rownames(attributes)[!rownames(attributes) %in% Get(t, "f")]
  for (f in available) {
    child <- node$AddChild(paste0(attributes[f, 1], attributes[f, 2]))
    child$f <- as.numeric(f)
    child$player <- ifelse(node$player == 1, 2, 1)
    hasWon <- HasWon(child)
    if (!hasWon && child$level <= 10) AddPossibleMoves(child)
    if (hasWon) {
      child$result <- child$player
      print(paste("Player ", child$player, "wins!"))
    } else if(child$level == 10) {
      child$result <- 0
      print("Tie!")
    }
    
  }
  return (node)  
}


## -----------------------------------------------------------------------------
HasWon <- function(node) {
  t <- Traverse(node, traversal = "ancestor", filterFun = function(x) !x$isRoot && x$player == node$player)
  mine <- Get(t, "f")
  mineV <- rep(0, 9)
  mineV[mine] <- 1
  mineM <- matrix(mineV, 3, 3, byrow = TRUE)
  result <- any(rowSums(mineM) == 3) ||
    any(colSums(mineM) == 3) ||
    sum(diag(mineM)) == 3 ||
    sum(diag(t(mineM))) == 3
  return (result)
}


## ----eval=FALSE---------------------------------------------------------------
#  system.time(for (child in ttt$children) AddPossibleMoves(child))

## ----echo= FALSE--------------------------------------------------------------
c(user = 345.645, system = 3.245, elapsed = 346.445)

## ----eval = FALSE-------------------------------------------------------------
#  ttt$leafCount

## ----echo = FALSE-------------------------------------------------------------
89796

## ----eval = FALSE-------------------------------------------------------------
#  ttt$totalCount

## ----echo = FALSE-------------------------------------------------------------
203716

## ----eval = FALSE-------------------------------------------------------------
#  mean(ttt$Get(function(x) x$level - 1, filterFun = isLeaf))

## ----echo = FALSE-------------------------------------------------------------
8.400775

## ----eval = FALSE-------------------------------------------------------------
#  ttt$averageBranchingFactor

## ----echo = FALSE-------------------------------------------------------------
1.788229

## ----eval = FALSE-------------------------------------------------------------
#  
#  winnerOne <- Traverse(ttt, filterFun = function(x) x$isLeaf && x$result == 1)
#  winnerTwo <- Traverse(ttt, filterFun = function(x) x$isLeaf && x$result == 2)
#  ties <- Traverse(ttt, filterFun = function(x) x$isLeaf && x$result == 0)
#  
#  c(winnerOne = length(winnerOne), winnerTwo = length(winnerTwo), ties = length(ties))

## ----echo=FALSE---------------------------------------------------------------
c(winnerOne = 39588, winnerTwo = 21408, ties = 28800)

## -----------------------------------------------------------------------------

PrintBoard <- function(node) {
  mineV <- rep(0, 9)

  
  t <- Traverse(node, traversal = "ancestor", filterFun = function(x) !x$isRoot && x$player == 1)
  field <- Get(t, "f")
  value <- Get(t, function(x) paste0("X", x$level - 1))
  mineV[field] <- value
  
  t <- Traverse(node, traversal = "ancestor", filterFun = function(x) !x$isRoot && x$player == 2)
  field <- Get(t, "f")
  value <- Get(t, function(x) paste0("O", x$level - 1))
  mineV[field] <- value
    
  mineM <- matrix(mineV, 3, 3, byrow = TRUE)
  rownames(mineM) <- letters[1:3]
  colnames(mineM) <- as.character(1:3)
  mineM
}


## ----eval = FALSE-------------------------------------------------------------
#  
#  PrintBoard(ties[[1]])
#  

## ----echo = FALSE-------------------------------------------------------------
mt <- matrix(c("O2", "X3", "O4", "X5", "O6", "X7", "X1", "O8", "X9"), nrow = 3, ncol = 3, byrow = TRUE)
rownames(mt) <- letters[1:3]
colnames(mt) <- as.character(1:3)
mt

## ----eval=FALSE---------------------------------------------------------------
#  
#  
#  AnalyseTicTacToe <- function(subtree) {
#    # 1. create sub-tree
#    AddPossibleMoves(subtree)
#    # 2. run the analysis
#    winnerOne <- Traverse(subtree, filterFun = function(x) x$isLeaf && x$result == 1)
#    winnerTwo <- Traverse(subtree, filterFun = function(x) x$isLeaf && x$result == 2)
#    ties <- Traverse(subtree, filterFun = function(x) x$isLeaf && x$result == 0)
#  
#    res <- c(winnerOne = length(winnerOne),
#             winnerTwo = length(winnerTwo),
#             ties = length(ties))
#    # 3. return the result
#    return(res)
#  }
#  
#  
#  library(foreach)
#  library(doParallel)
#  registerDoParallel(makeCluster(3))
#  #On Linux, there are other alternatives, e.g.: library(doMC);  registerDoMC(3)
#  system.time(
#    x <- foreach (child = ttt$children,
#                  .packages = "data.tree") %dopar% AnalyseTicTacToe(child)
#  )

## ----echo= FALSE--------------------------------------------------------------
c(user = 0.05, system = 0.04, elapsed = 116.86)

## ----eval = FALSE-------------------------------------------------------------
#  stopImplicitCluster()
#  # 4. aggregate results
#  rowSums(sapply(x, c))

## ----echo=FALSE---------------------------------------------------------------
c(winnerOne = 39588, winnerTwo = 21408, ties = 28800)

Try the data.tree package in your browser

Any scripts or data that you put into this service are public.

data.tree documentation built on Nov. 13, 2023, 1:08 a.m.