scratchwork/RF4_NLonWholedata.R

#Rotation forest with various embedding
#Writer: Haedong Kim
#Begin: ?
#Fixed: SEP 1, 2016
#Fixed: SEP 11, 2016

#library(rotationForest)

install.packages("Matrix", dependencies = TRUE)
install.packages("vegan", dependencies = TRUE)
install.packages("Rtsne", dependencies = TRUE)
install.packages("lle", dependencies = TRUE)
install.packages("rpart", dependencies = TRUE)
install.packages("randomForest", dependencies = TRUE)
install.packages("caret", dependencies = TRUE)

library(Matrix)
library(vegan)
library(Rtsne)
library(lle)
library(rpart)
library(randomForest)
library(caret)

#########################################################
#function 1 
#get random k subsets from the original data X
#subsets are differ in features but have same number of records
#input 
#x: independent variables
#y: dependent variable (should be a factor)
#k: number of features which are in each subset             
#output: List of subsets with randomly selected features
#########################################################

fn1_getRandomSubset = function(x, y, k){
  numSubset <- round(length(x)/k)
  subsetList <- list()
  subsetIdx <- 0
  
  #check that number of subserts is proper
  if(numSubset <= length(x)){
    subsetIdx <- numSubset
  }else{
    subsetIdx <- length(x)
  }
  
  #give features names for indentifing the original sequence of features
  colnames(x) <- paste0("X", 1:length(x))
  
  #shuffling features randomly
  x <- subset(x, select = sample(1:length(x))) 
  
  #pick out k-1 subsets
  i <-  1
  while(i<numSubset){
    subsetList[[i]] <- cbind(x[ , 1:numSubset], y)
    x <- x[ , -(1:numSubset)]
    i <- i+1
  }
  #pick out the last subset (kth subset)
  # The last kth subset may include more or less records than the other subsets 
  # when the number of subets is not an integer
  subsetList[[numSubset]] <- cbind(x, y)
  return(subsetList)
}


#########################################################
#function 2: bootstrapping
#input
#subsetList: list of randomly generated subsets
#bootstrapRate: bootstrapping rate
#output: bootstrapped subset list
#########################################################
fn2_bootstrap = function(subsetList, bootstrapRate){
  bootIdxList <- list()
  bootSubsetList <- list()
  
  for(i in 1:length(subsetList)){#for-loop begins
    #bootstrap sampling
    numRecord <- nrow(data.frame(subsetList[[i]]))
    bootIdxList[[i]] <- sample(1:numRecord, round(bootstrapRate*numRecord), replace = TRUE)
    
    #save a bootstrapped subset in the 'bootSubsetList'
    bootSubsetList[[i]] <- data.frame(subsetList[[i]][bootIdxList[[i]], ])
    names(bootSubsetList)[i] <- paste0("bootstraped", i)
  }#for-loop ends
  return(bootSubsetList)   
}


#########################################################
#function 3: PCA
#input
#bootSubsetList: bootstrapped subset list
#x training x of the original data
#y training y of the original data
#output: matrix constructed by projecting original data X onto a rotation matrix
#########################################################

fn3_RmxPCA = function(bootSubsetList){
  bootSubsetListRd <- lapply(bootSubsetList, function(x){subset(x, select = -y)})
  PCAcomp <- lapply(bootSubsetListRd, function(x){prcomp(x, center = TRUE)$rotation})
  PCAcompT <- lapply(PCAcomp, t)
  PCArrangement <- as.vector(unlist(lapply(PCAcompT, colnames)))
  #'bdiag' is a function included in the R package 'Matrix'
  #It returns block-shaped diagonal matrix which is sparse
  rotationMatrix <- as.matrix(do.call(bdiag, lapply(PCAcompT, as.matrix)))
  colnames(rotationMatrix) <- PCArrangement
  rotationMatrix <- subset(rotationMatrix, select = sort(PCArrangement))
  return(as.matrix(rotationMatrix))
}

#########################################################
#function 4: non-linear embedding
#input
#subetList: subset list
#y: training y of the original data
#dimRdcMethod: name of a non-linear embedding
#output: matrix embedded by non-linear methods; LLE, t-SNE, ISOMAP
#########################################################
#bootSubsetList <- fn1_getRandomSubset(x = trainingX, y = trainingY, k = 2)
fn4_NL = function(bootSubsetList, dimRdcMethod, seed){
  bootSubsetListx <- lapply(bootSubsetList, function(x){subset(x, select = c(-y))})
  bootSubsetListy <- lapply(bootSubsetList, function(x){subset(x, select = c(y))})
  
  #saving variable arrangement for rearranging as the original sequence after dimensionality reduction
  varArrangement <- lapply(bootSubsetListx, colnames)
  
  switch (dimRdcMethod,
          LLE = {
            set.seed(seed)
            LLEresult <- lapply(bootSubsetListx, function(x){
              lle(x, m = length(x), k = 10, reg = 2, id = TRUE)})
            NLcomp <- lapply(LLEresult, function(x){x$Y})
            for(i in 1:length(NLcomp)){
              #colnames(NLcomp[[i]]) <- varArrangement[[i]]
              NLcomp[[i]] <- cbind(data.frame(NLcomp[[i]]), y = bootSubsetListy[[i]])
            }
          },#LLE ends
          tSNE = { 
            bootSubsetListxUni <- list()
            bootSubsetListyUni <- list()
            #make duplicated records unique before running t-SNE
            i <- 1
            if(sum(unlist(lapply(bootSubsetListx, duplicated))) != 0){
              for(i in 1:length(bootSubsetListx)){
                dupliIdx <- which(duplicated(bootSubsetListx[[i]]))
                bootSubsetListxUni[[i]] <- bootSubsetListx[[i]][-dupliIdx, ]
                bootSubsetListyUni[[i]] <- bootSubsetListy[[i]][-dupliIdx, ]
              }
            } else{
              bootSubsetListxUni[[i]] <- bootSubsetListx[[i]]
              bootSubsetListyUni[[i]] <- bootSubsetListy[[i]]
            }
            #determin value of perplexity
            #If the number of rows is less than three times of dafault perplexity (30), it give an error
            #pick out the number of first bootSubset's rows because every bootstrapped subsets has same number of rows
            if(nrow(bootSubsetListxUni[[1]])-1 < 3*30){
              set.seed(seed)
              tsneOut <- lapply(bootSubsetListxUni, function(x){Rtsne(x, dims = length(x), perplexity = floor((nrow(x)-1)/3))})
            } else{
              set.seed(seed)
              tsneOut <- lapply(bootSubsetListxUni, function(x){Rtsne(x, dims = length(x))})
            }
            NLcomp <- lapply(tsneOut, function(x){x$Y})
            #put variable names on non-linearly embedded data (i.e. NLcomp)
            #and dependent variable (i.e. y) also
            for(i in 1:length(NLcomp)){
              #colnames(NLcomp[[i]]) <- varArrangement[[i]]
              NLcomp[[i]] <- cbind(data.frame(NLcomp[[i]]), y = bootSubsetListyUni[[i]])
            }
          }#tSNE ends
  )#switch ends
  return(NLcomp)
}


#########################################################
#function 5: PCA rotation forest
#input: x, y, k, bootstrap rate
#output: 
#########################################################
fn5_PCArf = function(x, y, k, bootstrapRate, numTrees, testX, testY){
  PCArfList <- list()
  RmxList <- list()
  resultsList <- list()
  xRyList <- list()
  RmxTestList <- list()
  
  colnames(testX) <- paste0("X", 1:length(testX))
  for(i in 1:numTrees){
    #generate a rotation matrix
    subsetList <- fn1_getRandomSubset(x = x, y = y, k = k)
    bootSubsetList <-  fn2_bootstrap(subsetList = subsetList, bootstrapRate = bootstrapRate)
    RmxPCA <- fn3_RmxPCA(bootSubsetList)
    
    #build a tree model and then put it in the list; PCArfList
    xRy <- cbind(data.frame(as.matrix(x) %*% RmxPCA), y)
    PCArfList[[i]] <- rpart(y ~., method = "class", data = xRy)
    RmxList[[i]] <- RmxPCA
    
    #for plotting
    xRyList[[i]] <- xRy
    RmxTestList[[i]] <- as.matrix(testX) %*% RmxList[[i]]
    
    #prediction
    resultsList[[i]] <- predict(PCArfList[[i]], newdata = data.frame(as.matrix(testX) %*% RmxList[[i]]), type = "prob")
  }
  #aggregating the results of each number of trees; mean of probability of classes
  aggResults <- Reduce("+", resultsList)/numTrees
  #pick out the class which has the highest probability result from the aggregating
  prdtResult <- factor(apply(aggResults, 1, function(x){colnames(aggResults)[which(x == max(x))]}),
                       levels = colnames(aggResults))
  
  lastOut <- list(confusionMatrix(prdtResult, testY), xRyList)
  return(lastOut)
}

#########################################################
#function 6: non-linear rotation forest 
#########################################################
#x <- trainingX
#y <- trainingY
#dimRdcMethod <- "tSNE"

fn6_NLrf = function(x, y, k, dimRdcMethod, numTrees, testX, testY, seed){
  #initialization
  NLrfList <- list()
  resultsList <- list()
  modelContainer <- list()
  resultContainer <- list()
  
  #preparing test data
  colnames(testX) <- paste0("X", 1:length(testX))
  testData <- cbind(data.frame(testX), y = testY)
  NLtestData <- data.frame(fn4_NL(list(testData), dimRdcMethod, seed))
  NLtestDataX <- subset(NLtestData, select = c(-y))
  NLtestDataY <- as.factor(unlist(subset(NLtestData, select = c(y))))
  
  #preparing training data
  colnames(x) <- paste0("X", 1:length(x))
  trainingData <- cbind(data.frame(x), y = y)
  NLtraining <- data.frame(fn4_NL(list(trainingData), dimRdcMethod, seed))
  NLtrainingX <- subset(NLtraining, select = c(-y))
  NLtrainingY <- as.factor(unlist(subset(NLtraining, select = c(y))))
  
  for(iTree in 1:numTrees){
    subsetList <- fn1_getRandomSubset(x = NLtrainingX, y = NLtrainingY, k = 2)
    bootSubsetList <- fn2_bootstrap(subsetList, bootstrapRate = 1)
    
    for(j in 1:length(bootSubsetList)){
      #fitting a tree model
      modelContainer[[j]] <- rpart(y ~ ., data = bootSubsetList[[j]], method = "class")
      #saving an immediate result
      resultContainer[[j]] <- predict(modelContainer[[j]], newdata = NLtestDataX, type = "prob")
    }#2nd for-loop end
    #Save "trees" getting from one bootstrapped subsets (i.e. one iteration)
    #It is different with the rotation forest with PCA such that PCA one fit only one tree per each iteration
    NLrfList[[iTree]] <- modelContainer
    aggResultTree <- Reduce('+', resultContainer)/length(bootSubsetList)
    resultsList[[iTree]] <- aggResultTree
  }#1st for-loop ends
  aggResultForest <- Reduce('+', resultsList)/numTrees
  prtdResult <- factor(apply(aggResultForest, 1, function(x){
    colnames(aggResultForest)[which(x == max(x))]}), levels = colnames(aggResultForest))
  
  #conveting testY into a factor
  NLtestDataY <- factor(NLtestDataY, levels = colnames(aggResultForest))
  lastOut <- list(confusionMatrix(prtdResult, NLtestDataY), NLtraining, NLtestData)
  return(lastOut)
}


#########################################################
#main function
#########################################################
#Data preparation
#Trial 1: Iris data
trialData = datasets::iris
#Trial 2: Binary taget data
setwd("C:\\Users\\DSBA-0\\Google Drive\\R16RF\\data")
trialData = read.csv("data13.csv", stringsAsFactors = FALSE)
str(trialData)
head(trialData, 5)

#How many times do you like to repeat the experiment?
itrExp <- 30
i <- 1
PCAout <- list()
tSNEout <- list()
LLEout <- list()
for(i in 1:itrExp){
  #get indices for training data
  trainingIdx <- sample(1:nrow(trialData), round(0.7*(nrow(trialData))))
  
  #training data
  trainingData <- trialData[trainingIdx, ]
  trainingX <- trainingData[, 2:5]
  trainingY <- as.factor(trainingData[, 6])
  
  #test data
  testData <- trialData[-trainingIdx, ]
  testX <- testData[, 2:5]
  testY <- as.factor(testData[, 6])
  
  # RF with PCA
  cat(i, "th PCA", "\n")
  PCAout[[i]] <- fn5_PCArf(x = trainingX, y = trainingY, k = 2, bootstrapRate = 0.75,
                           numTrees = 30, testX, testY)
  
  #RF with NL
  cat(i, "th tSNE", "\n")
  tSNEout[[i]] <- fn6_NLrf(x = trainingX, y = trainingY, k = 2, dimRdcMethod = "tSNE",
                           numTrees = 30, testX, testY, seed = i)
  
  # cat(i, "th LLE", "\n")
  # LLEout[[i]] <- fn6_NLrf(x = trainingX, y = trainingY, k = 2, dimRdcMethod = "LLE",
  #                         numTrees = 30, testX, testY, seed = i)
}

PCAacc <- lapply(PCAout, function(x){x[[1]]$overall})
Reduce('+', PCAacc)[1]/30
PCAexp <- Reduce(rbind, PCAacc)

tSNEacc <- lapply(tSNEout, function(x){x[[1]]$overall})
Reduce('+', tSNEacc)[1]/30
tSNEexp <- Reduce(rbind, tSNEacc)

LLEacc <- lapply(LLEout, function(x){x[[1]]$overall})
Reduce('+', LLEacc)[1]/30
LLEexp <- Reduce(rbind, LLEacc)

#export the results
setwd("C:\\Users\\DSBA-0\\Google Drive\\R16RF\\results\\RF4_NLwholeData")
save(PCAout, file = "PCAout_data13.rda")
save(tSNEout, file = "tSNEout_data13.rda")
save(LLEout, file = "LLEout.rda")

write.csv(PCAexp, "PCAacc30_data13.csv", row.names = FALSE)
write.csv(tSNEexp, "tSNEacc30_data13.csv", row.names = FALSE)
write.csv(LLEexp, "LLEacc30.csv", row.names = FALSE)
haedong31/rotation_forest documentation built on Nov. 4, 2019, 1:26 p.m.