Nothing
#' Encode categorical variables using split information of CART
#'
#' @param targetVariable target variable that we want to predict.
#' @param trainData training data.
#' @param testData testing data.
#' @param problemType classification or regression.
#' @param datasetName Name of the dataset, could be any string name.
#' @param catVariables List of categorical variables in the dataset.
#' @return dataframe that is the encoding of categorical variables.
#' @name catSplitEncoding
#' @import rpart
#' @import caret
#' @import utils
#' @import data.table
#' @import OpenML
#' @import stringr
#' @import farff
#' @importFrom dplyr filter
#' @importFrom stats na.omit predict
#' @export catSplitEncoding
#' @example man/example.R
catSplitEncoding <- function(targetVariable, trainData, testData, problemType, datasetName, catVariables){
formula <- paste(targetVariable, "~.", collapse = "")
datasetNameTxt <- paste0(datasetName, ".txt", collapse = "")
catVariablesWithSpaces <- list()
for(cat in catVariables){
catVariablesWithSpaces <- append(catVariablesWithSpaces, paste0(cat, " ",""))
}
if(problemType == "regression"){
# Fit rpart
tree_fit = rpart(formula, trainData, method = 'anova', control = rpart.control(cp=0,maxdepth=5, usesurrogate = 2))
} else if(problemType == "classification"){
tree_fit = rpart(formula, trainData, method = 'class', control = rpart.control(cp=0,maxdepth=5,usesurrogate = 2))
}
# Get the summary file of tree_fit
#summary(tree_fit, file = datasetNameTxt)
allLines = capture.output(summary(tree_fit))
# Get a list of ordered unique names from rpart for all categorical variables
# Resulting list is a named list, names being categorical variable names
uniqueSortedNames <- list()
for(i in 1:length(names(attr(tree_fit,"xlevels")))){
names <- names(attr(tree_fit,"xlevels"))
uniqueSortedNames[[names[i]]] <- data.table(lev = attr(tree_fit,"xlevels")[[i]])
}
# Read summary file
#allLines <- readLines(datasetNameTxt)
# Filter allLines and remove the unnecessary lines
# unnecessary lines being the node numbers which do not have primary or surrogate split information
# or predicted class, variable importance etc.
necessaryLines <- c()
necessaryLinescheckList <- unlist(list("Node number", "Primary splits", "Surrogate splits", catVariablesWithSpaces))
for(i in 1:length(allLines)){
if(grepl(paste(necessaryLinescheckList, collapse="|"), allLines[i])){
necessaryLines <- append(necessaryLines, trimws(allLines[i]))
}
}
necessaryLines[1] <- NA
necessaryLinesCopy <- necessaryLines
indicestoRemove <- list()
for(i in 1:(length(necessaryLinesCopy)-1)){
if(grepl("Node number", necessaryLinesCopy[[i]]) && grepl("Node number", necessaryLinesCopy[[i + 1]]) ){
indicestoRemove <- append(indicestoRemove, i)
}
}
for(i in indicestoRemove){
necessaryLines[[i]] <- NA
}
necessaryLines <- necessaryLines[!is.na(necessaryLines)]
if(grepl("Node number", necessaryLines[length(necessaryLines)])){
necessaryLines[length(necessaryLines)] <- NA
}
# necessaryLines consists of node numbers, primary and surrogate split information
necessaryLines <- necessaryLines[!is.na(necessaryLines)]
#-----------------------------------------------------------------------------------------------
# SPLIT necessarylines into Primary and Surrogate Splits
# PRIMARY SPLITS
primaryfullNames <- stack(Filter(Negate(is.null),
lapply(split(necessaryLines, cumsum(grepl('Node number', necessaryLines))),
function(x) {
x1 <- sub(",.*", "", x[grep('improve', x)])
if(length(x1) > 0) paste(x1, strsplit(x[1],":")[[1]][1])
})))[2:1][[2]]
primaryfullNamesModified <- list()
for(name in primaryfullNames){
primaryfullNamesModified <- append(primaryfullNamesModified, paste(c(strsplit(name, "\\s+")[[1]][c(1,5,6,7)],"primary"), collapse = " "))
}
primaryInfo <- list()
primarygeneralName <- list()
primaryPattern <- "splits as\\s*(.*?)\\s,*improve"
for(i in necessaryLines){
if(grepl("improve",i)){
#print(i)
name <- strsplit(i, "\\s+")[[1]][1]
#print(name)
primarygeneralName <- append(primarygeneralName, name)
primaryInfo <- append(primaryInfo, trimws(sub(",","",regmatches(i, regexec(primaryPattern, i))[[1]][2])))
}
}
primaryfullNamesModified <- unlist(primaryfullNamesModified)
primaryInfo <- unlist(primaryInfo)
names(primaryInfo) <- unlist(primaryfullNamesModified)
primaryInfo <- lapply(split(x = primaryInfo, f = sapply(strsplit(names(primaryInfo)," ") , "[", 1)), unlist)
# SURROGATE SPLITS
surrogatefullNames <- stack(Filter(Negate(is.null),
lapply(split(necessaryLines, cumsum(grepl('Node number', necessaryLines))),
function(x) {
x1 <- sub(",.*", "", x[grep('agree', x)])
if(length(x1) > 0) paste(x1, strsplit(x[1],":")[[1]][1])
})))[2:1][[2]]
surrogatefullNamesModified <- list()
for(name in surrogatefullNames){
surrogatefullNamesModified <- append(surrogatefullNamesModified, paste(c(strsplit(name, "\\s+")[[1]][c(1,5,6,7)], "surrogate"), collapse = " "))
}
surrogateInfo <- list()
surrogategeneralName <- list()
surrogatePattern <- "splits as\\s*(.*?)\\s,*agree"
for(i in necessaryLines){
if(grepl("agree",i)){
name <- strsplit(i, "\\s+")[[1]][1]
surrogategeneralName <- append(surrogategeneralName, name)
surrogateInfo <- append(surrogateInfo, trimws(sub(",","",regmatches(i, regexec(surrogatePattern, i))[[1]][2])))
}
}
surrogatefullNamesModified <- unlist(surrogatefullNamesModified)
surrogateInfo <- unlist(surrogateInfo)
names(surrogateInfo) <- unlist(surrogatefullNamesModified)
surrogateInfo <- lapply(split(x = surrogateInfo, f = sapply(strsplit(names(surrogateInfo)," ") , "[", 1)), unlist)
#print(surrogateInfo)
#---------------------------------------------------------------------------
# Convert Primary and Surrogate Splits into Separate Dataframes
## PRIMARY
primaryDF <- c()
for(i in 1:length(names(primaryInfo))){
subDf <- c()
for(j in 1:length(primaryInfo[[i]])) {
subDf[[j]] <- data.table(strsplit(unname(trimws(primaryInfo[[i]])),"")[[j]])
}
subDf <- data.frame(subDf)
setnames(subDf, names(primaryInfo[[i]]))
primaryDF[[names(primaryInfo)[[i]]]] <- subDf
}
# Bring unique sorted names next to each categorical column name
primaryDfModified <- c()
for(i in names(uniqueSortedNames)){
for(j in names(primaryDF)){
if(i == j){
primaryDfModified[[i]] <- cbind(primaryDF[[j]], uniqueSortedNames[[i]])
}
}
}
# Map the columns to the dataframe
primaryDFNames <- names(primaryDfModified)
# PrimaryOut is the dataframe with dat and primary columns added
primaryOut <- trainData
primaryOut$id <- 1:nrow(primaryOut)
for(i in seq_along(primaryDFNames)) {
primaryOut <- merge(primaryOut, primaryDfModified[[primaryDFNames[i]]], all.x = TRUE,
by.x = primaryDFNames[i], by.y = 'lev')
}
primaryOut <- primaryOut[order(primaryOut$id), ]
primaryOut <- data.frame(primaryOut, check.names = F)
# Test Data
primaryOutTest <- testData
primaryOutTest$id <- 1:nrow(primaryOutTest)
for(i in seq_along(primaryDFNames)) {
primaryOutTest <- merge(primaryOutTest, primaryDfModified[[primaryDFNames[i]]], all.x = TRUE,
by.x = primaryDFNames[i], by.y = 'lev')
}
primaryOutTest <- primaryOutTest[order(primaryOutTest$id), ]
primaryOutTest <- data.frame(primaryOutTest, check.names = F)
# Column that do not have the value "-"
primaryColumnstoBeEdited <- list()
for(i in 1:length(names(primaryDF))){
uniqueValues <- lapply(primaryDF[[i]],unique)
#print(uniqueValues)
notFullsub <- c()
for(j in 1:length(uniqueValues)){
if("-" %in% uniqueValues[[j]]){
notFullsub <- append(notFullsub, names(uniqueValues)[[j]])
}
}
primaryColumnstoBeEdited[[names(primaryDF)[[i]]]] <- notFullsub
}
# For primary columns if the column is not the most important split and contains "-"
# we throw it
if(problemType == "classification"){
if(length(primaryColumnstoBeEdited) != 0){
primaryColumnstoBeEditedtoKeep <- list()
for(j in 1:length(names(primaryColumnstoBeEdited))){
for(i in primaryColumnstoBeEdited[[j]]){
nodeName <- paste0(paste(unlist(strsplit(i," ")[[1]])[2:4], collapse = " "),":")
catColName <- paste(unlist(strsplit(i," ")[[1]])[1], collapse = " ")
for(line in 1:length(allLines)){
if(grepl(nodeName, allLines[[line]])){
if(unlist(strsplit(trimws(allLines[[line + 6]])," ")[[1]])[1] == strsplit(i, " ")[[1]][1]){
primaryColumnstoBeEditedtoKeep <- append(primaryColumnstoBeEditedtoKeep, i)
}
}
}
}
}
}else {primaryColumnstoBeEditedtoKeep <- list()
message(paste("Primary column do not contain any non-directional information", "Therefore, no need to encode any primary columns from surrogates", sep = "\n"), sep = "\n")}
}else if(problemType == "regression"){
if(length(primaryColumnstoBeEdited) != 0){
primaryColumnstoBeEditedtoKeep <- list()
for(j in 1:length(names(primaryColumnstoBeEdited))){
for(i in primaryColumnstoBeEdited[[j]]){
nodeName <- paste0(paste(unlist(strsplit(i," ")[[1]])[2:4], collapse = " "),":")
catColName <- paste(unlist(strsplit(i," ")[[1]])[1], collapse = " ")
for(line in 1:length(allLines)){
if(grepl(nodeName, allLines[[line]])){
if(unlist(strsplit(trimws(allLines[[line + 4]])," ")[[1]])[1] == strsplit(i, " ")[[1]][1]){
primaryColumnstoBeEditedtoKeep <- append(primaryColumnstoBeEditedtoKeep, i)
}
}
}
}
}
}else {primaryColumnstoBeEditedtoKeep <- list()
message(paste("Primary column do not contain any non-directional information", "Therefore, no need to encode any primary columns from surrogates", sep = "\n"), sep = "\n")}
}
# Remove primary columns to be discarded
# I realized I need to discard primary columns who are on the first row
# and do not have any corresponding surrogate splits should be discarded as well
primaryColumnstobeDiscarded <- setdiff( unlist(unname(primaryColumnstoBeEdited)), primaryColumnstoBeEditedtoKeep)
# Find further primary columns to be discarded that do not have any corresponding surrogate splits
primaryANDsurrogatelinesANDnodeslist <- list("Node number", "Primary splits", "Surrogate splits")
primaryANDsurrogatelinesANDnodes <- list()
for(line in 1:length(necessaryLines)){
if(grepl(paste(primaryANDsurrogatelinesANDnodeslist, collapse="|"), necessaryLines[[line]])){
primaryANDsurrogatelinesANDnodes <- append(primaryANDsurrogatelinesANDnodes, necessaryLines[[line]])
}
}
primaryANDsurrogatelinesANDnodes[[length(primaryANDsurrogatelinesANDnodes) + 1 ]] <- "a"
NodeNumbersNottoBeDiscardedforPrimary <- list()
for(line in 1:length(primaryANDsurrogatelinesANDnodes)){
if(grepl("Node number",primaryANDsurrogatelinesANDnodes[[line]] )){
if(grepl("Primary splits",primaryANDsurrogatelinesANDnodes[[line+1]]) && grepl("Surrogate splits",primaryANDsurrogatelinesANDnodes[[line+2]]) ){
NodeNumbersNottoBeDiscardedforPrimary <- append(NodeNumbersNottoBeDiscardedforPrimary, sub(":","",paste(strsplit(primaryANDsurrogatelinesANDnodes[[line]], " ")[[1]][1:3], collapse = " ")))
}
}
}
primaryColumnstobeDiscarded2 <- list()
for(column in primaryColumnstoBeEditedtoKeep){
if(!(paste(strsplit(column, " ")[[1]][2:4], collapse = " ") %in% NodeNumbersNottoBeDiscardedforPrimary)){
primaryColumnstobeDiscarded2 <- append(primaryColumnstobeDiscarded2, column)
}
}
# Unite two information of primary columns to be dropped
primaryColumnstobeDiscarded <- unlist(c(list(primaryColumnstobeDiscarded), primaryColumnstobeDiscarded2))
# Drop from editedtokeeplist
primaryColumnstoBeEditedtoKeepCopy <- primaryColumnstoBeEditedtoKeep
if(length(primaryColumnstoBeEditedtoKeepCopy) != 0){
for(column in 1:length(primaryColumnstoBeEditedtoKeepCopy)){
if(primaryColumnstoBeEditedtoKeepCopy[[column]] %in% primaryColumnstobeDiscarded2){
primaryColumnstoBeEditedtoKeep[[column]] <- NULL
}
}
}#else cat(paste("There are no primary columns to be dropped", "because none of them contains non-directional information", sep = "\n"), sep = "\n")
# Find the corresponding surrogate split line
# First find categorical surrogate splits
namesPrimaryCategoricalSplits <- list()
primarySplitEncodingsCategorical <- list()
lineNumberPrimaryCategorical <- list()
#catVariablesstartANDend <- list()
#namesPrimaryCategoricalSplits <- list()
#primarySplitEncodingsCategorical <- list()
#lineNumberPrimaryCategorical <- list()
catVariablesstartANDend <- list()
for(cat in catVariables){
catVariablesstartANDend <- append(catVariablesstartANDend, paste0("^",cat,"$"))}
if(length(primaryColumnstoBeEditedtoKeep)!= 0){
catVariablesstartANDend <- list()
for(cat in catVariables){
catVariablesstartANDend <- append(catVariablesstartANDend, paste0("^",cat,"$"))
}
namesPrimaryCategoricalSplits <- list()
primarySplitEncodingsCategorical <- list()
lineNumberPrimaryCategorical <- list()
for(i in 1:length(primaryColumnstoBeEditedtoKeep)){
nodeNumber <- paste0(paste(unlist(strsplit(primaryColumnstoBeEditedtoKeep[[i]]," ")[[1]])[2:4], collapse = " "),":")
counter <- 0
counter2 <- 0
for(l in 1:length(allLines)){
counter <- counter + 1
counter2 <- counter2 + 1
if(grepl(nodeNumber,allLines[[l]])){
for(lr in allLines[counter:length(allLines)]){
counter2 <- counter2 + 1
if(grepl("agree",lr)){
if(grepl(paste(catVariablesstartANDend, collapse="|"), strsplit(trimws(lr)," ")[[1]][1] ))
{
a <- paste(c(strsplit(trimws(lr)," ")[[1]][1], strsplit(primaryColumnstoBeEditedtoKeep[[i]], " ")[[1]][2:4], "surrogate"), collapse = " " )
primarySplitEncodingsCategorical <- append(primarySplitEncodingsCategorical, a )
namesPrimaryCategoricalSplits <- append(namesPrimaryCategoricalSplits, primaryColumnstoBeEditedtoKeep[[i]])
lineNumberPrimaryCategorical <- append(lineNumberPrimaryCategorical, counter2 - 1)
break
}
}
}
}
}
}
} #else cat("There are no primary columns to be encoded", sep="\n")
# Find the encodings for numerical surrogates too
names(primarySplitEncodingsCategorical) <- namesPrimaryCategoricalSplits
# Save columns to be discarded in a df
primaryDiscardedDf <- primaryOut[primaryColumnstobeDiscarded]
#test
primaryDiscardedDfTest <- primaryOutTest[primaryColumnstobeDiscarded]
# Remove primary columns to be discarded from the dataframe
primaryOut <- primaryOut[ , !names(primaryOut) %in% c(primaryColumnstobeDiscarded)]
#test
primaryOutTest <- primaryOutTest[ , !names(primaryOutTest) %in% c(primaryColumnstobeDiscarded)]
#####------------------------------------------------------------------------------------------
#####------------------------------------------------------------------------------------------
# Convert Primary and Surrogate Splits into Separate Dataframes
## SURROGATE
surrogateDF <- c()
for(i in 1:length(names(surrogateInfo))){
subDf <- c()
for(j in 1:length(surrogateInfo[[i]])) {
subDf[[j]] <- data.table(strsplit(unname(trimws(surrogateInfo[[i]])),"")[[j]])
}
subDf <- data.frame(subDf)
setnames(subDf, names(surrogateInfo[[i]]))
surrogateDF[[names(surrogateInfo)[[i]]]] <- subDf
}
# Bring unique sorted names next to each categorical column name
surrogateDfModified <- c()
for(i in names(uniqueSortedNames)){
for(j in names(surrogateDF)){
if(i == j){
surrogateDfModified[[i]] <- cbind(surrogateDF[[j]], uniqueSortedNames[[i]])
}
}
}
# Map the columns to the dataframe
surrogateDFNames <- names(surrogateDfModified)
# surrogateOut is the dataframe with dat and surrogate columns added
surrogateOut <- trainData
surrogateOut$id <- 1:nrow(surrogateOut)
for(i in seq_along(surrogateDFNames)) {
surrogateOut <- merge(surrogateOut, surrogateDfModified[[surrogateDFNames[i]]], all.x = TRUE,
by.x = surrogateDFNames[i], by.y = 'lev')
}
surrogateOut <- surrogateOut[order(surrogateOut$id), ]
surrogateOut <- data.frame(surrogateOut, check.names = F)
## For TEST
surrogateOutTest <- testData
surrogateOutTest$id <- 1:nrow(surrogateOutTest)
for(i in seq_along(surrogateDFNames)) {
surrogateOutTest <- merge(surrogateOutTest, surrogateDfModified[[surrogateDFNames[i]]], all.x = TRUE,
by.x = surrogateDFNames[i], by.y = 'lev')
}
surrogateOutTest <- surrogateOutTest[order(surrogateOutTest$id), ]
surrogateOutTest <- data.frame(surrogateOutTest, check.names = F)
# Find surrogate columns to be edited which means they have "-"
surrogateColumnstoBeEdited <- list()
for(i in 1:length(names(surrogateDF))){
uniqueValues <- lapply(surrogateDF[[i]],unique)
notFullsub <- c()
for(j in 1:length(uniqueValues)){
if("-" %in% uniqueValues[[j]]){
notFullsub <- append(notFullsub, names(uniqueValues)[[j]])
}
}
surrogateColumnstoBeEdited[[names(surrogateDF)[[i]]]] <- notFullsub
}
# Get rid of names
surrogateColumnstoBeEdited <- unname(surrogateColumnstoBeEdited)
surrogateColumnstoBeEdited <- unlist(unname(surrogateColumnstoBeEdited))
##-------------------------------------------------------------------------------------------
# Find the corresponding primary columns to surrogate columns to be edited
# First for categorical surrogate splits
## We have to state the type of the problem for this part
namesSurrogateCategoricalSplits <- list()
surrogateSplitEncodingsCategorical <- list()
namesSurrogateNumericalSplits <- list()
surrogateSplitEncodingsNumerical <- list()
if(problemType == "classification"){
namesSurrogateCategoricalSplits <- list()
surrogateSplitEncodingsCategorical <- list()
if(!is.null(surrogateColumnstoBeEdited)){
namesSurrogateCategoricalSplits <- list()
surrogateSplitEncodingsCategorical <- list()
for(i in 1:length(surrogateColumnstoBeEdited)){
node_number <- paste0(paste(strsplit(surrogateColumnstoBeEdited[[i]], " ")[[1]][2:4], collapse = " "),":")
counter <- 0
for(line in 1:length(allLines)){
counter <- counter + 1
if(grepl(node_number,allLines[[line]])){
if( (grepl(paste(catVariablesstartANDend, collapse="|"), strsplit(trimws(allLines[[counter+6]])," ")[[1]][1] ))){
columnName <- strsplit(trimws(allLines[[counter+6]])," ")[[1]][1]
wholeName <- paste(c(columnName,node_number,"primary"), collapse = " ")
surrogateSplitEncodingsCategorical <- append(surrogateSplitEncodingsCategorical, sub(":","",wholeName))
namesSurrogateCategoricalSplits <- append(namesSurrogateCategoricalSplits,sub(":","",surrogateColumnstoBeEdited[[i]]))
}
}
}
}
names(surrogateSplitEncodingsCategorical) <- namesSurrogateCategoricalSplits
} else{message("There are no surrogate columns to be edited", sep="\n")}
# Second for numerical surrrogate splits
# Find the corresponding primary columns to surrogate columns to be edited
namesSurrogateNumericalSplits <- list()
surrogateSplitEncodingsNumerical <- list()
if(!is.null(surrogateColumnstoBeEdited)){
namesSurrogateNumericalSplits <- list()
surrogateSplitEncodingsNumerical <- list()
for(i in 1:length(surrogateColumnstoBeEdited)){
node_number <- paste0(paste(strsplit(surrogateColumnstoBeEdited[[i]], " ")[[1]][2:4], collapse = " "),":")
counter <- 0
for(line in 1:length(allLines)){
counter <- counter + 1
if(grepl(node_number,allLines[[line]])){
if( !(grepl(paste(catVariablesstartANDend, collapse="|"), strsplit(trimws(allLines[[counter+6]])," ")[[1]][1] ))){
columnName <- strsplit(trimws(allLines[[counter+6]])," ")[[1]][1]
#wholeName <- paste(c(columnName,sub(":","",node_number),"primary"), collapse = " ")
wholeName <- columnName
#print(wholeName)
stringToBeManipulated <- list(strsplit(trimws(allLines[[counter+6]])," ")[[1]])
stringToBeManipulatedCleaned <- lapply(stringToBeManipulated, function(x) x[!x %in% ""])[[1]]
#print(stringToBeManipulatedCleaned)
value <- paste(stringToBeManipulatedCleaned[2:3], collapse = " ")
direction <- stringToBeManipulatedCleaned[6]
mainInfo <- paste(c(wholeName, value , direction), collapse = " ")
surrogateSplitEncodingsNumerical <- append(surrogateSplitEncodingsNumerical, mainInfo)
namesSurrogateNumericalSplits <- append(namesSurrogateNumericalSplits,surrogateColumnstoBeEdited[[i]])
}
}
}
}
names(surrogateSplitEncodingsNumerical) <- namesSurrogateNumericalSplits
} else{
namesSurrogateNumericalSplits <- list()
surrogateSplitEncodingsNumerical <- list()}
} else if(problemType == "regression"){
namesSurrogateCategoricalSplits <- list()
surrogateSplitEncodingsCategorical <- list()
if(!is.null(surrogateColumnstoBeEdited)){
namesSurrogateCategoricalSplits <- list()
surrogateSplitEncodingsCategorical <- list()
for(i in 1:length(surrogateColumnstoBeEdited)){
node_number <- paste0(paste(strsplit(surrogateColumnstoBeEdited[[i]], " ")[[1]][2:4], collapse = " "),":")
counter <- 0
for(line in 1:length(allLines)){
counter <- counter + 1
if(grepl(node_number,allLines[[line]])){
if( (grepl(paste(catVariablesstartANDend, collapse="|"), strsplit(trimws(allLines[[counter+4]])," ")[[1]][1] ))){
columnName <- strsplit(trimws(allLines[[counter+4]])," ")[[1]][1]
wholeName <- paste(c(columnName,node_number,"primary"), collapse = " ")
surrogateSplitEncodingsCategorical <- append(surrogateSplitEncodingsCategorical, sub(":","",wholeName))
#print("not correct")
namesSurrogateCategoricalSplits <- append(namesSurrogateCategoricalSplits,sub(":","",surrogateColumnstoBeEdited[[i]]))
}
}
}
}
names(surrogateSplitEncodingsCategorical) <- namesSurrogateCategoricalSplits
} else{message("There are no surrogate columns to be edited", sep="\n")}
# Second for numerical surrrogate splits
# Find the corresponding primary columns to surrogate columns to be edited
namesSurrogateNumericalSplits <- list()
surrogateSplitEncodingsNumerical <- list()
if(!is.null(surrogateColumnstoBeEdited)){
namesSurrogateNumericalSplits <- list()
surrogateSplitEncodingsNumerical <- list()
for(i in 1:length(surrogateColumnstoBeEdited)){
node_number <- paste0(paste(strsplit(surrogateColumnstoBeEdited[[i]], " ")[[1]][2:4], collapse = " "),":")
counter <- 0
for(line in 1:length(allLines)){
counter <- counter + 1
if(grepl(node_number,allLines[[line]])){
if( !(grepl(paste(catVariablesstartANDend, collapse="|"), strsplit(trimws(allLines[[counter+4]])," ")[[1]][1] ))){
columnName <- strsplit(trimws(allLines[[counter+4]])," ")[[1]][1]
#wholeName <- paste(c(columnName,sub(":","",node_number),"primary"), collapse = " ")
wholeName <- columnName
#print(wholeName)
stringToBeManipulated <- list(strsplit(trimws(allLines[[counter+6]])," ")[[1]])
stringToBeManipulatedCleaned <- lapply(stringToBeManipulated, function(x) x[!x %in% ""])[[1]]
#print(stringToBeManipulatedCleaned)
value <- paste(stringToBeManipulatedCleaned[2:3], collapse = " ")
direction <- stringToBeManipulatedCleaned[6]
mainInfo <- paste(c(wholeName, value , direction), collapse = " ")
surrogateSplitEncodingsNumerical <- append(surrogateSplitEncodingsNumerical, mainInfo)
#print("Correct")
namesSurrogateNumericalSplits <- append(namesSurrogateNumericalSplits,surrogateColumnstoBeEdited[[i]])
}
}
}
}
names(surrogateSplitEncodingsNumerical) <- namesSurrogateNumericalSplits
} else{
namesSurrogateNumericalSplits <- list()
surrogateSplitEncodingsNumerical <- list()}
}
# Unite Primary and Surrogate Dfs
DfOut <- merge(primaryOut, surrogateOut)
DfOutTest <- merge(primaryOutTest, surrogateOutTest)
# Filter out duplicate columns
DfOut <- DfOut[, !duplicated(colnames(DfOut))]
DfOut <- DfOut[order(DfOut$id), ]
#Test
DfOutTest <- DfOutTest[, !duplicated(colnames(DfOutTest))]
DfOutTest <- DfOutTest[order(DfOutTest$id), ]
### ----------------------------------------------------------------------------
# ENCODING PRIMARIES
# Encode primaries from numerical and categorical splits
ColumnNamesfromColumnNumber = primaryColumnstoBeEditedtoKeep
# Here a start index and ending index is obtained to find the surrogate elements in between
startIndex <- list()
endIndex <- list()
for(name in ColumnNamesfromColumnNumber){
# extract node number
nodeNumberofprimary <- paste0(paste(strsplit(name," ")[[1]][2:4], collapse = " "),":")
for(i in 1:length(allLines)){
if(grepl(nodeNumberofprimary, allLines[[i]])){
startIndex <- append(startIndex, i)
for(i2 in i+1:length(allLines)){
if(grepl("Node number", allLines[[i2]])){
endIndex <- append(endIndex, i2)
break
}
}
}
}
}
names(startIndex) <- endIndex
# Get all the lines for lines between each start index and end index
lineList <- list()
if(length(startIndex) != 0){
lineList <- list()
for(l in 1:length(startIndex)){
lineList <- append(lineList, trimws(allLines[startIndex[[l]]:names(startIndex)[[l]] -1]))
}
} #else cat("Nothing to be encoded", sep="\n")
# From line list filter all the columns that are under surrogate splits
if(length(lineList) != 0){
out <- by(unlist(lineList),
cumsum(grepl("^(Node|Surrogate)", lineList)),
function(x) {
if (grepl("Node number.*:", x[1])) x[1] else {
if (grepl("Surrogate.*:", x[1])) x[-1]
}
})
outList <- as.list(do.call(c, unname(out)))
outList <- lapply(outList, function(z){ z[z != ""]})
outList <- Filter(length, outList)
# Create a nested list, where each key is a "number XX" and its elements are the variables needed
indicesOfNodes = which(grepl("Node number", outList))
mappingListNested = lapply(seq_along(indicesOfNodes), function(i){
# If part is currently added
if (length(outList) == 1){
return(outList[(length(outList))])
}
else if (i+1 <= length(indicesOfNodes)){
return(outList[(indicesOfNodes[i]+1):(indicesOfNodes[i+1]-1)])
} else {
return(outList[(indicesOfNodes[i]+1):length(outList)])
}
})
oppositeDirection <- function(direction){
if(direction == "L"){return("R")}
if(direction == "R"){ return("L")}
}
names(mappingListNested) = paste('Node number', str_extract(outList[indicesOfNodes], "[[:digit:]]+"))
columnsToBeEncoded <- ColumnNamesfromColumnNumber
for(col in columnsToBeEncoded){
if(col %in% primaryColumnstobeDiscarded){
columnsToBeEncoded <- columnsToBeEncoded[columnsToBeEncoded != col]
}
}
# Encode "former" columns where row is "?" from "latter" columns by the order in mappingList
for(col in columnsToBeEncoded){
# extract column number from former column
colNumber <- paste(strsplit(col, " ")[[1]][2:4], collapse = " ")
# Find indices where former column has "?"
replacementVariables = mappingListNested[[colNumber]]
#print(replacementVariables)
for (var in replacementVariables){
if(grepl("splits as", var)){
varNameinColumnForm <- paste(c(strsplit(var," ")[[1]][1], colNumber, "surrogate"), collapse = " ")
#print(col)
#print(varNameinColumnForm)
DfOut[, col] = ifelse(
DfOut[, col] == "-", # which elements are "?"
DfOut[, varNameinColumnForm], # replace those which are "?" with the values of var
DfOut[, col]) # otherwise leave unchanged
}else{
stringToBeManipulated <- strsplit(trimws(var), " ")
stringToBeManipulatedCleaned <- lapply(stringToBeManipulated, function(x) x[!x %in% ""])[[1]]
columnName <- stringToBeManipulatedCleaned[1]
#print(columnName)
value <- stringToBeManipulatedCleaned[3]
#print(value)
direction <- stringToBeManipulatedCleaned[6]
#print(direction)
mainInfo <- paste(c(columnName, value, direction), collapse = " ")
#print(mainInfo)
value2 <- as.numeric(strsplit(mainInfo, " ")[[1]][2])
#print(value2)
direction2 <- toupper(strsplit(strsplit(mainInfo, " ")[[1]][3],"")[[1]][1])
#print(direction2)
intheDirection <- which((DfOut[,col] == "-") & (DfOut[,columnName] < value2))
againsttheDirection <- which((DfOut[,col] == "-") & (DfOut[,columnName] >= value2))
#print(againsttheDirection)
# change in direction
DfOut[intheDirection, col] <- direction2
# change in the opposite direction
DfOut[againsttheDirection, col] <- oppositeDirection(direction2)
}
}
}
} #else cat("Nothing to encode", sep="\n")
## TEST
# From line list filter all the columns that are under surrogate splits
if(length(lineList) != 0){
#print( "DOES THIS PART HAPPEN??")
out <- by(unlist(lineList),
cumsum(grepl("^(Node|Surrogate)", lineList)),
function(x) {
if (grepl("Node number.*:", x[1])) x[1] else {
if (grepl("Surrogate.*:", x[1])) x[-1]
}
})
outList <- as.list(do.call(c, unname(out)))
outList <- lapply(outList, function(z){ z[z != ""]})
outList <- Filter(length, outList)
# Create a nested list, where each key is a "number XX" and its elements are the variables needed
indicesOfNodes = which(grepl("Node number", outList))
mappingListNested = lapply(seq_along(indicesOfNodes), function(i){
# If part is currently added
if (length(outList) == 1){
return(outList[(length(outList))])
}
else if (i+1 <= length(indicesOfNodes)){
return(outList[(indicesOfNodes[i]+1):(indicesOfNodes[i+1]-1)])
} else {
return(outList[(indicesOfNodes[i]+1):length(outList)])
}
})
oppositeDirection <- function(direction){
if(direction == "L"){return("R")}
if(direction == "R"){ return("L")}
}
names(mappingListNested) = paste('Node number', str_extract(outList[indicesOfNodes], "[[:digit:]]+"))
columnsToBeEncoded <- ColumnNamesfromColumnNumber
for(col in columnsToBeEncoded){
if(col %in% primaryColumnstobeDiscarded){
columnsToBeEncoded <- columnsToBeEncoded[columnsToBeEncoded != col]
}
}
# Encode "former" columns where row is "?" from "latter" columns by the order in mappingList
for(col in columnsToBeEncoded){
# extract column number from former column
colNumber <- paste(strsplit(col, " ")[[1]][2:4], collapse = " ")
# Find indices where former column has "?"
replacementVariables = mappingListNested[[colNumber]]
#print(replacementVariables)
for (var in replacementVariables){
if(grepl("splits as", var)){
varNameinColumnForm <- paste(c(strsplit(var," ")[[1]][1], colNumber, "surrogate"), collapse = " ")
#print(col)
#print(varNameinColumnForm)
DfOutTest[, col] = ifelse(
DfOutTest[, col] == "-", # which elements are "?"
DfOutTest[, varNameinColumnForm], # replace those which are "?" with the values of var
DfOutTest[, col]) # otherwise leave unchanged
}else{
stringToBeManipulated <- strsplit(trimws(var), " ")
stringToBeManipulatedCleaned <- lapply(stringToBeManipulated, function(x) x[!x %in% ""])[[1]]
columnName <- stringToBeManipulatedCleaned[1]
#print(columnName)
value <- stringToBeManipulatedCleaned[3]
#print(value)
direction <- stringToBeManipulatedCleaned[6]
#print(direction)
mainInfo <- paste(c(columnName, value, direction), collapse = " ")
#print(mainInfo)
value2 <- as.numeric(strsplit(mainInfo, " ")[[1]][2])
#print(value2)
direction2 <- toupper(strsplit(strsplit(mainInfo, " ")[[1]][3],"")[[1]][1])
#print(direction2)
intheDirection <- which((DfOutTest[,col] == "-") & (DfOutTest[,columnName] < value2))
againsttheDirection <- which((DfOutTest[,col] == "-") & (DfOutTest[,columnName] >= value2))
#print(againsttheDirection)
# change in direction
DfOutTest[intheDirection, col] <- direction2
# change in the opposite direction
DfOutTest[againsttheDirection, col] <- oppositeDirection(direction2)
}
}
}
} #else cat("Nothing to encode", sep="\n")
# Get the name of the columns that are still not encoded
# Get all the primary column names that are in primaryColumnNames
primaryColumnNames <- list()
for(i in names(DfOut)){
if(grepl("primary",i)){
primaryColumnNames <- append(primaryColumnNames, i)
}
}
# Find primary columns that still have "-" after encoding
# Here I used [[1]] to take the value but is this true if we have more than 1 value
columnNumber <- unique(data.frame(which(DfOut[unlist(primaryColumnNames)] == "-", arr.ind = TRUE))["col"])[[1]]
ColumnNamesfromColumnNumber <- names(DfOut[unlist(primaryColumnNames)][columnNumber])
# TEST
# Get the name of the columns that are still not encoded
# Get all the primary column names that are in primaryColumnNames
primaryColumnNamesTest <- list()
for(i in names(DfOutTest)){
if(grepl("primary",i)){
primaryColumnNamesTest <- append(primaryColumnNamesTest, i)
}
}
# Find primary columns that still have "-" after encoding
# Here I used [[1]] to take the value but is this true if we have more than 1 value
columnNumber <- unique(data.frame(which(DfOutTest[unlist(primaryColumnNamesTest)] == "-", arr.ind = TRUE))["col"])[[1]]
ColumnNamesfromColumnNumber <- names(DfOutTest[unlist(primaryColumnNamesTest)][columnNumber])
# ENCODING SURROGATES
# First encode surrogates from categorical primary splits
for(i in seq_len(length(surrogateSplitEncodingsCategorical))) {
j <- which(DfOut[,names(surrogateSplitEncodingsCategorical)[[i]]] == "-")
#print(j)
DfOut[j,names(surrogateSplitEncodingsCategorical)[[i]]] <- DfOut[j,surrogateSplitEncodingsCategorical[[i]]]
}
oppositeDirection <- function(direction){
if(direction == "L"){return("R")}
if(direction == "R"){ return("L")}
}
# Second encode surrogates from numerical primary splits
# There are such columns when primary split is absent as well I have not implemented that yet
# For now we have to drop those columns
if(exists("surrogateSplitEncodingsNumerical")){
for(i in seq_len(length(surrogateSplitEncodingsNumerical))) {
primaryColumn <- strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][1]
value <- as.numeric(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][3])
direction <- toupper(strsplit(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][4],"")[[1]][1])
intheDirection <- which((DfOut[,names(surrogateSplitEncodingsNumerical)[i]] == "-") & (DfOut[,primaryColumn] < value))
againsttheDirection <- which((DfOut[,names(surrogateSplitEncodingsNumerical)[i]] == "-") & (DfOut[,primaryColumn] >= value))
#value <- as.numeric(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][3])
#direction <- toupper(strsplit(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][4],"")[[1]][1])
# change in direction
DfOut[intheDirection, names(surrogateSplitEncodingsNumerical)[i]] <- direction
# change in the opposite direction
DfOut[againsttheDirection, names(surrogateSplitEncodingsNumerical)[i]] <- oppositeDirection(direction)
}
} else message("Variable does not exist", sep="\n")
##TEST
# First encode surrogates from categorical primary splits
for(i in seq_len(length(surrogateSplitEncodingsCategorical))) {
j <- which(DfOutTest[,names(surrogateSplitEncodingsCategorical)[[i]]] == "-")
#print(j)
DfOutTest[j,names(surrogateSplitEncodingsCategorical)[[i]]] <- DfOutTest[j,surrogateSplitEncodingsCategorical[[i]]]
}
oppositeDirection <- function(direction){
if(direction == "L"){return("R")}
if(direction == "R"){ return("L")}
}
# Second encode surrogates from numerical primary splits
# There are such columns when primary split is absent as well I have not implemented that yet
# For now we have to drop those columns
if(exists("surrogateSplitEncodingsNumerical")){
for(i in seq_len(length(surrogateSplitEncodingsNumerical))) {
primaryColumn <- strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][1]
value <- as.numeric(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][3])
direction <- toupper(strsplit(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][4],"")[[1]][1])
intheDirection <- which((DfOutTest[,names(surrogateSplitEncodingsNumerical)[i]] == "-") & (DfOutTest[,primaryColumn] < value))
againsttheDirection <- which((DfOutTest[,names(surrogateSplitEncodingsNumerical)[i]] == "-") & (DfOutTest[,primaryColumn] >= value))
#value <- as.numeric(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][3])
#direction <- toupper(strsplit(strsplit(surrogateSplitEncodingsNumerical[[i]], " ")[[1]][4],"")[[1]][1])
# change in direction
DfOutTest[intheDirection, names(surrogateSplitEncodingsNumerical)[i]] <- direction
# change in the opposite direction
DfOutTest[againsttheDirection, names(surrogateSplitEncodingsNumerical)[i]] <- oppositeDirection(direction)
}
} else message("Variable does not exist", sep="\n")
# Check which columns still have "-"
ColumnsIWant <- list()
for(i in names(DfOut)){
if(grepl("*primary$",i) || grepl("*surrogate$",i) ){
ColumnsIWant <- append(ColumnsIWant, i)
}
}
# Check if the final dataframe has "-"
dfThatHasMissing <- data.frame(which(DfOut[unlist(ColumnsIWant)] == "-", arr.ind = TRUE))
# Find list of columns which have "-"
columnNumbersThatHave <- unique(dfThatHasMissing[["col"]])
multipleEncodingColumnstoBeDiscarded <- list()
for(i in columnNumbersThatHave){
multipleEncodingColumnstoBeDiscarded <- append(multipleEncodingColumnstoBeDiscarded, names(DfOut[unlist(ColumnsIWant)][i]))
}
DfOutCopy <- DfOut
# Discard those columns
DfOut <- DfOut[ , !names(DfOut) %in% multipleEncodingColumnstoBeDiscarded]
ColumnsIWant <- list()
for(i in names(DfOut)){
if(grepl("*primary$",i) || grepl("*surrogate$",i) ){
ColumnsIWant <- append(ColumnsIWant, i)
}
}
#print(ColumnsIWant)
nodes <- DfOut[unlist(ColumnsIWant)]
## SCENARIO 1
#scenario1Name <- paste0(datasetName, "_imit_splits_scenario_1.csv", collapse = "")
# Convert to csv file
#write.csv(nodes, scenario1Name, row.names = FALSE)
## TEST
# Check which columns still have "-"
ColumnsIWantTest <- list()
for(i in names(DfOutTest)){
if(grepl("*primary$",i) || grepl("*surrogate$",i) ){
ColumnsIWantTest <- append(ColumnsIWantTest, i)
}
}
# Check if the final dataframe has "-"
#print("DOES THIS PART HAPPEN")
dfThatHasMissingTest <- data.frame(which(DfOutTest[unlist(ColumnsIWantTest)] == "-", arr.ind = TRUE))
#print("WHAT HAPPENED")
# Find list of columns which have "-"
columnNumbersThatHaveTest <- unique(dfThatHasMissingTest[["col"]])
multipleEncodingColumnstoBeDiscardedTest <- list()
for(i in columnNumbersThatHaveTest){
multipleEncodingColumnstoBeDiscardedTest <- append(multipleEncodingColumnstoBeDiscardedTest, names(DfOutTest[unlist(ColumnsIWantTest)][i]))
}
# Discard those columns
DfOutTestCopy <- DfOutTest
DfOutTest <- DfOutTest[ , !names(DfOutTest) %in% multipleEncodingColumnstoBeDiscarded]
ColumnsIWantTest <- list()
for(i in names(DfOutTest)){
if(grepl("*primary$",i) || grepl("*surrogate$",i) ){
ColumnsIWantTest <- append(ColumnsIWantTest, i)
}
}
nodesTest <- DfOutTest[unlist(ColumnsIWantTest)]
## SCENARIO 1
#scenario1Name <- paste0(datasetName, "_imit_splits_scenario_1_test.csv", collapse = "")
# Convert to csv file
#write.csv(nodesTest, scenario1Name, row.names = FALSE)
return(list(nodes, nodesTest))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.