GA: Genetic Algorithm

Documented in select

#### Main Function ####
# data is a df, target is the column name of the target variable, method is lm/glm

#' Genetic algorithm for variable selection
#'
#' This function implements a genetic algorithm for variable selection in linear regression and GLM. Genetic algorithms is essentially an optimization problem. In feature selection, it uses the given fitness function (e.g. AIC) as the objective function and conduct multiple rounds of an update process to approach the optimal solution. For feature selection, it first generates a population of many possible combination for selecting a subset of features. Then, among this population, the best ones are selected according the objective function and from these parents, a new population with the same size as before are randomly generated. After many iterations, the best solutions from the population would approach the optimal sulotion, which is a binary string indicating the selection of a subset of independent variables.
#'
#' @param data A data frame with one response variable and arbitrary number of dependent variables. Order does not matter.
#' @param target Column name of the response variable in \code{data}. Parameter type should be numeric of character.
#' @param fit_method Regression method, either \code{lm} or \code{glm}. Parameter type should be character.
#' @param metric Objective function, default is \code{aic}. The function also supports \code{bic}, \code{rmse}, \code{mae}, \code{rsquare_negative} and user-defined function. Since this is a minimization problem, the R-square is inversed. User can defined arbitrary error function for minimization. Function input should be \code{model} and \code{data}. Function output should be a self-defined error. Pass the function name directly as a function object. For other regular metrics, parameter type should be character.
#' @return A vector of selected column names in the input data frame \code{data}. Return type is character vector.
#' @examples
#' # Example 1
#' Setup:
#' # https://www.kaggle.com/c/house-prices-advanced-regression-techniques
#' # House Prices: Advanced Regression Techniques
#' # Predict sales prices
#' dt_house <- read.csv("../data/data_house.csv")
#' dt_house <- dt_house[, c("MSSubClass", "MSZoning", "LotArea", "LotShape", "Alley", "LandContour", "LotConfig", "LandSlope", "Neighborhood", "BldgType", "WoodDeckSF", "OpenPorchSF", "HouseStyle", "OverallQual", "OverallCond","SaleType", "SaleCondition", "LotFrontage", "MoSold", "SalePrice")]
#' dt_house[, "MSSubClass"] <- as.factor(dt_house[, "MSSubClass"])
#' dt_house[, "MoSold"] <- as.factor(dt_house[, "MoSold"])
#' dt_house[, "LotArea"] <- as.numeric(dt_house[, "LotArea"])
#' dt_house[, "LotShape"] <- as.factor(dt_house[, "LotShape"])
#' dt_house[, "Alley"] <- as.factor(dt_house[, "Alley"])
#' dt_house[, "LandContour"] <- as.factor(dt_house[, "LandContour"])
#' dt_house[, "LotConfig"] <- as.factor(dt_house[, "LotConfig"])
#' dt_house[, "LandSlope"] <- as.factor(dt_house[, "LandSlope"])
#' dt_house[, "Neighborhood"] <- as.factor(dt_house[, "Neighborhood"])
#' dt_house[, "BldgType"] <- as.factor(dt_house[, "BldgType"])
#' dt_house[, "WoodDeckSF"] <- as.numeric(dt_house[, "WoodDeckSF"])
#' dt_house[, "OpenPorchSF"] <- as.numeric(dt_house[, "OpenPorchSF"])
#' dt_house[, "HouseStyle"] <- as.factor(dt_house[, "HouseStyle"])
#' dt_house[, "OverallQual"] <- as.numeric(dt_house[, "OverallQual"])
#' dt_house[, "OverallCond"] <- as.numeric(dt_house[, "OverallCond"])
#' dt_house[, "SaleType"] <- as.factor(dt_house[, "SaleType"])
#' dt_house[, "SaleCondition"] <- as.factor(dt_house[, "SaleCondition"])
#' dt_house[, "LotFrontage"] <- as.numeric(dt_house[, "LotFrontage"])
#' dt_house[, "MoSold"] <- as.factor(dt_house[, "MoSold"])
#' dt_house[, "SalePrice"] <- as.numeric(dt_house[, "SalePrice"])
#' # Execution
#' select(dt_house, 'SalePrice', fit_method = 'lm', metric = 'aic')
#'
#'
#' # Example 2
#' # Setup:
#' # https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009
#' # Red Wine Quality
#' dt_wine <- read.csv("../data/data_wine.csv")
#' dt_wine[, "quality"] <- as.numeric(dt_wine[, "quality"])
#' # Execution:
#' select(dt_wine, 'quality', fit_method = 'lm', metric = 'aic')
#'
#'
#' # Example 3
#' # Setup:
#' # https://www.kaggle.com/kumarajarshi/life-expectancy-who
#' # Life Expectancy (WHO)
#' # Statistical Analysis on factors influencing Life Expectancy
#' dt_life <- read.csv("./data/data_life.csv")
#' dt_life[, "Country"] <- as.factor(dt_life[, "Country"])
#' dt_life[, "Year"] <- as.numeric(dt_life[, "Year"])
#' dt_life[, "Status"] <- as.factor(dt_life[, "Status"])
#' dt_life[, "Life.expectancy"] <- as.numeric(dt_life[, "Life.expectancy"])
#' for(i in 5:dim(dt_life)[2]){ dt_life[, i] <- as.numeric(dt_life[, i]) }
#' # Execution:
#' select(dt_life, 'Life.expectancy', fit_method = 'lm', metric = 'aic')
#'
#'
#' # Example 4
#' # Setup:
#' # Bike sharing dataset
#' dt_bike <- read.csv("./data/data_bike.csv")
#' dt_bike[, 'dteday'] <- as.numeric(as.Date(dt_bike[, 'dteday']))
#' dt_bike[, 'yr'] <- as.factor(dt_bike[, 'yr'])
#' dt_bike[, 'mnth'] <- as.factor(dt_bike[, 'mnth'])
#' dt_bike[, 'holiday'] <- as.factor(dt_bike[, 'holiday'])
#' dt_bike[, 'workingday'] <- as.factor(dt_bike[, 'workingday'])
#' dt_bike[, 'weathersit'] <- as.factor(dt_bike[, 'weathersit'])
#' dt_bike$instant <- NULL
#' dt_bike$registered <- NULL
#' dt_bike$casual <- NULL
#' # Execution:
#' select(dt_bike, 'cnt', fit_method = 'lm', metric = 'aic')
#'
#'
#' # Example 5
#' # Setup:
#' # Basic data set loading and test of function lm()
#' # Load a build in data set BostonHousing
#' library(mlbench)
#' data(BostonHousing)
#' # Execution:
#' select(BostonHousing, 'medv', fit_method = 'lm', metric = 'aic')
#' score_value <- rep(0,repetition)
#' score_value[k] <- temp[[2]]
#' plot(score_value)



select <- function(data, target, fit_method = 'lm', metric = 'aic'){
  y <- data[, target]
  x <- data[, names(data) != target]
  #### Initialization ####
  # Parameters
  C <- dim(x)[2]  # chromosome_length (# of features)
  P <- C * 10  # population size
  mutation_rate <- 1 / C
  repetition <- 30
  gene_mat <- matrix(rep(initialize_gene(C), P), ncol = C, nrow = P)

  # Start
  for (k in 1 : repetition){
    temp <- selection(gene_mat, x, y, fit_method, metric)
    parent_gene <- temp[[1]]
    gene_mat <- cross_over(parent_gene, P)
  }
  result_column_boolean <- parent_gene[1, ]
  result_column_names <- names(x)[result_column_boolean]
  return(result_column_names)
}


# Initialize a chromosome vector
initialize_gene <- function(chromo_length){
  chromo <- sample(c(TRUE, FALSE), chromo_length, replace = TRUE)
}

initialize_gene()

#### Fitness assignment ####
fitness_assignment <- function(chromo, x, y, fit_method = 'lm', metric = 'aic'){
  mini_df <- data.frame(y, x[, chromo])
  if (fit_method == 'lm'){
    linearMod <- lm(y ~ ., mini_df)
  } else if(fit_method =='glm'){
    linearMod <- glm(y ~ ., mini_df, family = gaussian)
  } else{
    print("Method not defined.")
    exit()
  }

  if(class(metric) == "character"){
    residual <- residuals(linearMod)
    if (metric == 'aic' || metric == 'bic'){
      score <- eval(parse(text=paste0(toupper(metric), "(linearMod)")))
    } else if(metric == 'rsquare_negative'){
      score <- -eval(parse(text=paste0("summary(linearMod)$r.squared")))
    } else if(metric == 'rmse'){
      score <- sqrt(mean(residual^2, na.rm = TRUE))
    } else if(metric == 'mae'){
      score <- mean(abs(residual), na.rm = TRUE)
    }
  } else if(class(metric) == "function"){
    score <- eval(parse(text=paste0("metric", "(linearMod, mini_df)")))
  } else{
    print("Method not defined.")
    exit()
  }
  return(score)
}


#### Selection by ranking ####
selection <- function(gene_mat, x, y, fit_method = 'lm', metric = 'aic'){
  # rank matrix has P rows, 2 columns [index, score]
  P <- dim(gene_mat)[1]
  P_parent <- ceiling(P / 2)
  rank_mat <- matrix(c(1 : P, rep(0, P)), ncol = 2, nrow = P)
  rank_mat[, 2] <- t(apply(gene_mat, 1, function(k) fitness_assignment(k, x, y, fit_method, metric)))
  rank_mat <- rank_mat[order(rank_mat[, 2], decreasing = FALSE), ]
  #optional print statement to view the rankings
  #print(mean(rank_mat[, 2]))
  parant_gene <- gene_mat[rank_mat[1 : P_parent, 1], ]
  return(list(parant_gene, mean(rank_mat[, 2])))
}


#### Cross over ####
cross_over_two_chromo <- function(chromo_1, chromo_2){
  C <- length(chromo_1)
  mutation_rate <- 1 / C
  selection_sampler <- sample(c(TRUE, FALSE), C, replace = TRUE)
  result <- (chromo_1 * selection_sampler + chromo_2 * (1 - selection_sampler) == 1)

  ## mutation ##
  mutation_sampler <- (runif(C) < mutation_rate)
  result <- !(result == mutation_sampler)
  return(result)
}

# Single child policy
cross_over <- function(parant_gene, P){
  P_parent <- ceiling(P / 2)
  C <- dim(parant_gene)[2]
  gene_mat <- matrix(rep(FALSE, C * P), ncol = C, nrow = P)
  p_mat <- matrix(sample.int(2 * P, n = P_parent, replace = T), ncol = 2, nrow = P)
  gene_mat <- t(apply(p_mat, 1, function(x) cross_over_two_chromo(parant_gene[x[1], ], parant_gene[x[2], ])))
  return(gene_mat)
}



# R help page
library(roxygen2)