# R/functions.R

require(plyr)
require(randomForest)

# Documentation is generated with roxygen2 (e.g. run devtools::document() to rebuild the manual)

#' Get variables class information
#'
#' Creates a data frame with information about dataset columns, including proposed type conversions
#'
#' @param data data frame with variables of interest
#' @param y.name name of the target variable
#' @param max.cardinality maximum number of distinct values for a numeric variable to be marked for conversion to factor
#' @examples
#' df.cols <- classify.columns(df)
#' @return data frame with column types and statistics
#' @export
classify.columns <- function(data, y.name="target", max.cardinality=6)
{
  df <- data.frame(var=colnames(data), current.class=rep("",ncol(data)), proposed.class=rep("",ncol(data)), lev=rep(0,ncol(data)), missing=rep(0,ncol(data)), format=rep('',ncol(data)), stringsAsFactors = FALSE)
  df$N <- nrow(data)
  for (i in 1:ncol(data))
  {
    
    cat(paste("\rProcessing column ",i,"/",ncol(data),"\r",sep=""))
    
    df$lev[i] <- length(levels(as.factor(data[,i])))
    df$current.class[i] <- class(data[,i])
    df$missing[i] <- ifelse(df$current.class[i]=='factor', sum(data[,i]==""), sum(is.na(data[,i])))
    
    # Check for Date columns (of the form digits, not-digit, digits, not-digit, digits)
    if (sum(grepl("^\\d{1,4}[^0-9]\\d{1,2}[^0-9]\\d{1,4}$", data[,i])) > 10)
      df$proposed.class[i] <- 'Date'
    
  }
  df$current.class[df$var==y.name] <- df$proposed.class[df$var %in% y.name] <- "target"
  df$proposed.class[df$current.class %in% c("numeric","integer")] <- ifelse(df$lev[df$current.class %in% c("numeric","integer")] < max.cardinality, "factor", "numeric")
  df$proposed.class[grepl("woe.", df$var)] <- "numeric"
  df$proposed.class[df$current.class %in% c("factor","character","logical") & df$proposed.class!="Date"] <- "factor"
  df$proposed.class[df$current.class %in% c("Date")] <- "Date"
  df$proposed.class[df$lev == df$N] <- "id"
  
  return (df)
  
}
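
# Usage sketch (illustrative only; 'toy' is a hypothetical data frame, not part
# of the package):
#   toy <- data.frame(id     = 1:200,
#                     amount = sample(1:50, 200, replace = TRUE),
#                     grade  = sample(c("A", "B", "C"), 200, replace = TRUE),
#                     opened = format(Sys.Date() - sample(1:1000, 200, replace = TRUE), "%Y-%m-%d"),
#                     target = sample(0:1, 200, replace = TRUE),
#                     stringsAsFactors = FALSE)
#   classify.columns(toy, y.name = "target", max.cardinality = 6)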

#' Get proposed type conversions
#'
#' Selects conversions suggested by classify.columns function
#'
#' @param df output of classify.columns function
#' @examples
#' get.proposed.conversions(classify.columns(df))
#' @return data frame with variables to convert
#' @export
get.proposed.conversions <- function(df)
{
  if (sum(df$current.class != df$proposed.class) == 0)
    print("All variables are appropriate")
  else
    return(df[df$current.class != df$proposed.class,])
}

#' Deduce date format
#'
#' Guesses the date format string (e.g. "%Y-%m-%d") of a character vector by inspecting its values
#' @param var character vector with date-like values
#' @return format string usable by as.Date, or NA if no known pattern is detected
get.date.format <- function(var)
{
  
  require(stringr)
  
  if (sum(grepl('^\\d{1,2}.\\d{1,2}.\\d{4}$', var)) > 10)
  {
    delimiter <- setdiff(unique(substr(var,nchar(as.character(var))-4,nchar(as.character(var))-4)), c(NA, ""))[1]
    m <- max(as.numeric(str_extract(substr(var,1,2), "\\d+")), na.rm=TRUE)
    return (ifelse(m>12,paste("%d","%m","%Y",sep=delimiter),paste("%m","%d","%Y",sep=delimiter)))
  }
  
  if (sum(grepl('^\\d{1,2}.\\d{1,2}.\\d{2}$', var)) > 10)
  {
    delimiter <- setdiff(unique(substr(var,nchar(as.character(var))-2,nchar(as.character(var))-2)), c(NA, ""))[1]
    m <- max(as.numeric(str_extract(substr(var,1,2), "\\d+")), na.rm=TRUE)
    return (ifelse(m>12,paste("%d","%m","%y",sep=delimiter),paste("%m","%d","%y",sep=delimiter)))
  }
  
  if (sum(grepl('^\\d{4}.\\d{1,2}.\\d{1,2}$', var)) > 10)
  {
    delimiter <- setdiff(unique(substr(var,5,5)), c(NA, ""))[1]
    return (paste("%Y","%m","%d",sep=delimiter))
  }
  
  return (NA)
  
}
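
# Usage sketch for get.date.format (illustrative only; the function needs more
# than 10 matching values to recognise a pattern):
#   dates <- rep(c("03.05.2019", "15.06.2019"), 10)
#   get.date.format(dates)                   # "%d.%m.%Y"
#   as.Date(dates, get.date.format(dates))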


#' Convert columns
#'
#' Converts dataset columns according to proposed mapping
#'
#' @param df data frame with information about variables (typically the result of classify.columns)
#' @param data data frame to convert
#'
#' @examples
#' df <- convert.columns(classify.columns(df), df)
#'
#' @return data frame with converted columns
#' @export
convert.columns <- function(df, data)
{
  convert.to.factor <- df$var[df$current.class != df$proposed.class & df$proposed.class=="factor"]
  if (length(convert.to.factor) > 0)
    data[convert.to.factor] <- lapply(data[convert.to.factor], as.factor)
  print(paste("Converted", length(convert.to.factor), "columns to factor"))
  
  convert.to.numeric <- df$var[df$current.class != df$proposed.class & df$proposed.class=="numeric"]
  if (length(convert.to.numeric) > 0)
    data[convert.to.numeric] <- lapply(data[convert.to.numeric], function(x) as.numeric(as.character(x)))
  print(paste("Converted", length(convert.to.numeric), "columns to numeric"))
  
  convert.to.date <- df$var[df$current.class != df$proposed.class & df$proposed.class=="Date"]
  for (var in convert.to.date)
    data[,var] <- as.Date(data[,var], get.date.format(data[,var]))
  print(paste("Converted ", length(convert.to.date)," columns to Date"))
  
  # Generate
  # if (is.null(get.date.columns(data))==FALSE) generate.date.features(data, get.date.columns(data))
  return(data)
}
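
# Usage sketch for the classify/convert pipeline, continuing the hypothetical
# 'toy' example above (illustrative only):
#   cols <- classify.columns(toy, y.name = "target")
#   get.proposed.conversions(cols)        # inspect suggested conversions
#   toy  <- convert.columns(cols, toy)    # apply them (grade -> factor, opened -> Date, ...)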

#' Get names of all numeric columns
#'
#' @param data data frame of interest
#' @return vector of numeric variable names
#' @examples
#' df.numerics <- get.numeric.columns(df)
#' @export
get.numeric.columns <- function(data)
{
  result <- c()
  for (i in 1:length(colnames(data)))
  {
    if (class(data[,i]) %in% c("numeric","integer")) result <- c(result, colnames(data)[i])
  }
  return(result)
}

#' Get names of all factor columns
#'
#' @param data data frame of interest
#' @return vector of factor variable names
#' @examples
#' df.factors <- get.factor.columns(df)
#' @export
get.factor.columns <- function(data)
{
  result <- c()
  for (i in 1:length(colnames(data)))
  {
    if (class(data[,i]) == "factor") result <- c(result, colnames(data)[i])
  }
  return(result)
}

#' Get names of all date columns
#'
#' @param data data.frame of interest
#' @return vector of date variable names
#' @examples
#' df.dates <- get.date.columns(df)
#' @export
get.date.columns <- function(data)
{
  result <- c()
  for (i in 1:length(colnames(data)))
  {
    if (class(data[,i]) == "Date") result <- c(result, colnames(data)[i])
  }
  return(result)
}

#' Generate date features
#'
#' Generates features based on columns of date type. Creates year, month and day features both numeric and factor.
#'
#' @param data data frame of interest
#' @param date.variables vector of date variable names
#' @param overwrite whether to overwrite already existing variables with the same names
#' @return data frame with new generated variables
#' @examples
#' df <- generate.date.features(df, c('date1','date2'))
#' @export
generate.date.features <- function(data, date.variables, overwrite=FALSE)
{
  for (i in date.variables)
    if (class(data[,i]) != "Date")
      stop("All variables must of class Date")
  
  for (i in date.variables)
    for (j in c(".month.num",".day.num",".year.num",".month",".day",".year"))
      if (paste(i,j,sep="") %in% colnames(data))
      {
        if (overwrite)
          warning(paste("Variable '", paste(i,j,sep=""), "' is already defined in input data frame and will be overwritten", sep=""))
        else
          stop(paste("Variable '", paste(i,j,sep=""), "' is already defined in input data frame. To overwrite specify parameter 'overwrite=TRUE'.", sep=""))
      }
  
  result <- data
  
  for (i in date.variables)
  {
    result[,paste(i,".month.num",sep="")] <- as.numeric(format(data[,i], "%m"))
    result[,paste(i,".day.num",sep="")] <- as.numeric(format(data[,i], "%d"))
    result[,paste(i,".year.num",sep="")] <- as.numeric(format(data[,i], "%Y"))
    result[,paste(i,".month",sep="")] <- factor(format(data[,i], "%m"))
    result[,paste(i,".day",sep="")] <- factor(format(data[,i], "%d"))
    result[,paste(i,".year",sep="")] <- factor(format(data[,i], "%Y"))
  }
  return(result)
}
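
# Usage sketch for the date helpers (illustrative only; assumes 'toy' was
# converted as in the sketch above, so 'opened' is of class Date):
#   date.cols <- get.date.columns(toy)
#   toy <- generate.date.features(toy, date.cols)
#   colnames(toy)   # now also contains opened.year, opened.month, opened.day, ...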

#' Calculate Chi-squared importance of each attribute
#'
#' Creates a data frame with the chi-squared statistic of the association between each factor and the binary target
#'
#' @param x data frame with variables of interest (all must be factors)
#' @param y target variable
#'
#' @return data frame with variable names sorted by chi-squared statistics
#' @export
get.chisq.importance <- function(x, y)
{
  variables <- colnames(x)
  df <- data.frame(var=rep('',length(variables)), chisq=rep(0,length(variables)), stringsAsFactors = FALSE)
  for (i in 1:length(variables))
  {
    df$var[i] <- variables[i]
    df$chisq[i] <- round(as.numeric(chisq.test(table(x[,variables[i]], y))$statistic),2)
  }
  df <- df[order(df$chisq, decreasing=TRUE),]
  rownames(df) <- NULL
  return(df)
}
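
# Usage sketch for get.chisq.importance (illustrative only; assumes the
# hypothetical 'toy' data frame was converted so its categorical columns are factors):
#   factor.vars <- setdiff(get.factor.columns(toy), "target")
#   get.chisq.importance(toy[, factor.vars, drop = FALSE], as.factor(toy$target))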

#' Calculate AUC importance of each attribute
#'
#' Creates a data frame with the univariate AUC of each numeric variable against the binary target
#'
#' @param x data frame with variables of interest (all must be numeric)
#' @param y target variable
#'
#' @return data frame with variable names sorted by AUC statistics
#' @export
get.auc.importance <- function(x,y)
{
  
  require(AUC)
  
  variables <- colnames(x)
  n.variables <- length(variables)
  df <- data.frame(var=rep('',n.variables), auc=rep(NA,n.variables), fill.rate=rep(NA,n.variables), stringsAsFactors = FALSE)
  for (i in 1:n.variables)
  {
    cat(paste("\rProcessing variable [",i,"/",n.variables,"]\r",sep=""))
    df$var[i] <- variables[i]
    if ((class(x[,i]) %in% c("numeric","integer"))==FALSE) next
    df$auc[i] <- round(AUC::auc(AUC::roc(x[,i], y)), 5)
    df$fill.rate[i] <- 1 - round((sum(is.na(x[,i])) / nrow(x)), 5)
  }
  cat("\n")
  df <- df[order(abs(df$auc-0.5), decreasing=TRUE),]
  rownames(df) <- NULL
  return(df)
}

#' Calculate RandomForest importance of each attribute
#'
#' Creates a data frame with calculated RandomForest importance for each variable
#'
#' @param x data frame with variables of interest
#' @param y target variable
#'
#' @return data frame with variable names sorted by RandomForest statistics
#' @export
get.rf.importance <- function(x,y){
  
  require(randomForest)
  
  print("Fitting Random Forest on all data. This might take some time.")
  rf.fit <- randomForest(x, y, ntree=100, importance=TRUE)
  result <- as.data.frame(rf.fit$importance[,c('MeanDecreaseAccuracy','MeanDecreaseGini')])
  result$var <- rownames(result)
  rownames(result) <- NULL
  result <- result[order(result$MeanDecreaseAccuracy, decreasing=TRUE), c('var','MeanDecreaseAccuracy','MeanDecreaseGini')]
  
  return(result)
}

#' Calculate combined importance measure of each attribute
#'
#' Creates a data frame with calculated chi-squared, AUC and RandomForest measures combined into one score and sorted by it
#'
#' @param x data frame with variables of interest
#' @param y target variable
#'
#' @return data frame with variable names sorted by combined importance statistics
#' @export
get.var.importance <- function(x,y)
{
  
  print("Computing AUC based importance")
  auc.imp <- get.auc.importance(x, y)
  rownames(auc.imp) <- NULL
  auc.imp$num <- as.numeric(rownames(auc.imp))
  
  print("Computing Random Forest based importance")
  rf.imp <- get.rf.importance(x, y)
  rownames(rf.imp) <- NULL
  rf.imp$num <- as.numeric(rownames(rf.imp))
  
  print("Computing Chi-squared based importance")
  chisq.imp <- get.chisq.importance(x, y)
  rownames(chisq.imp) <- NULL
  chisq.imp$num <- as.numeric(rownames(chisq.imp))
  
  print("Preparing result table")
  x <- merge(auc.imp[,c('var','num')], rf.imp[,c('var','num')], by='var', all.x=TRUE, all.y=TRUE)
  x <- merge(x, chisq.imp[,c('var','num')], by='var', all.x=TRUE, all.y=TRUE)
  colnames(x) <- c("VAR","AUC.RANK","RF.RANK","CHISQ.RANK")
  x$TOTAL.RANK <- x$AUC.RANK + x$RF.RANK + x$CHISQ.RANK
  x<-x[order(x$TOTAL.RANK),]
  rownames(x) <- NULL
  
  return(x)
  
}
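
# Usage sketch for the importance helpers (illustrative only; 'toy' and 'target'
# are hypothetical, and the predictors are assumed to be cleaned factors/numerics):
#   x <- toy[, setdiff(c(get.factor.columns(toy), get.numeric.columns(toy)), "target")]
#   y <- as.factor(toy$target)
#   get.auc.importance(x, y)     # univariate AUC per numeric column
#   get.rf.importance(x, y)      # Random Forest importance
#   get.var.importance(x, y)     # combined rank across AUC, RF and chi-squared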


# ----------------------------------------------------------------
#
# Calculate performance for each top-n subset of variables using RandomForest Fitting
#
# Function "get.chisq.importance"
#
#   input:
#     data - covariates (numerics and factors with <53 levels)
#     sorted.variables - vector of variables sorted by their importance
#     y.name - name of target variable
#     analyze.top.n - limit calculations to top "analyze.top.n" variables
#     RF.trees - tune confidence by setting number of trees for RandomForest fitting
#     times - number of repetitions to get more stable results
#     trainSplit - how to partition dataset: rate of training examples
#     logging - "sparse" - only control information / "dense" - all information
#
#   output:
#     data frame with performance measures (accuracy and for binary models - AUC) for each subset
#
# ----------------------------------------------------------------
#' Evaluate top-n variable subsets with Random Forest (beta)
#' @export
get.rf.subset.quality <- function(data, sorted.variables, y.name, analyze.top.n=50, RF.trees=100, times=1, trainSplit=0.7, logging='sparse'){
  
  require(caret)
  require(randomForest)
  require(pROC)  # roc(...)$auc below follows the pROC interface
  
  variable.subsets <- data.frame(var=sorted.variables, accuracy=rep(0, length(sorted.variables)), auc=rep(0, length(sorted.variables)))
  
  # AUC is computed only for binary targets
  compute.auc <- FALSE
  
  for (current.repeat in 1:times)
  {
    
    print(paste("Iteration ", current.repeat))
    
    trainIndex <- createDataPartition(data[,y.name], p=trainSplit, list=FALSE)
    data.train <- data[trainIndex,]
    data.test <- data[-trainIndex,]
    
    if (length(levels(as.factor(data[,y.name]))) == 2)
      compute.auc <- TRUE
    
    for (i in 2:analyze.top.n)
    {
      variable.subsets$var[i] <- sorted.variables[i]
      if (logging != 'sparse') print(paste("[Cycle ", current.repeat, "] Evalutaing for top ", i," subset: ", sorted.variables[i], sep=""))
      rf.fit <- randomForest(y=data.train[,y.name], x=data.train[,sorted.variables[1:i]], ntree=RF.trees, importance=FALSE)
      
      # For all kinds of classification compute accuracy
      scores.class <- predict(rf.fit, data.test[,sorted.variables[1:i]], type="response")
      variable.subsets$accuracy[i] <- variable.subsets$accuracy[i] + confusionMatrix(data.test[,y.name], scores.class)$overall[1]
      
      # For binary classification compute AUCs
      if (compute.auc)
      {
        scores.prob <- predict(rf.fit, data.test[,sorted.variables[1:i]], type="prob")
        variable.subsets$auc[i] <- variable.subsets$auc[i] + roc(as.factor(data.test[,y.name]), scores.prob[,2])$auc
      }
      
      if (logging != 'sparse') print(paste("Accuracy=", round(variable.subsets$accuracy[i],5), sep=""))
    }
    
    #variable.subsets[,paste(accuracy,".", current.repeat, sep="")] <- variable.subsets$accuracy
  }
  
  variable.subsets$accuracy <- round(variable.subsets$accuracy / times, 5)
  variable.subsets$auc <- round(variable.subsets$auc / times, 5)
  
  plot(variable.subsets$auc[2:analyze.top.n], type='l')
  return(variable.subsets)
}


#' Put rare factor values into separate group
#'
#' Sets rare values of a factor to the new value "other", thus limiting the number of factor levels.
#'
#' @param x data vector
#' @param top.values number of most frequent levels to keep (all other levels go to the "other" group)
#' @param min.frequency if a factor level occurs less frequently than this value, it is put into the "other" group
#' @examples
#' df$var <- group.factor(df$var, top.values=10)
#' df$var <- group.factor(df$var, min.frequency=100)
#' @return factor vector with rare levels replaced by "other"
#' @export
group.factor <- function(x, top.values=10, min.frequency=NA)
{
  
  require(plyr)
  
  if (class(x)!="factor")
    stop("Variable must be factor")
  
  # If the variable has fewer levels than "top.values", cap top.values
  top.values <- min(length(levels(x)), top.values)
  
  x <- data.frame(x)
  colnames(x) <- c("x")
  
  counts <- count(x)
  counts <- counts[order(counts$freq, decreasing=TRUE),]
  counts$group <- rep("", nrow(counts))
  rownames(counts) <- NULL
  
  # If we choose to select by minimum frequency, set top.values accordingly [1,n]
  if (!is.na(min.frequency))
  {
    if (sum(counts$freq >= min.frequency)==0)
    {
      top.values <- 0
    } else {
      top.values <- max(which(counts$freq >= min.frequency))
    }
  }
  
  # Set group values for top categories (skip when nothing passes the frequency threshold)
  if (top.values > 0) counts$group[1:top.values] <- as.character(counts$x[1:top.values])
  
  # Set "other" for all other categories
  if(nrow(counts) > top.values) counts$group[(top.values+1):nrow(counts)] <- rep("other", nrow(counts)-top.values)
  
  return(factor(join(x,counts[,c("x","group")], by="x", type="left")[[2]]))
  
}
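
# Usage sketch for group.factor (illustrative only):
#   city <- factor(sample(c("Moscow", "SPb", "Kazan", "Omsk", "Tula"), 500,
#                         replace = TRUE, prob = c(0.5, 0.3, 0.1, 0.05, 0.05)))
#   table(group.factor(city, top.values = 3))       # 3 most frequent levels + "other"
#   table(group.factor(city, min.frequency = 50))   # levels seen at least 50 times + "other"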


#' Apply groupings to new data
#'
#' Groups factor values according to mapping generated as an output of a function group.factor. Thus the same grouping can be applied to new datasets.
#' Now it uses dataset itself as a mapping, which is not very efficient. In future releases a proper mapping will be generated and used.
#'
#' @param data data frame whose factor variables have already been grouped (e.g. with group.factor)
#' @param variables factors to analyze
#' @param new.data new data frame with factor variables to convert
#' @param other.factor name of the "other" group
#' @examples
#' new.df <- apply.factor.grouping(df, c('var1','var2','var3'), new.df)
#' @return new data frame with the selected factor variables regrouped to match 'data'; unmatched values go to the "other" group
#' @export
apply.factor.grouping <- function(data, variables, new.data, other.factor="other")
{
  for (i in 1:length(variables))
  {
    
    # if (length(intersect(paste("group.", variables, sep=""), new.data)) > 0)
    #  stop(paste("New data already contains variable",intersect(paste("group.", variables, sep=""),sep="")))
    
    # Use initial dataset as mapping (value -> group)
    group.map <- unique(data.frame(var=data[,variables[i]], group=as.character(data[,variables[i]]), stringsAsFactors=FALSE))
    
    # Set name for the new variable
    colnames(group.map) <- c("var", paste("group.",variables[i],sep=""))
    # colnames(group.map) <- c("var", variables[i])
    
    # Merge created group variable into the new dataset
    new.data <- merge(new.data, group.map, by.x=variables[i], by.y="var", all.x=TRUE)
    
    # Select just merged variable
    current.vector <- new.data[,paste("group.",variables[i], sep="")]
    # current.vector <- new.data[,variables[i]]
    
    # If a value in the new dataset didn't match the mapping, assign it to the "other" group (name set via 'other.factor')
    current.vector[is.na(current.vector)] <- other.factor
    
    # Set new group variable to factor
    # new.data[,paste("group.",variables[i], sep="")] <- as.factor(current.vector)
    new.data[,variables[i]] <- as.factor(current.vector)
    
    new.data[,paste("group.",variables[i], sep="")] <- NULL
    
  }
  return(new.data)
}

# TODO: change algorithm from RF to some classification model
#' Find the optimal number of factor groups (beta)
#'
#' @export
optimal.factor.grouping <- function(x,y, fit.type="glm", max.levels=30)
{
  
  require(randomForest)
  require(AUC)
  require(caret)
  require(klaR)
  
  if (!("0" %in% levels(y) & "1" %in% levels(y)))
    stop("Please set target values to 0 and 1. Current version works only with numeric values")
  
  if (fit.type=="glm")
    print("Fit model = GLM (logistic regression)")
  if (fit.type=="rf")
    print("Fit model = Random Forest")
  
  
  trainIndex <- createDataPartition(y, p=0.7, list=FALSE)
  y.train <- y[trainIndex]
  y.test <- y[-trainIndex]
  result <- c()
  woes <- c()
  for (i in 1:max.levels)
  {
    cat(paste("\rProcessing group #",i,sep=""))
    x.grouped <- group.factor(x, top.values=i)
    x.train <- x.grouped[trainIndex]
    x.test <- x.grouped[-trainIndex]
    if (fit.type=="glm")
    {
      model.fit <- glm(formula("y~x"), data=data.frame(x=x.grouped, y=y), family=binomial(logit))
      model.pred <- predict(model.fit, data.frame(x=x.grouped), type="response")
      auc <- as.numeric(AUC::auc(AUC::roc(model.pred, y)))
      woes <- c(woes, woe(formula("y~x"), data=data.frame(x=x.grouped, y=y))$IV)
    }
    if (fit.type=="rf")
    {
      model.fit <- randomForest(data.frame(x=x.train), y.train, ntree=100)
      model.pred <- predict(model.fit, data.frame(x=x.test), type="prob")[,2]
      auc <- as.numeric(AUC::auc(AUC::roc(model.pred, y.test)))
    }
    
    result <- c(result, auc)
    
  }
  cat(paste(" => Optimal number of groups=",which.max(result)))
  cat("\n")
  return(as.vector(result))
}



# ----------------------------------------------------------------
#
# Calculate AUC for each individual variable (for binomial cases only)
#
# Function "get.auc"
#
#   input:
#     y - binary target variable (factor)
#     x - data frame with numeric covariates
#
#   output:
#     data frame with covariates from x sorted by AUC values
#
# ----------------------------------------------------------------
#' @export
get.auc <- function(y,x)
{
  
  require(AUC)
  
  auc.df <- as.data.frame(sapply(x,function(a) AUC::auc(AUC::roc(a,y))))
  colnames(auc.df) <- "auc"
  auc.df$var <- rownames(auc.df)
  rownames(auc.df) <- NULL
  auc.df <- auc.df[,c('var','auc')]
  auc.df$gini <- 2*abs(auc.df$auc-0.5)
  auc.df <- auc.df[order(abs(auc.df$auc-0.5), decreasing=TRUE),]
  return(auc.df)
}

# ----------------------------------------------------------------
#
# Calculate AUC for model based on each top-n subset of sorted variables
#
# Function "get.top.subset"
#
#   input:
#     variables - vector of variable names sorted by importance
#     data - data frame with covariates and the target
#     target - name of the binary target variable
#
#   output:
#     data frame with AUC of a GLM fit for each top-n subset of variables
#
# ----------------------------------------------------------------
#'
#' @export
get.top.subset <- function(variables, data, target)
{
  
  require(caret)
  require(pROC)  # roc(...)$auc below follows the pROC interface
  
  subsets.df <- data.frame(
    subset.num = rep("", length(variables)),
    auc = rep(0, length(variables)), stringsAsFactors = FALSE)
  
  trainIndex <- createDataPartition(data[,target], p=0.7, list=FALSE)
  train.data <- data[trainIndex,]
  test.data <- data[-trainIndex,]
  
  for (i in 1:length(variables))
  {
    cat(paste("\rProcessing variable: ", i," / ",length(variables),sep=""))
    current.vars <- variables[1:i]
    subsets.df$subset.num[i] <- paste("Top-",i,sep="")
    glm.fit <- glm(formula=as.formula(paste(target,"~.")), data=train.data[,c(current.vars,target)], family=binomial(probit))
    score <- predict(glm.fit, test.data, type='response')
    subsets.df$auc[i] <- roc(test.data[,target], score)$auc
  }
  return(subsets.df)
}





# Impute missing values in variable by its non-missing sample
random.imp <- function (a){
  missing <- is.na(a)
  n.missing <- sum(missing)
  a.obs <- a[!missing]
  imputed <- a
  imputed[missing] <- sample(a.obs, n.missing, replace=TRUE)
  return (imputed)
}

mean.imp <- function (a){
  a[is.na(a)] <-  mean(a, na.rm=TRUE)
  return(a)
}

#' Set missing to random
#'
#' Replaces missing values with values drawn at random from the non-missing values, preserving their empirical distribution
#'
#' @param data data frame of interest
#' @param variables variables with possibly missing values
#' @return data frame with missing values replaced by values from random sample
#' @export
set.missing.to.random <- function(data, variables)
{
  data[variables] <- as.data.frame(apply(data[variables], 2, function(x) {random.imp(x)}))
  return(data)
}

impute.missing.mean <- function(data, variables)
{
  data[variables] <- as.data.frame(apply(data[variables], 2, function(x) {mean.imp(x)}))
  return(data)
}

#' Standardize columns
#'
#' Calculates z-score for one or several variables
#'
#' @param data data.frame with variables of interest
#' @param variables vector of variables to process
#'
#' @return The same data frame with variables replaced with their z-scores
#'
#' @examples
#' standardize.columns(df, c('var1','var2','var3'))
#'
#' @export
standardize.columns <- function(data, variables)
{
  data[variables] <- as.data.frame(apply(data[variables], 2, function(x) {scale(x)}))
  return(data)
}
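
# Usage sketch for the imputation and scaling helpers (illustrative only; 'toy'
# is the hypothetical data frame from the sketches above):
#   num.vars <- get.numeric.columns(toy)
#   toy <- set.missing.to.random(toy, num.vars)   # or set.missing.to.mean / set.missing.to.zero
#   toy <- standardize.columns(toy, num.vars)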

# ----------------------------------------------------------------
#
# Replace outliers
#
# Function "replace.outliers.mean"
#
#   input:
#     data - data frame with variables of interest
#     variables - vector of variables to replace outliers
#     sigma.cutoff - minimum absolute z-score to be considered an outlier (variables are assumed to be standardized)
#
#   output:
#     data frame with outliers replaced by the column mean for the selected variables
#
# ----------------------------------------------------------------
replace.outliers.mean.f <- function (a, sigma.cutoff)
{
  outliers <- !is.na(a) & abs(a) > sigma.cutoff
  print(paste("replaced outliers: ", sum(outliers), sep=""))
  a[outliers] <- mean(a, na.rm=TRUE)
  return(a)
}

replace.outliers.mean <- function(data, variables, sigma.cutoff=3)
{
  data[variables] <- as.data.frame(apply(data[variables], 2, function(x) {replace.outliers.mean.f(x, sigma.cutoff)}))
  return(data)
}

#' Replace numeric missing values with mean
#'
#' Replaces missing (NA) values with column mean. Only applicable for numerics.
#'
#' @param data - data frame with variables of interest
#' @param variables - vector of numeric variables to replace missing values
#' @return data frame with replaced missing values for selected variables
#'
#' @export
set.missing.to.mean <- function(data, variables)
{
  data.means <- colMeans(data[variables], na.rm=TRUE)
  for (i in variables)
    data[is.na(data[,i]),i] <- data.means[i]
  return(data)
}

#' Replace numeric missing values with zero
#'
#' Replaces missing (NA) values with zero. Only applicable for numerics.
#'
#' @param data data frame with variables of interest
#' @param variables vector of numeric variables to replace missing values
#'
#' @return data frame with replaced missing values for selected variables
#'
#' @export
set.missing.to.zero <- function(data, variables)
{
  for (i in variables)
    data[is.na(data[,i]),i] <- 0
  return(data)
}

#' Replace empty factor values with "NA"
#'
#' Replaces missing (NA) and blank factor values with the level "NA". Recreates the factor variable with the new level.
#'
#' @param data data frame with variables of interest
#' @param variables vector of factors
#' @param logging flag to write detailed stats in standard output (default=FALSE)
#'
#' @return data frame with replaced missing values for selected variables
#'
#' @examples
#' set.missing.factors.to.NA(df, c('var1','var2','var3'), logging=TRUE)
#' @export
set.missing.factors.to.NA <- function(data, variables, logging=FALSE)
{
  for (i in variables)
  {
    if (class(data[,i]) != "factor") stop("All variables must be factors!")
    
    if (logging) print(paste("Replaced ",sum(is.na(data[,i]))," missing values for factor ", i))
    data[,i] <- as.character(data[,i])
    data[is.na(data[,i]), i] <- "NA"
    
    if (logging) print(paste("Replaced ",sum(data[,i]=="")," blank values for factor ", i))
    data[data[,i]=="", i] <- "NA"
    data[,i] <- factor(data[,i])
  }
  return(data)
}
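
# Usage sketch for set.missing.factors.to.NA (illustrative only):
#   toy2 <- data.frame(grade = factor(c("A", "", "B", NA, "A")))
#   toy2 <- set.missing.factors.to.NA(toy2, c("grade"), logging = TRUE)
#   levels(toy2$grade)   # "A" "B" "NA"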



#' Predict missing numeric values (beta)
#'
#' Uses linear regression on the remaining columns to predict the most probable value.
#' Not yet tested.
#'
#' @export
set.missing.to.prediction <- function(x)
{
  for (current.var in colnames(x))
  {
    print(paste("Processing variable ", current.var, sep=""))
    missingIndex <- is.na(x[,current.var])
    non.missingIndex <- !is.na(x[,current.var])
    
    lm.fit <- lm(as.formula(paste(current.var,"~.",sep="")), data=x[non.missingIndex,])
    lm.pred <- as.vector(predict(lm.fit, x[missingIndex,]))
    
    print(paste("Could not impute for ", sum(is.na(lm.pred)), " cases out of ", length(lm.pred), sep=""))
    imputed <- x[, current.var]
    imputed[missingIndex] <- lm.pred
    x[, current.var] <- imputed
  }
  return(x)
}

#' Compute average AUC with cross-validation
#'
#' Repeatedly splits the data at random, fits a model on the training part, computes AUC on the test part and reports the average AUC.
#' In future releases this will be replaced with proper k-fold cross-validation.
#'
#' @param formula formula to fit regression on
#' @param data input data.frame
#' @param target.var target variable name
#' @param n number of repetitions
#' @param fit.type Type of model to use (GLM or RandomForest)
#' @param train.rate fraction of data to set as training (default=0.7)
#' @export
cv.glm <- function(formula, data, target.var, n, fit.type="glm", train.rate=0.7)
{
  
  require(caret)
  require(randomForest)
  require(AUC)
  
  if (fit.type=="glm")
    cat("Fitting method = GLM (logit)\n")
  if (fit.type=="rf")
    cat("Fitting method = Random Forest\n")
  
  auc <- c()
  # cases <- c()
  for (i in 1:n)
  {
    cat(paste("\rIteration [",i,"/",n,"]",sep=""))
    trainIndex <- createDataPartition(data[,target.var], p=train.rate, list=FALSE)
    trainData <- data[trainIndex,]
    testData <- data[-trainIndex,]
    if (fit.type=="glm")
    {
      cv.fit <- glm(formula, data=trainData, family = binomial(logit))
      testData$score <- predict(cv.fit, testData)
    }
    if (fit.type=="rf")
    {
      cv.fit <- randomForest(formula=formula, data=trainData, ntree=100, proximity=FALSE)
      testData$score <- predict(cv.fit, testData, type="prob")[,2]
    }
    auc[i] <- as.numeric(AUC::auc(AUC::roc(testData$score, as.factor(testData[,target.var]))))
    #cases[i] <- complete.cases(trainData[,as.character(attr(terms(formula(formula)), 'variables'))[-c(1,2)]])
  }
  cat("\nIteration Results:")
  print(auc)
  print(paste("Average AUC = ", as.character(round(mean(auc),4)), "; Standard Deviation = ", as.character(round(sd(auc), 4)), sep=""))
  return(mean(auc))
}
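
# Usage sketch for cv.glm (illustrative only; 'toy' is hypothetical and the
# target must be a binary factor):
#   toy$target <- as.factor(toy$target)
#   cv.glm(target ~ amount + grade, data = toy, target.var = "target", n = 5, fit.type = "glm")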


#' Find an optimal variable subset (beta, work in progress)
#'
#' @export
find.optimal.subset <- function(x, y)
{
  n.vars <- length(x)
  n.iters <- 100
  # Draw random subset sizes from an exponential distribution, clipped to [1, n.vars]
  subset.sizes <- pmin(pmax(ceiling(rexp(n.iters, rate=0.25)), 1), n.vars)
  for (i in subset.sizes)
  {
    current.subset <- sample(1:n.vars, i, replace=FALSE)
    rf.fit <- randomForest(y=y, x=x[, current.subset, drop=FALSE], ntree=100, importance=FALSE)
  }
  return(1)
}

#' Create WOE mapping
#'
#' Computes Weight of Evidence (WOE) for each level of the selected factor variables.
#' If one class has zero observations for a level, its WOE is capped at 2 * max(WOE) (or 2 * min(WOE))
#'
#' @param x data.frame with variables
#' @param vars variables of interest (must be factors!)
#' @param y target variable
#' @examples
#' create.woe(df, c('var1','var2','var3'), df$target)
#' @return data.frame with WOE mapping (value->woe) that can be further applied to new datasets
#' @export
create.woe <- function(x, vars=colnames(x), y)
{
  
  for (i in vars)
    if (class(x[,i])!='factor')
      stop(paste(i," is not a factor! To compute WOEs all variables must be factors",sep=""))
  
  if (length(levels(y)) != 2)
    stop(paste("Target variable should be binary! Now it has ", length(levels(y)), " levels", sep=""))
  
  result <- data.frame(var=character(0), category=character(0), woe=numeric(0))
  for (i in 1:length(vars))
  {
    cat("\rCalculating WOE for variable ",i," / ",length(vars),"\r",sep="")
    
    d <- merge(data.frame(table(x[y==0,vars[i]])), data.frame(table(x[y==1,vars[i]])), by.x='Var1', by.y='Var1', all.x=TRUE, all.y=TRUE)
    d$var <- vars[i]
    d<-d[,c(4,1,2,3)]
    colnames(d) <- c('var','category','cnt.0','cnt.1')
    
    total0 <- sum(d[,'cnt.0'])
    total1 <- sum(d[,'cnt.1'])
    d$woe <- log((d[,'cnt.0'] / total0) / (d[,'cnt.1'] / total1))
    
    d$woe[d$woe==Inf] <- 2 * max(d$woe[d$woe!=Inf])
    d$woe[d$woe==-Inf] <- 2 * min(d$woe[d$woe!=-Inf])
    d$woe[d$woe==Inf] <- 1
    d$woe[d$woe==-Inf] <- -1
    
    d <- d[,c('var','category','woe')]
    result <- rbind(result, d)
  }
  cat("\n")
  return(result)
}

#' Apply WOE mapping to new data
#'
#' Replaces factor values with corresponding WOE scores
#'
#' @param x data.frame with factor values to replace
#' @param woe.map mapping generated with create.woe (or built manually)
#' @examples
#' woe.apply(df, woe.map)
#' @export
woe.apply <- function(x, woe.map)
{
  
  if (nrow(woe.map[,c('var','category')]) != nrow(unique(woe.map[,c('var','category')])))
    stop("Non-unique categories. Check WOE mapping.")
  
  vars <- unique(woe.map$var)
  
  # Set row numbers as IDs (to merge intermediate results into initial table)
  rownames(x) <- NULL
  x$row <- rownames(x)
  
  for (i in 1:length(vars))
  {
    cat("\rApplying WOE to variable ",i,"/",length(vars),"\r",sep="")
    woe.var <- merge(data.frame(row=x$row, category=x[,vars[i]]), woe.map[woe.map$var==vars[i],c('category','woe')], by.x='category', by.y='category', all.x=TRUE)
    woe.var <- woe.var[,c('row','woe')]
    woe.var$woe <- ifelse(is.na(woe.var$woe)==TRUE, 0, woe.var$woe)
    colnames(woe.var) <- c('row',paste("woe.",vars[i],sep=""))
    x <- merge(x, woe.var, by.x='row', by.y='row', all.x=TRUE)
  }
  cat("\n")
  
  # Reset to initial sorting
  x <- x[order(as.numeric(x$row)),]
  
  # Delete temporary attribute
  x$row <- NULL
  
  return (x)
}
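
# Usage sketch for the WOE helpers (illustrative only; 'toy' is the hypothetical
# data frame from the sketches above, with factor predictors):
#   factor.vars <- setdiff(get.factor.columns(toy), "target")
#   woe.map <- create.woe(toy, factor.vars, as.factor(toy$target))
#   toy <- woe.apply(toy, woe.map)   # adds woe.<variable> columns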

#' Create cross-tabulation between factor and binary target
#'
#' Creates a table with statistics for each factor value, including lift and WOE. Adds a totals row and sorts the result by lift.
#'
#' @param factor factor variable
#' @param target binary target variable
#' @return Data frame with the number of cases of each class for every factor value, lift and woe.
#' @examples
#' compute.woe(df$var, df$target)
#' @export
compute.woe <- function(factor, target)
{
  
  if (class(target) != "factor")
    stop("target variable must be factor")
  
  if (class(factor) != "factor")
    stop("factor variable must be factor")
  
  x <- table(factor, target)
  n.categories <- nrow(x)
  x <- cbind(x,rep(0,n.categories))
  x <- cbind(x,rep(0,n.categories))
  x <- cbind(x,rep(0,n.categories))
  x <- cbind(x,rep(0,n.categories))
  colnames(x) <- c('0','1','rate','avg','lift','woe')
  x <- rbind(x,c(0,0,0,0,0,0))
  rownames(x)[n.categories+1] <- 'Total'
  x[n.categories+1,1] <- sum(x[1:n.categories,1])
  x[n.categories+1,2] <- sum(x[1:n.categories,2])
  x[,3] <- round(x[,2] / ( x[,1] + x[,2] ), 2)
  x[,4] <- x[n.categories+1,3]
  x[,5] <- round(x[,3] / x[,4], 2)
  x[,6] <- log((x[,2] / x[,1]) / (x[n.categories+1,2] / x[n.categories+1,1]))
  
  x <- x[c(order(x[1:n.categories,5], decreasing=TRUE),n.categories+1),]
  
  return (x)
  
}
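
# Usage sketch for compute.woe (illustrative only):
#   grade  <- factor(sample(c("A", "B", "C"), 500, replace = TRUE))
#   target <- factor(rbinom(500, 1, ifelse(grade == "A", 0.1, 0.3)))
#   compute.woe(grade, target)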