R/smoteMod.R
In sambia: A Collection of Techniques Correcting for Sample Selection Bias

Documented in smoteMod

#' smoteMod is a modified version of the 'synthetic minority oversampling technique to generate new data.
#' @author Norbert Krautenbacher, Kevin Strauss, Maximilian Mandl, Christiane Fuchs
#' @description This method adapts SMOTE to the context of stratified random samples. Rather than enlarging only the
#' minority class, smoteMod generates synthetic data for all strata with a weight bigger than 1.
#' Note: this function has to apply SMOTE H-1 times:
#' 1. subsample data by smallest stratum and a stratum to oversample
#' 2. oversample with modified SMOTE function according to weight of the stratum
#' 3. do this for the other H-2 to subsamples
#' 4. build new data set with strata where H-1 strata contain synthetic data (stratum with smallest weight remains as is)
#' @param data.x A data frame or matrix of numeric-attributed dataset
#' @param stratum a numerical vector of the same length as the number of the rows of data. Depending on the number of strata variables and their number of exposures each such combination is assigned to a numeric class id. The i-th entry of stratum contains the class id (and therefore class belonging) of the i-th row (=observation) of data.
#' @param weights a numerical vector whose length must coincide with the number of the rows of data. The i-th value contains the inverse-probability e.g. determines how often the i-th observation of data shall be replicated.
#' @param data.y A vector of a target class attribute corresponding to a dataset data.x.
#' @param K The number of nearest neighbors during sampling process
#' @examples
#' ## simulate data for a population
#' require(pROC)
#'
#' set.seed(1342334)
#' N = 100000
#' x1 <- rnorm(N, mean=0, sd=1) 
#' x2 <- rt(N, df=25)
#' x3 <- x1 + rnorm(N, mean=0, sd=.6)
#' x4 <- x2 + rnorm(N, mean=0, sd=1.3)
#' x5 <- rbinom(N, 1, prob=.6)
#' x6 <- rnorm(N, 0, sd = 1) # noise not known as variable
#' x7 <- x1*x5 # interaction
#' x <- cbind(x1, x2, x3, x4, x5, x6, x7)
#'
#' ## stratum variable (covariate)
#' xs <- c(rep(1,0.1*N), rep(0,(1-0.1)*N))
#'
#' ## effects
#' beta <- c(-1, 0.2, 0.4, 0.4, 0.5, 0.5, 0.6)
#' beta0 <- -2
#'
#' ## generate binary outcome
#' linpred.slopes <-  log(0.5)*xs + c(x %*% beta)
#' eta <-  beta0 + linpred.slopes
#'
#' p <- 1/(1+exp(-eta)) # this is the probability P(Y=1|X), we want the binary outcome however:
#' y<-rbinom(n=N, size=1, prob=p) #
#'
#' population <- data.frame(y,xs,x)
#'
#' #### draw "given" data set for training
#' sel.prob <- rep(1,N)
#' sel.prob[population$xs == 1] <- 9
#' sel.prob[population$y == 1] <- 8
#' sel.prob[population$y == 1 & population$xs == 1] <- 150
#' ind <- sample(1:N, 200, prob = sel.prob)
#'
#' data = population[ind, ]
#'
#' ## calculate weights from original numbers for xs and y
#' w.matrix <- table(population$y, population$xs)/table(data$y, data$xs)
#' w <- rep(NA, nrow(data))
#' w[data$y==0 & data$xs ==0] <- w.matrix[1,1]
#' w[data$y==1 & data$xs ==0] <- w.matrix[2,1]
#' w[data$y==0 & data$xs ==1] <- w.matrix[1,2]
#' w[data$y==1 & data$xs ==1] <- w.matrix[2,2]
#'
#' ### draw a test data set
#' newdata = population[sample(1:N, size=200 ), ]
#'
#' K = 5
#' genData = smoteMod(data.x = data[ , -which(colnames(data) %in% c('y', 'xs'))] , 
#' stratum = w, data.y = data$y, weights = w, K=K)
#' @export
#' @import e1071 ranger pROC FNN mvtnorm

smoteMod <- function(data.x, stratum, weights, data.y=NULL, K){
  data <- data.frame(data.x, stratum, weights)

  w <- round(weights/min(weights))

  str.min <- unique(stratum[ w==1])

  # stratum unique
  str.un <- unique(stratum)

  # stratum without the smallest
  str.wo <- str.un[! str.un==str.min]

  dat.mod <- data.frame(data.x, w, stratum)

  dat.new <- subset(data, stratum==str.min, select= -weights) # add data of stratum with weight 1 (only stratum with no synth data)

  for(st in str.wo){ # generate data for other stratum and add it
    dat.2st <- subset(dat.mod, stratum==str.min | stratum== st) # build data set with only two "classes"
    data.x.st <- subset(data.x, stratum==str.min | stratum== st)
    dat.smoted <- smoteNew(data.x = data.x.st , data.y = dat.2st$stratum,
      K=K,
      dup_size = unique(dat.2st$w[dat.2st$stratum==st])-1,
      class.to.oversample=st)$data
    dat.new <- rbind(dat.new, subset(dat.smoted, stratum== st ))

  }
  if(!is.null(data.y)){ # add y-values if given
    data.y.strat.comb <- unique(data.frame(data.y, stratum))
    data.y.vals <- unique(data.y)
    dat.new$data.y = data.y.vals[1]
    dat.new$data.y[dat.new$stratum %in% data.y.strat.comb$stratum[data.y.strat.comb$data.y==data.y.vals[2]]]<- data.y.vals[2]
  }
  if(sum(w) != nrow(dat.new)) stop("Something went wrong: sum of weights is unequal to number of observations of new dataset!")
  return(dat.new)
}