R/quantile_discretization.R

Defines functions quantile_discretization DiscByQuantile quantileDisc cutpoints quantile_disc

Documented in quantile_discretization

quantile_disc = function(cov1,n=30) {

  cuts <- cutpoints(as.vector(cov1), n)
  cuts_val = numeric()
  #cuts[length(cuts)] = max(cov1)
  cuts[1] = -Inf
  disc = findInterval(cov1, cuts, rightmost.closed = T)
  while  ( sum(table(disc) < 30) > 0 )
  {
    tb = table(disc); val = as.numeric(names(tb[tb < 30]))
    for (i in val)
    {

      disc[disc == i] = i+1
      cuts_val = append(cuts_val, i+1) # found interval with less than 30 observations.
      #cuts = cuts[cuts != (i+1)]
    }

  }
  cuts[1] = -Inf;cuts[length(cuts)] = Inf;
  if (length(cuts_val) > 0)
  {
    cuts = cuts[-cuts_val]
    disc = findInterval(cov1, cuts, rightmost.closed = T)
  }
  list('data'=disc, 'cuts'=cuts[-c(1, length(cuts))]) # return without the extremes -Inf and Inf.
}


cutpoints = function(x, n=30) {
  len = length(x)
  if (len >= 30){ #data must have 30 observations
    if (len %/% n == 0) { n = 30 }
    if (len > n*200) { n = floor(0.005*len) }
    n_perc = len %/% n
    quants = seq(from=0, to=1, by=(100/n_perc)/100); quants[length(quants)] = 1
    #quants = quants[c(-1, -length(quants))]

    cpts = quantile(x, quants, na.rm=T)
    #print(cpts)
  } else { stop("Error: Data must have at least 30 observations.")}
  unique(cpts[order(cpts)])
}

# Quantile discretization for univariate model.
quantileDisc = function(cov1, n=30) {
  cuts <- cutpoints(cov1, n)
  cuts_val = numeric()
  cuts[1] = -Inf
  disc = findInterval(cov1, cuts, rightmost.closed = T)
  while  ( sum(table(disc) < 30) > 0 )
  {
    tb = table(disc); val = as.numeric(names(tb[tb < 30]))
    for (i in val)
    {

      disc[disc == i] = i+1
      cuts_val = append(cuts_val, i+1)
      #cuts = cuts[cuts != (i+1)]
    }

  }
  cuts[1] = -Inf;cuts[length(cuts)] = Inf;
  if (length(cuts_val) > 0)
  {
    disc = findInterval(cov1, cuts, rightmost.closed = T)
  }
  list('data'=disc, 'cuts'=cuts)
}

# cutpoints = function(x, n=30) {
#   len = length(x)
#   if (len >= 30){ #data must have 30 observations
#     if (len %/% n == 0) { n = 30 }
#     if (len > n*200) { n = floor(0.005*len) }
#     n_perc = len %/% n
#     quants = seq(from=0, to=1, by=(100/n_perc)/100); quants[length(quants)] = 1
#     #quants = quants[c(-1, -length(quants))]
#
#     cpts = quantile(x, quants)
#     #print(cpts)
#   } else { stop("Error: Data must have at least 30 observations.")}
#   unique(cpts)
# }

DiscByQuantile = function(db, n = 30) {
  len <- length(db)
  Names <- names(db)[-length(db)]
  resp <- db[, len]
  cuts <- list()
  for (i in Names) {
    mask <- !is.na(db[[i]])
    aux_ = quantileDisc(db[mask, i], n)
    cuts[[i]] = aux_$cuts
  }
  for (i in Names){
    aux <- cuts[[i]]
    aux <- aux[-length(aux)]
    db[, i] <- findInterval(db[, i], aux, rightmost.closed = T)
  }
  list('data'=db, 'cuts'=cuts)
}

#' Discretize a continuous database by quantile.
#'
#' @param db Dataframe to discretize.
#' @param n Numer of observations by quantile.
#'
#' @return List with the discretized dataframe and a vector of cutpoints.
#' @export
quantile_discretization = function(db, n = 30) {
  len = length(db)
  Names = names(db)[-length(db)]
  resp = db[, len]
  List = list()
  cuts = list()
  for (i in Names) {
    #DB = data.frame(db[i], resp)
    aux_ = quantile_disc(db[[i]], n)
    aux = factor(aux_$data)
    cuts[[i]] = aux_$cuts
    List[[i]] = aux
  }
  DiscPerc = data.frame(List, resp)

  list('data'=DiscPerc, 'cuts'=cuts)
}
diegomattozo/categorization documentation built on Sept. 26, 2020, 3:42 p.m.