R/pre_process.R

Defines functions rep_nom_attr replace_nominal_attributes remove_skewness_labels remove_unlabeled_instances remove_unique_attributes normalize_mldata fill_sparse_mldata

Documented in fill_sparse_mldata normalize_mldata remove_skewness_labels remove_unique_attributes remove_unlabeled_instances replace_nominal_attributes

#' Fill sparse dataset with 0 or '' values
#'
#' Transform a sparse dataset filling NA values to 0 or '' based on the column
#' type. Text columns with numeric values will be modified to numerical.
#'
#' @family pre process
#' @param mdata The mldr dataset to be filled.
#' @return a new mldr object.
#' @export
#'
#' @examples
#' sparse.toy <- toyml
#' sparse.toy$dataset$ratt10[sample(100, 30)] <- NA
#' complete.toy <- fill_sparse_mldata(sparse.toy)
fill_sparse_mldata <- function(mdata) {
  is.letter <- function(x) {
    grepl("[[:alpha:]]", x)
  }

  attrs <- seq(mdata$measures$num.attributes)
  new.cols <- lapply(mdata$dataset[, attrs], function(col) {
    if (anyNA(col)) {
      # Has NA value
      if (is.numeric(col)) {
        # Numeric value - fill with 0
        col[is.na(col)] <- 0
      }
      else if (any(is.letter(col))) {
        # Text value - fill with ''
        col <- as.character(col)
        col[is.na(col)] <- ""
      }
      else {
        # Text but with numeric values - convert to numeric and fill with 0
        col <- as.numeric(as.character(col))
        col[is.na(col)] <- 0
      }
    }
    col
  })

  dataset <- data.frame(row.names = rownames(mdata$dataset))
  dataset <- cbind(dataset, new.cols)
  mldr::mldr_from_dataframe(dataset, mdata$labels$index, name = mdata$name)
}

#' Normalize numerical attributes
#'
#' Normalize all numerical attributes to values between 0 and 1. The highest
#' value is changed to 1 and the lowest value to 0.
#'
#' @family pre process
#' @param mdata The mldr dataset to be normalized.
#' @return a new mldr object.
#' @export
#'
#' @examples
#' norm.toy <- normalize_mldata(toyml)
normalize_mldata <- function(mdata) {
  data <- mdata$dataset[seq(mdata$measures$num.attributes)]
  for (col in mdata$attributesIndexes) {
    if (is.numeric(data[, col])) {
      data[col] <- utiml_normalize(data[col])
    }
  }
  mldr::mldr_from_dataframe(data, mdata$labels$index, name = mdata$name)
}

#' Remove attributes from the dataset
#'
#' Remove specified attributes generating a new multi-label dataset.
#'
#' @family pre process
#' @param mdata The mldr dataset to remove labels.
#' @param attributes Attributes indexes or attributes names to be removed.
#' @return a new mldr object.
#' @note If invalid attributes names or indexes were informed, they will be
#'  ignored.
#' @export
#'
#' @examples
#' toyml1 <- remove_attributes(toyml, c("iatt8","iatt9", "ratt10"))
#' toyml2 <- remove_attributes(toyml, 10)
remove_attributes <- function (mdata, attributes) {
  if (mode(attributes) == "character") {
    attributes <- which(colnames(mdata$dataset) %in% attributes)
  }

  use.attributes <- setdiff(seq(mdata$measures$num.attributes), attributes)
  create_subset(mdata, seq(mdata$measures$num.instances), use.attributes)
}

#' Remove labels from the dataset
#'
#' Remove specified labels generating a new multi-label dataset.
#'
#' @family pre process
#' @param mdata The mldr dataset to remove labels.
#' @param labels Label indexes or label names to be removed.
#' @return a new mldr object.
#' @note If invalid labels names or indexes were informed, they will be ignored.
#' @export
#'
#' @examples
#' toyml1 <- remove_labels(toyml, c("y1","y5"))
#' toyml2 <- remove_labels(toyml, c(11, 15))
remove_labels <- function (mdata, labels) {
  if (mode(labels) == "character") {
    labels <- mdata$labels[labels, "index"]
    labels <- labels[!is.na(labels)]
  }
  else {
    # Only labels index, not attributes index
    labels <- mdata$labels$index[which(mdata$labels$index %in% labels)]
  }

  new.attrs <- setdiff(seq(mdata$measures$num.attributes), labels)
  dataset <- mdata$dataset[new.attrs]
  labels <- which(colnames(dataset) %in% rownames(mdata$labels))

  if (length(labels) <= 1) {
    stop("The pre process procedure result in a single label")
  }

  mldr::mldr_from_dataframe(dataset, labels, name = mdata$name)
}

#' Remove unique attributes
#'
#' Remove the attributes that have a single value for all instances. Empty and
#' NA values are considered different values.
#'
#' @family pre process
#' @param mdata The mldr dataset to remove.
#' @return a new mldr object.
#' @export
#'
#' @examples
#' alt.toy <- toyml
#' alt.toy$dataset$ratt10 <- mean(alt.toy$dataset$ratt10)
#' new.toy <- remove_unique_attributes(alt.toy)
remove_unique_attributes <- function(mdata) {
  labelsIndexes <- c()
  attributesIndexes <- c()

  for (col in seq(mdata$measures$num.attributes)) {
    if (col %in% mdata$labels$index) {
      attributesIndexes <- c(attributesIndexes, col)
      labelsIndexes <- c(labelsIndexes, length(attributesIndexes))
    }
    else {
      if (length(unique(mdata$dataset[, col])) > 1) {
        attributesIndexes <- c(attributesIndexes, col)
      }
    }
  }

  mldr::mldr_from_dataframe(mdata$dataset[attributesIndexes],
                            labelsIndexes,
                            name = mdata$name)
}

#' Remove examples without labels
#'
#' Remove the examples that do not have labels.
#'
#' @family pre process
#' @param mdata The mldr dataset to remove the instances.
#' @return a new mldr object.
#' @export
#'
#' @examples
#' new.toy <- remove_labels(toyml, c(12,14))
#' remove_unlabeled_instances(new.toy)
remove_unlabeled_instances <- function(mdata) {
  labelset <- rep(0, mdata$measures$num.labels)
  rows <- !apply(mdata$dataset[mdata$labels$index] == labelset, 1, all)
  create_subset(mdata, rows)
}

#' Remove unusual or very common labels
#'
#' Remove the labels that have smaller number of positive or negative examples
#' based on a specific threshold value.
#'
#' @family pre process
#' @param mdata The mldr dataset to remove the skewness labels.
#' @param t Threshold value. Number of minimum examples positive and negative.
#' @return a new mldr object.
#' @export
#'
#' @examples
#' remove_skewness_labels(toyml, 20)
remove_skewness_labels <- function(mdata, t = 1) {
  labelsIndexes <- c()

  for (col in mdata$labels$index) {
    tbl <- table(mdata$dataset[col])
    if (length(tbl) > 1 && all(tbl > t)) {
      labelsIndexes <- c(labelsIndexes, col)
    }
  }

  if (length(labelsIndexes) <= 1) {
    stop("The pre process procedure result in a single label")
  }

  dataset <- mdata$dataset[sort(c(mdata$attributesIndexes, labelsIndexes))]
  labels <- which(colnames(dataset) %in% rownames(mdata$labels))

  mldr::mldr_from_dataframe(dataset, labels, name = mdata$name)
}

#' Replace nominal attributes
#' Replace the nominal attributes by binary attributes.
#'
#' @family pre process
#' @param mdata The mldr dataset to remove.
#' @param ordinal.attributes Not yet, but it will be used to specify which
#'  attributes need to be replaced.
#' @return a new mldr object.
#' @export
#'
#' @examples
#' new.toy <- toyml
#' new.column <- as.factor(sample(c("a","b","c"), 100, replace = TRUE))
#' new.toy$dataset$ratt10 <- new.column
#' head(replace_nominal_attributes(new.toy))
replace_nominal_attributes <- function(mdata, ordinal.attributes = list()) {
  dataset <- data.frame(row.names = rownames(mdata$dataset))
  labelIndexes <- c()
  for (col in seq(mdata$measures$num.attributes)) {
    if (is.numeric(mdata$dataset[, col])) {
      dataset <- cbind(dataset, mdata$dataset[col])
      if (col %in% mdata$labels$index) {
        labelIndexes <- c(labelIndexes, ncol(dataset))
      }
    }
    else {
      column <- rep_nom_col(mdata$dataset[, col], colnames(mdata$dataset[col]))
      dataset <- cbind(dataset, column)
    }
  }

  mldr::mldr_from_dataframe(dataset, labelIndexes, name = mdata$name)
}

rep_nom_col <- function (column, column.name = "", type = 1) {
  # TODO ordinal.attributes
  column <- as.factor(column)
  symbols <- levels(column)
  result <- {}

  for (i in seq(length(symbols) - type)) {
    result <- cbind(result, as.double(column == symbols[i]))
  }
  names <- paste(column.name, symbols[seq(length(symbols) - type)], sep="_")

  if (column.name != "") {
    colnames(result) <- names
  }

  result
}

rep_nom_attr <- function(sdata, include.last = TRUE) {
  sdata <- as.data.frame(sdata)
  dataset <- data.frame(row.names = rownames(sdata))
  labelIndexes <- c()
  cols <- seq(ifelse(include.last, ncol(sdata), ncol(sdata)-1))
  for (col in cols) {
    if (is.numeric(sdata[, col])) {
      dataset <- cbind(dataset, sdata[col])
    }
    else {
      column <- rep_nom_col(sdata[, col], colnames(sdata[col]))
      dataset <- cbind(dataset, column)
    }
  }

  if (!include.last) {
    dataset <- cbind(dataset, sdata[ncol(sdata)])
  }

  dataset
}
rivolli/utiml documentation built on June 1, 2021, 11:48 p.m.