R/tuple.package.R

######################################################################
## CODE AND DOCUMENTATION FOR PACKAGE tuple
######################################################################

#' @name tuple-package
#' @docType package
#' @title
#'   Find Matches, or Orphan, Duplicate or Triplicate Values
#' @description
#'   This package extends the base R functionality around checking
#'   for \code{\link[base]{unique}} and duplicate values in vectors.
#' @details
#'   The following changes are documented since the first release
#'   of this package on CRAN:
#' \tabular{lll}{
#' Version \tab Change               \tab Description              \cr
#' 0.3-06  \tab None                 \tab Initial release to CRAN. \cr
#' 0.4-01  \tab Added \code{\link{\%!in\%}}                   \tab
#'     This function tests for the opposite of the commonly        \cr
#' \tab \tab used testing operator \code{"\%in\%"}
#'           as documented in \code{\link[base]{match}}.           \cr
#'         \tab Added documentation                           \tab
#'     Added documentation for the package as a whole.             \cr
#' \tab \tab Implemented this change log.                          \cr
#'         \tab Improved documentation                        \tab
#'     Cleaned and otherwise improved documentation                \cr
#' \tab \tab that is generated by way of the \pkg{roxygen2}
#'           package                                               \cr
#' \tab \tab for existing functions.                               \cr
#'         \tab Added \code{\link{tuplicated}}                \tab
#'     This function is a major addition to the package.           \cr
#' \tab \tab It provides a generic way to find elements of a       \cr
#' \tab \tab vector that are replicated n or more times.           \cr
#' \tab \tab Fundamentally it depends only on the code for         \cr
#' \tab \tab \code{\link{duplicated}} as in the first version
#'           of this                                               \cr
#' \tab \tab package released to CRAN. The implementation          \cr
#' \tab \tab of \code{\link{triplicated}} has not been changed
#'           in this                                               \cr
#' \tab \tab in this update from version 0.3-06, but it will be    \cr
#' \tab \tab changed to call \code{\link{tuplicated}} 
#'           with \code{tuple = 3}                                 \cr
#' \tab \tab in a future release.                                  \cr
#'         \tab Added \code{\link{tuplicate}}                 \tab
#'     This function is another major addition. It provides        \cr
#' \tab \tab a generic way to find elements of a vector that are   \cr
#' \tab \tab replicated exactly n times. It depends on the code    \cr
#' \tab \tab for the newly-released
#'           \code{\link{tuplicated}}, and on the code             \cr
#' \tab \tab for \code{\link{orphan}} as in the initial package
#'           released to CRAN.                                     \cr
#' \tab \tab The implementation of
#'           \code{\link{triplicate}} has not changed              \cr
#' \tab \tab from version 0.3-06, but it will be changed to call   \cr
#' \tab \tab \code{\link{tuplicate}} with \code{tuple = 3}
#'           in a future release.                                  \cr
#' 0.4-02  \tab Added \code{\link{matchNone}}                 \tab
#'     This function returns a character string, based             \cr
#' \tab \tab on the table, that does not appear in the data.       \cr
#' }
NULL

#' @title
#'   Match All Values
#' @description
#'   Extends the functionality of \code{\link[base]{match}} to identify
#'   all matching values, instead of just the first one.
#' @details
#'   Returns an integer vector of the index in \code{table} for all
#'   the matches. The result is not sorted in numerical index order when
#'   more than one value is sought to be matched.
#'   Instead, the matches of the first value in \code{x} are listed first,
#'   followed by matches to the second value in \code{x} and so on.
#'   Values of \code{NA} are treated as data.
#' @param x
#'   A vector.
#' @param table
#'   The lookup table as a vector.
#' @examples
#' matchAll(3, c(1:3, 3, 4:6, 3, NA, 4))
#' matchAll(3:4, c(1:3, 3, 4:6, 3, NA, 4))
#' matchAll(c(NA, 3:4), c(NA, 1:3, 3, 4:6, 3, NA, 4))
#' @keywords match
#' @seealso \code{\link[base]{match}}
#' @export
matchAll <- function(x, table) {
  myMatch <- match(x, table)
  myRecords <- integer()
  while(length(myMatch)>0) {
    myRecords <- c(myRecords, myMatch)
    table[myMatch] <- ""
    myMatch <- match(x, table)
    if(any(is.na(myMatch))) {
      x <- x[-which(is.na(myMatch))]
      myMatch <- myMatch[-which(is.na(myMatch))]
    }
  }
  if(all(is.na(myRecords))) myRecords <- NA
  if(length(myRecords)>1 && any(is.na(myRecords)))
    myRecords <- myRecords[!is.na(myRecords)]
  return(myRecords)
}

#' @title
#'   Return a Symbol That Matches No Values
#' @description
#'   The tag value is chosen from among special characters so
#'   that it does not appear anywhere in the reference input data.
#'   The shortest possible tag is chosen.
#' @details
#'   This function is used in other packages by the same author
#'   to extend missing data handling in R. It provides for
#'   flexible missing data identifiers where needed by an
#'   S4 class, and similar unmatched identifiers for other
#'   dirty data problems.
#' @param x
#'   A vector or matrix.
#' @param table
#'   The lookup table against which to seek non-matches.
#'   This can be a simple vector, or it can be a list of
#'   two vectors.
#' @return
#'   A string composed of the strings in the \code{table}.
#'   The default list choses the first non-matching value
#'   out of 179 values that are unlikely to be used in
#'   most real sets of data.
#'   If only \code{table} is specified, the possible
#'   values for a non-matching string, ordered from the
#'   most to the least preferable, are returned.
#' @examples
#' my.x <- c(1,2,3,2,3,1,2)
#' matchNone(my.x)
#' matchNone(c(my.x,"."))
#' matchNone(c(my.x,".","!"))
#' matchNone(c(my.x,".","!","/"))
#' matchNone(c(my.x,".","!","/",".."))
#' matchNone(table = ".")
#' @export
matchNone <- function(x, table = list(c(".", "!", "/"), c("NA", "na"))) {
  if(is.list(table)) my.blocks1 <- table[[1]]
  else my.blocks1 <- table
  my.blocks2 <- apply(expand.grid(my.blocks1, my.blocks1), 1,
                      function(Y) { paste(Y, collapse = "") })
  if(is.list(table) && length(table)>1)
    my.blocks2 <- c(my.blocks2, table[[2]])
  my.blocks3 <- c(apply(expand.grid(my.blocks1, my.blocks2), 1,
                        function(Y) { paste(Y, collapse = "") }),
                  apply(expand.grid(my.blocks2, my.blocks1), 1,
                        function(Y) { paste(Y, collapse = "") }))
  my.blocks4 <- apply(expand.grid(my.blocks1, my.blocks2,
                                  my.blocks1), 1,
                      function(Y) { paste(Y, collapse = "") })
  my.preferred <- c(my.blocks1, my.blocks2, my.blocks3, my.blocks4)
  if(missing(x)) return(my.preferred)
  if(is.factor(x)) my.data <- levels(x)
  else my.data <- as.character(x)
  my.in <- TRUE
  my.counter <- -1
  my.sample.size <- 4
  while(all(my.in)) {
    if(my.counter > -1) {
      if(my.counter == 0)
        my.preferred <-
            apply(expand.grid(my.blocks1, my.blocks3, my.blocks1),
                  1, function(Y) { paste(Y, collapse = "") })
      else my.preferred <-
               paste(sample(c(letters, LETTERS, my.blocks1),
                            my.sample.size), collapse = "")
    }
    my.in <- my.preferred %in% my.data
    my.counter <- my.counter + 1
    if(my.counter > 1000) {
      my.sample.size <- my.sample.size + 1
      my.counter <- 1
    }
  }
  return(my.preferred[!my.in][1])
}

#' @name not-in
#' @title
#'   Mismatch Test
#' @description
#'   Test whether some data are not in a table.
#' @details
#'   This helps avoid code structures like \code{!(x \%in\% table)}.
#' @param x
#'   A vector of data.
#' @param table
#'   A table of reference values.
#' @keywords time, arithmetic
#' @examples
#' 1:2 %!in% 2:4
#' @keywords match
#' @seealso \code{\link[base]{match}}
#' @export
"%!in%" <- function(
  x,
  table
) {
  !(x %in% table)
}

#' @title
#'   Find Orphan Values
#' @description
#'   Finds values that occur exactly once in a vector.
#' @details
#'   Returns the unique values in the same order that they would be
#'   returned in a call to \code{\link[base]{unique}}.
#' @param x
#'   A vector.
#' @examples
#' orphan(c(NA, 1:3, 3, 4:6, 3, NA, 4))
#' @keywords orphan, unique
#' @seealso \code{\link[base]{unique}}
#' @export
orphan <- function(x) {
  return(x[!(duplicated(x) | duplicated(x, fromLast = TRUE))])
}

#' @title
#'   Find Duplicate Values
#' @description
#'   Finds values that occur exactly twice in a vector.
#' @details
#'   Returns the duplicated values in the same order that they would be
#'   returned in a call to \code{\link[tuple]{orphan}}.  This fundamentally
#'   differs from \code{\link[base]{duplicated}}, which returns
#'   a logical vector that is \code{TRUE} when it runs into any but
#'   the first occurrence of a value (and is therefore dependent on
#'   the direction of testing of the vector).
#' @param x
#'   A vector.
#' @examples
#' duplicate(c(NA, 1:3, 3, 4:6, 3, NA, 4))
#' @keywords duplicate, duplicated, repeat, repeated
#' @seealso \code{\link[base]{unique}} for similar output, and
#'          \code{\link{duplicated}} for the underlying calculations
#' @export
duplicate <- function(x) {
  return(x[duplicated(x)][!(duplicated(x[duplicated(x)]) |
             duplicated(x[duplicated(x)], fromLast=TRUE))])
}

#' @title
#'   Find Values That Are Repeated At Least Thrice
#' @description
#'   Finds values that are repeated at least three times in a vector.
#' @details
#'   Returns a logical vector that is \code{TRUE} when it runs into
#'   any but the first or second occurrences of a value, analogous
#'   to \code{\link[base]{duplicated}}.
#' @param x
#'   A vector.
#' @param fromLast
#'   A logical indicating if triplication should be considered from
#'   the reverse side, i.e., the two last (or rightmost) of identical
#'   elements would return \code{FALSE}.
#' @param ...
#'   Other optional arguments are ignored.
#' @examples
#' triplicated(c(NA, 1:3, 3, 4:6, 3, NA, 4, 3))
#' @keywords triplicate, triplicated, repeat, repeated
#' @seealso \code{\link[base]{duplicated}}
#' @export
triplicated <- function(x, ..., fromLast = FALSE) {
  retVec <- rep(FALSE, length(x))
  retVec[which(duplicated(x, fromLast = fromLast))[
      which(duplicated(x[which(duplicated(x, fromLast = fromLast))],
            fromLast = fromLast))]] <- TRUE
  return(retVec)
}

#' @title
#'   Find Triplicate Values
#' @description
#'   Finds values that occur exactly three times in a vector.
#' @details
#'   Returns the triplicated values in the same order that they would be
#'   returned in a call to \code{\link{orphan}}.  This fundamentally
#'   differs from \code{\link{triplicated}}, which returns
#'   a logical vector that is \code{TRUE} when it runs into any but
#'   the first or second occurrences of a value (and is therefore
#'   dependent on the direction of testing of the vector).
#' @param x
#'   A vector.
#' @examples
#' triplicate(c(NA, 1:3, 3, 4:6, 3, NA, 4))
#' triplicate(c(NA, 1:3, 3, 4:6, 3, NA, 4, 3))
#' @keywords triplicate, triplicated, repeat, repeated
#' @seealso \code{\link{duplicate}}
#' @export
triplicate <- function(x) {
  return(x[triplicated(x)][!(duplicated(x[triplicated(x)]) |
             duplicated(x[triplicated(x)], fromLast = TRUE))])
}

#' @title
#'   Find Elements That Are Repeated At Least n Times
#' @description
#'   Finds elements that are repeated at least n times in a vector.
#' @details
#'   Returns a logical vector that is \code{TRUE} when it runs into
#'   any but the \code{(n-1)}-st occurrences of an element, analogous
#'   to \code{\link[base]{duplicated}}.
#' @param x
#'   A vector.
#' @param n
#'   An integer.
#' @param fromLast
#'   A logical indicating if n-replication should be considered
#'   from the right side of the vector.  If \code{TRUE},
#'   the \code{n-1} last (or rightmost) of replicated
#'   identical elements return \code{FALSE}.
#' @param ...
#'   Other optional arguments are ignored.
#' @examples
#' x <- c(NA, 1:3, 4:5, rep(6, 6), 3, NA, 4, 3, 3)
#' all(tuplicated(x, 3) == triplicated(x))
#' @keywords n-replicate, n-replicated, repeat, repeated, match
#' @seealso \code{\link[base]{duplicated}}
#' @export
tuplicated <- function(x, n, ..., fromLast = FALSE) {
  if(missing(n)) {
    warning(paste("Parameter 'n' not specified.",
                  "Triplicate checking presumed."))
    n <- 3
  } else if(!is.numeric(n) || length(n) != 1)
    stop("Parameter 'n' must be numeric of length 1.")
  else if(n < 2)
    stop("Parameter 'n' must be >= 2.")
  else if(as.integer(n) != n) {
    warning(paste("Parameter 'n' is not an integer (",
                  n, "). Truncating to ",
                  as.integer(n), ".", sep = ""))
    n <- as.integer(n)
  }
  if(n == 2)
    return(duplicated(x, fromLast = fromLast))
  retVec <- rep(FALSE, length(x))
  retVec[which(tuplicated(x, n - 1, fromLast = fromLast))[
      which(duplicated(x[which(
                tuplicated(x, n - 1, fromLast = fromLast))],
            fromLast = fromLast))]] <- TRUE
  return(retVec)
}

#' @title
#'   Find n-Replicated Elements
#' @description
#'   Finds elements that occur exactly n times in a vector.
#' @details
#'   Returns the n-replicated elements in the same order that they would be
#'   returned in a call to \code{\link{orphan}}.  This fundamentally
#'   differs from \code{\link{tuplicated}}, which returns
#'   a logical vector that is \code{TRUE} when it runs into any but
#'   the \code{(n-1)}-st and fewer occurrences of an element
#'   (and is therefore dependent on the direction of testing of the vector).
#' @inheritParams tuplicated
#' @examples
#' x <- c(NA, 1:3, 4:5, rep(6, 6), 3, NA, 4, 3, 3)
#' lapply(2:6, function(X) { tuplicate(x, X) })
#' @keywords n-replicate, n-replicated, repeat, repeated, match
#' @seealso \code{\link{duplicate}}
#' @export
tuplicate <- function(x, n) {
  if(missing(n)) {
    warning(paste("Parameter 'n' not specified.",
                  "Triplicate checking presumed."))
    n <- 3
  } else if(!is.numeric(n) || length(n) != 1)
    stop("Parameter 'n' must be numeric of length 1.")
  else if(n < 2)
    stop("Parameter 'n' must be >= 2.")
  else if(as.integer(n) != n) {
    warning(paste("Parameter 'n' is not an integer (",
                  n, "). Truncating to ",
                  as.integer(n), ".", sep = ""))
    n <- as.integer(n)
  }
  if(n == 2)
    return(duplicate(x))
  return(orphan(x[tuplicated(x, n)][
      !(tuplicated(x[tuplicated(x, n)], n - 1) |
      tuplicated(x[tuplicated(x, n)], n - 1, fromLast = TRUE))]))
}

Try the tuple package in your browser

Any scripts or data that you put into this service are public.

tuple documentation built on May 2, 2019, 12:39 a.m.