# R/vacuum_cleaner.R In Sielinski/vacuum: Tukey's Vacuum Cleaner

#### Documented in vacuum_cleaner

l2_norm <- function(v) {
v ^ 2 %>% sum() %>% sqrt
}

#' Returns the residuals of a two-way table after removing systemic effects
#' @description
#' To remove systemic effects from values in a contingency table,
#' vacuum cleaner uses regression to identify the table's main effect
#' (dual regression), row effect (deviations of row
#' regression from dual regression), and column effect (deviations of
#' column regression from dual regression).
#'
#' Regression is performed twice: First on the table's original values,
#' then on the resulting residuals. The output is a table of residuals
#' "vacuum cleaned" of likely systemic effects.
#' @param x
#' Two-way table to analyze (must be 3x3 or greater).
#' @return
#' Residuals of \code{x}
#' @seealso
#' [vacuum::funop()], [vacuum::funor_funom()]
#' @references
#' Tukey, John W. "The Future of Data Analysis."
#' \emph{The Annals of Mathematical Statistics},
#' \emph{33}(1), 1962, pp 1-67. \emph{JSTOR},
#' \url{https://www.jstor.org/stable/2237638}.
#' @examples
#' vacuum_cleaner(table_8)
#' @export

vacuum_cleaner <- function(x) {
x <- as.matrix(x)

if (!is.matrix(x) | !is.numeric(x)) {
warning('argument "x" must be convertable to a numeric matrix')
} else if (nrow(x) < 3 | ncol(x) < 3) {
warning('argument "x" must have at least 3 rows and columns') # (p 53)
} else {

input_table <- x

r <- nrow(input_table)
c <- ncol(input_table)

######################
## Initial carriers ##
######################

# sqrt(1/n)

carrier_r <- rep(sqrt(1 / r), r)
carrier_c <- rep(sqrt(1 / c), c)

###################
## Start of loop ##
###################

# Tukey passes through the loop twice.
# Suggests further passes possible in the "attachments" section (p 53)
for (pass in 1:2) {
##################
## Coefficients ##
##################

# Calculate the column coefficients
coef_c <- rep(NA, c)
for (i in 1:c) {
# loop through every column, summing every row
# denominator is based upon the number of rows in the column
coef_c[i] <-
sum(carrier_r * input_table[, i]) / sum(carrier_r ^ 2)
}

# Calculate the row coefficients
coef_r <- rep(NA, r)
for (i in 1:r) {
# loop through every row, summing every column
# denominator is based upon the number of columns in the row
coef_r[i] <-
sum(carrier_c * input_table[i, ]) / sum(carrier_c ^ 2)
}

##########
## y_ab ##
##########

# either one of these
y_ab <- sum(coef_c * carrier_c) / sum(carrier_c ^ 2)
y_ab <- sum(carrier_r * coef_r) / sum(carrier_r ^ 2)

########################
## Apply subprocedure ##
########################

# create a destination for the output
output_table <- input_table

for (i in 1:r) {
for (j in 1:c) {
output_table[i, j] <- input_table[i, j] -
carrier_r[i] * (coef_c[j] - carrier_c[j] * y_ab) -
carrier_c[j] * (coef_r[i] - carrier_r[i] * y_ab) -
y_ab * carrier_c[j] * carrier_r[i]
}
}

# These are the coefficients that will get carried forward
coef_c <- coef_c - carrier_c * y_ab
coef_r <- coef_r - carrier_r * y_ab

#########
## aov ##
#########

#sum(coef_c ^ 2)
#var(coef_c)
#
#sum(coef_r ^ 2)
#var(coef_r)
#
#dat <- output_table %>%
#  dplyr::mutate('row' = dplyr::row_number()) %>%
#  tidyr::pivot_longer(cols = starts_with('X'),
#               names_to = 'col',
#               values_to = 'value')
#
#dat$row <- as.factor(dat$row)
#
#aov_results <- aov(value ~ row + col, dat)
#summary(aov_results)

########################
## Prep for next pass ##
########################

# normalize coefficients because we want sqrt(sum(a^2)) == 1
carrier_r <- coef_r / l2_norm(coef_r)
carrier_c <- coef_c / l2_norm(coef_c)

input_table <- output_table

#################
## End of loop ##
#################

}
output_table
}
}

Sielinski/vacuum documentation built on Sept. 15, 2020, 11:32 a.m.