R/datasets.R

#' Production Measurements of High-Tech Parts - Singular Case
#'
#' The \code{HTP2} data set contains 457 high-tech parts designed for consumer
#' products characterized by 149 tests.
#' These tests are performed to ensure a high quality of the production.
#' All these 457 parts were considered functional and have been sold.
#' However the part 28 showed defects in use and was
#' returned to the manufacturer by the customer. Therefore this part
#' can be considered as outlier.
#'
#' @usage
#' data("HTP2")
#' 
#' @format A data frame with 457 rows and 149 variables V.1 - V.149, presenting
#' some collinearity issues.
#'
#' @source Anonymized data from a nondisclosed manufacturer.
#' 
#' @references 
#' Archimbaud, A., Drmac, Z., Nordhausen, K., Radojcic, U. and Ruiz-Gazen, A.
#' (2023) Numerical Considerations and a New Implementation for Invariant
#' Coordinate Selection. \emph{SIAM Journal on Mathematics of Data Science},
#' \bold{5}(1), 97--121. \doi{10.1137/22M1498759}.
#' 
#' @examples
#' # HTP2 data: the observation 28 is considered as an outlier
#' data("HTP2")
#' outliers <- c(28)
#' boxplot(HTP2, horizontal = TRUE)
#' # Outlier detection using ICS
#' library(ICS)
#' \dontrun{
#' out <- ICS_outlier(HTP2, ICS_algorithm = "QR",
#'                    method = "norm_test",
#'                    test = "agostino.test", level_test = 0.05,
#'                    level_dist = 0.01, n_dist = 50)
#' 
#' # Here there is a singularity issue. One solution is to first reduce the 
#' # dimension. To ensure higher numerical stability of the subsequent methods
#' # we suggest to permute the data and to use the QR decomposition instead of
#' # the regular SVD decomposition.
#' Xt <- HTP2
#' # Normalization by the mean 
#' Xt.c <- sweep(HTP2, 2, colMeans(HTP2), "-")
#' 
#' # Permutation by rows 
#' # decreasing by infinity norm:  absolute maximum
#' norm_inf <- apply(Xt.c, 1, function(x) max(abs(x)))
#' order_rows <- order(norm_inf, decreasing = TRUE)
#' Xt_row_per <- Xt.c[order_rows,]
#' 
#' # QR decomposition of Xt with column pivoting from LAPACK 
#' qr_Xt <- qr(1/sqrt(nrow(Xt.c)-1)*Xt_row_per, LAPACK = TRUE)
#' 
#' # Estimation of rank q 
#' # R is nxp, but with only zero for rows > p
#' # the diag of R is already in decreasing order and is a good approximation
#' # of the rank of X.c. To decide on which singular values are zero we use
#' # a relative criteria based on previous values.
#' # R should be pxp
#' R <- qr.R(qr_Xt)
#' r_all <- abs(diag(R))
#' r_ratios <- r_all[2:length(r_all)]/r_all[1:(length(r_all)-1)]
#' q <- which(r_ratios < max(dim(Xt.c)) *.Machine$double.eps)[1]
#' q <- ifelse(is.na(q), length(r_all), q)
#' 
#' # Q should be nxp but we are only interested in nxq
#' Q1 <- qr.Q(qr_Xt)[,1:q]
#' 
#' # QR decomposition of Rt 
#' R_q <- R[1:q, ]
#' qr_R <- qr(t(R_q), LAPACK = TRUE)
#' Tau <- qr.Q(qr_R)[1:q, ]
#' Omega1 <- qr.R(qr_R)[1:q, 1:q]
#' 
#' # New X tilde 
#' # permutation matrices
#' # permutation of rows
#' Pi2 <- data.frame(model.matrix(~ . -1, data = data.frame(row=as.character(order_rows))))
#' Pi2 <- Pi2[,order(as.numeric(substr(colnames(Pi2), start = 4, stop = nchar(colnames(Pi2)))))]
#' colnames(Pi2) <- rownames(Xt)
#' 
#' # permutation of cols
#' Pi3 <- data.frame(model.matrix(~ . -1, data = data.frame(col=as.character( qr_R$pivot))))
#' Pi3 <- t(Pi3[,order(as.numeric(substr(colnames(Pi3), start = 4, stop = nchar(colnames(Pi3)))))])
#' 
#' X_tilde <- sqrt(nrow(Xt)-1)* Tau %*% t(Pi3) %*% t(Q1)
#' 
#' Xt_tilde <- t(Pi2) %*% t(X_tilde)
#' 
#' # Run ICS_outlier
#' out <- ICS_outlier(Xt_tilde, ICS_algorithm = "QR",
#' method = "norm_test",
#' test = "agostino.test", level_test = 0.01,
#' level_dist = 0.01, n_dist = 50)
#' 
#' summary(out)
#' plot(out)
#' text(outliers, out$ics_distances[outliers], outliers, pos = 2, cex = 0.9, col = 2)
#' }
#' 
"HTP2"


#' Production Measurements of High-Tech Parts - Nearly Singular Case
#'
#' The \code{HTP3} data set contains 371 high-tech parts designed for consumer
#' products characterized by 33 tests.
#' These tests are performed to ensure a high quality of the production.
#' All these 371 parts were considered functional and have been sold.
#' However the part 32 showed defects in use and was
#' returned to the manufacturer by the customer. Therefore this part
#' can be considered as outlier.
#'
#' @usage
#' data("HTP3")
#'
#'
#' @format A data frame with 371 rows and 33 variables V.1 - V.33,
#' presenting some approximate collinearity issues which may cause
#' some numerical inaccuracies.
#'
#' @source Anonymized data from a nondisclosed manufacturer.
#' 
#' @references 
#' Archimbaud, A., Drmac, Z., Nordhausen, K., Radojcic, U. and Ruiz-Gazen, A.
#' (2023) Numerical Considerations and a New Implementation for Invariant
#' Coordinate Selection. \emph{SIAM Journal on Mathematics of Data Science},
#' \bold{5}(1), 97--121. \doi{10.1137/22M1498759}.
#' 
#' @examples
#' # HTP3 data: the observation 32 is considered as an outlier
#' data("HTP3")
#' outliers <- c(32)
#' boxplot(HTP3)
#' 
#' # Outlier detection using ICS
#' library(ICS)
#' out <- ICS_outlier(HTP3, ICS_algorithm = "QR",
#'                    method = "norm_test",
#'                    test = "agostino.test", level_test = 0.05,
#'                    level_dist = 0.01, n_dist = 50)
#' 
#' summary(out)
#' plot(out)
#' text(outliers, out$ics_distances[outliers], outliers, pos = 2, cex = 0.9, col = 2)
#' 
"HTP3"

Try the ICSOutlier package in your browser

Any scripts or data that you put into this service are public.

ICSOutlier documentation built on May 29, 2024, 2:08 a.m.