Nothing
sono_freq <- function(data, probs, alpha = 0.01, r = 2, MAXLEN = 0, verbose = TRUE){
### INPUT CHECKS ###
disc_cols <- c(1:ncol(data))
if (!is.data.frame(data)){
stop("Data set should be of class 'data.frame'.")
}
for (i in disc_cols){
stopifnot("Discrete variables should be of class 'factor'." = (is.factor(data[, i])))
}
if (length(disc_cols)==1){
data[, disc_cols] <- as.data.frame(data[, disc_cols])
}
stopifnot("alpha should be of class 'numeric'." = is.numeric(alpha))
if (length(alpha) > 1){
stop("alpha should be of unit length.")
}
if (alpha <= 0 | alpha > 0.50){
stop("alpha should be positive and at most equal to 0.50.")
}
if (length(MAXLEN) > 1){
stop("MAXLEN should be an integer at most equal to the number of discrete variables.")
}
if (MAXLEN %% 1 !=0){
stop("MAXLEN should be an integer at most equal to the number of discrete variables.")
}
if (MAXLEN < 0 | MAXLEN > length(disc_cols)){
stop("MAXLEN should be an integer at most equal to the number of discrete variables.")
}
if (length(probs) != length(disc_cols)){
stop("Incorrect number of probability vectors.")
}
if (any(sapply(probs, sum) != 1)){
stop("Probability vectors should sum to 1.")
}
if (any(sapply(probs, min) <= 0) | any(sapply(probs, max) >= 1)){
stop("Probabilities can only be between zero and one.")
}
if (any(sapply(data, FUN = function(x) length(unique(x))) != sapply(probs, length))){
stop("Number of elements in probability vectors need to match the number of levels of each variable.")
}
### END OF CHECKS ###
# Get all power sets up to length MAXLEN
# For MAXLEN we need to make sure that the threshold value s >= 2
# In order to do that, we take combinations of variables and calculate s
# Until we achieve s^u < 2
# The above of course only applies as long as MAXLEN is set equal to 0
MAXLEN_User <- FALSE
if (MAXLEN == 0){
MAXLEN_User <- TRUE
if (length(disc_cols)==1){
MAXLEN <- 1
} else {
# Max expected probabilities
max_probs <- unlist(lapply(probs, max))
ordered_max_probs <- order(max_probs, decreasing = TRUE)
for (i in 1:length(disc_cols)){
probs_vec <- Reduce(kronecker, probs[c(ordered_max_probs[1:i])])
s <- max(floor(as.numeric(nrow(data) * DescTools::MultinomCI(probs_vec*nrow(data), conf.level=(1-2*alpha))[, 3])))
if (s == nrow(data)){
MAXLEN <- i-1
break
} else {
MAXLEN <- i
}
}
}
} else {
MAXLEN_estimate <- MAXLEN_est(data, probs, alpha, frequent = TRUE)
if (MAXLEN_estimate < MAXLEN){
warning('MAXLEN value larger than estimated; MAXLEN set to ', MAXLEN_estimate)
}
}
if (verbose){
message('MAXLEN: ', MAXLEN)
}
# Get power set
powerset_test <- rje::powerSet(disc_cols, MAXLEN)
if (verbose){
message('Power set object created.')
}
# Empty list to store probability vectors
probs_list <- list()
# Empty list to store outlier scores data frames
outlierdfs_list <- list()
# create corresponding list of data frames with sequences for each data point
for (i in disc_cols){
data[,i] <- as.numeric(data[,i])
}
if (verbose){
message('Pre-processing done.')
}
dfs <- list()
# List with infrequent items and powersets
freq_list <- list()
for (i in rev(1:MAXLEN)){
inxs <- which(sapply(X=powerset_test, FUN=length)==i)
aux_vec <- c()
for (j in inxs){
nam <- "df"
# Setting lower threshold value s
for (k in 1:length(powerset_test[[j]])){
nam <- paste(nam, powerset_test[[j]][k], sep="_")
}
probs_vec <- Reduce(kronecker, probs[powerset_test[[j]]])
s_obj <- DescTools::MultinomCI(probs_vec*nrow(data), conf.level=(1-2*alpha))
s <- as.numeric(nrow(data) * s_obj[, 3])
s_probs <- s_obj[, 1]
df <- data.frame('Sequence'=character(),
'Count'=integer(),
'Frequent'=logical(),
'Threshold' =numeric(),
stringsAsFactors = FALSE)
rows <- c()
if (length(freq_list)>0){
for (k in 1:length(freq_list)){
if (rje::is.subset(powerset_test[[j]], freq_list[[k]]$Variables)){
loc_inx <- which(freq_list[[k]]$Variables %in% powerset_test[[j]])
ifelse(length(powerset_test[[j]])>1, {
rows <- c(rows,
which(sapply(1:nrow(data),
FUN = function(i) check_vecs_equal(data[i, powerset_test[[j]]],
vec2=as.numeric(strsplit(freq_list[[k]]$Sequence, split="_")[[1]])[loc_inx]))==length(powerset_test[[j]])))
}, {
rows <- c(rows,
which(data[, powerset_test[[j]]]==as.numeric(strsplit(freq_list[[k]]$Sequence, split = "_")[[1]])[loc_inx]))
})
}
}
}
ifelse(length(rows)>0, dt <- data[-rows,powerset_test[[j]]], dt <- data[,powerset_test[[j]]])
if (i==1){
if (length(dt) == 0){
newrow <- data.frame('Sequence' = 'X',
'Count' = Inf,
'Frequent' = TRUE,
'Threshold' = 0)
df <- rbind(df, newrow)
dfs[[nam]] <- assign(nam, df)
next
}
tab <- table(dt)
for (k in 1:length(tab)){
df <- rbind(df, data.frame('Sequence'=names(tab)[k],
'Count'=as.numeric(tab)[k],
'Frequent'=as.numeric(tab)[k]>=s[as.numeric(names(tab)[k])],
'Threshold'=s[as.numeric(names(tab)[k])]))
if (as.numeric(tab)[k] >= s[as.numeric(names(tab)[k])]){
freq_list[[length(freq_list)+1]] <- list("Variables"=powerset_test[[j]],
"Sequence"=names(tab)[k])
}
}
} else {
if (dim(dt)[1] == 0){
newrow <- data.frame('Sequence' = 'X',
'Count' = Inf,
'Frequent' = TRUE,
'Threshold' = 0)
df <- rbind(df, newrow)
dfs[[nam]] <- assign(nam, df)
next
}
for (k in 1:nrow(dt)){
row <- dt[k,]
rownam <- character()
for (l in 1:length(row)){
ifelse(l==1, rownam <- paste0(rownam, row[l]), rownam <- paste(rownam, row[l], sep="_"))
}
# Save sequences based on power set, with their support
ifelse(rownam %in% df$Sequence,
{row_inx <- which(df$Sequence==rownam, arr.ind=TRUE)
df[row_inx, 2] <- df[row_inx, 2]+1},
{prod <- 1
probs_sublist <- probs[powerset_test[[j]]]
for (row_num in c(1:length(row))){
prod <- prod*probs_sublist[[row_num]][as.numeric(row[row_num])]
}
newrow <- data.frame('Sequence'=rownam,
'Count'=1,
'Frequent'=TRUE,
'Threshold'=s[match_numeric(prod, s_probs)])
df <- rbind(df, newrow)})
}
}
# Sort by increasing sequence name to be able to do comparison with thresholds
df <- df[order(df$Sequence, decreasing = FALSE),]
for (k in 1:nrow(df)){
ifelse(df[k,2] <= df[k,4], df[k,3] <- FALSE, freq_list[[length(freq_list)+1]] <- list("Variables"=powerset_test[[j]],
"Sequence"=df[k,1]))
}
if (all(df[,4] == nrow(data)) | all(df[, 3])){
aux_vec <- c(aux_vec, TRUE)
} else {
aux_vec <- c(aux_vec, FALSE)
}
dfs[[nam]] <- assign(nam, df)
}
if (all(aux_vec) & MAXLEN_User){
MAXLEN <- i-1
# Remove redundant dfs
redundant_dfs <- as.numeric(which(sapply(dfs, function(df) sapply(df[1,1], count_digits))==i))
dfs <- dfs[-redundant_dfs]
break
}
}
# Preallocate memory for outputs
outscoredf <- data.frame('Observation' = 1:nrow(data), 'Score' = rep(0, nrow(data)))
outscoredfcells <- matrix(0, nrow = nrow(data), ncol = length(disc_cols))
colnames(outscoredfcells) <- colnames(data[, disc_cols, drop = FALSE])
# Initialize vectors for nod_vec and nod_counts
nod_vec <- numeric(nrow(data))
nod_counts <- numeric(nrow(data))
# Iterate over subset lengths in reverse order
for (j in rev(1:MAXLEN)) {
inxs <- which(sapply(powerset_test, length) == j)
for (k in inxs) {
power_subset <- powerset_test[[k]]
df_name <- paste0("df_", paste(power_subset, collapse = "_"))
# Precompute concatenated sequences for all rows
seq_vals <- apply(data[, power_subset, drop = FALSE], 1, function(row) {
paste(row, collapse = "_")
})
df <- dfs[[match(df_name, names(dfs))]]
# Skip processing if dfs entry is a placeholder
if (nrow(df) == 1 && df[1, 1] == "X") next
# Match sequences in the current data frame
matched_rows <- match(seq_vals, df$Sequence, nomatch = 0)
valid <- matched_rows > 0
if (any(valid)) {
rows <- which(valid)
matches <- matched_rows[valid]
# Compute scores for valid matches
scores <- (df[matches, 3]) *
df[matches, 2] /
(df[matches, 4] * (MAXLEN - j + 1)^r)
# Update nod_vec, nod_counts, and outscoredf
nod_vec[rows] <- nod_vec[rows] + (df[matches, 3]) * (MAXLEN - j + 1)
nod_counts[rows] <- nod_counts[rows] + (df[matches, 3])
outscoredf$Score[rows] <- outscoredf$Score[rows] + scores
# Update cell-wise scores
if (length(disc_cols) > 1) {
cell_scores <- scores / length(power_subset)
for (col in power_subset) {
col_index <- match(col, disc_cols)
outscoredfcells[rows, col_index] <- outscoredfcells[rows, col_index] + cell_scores
}
}
}
}
}
# Convert outscoredfcells to a data frame for consistency
outscoredfcells <- as.data.frame(outscoredfcells)
if (length(disc_cols) == 1) {
outscoredfcells <- as.matrix(outscoredf[, 2], ncol = 1)
colnames(outscoredfcells) <- colnames(data[, disc_cols])
outscoredfcells <- as.data.frame(outscoredfcells)
}
# Compute average depth
nod_vec[which(nod_vec > 0)] <- nod_vec[which(nod_vec > 0)]/nod_counts[which(nod_vec > 0)]
if (verbose){
message('Outlyingness scores for discrete variables calculated.')
}
return(list('MAXLEN'=MAXLEN, 'Discrete Scores'=outscoredf, 'Contributions'=outscoredfcells, 'Depth'=nod_vec))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.