Nothing
# Generates n x n neighborhood matrix N, where N_{ij}=1 if
# point s_j is in the neighborhood of s_i and N_{ij}=0 if not.
# coords: 2 x n matrix or data frame with first column of x coordinates and second of y coordinates.
# radius: Radius of the neighborhoods.
# returns neighborhood matrix.
neighborhood_fixed_radius <- function(coords, radius) {
n <- dim(coords)[1]
d <- as.matrix(distances::distances(coords))
d[which(d <= radius)] <- 1
d[which(d > radius)] <- 0
d <- d - diag(n)
return(d)
}
# Generates n x n neighborhood matrix N, where N_{ij}=1 if
# point s_j is in the neighborhood of s_i and N_{ij}=0 if not.
# coords: 2 x n matrix or data frame with first column of x coordinates and second of y coordinates.
# neighborhood_size: Number of points in each neighborhood.
# returns neighborhood matrix.
neighborhood_fixed_n <- function(coords, neighborhood_size) {
n <- nrow(coords)
d <- as.matrix(distances::distances(coords))
# Save indexes in another matrix:
index_matrix <- matrix(rep(seq(1:n), n), ncol = n, byrow = TRUE)
# Sort indexes row wise by distances:
index_sorted <- matrix(index_matrix[order(row(d), d)], ncol=n, byrow=TRUE)
# Select indexes to
neighbor_indexes <- index_sorted[,1:neighborhood_size]
neighborhood_matrix <- index_matrix
for (i in 1:n) {
neighborhood_matrix[i,][neighbor_indexes[i,]] <- 1
neighborhood_matrix[i,][-neighbor_indexes[i,]] <- 0
}
return(neighborhood_matrix)
}
# Generates local outliers in condition that there are at most one local outlier in
# each neighborhood. The outliers are generated by swapping k observations with smallest values of
# the first principal component (PC-1) and k observations with largest values of PC-1.
# coords: 2 x n matrix or data frame with first column of x coordinates and second of y coordinates
# data: p x n observation matrix or data frame
# alpha: proportion of observations to contaminate
# neighborhood_type: Type of neighborhoods to use, 'radius' or 'neighborhood_fixed_n'.
# If 'radius' is used, neighborhoods are generated by adding every point within the radius
# to neighborhood. If 'neighborhood_fixed_n' is used, neighborhood_size closest neighbors are
# added to neighborhood.
# radius: Radius for neighborhoods when neighborhood_type is 'radius'.
# neighborhood_size: The number of points in one neighborhood when neighborhood_type
# is 'number_of_neighbors'.
# swap_order: Order how to swap the observations with small and large PC-1 values.
# Either 'regular', 'reverse' or 'random'. The local outliers are generated by swapping values
# X_small[i,] and X_large[k + 1 - i,], i=1,...k, where X_small is p x k matrix containing
# the observations with smallest PC-1 scores and
# X_large is p x k matrix containing observations with largest PC-1 scores.
# When 'regular' is used, X_small and X_large are sorted by PC-1 score from smallest to largest.
# When 'reverse' is used, X_small is sorted from largest to smallest and X_large from smallest to largest.
# When 'random' is used, X_small and X_large are in random order.
# return: data frame with contaminated data.
gen_loc_outl <- function(x, coords, alpha = 0.05, neighborhood_type = c("radius", "fixed_n"), radius = NULL,
neighborhood_size = NULL, swap_order = c("regular", "reverse", "random")) {
n <- dim(coords)[1]
x <- as.data.frame(x)
type <- match.arg(neighborhood_type)
order <- match.arg(swap_order)
if (is.null(radius)){
radius = 0.01*n
}
if (is.null(neighborhood_size)) {
neighborhood_size = ceiling(0.01*n)
}
neighborhood_matrix <- switch(type, radius = neighborhood_fixed_radius(coords, radius),
fixed_n = neighborhood_fixed_n(coords, neighborhood_size))
Cov <- robustbase::covMcd(x, alpha = 0.75)
pca <- princomp(x)
sc <- pca$scores[,1]
indice <- sort(sc, index.return=T)$ix
ind1 <- ind2 <- NULL
k <- round(alpha*n/2)
for(j in 1:k) {
ind1 <- c(ind1, indice[1])
ind2<- c(ind2, indice[length(indice)])
# remove observations such that s_ind1 or s_ind2 are in their neighborhoods
rm <- (1:n)[neighborhood_matrix[,c(indice[1])]!=0 | neighborhood_matrix[,c(indice[length(indice)])]!=0]
rm <- c(rm, indice[1], indice[length(indice)])
indice <- indice[!(indice%in%rm)]
}
# Contamination
dataCont <- x
if (order == "reverse") {
ind1 <- rev(ind1)
} else if (order == "random") {
ind1 <- sample(ind1)
ind2 <- sample(ind2)
}
dataCont[ind1,] <- x[ind2,]
dataCont[ind2,] <- x[ind1,]
outliers <- c(ind1, ind2)
dataCont$outlier <- FALSE
dataCont$outlier[outliers] <- TRUE
return(dataCont)
}
# Generates global outliers randomly
# data: p x n matrix or dataframe
# alpha: proportion of data to contaminate
# h: a constant value determining how large the outliers should be
# random_sign: a logical value to determine if random sign should be
# applied to the outliers
# returns contaminated data
gen_glob_outl <- function(x, alpha = 0.05, h = 10, random_sign = FALSE) {
n <- dim(x)[1]
p <- dim(x)[2]
x <- as.data.frame(x)
n_outliers <- round(alpha*n)
ind <- sample(1:n, n_outliers)
if (random_sign) {
cont_values <- matrix(ifelse(rnorm(p*n_outliers) > 0, h, -h), ncol=p)
} else {
cont_values <- matrix(rep(h, p*n_outliers), ncol = p)
}
dataCont <- x
dataCont[ind, ] <- dataCont[ind, ] + cont_values
dataCont$outlier <- FALSE
dataCont$outlier[ind] <- TRUE
return(dataCont)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.