R/outl_gen.R

Defines functions gen_glob_outl gen_loc_outl neighborhood_fixed_n neighborhood_fixed_radius

Documented in gen_glob_outl gen_loc_outl

# Generates n x n neighborhood matrix N, where N_{ij}=1 if 
# point s_j is in the neighborhood of s_i and N_{ij}=0 if not.
# coords: 2 x n matrix or data frame with first column of x coordinates and second of y coordinates.
# radius: Radius of the neighborhoods.
# returns neighborhood matrix.
neighborhood_fixed_radius <- function(coords, radius) {
  n <- dim(coords)[1]
  d <- as.matrix(distances::distances(coords))
  d[which(d <= radius)] <- 1
  d[which(d > radius)] <- 0
  d <- d - diag(n)
  return(d)
}


# Generates n x n neighborhood matrix N, where N_{ij}=1 if 
# point s_j is in the neighborhood of s_i and N_{ij}=0 if not.
# coords: 2 x n matrix or data frame with first column of x coordinates and second of y coordinates.
# neighborhood_size: Number of points in each neighborhood.
# returns neighborhood matrix.
neighborhood_fixed_n <- function(coords, neighborhood_size) {
  n <- nrow(coords)
  d <- as.matrix(distances::distances(coords))
  # Save indexes in another matrix:
  index_matrix <- matrix(rep(seq(1:n), n), ncol = n, byrow = TRUE)
  # Sort indexes row wise by distances:
  index_sorted <- matrix(index_matrix[order(row(d), d)], ncol=n, byrow=TRUE)
  # Select indexes to 
  neighbor_indexes <- index_sorted[,1:neighborhood_size]
  neighborhood_matrix <- index_matrix
  for (i in 1:n) {
    neighborhood_matrix[i,][neighbor_indexes[i,]] <- 1
    neighborhood_matrix[i,][-neighbor_indexes[i,]] <- 0
  }
  return(neighborhood_matrix)
}


# Generates local outliers in condition that there are at most one local outlier in 
# each neighborhood. The outliers are generated by swapping k observations with smallest values of
# the first principal component (PC-1) and k observations with largest values of PC-1.
# coords: 2 x n matrix or data frame with first column of x coordinates and second of y coordinates
# data: p x n observation matrix or data frame
# alpha: proportion of observations to contaminate
# neighborhood_type: Type of neighborhoods to use, 'radius' or 'neighborhood_fixed_n'.
# If 'radius' is used, neighborhoods are generated by adding every point within the radius 
# to neighborhood. If 'neighborhood_fixed_n' is used, neighborhood_size closest neighbors are 
# added to neighborhood.
# radius: Radius for neighborhoods when neighborhood_type is 'radius'.
# neighborhood_size: The number of points in one neighborhood when neighborhood_type 
# is 'number_of_neighbors'.
# swap_order: Order how to swap the observations with small and large PC-1 values. 
# Either 'regular', 'reverse' or 'random'. The local outliers are generated by swapping values
# X_small[i,] and X_large[k + 1 - i,], i=1,...k, where X_small is p x k matrix containing 
# the observations with smallest PC-1 scores and 
# X_large is p x k matrix containing observations with largest PC-1 scores.
# When 'regular' is used, X_small and X_large are sorted by PC-1 score from smallest to largest.
# When 'reverse' is used, X_small is sorted from largest to smallest and X_large from smallest to largest.
# When 'random' is used, X_small and X_large are in random order.
# return: data frame with contaminated data.
gen_loc_outl <- function(x, coords, alpha = 0.05, neighborhood_type = c("radius", "fixed_n"), radius = NULL,
                         neighborhood_size = NULL, swap_order = c("regular", "reverse", "random")) {
  n <- dim(coords)[1]
  x <- as.data.frame(x)
  type <- match.arg(neighborhood_type)
  order <- match.arg(swap_order)
  if (is.null(radius)){
    radius = 0.01*n
  }
  if (is.null(neighborhood_size)) {
    neighborhood_size = ceiling(0.01*n)
  }
  neighborhood_matrix <- switch(type, radius = neighborhood_fixed_radius(coords, radius), 
                                fixed_n = neighborhood_fixed_n(coords, neighborhood_size))
  Cov <- robustbase::covMcd(x, alpha = 0.75)
  pca <- princomp(x)
  sc <- pca$scores[,1]
  
  indice <- sort(sc, index.return=T)$ix
  ind1 <- ind2 <- NULL
  k <- round(alpha*n/2)
  
  for(j in 1:k) {
    ind1 <- c(ind1, indice[1])
    ind2<- c(ind2, indice[length(indice)])
    # remove observations such that s_ind1 or s_ind2 are in their neighborhoods  
    rm <- (1:n)[neighborhood_matrix[,c(indice[1])]!=0 | neighborhood_matrix[,c(indice[length(indice)])]!=0]
    rm <- c(rm, indice[1], indice[length(indice)])
    indice <- indice[!(indice%in%rm)]
  }
  
  # Contamination 
  dataCont <- x
  if (order == "reverse") {
    ind1 <- rev(ind1)
  } else if (order == "random") {
    ind1 <- sample(ind1)
    ind2 <- sample(ind2)
  }
  dataCont[ind1,] <- x[ind2,]
  dataCont[ind2,] <- x[ind1,]
  outliers <- c(ind1, ind2)
  dataCont$outlier <- FALSE
  dataCont$outlier[outliers] <- TRUE
  
  return(dataCont)
}

# Generates global outliers randomly
# data: p x n matrix or dataframe
# alpha: proportion of data to contaminate
# h: a constant value determining how large the outliers should be
# random_sign: a logical value to determine if random sign should be
# applied to the outliers
# returns contaminated data
gen_glob_outl <- function(x, alpha = 0.05, h = 10, random_sign = FALSE) {
  n <- dim(x)[1]
  p <- dim(x)[2]
  x <- as.data.frame(x)
  n_outliers <- round(alpha*n)
  ind <- sample(1:n, n_outliers)
  if (random_sign) {
    cont_values <- matrix(ifelse(rnorm(p*n_outliers) > 0, h, -h), ncol=p)
  } else {
    cont_values <- matrix(rep(h, p*n_outliers), ncol = p)
  }
  dataCont <- x
  dataCont[ind, ] <- dataCont[ind, ] + cont_values
  dataCont$outlier <- FALSE
  dataCont$outlier[ind] <- TRUE
  return(dataCont)
}

Try the SpatialBSS package in your browser

Any scripts or data that you put into this service are public.

SpatialBSS documentation built on July 26, 2023, 5:37 p.m.