#' Simulate multivariate gaussian distribution with some outliers.
#'
#' A random mean vector and covariance matrix are drawn first. Inliers are
#' then sampled from a Gaussian with that mean and covariance, and outliers
#' from a Gaussian with the same mean and an inflated covariance.
#'
#' @param n
#' Total number of points (inliers plus outliers)
#'
#' @param d
#' Number of dimensions
#'
#' @param out_perc
#' The proportion of outliers, a single number between 0 and 1
#' (e.g. 0.03 for 3 percent). The number of inliers is
#' \code{round(n * (1 - out_perc))}; the remaining points are outliers.
#'
#' @param out_mag
#' The magnitude of the outliers in terms of the covariance: the factor the
#' inlier covariance is multiplied by to obtain the outlier covariance.
#'
#' @param independent
#' Whether the correlation of points are zero, i.e. whether the covariance
#' matrix is diagonal. Default to be true.
#'
#' @param cov_scale
#' Constant parameter for setting the covariance of the non-outliers. Default to be 1.
#'
#' @return
#' A list with:
#'
#' mu - mean vector
#'
#' sigma - covariance matrix of the inliers
#'
#' gauss - matrix of all points (n rows, d columns), inliers first, then
#' outliers
#'
#' outlier - matrix of outliers (zero rows when there are none)
#'
#' n_inlier - number of inliers, i.e. the number of leading rows of
#' \code{gauss} that are not outliers
#'
#' @export
#' @importFrom MASS mvrnorm
#' @importFrom stats runif
#'
#' @examples
#' simulation <- multivarGaussian(n = 200, d = 3, out_perc = 0.03, out_mag = 4)
#'
multivarGaussian <- function(n, d, out_perc, out_mag, independent = TRUE,
                             cov_scale = 1) {
  # Fail early with a clear message instead of an obscure sampling error.
  stopifnot(
    is.numeric(out_perc), length(out_perc) == 1L,
    out_perc >= 0, out_perc <= 1
  )

  # Draw `k` points as a k x d matrix. mvrnorm() cannot sample zero points
  # and returns a bare vector for a single point, so both cases are
  # normalised here to keep the result shape consistent.
  draw_points <- function(k, mu, Sigma) {
    if (k == 0) {
      return(matrix(numeric(0), nrow = 0, ncol = length(mu)))
    }
    matrix(mvrnorm(k, mu = mu, Sigma = Sigma), nrow = k, ncol = length(mu))
  }

  # Mean of the cluster
  mu <- runif(d, 1, 50)

  if (independent) {
    sigma <- diag(runif(d, 1, 3), nrow = d)
  } else {
    # Random symmetric matrix, shifted along the diagonal so that its
    # smallest eigenvalue is at least 0.5 (hence positive definite).
    sigma <- matrix(runif(d * d, -3, 3), d, d)
    sigma <- (sigma + t(sigma)) / 2
    min_eig <- min(eigen(sigma, symmetric = TRUE)$values)
    if (min_eig <= 0.5) {
      sigma <- sigma - (min_eig - 0.5) * diag(d)
    }
  }
  sigma <- cov_scale * sigma * runif(1, 1, 4)
  # NOTE(review): `sigma` already contains `cov_scale`, so the outlier
  # covariance is scaled by cov_scale^2 * out_mag. Kept as in the original
  # implementation; confirm whether the second factor is intended.
  sigma_out <- cov_scale * sigma * out_mag

  n_inlier <- round(n * (1 - out_perc))
  n_outlier <- n - n_inlier

  # Inliers come from the base covariance, outliers from the inflated one.
  inliers <- draw_points(n_inlier, mu, sigma)
  outliers <- draw_points(n_outlier, mu, sigma_out)

  list(
    mu = mu,
    sigma = sigma,
    gauss = rbind(inliers, outliers),
    outlier = outliers,
    n_inlier = n_inlier
  )
}
#' @title Simulate MVN clusters with outliers
#'
#' @description
#' Simulate data points from several different gaussian distributions. The number
#' of points from different distributions are the same. Each cluster is
#' generated by an independent call to \code{multivarGaussian}.
#'
#' @param n
#' Number of points in each Gaussian distribution
#'
#' @param d
#' Dimension of each point
#'
#' @param cluster
#' Number of clusters
#'
#' @param out_perc
#' Proportion of outliers in each cluster
#'
#' @param out_mag
#' Magnitude of covariance difference between outliers and non-outliers
#'
#' @param cov_scale
#' Covariance Scaling constant for the covariance of non-outliers
#'
#' @return
#' A list with:
#'
#' mus - all the means of each cluster in a c*d matrix
#'
#' sigmas - all the covariances of each cluster in a list of matrices
#' (length c, each matrix d*d)
#'
#' simdata - all the simulated data points (n*c rows, d columns), stacked
#' cluster by cluster
#'
#' outliers - all the simulated outliers
#'
#' idx - row indices into \code{simdata} of the non-outlier points
#'
#' @export
#'
#'
#' @examples
#' sim_info <- simMultGauss(n = 120, d = 2, cluster = 6, out_perc = 0.03, out_mag = 4)
#'
#'
simMultGauss <- function(n, d, cluster, out_perc, out_mag, cov_scale = 1) {
  # `cov_scale` must be passed by name: positionally it would bind to the
  # `independent` argument of multivarGaussian() and be silently ignored.
  clusters <- replicate(
    cluster,
    multivarGaussian(
      n = n, d = d, out_perc = out_perc, out_mag = out_mag,
      cov_scale = cov_scale
    ),
    simplify = FALSE
  )

  # Collect one named component from every simulated cluster.
  pluck_all <- function(name) lapply(clusters, `[[`, name)

  # Cluster i occupies rows (i - 1) * n + 1 .. i * n of the stacked data and
  # its inliers are the first n_inlier of those rows.
  n_inlier <- vapply(
    clusters,
    function(cl) as.numeric(cl[["n_inlier"]]),
    numeric(1)
  )
  offsets <- (seq_len(cluster) - 1) * n
  idx <- unlist(Map(function(offset, k) offset + seq_len(k), offsets, n_inlier))

  list(
    mus = do.call(rbind, pluck_all("mu")),
    sigmas = lapply(pluck_all("sigma"), as.matrix),
    simdata = do.call(rbind, pluck_all("gauss")),
    outliers = do.call(rbind, pluck_all("outlier")),
    idx = idx
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.