#' @include clhs-internal.R
#' @rdname clhs
#' @method clhs data.frame
#' @importFrom stats quantile runif
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom plyr alply
#' @noRd
#' @export
clhs.data.frame <- function(
x, # data.frame
size, # Number of samples you want
must.include = NULL, # row index of data that must be in the final sample
can.include = NULL, # Indexes from which sampling is allowed
cost = NULL, # Number or name of the attribute used as a cost
iter = 10000, # Number of iterations
use.cpp = TRUE, # use C++ code for metropolis-hasting loop?
temp = 1, # initial temperature
tdecrease = 0.95, # temperature decrease rate
weights = list(numeric = 1, factor = 1, correlation = 1), # weight for continuous data , weight for correlation among data, weight for object data
eta = 1,
obj.limit = -Inf, # Stopping criterion
length.cycle = 10, # Number of cycles done at each constant temperature value
simple = TRUE, # only return selected indices (if false, return a more complex S3 object)
progress = TRUE, # progress bar,
track = NULL, # just to have the cost computed without having it guiding the process
use.coords = FALSE, # ignored by the `data.frame` method.
... # ignored
) {
##check input
if (tdecrease >= 1) stop("tdecrease should be < 1")
if(!is.null(can.include) & !use.cpp) warning("can.include is not implemented in the R version and will be ignored")
include <- must.include
# Unless restricted by can.include, all the rows in x can potentially be sampled
if(is.null(can.include)){
can.include <- 1:nrow(x)
}
if (!is.null(include)) {
if (size <= length(include)) {
stop(paste0("size (", size, ") should be larger than length of include (", length(include), ")"))
}
}
# C++ version
if (use.cpp) {
x <- as.data.frame(x)
if (is.null(cost)) {
costVec <- rep(0,5)
costFlag <- FALSE
} else {
if (is.numeric(cost)) i_cost <- cost
else i_cost <- which(names(x) == cost)
if (!length(i_cost)) stop("Could not find the cost attribute.")
costVec <- x[, i_cost, drop = T]
costVec[is.infinite(costVec)] <- 1000
x <- x[, -i_cost]
costFlag <- TRUE
}
areFactors <- FALSE
i_factor <- which(!sapply(x, is.numeric))
n_factor <- length(i_factor)
if (n_factor > 0) {
areFactors <- TRUE
data_continuous <- x[, -1*i_factor, drop = FALSE]
data_factor <- x[, i_factor, drop = FALSE]
# for(i in 1:ncol(data_factor)){
# data_factor[,i] <- as.numeric(as.factor(data_factor[,i]))
# }
data_factor <- apply(data_factor, 2, function(x) as.numeric(as.factor(x)))
data <- as.matrix(cbind(data_factor, data_continuous))
factIdx <- 1:n_factor
ncont <- ncol(data_continuous)
} else {
data <- as.matrix(x)
factIdx <- 1:5 # why five??
ncont <- ncol(data)
data_continuous <- x
}
if (length(eta) == 1) {
eMat <- matrix(data = eta, nrow = size, ncol = ncont)
} else {
etaDim <- dim(eta)
if (!all(c(size, ncont) == etaDim)){
stop("eta matrix is incorrect dimension")
} else {
eMat <- eta
}
}
continuous_strata <- apply(
data_continuous,
2,
function(x) {
quantile(x, probs = seq(0, 1, length.out = size + 1), na.rm = TRUE)
}
)
if(is.null(include)){
dat <- data
inc <- dat[0,]
ssize <- size
} else{
dat <- data[-include,]
inc <- data[include,,drop = FALSE] ##keep as matrix if just one row
ssize <- size - length(include)
can.include <- 1:nrow(dat)
}
can.include <- can.include - 1 # convert to zero based for C
res <- CppLHS(xA = dat, cost = costVec, strata = continuous_strata, include = inc, idx = can.include,
factors = areFactors, i_fact = factIdx-1, nsample = ssize, cost_mode = costFlag, iter = iter,
wCont = weights$numeric, wFact = weights$factor, wCorr = weights$correlation, etaMat = eMat,
temperature = temp, tdecrease = tdecrease, length_cycle = length.cycle)
res$index_samples <- res$index_samples + 1 ##fix indexing difference
if(!is.null(include)){
res$index_samples <- c(res$index_samples,include)
}
res$sampled_data <- x[res$index_samples,]
if (simple) res <- res$index_samples
else {
# Making up the object to be returned
class(res) = c("cLHS_result","list")
}
return(res)
}else{
## No cost taking into account during optimisation
if (is.null(cost)) {
cost_mode <- FALSE
if (!is.null(track)) {
## Track mode: tracking cost (without taking it into account during optimisation)
# Get the id of the column used as a cost attribute
if (is.numeric(track)) i_cost <- track
else i_cost <- which(names(x) == track)
# Get cost attribute column
cost <- x[ , i_cost, drop = FALSE]
# Remove cost attribute from attribute table
x <- x[, -1*i_cost, drop = FALSE]
# Flags
cost_mode <- TRUE
track_mode <- TRUE
} else {
## No cost tracking, just plain optimisation of attributes
track_mode <- FALSE
}
} else {
## Cost is taken into account for optimisation
# Get the id of the column used as a cost attribute
if (is.numeric(cost)) i_cost <- cost
else i_cost <- which(names(x) == cost)
if (!length(i_cost)) stop("Could not find the cost attribute.")
# Get cost attribute column
cost <- x[ , i_cost, drop = FALSE]
# Remove cost attribute from attribute table
x <- x[, -1*i_cost, drop = FALSE]
# Si include, cost is 0
if (!is.null(include)) {
cost[include, ] <- 0
}
# Flags
cost_mode <- TRUE
track_mode <- FALSE # cost is taken into account, therefore computed
}
# Detection of any factor data
i_factor <- which(!sapply(x, is.numeric))
n_factor <- length(i_factor)
if (n_factor > 0) {
data_continuous <- x[, -1*i_factor, drop = FALSE]
data_factor <- x[, i_factor, drop = FALSE]
# Creating a list storing the levels of each factor
factor_levels <- apply(data_factor, 2, function(x) {
ifelse(is.factor(x), res <- levels(x), res <- levels(factor(x)))
res}
)
} else {
data_continuous <- x
}
metropolis <- exp(-1*0/temp) # Initial Metropolis value
n_data <- nrow(data_continuous) # Number of individuals in the data set
# Edge of the strata
continuous_strata <- apply(
data_continuous,
2,
function(x) {
quantile(x, probs = seq(0, 1, length.out = size + 1), na.rm = TRUE)
}
)
# Data correlation
cor_mat <- cor(data_continuous, use = "complete.obs")
# For object/class variable
if (n_factor == 0) data_factor_sampled <- data.frame(stringsAsFactors = TRUE)
else factor_obj <- alply(data_factor, 2, function(x) table(x)/n_data)
# Mandatory data in the sample
sampled_size <- size - length(include)
not_included <- setdiff(1:n_data, include)
# initialise, pick randomly
n_remainings <- n_data - size # number of individuals remaining unsampled
i_sampled <- c(sample(not_included, size = sampled_size, replace = FALSE), include) # individuals randomly chosen
i_unsampled <- setdiff(1:n_data, i_sampled) # individuals remaining unsampled
data_continuous_sampled <- data_continuous[i_sampled, , drop = FALSE] # sampled continuous data
if (n_factor > 0) data_factor_sampled <- data_factor[i_sampled, , drop = FALSE] # sampled factor data
# objective function
res <- .lhs_obj(size = size, data_continuous_sampled = data_continuous_sampled, data_factor_sampled = data_factor_sampled, continuous_strata = continuous_strata, cor_mat = cor_mat, factor_obj = factor_obj, weights = weights, eta = eta)
obj <- res$obj # value of the objective function
delta_obj_continuous <- res$delta_obj_continuous
sample.weights <- res$sample.weights
if (cost_mode) {
# (initial) operational cost
op_cost <- sum(cost[i_sampled, ])
# vector storing operational costs
op_cost_values <- vector(mode = 'numeric', length = iter)
} else op_cost_values <- NULL
# vector storing the values of the objective function
obj_values <- vector(mode = 'numeric', length = iter)
# progress bar
if (progress) pb <- txtProgressBar(min = 1, max = iter, style = 3)
if (any(duplicated(i_sampled))) browser()
if (any(i_sampled %in% i_unsampled)) browser()
for (i in 1:iter) {
# storing previous values
previous <- list()
previous$obj <- obj
previous$i_sampled <- i_sampled
previous$i_unsampled <- i_unsampled
previous$delta_obj_continuous <- delta_obj_continuous
if (cost_mode) previous$op_cost <- op_cost
if (runif(1) < 0.5) {
# pick a random sampled point and random unsampled point and swap them
idx_removed <- sample(1:length(setdiff(i_sampled, include)), size = 1, replace = FALSE)
spl_removed <- setdiff(i_sampled, include)[idx_removed]
idx_added <- sample(1:length(i_unsampled), size = 1, replace = FALSE)
i_sampled <- setdiff(i_sampled, include)[-idx_removed]
i_sampled <- c(i_sampled, i_unsampled[idx_added], include)
i_unsampled <- i_unsampled[-idx_added]
i_unsampled <- c(i_unsampled, spl_removed)
if (any(duplicated(i_sampled))) browser()
if (any(i_sampled %in% i_unsampled)) browser()
# creating new data sampled
data_continuous_sampled <- data_continuous[i_sampled, , drop = FALSE]
if (n_factor > 0) data_factor_sampled <- data_factor[i_sampled, , drop = FALSE]
}
else {
# remove the worse sampled & resample
sample.weights <- sample.weights + 1 ## to ensure all weights are positive
sample.weights[i_sampled %in% include] <- 0 ## to ensure we don't select any include samples as the worst
worse <- max(sample.weights)
i_worse <- which(sample.weights == worse)
# If there's more than one worse candidate, we pick one at random
if (length(i_worse) > 1) i_worse <- sample(i_worse, size = 1)
# swap with reservoir
spl_removed <- setdiff(i_sampled, include)[i_worse] # will be removed from the sampled set.
idx_added <- sample(1:n_remainings, size = 1, replace = FALSE) # new candidate that will take their place
i_sampled <- setdiff(i_sampled, include)[-i_worse]
i_sampled <- c(i_sampled, i_unsampled[idx_added], include)
i_unsampled <- i_unsampled[-idx_added]
i_unsampled <- c(i_unsampled, spl_removed)
if (any(duplicated(i_sampled))) browser()
if (any(i_sampled %in% i_unsampled)) browser()
# creating new data sampled
data_continuous_sampled <- data_continuous[i_sampled, , drop = FALSE]
if (n_factor > 0) data_factor_sampled <- data_factor[i_sampled, , drop = FALSE]
}
# calc obj
res <- .lhs_obj(size = size, data_continuous_sampled = data_continuous_sampled, data_factor_sampled = data_factor_sampled, continuous_strata = continuous_strata, cor_mat = cor_mat, factor_obj = factor_obj, weights = weights, eta = eta)
obj <- res$obj
delta_obj_continuous <- res$delta_obj_continuous
sample.weights <- res$sample.weights
# Compare with previous iterations
delta_obj <- obj - previous$obj
metropolis <- exp(-1*delta_obj/temp) #+ runif(1)*temp
if (cost_mode) {
# op costs
op_cost <- sum(cost[i_sampled, ])
delta_cost <- op_cost - previous$op_cost
if (track_mode) metropolis_cost <- Inf # runif(1) >= Inf is always FALSE
else metropolis_cost <- exp(-1*delta_cost/temp)
}
else metropolis_cost <- Inf # runif(1) >= Inf is always FALSE
# If the optimum has been reached
if (obj <= obj.limit) {
warning("\nThe objective function has reached its minimum value, as specified by the obj.limit option.")
if (progress) {
setTxtProgressBar(pb, i)
close(pb)
}
obj_values[i] <- obj
if (cost_mode) op_cost_values[i] <- op_cost
break
}
# Revert change
if (delta_obj > 0 & runif(1) >= metropolis | runif(1) >= metropolis_cost) {
i_sampled <- previous$i_sampled
i_unsampled <- previous$i_unsampled
data_continuous_sampled <- data_continuous[i_sampled, , drop = FALSE]
if (n_factor > 0) data_factor_sampled <- data_factor[i_sampled, , drop = FALSE]
obj <- previous$obj
delta_obj_continuous <- previous$delta_obj_continuous
if (cost_mode) op_cost <- previous$op_cost
}
# Storing the objective function value of the current iteration
obj_values[i] <- obj
if (cost_mode) op_cost_values[i] <- op_cost
# Temperature decrease
if ((i %% length.cycle) == 0) temp <- temp*tdecrease
# Update progress bar
if (progress) setTxtProgressBar(pb, i)
}
# Close progress bar
if (progress) close(pb)
if (n_factor > 0) {
sampled_data <- data.frame(data_continuous_sampled, data_factor_sampled, stringsAsFactors = TRUE)
# reordering cols
sampled_data <- sampled_data[, names(x)]
} else sampled_data <- data_continuous_sampled
# Simple output - just the sampled object
if (simple) res <- i_sampled
else {
# Making up the object to be returned
res <- list(
initial_object = x,
index_samples = i_sampled,
sampled_data = sampled_data,
obj = obj_values,
cost = op_cost_values
)
class(res) = c("cLHS_result","list")
}
res
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.