#' @importFrom utils head
#' @importFrom utils tail
NULL
#' Takes an original sample, and returns \code{count.required} entities
#' selected according to the weight column \code{colname.weight} of this sample (if any).
#'
#' If the population is too small, it will be made bigger by replication of entities.
#' If the population is too big, only some entities will be selected depending to weights.
#' If the population is exactly of the right size, then it will be entirely kept as
#' in the original sample.
#'
#' The actual sampling is entirely delegated to the \link{dplyr}{sample_n} function.
#'
#'
#' @param sample the sample to use for loading
#' @param count.required the count of entities expected as a result
#' @param colnaĆ.weight a string describing the column to be used as a weight in the sample
#'
#' @importFrom dplyr sample_n
#'
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
#' @keywords internal
#'
matching.generate.resize_population <- function(sample, count.required, colname.weight) {
# weights.factor <- count.required / weights.available
#print(weights.factor)
#print(count.required)
selected <- sample_n(sample, count.required, weight=sample[,colname.weight], replace=TRUE)
#print(nrow(selected))
# return everything but the weights
selected[,names(selected) != colname.weight]
}
matching.generate.copy_population <- function(n, sample, verbose=FALSE) {
if (verbose)
cat("copying population based on attributes '",paste(names(n),collapse=","),"' with ",
paste(n,collapse=","),"\n",sep="")
target <- sample$sample[FALSE,] # empty dataframe with the same properties
colnames(target) <- colnames(sample$sample)
for (name in names(n)) {
count.required <- n[name]
if (count.required > 0) {
if (verbose)
cat("\tcopying ", count.required, " having ",name,"\n",sep="")
k2v <- extract_attributes_values(name)
#print(k2v)
# accumulate the conditions on each key / value
selected_ids <- 1:nrow(sample$sample)
for (k in names(k2v)) {
selected_ids <- intersect(selected_ids, which(sample$sample[k] == k2v[[k]]))
}
# identify the subpart of the population having these values
available <- sample$sample[selected_ids,]
count.available <- nrow(available)
weights.available <- sum(available[,sample$dictionary$colname.weight])
if (verbose)
cat("\tshould copy ", count.required, " individuals for ", name,
" and found ",count.available, " individuals with weights summing to ", weights.available,"\n",sep="")
if (count.required > 0) {
toAdd <- matching.generate.resize_population(available, count.required, sample$dictionary$colname.weight)
# extend the original target population
target <- rbind(target, toAdd)
}
}
}
if (sum(n) != nrow(target)) {
stop(paste("we should have copied ",sum(n)," rows but we only copied",nrow(target),"instead\n",sep=""))
}
target
}
#' Adds a target degree column to a population
#' based on the distribution of contigency for degrees passed as a parameter.
#'
#' @param samp the original sample (used for its dictionary)
#' @param pop the population to which the column will be added to
#' @param n the expected contigencies (used as a control)
#' @param ndx the contigencies for each degree
#' @param verbose if TRUE, detailed messages are printed
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
#' @keywords internal
#'
matching.generate.add_degree <- function(samp, pop, n, ndx, verbose) {
# add the column with NA
pop$target.degree <- rep.int(NA, nrow(pop))
# for each different value
# colnames(pdi$data)
for (name in names(ndx)) {
k2v <- extract_attributes_values(name)
totalForDegree <- 0
lastDegreeNonNull <- 0
for (degree in 0:(nrow(ndx)-1)) {
count.required <- ndx[degree+1,name]
if (count.required > 0) {
totalForDegree <- totalForDegree + count.required*degree
# accumulate selection criteria: write set of attributes, and also no defined degree
criteriaRaw <- which( is.na(pop["target.degree"]) )
for (k in names(k2v)) {
criteriaRaw <- intersect(criteriaRaw, which(pop[k] == k2v[[k]]))
}
if (verbose)
cat("\tset target degree ", degree, "\t for ", count.required, " over\t", length(criteriaRaw), "\t having ", name,"\n", sep="")
if (count.required == 0) {
next
}
lastDegreeNonNull <- degree
criteria <- NULL
if (count.required < length(criteriaRaw)) {
criteria <- sample(criteriaRaw, count.required)
} else {
criteria <- criteriaRaw
}
pop[criteria,"target.degree"] <- degree
}
}
if (totalForDegree < n[name]) {
warning(paste("oops, not created enough slots here:",totalForDegree," for ",n[name]," expected; ",
"defining the last ",length(criteriaRaw), "to degree ",lastDegreeNonNull,"\n",sep=""))
}
}
still.missing <- pop[ which( is.na(pop["target.degree"]) ),]
if (nrow(still.missing)>0) {
warning("some entities were not given any degree, they will be fixed at zero target degree")
print(head(still.missing))
pop[ which( is.na(pop["target.degree"]) ),"target.degree"] <- 0
}
# so we might assess how much of each we need to precisely meet the expectation
pop
}
#' Generates a synthetic population based on the resolution of the problem,
#' applied on two samples \code{sample.A} and \code{sample.B}.
#' It first resizes the two populations so they reach the expected sizes,
#' the adds degrees to them, then creates links based on degrees
#' and characteristics.
#'
#' @param case the case resolved by the \code{\link{matching.solve}} function.
#' @param sample.A the sample to use as population A
#' @param sample.B the sample to use as population B
#' @param verbose displays detailed messages if TRUE
#' @param force if TRUE, will try to generate even very big populations (and will likely fail)
#' @return the generated population
#'
#' @seealso \code{\link{matching.solve}} to prepare the case for this call
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
matching.generate <- function(case, sample.A, sample.B, verbose=FALSE, force=FALSE) {
if (class(case) != "dpp_resolved")
stop("case should be the result of a solving by matching.solve")
if (verbose)
cat("starting generation\n")
# avoid starting the generation of a whole country without checking the user really wants it
# (s/he often doesn't)
if (case$gen$hat.nA + case$gen$hat.nB > 1e6) {
if (force)
warning(paste("you are trying to generate a population of more than 1 million of individuals, ",
"which probably does not make sense, and also might fail and freeze your system.\n",
" Good luck...",
sep=""),
immediate.=TRUE)
else
stop(paste("you are trying to generate a population of more than 1 million of individuals: ",
"case$gen$hat.nA=", case$gen$hat.nA , ", case$gen$hat.nB=", case$gen$hat.nB,
" .\nThat probably does not make much sense. It also would likely fail and freeze your system.\n",
"If you really want to do it, add parameter force=TRUE and cross fingers.",
sep=""))
}
# copy the individuals required for A
targetA <- sample.A$sample[FALSE,] # empty with the same properties
nni <- case$gen$hat.ci
nnj <- case$gen$hat.cj
# copy individuals
targetA <- matching.generate.copy_population(n=nni, sample=sample.A, verbose=verbose)
targetB <- matching.generate.copy_population(n=nnj, sample=sample.B, verbose=verbose)
# cat("creating ids...\n")
# create unique ids (unique at the level of both populations)
targetA$id.A <- seq.int(1, nrow(targetA))
targetB$id.B <- seq.int(nrow(targetA)+1, nrow(targetA)+nrow(targetB))
# create a matching table with links (we init it with nothing so we avoid costly resizing later)
links <- data.frame(
id.A=c(),
id.B=c()
)
if (verbose)
cat("\nadding target degrees for population A...\n")
# add columns for target degree
targetA <- matching.generate.add_degree(sample.A, targetA, case$gen$hat.ni, case$gen$hat.ndi, verbose=verbose)
if (verbose)
cat("\nadding target degrees for population B...\n")
targetB <- matching.generate.add_degree(sample.B, targetB, case$gen$hat.nj, case$gen$hat.ndj, verbose=verbose)
# add columns for obtained degrees (0 for now)
targetA$current.degree <- rep.int(0, nrow(targetA))
targetB$current.degree <- rep.int(0, nrow(targetB))
if (verbose)
cat("\nmatching A and B to create total",case$gen$hat.nL,"links...\n")
# match them
for (cA in colnames(case$gen$hat.nij)) {
k2v.A <- extract_attributes_values(cA)
for (cB in rownames(case$gen$hat.nij)) {
k2v.B <- extract_attributes_values(cB)
count.required <- case$gen$hat.nij[cB,cA]
if (count.required > 0) {
if (verbose)
cat("\tshould create", count.required, "\t links for A:\t", cA, "\tand B:\t", cB, "\n")
pass.remaining <- 1
links.toadd.A <- c()
links.toadd.B <- c()
while ( (pass.remaining > 0) & (count.required > 0) ) {
pass.remaining <- pass.remaining - 1
criteriaAraw <- which( targetA$current.degree < targetA$target.degree )
for (k in names(k2v.A)) {
criteriaAraw <- intersect(criteriaAraw, which(targetA[k] == k2v.A[[k]]))
}
criteriaBraw <- which( targetB$current.degree < targetB$target.degree )
for (k in names(k2v.B)) {
criteriaBraw <- intersect(criteriaBraw, which(targetB[k] == k2v.B[[k]]))
}
available.degree.A <- max(targetA[criteriaAraw,"target.degree"] - targetA[criteriaAraw,"current.degree"])
available.degree.B <- max(targetB[criteriaBraw,"target.degree"] - targetB[criteriaBraw,"current.degree"])
# first pass for largely connectable ones
if ( (available.degree.A > 1) | (available.degree.B > 1) ) {
pass.remaining <- 1
}
count.found <- min(length(criteriaAraw), length(criteriaBraw), count.required)
if ( count.found == 0) {
cat("\tfound no valid candidate...\n")
break
}
criteriaA <- sample(criteriaAraw, count.found)
criteriaB <- sample(criteriaBraw, count.found)
# add the links
links.toadd.A <- c(links.toadd.A, targetA[criteriaA,"id.A"])
links.toadd.B <- c(links.toadd.B, targetB[criteriaB,"id.B"])
# update the degree
targetA[criteriaA,"current.degree"] <- targetA[criteriaA,"current.degree"] + 1
targetB[criteriaB,"current.degree"] <- targetB[criteriaB,"current.degree"] + 1
count.required <- count.required - count.found
}
if ( count.required > 0) {
warning(paste("\t/!\\ failed to create ", count.required," link(s) ",
"for ", cA,
"\tand B:\t", cB, "\n",sep=""))
}
links.toadd <- data.frame(
id.A=links.toadd.A,
id.B=links.toadd.B
)
links <- rbind(links, links.toadd)
}
}
}
# add a column for current degree
res <- case
res$pop <- list(A=targetA, B=targetB, links=links)
class(res$pop) <- "dpp_population"
class(res) <- "dpp_result"
res
}
#' Display a generated population
#'
#' @param x the population to print
#' @param ... ignored
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
print.dpp_population <- function(x,...) {
cat("Synthetic population with ",nrow(x$A)," parents, ",nrow(x$B)," children, ",nrow(x$links)," links\n",sep="")
cat("\n$A (first lines)\n")
print(head(x$A))
cat("\n$B (first lines)\n")
print(head(x$B))
cat("\n$links (first lines)\n")
print(head(x$links,n=15))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.