#' Names distributions
#'
#' Class for analysing time-distribution of names from a names database with fields: name, quantity, year
#'
#' @section Methods:
#' \describe{
#' \item{\code{process()}}{}
#' }
#' @field name.year.count name year count distribution
#' @format \code{\link{R6Class}} object.
#' @docType class
#' @importFrom R6 R6Class
#' @import dplyr
#' @import lgr
#' @importFrom reshape2 dcast
#' @export
NamesDistribution.class <- R6::R6Class("NamesDistribution",
public = list(
name.year.count = NA,
argentina.names.retriever = NA,
available.years = NULL,
available.names = NULL,
#state
names.distribution = NULL,
# logger
logger = NA,
initialize = function(argentina.names.retriever) {
#initialize = function(name.year.count) {
#self$name.year.count <- name.year.count
self$argentina.names.retriever <- argentina.names.retriever
self$logger <- genLogger(self)
self
},
initDefaultValues = function(){
universe <- self$argentina.names.retriever$name.year.count
if (is.null(self$available.years)){
self$available.years <- sort(unique(universe$year))
}
# if (is.null(self$available.names)){
# self$available.names <- sort(unique(universe$name))
# }
},
getFilteredNameYearCount = function(
names = self$available.names,
years = self$available.years){
filtered.names <- self$name.year.count
filtered.names <- filtered.names %>%
filter (year %in% years)
if (!is.null(names)){
filtered.names$name <- normalizeString(filtered.names$name)
names <- normalizeString(names)
filtered.names <- filtered.names %>%
filter (name %in% names)
}
filtered.names
},
getNamesRanking = function(n = 20,
names = self$available.names,
years = self$available.years){
filtered.names <- self$getFilteredNameYearCount(names = names,
years = years)
filtered.names %>%
dplyr::group_by(name) %>%
summarize( count = sum(count)) %>%
arrange(desc(count)) %>%
dplyr::top_n(n = n)
},
setUpDistribution = function(names.count,
years = self$available.years) {
names.count$name <- normalizeString(names.count$name)
self$name.year.count <- self$argentina.names.retriever$name.year.count %>%
filter(name %in% names.count$name) %>%
filter(year %in% years)
getLogger(self)$info(
paste("Distribution for", nrow(names.count), "names and",
length(years), "years has", nrow(self$name.year.count),
"observations"))
self$name.year.count
},
getNamesDistribution = function(names = self$available.names,
years, relative = FALSE,
decimals = 5){
filtered.names <- self$getFilteredNameYearCount(names = names,
years = years)
ret <- dcast(filtered.names,
formula = name ~ year,
value.var = "count",
fun.aggregate = sum,
fill = 0)
data.cols <- 2:ncol(ret)
total <- apply(ret[, data.cols], MARGIN = 1, FUN = sum)
if (relative){
for (i in seq_len(nrow(ret))){
ret[i, data.cols] <- round(ret[i, data.cols] / total[i],
digits = decimals)
}
#check
#print(abs(apply(ret[,data.cols], MARGIN = 1, FUN = sum)-1) <
#10^-(decimals-1))
}
ret <- ret[order(-total), ]
ret
},
runSimulations = function(names.count,
years = self$available.years,
seed = 34441222,
runs = 1){
seeds <- round(runif(runs, 0, 10 ^ 8))
getLogger(self)$info(paste("Running", runs, "simulations using seeds",
paste(seeds, collapse = ",")))
self$setUpDistribution(names.count, years)
ret <- NamesDistributionSimulationMultipleRuns.class$new()
#debug
ret <<- ret
self$names.distribution <- self$getNamesDistribution(years = years,
names = names.count$name,
relative = TRUE)
for (i in seq_len(runs)){
run <- self$simulateDistribution(names.count, years,
seed = seeds[i])
ret$addSimulation(run)
}
ret
},
simulateDistribution = function(names.count,
years = self$available.years,
seed){
#Validate and prepare data
mandatory.fields <- c("name", "count")
valid.data <- c("name", "count") %in% names(names.count)
if (min(valid.data) == 0){
#If there is missing row
stop(paste("Missing fields in names.count:",
"name and count must be present but there are:",
paste(names(names.count), collapse = ",")))
}
names.count <- names.count %>%
dplyr::group_by(name) %>% summarize( count = sum(count))
names.count$name <- normalizeString(names.count$name)
getLogger(self)$info(paste("Running simulation with seed", seed,
"for", nrow(names.count), "names using",
nrow(self$names.distribution), "names distributions"))
set.seed(seed)
ret <- self$names.distribution[0, ]
i <- 1
names.not.processed <- NULL
for (current.name in names.count$name){
current.name.count <- names.count %>% filter(name == current.name)
current.name.distribution <-
self$names.distribution %>%
filter(name == current.name)
if (nrow(current.name.distribution) > 0){
cols.data <- 2:ncol(current.name.distribution)
current.name.distribution <- current.name.distribution[, cols.data]
#debug
#print(years)
#current.name.distribution <<- current.name.distribution
#current.name.count <<- current.name.count
#current.name.distribution <<- current.name.distribution
current.name.sample <- sample(years, size = current.name.count$count,
replace = TRUE,
prob = current.name.distribution)
sample.distribution <- data.frame(year = years) %>%
dplyr::left_join(data.frame(year = current.name.sample, count = 1),
by = "year") %>%
dplyr::mutate_each(funs(replace(., which(is.na(.)), 0))) %>%
dplyr::group_by(year) %>%
summarize(count = sum(count))
sample.distribution.tab <- t(sample.distribution)[2, ]
names(sample.distribution.tab) <- sample.distribution$year
sample.distribution.tab <- as.data.frame(t(sample.distribution.tab))
sample.distribution.tab$name <- current.name
ret[i, ] <- sample.distribution.tab[, names(ret)]
i <- i + 1
}
else{
names.not.processed <- c(names.not.processed, current.name)
}
}
getLogger(self)$info(paste(i, "names processed and",
length(names.not.processed),
"names not processed as not present in names distribution"))
getLogger(self)$debug(paste(names.not.processed, collapse = ","))
total <- as.data.frame(t(apply(ret[, cols.data], MARGIN = 2, sum)))
total.relative <- round(total / sum(total), 6)
total$name <- "total"
total.relative$name <- "total.relative"
ret <- rbind(ret, total[, names(ret)])
ret <- rbind(ret, total.relative[, names(ret)])
ret <- NamesDistributionSimulationRun.class$new(distribution.matrix = ret)
ret
}
))
#' Generate Sample Distribution
#' Gets a subset of
generateSampleDistribution <- function(name.year.count,
seed,
sample.ratio = .01,
min.count = 100){
universe.size <- nrow(name.year.count)
set.seed(seed)
names.year.sample <- name.year.count[
sample(1 : universe.size,
size = sample.ratio * universe.size,
replace = FALSE), ]
nrow(names.year.sample)
names.sample <- names.year.sample %>%
group_by(name) %>%
summarize( count = sum(count)) %>%
arrange(desc(count)) %>%
filter(count > min.count)
nrow(names.sample)
nrow(names.year.sample)
# Names with min.count in years
names.year.sample <- names.year.sample %>%
filter(name %in% names.sample$name)
nrow(names.year.sample)
names.year.sample$year.count <- names.year.sample$count
names.year.sample$count <-
vapply(names.year.sample$count,
FUN = function(x){
sample(1:x, size = 1, replace = FALSE)
},
FUN.VALUE = numeric(1))
names.year.sample
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.