#' DataSets class
#'
#' This class defines DataSets, a class holding one or more @DataSet@ objects.
#' @export DataSets
#' @exportClass DataSets
DataSets <- R6::R6Class(
classname = "DataSets",
inherit = Node,
## Properties
private = list(
.name = NULL, # character
.datasets = NULL, # list<DataSet>
.xVar = NULL # Variable
),
## Methods
public = list(
initialize = function(name, datasets, desc = NULL) {
if (!is.list(datasets) || length(datasets) < 1) stop("DataSets$new(..) requires a list of datasets, with >= 1 element")
super$initialize(desc)
self$name <- name
self$datasets <- datasets
## if (rhaskell::any(rhaskell::pNIdentical(rhaskell::head(datasets)$xVar$vals), rhaskell::map(function(x) x$xVar$vals, rhaskell::tail(datasets))))
## stop("In DataSets$new(..) the xVar values have to be equal for all datasets: ", paste0(rhaskell::map(function(x) x$xVar, datasets)))
if (rhaskell::any(rhaskell::pNIdentical(rhaskell::head(datasets)$xVar$vals) %comp% (function(x) x$xVar$vals), rhaskell::tail(datasets)))
stop("In DataSets$new(..) the xVar *values* have to be equal for all datasets")
if (rhaskell::any(rhaskell::pNeq(rhaskell::head(datasets)$xVar$name) %comp% (function(x) x$xVar$name), rhaskell::tail(datasets)))
warning("In DataSets$new(..) the xVar *names* are different. Using the first given xVar-name")
self$xVar <- rhaskell::head(datasets)$xVar
},
#' Get the dataset by the name.
#'
#' @param dsName Name of DataSet
#' @return rhaskell::Either Character DataSet
getDataSet = function(dsName) {
for (ds in self$datasets) if (ds$name == dsName) return(rhaskell::Right(ds))
return(rhaskell::Left(paste0("Could not find dataset with name '", dsName, "'")))
},
#' Apply a function `f :: a -> b` to each element of one specific variable and for all DataSets.
#'
#' @param fun: function to apply of type `a -> b`.
#' @param variable: variable name to apply function to.
#' @param funDesc: Textual description of function.
#' @return a new DataSets object.
map = function(fun, column, funDesc = deparse1(fun)) {
if (base::is.list(column)) stop("Cannot use multiple columns in function `map`")
dss <- rhaskell::map(function(ds) ds$map(fun, column, funDesc), self$datasets)
dsNew <- DataSets$new(paste0(self$name, " mapped"), dss, desc = paste0("mapped ", funDesc, " over ", column))
self$addChild(dsNew)
return(dsNew)
},
#' Accumulate one or more variable values to a new variable using a function `f :: [Vector a] -> b`. Adds the new variable to the DataSet.
#'
#' @param newVarName: Name of new variable
#' @param fun: function to apply of type `[Vector a] -> b`. Each element is a scalar or vector (in case of matrix variables) of inputs from the specified columns.
#' @param columns: columns used as input to the function.
#' @param funDesc: Textual description of function.
#' @return a new DataSet object.
accumTo = function(newVarName, fun, columns, funDesc = deparse1(fun)) {
dss <- rhaskell::map(function(ds) ds$accumTo(newVarName, fun, columns, funDesc), self$datasets)
dsNew <- DataSets$new(self$name, dss, desc = paste0("'", newVarName, "' created: accumulation with ", funDesc, " of ", paste0(columns, collapse = ",")))
self$addChild(dsNew)
return(dsNew)
},
#' Rename a variable over all DataSets
#'
#' @param newVarName: Name of new variable
#' @param oldVarName: Old variable name
#' @return a new DataSet object.
renameVariableTo = function(newVarName, oldVarName) {
dss <- rhaskell::map(function(ds) ds$addVariable(ds$getVariable(oldVarName)$rename(newVarName))$removeVariable(oldVarName), self$datasets)
dsNew <- DataSets$new(self$name, dss, desc = paste0("'", oldVarName, "' renamed to '", newVarName, "'"))
self$addChild(dsNew)
return(dsNew)
},
#' Group all datasets by the specified columns and aggregate using the Aggregate function objects.
#'
#' @param columns: Columns to group on. Every tuple of values from these columns will only occur once after grouping.
#' @param aggregates: Functions to apply on the data that will be aggregated, i.e. if and how other columns that will appear be kept.
#' @param xVarName: Name of new variable.
#' @return a new DataSet object.
groupBy = function(columns, aggregates, xVarName = "t", na.rm = TRUE) {
dss <- rhaskell::map(function(ds) ds$groupBy(columns, aggregates, xVarName, na.rm), self$datasets)
dsNew <- DataSets$new(paste0(self$name, " grouped"), dss, desc = paste0("grouped by ", rhaskell::unlines(rhaskell::intersperse(", ", columns))))
self$addChild(dsNew)
return(dsNew)
},
#' Remove a variable from all datasets.
#'
#' @param column: Column/Variable name to remove.
#' @param stopIfNotExists call `stop` if variable does not eixt
#' @return a new DataSet object.
removeVariable = function(column, stopIfNotExists = TRUE) {
dss <- rhaskell::map(function(ds) ds$removeVariable(column, stopIfNotExists), self$datasets)
dsNew <- DataSets$new(paste0(self$name, " ,removed var"), dss, desc = paste0("removed ", column))
self$addChild(dsNew)
return(dsNew)
},
#' Create a core model for each dataset
#' TODO!!!
createCoreModelsFor = function(outcomes, fitters, formulas, adaptions, selection) {
if (!base::is.list(outcomes)) outcomes <- list(outcomes)
if (!base::is.list(fitters)) fitters <- list(fitters)
if (!base::is.list(formulas)) formulas <- list(formulas)
coreModels <- CoreModelSelectors$new(paste0("Possible Core Models for DataSets of '", self$name, "'"), self)
for (ds in self$datasets) {
env <- ds$asEnvironment() # create environment
for (y in outcomes) {
selector <- CoreModelSelector$new(paste(ds$name, y), ds)
for (fitter in fitters) {
for (formula in formulas) {
fitter$data <- env # set data frame
fit <- fitter$fit(paste(y, formula)) # fit model
selector$addPossibleCoreModel(fit) # save as possible model
}
}
if (!selector$hasAnyConvergedModel()){
stop("No core model for dataset '", ds$name, "' and '", y, "' converge. Cannot proceed")
}
coreModels$addCoreModelSelector(y, selector)
}
}
return(coreModels)
},
#' Create CITS Model
#' TODO: improve interface to no require 1/0 vectors for periods
#'
createCITSModel = function(mainDsName, outcomes, selectionVars, addModelSelection, autoCorrTerms = list(), resultFolder = "results", citsFolder = "CITS", interactive = TRUE) {
resPath <- paste0(resultFolder, "/", citsFolder)
ds <- self$getDataSet(mainDsName)$fromRightOrStop()
models <- list()
for (outcome in outcomes) {
varNamePeriods <- "CITS_periods"
dssCITS <- self$accumTo(varNamePeriods, sum, selectionVars)
dsCITS <- dssCITS$getDataSet(mainDsName)$fromRightOrStop()
## Fit model: outcome ~ + trend + CITS_periods
## TODO: if outcome continous: guassian, otherwise quasipoisson
fitter <- FitterGLM$new(family = stats::quasipoisson, na.action = "na.exclude")
fitter$data <- dsCITS$asEnvironment(as_tibble = FALSE)
## rhs <- paste(base::append(selectionVars, addModelSelection), collapse = " + ")
rhs <- paste(base::append(dsCITS$xVar$name, varNamePeriods), collapse = " + ")
## rhs <- paste(base::append(base::append(dsCITS$xVar$name, varNamePeriods), addModelSelection), collapse = " + ")
rhs <- paste(base::append(varNamePeriods, addModelSelection), collapse = " + ")
fit <- fitter$fit(paste(outcome, "~", rhs))
## datanew <- tibble::new_tibble(dsCITS$xVar$asMatrix())
## datanew <- tibble::add_column(datanew, dsCITS$getVariable(varNamePeriods)$asMatrix())
## datanew <- rhaskell::foldl(function(df, out) tibble::add_column(df, dsCITS$getVariable(out)$asMatrix()), datanew, selectionVars)
preds <- tibble::as_tibble(stats::predict(fit$model, type = "response", newdata = dsCITS$asEnvironment()))
env <- dsCITS$asEnvironment()
dsCITSCF <- dsCITS$map(function(x) 0, varNamePeriods)
predcf <- tibble::as_tibble(stats::predict(fit$model, type = "response", dsCITSCF$asEnvironment())) # counterfactual
# dpred <- data.frame(T = datanew$T, pred1 = stats::predict(fit$model, type = "response", newdata = datanew))
xVals <- dsCITS$xVar$asTibble()
## xVals <- dsCITS$getVariable("date")$asTibble()
yVals <- dsCITS$getVariable(outcome)$asTibble()
plDs <- rhaskell::map(function(sel) {
selVals <- dsCITS$getVariable(sel)$asTibble()
xs <- xVals[selVals == 1, ]
ys <- yVals[selVals == 1, ]
return(PlotDataGeomRect$new(sel
, xMin = base::min(base::as.vector(xs)[[1]], na.rm = TRUE)
, xMax = base::max(base::as.vector(xVals)[[1]], na.rm = TRUE)
, yMin = base::min(base::as.vector(ys)[[1]], na.rm = TRUE)
, yMax = base::max(base::as.vector(yVals)[[1]], na.rm = TRUE)
, alpha = 0.35))
}, selectionVars)
plDs <- base::append(plDs, list(PlotDataGeomPoint$new(name = outcome, xVals, yVals, alpha = 0.40)
, PlotDataGeomLine$new(name = "prediction", xVals, preds, colour = "orangered3", linetype = 2, size = 1.0, alpha = 1.0)
, PlotDataGeomLine$new(name = "counterfactual", xVals, predcf, colour = "firebrick1", linetype = 1, size = 1.0, alpha = 1.0)
))
plot <- Plot$new(paste0(self$name, ": ", outcome, " ~ ", rhs), plotData = plDs, yAxisTitle = outcome, subtitle = "Only change in level, all periods. Bf diagnosis"
, path = resPath, filename = paste0(self$name, "_", "1-level-change_", outcome))
plot$plot()
fitModelAndPlot <- function(rhsSel) {
rhs <- paste(base::append(rhsSel, addModelSelection), collapse = " + ")
fitter <- FitterGLM$new(family = stats::quasipoisson, na.action = "na.exclude")
fitter$data <- dsCITS$asEnvironment(as_tibble = FALSE)
fit <- fitter$fit(paste(outcome, "~", rhs))
preds <- tibble::as_tibble(stats::predict(fit$model, type = "response", newdata = dsCITS$asEnvironment()))
dsCITSCF <- dsCITS$map(function(x) 0, rhsSel)
predcf <- tibble::as_tibble(stats::predict(fit$model, type = "response", dsCITSCF$asEnvironment())) # counterfactual
xVals <- dsCITS$xVar$asTibble()
## xVals <- dsCITS$getVariable("date")$asTibble()
yVals <- dsCITS$getVariable(outcome)$asTibble()
plDs <- rhaskell::map(function(sel) {
selVals <- dsCITS$getVariable(sel)$asTibble()
xs <- xVals[selVals == 1, ]
ys <- yVals[selVals == 1, ]
return(PlotDataGeomRect$new(sel
, xMin = base::min(base::as.vector(xs)[[1]], na.rm = TRUE)
, xMax = base::max(base::as.vector(xVals)[[1]], na.rm = TRUE)
, yMin = base::min(base::as.vector(ys)[[1]], na.rm = TRUE)
, yMax = base::max(base::as.vector(yVals)[[1]], na.rm = TRUE)
, alpha = 0.35))
}, list(rhsSel))
plDs <- base::append(plDs, list(PlotDataGeomPoint$new(name = outcome, xVals, yVals, alpha = 0.40)
, PlotDataGeomLine$new(name = "prediction", xVals, preds, colour = "orangered3", linetype = 2, size = 1.0, alpha = 1.0)
, PlotDataGeomLine$new(name = "counterfactual", xVals, predcf, colour = "firebrick1", linetype = 1, size = 1.0, alpha = 1.0)
))
plot <- Plot$new(paste0(self$name, ": ", outcome, " ~ ", rhs), plotData = plDs, yAxisTitle = outcome, subtitle = paste0("Only change in level, ", rhsSel, ". Bf diagnosis")
, path = resPath, filename = paste0(self$name, "_", "1-level-change_", outcome, "_", rhsSel))
plot$plot()
return(fit)
}
baseModels <- rhaskell::map(fitModelAndPlot, selectionVars)
## rhaskell::map(function(mdl) mdl$model$formula, baseModels)
cis <- rhaskell::map(function(mdl) Epi::ci.lin(mdl$model, Exp = TRUE), baseModels)
pVals <- rhaskell::zipWith(function(sel, ci) ci[sel, "P"] , selectionVars, cis)
pVal <- base::min(base::unlist(pVals))
idx <- rhaskell::find(function(i) (pVals[[i]] == pVal), base::seq_len(base::length(pVals)))$fromJust()
## Use
rhsAdd <- paste(addModelSelection, collapse = " + ")
rhsSels <- paste(selectionVars, collapse = ", ")
base::print(paste0("Plots have been written for '", outcome, " ~ ", rhsAdd, " + OneOf(", rhsSels, ")'. Take a look and decide on the base model for futher analysis."))
rhaskell::void(rhaskell::zipWith3(function(idx, var, pVal) {
base::print(paste0("[", idx, "] ", var, " (p-value: ", pVal, ")"))
}, base::seq_len(base::length(pVals)), selectionVars, pVals))
if (interactive) {
v <- base::readline(paste0("Enter value [", idx, "]"))
if (v != "")
idx <- base::as.numeric(v)
}
print(paste0("Using index ", idx, ". Base Model: ", baseModels[[idx]]$model$formula))
model <- baseModels[[idx]]
rhsSel <- selectionVars[[idx]]
## Check residuals
res <- stats::residuals(model$model, type = "deviance")
plDt <- PlotDataGeomPoint$new(name = "Residuals", xVals = dsCITS$xVar$asTibble(), yVals = res, colour = "grey", alpha = 0.6, size = 0.7)
plot <- Plot$new(paste0(self$name, ": ", outcome, " ~ ", rhs, ". Residuals over time."), yAxisTitle = "Deviance residuals", xAxisTitle = "Date",
plotData = plDt, path = resPath, filename = paste0(self$name, "_", "_", outcome, "_", rhsSel, "_residuals-base-model"))
plot$plot()
## plot(dSCTFm$T,res,pch=19,cex=0.7,col=grey(0.6), main="Residuals over time",ylab="Deviance residuals",xlab="Date")
## abline(h=0,lty=2,lwd=2)
# QUESTION: Using na.pass is good?
acf <- stats::acf(res, na.action = na.pass)
pacf <- stats::pacf(res, na.action = na.pass)
bgacf <- lmtest::bgtest(model$model, order = 12)
pVal <- bgacf$p.value
## solve auto-correlation
solveAutocorrelation <- function(term) {
##:ess-bp-start::conditional@:##
browser(expr={TRUE})##:ess-bp-end:##
rhs <- paste(base::append(rhsSel, base::append(addModelSelection, term)), collapse = " + ")
fitter <- FitterGLM$new(family = stats::quasipoisson, na.action = "na.exclude")
fitter$data <- dsCITS$asEnvironment(as_tibble = FALSE)
fit <- fitter$fit(paste(outcome, "~", rhs))
res <- stats::residuals(model$model, type = "deviance")
plDt <- PlotDataGeomPoint$new(name = "Residuals", xVals = dsCITS$xVar$asTibble(), yVals = res, colour = "grey", alpha = 0.6, size = 0.7)
plot <- Plot$new(paste0(self$name, ": ", fit$model$formula, ". Residuals over time."), yAxisTitle = "Deviance residuals", xAxisTitle = "Date",
plotData = plDt, path = resPath, filename = paste0(self$name, "_", "_", outcome, "_", rhsSel, "_residuals-base-model_ac-", term))
plot$plot()
acf <- stats::acf(res, na.action = na.pass)
pacf <- stats::pacf(res, na.action = na.pass)
bgacf <- lmtest::bgtest(model$model, order = 12)
pVal <- bgacf$p.value
valid <- car::vif(fit$model)
# TODO
stop("TODO")
}
acModels <- rhaskell::map(solveAutocorrelation, autoCorrTerms)
## models <- base::append(models, model)
}
## TODO!!!
return(models)
## if (!base::is.list(outcomes)) outcomes <- list(outcomes)
## if (!base::is.list(fitters)) fitters <- list(fitters)
## if (!base::is.list(formulas)) formulas <- list(formulas)
## coreModels <- CoreModelSelectors$new(paste0("Possible Core Models for DataSets of '", self$name, "'"), self)
## for (ds in self$datasets) {
## env <- ds$asEnvironment() # create environment
## for (y in outcomes) {
## selector <- CoreModelSelector$new(paste(ds$name, y), ds)
## for (fitter in fitters) {
## for (formula in formulas) {
## fitter$data <- env # set data frame
## fit <- fitter$fit(paste(y, formula)) # fit model
## selector$addPossibleCoreModel(fit) # save as possible model
## }
## }
## if (!selector$hasAnyConvergedModel()){
## stop("No core model for dataset '", ds$name, "' and '", y, "' converge. Cannot proceed")
## }
## coreModels$addCoreModelSelector(y, selector)
## }
## }
## return(coreModels)
},
#' Plot descriptive information of all variables and datasets.
#' All graphs are written to files.
#'
#' @param parentPath Character Parent path (must exist). Default: "."
#' @param resultFolder Character Folder to place results into. Default "results"
#' @param descriptivesFolder Character Folder to place descriptive information into. Default "descriptives"
#' @param dataSetFolder Bool Create a a subfolder for for each dataset. Default: TRUE
plotDescriptives = function(parentPath = ".", resultFolder = "results", descriptivesFolder = "descriptives", dataSetFolder = TRUE) {
path <- paste0(parentPath, "/", resultFolder)
rhaskell::mapM_(function(ds) ds$plotDescriptives(path, descriptivesFolder, dataSetFolder), self$datasets)
}
),
## Accessable properties. Active bindings look like fields, but each time they are accessed,
## they call a function. They are always publicly visible.
active = list(
name = function(value) {
if (missing(value)) return(private$.name)
if (!(base::is.character(value)))
propError("name", value, getSrcFilename(function(){}), getSrcLocation(function(){}))
private$.name <- value
return(self)
},
datasets = function(value) {
if (missing(value)) return(private$.datasets)
if (!(base::is.list(value) && length(value) >= 1))
propError("datasets", value, getSrcFilename(function(){}), getSrcLocation(function(){}))
private$.datasets <- value
return(self)
},
xVar = function(value) {
if (missing(value)) return(private$.xVar)
if (!("Variable" %in% class(value)))
propError("xVar", value, getSrcFilename(function(){}), getSrcLocation(function(){}))
private$.xVar <- value
return(self)
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.