#### Extract data set from RDL
#############################################################################
# as simple as possible: in longformat, no reshaping
#' Extract data frame.
#'
#' Extract data frame in long format from hierarchical data list.
#'
#' Extract a single data frame in maximum long format from relational list of data frames. Note that lists
#' can be nested in lists. Variables names are supplied as one character vector.
#'
#'@param RDL Relational List of data Sets
#'@param varVec Character vector of variables names in the data set
#'
#'@return Returns a data.table (or data.frame?) object in long format
#'
#'@examples
#'# Example data set
#'listMerge(sim_RDL, varVec = c("stud_1", "sch_1"))
#'listMerge(sim_RDL[[1]], varVec = c("stud_1", "imp_stud_2"))
extract_long <- function(RDL, varVec, dropIDs = TRUE) {
#checkGADSlist(RDL)
# if any Element is a list on its own, create data frame from this list!
for(i in seq_along(RDL)) {
if(!is.data.table(RDL[[i]])) RDL[[i]] <- extract_long(RDL[[i]], varVec, dropIDs = FALSE)
}
subRDL <- lapply(RDL, function(dat) extract_vars(dat = dat, varNames = varVec))
# drop subdatasets, till first data set with more than ID and imputation variables is found
# noch debuggen
subRDL <- dropIDs(subRDL)
# problematisch für n:n Relationen???
# Achtung: AUch ID_stud nicht rein genestet, da mehrere Lehrer -> ein schüler
# wichtig: Fach ist Key-Variable!!
# hui, wie wird dann gemerged???
# Achtung, Kompetenzbereich: auch ID-Variable
# d.h.: bisheriges Idee nicht durchführbar!
# man brauch darunterliegende Informationen, um darüber liegende Datensätze zu mergen
# vlt. doch relationale Datenbanken??
# n:n Relationen problematisch!!!
# "Master-ID-Variable", diese aus allen IDs erstellen
# anhand ihr mergen, dann Duplikate (doppelte Zeilen) löschen
### überlegeeeeen!!!!!!!!!!!!!!
## merge subdatasets
tempGADS <- NULL
for(s in seq_along(subRDL)) {
if(ncol(subRDL[[s]]) == 1) tempGADS <- tempGADS
else tempGADS <- singleMerge(GADS_L = tempGADS, GADS_H = subRDL[[s]])
}
# return final data set
tempGADS
}
## extract variables from data frame
extract_vars <- function(dat, varNames) {
stopifnot(is.data.table(dat))
varNames <- varNames[varNames %in% names(dat)]
varNames <- unique(c(getID(dat), varNames))
dat[, varNames, with = F]
}
## get names for Keys/ID Variables of data frame (maybe modify for OO-alternatives?)
getID <- function(dat, string = c("^ID_", "^n_imp")) {
IDlist <- lapply(string, function(pat) grep(pat, names(dat), value = TRUE))
IDnames <- unlist(IDlist)
IDnames
}
# merge two data.tables
singleMerge <- function(GADS_H, GADS_L) {
if(is.null(GADS_L)) return (GADS_H)
# select ID which is in both data sets
allIDs <- c(getID(GADS_H), getID(GADS_L))
# use all IDs which are duplicated (in both data sets) as keys for matching
ID_H <- allIDs[duplicated(allIDs)]
if(length(ID_H) == 0) stop("No Matching Key in data sets")
if(length(ID_H) >= 1) message(paste(ID_H, collapse = " and "), " are used for matching")
mergedGADS <- merge(GADS_H, GADS_L, by = ID_H, all = T)
mergedGADS
}
# drop irrelevant IDs (which don't carry any data information or are unimportant for linking data sets)
dropIDs <- function(subdatList) {
logList <- lapply(subdatList, checkDat4info)
# here debugging, prinzip müsste stimmen
logVec <- as.logical(logList)
# drop data sets without information which are not necessary for linking between other data sets
indices <- which(logVec)
logVec[min(indices):max(indices)] <- TRUE
subdatList[logVec]
}
checkDat4info <- function(dat) {
IDs <- getID(dat = dat)
any(!names(dat) %in% IDs)
}
#### Reshaping
#############################################################################
# wie soll das longformat aussehen?
# welche Variablen sind alle imputiert, Leistung/SFB, LFB etc.?
long2wide <- function(GADSlong, imputed) {
reshape2::dcast(mtcars, cyl~mpg)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.