old_code/03_merge.R

#### Extract data set from RDL
#############################################################################
# as simple as possible: in longformat, no reshaping
#' Extract data frame.
#'
#' Extract data frame in long format from hierarchical data list.
#'
#' Extract a single data frame in maximum long format from relational list of data frames. Note that lists
#' can be nested in lists. Variables names are supplied as one character vector.
#'
#'@param RDL Relational List of data Sets
#'@param varVec Character vector of variables names in the data set
#'
#'@return Returns a data.table (or data.frame?) object in long format
#'
#'@examples
#'# Example data set
#'listMerge(sim_RDL, varVec = c("stud_1", "sch_1"))
#'listMerge(sim_RDL[[1]], varVec = c("stud_1", "imp_stud_2"))
extract_long <- function(RDL, varVec, dropIDs = TRUE) {
  #checkGADSlist(RDL)
  # if any Element is a list on its own, create data frame from this list!
  for(i in seq_along(RDL)) {
    if(!is.data.table(RDL[[i]])) RDL[[i]] <- extract_long(RDL[[i]], varVec, dropIDs = FALSE)
  }
  subRDL <- lapply(RDL, function(dat) extract_vars(dat = dat, varNames = varVec))
  # drop subdatasets, till first data set with more than ID and imputation variables is found
  # noch debuggen
  subRDL <- dropIDs(subRDL)

  # problematisch für n:n Relationen???
  # Achtung: AUch ID_stud nicht rein genestet, da mehrere Lehrer -> ein schüler
  # wichtig: Fach ist Key-Variable!!
  # hui, wie wird dann gemerged???
  # Achtung, Kompetenzbereich: auch ID-Variable

  # d.h.: bisheriges Idee nicht durchführbar!
  # man brauch darunterliegende Informationen, um darüber liegende Datensätze zu mergen

  # vlt. doch relationale Datenbanken??
  # n:n Relationen problematisch!!!

  # "Master-ID-Variable", diese aus allen IDs erstellen
  # anhand ihr mergen, dann Duplikate (doppelte Zeilen) löschen

  ### überlegeeeeen!!!!!!!!!!!!!!

  ## merge subdatasets
  tempGADS <- NULL
  for(s in seq_along(subRDL)) {
    if(ncol(subRDL[[s]]) == 1) tempGADS <- tempGADS
    else tempGADS <- singleMerge(GADS_L = tempGADS, GADS_H = subRDL[[s]])
  }
  # return final data set
  tempGADS
}

## extract variables from data frame
extract_vars <- function(dat, varNames) {
  stopifnot(is.data.table(dat))
  varNames <- varNames[varNames %in% names(dat)]
  varNames <- unique(c(getID(dat), varNames))
  dat[, varNames, with = F]
}

## get names for Keys/ID Variables of data frame (maybe modify for OO-alternatives?)
getID <- function(dat, string = c("^ID_", "^n_imp")) {
  IDlist <- lapply(string, function(pat) grep(pat, names(dat), value = TRUE))
  IDnames <- unlist(IDlist)
  IDnames
}

# merge two data.tables
singleMerge <- function(GADS_H, GADS_L) {
  if(is.null(GADS_L)) return (GADS_H)
  # select ID which is in both data sets
  allIDs <- c(getID(GADS_H), getID(GADS_L))
  # use all IDs which are duplicated (in both data sets) as keys for matching
  ID_H <- allIDs[duplicated(allIDs)]
  if(length(ID_H) == 0) stop("No Matching Key in data sets")
  if(length(ID_H) >= 1) message(paste(ID_H, collapse = " and "), " are used for matching")
  mergedGADS <- merge(GADS_H, GADS_L, by = ID_H, all = T)
  mergedGADS
}

# drop irrelevant IDs (which don't carry any data information or are unimportant for linking data sets)
dropIDs <- function(subdatList) {
  logList <- lapply(subdatList, checkDat4info)
  # here debugging, prinzip müsste stimmen
  logVec <- as.logical(logList)
  # drop data sets without information which are not necessary for linking between other data sets
  indices <- which(logVec)
  logVec[min(indices):max(indices)] <- TRUE
  subdatList[logVec]
}

checkDat4info <- function(dat) {
  IDs <- getID(dat = dat)
  any(!names(dat) %in% IDs)
}




#### Reshaping
#############################################################################
# wie soll das longformat aussehen?
# welche Variablen sind alle imputiert, Leistung/SFB, LFB etc.?

long2wide <- function(GADSlong, imputed) {
  reshape2::dcast(mtcars, cyl~mpg)
}
b-becker/eatGADS documentation built on May 24, 2019, 8:47 p.m.