old_code/01_splitting.R

### Checks and cleaning for splitting
#############################################################################
#' Prepare data frame.
#'
#' Prepares a data frame for being used in the eatGADS packages.
#'
#'This function deletes cases with missings on ID-variables (other variables can be specified too).
#'Data.frame is transformed to data.table for faster handling in eatGADS.
#'
#'@param dat Data set (in long format)
#'@param IDs Character vector of ID variables in the data set
#'
#'@return Returns a data.table object for further processing in eatGADS
#'
#'@examples
#'# Example data set
#'studQuest <- data.frame(ID_stud = 1:50,
#'                       HISEI = rnorm(50, 50, 10),
#'                       extr = rnorm(50, 7, 2))
#'studPVs <- data.frame(ID_stud = rep(1:50, 5),
#'                      PV_comp = rnorm(250, 0, 1.5))
#'longDat <- merge(studPVs, studQuest, by = "ID_stud")
#'NAs <- data.table(ID_stud = NA, HISEI = NA, extr = NA, PV_comp = 1)
#'longDat <- as.data.frame(rbind(longDat, NAs))
#'
#'# Prepare data
#'prepDat(longDat, IDs = "ID_stud")
#'
prepDat <- function(dat, IDs) {
  dat <- dt_transform(dat)
  datClean <- na.omit(dat, cols = IDs)
  missings <- nrow(dat) - nrow(datClean)
  if(missings > 0) message(paste(missings, "rows have been removed due to missings on ID-Variables"))
  datClean
}

dt_transform <- function(dat) {
  stopifnot(is.data.frame(dat))
  if(!is.data.table(dat)) message("Data converted to data.table format")
  as.data.table(dat)
}

### Guess structure of the data set
#############################################################################
#' Guess hierarchical data structure.
#'
#' Tries to guess the hierarchical structure of a data set by ID variables.
#'
#'To split up a hierarchical data set into relational data, the underlying structure has to be known. This function
#'guesses the underlying structure using the given ID-Variables of the data (e.g. Pupil-, Teacher-, School-ID).
#'
#'@param GADS Data set in long format
#'@param IDvars Character vector of IDnames in the data set
#'
#'@return Returns a list of Variable names nested into ID-Variables
#'
#'@examples
#'# Example data set
#'studQuest <- data.table(ID_stud = 1:50,
#'                       HISEI = rnorm(50, 50, 10),
#'                       extr = rnorm(50, 7, 2))
#'studPVs <- data.table(ID_stud = rep(1:50, 5),
#'                      PV_comp = rnorm(250, 0, 1.5))
#'longDat <- merge(studPVs, studQuest, by = "ID_stud")
#'
#'# Guess data structure
#'guessStructure(longDat, IDvars = "ID_stud")
#'
guessStructure <- function(GADS, IDvars) {
  stopifnot(all(IDvars %in% names(GADS)))
  nameList <- vector("list", length(IDvars))
  subGADS <- GADS

  # I. Determine related variables for each ID
  for(i in seq_along(IDvars)) {
    nameList[[i]] <- guessVars(subGADS, IDvars[i])
    subGADS <- subGADS[, -nameList[[i]], with = FALSE]
    message(paste("Data set ", i, " has been assumed, with ID ", IDvars[i], " and ", length(nameList[[i]]), " variables"))
  }
  names(nameList) <- IDvars
  if(any(duplicated(do.call(c, nameList)))) stop("Variables sorted to more than 1 DF")

  # II. Pack rest of variables in additional df (e.g. imputed data/PVs)
  leftVars <- names(GADS)[ !names(GADS) %in% unlist(nameList)]
  nameList$leftVars <- leftVars
  message(paste("Data set ", i + 1, " has been assumed, with no ID and ", length(nameList$leftVars), " variables left"))

  nameList
}

guessVars <- function(GADS, ID) {
  IDcases <- length(unique(GADS[[ID]]))
  duplications <- nrow(GADS)/IDcases
  # I) In Frage kommende Variablen auswählen (weniger/gleiche Anzahl uniquer Werte)
  # browser()
  NAcases <- GADS[, lapply(.SD, function(x) sum(is.na(x))/IDcases)]
  # test einbauen für Bruch hier
  noNAcases <- GADS[, lapply(.SD, function(x) length(unique(x)))]
  Varcases <- as.double(NAcases + noNAcases)
  subDat <- GADS[, (!(Varcases > IDcases)), with = FALSE]
  stopifnot(ID %in% names(subDat))

  # II) Veränderungen der Variable innerhalb des Index checken
  changeLog <- subDat[, lapply(.SD, function(x) length(unique(x))), by = ID]
  varDesc <- lapply(changeLog, function(x) all(x <= 1))
  # hier noch Message für Grenzfälle einbauen;

  nameVec <- names(varDesc)[varDesc == TRUE]
  nameVec <- c(ID, nameVec)
  nameVec
}



#### split/normalize GADS
#############################################################################
# Main Functions
#' Splitting relational data from long format.
#'
#' Splits a data set in long format into relational data sets stored in a list.
#'
#'This function splits a long formatted data set into a list of relational data sets. Data is
#'split by ID-Variables, called Keys. Note that function arguments have to name exactly as
#'described. The ID-Variable is duplicated between neighboured hierarchical levels. Note that
#'on level 1 there often is no ID variable (e.g. when using Multiple Imputation).
#'
#'@param GADS Data set in long format
#'@param L1_Vars Character vector or column indices of variables of the first level
#'@param L1_ID String with the name of the ID-Variable
#'@param ... further L<x>_Vars and L<x>_ID arguments
#'
#'@return Returns a list of data.tables, connected via IDs/Keys.
#'
#'@examples
#'# Example data set
#'studQuest <- data.table(ID_stud = 1:50,
#'                       HISEI = rnorm(50, 50, 10),
#'                       extr = rnorm(50, 7, 2))
#'studPVs <- data.table(ID_stud = rep(1:50, 5),
#'                      PV_comp = rnorm(250, 0, 1.5))
#'longDat <- merge(studPVs, studQuest, by = "ID_stud")
#'
#'# Split hierarchical data set
#'datList <- splitGADS(longDat, L1_Vars = c("PV_comp"), L2_Vars = c("HISEI", "extr"), L2_ID = "ID_stud")
#'
splitGADS <- function(GADS, L1_Vars, L1_ID = NULL, ...) {
  # I. prepare arguments (list with levels)
  args <- list(L1_Vars = L1_Vars, L1_ID = L1_ID, ...)
  argsMod <- prepArgs(args)
  # I.b check variable selection and transform all into variable names (extraction otherwise impossible!)
  for(i in seq_along(argsMod)) {
    if(is.numeric(argsMod[[i]]$variables)) argsMod[[i]]$variables <- names(GADS)[argsMod[[i]]$variables]
  }
  ## hier noch check für Variablennamen einbauen

  #II. split data frame
  datList <- list()
  dat <- GADS
  oldID <- character(0)
  for(i in seq_along(argsMod)) {
    #if(i == 4) browser()
    tempData <- singleSplit(dat = dat,
                            vars = argsMod[[i]]$variables, ID = argsMod[[i]]$IDVar, double = oldID)
    datList[[i]] <- tempData$extracted
    dat <- tempData$leftover
    oldID <- argsMod[[i]]$IDVar
  }
  if(ncol(dat) > 0) cat("The following variables were not mentioned and are not included in the new data:", "\n" ,names(dat))
  names(datList) <- names(argsMod)
  datList
}

# prepare Arguments, list per hierarchical level, all strings (names)
prepArgs <- function(args) {
  stopifnot(length(args) >= 4)
  # 1.a) testing if arguments named correctly
  l <- length(args)/2
  VarNames <- paste("L", 1:l, "_Vars", sep = "")
  IDNames <- paste("L", 1:l, "_ID", sep = "")
  test <- identical(sort(c(VarNames, IDNames)), sort(names(args)))
  if(!test) stop("Stop: Invalid Names for Arguments. Valid names are L<x>_Vars and L<x>_ID.")

  # 2.) rearranging arguments into level specific form
  lvlList <- vector("list", l)
  for(i in seq_along(VarNames)) {
    lvlList[[i]]$variables <- args[[VarNames[i]]]
    lvlList[[i]]$IDVar <- args[[IDNames[i]]]
  }
  names(lvlList) <- substr(VarNames, 1, 2)
  # give out reversed list, as needed for splitting
  rev(lvlList)
}

# split data frame into two, shorten extracted
singleSplit <- function(dat, vars, ID, double = character(0)) {
  #browser()
  if(is.null(ID)) { delVars <- vars
     } else { delVars <- vars[vars != ID]} # leave ID in Data set, if there is one!
  newVars <- c(ID, delVars, double)
  stopifnot(all(newVars %in% names(dat)))

  extracted <- unique(dat[, newVars, with = F]) # delete doubled rows
  leftover <- dat[, -c(delVars, double), with = F] # clean double ID from data set (all keys only in 2 data sets!)

  list(leftover = leftover, extracted = extracted)
}
b-becker/eatGADS documentation built on May 24, 2019, 8:47 p.m.