### Checks and cleaning for splitting
#############################################################################
#' Prepare data frame.
#'
#' Prepares a data frame for being used in the eatGADS packages.
#'
#'This function deletes cases with missings on ID-variables (other variables can be specified too).
#'Data.frame is transformed to data.table for faster handling in eatGADS.
#'
#'@param dat Data set (in long format)
#'@param IDs Character vector of ID variables in the data set
#'
#'@return Returns a data.table object for further processing in eatGADS
#'
#'@examples
#'# Example data set
#'studQuest <- data.frame(ID_stud = 1:50,
#' HISEI = rnorm(50, 50, 10),
#' extr = rnorm(50, 7, 2))
#'studPVs <- data.frame(ID_stud = rep(1:50, 5),
#' PV_comp = rnorm(250, 0, 1.5))
#'longDat <- merge(studPVs, studQuest, by = "ID_stud")
#'NAs <- data.table(ID_stud = NA, HISEI = NA, extr = NA, PV_comp = 1)
#'longDat <- as.data.frame(rbind(longDat, NAs))
#'
#'# Prepare data
#'prepDat(longDat, IDs = "ID_stud")
#'
prepDat <- function(dat, IDs) {
dat <- dt_transform(dat)
datClean <- na.omit(dat, cols = IDs)
missings <- nrow(dat) - nrow(datClean)
if(missings > 0) message(paste(missings, "rows have been removed due to missings on ID-Variables"))
datClean
}
dt_transform <- function(dat) {
stopifnot(is.data.frame(dat))
if(!is.data.table(dat)) message("Data converted to data.table format")
as.data.table(dat)
}
### Guess structure of the data set
#############################################################################
#' Guess hierarchical data structure.
#'
#' Tries to guess the hierarchical structure of a data set by ID variables.
#'
#'To split up a hierarchical data set into relational data, the underlying structure has to be known. This function
#'guesses the underlying structure using the given ID-Variables of the data (e.g. Pupil-, Teacher-, School-ID).
#'
#'@param GADS Data set in long format
#'@param IDvars Character vector of IDnames in the data set
#'
#'@return Returns a list of Variable names nested into ID-Variables
#'
#'@examples
#'# Example data set
#'studQuest <- data.table(ID_stud = 1:50,
#' HISEI = rnorm(50, 50, 10),
#' extr = rnorm(50, 7, 2))
#'studPVs <- data.table(ID_stud = rep(1:50, 5),
#' PV_comp = rnorm(250, 0, 1.5))
#'longDat <- merge(studPVs, studQuest, by = "ID_stud")
#'
#'# Guess data structure
#'guessStructure(longDat, IDvars = "ID_stud")
#'
guessStructure <- function(GADS, IDvars) {
stopifnot(all(IDvars %in% names(GADS)))
nameList <- vector("list", length(IDvars))
subGADS <- GADS
# I. Determine related variables for each ID
for(i in seq_along(IDvars)) {
nameList[[i]] <- guessVars(subGADS, IDvars[i])
subGADS <- subGADS[, -nameList[[i]], with = FALSE]
message(paste("Data set ", i, " has been assumed, with ID ", IDvars[i], " and ", length(nameList[[i]]), " variables"))
}
names(nameList) <- IDvars
if(any(duplicated(do.call(c, nameList)))) stop("Variables sorted to more than 1 DF")
# II. Pack rest of variables in additional df (e.g. imputed data/PVs)
leftVars <- names(GADS)[ !names(GADS) %in% unlist(nameList)]
nameList$leftVars <- leftVars
message(paste("Data set ", i + 1, " has been assumed, with no ID and ", length(nameList$leftVars), " variables left"))
nameList
}
guessVars <- function(GADS, ID) {
IDcases <- length(unique(GADS[[ID]]))
duplications <- nrow(GADS)/IDcases
# I) In Frage kommende Variablen auswählen (weniger/gleiche Anzahl uniquer Werte)
# browser()
NAcases <- GADS[, lapply(.SD, function(x) sum(is.na(x))/IDcases)]
# test einbauen für Bruch hier
noNAcases <- GADS[, lapply(.SD, function(x) length(unique(x)))]
Varcases <- as.double(NAcases + noNAcases)
subDat <- GADS[, (!(Varcases > IDcases)), with = FALSE]
stopifnot(ID %in% names(subDat))
# II) Veränderungen der Variable innerhalb des Index checken
changeLog <- subDat[, lapply(.SD, function(x) length(unique(x))), by = ID]
varDesc <- lapply(changeLog, function(x) all(x <= 1))
# hier noch Message für Grenzfälle einbauen;
nameVec <- names(varDesc)[varDesc == TRUE]
nameVec <- c(ID, nameVec)
nameVec
}
#### split/normalize GADS
#############################################################################
# Main Functions
#' Splitting relational data from long format.
#'
#' Splits a data set in long format into relational data sets stored in a list.
#'
#'This function splits a long formatted data set into a list of relational data sets. Data is
#'split by ID-Variables, called Keys. Note that function arguments have to name exactly as
#'described. The ID-Variable is duplicated between neighboured hierarchical levels. Note that
#'on level 1 there often is no ID variable (e.g. when using Multiple Imputation).
#'
#'@param GADS Data set in long format
#'@param L1_Vars Character vector or column indices of variables of the first level
#'@param L1_ID String with the name of the ID-Variable
#'@param ... further L<x>_Vars and L<x>_ID arguments
#'
#'@return Returns a list of data.tables, connected via IDs/Keys.
#'
#'@examples
#'# Example data set
#'studQuest <- data.table(ID_stud = 1:50,
#' HISEI = rnorm(50, 50, 10),
#' extr = rnorm(50, 7, 2))
#'studPVs <- data.table(ID_stud = rep(1:50, 5),
#' PV_comp = rnorm(250, 0, 1.5))
#'longDat <- merge(studPVs, studQuest, by = "ID_stud")
#'
#'# Split hierarchical data set
#'datList <- splitGADS(longDat, L1_Vars = c("PV_comp"), L2_Vars = c("HISEI", "extr"), L2_ID = "ID_stud")
#'
splitGADS <- function(GADS, L1_Vars, L1_ID = NULL, ...) {
# I. prepare arguments (list with levels)
args <- list(L1_Vars = L1_Vars, L1_ID = L1_ID, ...)
argsMod <- prepArgs(args)
# I.b check variable selection and transform all into variable names (extraction otherwise impossible!)
for(i in seq_along(argsMod)) {
if(is.numeric(argsMod[[i]]$variables)) argsMod[[i]]$variables <- names(GADS)[argsMod[[i]]$variables]
}
## hier noch check für Variablennamen einbauen
#II. split data frame
datList <- list()
dat <- GADS
oldID <- character(0)
for(i in seq_along(argsMod)) {
#if(i == 4) browser()
tempData <- singleSplit(dat = dat,
vars = argsMod[[i]]$variables, ID = argsMod[[i]]$IDVar, double = oldID)
datList[[i]] <- tempData$extracted
dat <- tempData$leftover
oldID <- argsMod[[i]]$IDVar
}
if(ncol(dat) > 0) cat("The following variables were not mentioned and are not included in the new data:", "\n" ,names(dat))
names(datList) <- names(argsMod)
datList
}
# prepare Arguments, list per hierarchical level, all strings (names)
prepArgs <- function(args) {
stopifnot(length(args) >= 4)
# 1.a) testing if arguments named correctly
l <- length(args)/2
VarNames <- paste("L", 1:l, "_Vars", sep = "")
IDNames <- paste("L", 1:l, "_ID", sep = "")
test <- identical(sort(c(VarNames, IDNames)), sort(names(args)))
if(!test) stop("Stop: Invalid Names for Arguments. Valid names are L<x>_Vars and L<x>_ID.")
# 2.) rearranging arguments into level specific form
lvlList <- vector("list", l)
for(i in seq_along(VarNames)) {
lvlList[[i]]$variables <- args[[VarNames[i]]]
lvlList[[i]]$IDVar <- args[[IDNames[i]]]
}
names(lvlList) <- substr(VarNames, 1, 2)
# give out reversed list, as needed for splitting
rev(lvlList)
}
# split data frame into two, shorten extracted
singleSplit <- function(dat, vars, ID, double = character(0)) {
#browser()
if(is.null(ID)) { delVars <- vars
} else { delVars <- vars[vars != ID]} # leave ID in Data set, if there is one!
newVars <- c(ID, delVars, double)
stopifnot(all(newVars %in% names(dat)))
extracted <- unique(dat[, newVars, with = F]) # delete doubled rows
leftover <- dat[, -c(delVars, double), with = F] # clean double ID from data set (all keys only in 2 data sets!)
list(leftover = leftover, extracted = extracted)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.