eatFDZ: Data checking and validation

#   foo <- fdz(fileName = "n:/Dropbox/Projekte/2014/Aleks/StEG_Datensatz_Eltern_IZA.sav", saveFolder = "n:/Archiv/temp", nameListe = "liste1.csv", nameSyntax = "syntax1.txt")
#   foo <- fdz(fileName = "q:/BT2016/BT/50_Daten/03_Aufbereitet/06_Gesamtdatensatz/BS_LV_Primar_2016_Matchingvorlaeufig_09_erweiterteGadsversion.sav", saveFolder = "N:/archiv/temp", nameListe = "liste2.csv", nameSyntax = "syntax2.txt", exclude = exclude)

#### Perform FDZ Data Cleansing
#############################################################################
#' Create a Statistical Disclosure Control Report.
#'
#' Create a statistical disclosure control report: Which variables have categories with low absolute frequencies,
#' which might lead to statistical data disclosure issues?
#'
#' Individual participants of studies such as educational large-scale assessments usually must remain non-identifiable on individual level.
#' This function checks all variables in a \code{GADSdat} object
#' for low frequency categories which might lead to statistical disclosure control issues.
#' Currently, only a uni-variate check is implemented.
#'
#'@param fileName Character string of the SPSS file
#'@param boundary Integer number: categories with less than or equal to \code{boundary} observations will be flagged
#'@param exclude Optional: character vector of variable names which should be excluded from the report
#'@param encoding Optional: The character encoding used for reading the \code{.sav} file. The default, \code{NULL}, uses the encoding specified in the file, but sometimes this value is incorrect and it is useful to be able to override it.
#'
#'@return A \code{data.frame}.
#'
#'@examples
#'sav_path <- system.file("extdata", "LV_2011_CF.sav", package = "eatFDZ")
#'
#'## don't report low frequencies for unique id variables
#'exclude<- c("idstud_FDZ", "idsch_FDZ")
#'
#'##
#'sdc_report <- sdc_check(fileName = sav_path, exclude=exclude)
#'
#'
#'
#'@export
sdc_check <- function ( fileName, boundary = 5, exclude = NULL, encoding = NULL) {
  GADSdat     <- eatGADS::import_spss(fileName, checkVarNames = FALSE, encoding = encoding)
  # load("t:/Sebastian/gd.rda")
  #GADSdat     <- eatGADS::checkMissings(GADSdat, missingLabel = "missing", addMissingCode = TRUE, addMissingLabel = TRUE)
  #datOM  <- eatGADS::miss2NA(GADSdat) ## alle was man braucht? oder extractData
  varLab <- unique(GADSdat[["labels"]][, c("varName", "varLabel")])
  liste <- create_overview(GADSdat, boundary = boundary)

  #browser()

  ## Fehler werfen statt warning und pruefen ob rausgenommen oder nur in Spalte geschrieben
  if (!is.null(exclude) ) {
    chk <- setdiff(exclude,liste[,"variable"])
    if ( length(chk)>0) {
      warning("Variables '",paste(chk, collapse="', '"), "' from the 'exclude' argument are not available in the data set and will be ignored.")
    }
    liste[stats::na.omit(match(exclude, liste[,"variable"])),"exclude"] <- TRUE
  }

  liste
}


create_overview <- function(GADSdat, boundary) {
  skala  <- sapply(GADSdat[["dat"]], class)
  datOM  <- eatGADS::miss2NA(GADSdat)
  # to do: Funktion schreiben, die in dem data.frame die missingwerte (z.B. -9994) auf basis der labels in NA umwandelt
  # folgende Zeilen braucht einen Datensatz mit NAs statt -9994                                                  # datensatz ohne missings
  tab    <- lapply(datOM, FUN = function (y ) { table(as.character(y)) } )
  nKatOM <- sapply(tab, FUN = function ( y ) { length(y)})                 ### Anzahl Kategorien (ohne Missingkategorien)
  nValid <- sapply(tab, FUN = function ( y ) { sum(y)})                    ### untere Zeile: Kategorien mit Haeufigkeit kleiner gleich 5, aber groesser als 0!
  freq5  <- sapply(tab, FUN = function ( y ) { length(which(y <= boundary & y > 0 ))>0 })
  varLab <- unique(GADSdat[["labels"]][, c("varName", "varLabel")])
  existVarLab <- nchar(varLab[,"varLabel"])>0 & !is.na(varLab[,"varLabel"])
  existValLab <- do.call(rbind, by(data = GADSdat[["labels"]], INDICES = GADSdat[["labels"]][,"varName"], FUN = function (x ) {
    data.frame(variable = unique(x[, "varName"]), existValLab = any(!is.na(x[,"valLabel"])), stringsAsFactors = FALSE)
    }))
  out1 <- data.frame ( variable = varLab[,"varName"], varLab = varLab[,"varLabel"], existVarLab = existVarLab,
                         skala = unlist(skala), nKatOhneMissings = nKatOM, nValid = nValid,
                         nKl5 = freq5, exclude = FALSE, stringsAsFactors = FALSE)
  out2 <- merge(out1, existValLab, all = TRUE, by = "variable", sort = FALSE)
  out2[, c("variable", "varLab", "existVarLab", "existValLab", "skala", "nKatOhneMissings", "nValid", "nKl5", "exclude")]
}


makeAnonymous <- function (x, liste, boundary, datOM, df, varLab) {
  message(length(x), " numeric variables with category size <= ", boundary," will be recoded anonymously.")
  liste[x, "makeAnonymous"] <- TRUE
  toRec <- liste[which(liste[,"makeAnonymous"]==TRUE),]
  snipp1<- unlist(by(toRec, INDICES = toRec[,"variable"], FUN = function ( tr ) {
    werte <- table(datOM[, as.character(tr[["variable"]]) ] )       ### hier: Variablenweise!
    werteM<- table(df[["dat"]][, as.character(tr[["variable"]]) ] )
    unter6<- werte[which(werte < (boundary + 1) )]
    if ( length(unter6) ==0) {cat("darf nicht passieren."); browser()}
    unter6<- data.frame ( Nummer = seq_along(unter6), kategorie = names(unter6), belegung = unter6)
    aufb  <- do.call("rbind", by(data = unter6, INDICES = unter6[,"Nummer"], FUN = function ( z ) {
      matchU <- match( as.character(z[["kategorie"]]), names(werte))
      matchW <- matchU                                  ### hier: werteweise (je Variable)
      toNA   <- FALSE
      while ( sum(werte[matchU:matchW])< (boundary + 1) & toNA == FALSE ) {
        if( (matchW+2)< length(werte))  {
          matchW <- matchW+1
        }  else {
          toNA <- TRUE
        }
      }
      inkl   <- names(werte[matchU:matchW])[-1]         ### wenn zwei Werte mit weniger als 5 Belegungen direkt benachbart sind, muss ggf. nicht
      if(length(inkl) == 0 ) { inkl <- NA}              ### zweimal recodiert werden, falls beide Kategorien zusammen mehr als 5 Belegungen haben
      z      <- data.frame ( Nummer = z[["Nummer"]], kategorie = z[["kategorie"]], inkludiert = inkl, toNA = toNA)
      return(z)}))
    weg   <- which(aufb[,"kategorie"] %in% aufb[,"inkludiert"])
    if(length(weg)>0) { aufb <- aufb[-weg,]}
    if(length(which( aufb[,"toNA"] == TRUE))>0) {              ### Fuer alle, die zu "NA" recodiert werden, muss nur ein Recodierungsstatement, nicht mehrere verfasst werden
      minNum <- min(aufb[which( aufb[,"toNA"] == TRUE),"Nummer"])
      aufb[which( aufb[,"toNA"] == TRUE),"Nummer"] <- minNum
    }
    misLab<- setdiff(names(werteM), names(werte))              ### Missinglabels fuer Variable
    recSt1<- c("RECODE", as.character(tr[["variable"]]) )      ### erster Teil des Recodierungsstatements
    recSt2<- unlist(by(data = aufb, INDICES = aufb[,"Nummer"], FUN = function ( r ) {
      if(r[1,"toNA"] == FALSE) {
        newValue<- r[1,"kategorie"]
        recStat1<- paste("(",as.numeric(as.character(r[1,"kategorie"])), " THRU ", max(as.numeric(as.character(r[,"inkludiert"]))), " = ", newValue, ")",sep="")
      }  else  {
        allVal  <- sort(unique(stats::na.omit(c(as.numeric(as.character(r[,"kategorie"])), as.numeric(as.character(r[,"inkludiert"]))))))
        if ( length(allVal)>1) {
          recStat1<- paste("(",allVal[1], " THRU ", allVal[length(allVal)], " = SYSMIS )",sep="")
        }  else  {
          recStat1<- paste("(",allVal[1], " = SYSMIS )",sep="")
        }
      }
      return(recStat1)}))                                    ### Syntaxgenerierung: "c:\Users\weirichs\Dropbox\IQB\Projekte\Aleks\Aufbereitung_SUFs_StEG_ak_Elterndaten.sps"
    recSt3<- c("(ELSE = COPY)", "INTO", paste( as.character(tr[["variable"]]), "_FDZ.\n", sep=""),
               paste("VARIABLE LABELS ", paste( as.character(tr[["variable"]]), "_FDZ", sep="")," '",
                     varLab[which(varLab[,"varName"] == as.character(tr[["variable"]])), "varLabel"], " (FDZ)'.", sep=""))
    recSt4<- unlist(by(data = aufb, INDICES = aufb[,"Nummer"], FUN = function ( r ) {
      if(r[1,"toNA"] == FALSE) {
        newValue<- r[1,"kategorie"]
        recStat <- paste("VALUE LABELS ", paste( as.character(tr[["variable"]]), "_FDZ ", sep=""), newValue,
                         " 'von ",as.numeric(as.character(r[1,"kategorie"]))," bis ",max(as.numeric(as.character(r[,"inkludiert"]))),
                         " (zur Anonymisierung aggregiert (FDZ))'.",sep="")
      }  else  {
        recStat <- NULL
      }
      return(recStat)}))
    ### wertelabels fuer missings, falls vorhanden
    recSt4b <- NULL # initialisieren
    lbs <- df$labels[df$labels$varName == as.character(tr[["variable"]]),]
    lbs <- lbs[which(lbs[,"missings"]=="miss"),]
    new_misLab <- lbs[["value"]]
    if ( length(new_misLab)>0) {
      stopifnot(nrow(lbs)>0)
      recSt4b<- unlist(apply(lbs, MARGIN = 1, FUN = function (r){
        paste("VALUE LABELS ", paste( as.character(tr[["variable"]]), "_FDZ ", sep=""), r[["value"]],
              " '", r[["valLabel"]], "'.", sep="")}))
    }
    recSt5<- c ( paste(c("VARIABLE LEVEL ", paste( as.character(tr[["variable"]]), "_FDZ ", sep=""), "(ORDINAL)."), sep="", collapse=""),
                 paste("FORMATS ", paste( as.character(tr[["variable"]]), "_FDZ ", sep=""), "(F3.0).", sep="", collapse=""))
    if(length(new_misLab)>0) {
      recSt5 <- c(recSt5, paste("MISSING VALUES ", paste( as.character(tr[["variable"]]), "_FDZ ", sep=""), "(", paste(new_misLab, collapse=", "),").",sep="", collapse=""))
    }
    recSt6<- "EXECUTE.\n\n"
    recSt <- c(recSt1, recSt2, recSt3, recSt4, recSt4b, recSt5, recSt6)
    return(recSt)}))
  return(snipp1)}


makeNumeric <- function (x, df_labels, liste, datOM) {
  message("Recode ", length(x), " non-numeric variables into numeric variables.")
  liste[x, "recodeToNumeric"] <- TRUE
  toRec <- liste[which(liste[,"recodeToNumeric"]==TRUE),]
  snipp2<- unlist(by(toRec, INDICES = toRec[,"variable"], FUN = function ( tr ) {
    werte <- eatTools::crop(names(table(datOM[, as.character(tr[["variable"]]) ] )))
    if ( length(setdiff(werte, ""))==0 ) {
      recSt <- NULL
    }  else  {
      miss  <- df_labels[intersect(which(df_labels[,"varName"] == as.character(tr[["variable"]])), which(df_labels[,"missings"] == "miss")),]
      wom   <- setdiff ( setdiff (werte, miss[,"value"]), "")## werte ohne missings
      # if (length(wom)==0) {browser()}
      recSt1<- c("RECODE", as.character(tr[["variable"]]) ) ### erster Teil des Recodierungsstatements
      oldnew<- data.frame ( old = wom, new = 1:length(wom), stringsAsFactors = FALSE)
      recSt2<- unlist(by ( data = oldnew, INDICES = oldnew[,"old"], FUN = function (z) { paste0("('",z[["old"]], "' = ", z[["new"]], ")") }))
      recSt3<- c("INTO" , paste0(as.character(tr[["variable"]]), "_FDZ."))
      recSt4<- paste0("VARIABLE LABELS ", as.character(tr[["variable"]]), "_FDZ '",as.character(tr[["varLab"]]), "'.")
      recSt5<- paste0("VALUE LABELS ", as.character(tr[["variable"]]), "_FDZ '", paste(oldnew[,"new"], paste0("'",oldnew[,"old"], "'") , collapse=" "), ".")
      recSt6<- paste0("VARIABLE LEVEL ", as.character(tr[["variable"]]), "_FDZ (NOMINAL).")
      recSt7<- paste0("FORMATS ", as.character(tr[["variable"]]), "_FDZ (F8.0).")
      recSt8<- NULL                                         ### initialisieren
      if(length(stats::na.omit(miss[,"value"]))>0) {
        recSt8<- paste0("MISSING VALUES ", as.character(tr[["variable"]]), "_FDZ (", paste(stats::na.omit(miss[,"value"]), collapse = ", "), ").")
      }
      recSt9<- "EXECUTE."
      recSt <- c(recSt1, recSt2, recSt3, recSt4, recSt5, recSt6, recSt7, recSt8, recSt9)
    }
    return(recSt)}))
  return(snipp2)}